From 65e5a7b02f674e6ed6e50784a6cfb43a131d2826 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Tue, 5 Nov 2024 12:58:40 +0100 Subject: [PATCH 01/49] support 1D conv --- main_stream_co copy.py | 69 ++++++++++++ outputs/custom_ssm.onnx | Bin 0 -> 8163 bytes .../mapping/tpu_like_quad_core copy.yaml | 55 ++++++++++ stream/node_tensor.py | 2 +- stream/onnx_utils.py | 26 +++++ stream/parser/onnx/asymmetric_simd.py | 18 +-- stream/parser/onnx/conv.py | 103 ++++++++---------- stream/parser/onnx/einsum.py | 92 ++++++++++++++++ stream/parser/onnx/model.py | 2 +- stream/parser/onnx/operator_parser.py | 8 +- stream/parser/onnx/reduce_1d.py | 2 +- stream/parser/onnx/simd.py | 2 +- stream/parser/onnx/softmax.py | 2 +- stream/utils.py | 21 ---- 14 files changed, 305 insertions(+), 97 deletions(-) create mode 100644 main_stream_co copy.py create mode 100644 outputs/custom_ssm.onnx create mode 100644 stream/inputs/examples/mapping/tpu_like_quad_core copy.yaml create mode 100644 stream/onnx_utils.py create mode 100644 stream/parser/onnx/einsum.py diff --git a/main_stream_co copy.py b/main_stream_co copy.py new file mode 100644 index 00000000..dff081c1 --- /dev/null +++ b/main_stream_co copy.py @@ -0,0 +1,69 @@ +import logging as _logging +import re + +from stream.api import optimize_allocation_co +from stream.utils import CostModelEvaluationLUT +from stream.visualization.memory_usage import plot_memory_usage +from stream.visualization.schedule import ( + visualize_timeline_plotly, +) + +_logging_level = _logging.INFO +_logging_format = "%(asctime)s - %(name)s.%(funcName)s +%(lineno)s - %(levelname)s - %(message)s" +_logging.basicConfig(level=_logging_level, format=_logging_format) + +############################################INPUTS############################################ +accelerator = "stream/inputs/examples/hardware/tpu_like_quad_core.yaml" +workload_path = "outputs/custom_ssm.onnx" +mapping_path = "stream/inputs/examples/mapping/tpu_like_quad_core copy.yaml" +mode = "fused" +layer_stacks = [tuple(range(0, 11)), tuple(range(11, 22))] + list((i,) for i in range(22, 49)) +############################################################################################## + +################################PARSING############################### +hw_name = accelerator.split("/")[-1].split(".")[0] +wl_name = re.split(r"/|\.", workload_path)[-1] +if wl_name == "onnx": + wl_name = re.split(r"/|\.", workload_path)[-2] +experiment_id = f"{hw_name}-{wl_name}-{mode}-constraint_optimization" +###################################################################### + +scme = optimize_allocation_co( + hardware=accelerator, + workload=workload_path, + mapping=mapping_path, + mode=mode, + layer_stacks=layer_stacks, + experiment_id=experiment_id, + output_path="outputs", + skip_if_exists=False, +) + +############PLOTTING############# +plot_full_schedule = True +draw_dependencies = True +plot_data_transfer = True +section_start_percent = (0,) +percent_shown = (100,) +################################# + +#########################PLOTTING PATHS############################## +timeline_fig_path_plotly = f"outputs/{experiment_id}/schedule.html" +memory_fig_path = f"outputs/{experiment_id}/memory.png" +##################################################################### + +#####################CostModelEvaluationLUT LOAD############################# +cost_lut_path = f"outputs/{experiment_id}/cost_lut_post_co.pickle" +cost_lut = CostModelEvaluationLUT(cost_lut_path) 
+############################################################################# + +# Plotting schedule timeline of best SCME +visualize_timeline_plotly( + scme, + draw_dependencies=draw_dependencies, + draw_communication=plot_data_transfer, + fig_path=timeline_fig_path_plotly, + cost_lut=cost_lut, +) +# Plotting memory usage of best SCME +plot_memory_usage(scme, section_start_percent, percent_shown, fig_path=memory_fig_path) diff --git a/outputs/custom_ssm.onnx b/outputs/custom_ssm.onnx new file mode 100644 index 0000000000000000000000000000000000000000..669004a5e20ead7c1ce9ea8b18c2113ef3578945 GIT binary patch literal 8163 zcmbst%W@mX5leu;viX8mluRQsMXysf0ZRg~eCUyf5Iqi3nJU>-s$w33tcW$aE4++{ zMEepZuDB|dW2$n>f!9>!k`Kr+m;98|GrP0XvpWl>OjdER+uhUM)350s)3i#%ZzhA$ z(Me_AxwCtx`QX0~;hFVby+0WB>W>Hg@x<*<+Jot2IGwZ`l@%3Xn}Vda2J>&-?$oQ9 zW#f4vm&>)zjRHJ@dB2ZKA&aABg&bcF-bpD7G0e^*0w$oLvfdA=zVJHJBX8ex`;tL9 ziLybGp{fP<)El>c5P!zGQL3$i`Ns2x9ltkTN5CBH3khP6?Pf}XL}i-dpa1#fE;Gen z4anA5pd`Ool(5jRI-M9+i7bRLJCBc=1&i-I|M+Cm9{b;V>n7^YHsl^r01`!x(0&JBkBIL<0__4rX)NSO6(b+SJ908Dl*^qxWEOgrvVvp>I+cWJp7?L2 z+{iQtv-9|Xhav={7^1|2!Tz+HVt_C^j}LezOhl<*k!8r8(CF4$Xc|U9Vq}p(Lsl3A zT2fdY^!uj|9_+gl#^e63wO;r8?crz;)FW7&f~yIPT?{FHd<+~|E1~#_DkAX}|W}v9(;MAzvx5&r2rLJy4lqcpVJBnhd+sF%2&P zvrX{^FOrHjF$Z6b-2Qkt7<+OtNT9tC<1y6>LvPe;@sTUcE+uIE6+YxpfsM0{0>%PuJdm|uYq-q?1ZG(5(Uq+##7`h zjUibeXRMI*1OK=;@JV%DqF}oe;o)Tl8wBgvSXVfhR*i}&Kw&+2iV=baO&qcdF^uG1 zMdaR!X<+V|2FbN&Sb$b{-#qqzVp6UwFrq6hG}eKHwK+Z>|*WLc@U*b#u+{nY944BCqjgZ_M_AFePN2e)8Ruh<5 ziJL@;GLwy3oD8`dL$1ePSYoIi$G##%j)ODX`FiHgq)te~-6{}L`xlSY#iVwLNUr@aH}-%#J=UUzXq`lkKlRBh~*75F7Pyf0}0?8Q3KGi{FqK=KGk))&!0v#@x0=9&zWbi}E@O#xlVapZ zE)98IPw->r_g`BdP-dy~zY>mO(ar>#c#otnUTOUM?)SUQPyaCBqFumj!Kz7>DMT^p z-nsaIGLUTSD2M%4G~qX! z4Ri|1)6f0oBves3&g2-jIvWgT2{ud4_>V&%p3rI81 zX&8=P;Sk)~;YX0VkSPNWUKX&mYxp?-{O~8@eB>A<*PE(p=3&;M!Z3G{Q z6{3EU3ouKlP!%dD1xR>I;a9@~z_nE>(*G-4=2lv^a_^TRJF*n>S)BKJ!{5SFnd_+* zn@qcA-i7H_oPQqNP{%*V;n7v%(Vq`KNtY*`nps;kJR{V>Q>XE!>Nzs}OH&mvvUKAzTt|H^JPo4rjvWnnvKYaO*|WEG2X5iYEikR( zwEw2vZp3fR_|1vmwu4tQM&^64hrA^9PP}e6e%p)R?#6HTg5O>=$67y!2gqBlz7Gpl ziEG?!$k(QPb>!=Iut8IT%)#tbDdNHQGCk~GQsRs3sS_>Kz`fj(3hv0)UHQ5vU+)IZ ztXXwE)Adf)6li6yQECU75M>1B16F3!6JJKUHq@~}$rZI0hLvI~*t>8h(j||9=hFqF zl4v37s!A6(5~7vS&MKOOH`04Zj~Uu^PP)e@1zc}0%^FhB^=q0lx^=B1*B;H4%l`v% CE+ None: # type: ignore """Protect the original shape attribute to prevent errors""" - raise ValueError("The numpy shape of NodeTensor is hidden in an abstraction layer") + raise ValueError("The numpy shape of NodeTensor is hidden in an abstraction layer. 
Call `tensor_shape` instead") @property def full_shape(self): diff --git a/stream/onnx_utils.py b/stream/onnx_utils.py new file mode 100644 index 00000000..07dba98d --- /dev/null +++ b/stream/onnx_utils.py @@ -0,0 +1,26 @@ +from onnx import ModelProto, NodeProto +from zigzag.parser.onnx.utils import get_onnx_tensor_type + + +def get_onnx_input_shapes(node: NodeProto, onnx_model: ModelProto) -> list[list[int]]: + """Return the shape of each input operand""" + input_names = node.input + input_shapes = [get_onnx_tensor_type(name, onnx_model).shape for name in input_names] + return input_shapes + + +def get_onnx_output_shapes(node: NodeProto, onnx_model: ModelProto) -> list[list[int]]: + """Return the shape of each output operand""" + + output_names = node.output + output_shapes = [get_onnx_tensor_type(name, onnx_model).shape for name in output_names] + return output_shapes + + +def has_asymmetric_input_data(node: NodeProto, onnx_model: ModelProto): + """Return true iff the node has two inputs and the input nodes have a different shape""" + if len(node.input) != 2: + return False + + input_shape1, input_shape2 = get_onnx_input_shapes(node, onnx_model) + return input_shape1 != input_shape2 diff --git a/stream/parser/onnx/asymmetric_simd.py b/stream/parser/onnx/asymmetric_simd.py index a5ca54e7..027d0af3 100644 --- a/stream/parser/onnx/asymmetric_simd.py +++ b/stream/parser/onnx/asymmetric_simd.py @@ -1,12 +1,9 @@ from typing import Any -from zigzag.parser.onnx.utils import ( - get_node_input_output_dimension_shapes, -) from zigzag.parser.workload_factory import LayerNodeFactory +from stream.onnx_utils import get_onnx_input_shapes, get_onnx_output_shapes from stream.parser.onnx.operator_parser import OnnxComputeOperatorParser -from stream.utils import get_onnx_input_shapes from stream.workload.computation.computation_node import ComputationNode @@ -30,7 +27,7 @@ def get_layer_node_user_format(self, input_shape: list[int], output_shape: list[ data["name"] = self.node.name data["operator_type"] = self.node.op_type data["operand_source"] = self.get_operand_source_input_format() - data["operand_precision"] = self.get_operand_precision_input_format() + data["operand_precision"] = self.get_operand_precision_user_format() data["dimension_relations"] = [] data["loop_sizes"] = output_shape @@ -41,8 +38,15 @@ def get_layer_node_user_format(self, input_shape: list[int], output_shape: list[ def generate_node(self): # Get the input and output activation shapes - input_shape1, input_shape2 = get_onnx_input_shapes(self.node, self.onnx_model) - _, output_shape = get_node_input_output_dimension_shapes(self.node, self.onnx_model) + input_shapes = get_onnx_input_shapes(self.node, self.onnx_model) + if len(input_shapes) != 2: + raise NotImplementedError("Only SIMD nodes with input length 2 are supported") + input_shape1, input_shape2 = input_shapes + + output_shapes = get_onnx_output_shapes(self.node, self.onnx_model) + if len(output_shapes) != 1: + raise NotImplementedError("Only SIMD nodes with input length 2 are supported") + output_shape = output_shapes.pop() if input_shape1 == output_shape: non_batched_input_shape = input_shape2 diff --git a/stream/parser/onnx/conv.py b/stream/parser/onnx/conv.py index 939a8556..a5bc8aff 100644 --- a/stream/parser/onnx/conv.py +++ b/stream/parser/onnx/conv.py @@ -19,44 +19,59 @@ class ConvParser(OnnxComputeOperatorParser): OP_TYPE = "conv" - def get_layer_node_user_format( # type: ignore + def get_layer_node_user_format( self, - kernel_shape: list[int], - strides: list[int], - 
dilations: list[int], - group_size: int, - padding: list[int], - ia_shape: list[int], - oa_shape: list[int], + input_shape: list[int], + output_shape: list[int], ) -> dict[str, Any]: """ Generate the necessary dictionary items required for the LayerNode creation. """ - # convert the data types to precisions based on the onnx definition + predecessors = self.get_node_predecessors() + + # Extract extra attributes + attrs = self.node.attribute + kernel_shape: list[int] = get_attribute_ints_with_name("kernel_shape", attrs, default=None) # type:ignore + strides: list[int] = get_attribute_ints_with_name("strides", attrs, default=[1, 1]) # type:ignore + dilations: list[int] = get_attribute_ints_with_name("dilations", attrs, default=[1, 1]) # type:ignore + group_size: int = get_attribute_ints_with_name("group", attrs, default=1) # type:ignore + padding: list[int] = get_attribute_ints_with_name("pads", attrs, default=[0, 0, 0, 0]) # type:ignore + + # 1D Conv case: append dimensions of size 1 so equation holds. Conv in FY dimension + print(kernel_shape) + if len(kernel_shape) == 1: + kernel_shape.insert(0, 1) + input_shape.append(1) + output_shape.append(1) + strides.append(1) + dilations.append(1) + assert len(input_shape) == 4 + assert len(output_shape) == 4 + + if len(padding) == 2: + padding = 2 * padding - # Equation data: dict[str, Any] = {} data["id"] = self.node_id - data["name"] = f"Layer{self.node_id}" + data["name"] = self.node.name data["operator_type"] = ConvParser.OP_TYPE + # IMPORTANT: If any of the input loops require padding, they should be defined as the rightmost dimensions in # the equation. This is because we construct the dimensionality order and then add the padding to those last # dimensions in the order - if group_size > 1: - data["equation"] = "O[b][g][k][oy][ox]+=W[g][c][fy][fx]*I[b][g][c][iy][ix]" - else: - data["equation"] = "O[b][g][k][oy][ox]+=W[k][c][fy][fx]*I[b][g][c][iy][ix]" + weight_dim = "g" if group_size > 1 else "k" + data["equation"] = f"O[b][g][k][oy][ox]+=W[{weight_dim}][c][fy][fx]*I[b][g][c][iy][ix]" # Get dimension sizes from input parameters - assert ia_shape[0] == oa_shape[0], "Batch size is different for input and output activations." - B = oa_shape[0] + assert input_shape[0] == output_shape[0], "Batch size is different for input and output activations." + B = output_shape[0] G = group_size - K = ceil(oa_shape[1] / G) - OX = oa_shape[3] - OY = oa_shape[2] - C = ceil(ia_shape[1] / G) - IX = ia_shape[3] - IY = ia_shape[2] + K = ceil(output_shape[1] / G) + OX = output_shape[3] + OY = output_shape[2] + C = ceil(input_shape[1] / G) + IX = input_shape[3] + IY = input_shape[2] FX = kernel_shape[0] FY = kernel_shape[1] data["loop_dims"] = ["B", "K", "G", "OX", "OY", "C", "FX", "FY"] @@ -68,7 +83,8 @@ def get_layer_node_user_format( # type: ignore f"ix={strides[0]}*ox+{dilations[0]}*fx", f"iy={strides[1]}*oy+{dilations[1]}*fy", ] - data["operand_precision"] = {"O": 16, "O_final": 8, "W": 8, "I": 8} + data["operand_precision"] = self.get_operand_precision_user_format() + data["operand_source"] = self.get_operand_source_user_format(predecessors) # Add information wrt how this conv node's input/output tensors # are represented in the onnx model vs how they are represented in the equation above. 
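(A toy trace of the 1D-Conv promotion above, with made-up shapes; it assumes the ONNX node carries one-element strides/dilations attributes and no padding:)

    # hypothetical 1D Conv: input [1, 16, 128], 32 output channels, kernel size 3
    kernel_shape, strides, dilations = [3], [1], [1]
    input_shape, output_shape = [1, 16, 128], [1, 32, 126]
    # after the `len(kernel_shape) == 1` branch:
    #   kernel_shape == [1, 3]               -> FX = 1, FY = 3
    #   strides == dilations == [1, 1]
    #   input_shape == [1, 16, 128, 1]       -> B = 1, C = 16, IY = 128, IX = 1
    #   output_shape == [1, 32, 126, 1]      -> B = 1, K = 32, OY = 126, OX = 1
    # i.e. the convolution effectively runs along the FY/OY axis only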
@@ -83,49 +99,16 @@ def get_layer_node_user_format( # type: ignore [padding[1], padding[3]], ] - # Find the previous layer(s) that should be this node's parent(s) - node_inputs = self.node.input - assert len(node_inputs) >= 2, f"Conv should have at least two input names, but has: {node_inputs}." - (first_input_name, second_input_name) = node_inputs[:2] - - source_list_I = [ - src for (src, src_output_names) in self.nodes_outputs.items() if first_input_name in src_output_names - ] - source_list_W = [ - src for (src, src_output_names) in self.nodes_outputs.items() if second_input_name in src_output_names - ] - assert len(source_list_I) <= 1 - assert len(source_list_W) <= 1 - - source_I = source_list_I[0] if len(source_list_I) == 1 else self.node_id - source_W = source_list_W[0] if len(source_list_W) == 1 else self.node_id - - data["operand_source"] = { - "I": source_I, - "W": source_W, - } - return data def generate_node(self): - attrs = self.node.attribute - kernel_shape: list[int] = get_attribute_ints_with_name("kernel_shape", attrs, default=None) # type:ignore - strides: list[int] = get_attribute_ints_with_name("strides", attrs, default=[1, 1]) # type:ignore - dilations: list[int] = get_attribute_ints_with_name("dilations", attrs, default=[1, 1]) # type:ignore - group_size: int = get_attribute_ints_with_name("group", attrs, default=1) # type:ignore - padding: list[int] = get_attribute_ints_with_name("pads", attrs, default=[0, 0, 0, 0]) # type:ignore # Get the input and output activation shapes - ia_dimension_shape, oa_dimension_shape = get_node_input_output_dimension_shapes(self.node, self.onnx_model) + input_shape, output_shape = get_node_input_output_dimension_shapes(self.node, self.onnx_model) node_data: dict[str, Any] = self.get_layer_node_user_format( - kernel_shape, - strides, - dilations, - group_size, - padding, - ia_dimension_shape, - oa_dimension_shape, + input_shape, + output_shape, ) node_factory = LayerNodeFactory(node_data, mapping_data=None) diff --git a/stream/parser/onnx/einsum.py b/stream/parser/onnx/einsum.py new file mode 100644 index 00000000..003ed5ab --- /dev/null +++ b/stream/parser/onnx/einsum.py @@ -0,0 +1,92 @@ +import logging +import re +from typing import Any + +from stream.onnx_utils import get_onnx_input_shapes, get_onnx_output_shapes +from stream.parser.onnx.operator_parser import OnnxComputeOperatorParser + +logger = logging.getLogger(__name__) + + +class EinsumParser(OnnxComputeOperatorParser): + + def get_einsum_equation(self): + ATTR_NAME = "equation" + + attrs_names = [attr.name for attr in self.node.attribute] + name_idx = attrs_names.index(ATTR_NAME) + value = self.node.attribute[name_idx] + return str(value) + + def get_layer_dims_per_op(self): + einsum_equation = self.get_einsum_equation() + + return re.split(",|->", einsum_equation) + + def get_layer_equation(self, layer_dims_per_op: list[str]): + def put_in_brackets(s: str): + """e.g. 
`abc` -> `[a][b][c]""" + return "".join([f"[{char}]" for char in s]) + + if len(layer_dims_per_op) != 3: + raise NotImplementedError + + dims_I, dims_W, dims_O = layer_dims_per_op + equation = f"O{put_in_brackets(dims_O)}+=I{put_in_brackets(dims_I)}*{put_in_brackets(dims_W)}" + return equation + + # def get_layer_dims(self, layer_dims_per_op: list[str]): + # all_dims = {char.upper() for group in layer_dims_per_op for char in group} + # return list(all_dims) + + def get_layer_dim_sizes_dict(self, layer_dims_per_op: list[str]): + input_shapes = get_onnx_input_shapes(self.node, self.onnx_model) + output_shapes = get_onnx_output_shapes(self.node, self.onnx_model) + + if len(output_shapes) != 1: + raise ValueError("Einsum with more than one output not supported") + + shapes = input_shapes + output_shapes + + if len(layer_dims_per_op) != len(shapes): + raise ValueError("Einsum equation has more parts than node inputs") + + sizes_dict: dict[str, int] = {} + for layer_dims, sizes in zip(layer_dims_per_op, shapes): + if len(layer_dims) != len(sizes): + # TODO is the order of the equation guaranteed to be the same as the input order? + raise ValueError(f"Einsum equation part {layer_dims} and operand input shape {sizes} do not match") + for layer_dim, size in zip(layer_dims.upper(), sizes): + if layer_dim not in sizes_dict: + sizes_dict[layer_dim] = size + else: + if sizes_dict[layer_dim] != size: + raise ValueError(f"Not clear what the size of {layer_dim} is in Einsum") + + return sizes_dict + + def get_layer_node_user_format( + self, + input_shape: list[int], # Argument required because of a caller function in superclass + output_shape: list[int], # TODO put shape logic in this method for all `OnnxComputeOperatorParser` subclasses + ) -> dict[str, Any]: + """! 
Generate layer data in user input format for Einsum.""" + predecessors = self.get_node_predecessors() + + data: dict[str, Any] = {} + data["id"] = self.node_id + data["name"] = self.node.name + data["operator_type"] = self.node.op_type + data["dimension_relations"] = [] + data["operand_source"] = self.get_operand_source_user_format(predecessors) + data["operand_precision"] = self.get_operand_precision_user_format() + + # + layer_dims_per_op = self.get_layer_dims_per_op() + sizes_dict = self.get_layer_dim_sizes_dict(layer_dims_per_op) + + data["loop_dims"] = list(sizes_dict.keys()) + data["loop_sizes"] = list(sizes_dict.values()) + data["equation"] = self.get_layer_equation(layer_dims_per_op) + + return data diff --git a/stream/parser/onnx/model.py b/stream/parser/onnx/model.py index 3de76808..b465980c 100644 --- a/stream/parser/onnx/model.py +++ b/stream/parser/onnx/model.py @@ -5,6 +5,7 @@ from zigzag.parser.onnx.utils import parse_onnx_model_from_path from stream.hardware.architecture.accelerator import Accelerator +from stream.onnx_utils import get_onnx_input_shapes, has_asymmetric_input_data from stream.parser.onnx.asymmetric_simd import AsymmetricSimdParser from stream.parser.onnx.concat import ConcatParser from stream.parser.onnx.conv import ConvParser @@ -20,7 +21,6 @@ from stream.parser.onnx.simd import SimdParser from stream.parser.onnx.softmax import SoftmaxParser from stream.parser.onnx.transpose import TransposeParser -from stream.utils import get_onnx_input_shapes, has_asymmetric_input_data from stream.workload.mapping import InterCoreMappingAttributes from stream.workload.onnx_workload import ONNXWorkload diff --git a/stream/parser/onnx/operator_parser.py b/stream/parser/onnx/operator_parser.py index 343b2665..fdd7300e 100644 --- a/stream/parser/onnx/operator_parser.py +++ b/stream/parser/onnx/operator_parser.py @@ -58,10 +58,10 @@ def run(self) -> Generator[ComputationNode, None, None]: @abstractmethod def get_layer_node_user_format(self, input_shape: list[int], output_shape: list[int]) -> dict[str, Any]: ... 
- def get_operand_precision_input_format(self) -> dict[str, int]: - act_precision = self.get_activation_precision() - weight_precision = self.get_weight_precision() - intermediate_output_precision = self.get_intermediate_output_precision() + def get_operand_precision_user_format(self) -> dict[str, int]: + act_precision: int = self.get_activation_precision() + weight_precision: int = self.get_weight_precision() + intermediate_output_precision: int = self.get_intermediate_output_precision() predecessors = self.get_node_predecessors() match len(predecessors): case 1: diff --git a/stream/parser/onnx/reduce_1d.py b/stream/parser/onnx/reduce_1d.py index be4ecc1e..b34289b0 100644 --- a/stream/parser/onnx/reduce_1d.py +++ b/stream/parser/onnx/reduce_1d.py @@ -20,7 +20,7 @@ def get_layer_node_user_format(self, input_shape: list[int], output_shape: list[ data["name"] = self.node.name data["operator_type"] = self.node.op_type data["operand_source"] = self.get_operand_source_input_format() - data["operand_precision"] = self.get_operand_precision_input_format() + data["operand_precision"] = self.get_operand_precision_user_format() data["dimension_relations"] = [] data["loop_sizes"] = input_shape diff --git a/stream/parser/onnx/simd.py b/stream/parser/onnx/simd.py index 2c5ae21d..9dae37d3 100644 --- a/stream/parser/onnx/simd.py +++ b/stream/parser/onnx/simd.py @@ -22,7 +22,7 @@ def get_layer_node_user_format(self, input_shape: list[int], output_shape: list[ data["name"] = self.node.name data["operator_type"] = self.node.op_type data["operand_source"] = self.get_operand_source_input_format() - data["operand_precision"] = self.get_operand_precision_input_format() + data["operand_precision"] = self.get_operand_precision_user_format() data["dimension_relations"] = [] data["loop_sizes"] = output_shape diff --git a/stream/parser/onnx/softmax.py b/stream/parser/onnx/softmax.py index 3f0a0506..25703b26 100644 --- a/stream/parser/onnx/softmax.py +++ b/stream/parser/onnx/softmax.py @@ -93,7 +93,7 @@ def get_layer_node_user_format(self, input_shape: list[int], output_shape: list[ data["name"] = self.node.name data["operator_type"] = self.node.op_type data["operand_source"] = self.get_operand_source_input_format() - data["operand_precision"] = self.get_operand_precision_input_format() + data["operand_precision"] = self.get_operand_precision_user_format() data["dimension_relations"] = [] data["loop_sizes"] = input_shape diff --git a/stream/utils.py b/stream/utils.py index 67f93f7a..06328b57 100644 --- a/stream/utils.py +++ b/stream/utils.py @@ -4,11 +4,9 @@ from typing import TYPE_CHECKING, Any, TypeAlias from numpy.typing import NDArray -from onnx import ModelProto, NodeProto from zigzag.cost_model.cost_model import CostModelEvaluation from zigzag.datatypes import MemoryOperand from zigzag.mapping.data_movement import FourWayDataMoving -from zigzag.parser.onnx.utils import get_onnx_tensor_type from stream.hardware.architecture.core import Core from stream.workload.mapping import TILING_T @@ -21,25 +19,6 @@ ARRAY_T: TypeAlias = NDArray[Any] -def get_onnx_input_shapes(node: NodeProto, onnx_model: ModelProto) -> tuple[list[int], list[int]]: - if len(node.input) != 2: - raise ValueError(f"Node {node.name} does not have two inputs") - input_name1 = node.input[0] - input_name2 = node.input[1] - input_shape1 = get_onnx_tensor_type(input_name1, onnx_model).shape - input_shape2 = get_onnx_tensor_type(input_name2, onnx_model).shape - return input_shape1, input_shape2 - - -def has_asymmetric_input_data(node: NodeProto, 
onnx_model: ModelProto): - """Return true iff the node has two inputs and the input nodes have a different shape""" - if len(node.input) != 2: - return False - - input_shape1, input_shape2 = get_onnx_input_shapes(node, onnx_model) - return input_shape1 != input_shape2 - - def get_too_large_operands(cme: CostModelEvaluation, accelerator: "Accelerator", core_id: int) -> list[MemoryOperand]: """Create a list of memory operands for which an extra memory level (i.e. offchip) was added. From 742a5f917c34d587d7763f2b5689ce94dd3cf55f Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Tue, 5 Nov 2024 16:57:43 +0100 Subject: [PATCH 02/49] add SplitNode --- stream/node_tensor.py | 4 ++ stream/onnx_utils.py | 65 ++++++++++++++++++- stream/parser/onnx/asymmetric_simd.py | 2 + stream/parser/onnx/concat.py | 23 ++++--- stream/parser/onnx/conv.py | 2 + stream/parser/onnx/default.py | 2 + stream/parser/onnx/elementwise.py | 3 + stream/parser/onnx/flatten.py | 9 ++- stream/parser/onnx/gather.py | 14 +--- stream/parser/onnx/lpnormalization.py | 3 + stream/parser/onnx/model.py | 3 + stream/parser/onnx/operator_parser.py | 7 +- stream/parser/onnx/pooling.py | 2 + stream/parser/onnx/reshape.py | 2 + stream/parser/onnx/split.py | 32 +++++++++ stream/parser/onnx/transpose.py | 2 + .../generation/tiled_workload_generation.py | 49 ++++---------- .../workload/computation/computation_node.py | 2 + .../dependency_propagation/concat_node.py | 20 ++---- .../dependency_propagation/dummy_node.py | 19 +++--- .../elementwise_node.py | 19 +++--- .../dependency_propagation/flatten_node.py | 31 ++++----- .../dependency_propagation/gather_node.py | 20 ++---- .../propagation_node.py | 28 ++++++++ .../dependency_propagation/reshape_node.py | 22 ++----- .../dependency_propagation/split_node.py | 56 ++++++++++++++++ .../dependency_propagation/transpose_node.py | 28 +++----- stream/workload/node.py | 15 +++-- 28 files changed, 309 insertions(+), 175 deletions(-) create mode 100644 stream/parser/onnx/split.py create mode 100644 stream/workload/dependency_propagation/propagation_node.py create mode 100644 stream/workload/dependency_propagation/split_node.py diff --git a/stream/node_tensor.py b/stream/node_tensor.py index 98aed143..ad7fdb62 100644 --- a/stream/node_tensor.py +++ b/stream/node_tensor.py @@ -125,6 +125,10 @@ def gather(self, gather_indices: int | list[int], axis: int) -> "NodeTensor": axis = axis - 1 if axis < 0 else axis return (np.take(self.as_ndarray(), gather_indices, axis=axis)).view(NodeTensor) + def split(self, split_indices: list[int], axis: int) -> "list[NodeTensor]": + axis = axis - 1 if axis < 0 else axis + return [t.view(NodeTensor) for t in np.split(self.as_ndarray(), split_indices, axis=axis)] + def concat_with_empty(self, shape: tuple[int, ...], axis: int, variable_input_first: bool): empty_shape = self.convert_to_full_shape(shape) empty_tensor = np.zeros(empty_shape, dtype=object) diff --git a/stream/onnx_utils.py b/stream/onnx_utils.py index 07dba98d..150d3701 100644 --- a/stream/onnx_utils.py +++ b/stream/onnx_utils.py @@ -1,6 +1,40 @@ -from onnx import ModelProto, NodeProto +from onnx import AttributeProto, ModelProto, NodeProto, numpy_helper from zigzag.parser.onnx.utils import get_onnx_tensor_type +import numpy as np +import onnx + + +def get_attribute_as_ints( + node: NodeProto, attribute_name: str, default: list[int] | int | None = None +) -> list[int] | int: + """! 
Return the value of an attribute of given name from the given attributes + If name does not exist in attrs, the default provided by the caller is used. + If the caller doesn't supply a default, an error is thrown. + + """ + attrs = node.attribute + attrs_names = [attr.name for attr in attrs] + try: + name_idx = attrs_names.index(attribute_name) + value = attrs[name_idx] + attr_type = value.type + if attr_type == AttributeProto.AttributeType.INT: # type: ignore + return int(value.i) + elif attr_type == AttributeProto.AttributeType.INTS: # type: ignore + return list(value.ints) + elif attr_type == AttributeProto.AttributeType.TENSOR: # type: ignore + return list(numpy_helper.to_array(value.t).tolist()) # type: ignore + else: + raise NotImplementedError(f"Attribute extraction of type {attr_type} not supported.") + except ValueError as exc: + if default is not None: + return default + else: + raise ValueError( + f"Node {node.name} has no attribute called {attribute_name} and no default was given. Attributes = {attrs_names}." + ) from exc + def get_onnx_input_shapes(node: NodeProto, onnx_model: ModelProto) -> list[list[int]]: """Return the shape of each input operand""" @@ -24,3 +58,32 @@ def has_asymmetric_input_data(node: NodeProto, onnx_model: ModelProto): input_shape1, input_shape2 = get_onnx_input_shapes(node, onnx_model) return input_shape1 != input_shape2 + + +def get_axis_attribute(node: NodeProto): + """Find the value of the axis associated with this ONNX node""" + ATTR_NAME = "axis" + + value = get_attribute_as_ints(node, ATTR_NAME) + if not isinstance(value, int): + raise ValueError(f"{ATTR_NAME} attribute as list of ints not supported") + return value + + +def get_split_attribute(node: NodeProto, onnx_model: ModelProto): + # ATTR_NAME = "split" + + output_name = next(n for n in node.input if "split" in n.lower()) + + for node in onnx_model.graph.node: + if node.op_type == "Constant" and node.output[0] == output_name: + for attr in node.attribute: + if attr.name == "value": + tensor = attr.t # This is an ONNX TensorProto + # Decode tensor to a numpy array + array = np.frombuffer(tensor.raw_data, dtype=int) + array = array.reshape([dim for dim in tensor.dims]) + + return [int(i) for i in array] + + raise ValueError diff --git a/stream/parser/onnx/asymmetric_simd.py b/stream/parser/onnx/asymmetric_simd.py index 027d0af3..3bedfa15 100644 --- a/stream/parser/onnx/asymmetric_simd.py +++ b/stream/parser/onnx/asymmetric_simd.py @@ -61,6 +61,7 @@ def generate_node(self): node_factory = LayerNodeFactory(node_data, mapping_data=None) node_attrs = node_factory.create_node_attr() mapping = self.get_mapping_this_node() + input_names = list(self.node.input) return ComputationNode( node_id=self.node_id, @@ -68,4 +69,5 @@ def generate_node(self): node_attr=node_attrs, mapping_attr=mapping, op_type=self.node.op_type, + input_names=input_names, ) diff --git a/stream/parser/onnx/concat.py b/stream/parser/onnx/concat.py index 3c63643e..229db1b9 100644 --- a/stream/parser/onnx/concat.py +++ b/stream/parser/onnx/concat.py @@ -7,10 +7,21 @@ class ConcatParser(OnnxOperatorParser): """Parses an onnx gather operator into a ConcatNode.""" + def get_axis_value(self): + AXIS_ATTR = "axis" + + """Find the value of the axis associated with this concat node in ONNX""" + # `axis` is an attribute of the node + try: + axis_attr = next(filter(lambda x: x.name == AXIS_ATTR, self.node.attribute)) + return axis_attr.i + except StopIteration: + raise ValueError("Axis attribute not found in ONNX node") + def 
generate_node(self): predecessors = self.get_node_predecessors() - axis = self.get_axis_value() + input_names = list(self.node.input) input_1, input_2 = self.node.input[0], self.node.input[1] @@ -36,13 +47,5 @@ def generate_node(self): axis=axis, constant_shape=constant_shape, variable_input_first=variable_input_first, + input_names=input_names, ) - - def get_axis_value(self): - """Find the value of the axis associated with this concat node in ONNX""" - # `axis` is an attribute of the node - try: - axis_attr = next(filter(lambda x: x.name == "axis", self.node.attribute)) - return axis_attr.i - except StopIteration: - raise ValueError("Axis attribute not found in ONNX node") diff --git a/stream/parser/onnx/conv.py b/stream/parser/onnx/conv.py index a5bc8aff..10f8566e 100644 --- a/stream/parser/onnx/conv.py +++ b/stream/parser/onnx/conv.py @@ -114,6 +114,7 @@ def generate_node(self): node_factory = LayerNodeFactory(node_data, mapping_data=None) node_attrs = node_factory.create_node_attr() mapping = self.get_mapping_this_node() + input_names = list(self.node.input) return ComputationNode( node_id=self.node_id, @@ -122,4 +123,5 @@ def generate_node(self): mapping_attr=mapping, op_type=ConvParser.OP_TYPE, operand_tensor_reshape=None, + input_names=input_names, ) diff --git a/stream/parser/onnx/default.py b/stream/parser/onnx/default.py index 8bdd3f99..645fc88a 100644 --- a/stream/parser/onnx/default.py +++ b/stream/parser/onnx/default.py @@ -7,10 +7,12 @@ class DefaultNodeParser(OnnxOperatorParser): def generate_node(self): predecessors = self.get_node_predecessors() + input_names = list(self.node.input) return DummyNode( node_id=self.node_id, node_name=self.node.name, predecessors=predecessors, op_type=self.node.op_type.lower(), + input_names=input_names, ) diff --git a/stream/parser/onnx/elementwise.py b/stream/parser/onnx/elementwise.py index d7b68a55..55e035d8 100644 --- a/stream/parser/onnx/elementwise.py +++ b/stream/parser/onnx/elementwise.py @@ -14,6 +14,8 @@ def __init__(self, node_id, node, nodes_outputs, mapping, onnx_model) -> None: self.name = node.name def generate_node(self): + input_names = list(self.node.input) + # Get the predecessors of this node predecessors = [] for node_input in self.node.input: @@ -28,5 +30,6 @@ def generate_node(self): node_id=self.node_id, node_name=self.name, predecessor=predecessors, + input_names=input_names, ) return node_obj diff --git a/stream/parser/onnx/flatten.py b/stream/parser/onnx/flatten.py index 215f6676..35e4f0c0 100644 --- a/stream/parser/onnx/flatten.py +++ b/stream/parser/onnx/flatten.py @@ -1,5 +1,3 @@ -from zigzag.parser.onnx.utils import get_attribute_ints_with_name - from stream.parser.onnx.operator_parser import OnnxOperatorParser from stream.workload.dependency_propagation.flatten_node import FlattenNode @@ -12,12 +10,13 @@ def generate_node(self): assert len(predecessors) == 1 predecessor = predecessors[0] - attrs = self.node.attribute - # Get the axis which indicates how to flatten the input tensor - axis: int | None = get_attribute_ints_with_name("axis", attrs, default=None) # type: ignore + input_names = list(self.node.input) + axis = self.get_axis_attribute() + return FlattenNode( node_id=self.node_id, node_name=self.node.name, predecessor=predecessor, axis=axis, + input_names=input_names, ) diff --git a/stream/parser/onnx/gather.py b/stream/parser/onnx/gather.py index e7a32cc9..b9c3fde2 100644 --- a/stream/parser/onnx/gather.py +++ b/stream/parser/onnx/gather.py @@ -9,8 +9,9 @@ class GatherParser(OnnxOperatorParser): def 
generate_node(self): predecessors = self.get_node_predecessors() - axis = self.get_axis_value() + axis = self.get_axis_attribute() indices = self.get_indices_value() + input_names = list(self.node.input) return GatherNode( node_id=self.node_id, @@ -18,6 +19,7 @@ def generate_node(self): predecessors=predecessors, gather_axis=axis, gather_indices=indices, + input_names=input_names, ) def get_indices_value(self): @@ -39,13 +41,3 @@ def get_indices_value(self): indices = DEFAULT return indices - - def get_axis_value(self): - """Find the value of the axis associated with this gather node in ONNX""" - # `axis` is an attribute of the node - try: - axis_attr = next(filter(lambda x: x.name == "axis", self.node.attribute)) - axis = axis_attr.i - except StopIteration: - axis = 0 - return axis diff --git a/stream/parser/onnx/lpnormalization.py b/stream/parser/onnx/lpnormalization.py index 0ca2569f..6f4ddc5b 100644 --- a/stream/parser/onnx/lpnormalization.py +++ b/stream/parser/onnx/lpnormalization.py @@ -11,6 +11,8 @@ def __init__(self, node_id, node, nodes_outputs, mapping, onnx_model) -> None: super().__init__(node_id, node, nodes_outputs, mapping, onnx_model) def generate_node(self): + input_names = list(self.node.input) + # Get the predecessors of this node # TODO use superclass' `get_node_predecessors` predecessors = [] @@ -23,5 +25,6 @@ def generate_node(self): node_id=self.node_id, node_name=self.node_name, predecessor=self.predecessor, + input_names=input_names, ) return node_obj diff --git a/stream/parser/onnx/model.py b/stream/parser/onnx/model.py index b465980c..a648fba8 100644 --- a/stream/parser/onnx/model.py +++ b/stream/parser/onnx/model.py @@ -20,6 +20,7 @@ from stream.parser.onnx.reshape import ReshapeParser from stream.parser.onnx.simd import SimdParser from stream.parser.onnx.softmax import SoftmaxParser +from stream.parser.onnx.split import SplitParser from stream.parser.onnx.transpose import TransposeParser from stream.workload.mapping import InterCoreMappingAttributes from stream.workload.onnx_workload import ONNXWorkload @@ -46,12 +47,14 @@ class ONNXModelParser: "Relu": SimdParser, "Gelu": SimdParser, "Silu": SimdParser, + # Dependency propagation "LpNormalization": LpNormalizationParser, "Gather": GatherParser, "Transpose": TransposeParser, "Reshape": ReshapeParser, "Flatten": FlattenParser, "Concat": ConcatParser, + "Split": SplitParser, } def __init__( diff --git a/stream/parser/onnx/operator_parser.py b/stream/parser/onnx/operator_parser.py index fdd7300e..a4345895 100644 --- a/stream/parser/onnx/operator_parser.py +++ b/stream/parser/onnx/operator_parser.py @@ -7,6 +7,7 @@ from zigzag.parser.workload_factory import LayerNodeFactory from stream.hardware.architecture.accelerator import Accelerator +from stream.onnx_utils import get_axis_attribute from stream.workload.computation.computation_node import ComputationNode from stream.workload.mapping import InterCoreMappingAttributes from stream.workload.node import Node @@ -49,6 +50,9 @@ def get_operand_source_input_format(self): case _: raise ValueError("No more than 2 layer predecessors expected") + def get_axis_attribute(self): + return get_axis_attribute(self.node) + class OnnxComputeOperatorParser(OnnxOperatorParser, metaclass=ABCMeta): @@ -120,8 +124,8 @@ def generate_node(self): node_data = self.get_layer_node_user_format(input_shape, output_shape) node_factory = LayerNodeFactory(node_data, mapping_data=[]) node_attrs = node_factory.create_node_attr() - mapping = self.get_mapping_this_node() + input_names = 
list(self.node.input) return ComputationNode( node_id=self.node_id, @@ -129,4 +133,5 @@ def generate_node(self): op_type=self.node.op_type, node_attr=node_attrs, mapping_attr=mapping, + input_names=input_names, ) diff --git a/stream/parser/onnx/pooling.py b/stream/parser/onnx/pooling.py index ff120f9f..780efbec 100644 --- a/stream/parser/onnx/pooling.py +++ b/stream/parser/onnx/pooling.py @@ -117,10 +117,12 @@ def generate_node(self): node_factory = LayerNodeFactory(node_data, None) node_attrs = node_factory.create_node_attr() mapping = self.get_mapping_this_node() + input_names = list(self.node.input) return PoolingNode( node_id=self.node_id, node_name=self.node.name, node_attr=node_attrs, mapping_attr=mapping, + input_names=input_names, ) diff --git a/stream/parser/onnx/reshape.py b/stream/parser/onnx/reshape.py index 325eb378..1ed9c193 100644 --- a/stream/parser/onnx/reshape.py +++ b/stream/parser/onnx/reshape.py @@ -14,10 +14,12 @@ def generate_node(self): # The operator shape is saved as the second input, so we need to get the input's dimension shape shape = tuple(get_node_input_output_dimension_shapes(self.node, self.onnx_model)[1]) + input_names = list(self.node.input) return ReshapeNode( node_id=self.node_id, node_name=self.node.name, predecessor=predecessor, shape=shape, + input_names=input_names, ) diff --git a/stream/parser/onnx/split.py b/stream/parser/onnx/split.py new file mode 100644 index 00000000..95d1967b --- /dev/null +++ b/stream/parser/onnx/split.py @@ -0,0 +1,32 @@ +from stream.onnx_utils import get_split_attribute +from stream.parser.onnx.operator_parser import OnnxOperatorParser +from stream.workload.dependency_propagation.split_node import SplitNode + + +class SplitParser(OnnxOperatorParser): + """Parses an onnx gather operator into a SplitNode.""" + + def generate_node(self): + # Single predecessor + predecessors = self.get_node_predecessors() + if len(predecessors) > 1: + raise ValueError("Split node should not have more than one input") + predecessor = predecessors.pop() + + axis = self.get_axis_attribute() + splits = get_split_attribute(self.node, self.onnx_model) + input_names = list(self.node.input) + output_names = list(self.node.output) + + if len(splits) != len(output_names): + raise ValueError + + return SplitNode( + node_id=self.node_id, + node_name=self.node.name, + predecessor=predecessor, + axis=axis, + splits=splits, + input_names=input_names, + output_names=output_names, + ) diff --git a/stream/parser/onnx/transpose.py b/stream/parser/onnx/transpose.py index ba6dae2a..0b3bcb7a 100644 --- a/stream/parser/onnx/transpose.py +++ b/stream/parser/onnx/transpose.py @@ -11,12 +11,14 @@ def generate_node(self): predecessor = predecessors.pop() permute_axes = self.get_permute_indices() + input_names = list(self.node.input) return TransposeNode( node_id=self.node_id, node_name=self.node.name, predecessor=predecessor, permute_axes=permute_axes, + input_names=input_names, ) def get_permute_indices(self): diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index df5b4953..92853964 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -19,12 +19,7 @@ from stream.workload.computation.computation_node import ComputationNode, LoopRanges from stream.workload.dependency_propagation.concat_node import ConcatNode from stream.workload.dependency_propagation.dummy_node import DummyNode -from 
stream.workload.dependency_propagation.elementwise_node import ElementwiseNode -from stream.workload.dependency_propagation.flatten_node import FlattenNode -from stream.workload.dependency_propagation.gather_node import GatherNode -from stream.workload.dependency_propagation.lpnormalization_node import LpNormalizationNode -from stream.workload.dependency_propagation.reshape_node import ReshapeNode -from stream.workload.dependency_propagation.transpose_node import TransposeNode +from stream.workload.dependency_propagation.propagation_node import PropagationNode from stream.workload.dnn_workload import DNNWorkloadStream from stream.workload.node import Node from stream.workload.onnx_workload import ComputationNodeWorkload, ONNXWorkload @@ -585,6 +580,7 @@ def get_tensor_cn_for_op(node: ComputationNode, dependent_operand: LayerOperand) assert ( len(paths_between) > 0 ), "No paths between producer and consumer found without ComputationNode in intermediates." + for path_between in paths_between: # First node in the path is a ComputationNode, of which we extract the output operand dependency tensor first_node = path_between[0] @@ -592,10 +588,10 @@ def get_tensor_cn_for_op(node: ComputationNode, dependent_operand: LayerOperand) tensor = get_tensor_cn_for_op(first_node, dependent_operand=Constants.OUTPUT_LAYER_OP) # Propagate through intermediate, non-computation nodes - for _, node in enumerate(path_between[1:-1], start=1): - if isinstance(node, ComputationNode): - raise ValueError("Intermediate nodes should not be of type ComputationNode.") - tensor = self.propagate_cn_production_for_non_cn(node, tensor) + for i, node in enumerate(path_between[1:-1], start=1): + assert isinstance(node, PropagationNode), "Intermediate nodes should not be of type ComputationNode" + next_node = path_between[i + 1] + tensor = node.propagate(tensor, next_node) # Final node: Computation node final_node: ComputationNode = path_between[-1] # type: ignore @@ -607,7 +603,7 @@ def get_tensor_cn_for_op(node: ComputationNode, dependent_operand: LayerOperand) ) # Error handling of shape mismatches in tensor propagation - def get_final_tensor_alt_operand(): + def _get_final_tensor_alt_operand(): """Error handling case 1: sources for `W` and `I` operand are swapped for this node -> try the other one""" try: @@ -617,7 +613,7 @@ def get_final_tensor_alt_operand(): raise TensorDimensionMismatchException return get_tensor_cn_for_op(final_node, alt_operand) - def get_shape_inferred_propagated_tensor(tensor: NodeTensor, final_tensor: NodeTensor): + def _get_shape_inferred_propagated_tensor(tensor: NodeTensor, final_tensor: NodeTensor): """Error handling case 2: dimensions of ComputationNode (`final_tensor`) were altered by stream (e.g. to be properly divisible) but this is not reflected in `ConcatNode` with constant shape. 
-> manually fix shape""" @@ -644,17 +640,17 @@ def get_shape_inferred_propagated_tensor(tensor: NodeTensor, final_tensor: NodeT inter_edges = self.get_inter_edges_tensor_based(tensor, final_tensor) except TensorDimensionMismatchException: try: # Error case 1 - final_tensor = get_final_tensor_alt_operand() + final_tensor = _get_final_tensor_alt_operand() inter_edges = self.get_inter_edges_tensor_based(tensor, final_tensor) except TensorDimensionMismatchException: try: # Error case 2 final_tensor = get_tensor_cn_for_op(final_node, dependent_operand) - tensor = get_shape_inferred_propagated_tensor(tensor, final_tensor) + tensor = _get_shape_inferred_propagated_tensor(tensor, final_tensor) inter_edges = self.get_inter_edges_tensor_based(tensor, final_tensor) except TensorDimensionMismatchException: # Error case 1 and 2 combined - final_tensor = get_final_tensor_alt_operand() - tensor = get_shape_inferred_propagated_tensor(tensor, final_tensor) + final_tensor = _get_final_tensor_alt_operand() + tensor = _get_shape_inferred_propagated_tensor(tensor, final_tensor) inter_edges = self.get_inter_edges_tensor_based(tensor, final_tensor) for producer, cons in inter_edges: @@ -670,27 +666,6 @@ def get_shape_inferred_propagated_tensor(tensor: NodeTensor, final_tensor: NodeT ) return all_inter_edges - def propagate_cn_production_for_non_cn(self, node: Node, input_tensor: NodeTensor) -> NodeTensor: - match node: - case ReshapeNode(): - return node.reshape_operand_tensor(input_tensor) - case TransposeNode(): - return node.transpose(input_tensor) - case LpNormalizationNode(): - return node.lpnormalization_operand_tensor(input_tensor) - case FlattenNode(): - return node.flatten(input_tensor) - case ElementwiseNode(): - return input_tensor.copy() - case GatherNode(): - return node.gather_operand_tensor(input_tensor) - case ConcatNode(): - return node.concat(input_tensor) - case DummyNode(): - return input_tensor - case _: - raise NotImplementedError(f"Tensor propagation not implemented for node {node.name}.") - @staticmethod def get_inter_edges_tensor_based(producer_output_tensor: NodeTensor, consumer_input_tensor: NodeTensor): """This method obtains the edges between a producer and consumer. 
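(The per-node `propagate` methods introduced here replace the old `propagate_cn_production_for_non_cn` dispatch. A minimal sketch of the new call pattern, with a hypothetical three-hop producer/consumer path:)

    # path_between = [producer_cn, reshape_node, split_node, consumer_cn]
    tensor = get_tensor_cn_for_op(path_between[0], dependent_operand=Constants.OUTPUT_LAYER_OP)
    for i, node in enumerate(path_between[1:-1], start=1):
        next_node = path_between[i + 1]  # lets e.g. SplitNode pick the output branch feeding the consumer
        tensor = node.propagate(tensor, next_node)
    final_tensor = get_tensor_cn_for_op(path_between[-1], dependent_operand)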
diff --git a/stream/workload/computation/computation_node.py b/stream/workload/computation/computation_node.py index cad237c5..58e7e535 100644 --- a/stream/workload/computation/computation_node.py +++ b/stream/workload/computation/computation_node.py @@ -61,6 +61,7 @@ def __init__( produces_final_output: bool = False, group_id: int = 0, sub_id: int = -1, # To distinguish alternative versions of this node + input_names: list[str] = [], ): op_type = op_type.lower() @@ -76,6 +77,7 @@ def __init__( offchip_energy=0, runtime=0, possible_core_allocation=mapping_attr.core_allocation, + input_names=input_names, ) # Overwrite default spatial mapping with given one diff --git a/stream/workload/dependency_propagation/concat_node.py b/stream/workload/dependency_propagation/concat_node.py index 113aba48..acd956a5 100644 --- a/stream/workload/dependency_propagation/concat_node.py +++ b/stream/workload/dependency_propagation/concat_node.py @@ -1,11 +1,11 @@ from zigzag.datatypes import LayerOperand -from zigzag.workload.layer_node_abc import LayerNodeABC from stream.node_tensor import NodeTensor +from stream.workload.dependency_propagation.propagation_node import PropagationNode from stream.workload.node import Node -class ConcatNode(Node, LayerNodeABC): +class ConcatNode(PropagationNode): """Class that represents an onnx Concat node with one constant input.""" def __init__( @@ -16,6 +16,7 @@ def __init__( axis: int, constant_shape: tuple[int, ...], variable_input_first: bool, + input_names: list[str] = [], ) -> None: """Initialize the ConcatNode @@ -26,17 +27,8 @@ def __init__( variable_input_first: Wether the result is `concat(input, constant_tensor)` or `concat(constant_tensor, input)` """ - Node.__init__( - self, - node_id=node_id, - node_name=node_name, - type="gather", - onchip_energy=0, - offchip_energy=0, - runtime=0, - possible_core_allocation=[-1], - ) - LayerNodeABC.__init__(self, node_id=node_id, node_name=node_name) + op_type = "concat" + super().__init__(node_id, node_name, op_type, input_names) self.axis = axis self.constant_shape = constant_shape @@ -53,7 +45,7 @@ def __init__( case _: raise ValueError("More than two inputs for ConcatNode") - def concat(self, tensor: NodeTensor) -> NodeTensor: + def propagate(self, tensor: NodeTensor, next_node: Node | None = None) -> NodeTensor: """Perform gather operation on the tensor.""" return tensor.concat_with_empty( shape=self.constant_shape, axis=self.axis, variable_input_first=self.variable_input_first diff --git a/stream/workload/dependency_propagation/dummy_node.py b/stream/workload/dependency_propagation/dummy_node.py index e24dc0bd..9e26f04e 100644 --- a/stream/workload/dependency_propagation/dummy_node.py +++ b/stream/workload/dependency_propagation/dummy_node.py @@ -1,9 +1,11 @@ from zigzag.workload.dummy_node import DummyNode as DummyNodeZigZag +from stream.node_tensor import NodeTensor +from stream.workload.dependency_propagation.propagation_node import PropagationNode from stream.workload.node import Node -class DummyNode(DummyNodeZigZag, Node): +class DummyNode(DummyNodeZigZag, PropagationNode): """DummyNode of an onnx operator that is not import for finer graph generation or for cost estimation, but plays a role because of the passing of the input and output tensors. 
""" @@ -14,7 +16,9 @@ def __init__( node_name: str, predecessors: list[int], op_type: str = "dummy", + input_names: list[str] = [], ) -> None: + PropagationNode.__init__(self, node_id, node_name, op_type, input_names) DummyNodeZigZag.__init__( self, node_id=node_id, @@ -22,13 +26,6 @@ def __init__( node_type=op_type, node_name=node_name, ) - Node.__init__( - self, - node_id=node_id, - node_name=node_name, - type=op_type, - onchip_energy=0, - offchip_energy=0, - runtime=0, - possible_core_allocation=[-1], - ) + + def propagate(self, tensor: NodeTensor, next_node: Node | None = None) -> NodeTensor: + return tensor diff --git a/stream/workload/dependency_propagation/elementwise_node.py b/stream/workload/dependency_propagation/elementwise_node.py index fbb507b2..47d2fa66 100644 --- a/stream/workload/dependency_propagation/elementwise_node.py +++ b/stream/workload/dependency_propagation/elementwise_node.py @@ -1,25 +1,21 @@ from zigzag.datatypes import LayerOperand +from stream.node_tensor import NodeTensor +from stream.workload.dependency_propagation.propagation_node import PropagationNode from stream.workload.node import Node -class ElementwiseNode(Node): +class ElementwiseNode(PropagationNode): def __init__( self, node_id: int, node_name: str, predecessor: int, + input_names: list[str], ) -> None: - super().__init__( - node_id=node_id, - node_name=node_name, - type="elementwise", - onchip_energy=0, - offchip_energy=0, - runtime=0, - possible_core_allocation=[-1], - ) + op_type = "elementwise" + super().__init__(node_id, node_name, op_type, input_names) self.input_operand_source = {LayerOperand("I"): predecessor} def join(self, tensor1, tensor2): @@ -30,3 +26,6 @@ def join(self, tensor1, tensor2): tensor2 (np.ndarray): The second input tensor """ return tensor1 | tensor2 + + def propagate(self, tensor: NodeTensor, next_node: Node | None = None) -> NodeTensor: + return tensor diff --git a/stream/workload/dependency_propagation/flatten_node.py b/stream/workload/dependency_propagation/flatten_node.py index dbe48577..cd82be9e 100644 --- a/stream/workload/dependency_propagation/flatten_node.py +++ b/stream/workload/dependency_propagation/flatten_node.py @@ -1,12 +1,12 @@ import numpy as np from zigzag.datatypes import LayerOperand -from zigzag.workload.layer_node_abc import LayerNodeABC from stream.node_tensor import NodeTensor +from stream.workload.dependency_propagation.propagation_node import PropagationNode from stream.workload.node import Node -class FlattenNode(Node, LayerNodeABC): +class FlattenNode(PropagationNode): """Class that represents an onnx Flatten node.""" def __init__( @@ -15,32 +15,23 @@ def __init__( node_name: str, predecessor: int | None, axis: int | None, + input_names: list[str], ) -> None: """Initialize the FlattenNode Args: - shape (list): The output tensor's shape. + shape: The output tensor's shape. 
""" - super().__init__( - node_id=node_id, - node_name=node_name, - type="flatten", - onchip_energy=0, - offchip_energy=0, - runtime=0, - possible_core_allocation=[-1], - ) + op_type = "flatten" + super().__init__(node_id, node_name, op_type, input_names) + self.axis = axis if predecessor is not None: self.input_operand_source = {LayerOperand("I"): predecessor} - def flatten(self, input_tensor: NodeTensor) -> NodeTensor: - """Reshape an input tensor - - Args: - input_tensor (np.ndarray): The input tensor - """ - shape = input_tensor.tensor_shape + def propagate(self, tensor: NodeTensor, next_node: Node | None = None) -> NodeTensor: + """Reshape an input tensor""" + shape = tensor.tensor_shape # taken from https://github.com/onnx/onnx/blob/main/docs/Operators.md#examples-51 new_shape = (1, -1) if self.axis == 0 else (np.prod(shape[0 : self.axis]).astype(int), -1) - return input_tensor.reshape(new_shape) + return tensor.reshape(new_shape) diff --git a/stream/workload/dependency_propagation/gather_node.py b/stream/workload/dependency_propagation/gather_node.py index 1967ac06..6d584072 100644 --- a/stream/workload/dependency_propagation/gather_node.py +++ b/stream/workload/dependency_propagation/gather_node.py @@ -1,11 +1,11 @@ from zigzag.datatypes import LayerOperand -from zigzag.workload.layer_node_abc import LayerNodeABC from stream.node_tensor import NodeTensor +from stream.workload.dependency_propagation.propagation_node import PropagationNode from stream.workload.node import Node -class GatherNode(Node, LayerNodeABC): +class GatherNode(PropagationNode): """Class that represents an onnx Reshape node.""" def __init__( @@ -15,6 +15,7 @@ def __init__( predecessors: list[int], gather_axis: int, gather_indices: int | list[int], + input_names: list[str] = [], ) -> None: """Initialize the GatherNode @@ -23,17 +24,8 @@ def __init__( gather_axis: Which axis to gather on. gather_indices: Indices of elements to be gathered. 
""" - Node.__init__( - self, - node_id=node_id, - node_name=node_name, - type="gather", - onchip_energy=0, - offchip_energy=0, - runtime=0, - possible_core_allocation=[-1], - ) - LayerNodeABC.__init__(self, node_id=node_id, node_name=node_name) + op_type = "gather" + super().__init__(node_id, node_name, op_type, input_names) self.gather_axis = gather_axis self.gather_indices = gather_indices @@ -48,6 +40,6 @@ def __init__( case _: raise ValueError("More than two inputs for GatherNode") - def gather_operand_tensor(self, tensor: NodeTensor) -> NodeTensor: + def propagate(self, tensor: NodeTensor, next_node: Node | None = None) -> NodeTensor: """Perform gather operation on the tensor.""" return tensor.gather(self.gather_indices, axis=self.gather_axis) diff --git a/stream/workload/dependency_propagation/propagation_node.py b/stream/workload/dependency_propagation/propagation_node.py new file mode 100644 index 00000000..401a3242 --- /dev/null +++ b/stream/workload/dependency_propagation/propagation_node.py @@ -0,0 +1,28 @@ +from abc import abstractmethod + +from zigzag.workload.layer_node_abc import LayerNodeABC + +from stream.node_tensor import NodeTensor +from stream.workload.node import Node + + +class PropagationNode(Node, LayerNodeABC): + """Stream node that does not perform computations and is not mapped on hardware, but propagates dependencies + between nodes""" + + def __init__(self, node_id: int, node_name: str, op_type: str, input_names: list[str]): + Node.__init__( + self, + node_id=node_id, + node_name=node_name, + type=op_type, + onchip_energy=0, + offchip_energy=0, + runtime=0, + possible_core_allocation=[-1], + input_names=input_names, + ) + LayerNodeABC.__init__(self, node_id=node_id, node_name=node_name) + + @abstractmethod + def propagate(self, tensor: NodeTensor, next_node: Node | None = None) -> NodeTensor: ... diff --git a/stream/workload/dependency_propagation/reshape_node.py b/stream/workload/dependency_propagation/reshape_node.py index c1223240..33ef3537 100644 --- a/stream/workload/dependency_propagation/reshape_node.py +++ b/stream/workload/dependency_propagation/reshape_node.py @@ -1,11 +1,11 @@ +from yaml import Node from zigzag.datatypes import Constants -from zigzag.workload.layer_node_abc import LayerNodeABC from stream.node_tensor import NodeTensor -from stream.workload.node import Node +from stream.workload.dependency_propagation.propagation_node import PropagationNode -class ReshapeNode(Node, LayerNodeABC): +class ReshapeNode(PropagationNode): """Class that represents an onnx Reshape node.""" def __init__( @@ -15,6 +15,7 @@ def __init__( predecessor: int, shape: tuple[int, ...], allow_zero: bool = False, + input_names: list[str] = [], ) -> None: """Initialize the ReshapeNode @@ -23,23 +24,14 @@ def __init__( shape: The output tensor's shape. allow_zero: wether the output shape can be 0 at some dimensions. 
Iff True, shape `[2,0,3]` becomes `[2,3]` """ - Node.__init__( - self, - node_id=node_id, - node_name=node_name, - type="reshape", - onchip_energy=0, - offchip_energy=0, - runtime=0, - possible_core_allocation=[-1], - ) - LayerNodeABC.__init__(self, node_id=node_id, node_name=node_name) + op_type = "reshape" + super().__init__(node_id, node_name, op_type, input_names) self.allow_zero = allow_zero self.shape = shape self.input_operand_source = {Constants.LAYER_OP_I: predecessor} - def reshape_operand_tensor(self, tensor: NodeTensor): + def propagate(self, tensor: NodeTensor, next_node: Node) -> NodeTensor: """Reshape the tensor back to the representation needed for producer/consumer.""" new_shape = self.shape if not new_shape: diff --git a/stream/workload/dependency_propagation/split_node.py b/stream/workload/dependency_propagation/split_node.py new file mode 100644 index 00000000..631be9f0 --- /dev/null +++ b/stream/workload/dependency_propagation/split_node.py @@ -0,0 +1,56 @@ +import numpy as np +from zigzag.datatypes import Constants + +from stream.node_tensor import NodeTensor +from stream.workload.dependency_propagation.propagation_node import PropagationNode +from stream.workload.node import Node + + +class SplitNode(PropagationNode): + """Class that represents an onnx Split node.""" + + def __init__( + self, + node_id: int, + node_name: str, + predecessor: int, + axis: int, + splits: list[int], + output_names: list[str], + input_names: list[str] = [], + ) -> None: + """Initialize the SplitNode + Split the tensor at axis `axis`. The sizes are given by `splits`. `len(splits)` is the number of output nodes. + + Args: + predecessors: The id of this node's parent. + axis: axis in which to split + splits: sizes of the output splits in the given axis + output_names: the node names that correspond to the splits + """ + assert len(splits) == len(output_names) + op_type = "split" + super().__init__(node_id, node_name, op_type, input_names) + + self.axis = axis + self.splits = splits + self.input_operand_source = {Constants.LAYER_OP_I: predecessor} + self.output_names = output_names + + def propagate(self, tensor: NodeTensor, next_node: Node): + """Split the tensor back to the representation needed for producer/consumer.""" + + # Numpy requires the indices where to split instead of the sizes of the resulting splits + split_indices = list(np.cumsum(self.splits)[:-1]) + output_tensors = tensor.split(split_indices, axis=self.axis) + + # Find which split part corresponds to the input of the next node + try: + index = next(i for i, output_name in enumerate(self.output_names) if output_name in next_node.input_names) + except StopIteration: + raise ValueError( + f"Cannot find this nodes' ({self.name}) outputs {self.output_names} in next nodes' inputs {next_node.input_names}" + ) + + output_tensor = output_tensors[index] + return output_tensor diff --git a/stream/workload/dependency_propagation/transpose_node.py b/stream/workload/dependency_propagation/transpose_node.py index e4fb1223..d2d2fb23 100644 --- a/stream/workload/dependency_propagation/transpose_node.py +++ b/stream/workload/dependency_propagation/transpose_node.py @@ -1,11 +1,11 @@ from zigzag.datatypes import LayerOperand -from zigzag.workload.layer_node_abc import LayerNodeABC from stream.node_tensor import NodeTensor +from stream.workload.dependency_propagation.propagation_node import PropagationNode from stream.workload.node import Node -class TransposeNode(Node, LayerNodeABC): +class TransposeNode(PropagationNode): """Class that 
represents an onnx Transpose node.""" def __init__( @@ -14,26 +14,14 @@ def __init__( node_name: str, predecessor: int, permute_axes: list[int] | None = None, + input_names: list[str] = [], ) -> None: - Node.__init__( - self, - node_id=node_id, - node_name=node_name, - type="reshape", - onchip_energy=0, - offchip_energy=0, - runtime=0, - possible_core_allocation=[-1], - ) - LayerNodeABC.__init__(self, node_id=node_id, node_name=node_name) + op_type = "transpose" + super().__init__(node_id, node_name, op_type, input_names) self.permute_axes = permute_axes self.input_operand_source = {LayerOperand("I"): predecessor} - def transpose(self, input_tensor: NodeTensor) -> NodeTensor: - """Transpose an input tensor. - - Args: - input_tensor (np.ndarray): The input tensor - """ - return input_tensor.transpose(axes=self.permute_axes) + def propagate(self, tensor: NodeTensor, next_node: Node | None = None) -> NodeTensor: + """Transpose an input tensor.""" + return tensor.transpose(axes=self.permute_axes) diff --git a/stream/workload/node.py b/stream/workload/node.py index 06f216ac..c720ef88 100644 --- a/stream/workload/node.py +++ b/stream/workload/node.py @@ -20,17 +20,19 @@ def __init__( possible_core_allocation: list[int], core_allocation_is_fixed: bool = False, chosen_core_allocation: int | None = None, + input_names: list[str] = [], ) -> None: """Initialize the Node metaclass Args: - type (str): The type of Node. - energy (float): The energy consumption of this Node. - runtime (int): The runtime of this Node. - possible_core_allocation (int): The core id on which this Node can be mapped. - inputs: (List[str]): The names of the input tensors of this node - outputs: (List[str]): The names of the output tensors of this node. + type: The type of Node. + energy: The energy consumption of this Node. + runtime: The runtime of this Node. + possible_core_allocation: The core id on which this Node can be mapped. + inputs: The names of the input tensors of this node + outputs: The names of the output tensors of this node. 
chosen_core_allocation: The final core allocation of this node + input_names: Names of the ONNX input node """ super().__init__(node_id, node_name) @@ -41,6 +43,7 @@ def __init__( self.possible_core_allocation = possible_core_allocation self.core_allocation_is_fixed = core_allocation_is_fixed self.chosen_core_allocation = chosen_core_allocation + self.input_names = input_names # will be set by the scheduler self.start = None # will be set by the scheduler From 0e3f4040c79ed69da0021be33865addb8a5389aa Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Thu, 7 Nov 2024 13:42:18 +0100 Subject: [PATCH 03/49] add slice operator --- .gitignore | 1 + outputs/custom_ssm.onnx | Bin 8163 -> 8121 bytes stream/node_tensor.py | 15 +++ stream/onnx_utils.py | 48 ++++--- stream/parser/onnx/conv.py | 122 +++++++++++------- stream/parser/onnx/einsum.py | 7 +- stream/parser/onnx/model.py | 5 + stream/parser/onnx/slice.py | 33 +++++ .../generation/tiled_workload_generation.py | 3 +- stream/stages/generation/tiling_generation.py | 58 +++++++-- .../workload/computation/computation_node.py | 23 ---- .../dependency_propagation/slice_node.py | 45 +++++++ 12 files changed, 258 insertions(+), 102 deletions(-) create mode 100644 stream/parser/onnx/slice.py create mode 100644 stream/workload/dependency_propagation/slice_node.py diff --git a/.gitignore b/.gitignore index adb0043f..a4f8f6de 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *.out typings +outputs # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/outputs/custom_ssm.onnx b/outputs/custom_ssm.onnx index 669004a5e20ead7c1ce9ea8b18c2113ef3578945..f0003846f46c477ba1fa72252d450eb4220016da 100644 GIT binary patch delta 2111 zcmah}OK%%h6z-jIlkv6V8ILDpdu+#ULK`~`89$PcL@Eg+6{w;w3`n#B%Sl|~sb7i5 zGzFmsv7ks4Fb5Vq>kTT8vOrlN1RMTAcStN~P*;^$A~s0CnXx@}>?BySukXF*eCM3+ z-1FU=|6yL@>|%fAs$Sk$SyK)tC#EJ6bB($LBn?Ak2Hxfb-@mO-gV^X*DypUOMo~Ri zz5-k1D3pm(|0N7gG6D^T!bRpdtdeQCBRk*=avL>pHC@QBzT<^M})~ZMi_yb zU9!JHs9Vjh=qg;cDa>RFZrihblscQfCpRmZ*%XB6-wc0fTXDqMBbU(r>0EU!Tghe0 zHN8^PGjU3mM4wu$6*5bi&G%j2BrfX7e5qP14)-j?A>$R+&FPDa$%Z0Vd_}asd z8f$~F^dka7NC{92q~&cz5aAu(3-??fVjl1c|0}r&zX=OQpTeDedY2%C(WuNzSv|L^ zo~{+(j5`KTi9Box(-Kd)%eh=-HD9dyNP{r+c`*z-!l1z_qsZppQ$&GrH^+X1M6U}G zxFx#a55bNmj_oV%`-FS~S#qwkp-t+wF3(~{+67nE=sGB3=0A)B1zdU> z1K&z#uZ>W?$?55=j%j7E`vP^LJ*6zSCE-uw`~_t<@p#9fg$>an7rFsvXFCC+CdzV% z_BET*lgy8UF=wpv8Q+v;2WDydX`>XvSoeMZ08zm$2YHbS7};snazggv>-LC%tp3Ut(2=dxGDSlV+IAjo{PZk)%omb+J}V*NP8HJrFsCDjGPuZ}zDJ^mSjS1l-_#LN_Jw zQ)q&WflH8$M-#A`t@i3~ht5Q`;}q@+L-3k^fSEGo)z{q>q{z^Ywk$g!BXeW|es>23 zdnb!%D#XR$o;OqA3J>pjlYoE3prH|M{flY|3ge!@?z1?By2sQKXRAKxaYm5-1L5n~ AD*ylh literal 8163 zcmbst%W@mX5leu;viX8mluRQsMXysf0ZRg~eCUyf5Iqi3nJU>-s$w33tcW$aE4++{ zMEepZuDB|dW2$n>f!9>!k`Kr+m;98|GrP0XvpWl>OjdER+uhUM)350s)3i#%ZzhA$ z(Me_AxwCtx`QX0~;hFVby+0WB>W>Hg@x<*<+Jot2IGwZ`l@%3Xn}Vda2J>&-?$oQ9 zW#f4vm&>)zjRHJ@dB2ZKA&aABg&bcF-bpD7G0e^*0w$oLvfdA=zVJHJBX8ex`;tL9 ziLybGp{fP<)El>c5P!zGQL3$i`Ns2x9ltkTN5CBH3khP6?Pf}XL}i-dpa1#fE;Gen z4anA5pd`Ool(5jRI-M9+i7bRLJCBc=1&i-I|M+Cm9{b;V>n7^YHsl^r01`!x(0&JBkBIL<0__4rX)NSO6(b+SJ908Dl*^qxWEOgrvVvp>I+cWJp7?L2 z+{iQtv-9|Xhav={7^1|2!Tz+HVt_C^j}LezOhl<*k!8r8(CF4$Xc|U9Vq}p(Lsl3A zT2fdY^!uj|9_+gl#^e63wO;r8?crz;)FW7&f~yIPT?{FHd<+~|E1~#_DkAX}|W}v9(;MAzvx5&r2rLJy4lqcpVJBnhd+sF%2&P zvrX{^FOrHjF$Z6b-2Qkt7<+OtNT9tC<1y6>LvPe;@sTUcE+uIE6+YxpfsM0{0>%PuJdm|uYq-q?1ZG(5(Uq+##7`h zjUibeXRMI*1OK=;@JV%DqF}oe;o)Tl8wBgvSXVfhR*i}&Kw&+2iV=baO&qcdF^uG1 zMdaR!X<+V|2FbN&Sb$b{-#qqzVp6UwFrq6hG}eKHwK+Z>|*WLc@U*b#u+{nY944BCqjgZ_M_AFePN2e)8Ruh<5 
ziJL@;GLwy3oD8`dL$1ePSYoIi$G##%j)ODX`FiHgq)te~-6{}L`xlSY#iVwLNUr@aH}-%#J=UUzXq`lkKlRBh~*75F7Pyf0}0?8Q3KGi{FqK=KGk))&!0v#@x0=9&zWbi}E@O#xlVapZ zE)98IPw->r_g`BdP-dy~zY>mO(ar>#c#otnUTOUM?)SUQPyaCBqFumj!Kz7>DMT^p z-nsaIGLUTSD2M%4G~qX! z4Ri|1)6f0oBves3&g2-jIvWgT2{ud4_>V&%p3rI81 zX&8=P;Sk)~;YX0VkSPNWUKX&mYxp?-{O~8@eB>A<*PE(p=3&;M!Z3G{Q z6{3EU3ouKlP!%dD1xR>I;a9@~z_nE>(*G-4=2lv^a_^TRJF*n>S)BKJ!{5SFnd_+* zn@qcA-i7H_oPQqNP{%*V;n7v%(Vq`KNtY*`nps;kJR{V>Q>XE!>Nzs}OH&mvvUKAzTt|H^JPo4rjvWnnvKYaO*|WEG2X5iYEikR( zwEw2vZp3fR_|1vmwu4tQM&^64hrA^9PP}e6e%p)R?#6HTg5O>=$67y!2gqBlz7Gpl ziEG?!$k(QPb>!=Iut8IT%)#tbDdNHQGCk~GQsRs3sS_>Kz`fj(3hv0)UHQ5vU+)IZ ztXXwE)Adf)6li6yQECU75M>1B16F3!6JJKUHq@~}$rZI0hLvI~*t>8h(j||9=hFqF zl4v37s!A6(5~7vS&MKOOH`04Zj~Uu^PP)e@1zc}0%^FhB^=q0lx^=B1*B;H4%l`v% CE+ "list[NodeTensor]": axis = axis - 1 if axis < 0 else axis return [t.view(NodeTensor) for t in np.split(self.as_ndarray(), split_indices, axis=axis)] + def slice(self, starts: int, ends: int, axis: int, steps: int) -> "NodeTensor": + assert starts != 1 and ends != -1 + axis = len(self.tensor_shape) - 1 if axis < 0 else axis + match axis: + case 0: + return self.as_ndarray()[starts:ends:steps, ...].view(NodeTensor) + case 1: + return self.as_ndarray()[:, starts:ends:steps, ...].view(NodeTensor) + case 2: + return self.as_ndarray()[:, :, starts:ends:steps, ...].view(NodeTensor) + case 3: + return self.as_ndarray()[:, :, :, starts:ends:steps, ...].view(NodeTensor) + case _: + raise NotImplementedError + def concat_with_empty(self, shape: tuple[int, ...], axis: int, variable_input_first: bool): empty_shape = self.convert_to_full_shape(shape) empty_tensor = np.zeros(empty_shape, dtype=object) diff --git a/stream/onnx_utils.py b/stream/onnx_utils.py index 150d3701..dfa9b434 100644 --- a/stream/onnx_utils.py +++ b/stream/onnx_utils.py @@ -1,9 +1,7 @@ +import numpy as np from onnx import AttributeProto, ModelProto, NodeProto, numpy_helper from zigzag.parser.onnx.utils import get_onnx_tensor_type -import numpy as np -import onnx - def get_attribute_as_ints( node: NodeProto, attribute_name: str, default: list[int] | int | None = None @@ -60,6 +58,25 @@ def has_asymmetric_input_data(node: NodeProto, onnx_model: ModelProto): return input_shape1 != input_shape2 +def get_constant_tensor_int(onnx_model: ModelProto, constant_output_name: str): + """In some cases, the constants to a node (e.g. slice and split indices) are saved as tensors within a constant + node. 
The output name of the constant nodes corresponds to the input name of the node that uses this constant + tensor.""" + + for node in onnx_model.graph.node: + if node.op_type == "Constant" and node.output[0] == constant_output_name: + for attr in node.attribute: + if attr.name == "value": + tensor = attr.t # This is an ONNX TensorProto + # Decode tensor to a numpy array + array = np.frombuffer(tensor.raw_data, dtype=int) + array = array.reshape([dim for dim in tensor.dims]) + + return [int(i) for i in array] + + raise ValueError(f"Cannot find {constant_output_name}") + + def get_axis_attribute(node: NodeProto): """Find the value of the axis associated with this ONNX node""" ATTR_NAME = "axis" @@ -71,19 +88,20 @@ def get_axis_attribute(node: NodeProto): def get_split_attribute(node: NodeProto, onnx_model: ModelProto): - # ATTR_NAME = "split" - output_name = next(n for n in node.input if "split" in n.lower()) + return get_constant_tensor_int(onnx_model, output_name) - for node in onnx_model.graph.node: - if node.op_type == "Constant" and node.output[0] == output_name: - for attr in node.attribute: - if attr.name == "value": - tensor = attr.t # This is an ONNX TensorProto - # Decode tensor to a numpy array - array = np.frombuffer(tensor.raw_data, dtype=int) - array = array.reshape([dim for dim in tensor.dims]) - return [int(i) for i in array] +def get_slice_attributes(node: NodeProto, onnx_model: ModelProto): + """Get the `starts`, `ends`, `axes` and `steps` tensors for a slice node. + NOTE: this assumes that the attributes are given as inputs in this order""" + if len(node.input) != 5: + raise NotImplementedError("Unsure how to get slice attributes from Node") + + starts_output_name, ends_output_name, axes_output_name, steps_output_name = node.input[1:5] - raise ValueError + starts_value = get_constant_tensor_int(onnx_model, starts_output_name) + ends_value = get_constant_tensor_int(onnx_model, ends_output_name) + axes_value = get_constant_tensor_int(onnx_model, axes_output_name) + steps_value = get_constant_tensor_int(onnx_model, steps_output_name) + return starts_value, ends_value, axes_value, steps_value diff --git a/stream/parser/onnx/conv.py b/stream/parser/onnx/conv.py index 10f8566e..af53bb26 100644 --- a/stream/parser/onnx/conv.py +++ b/stream/parser/onnx/conv.py @@ -26,6 +26,8 @@ def get_layer_node_user_format( ) -> dict[str, Any]: """ Generate the necessary dictionary items required for the LayerNode creation. + + """ predecessors = self.get_node_predecessors() @@ -37,67 +39,93 @@ def get_layer_node_user_format( group_size: int = get_attribute_ints_with_name("group", attrs, default=1) # type:ignore padding: list[int] = get_attribute_ints_with_name("pads", attrs, default=[0, 0, 0, 0]) # type:ignore - # 1D Conv case: append dimensions of size 1 so equation holds. Conv in FY dimension - print(kernel_shape) - if len(kernel_shape) == 1: - kernel_shape.insert(0, 1) - input_shape.append(1) - output_shape.append(1) - strides.append(1) - dilations.append(1) - assert len(input_shape) == 4 - assert len(output_shape) == 4 - - if len(padding) == 2: - padding = 2 * padding - data: dict[str, Any] = {} data["id"] = self.node_id data["name"] = self.node.name data["operator_type"] = ConvParser.OP_TYPE + data["operand_precision"] = self.get_operand_precision_user_format() + data["operand_source"] = self.get_operand_source_user_format(predecessors) - # IMPORTANT: If any of the input loops require padding, they should be defined as the rightmost dimensions in - # the equation. 
This is because we construct the dimensionality order and then add the padding to those last - # dimensions in the order - weight_dim = "g" if group_size > 1 else "k" - data["equation"] = f"O[b][g][k][oy][ox]+=W[{weight_dim}][c][fy][fx]*I[b][g][c][iy][ix]" + # 1D Conv case: append dimensions of size 1 so equation holds. Conv in FY dimension + is_1d_conv = len(kernel_shape) == 1 + + # if len(kernel_shape) == 1: + # kernel_shape.insert(0, 1) + # input_shape.append(1) + # output_shape.append(1) + # strides.append(1) + # dilations.append(1) + # assert len(input_shape) == 4 + # assert len(output_shape) == 4 + + # if len(padding) == 2: + # padding = 2 * padding # Get dimension sizes from input parameters assert input_shape[0] == output_shape[0], "Batch size is different for input and output activations." B = output_shape[0] G = group_size K = ceil(output_shape[1] / G) - OX = output_shape[3] - OY = output_shape[2] C = ceil(input_shape[1] / G) - IX = input_shape[3] - IY = input_shape[2] FX = kernel_shape[0] - FY = kernel_shape[1] - data["loop_dims"] = ["B", "K", "G", "OX", "OY", "C", "FX", "FY"] - data["loop_sizes"] = [B, K, G, OX, OY, C, FX, FY] - - data["pr_loop_dims"] = ["IX", "IY"] - data["pr_loop_sizes"] = [IX, IY] - data["dimension_relations"] = [ - f"ix={strides[0]}*ox+{dilations[0]}*fx", - f"iy={strides[1]}*oy+{dilations[1]}*fy", - ] - data["operand_precision"] = self.get_operand_precision_user_format() - data["operand_source"] = self.get_operand_source_user_format(predecessors) + IX = input_shape[2] + OX = output_shape[2] - # Add information wrt how this conv node's input/output tensors - # are represented in the onnx model vs how they are represented in the equation above. - # Because onnx doesn't actually encode the group dimension in a separate dimension - # but instead keeps it as a "groups" parameter. - # Concretely, this entry contains for the I and O operand how the G + C/K should be converted - # to a single "CH" (channel) dimension. - - # Add padding information - data["padding"] = [ - [padding[0], padding[2]], - [padding[1], padding[3]], - ] + weight_dim = "g" if group_size > 1 else "k" + + # IMPORTANT: If any of the input loops require padding, they should be defined as the rightmost dimensions in + # the equation. This is because we construct the dimensionality order and then add the padding to those last + # dimensions in the order. + # Add information wrt how this conv node's input/output tensors are represented in the onnx model vs how they + # are represented in the equation. Because onnx doesn't actually encode the group dimension in a separate + # dimension but instead keeps it as a "groups" parameter. Concretely, this entry contains for the I and O + # operand how the G + C/K should be converted to a single "CH" (channel) dimension. + + if is_1d_conv: + # No FY, OY, IY + data["loop_sizes"] = [B, K, G, OX, C, FX] + data["loop_dims"] = ["B", "K", "G", "OX", "C", "FX"] + data["equation"] = f"O[b][g][k][ox]+=W[{weight_dim}][c][fx]*I[b][g][c][ix]" + data["pr_loop_dims"] = ["IX"] + data["pr_loop_sizes"] = [IX] + data["dimension_relations"] = [ + f"ix={strides[0]}*ox+{dilations[0]}*fx", + ] + data["padding"] = [ + [padding[0], padding[1]], + ] + else: + assert len(input_shape) == 4 and len(output_shape) == 4 and len(padding) == 4 and len(strides) == 2 + FY = kernel_shape[1] # TODO is kernel_shape in (FX, FY) format or (FY, FX)? 
(I assumed the former) + IY = input_shape[3] + OY = output_shape[3] + data["loop_sizes"] = [B, K, G, OX, C, FX, OY, FY] + data["loop_dims"] = ["B", "K", "G", "OX", "C", "FX", "OY", "FY"] + data["equation"] = f"O[b][g][k][oy][ox]+=W[{weight_dim}][c][fy][fx]*I[b][g][c][iy][ix]" + data["pr_loop_dims"] = ["IX", "IY"] + data["pr_loop_sizes"] = [IX, IY] + data["dimension_relations"] = [ + f"ix={strides[0]}*ox+{dilations[0]}*fx", + f"iy={strides[1]}*oy+{dilations[1]}*fy", + ] + data["padding"] = [ + [padding[0], padding[2]], + [padding[1], padding[3]], + ] + + # Remove dims with size 1 + dims_size_1 = [dim for dim, size in zip(data["loop_dims"], data["loop_sizes"]) if size == 1] + data["loop_sizes"] = [s for s in data["loop_sizes"] if s > 1] + data["loop_dims"] = [d for d in data["loop_dims"] if d not in dims_size_1] + for dim in dims_size_1: + data["equation"] = data["equation"].replace(f"[{dim.lower()}]", "") + + # Filter out loops with size 1 + # loop_sizes = {"B": B, "K": K, "G": G, "OX": OX, "OY": OY, "C": C, "FX": FX, "FY": FY} + # dims_with_size_1 = [k for k, v in loop_sizes.items() if v == 1] + # loop_sizes = {k: v for k, v in loop_sizes.items() if v > 1} + # data["loop_dims"] = list(loop_sizes.keys()) + # data["loop_sizes"] = list(loop_sizes.values()) return data diff --git a/stream/parser/onnx/einsum.py b/stream/parser/onnx/einsum.py index 003ed5ab..1cdc53f0 100644 --- a/stream/parser/onnx/einsum.py +++ b/stream/parser/onnx/einsum.py @@ -15,8 +15,9 @@ def get_einsum_equation(self): attrs_names = [attr.name for attr in self.node.attribute] name_idx = attrs_names.index(ATTR_NAME) - value = self.node.attribute[name_idx] - return str(value) + attr_proto = self.node.attribute[name_idx] + value = attr_proto.s.decode("utf-8") + return value def get_layer_dims_per_op(self): einsum_equation = self.get_einsum_equation() @@ -32,7 +33,7 @@ def put_in_brackets(s: str): raise NotImplementedError dims_I, dims_W, dims_O = layer_dims_per_op - equation = f"O{put_in_brackets(dims_O)}+=I{put_in_brackets(dims_I)}*{put_in_brackets(dims_W)}" + equation = f"O{put_in_brackets(dims_O)}+=I{put_in_brackets(dims_I)}*W{put_in_brackets(dims_W)}" return equation # def get_layer_dims(self, layer_dims_per_op: list[str]): diff --git a/stream/parser/onnx/model.py b/stream/parser/onnx/model.py index a648fba8..21a34f0b 100644 --- a/stream/parser/onnx/model.py +++ b/stream/parser/onnx/model.py @@ -10,6 +10,7 @@ from stream.parser.onnx.concat import ConcatParser from stream.parser.onnx.conv import ConvParser from stream.parser.onnx.default import DefaultNodeParser +from stream.parser.onnx.einsum import EinsumParser from stream.parser.onnx.flatten import FlattenParser from stream.parser.onnx.gather import GatherParser from stream.parser.onnx.gemm import GemmParser @@ -19,6 +20,7 @@ from stream.parser.onnx.pooling import PoolingParser from stream.parser.onnx.reshape import ReshapeParser from stream.parser.onnx.simd import SimdParser +from stream.parser.onnx.slice import SliceParser from stream.parser.onnx.softmax import SoftmaxParser from stream.parser.onnx.split import SplitParser from stream.parser.onnx.transpose import TransposeParser @@ -37,6 +39,7 @@ class ONNXModelParser: "Conv": ConvParser, "MatMul": MatMulParser, "Gemm": GemmParser, + "Einsum": EinsumParser, "MaxPool": PoolingParser, "AveragePool": PoolingParser, "GlobalMaxPool": PoolingParser, @@ -44,6 +47,7 @@ class ONNXModelParser: "Add": SimdParser, "Mul": SimdParser, "Softmax": SoftmaxParser, + # Activations "Relu": SimdParser, "Gelu": SimdParser, "Silu": 
SimdParser, @@ -55,6 +59,7 @@ class ONNXModelParser: "Flatten": FlattenParser, "Concat": ConcatParser, "Split": SplitParser, + "Slice": SliceParser, } def __init__( diff --git a/stream/parser/onnx/slice.py b/stream/parser/onnx/slice.py new file mode 100644 index 00000000..113b5d8e --- /dev/null +++ b/stream/parser/onnx/slice.py @@ -0,0 +1,33 @@ +from stream.onnx_utils import get_slice_attributes +from stream.parser.onnx.operator_parser import OnnxOperatorParser +from stream.workload.dependency_propagation.slice_node import SliceNode + + +class SliceParser(OnnxOperatorParser): + """Parses an onnx gather operator into a SliceNode.""" + + def generate_node(self): + if len(self.node.output) > 1: + raise NotImplementedError("Slice node with multiple output slices not yet supported.") + + # Single predecessor + predecessors = self.get_node_predecessors() + if len(predecessors) > 1: + raise ValueError("Slice node should not have more than one input") + predecessor = predecessors.pop() + + starts_value, ends_value, axes_value, steps_value = get_slice_attributes(self.node, self.onnx_model) + input_names = list(self.node.input) + output_names = list(self.node.output) + + return SliceNode( + node_id=self.node_id, + node_name=self.node.name, + predecessor=predecessor, + starts=starts_value, + ends=ends_value, + axes=axes_value, + steps=steps_value, + input_names=input_names, + output_names=output_names, + ) diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index 92853964..88479341 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -20,7 +20,6 @@ from stream.workload.dependency_propagation.concat_node import ConcatNode from stream.workload.dependency_propagation.dummy_node import DummyNode from stream.workload.dependency_propagation.propagation_node import PropagationNode -from stream.workload.dnn_workload import DNNWorkloadStream from stream.workload.node import Node from stream.workload.onnx_workload import ComputationNodeWorkload, ONNXWorkload from stream.workload.tensor import Tensor @@ -123,7 +122,7 @@ def get_scheduling_order(workload: ComputationNodeWorkload): return sorted(((n.id, n.sub_id) for n in workload.node_list), reverse=True) @staticmethod - def get_all_node_pairs(G: DNNWorkloadStream) -> tuple[tuple[ComputationNode, ComputationNode, bool], ...]: + def get_all_node_pairs(G: ONNXWorkload) -> tuple[tuple[ComputationNode, ComputationNode, bool], ...]: pairs: list[tuple[ComputationNode, ComputationNode, bool]] = [] for node in G.topological_sort(): if not isinstance(node, ComputationNode): diff --git a/stream/stages/generation/tiling_generation.py b/stream/stages/generation/tiling_generation.py index 829c8b13..d0a463fb 100644 --- a/stream/stages/generation/tiling_generation.py +++ b/stream/stages/generation/tiling_generation.py @@ -1,4 +1,5 @@ import logging +from collections import defaultdict from typing import Any import numpy as np @@ -16,6 +17,31 @@ class TilingGenerationStage(Stage): + # Split the node in this dimension to enable fusion within core + FUSION_PARTITION_DIM_DEFAULT: defaultdict[str, LayerDim] = defaultdict( + lambda: LayerDim("K"), + { + "conv": LayerDim("OY"), + "matmul": LayerDim("D"), + "gemm": LayerDim("D"), + "pooling": LayerDim("OY"), + "add": LayerDim("D"), + "mul": LayerDim("D"), + "softmax": LayerDim("K"), + "max": LayerDim("K"), + "div": LayerDim("K"), + "exp": LayerDim("K"), + "sum": LayerDim("K"), + "relu": 
LayerDim("K"), + "gelu": LayerDim("K"), + "silu": LayerDim("K"), + }, + ) + FUSION_PARTITION_SIZE_DEFAULT = 2 + + # Split node in this dimension to partition layer over cores. NOTE this list is ordered + INTER_CORE_PARTITION_DIM_DEFAULT = [LayerDim("G"), LayerDim("H"), LayerDim("K")] + def __init__( self, list_of_callables: list[StageCallable], @@ -109,11 +135,22 @@ def remove_invalid_entries_from_intra_core_tiling(self, node: ComputationNode): node.intra_core_tiling = valid_tiling - def generate_intra_core_tiling(self, node: ComputationNode) -> TILING_T: - partition_dim = node.fusion_partition_dims[0] + def get_fusion_partition_dim(self, node: ComputationNode) -> LayerDim: + partition_dim = TilingGenerationStage.FUSION_PARTITION_DIM_DEFAULT[node.type] + + # Default partition dim is not present in this node -> take some arbitrary other dim if partition_dim not in node.layer_dim_sizes: - raise ValueError(f"Suggested partition dimension {partition_dim} for {node} is not part of this node") - return [(node.fusion_partition_dims[0], node.layer_dim_sizes[partition_dim])] + partition_dim: LayerDim = next( + dim for dim in node.layer_dim_sizes if dim != LayerDim("B") and dim != LayerDim("G") + ) + + return partition_dim + + def generate_intra_core_tiling(self, node: ComputationNode) -> TILING_T: + partition_dim = self.get_fusion_partition_dim(node) + size = min(TilingGenerationStage.FUSION_PARTITION_SIZE_DEFAULT, node.layer_dim_sizes[partition_dim]) + tiling = [(partition_dim, size)] + return tiling def remove_invalid_entries_from_inter_core_tiling(self, node: ComputationNode): """Check wether this node's inter core tiling has invalid entries: non-existent layer dimension for this node @@ -143,14 +180,11 @@ def remove_invalid_entries_from_inter_core_tiling(self, node: ComputationNode): node.inter_core_tiling = valid_tiling def generate_inter_core_tiling(self, node: ComputationNode) -> TILING_T: - if node.layer_dim_sizes.data.get(LayerDim("G"), 1) > 1: - loop_dim = LayerDim("G") - elif node.layer_dim_sizes.data.get(LayerDim("K"), 1) > 1: - loop_dim = LayerDim("K") - else: - raise ValueError("Unknown what loop dim to split across cores") - - return [(loop_dim, "*")] + for dim in TilingGenerationStage.INTER_CORE_PARTITION_DIM_DEFAULT: + if dim in node.layer_dim_sizes and node.layer_dim_sizes[dim] > 1: + return [(dim, "*")] + + raise ValueError("Unknown what loop dim to split across cores") @staticmethod def split_operator(model: ModelProto, node_name: str, num_splits: int): diff --git a/stream/workload/computation/computation_node.py b/stream/workload/computation/computation_node.py index 58e7e535..2cdc51af 100644 --- a/stream/workload/computation/computation_node.py +++ b/stream/workload/computation/computation_node.py @@ -32,24 +32,6 @@ class ComputationNode(LayerNode, Node): too_large_operands: list[MemoryOperand] - # Map the node's op_type to the corresponding layer dimension to split on for fusion - FUSION_DIM_MAPPING: dict[str, list[LayerDim]] = { - "conv": [LayerDim("OY")], - "matmul": [LayerDim("D")], - "gemm": [LayerDim("D")], - "pooling": [LayerDim("OY")], - "add": [LayerDim("D")], - "mul": [LayerDim("D")], - "softmax": [LayerDim("K")], - "max": [LayerDim("K")], - "div": [LayerDim("K")], - "exp": [LayerDim("K")], - "sum": [LayerDim("K")], - "relu": [LayerDim("K")], - "gelu": [LayerDim("K")], - "silu": [LayerDim("K")], - } # TODO default to "K" ? 
- def __init__( self, node_id: int, @@ -113,11 +95,6 @@ def __init__( self.nb_real_predecessors = None self._static_hash_value = self.__compute_static_hash() - try: - self.fusion_partition_dims = ComputationNode.FUSION_DIM_MAPPING[op_type] - except KeyError: - raise NotImplementedError(f"Fusion partitioning dimensions not defined for {op_type}") - # Each ComputationNode will save a tensor for all its defined operands. # For example, a conv layer will have an I tensor, W tensor and O tensor. self.operand_tensors: dict[LayerOperand, Tensor] = {} diff --git a/stream/workload/dependency_propagation/slice_node.py b/stream/workload/dependency_propagation/slice_node.py new file mode 100644 index 00000000..49de89d5 --- /dev/null +++ b/stream/workload/dependency_propagation/slice_node.py @@ -0,0 +1,45 @@ +from zigzag.datatypes import Constants + +from stream.node_tensor import NodeTensor +from stream.workload.dependency_propagation.propagation_node import PropagationNode +from stream.workload.node import Node + + +class SliceNode(PropagationNode): + """Class that represents an onnx Slice node.""" + + def __init__( + self, + node_id: int, + node_name: str, + predecessor: int, + starts: list[int], + ends: list[int], + axes: list[int], + steps: list[int], + output_names: list[str], + input_names: list[str] = [], + ) -> None: + """Initialize the SliceNode + Slice the tensor at axis `axis`. The sizes are given by `Slices`. `len(Slices)` is the number of output nodes. + + Args: + predecessors: The id of this node's parent. + axis: axis in which to Slice + Slices: sizes of the output Slices in the given axis + output_names: the node names that correspond to the Slices + """ + op_type = "Slice" + super().__init__(node_id, node_name, op_type, input_names) + + self.starts = starts + self.ends = ends + self.axes = axes + self.steps = steps + self.input_operand_source = {Constants.LAYER_OP_I: predecessor} + self.output_names = output_names + + def propagate(self, tensor: NodeTensor, next_node: Node | None = None): + """Slice the tensor. 
+ Currently assumes only one slice is created.""" + return tensor.slice(starts=self.starts[0], ends=self.ends[0], axis=self.axes[0], steps=self.steps[0]) From e27c2a8b8137ba498ac4f122a36da1e19dbe49cd Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Thu, 7 Nov 2024 16:22:51 +0100 Subject: [PATCH 04/49] Create MulParser that deals with asymmetrical inputs and allows broadcasting --- outputs/custom_ssm.onnx | Bin 8121 -> 10998009 bytes stream/parser/onnx/einsum.py | 19 +++-- stream/parser/onnx/model.py | 24 +++--- stream/parser/onnx/mul.py | 73 ++++++++++++++++++ stream/parser/onnx/simd.py | 1 + .../generation/tiled_workload_generation.py | 8 +- stream/stages/generation/tiling_generation.py | 3 +- 7 files changed, 105 insertions(+), 23 deletions(-) create mode 100644 stream/parser/onnx/mul.py diff --git a/outputs/custom_ssm.onnx b/outputs/custom_ssm.onnx index f0003846f46c477ba1fa72252d450eb4220016da..97da8bdb06facb9bb63f254d0aadd3a0dffa129e 100644 GIT binary patch literal 10998009 zcmeFw!A=uV6adiH(l87OG^2^wmFdQ$3lI#^-~x=ekc7mD`=+52X@n`#7Q|IQ!Nd;$ z_x^@EzrkP8sdPGp!XKPjyiD(Xz3;yB-UM^e?D2W8bFzO>Ew3%FFRyO=`EgMW-$oBt zlFrG|%Ii+j?KP8Lqtovl_j`?})rIScT98etKM2dG&BK0N4=Tl-nL?qkby=JVx5IKf zN&gz&RWXXLD!%KS<*Uq%W3^J62=}5#*Vk>V=94QoQxi>9v!)h;Vr6zG_&a)rt(}Lr zEFIpxP}alyxYgf}cjIO<)kB`Dh537ZF$_M%@o}qt)LklG7Vm|-S*=X3u{KpJPfhIS zI`Yq7Po58V^1Ya~oi94Bm!a_MgpG@4UFb~#dhN)YGp5-mxJn?6D3Ka^2 z&7shTDTTaTcw0*u&35}VUw1^4#%iT>2sg9KJ-SGfld}T5{llpOX{=UChw%4cP>vQm zN%Cc5<4rs1_KzB?FRJtTQLRtx>>{0!M2oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N 
z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! 
z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? 
z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! 
z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!1KP2>-nNqS0KldtTlDxt^0C%Yl#Qb@i^f*Gr6qZ@seu-o%?1b-aM4{~t$V@H z5@VB;4xrHoHJ)Wo_Xgz zb4Y*y0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly 
zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ 
z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N 
z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7csf&Vjsw~giSI7!;S91Ty#-Ql=1IvpRMjyn%FujFN|W_ESE z6)t_#Jv{Ag-)>%9TlsvgnGHNl|45trO-S0q(aWRur@0gbH!iGC<=)JGZ_Pwg$-i%eGv^R${UxjI zY1i2#>jgXYg5Ax_(_31XCUjGgvww|rW?v=i#nSB*D|NM0dd~c^2+8Kx zD9vv_)CRp17WS3g%A%rE8bBxt&kL6S`tDA72D=Mquv?nJ@4mZ}tH?F=57tw_eaBT zb`JjJRj>d2#W?wX`{?wrla{rA)gPXm9u=Le)i1R!PpfSIAT)blpLWOn(Xe&x*}<#( z2Zygd{PXz7+s0BDhO^sD-fzF`9h~m>7Wa5-!PUIe+oeudVqa;8X@7qg&TTYYolW>z zzk8BgZ~x-ppqS!v`mwbXe{3h=>Xf8&(*L%%(TslEtD)K5AD0SZrP)$VDy^l+e-^5Z zChxW5P3_J^Ti-wAYz1{9c)4%C__IiSw=gaGr@^=*eqMRun z=ACbrGR4DErf6jkpWDrg@!_+UWs2+}lsE>$f$~5QxRr14fdG-JkF`jUKzsZlvH-0%ekv;BphsD5dFRsuEMdkE#^w+)K@j?ISWMgvN{Hoe+KHbtY zrqXUvp)}?7Y|72G#^=lb`SCw}luq_P8sQI(xmuxUJD*f_`O<8YZ-~nB#VhZ)B_3B- zrrcFLaF^EX$=5GSr~0+}rPgv)&i%5lRL zPukKZsF7#YbGKx(Jg)R~QCjMobJwPuq8rY+t9~SVFyBRy$v5Y^h);_+itxL}d>_Sa z*%R?`cbpFSX9=@0 zJ5pT;(?8N}aug%wtf`c9XVGgZcNPh?lsg}umpi{Mg-vmHxYQrUx7p-lvBgViPxqewxE@jQKoMfbj@_gq zi9gk`*y@Z;=;JVO^d zGau>QY^0Ucxv05&F$5=bs-)6nQPugLi&Qt?^Jo2U&YwuGnI@6kdg0n^X8|%hy6yZt z`CfQ0(%hWUY^5>Mh#x)~Y{jYPG0Svf8kDUgllow>8?%x zZ^dRxubm4+ZfUSx$~vVgW%+FDMuk#)r9fLyd1>&I`LD|=yJObs^58*?;iI_f`wPA{ zTYN|R{4c5It+qn#Tcw)#y}{kHzA2iknFdVyl~g5A=W5krVDD#@7)aZwzqHp!eK&6I zxmr85HU4B#rQNwoKZx_Me2-JgmxJ_TdmA*)n29RRVwjZN;k!RuEz5jAsV;FBtr~#&;L@PmiM+M8D($whIlalt-{%9X!p!_ z__ObcX|DDXe|Ft;uhm{;D?2BjaeFrUMen!eU7X+W;aM+_i`%cPP;rPU1^>mtAD1#t zX%TB%=`!OU@7Y^$^|4#8JU8t5SBUy8AI=}$OCz3BTu{BVhvur^kE4&z%>T(_F+M{pu~9lR W7b{cUy`}J}hUWB0`su~B_5TKE)quPJ literal 8121 
zcmbst%W~UBk`O5oYDp#yOO8T24i`>hk?R;HCHa-gnz7|WT%}ZO{)Nia@!3-Tu!5iQWdV$1=A=O zpPIkMe@wMEpb-t(g18RPz0UN&+x6VOqJu!$Ab!_NV1Dg+!;aq@uN$Yv9PDyl+-Y*-m~kxY|N!!jGc_zXZi?|4FBlz$&rr5oYJ{QMX{}wdWrmP1`x0}4Q*I7C9CGTY=w+`g2(bt&@J-Zvu>RUm+NfuS@O+7v3LJ-(S~pegWHIbL21zM zA3u1o>rNQ>-aTvG@%!!JXb?CtEXl`G3S%!s6#u{5_N)t$_`PA*pR}8e%9043JH*m; zo?F+TVd#x|tz2uag&Qxz{|t=r1fQ&jm4iY5Rikq& z%5PYTKSP78Inft814s%iQ?OlN$a)zTJAT)l_;^}`0_!UfUUhwCRbRxU2V^4H#| z?{(YbBX{Vv3aAv)6ozhR+{(A|?DGN?$CHuY@gf%RCFnJ<*2qrCDX>hzGp+Fi9f5kM z4z|SvMnXJ&d>AC8KebkqCG#6NNkCXNXRwN+{I4w0B&|e^s}NSSTF%o~6kAhDp(0D_u75zn8vjPdYPVSpRdF|aO%U|N$aCI>~8J5?pOHc6G-x=oywRV7=o|M{Ca?|TDO$#Qwz z+C=4|@+cglf*LBQiCc&l&MuQ1upAu#PmYJ}`#Ebki$cfiPTck*=ZW7RPkTbxB@NbIh^XpU!SueFhQ-@n zdD%U>`Lf&p@K5-oe`@66ZwAcdjYdfC4KFL9GLMcGMM^15qQtvIi87OoS{w~SHHM)c zKNy(1vd8g4ks-&r=1hkjcjC3N#!tMlqQ{yRpQ%l#Ok|Wuo5wV~wuPZdk=-WYMK0|N zPYdh}Vafgi&QOy^lMrH+oyXJO9`$cAhS&=c297}?QlqsQJL;(YSTRuKD1k{#bB#Ev z#C*g&6`O{|%U7C!0AnW>wbRau34uvOT_K{DnW&pbQ9v*`|6=z^=6<1L;)WO_SSA?j zvg2Z_^QOIXEMNClB=|Y+eJ^Nq+)0Gf;xp?Soi0B7TFqi6fQ%i`SP>%e*2uGyR7w!Q zS=DDB?~LkdEfC$y#s%#|%(jkHrz@otCMxHDw%(y|<%UpM)uBmBrSC2LqR))<4;C94 z`oL_t*1L3}rkSo>))UNb_g~0-lkF0xNPGnW*)pjjIA8hs?%%iBX8&WrSvt!Ef_D;` zviqbyNOIr#TcTarGgMSO5s70+9uGwQK0(Dpg1^LFJ;8`grOa~RWB+{Ml4e{o`E!pG zFjw;5m~h%8IE%Mcp<%L%b&9z9qJoQ(!i5>D(jDP#RfM>7R}>M=NW{MlRfbpw^4#J6 zee?-l`}Bi{avF91$-`@OXO_Hp(qb({_t46UGxElMXXF;Bb8)_MIC6(a`&YT7 zOK{ONtQD9uP5duG-aJ{|UlS7p02A>A#8vl8?2My;_foyJ#knDu8ZI&Lzp!7CTka)T zM2_=JQf>cw${RQZDg#b2Q4G@rAE(cdfqYe`Dx?{pI|+l0kju+=AU0^CAl+{8hwQQK z0u)f0Tu|c^EIPo|bGxAo;Y3!1)+>hQGy;bcYAUd0K^X-vqX8qopfQ8%d_hv(L{wW` z(+#evRCaXiYMAxm{97tRzf&^lXcfuwfKaoM#G0&T@Rmz&O$Tdf##B(V;PhE=RqiC4 z4Px_r@PS%kc+7H3W87`1bU2$*Go=>1pXS%p*sqG7OApBut5gH#Ii(NMva6%##K=Nx zz&Ha-!Og4*6WvctZH3f=GYMQuJ!e{B_6AkdIElqX4VIof#%+3KBkVG1NJMASkl3R% zp>L(xir#0;BHDR~yvMK>*3c{JutLu_6?r(r5ZdKKKIu+c$7dtBp>{D1O&GLvgu3u+ z1y?ZqqzML7EUxMyQkqXtBnwk7P$Yu2PSOc(-M!N+HHv$L09quD^3 zT~(8LI~fq8Em{9F&u4x=(BAviJk)G~U+~2lt6SDk eZ_7k`O;OwkG)qX~)-Jg{ `[a][b][c]""" + if s == "": + return "[]" return "".join([f"[{char}]" for char in s]) - if len(layer_dims_per_op) != 3: - raise NotImplementedError + match len(layer_dims_per_op): + case 2: + dims_I, dims_O = layer_dims_per_op + dims_W = "" + case 3: + dims_I, dims_W, dims_O = layer_dims_per_op + case _: + raise NotImplementedError - dims_I, dims_W, dims_O = layer_dims_per_op equation = f"O{put_in_brackets(dims_O)}+=I{put_in_brackets(dims_I)}*W{put_in_brackets(dims_W)}" return equation - # def get_layer_dims(self, layer_dims_per_op: list[str]): - # all_dims = {char.upper() for group in layer_dims_per_op for char in group} - # return list(all_dims) - def get_layer_dim_sizes_dict(self, layer_dims_per_op: list[str]): input_shapes = get_onnx_input_shapes(self.node, self.onnx_model) output_shapes = get_onnx_output_shapes(self.node, self.onnx_model) @@ -71,7 +74,7 @@ def get_layer_node_user_format( input_shape: list[int], # Argument required because of a caller function in superclass output_shape: list[int], # TODO put shape logic in this method for all `OnnxComputeOperatorParser` subclasses ) -> dict[str, Any]: - """! 
Generate layer data in user input format for Einsum.""" + """Generate layer data in user input format for Einsum.""" predecessors = self.get_node_predecessors() data: dict[str, Any] = {} diff --git a/stream/parser/onnx/model.py b/stream/parser/onnx/model.py index 21a34f0b..7ec7f96a 100644 --- a/stream/parser/onnx/model.py +++ b/stream/parser/onnx/model.py @@ -5,8 +5,6 @@ from zigzag.parser.onnx.utils import parse_onnx_model_from_path from stream.hardware.architecture.accelerator import Accelerator -from stream.onnx_utils import get_onnx_input_shapes, has_asymmetric_input_data -from stream.parser.onnx.asymmetric_simd import AsymmetricSimdParser from stream.parser.onnx.concat import ConcatParser from stream.parser.onnx.conv import ConvParser from stream.parser.onnx.default import DefaultNodeParser @@ -16,6 +14,7 @@ from stream.parser.onnx.gemm import GemmParser from stream.parser.onnx.lpnormalization import LpNormalizationParser from stream.parser.onnx.matmul import MatMulParser +from stream.parser.onnx.mul import MulParser from stream.parser.onnx.operator_parser import OnnxOperatorParser from stream.parser.onnx.pooling import PoolingParser from stream.parser.onnx.reshape import ReshapeParser @@ -44,8 +43,8 @@ class ONNXModelParser: "AveragePool": PoolingParser, "GlobalMaxPool": PoolingParser, "GlobalAveragePool": PoolingParser, - "Add": SimdParser, - "Mul": SimdParser, + "Add": MulParser, + "Mul": MulParser, "Softmax": SoftmaxParser, # Activations "Relu": SimdParser, @@ -79,15 +78,14 @@ def run(self): self.workload = self.parse_workload() def get_parser_class(self, node: NodeProto): - # A temporary fix an element-wise Add or Mul which has asymmetric input data -> treat it as a DummyNode. - # TODO support node with asymmetric input data. - if node.op_type in ["Add", "Mul"] and has_asymmetric_input_data(node, self.onnx_model): - in_shape_1, in_shape_2 = get_onnx_input_shapes(node, self.onnx_model) - # In case only the batch dimension is missing. Other cases are not supported for now - if abs(len(in_shape_1) - len(in_shape_2)) == 1: - return AsymmetricSimdParser - else: - return DefaultNodeParser + # # A temporary fix an element-wise Add which has asymmetric input data -> treat it as a DummyNode. + # if node.op_type in ["Add", "Mul"] and has_asymmetric_input_data(node, self.onnx_model): + # in_shape_1, in_shape_2 = get_onnx_input_shapes(node, self.onnx_model) + # # In case only the batch dimension is missing. Other cases are not supported for now + # if abs(len(in_shape_1) - len(in_shape_2)) == 1: + # return AsymmetricSimdParser + # else: + # return DefaultNodeParser parser_class = ONNXModelParser.OP_TYPE_TO_PARSER.get(node.op_type) if not parser_class: diff --git a/stream/parser/onnx/mul.py b/stream/parser/onnx/mul.py new file mode 100644 index 00000000..1d78fced --- /dev/null +++ b/stream/parser/onnx/mul.py @@ -0,0 +1,73 @@ +from typing import Any + +from stream.onnx_utils import get_onnx_input_shapes, get_onnx_output_shapes +from stream.parser.onnx.operator_parser import OnnxComputeOperatorParser + + +class MulParser(OnnxComputeOperatorParser): + """Parses an ONNX operator representing an elementwise operation (Mul) into a ComputationNode.""" + + def get_common_and_broadcast_shape(self): + """This node assumes that the ONNX node has 2 inputs and 1 output. One input shape is identical to the output + shape, and the other shape can broadcast in dimensions. 
+ Returns the common shape (in and out) and the broadcast shape""" + input_shapes = get_onnx_input_shapes(self.node, self.onnx_model) + output_shapes = get_onnx_output_shapes(self.node, self.onnx_model) + + if len(input_shapes) != 2 or len(output_shapes) != 1: + raise NotImplementedError + + output_shape = output_shapes.pop() + if not any(shape == output_shape for shape in input_shapes): + raise NotImplementedError + + input_shape = output_shape + input_shapes.remove(output_shape) + broadcast_shape = input_shapes.pop() + + # e.g. (3,5) * (8,3,5) is ok (broadcast over dim 0), but (3,2) * (8,3,5) is unclear + for broadcast_size, in_size in zip(reversed(broadcast_shape), reversed(input_shape)): + if broadcast_size != in_size and broadcast_size != 1: + raise ValueError + + return input_shape, broadcast_shape + + def get_layer_node_user_format(self, input_shape: list[int], output_shape: list[int]): + """ + Generate the necessary dictionary items required for the LayerNode creation. + """ + common_shape, broadcast_shape = self.get_common_and_broadcast_shape() + + data: dict[str, Any] = {} + data["id"] = self.node_id + data["name"] = self.node.name + data["operator_type"] = self.node.op_type + data["operand_source"] = self.get_operand_source_input_format() + data["operand_precision"] = self.get_operand_precision_user_format() + data["dimension_relations"] = [] + data["loop_sizes"] = common_shape + + match len(common_shape): + case 1: + loop_dims = ["K"] + case 2: + loop_dims = ["D", "K"] + case 3: + loop_dims = ["B", "D", "K"] + case 4: + loop_dims = ["B", "H", "D", "K"] + case _: + raise NotImplementedError + + loop_dims_broadcast = reversed( + [dim for dim, size in zip(reversed(loop_dims), reversed(broadcast_shape)) if size > 1] + ) + + equation_dims_common = "".join([f"[{dim.lower()}]" for dim in loop_dims]) + equation_dims_broadcast = "".join([f"[{dim.lower()}]" for dim in loop_dims_broadcast]) + equation = f"O{equation_dims_common}+=I{equation_dims_common}*W{equation_dims_broadcast}" + + data["loop_dims"] = loop_dims + data["equation"] = equation + + return data diff --git a/stream/parser/onnx/simd.py b/stream/parser/onnx/simd.py index 9dae37d3..32d83b1e 100644 --- a/stream/parser/onnx/simd.py +++ b/stream/parser/onnx/simd.py @@ -6,6 +6,7 @@ class SimdParser(OnnxComputeOperatorParser): """Parses an ONNX operator representing an elementwise operation (simd) into a ComputationNode. e.g. Add, etc. 
+    # TODO this functionality is exactly the same as Mul but without support for broadcast (asymmetric) shapes """ def get_layer_node_user_format(self, input_shape: list[int], output_shape: list[int]): diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index 88479341..36560dad 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -395,6 +395,12 @@ def bounding_box_generator( inclusive_ranges = self.convert_to_inclusive_data_range(node.loop_ranges) dimensions = node.operand_dimensionality_order[operand] bounds = self.get_bounding_box_dimensions(producer, consumer, dimensions, inclusive_ranges) + + # TODO this is a whacky fix + # RTree doesn't accept bounds with only one dimension + if len(bounds) == 2: + bounds = (0, 0) + bounds + yield (i, bounds, None) def get_nb_input_dimensions(self, node: ComputationNode, operand: LayerOperand): @@ -416,7 +422,7 @@ def build_rtree( """ props = index.Property() # We assume all nodes in 'nodes' have identical dimensions - props.dimension = self.get_nb_input_dimensions(nodes[0], operand) + props.dimension = max(self.get_nb_input_dimensions(nodes[0], operand), 2) rtree = index.Index(self.bounding_box_generator(producer, consumer, nodes, operand), properties=props) return rtree diff --git a/stream/stages/generation/tiling_generation.py b/stream/stages/generation/tiling_generation.py index d0a463fb..70b3191d 100644 --- a/stream/stages/generation/tiling_generation.py +++ b/stream/stages/generation/tiling_generation.py @@ -184,7 +184,8 @@ def generate_inter_core_tiling(self, node: ComputationNode) -> TILING_T: if dim in node.layer_dim_sizes and node.layer_dim_sizes[dim] > 1: return [(dim, "*")] - raise ValueError("Unknown what loop dim to split across cores") + # No valid dim found -> just take something + return [(next(iter(node.layer_dim_sizes)), "*")] @staticmethod def split_operator(model: ModelProto, node_name: str, num_splits: int): From 9fa69b4abbe4d21635589eafba19e6d5317da75f Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Thu, 7 Nov 2024 20:28:34 +0100 Subject: [PATCH 05/49] some parsing bugfixes --- iismodel.ilp | 123 +++++++++++++++++++++++++++++++++++++ stream/parser/onnx/conv.py | 13 ++-- stream/parser/onnx/mul.py | 33 ++++++++++++++++++++++++++++++++- 3 files changed, 163 insertions(+), 6 deletions(-) create mode 100644 iismodel.ilp diff --git a/iismodel.ilp b/iismodel.ilp new file mode 100644 index 00000000..813ee161 --- /dev/null +++ b/iismodel.ilp @@ -0,0 +1,123 @@ +\ Model scheduling_copy +\ LP format - for model browsing. Use MPS format to capture full model detail.
+Minimize + +Subject To + R26: core_assignments[Core_5,7000] <= 0 + R65: core_assignments[Core_5,7000] - assignments[Core_5,0,7000] + - assignments[Core_5,1,7000] - assignments[Core_5,2,7000] = 0 + R89: - node_assignments[1,7000] - 2 node_assignments[2,7000] + + slot_per_id[7000] = 0 + R91: - slot_per_id[5000] + slot_per_id[7000] >= 1 + R92: weights_per_core[Core_0] <= 1.6777216e+07 + R93: weights_per_core[Core_1] <= 1.6777216e+07 + R94: weights_per_core[Core_2] <= 1.6777216e+07 + R95: weights_per_core[Core_3] <= 1.6777216e+07 + R96: weights_per_core[Core_4] <= 1.048576e+06 + qc0: [ - k_splits[7000] * node_assignments[1,7000] + + node_assignments[1,7000] * assignments[Core_0,1,7000] + + node_assignments[1,7000] * assignments[Core_1,1,7000] + + node_assignments[1,7000] * assignments[Core_2,1,7000] + + node_assignments[1,7000] * assignments[Core_3,1,7000] + + node_assignments[1,7000] * assignments[Core_4,1,7000] + + node_assignments[1,7000] * assignments[Core_5,1,7000] ] = 0 + qc1: [ - k_splits[7000] * node_assignments[2,7000] + + node_assignments[2,7000] * assignments[Core_0,2,7000] + + node_assignments[2,7000] * assignments[Core_1,2,7000] + + node_assignments[2,7000] * assignments[Core_2,2,7000] + + node_assignments[2,7000] * assignments[Core_3,2,7000] + + node_assignments[2,7000] * assignments[Core_4,2,7000] + + node_assignments[2,7000] * assignments[Core_5,2,7000] ] = 0 + qc2: [ k_splits[7000] * weights_per_split[7000] ] >= 2.12992e+08 + qc3: weights_per_core[Core_0] + [ + - assignments[Core_0,0,2000] * weights_per_split[2000] + - assignments[Core_0,0,5000] * weights_per_split[5000] + - assignments[Core_0,0,7000] * weights_per_split[7000] + - assignments[Core_0,1,2000] * weights_per_split[2000] + - assignments[Core_0,1,5000] * weights_per_split[5000] + - assignments[Core_0,1,7000] * weights_per_split[7000] + - assignments[Core_0,2,2000] * weights_per_split[2000] + - assignments[Core_0,2,5000] * weights_per_split[5000] + - assignments[Core_0,2,7000] * weights_per_split[7000] ] >= 0 + qc4: weights_per_core[Core_1] + [ + - assignments[Core_1,0,2000] * weights_per_split[2000] + - assignments[Core_1,0,5000] * weights_per_split[5000] + - assignments[Core_1,0,7000] * weights_per_split[7000] + - assignments[Core_1,1,2000] * weights_per_split[2000] + - assignments[Core_1,1,5000] * weights_per_split[5000] + - assignments[Core_1,1,7000] * weights_per_split[7000] + - assignments[Core_1,2,2000] * weights_per_split[2000] + - assignments[Core_1,2,5000] * weights_per_split[5000] + - assignments[Core_1,2,7000] * weights_per_split[7000] ] >= 0 + qc5: weights_per_core[Core_2] + [ + - assignments[Core_2,0,2000] * weights_per_split[2000] + - assignments[Core_2,0,5000] * weights_per_split[5000] + - assignments[Core_2,0,7000] * weights_per_split[7000] + - assignments[Core_2,1,2000] * weights_per_split[2000] + - assignments[Core_2,1,5000] * weights_per_split[5000] + - assignments[Core_2,1,7000] * weights_per_split[7000] + - assignments[Core_2,2,2000] * weights_per_split[2000] + - assignments[Core_2,2,5000] * weights_per_split[5000] + - assignments[Core_2,2,7000] * weights_per_split[7000] ] >= 0 + qc6: weights_per_core[Core_3] + [ + - assignments[Core_3,0,2000] * weights_per_split[2000] + - assignments[Core_3,0,5000] * weights_per_split[5000] + - assignments[Core_3,0,7000] * weights_per_split[7000] + - assignments[Core_3,1,2000] * weights_per_split[2000] + - assignments[Core_3,1,5000] * weights_per_split[5000] + - assignments[Core_3,1,7000] * weights_per_split[7000] + - assignments[Core_3,2,2000] * 
weights_per_split[2000] + - assignments[Core_3,2,5000] * weights_per_split[5000] + - assignments[Core_3,2,7000] * weights_per_split[7000] ] >= 0 + qc7: weights_per_core[Core_4] + [ + - assignments[Core_4,0,2000] * weights_per_split[2000] + - assignments[Core_4,0,5000] * weights_per_split[5000] + - assignments[Core_4,0,7000] * weights_per_split[7000] + - assignments[Core_4,1,2000] * weights_per_split[2000] + - assignments[Core_4,1,5000] * weights_per_split[5000] + - assignments[Core_4,1,7000] * weights_per_split[7000] + - assignments[Core_4,2,2000] * weights_per_split[2000] + - assignments[Core_4,2,5000] * weights_per_split[5000] + - assignments[Core_4,2,7000] * weights_per_split[7000] ] >= 0 +Bounds + k_splits[7000] free + slot_per_id[7000] free + weights_per_split[7000] free + weights_per_core[Core_0] free + weights_per_core[Core_1] free + weights_per_core[Core_2] free + weights_per_core[Core_3] free + weights_per_core[Core_4] free +Binaries + core_assignments[Core_5,7000] node_assignments[1,7000] + node_assignments[2,7000] assignments[Core_0,0,2000] + assignments[Core_0,0,5000] assignments[Core_0,0,7000] + assignments[Core_0,1,2000] assignments[Core_0,1,5000] + assignments[Core_0,1,7000] assignments[Core_0,2,2000] + assignments[Core_0,2,5000] assignments[Core_0,2,7000] + assignments[Core_1,0,2000] assignments[Core_1,0,5000] + assignments[Core_1,0,7000] assignments[Core_1,1,2000] + assignments[Core_1,1,5000] assignments[Core_1,1,7000] + assignments[Core_1,2,2000] assignments[Core_1,2,5000] + assignments[Core_1,2,7000] assignments[Core_2,0,2000] + assignments[Core_2,0,5000] assignments[Core_2,0,7000] + assignments[Core_2,1,2000] assignments[Core_2,1,5000] + assignments[Core_2,1,7000] assignments[Core_2,2,2000] + assignments[Core_2,2,5000] assignments[Core_2,2,7000] + assignments[Core_3,0,2000] assignments[Core_3,0,5000] + assignments[Core_3,0,7000] assignments[Core_3,1,2000] + assignments[Core_3,1,5000] assignments[Core_3,1,7000] + assignments[Core_3,2,2000] assignments[Core_3,2,5000] + assignments[Core_3,2,7000] assignments[Core_4,0,2000] + assignments[Core_4,0,5000] assignments[Core_4,0,7000] + assignments[Core_4,1,2000] assignments[Core_4,1,5000] + assignments[Core_4,1,7000] assignments[Core_4,2,2000] + assignments[Core_4,2,5000] assignments[Core_4,2,7000] + assignments[Core_5,0,7000] assignments[Core_5,1,7000] + assignments[Core_5,2,7000] +Generals + k_splits[7000] slot_per_id[5000] slot_per_id[7000] weights_per_split[2000] + weights_per_split[5000] weights_per_split[7000] weights_per_core[Core_0] + weights_per_core[Core_1] weights_per_core[Core_2] weights_per_core[Core_3] + weights_per_core[Core_4] +End diff --git a/stream/parser/onnx/conv.py b/stream/parser/onnx/conv.py index af53bb26..d286e4c5 100644 --- a/stream/parser/onnx/conv.py +++ b/stream/parser/onnx/conv.py @@ -113,11 +113,14 @@ def get_layer_node_user_format( [padding[1], padding[3]], ] - # Remove dims with size 1 - dims_size_1 = [dim for dim, size in zip(data["loop_dims"], data["loop_sizes"]) if size == 1] - data["loop_sizes"] = [s for s in data["loop_sizes"] if s > 1] - data["loop_dims"] = [d for d in data["loop_dims"] if d not in dims_size_1] - for dim in dims_size_1: + # Remove dims with size 1, except batch + dim_sizes_larger_than_1 = { + dim: size for dim, size in zip(data["loop_dims"], data["loop_sizes"]) if size > 1 or dim == "B" + } + dims_with_size_1 = [dim for dim in data["loop_dims"] if dim not in dim_sizes_larger_than_1] + data["loop_dims"] = list(dim_sizes_larger_than_1.keys()) + data["loop_sizes"] = 
list(dim_sizes_larger_than_1.values()) + for dim in dims_with_size_1: data["equation"] = data["equation"].replace(f"[{dim.lower()}]", "") # Filter out loops with size 1 diff --git a/stream/parser/onnx/mul.py b/stream/parser/onnx/mul.py index 1d78fced..8cb43702 100644 --- a/stream/parser/onnx/mul.py +++ b/stream/parser/onnx/mul.py @@ -1,5 +1,7 @@ from typing import Any +from numpy import broadcast, broadcast_shapes + from stream.onnx_utils import get_onnx_input_shapes, get_onnx_output_shapes from stream.parser.onnx.operator_parser import OnnxComputeOperatorParser @@ -32,6 +34,35 @@ def get_common_and_broadcast_shape(self): return input_shape, broadcast_shape + def get_operand_source_input_format(self, shape_of_w: list[int]): + """This method needs more care in this subclass, since the equation assumes that the input with 'broadcast' + shape is always at `W`""" + predecessors = self.get_node_predecessors() + match len(predecessors): + case 1: + # One source operand, one constant + return {"W": self.node_id, "I": predecessors[0]} + case 2: + # Two source operands, none are constant + # Name of the input that corresponds to the W shape + broadcast_input = self.node.input[get_onnx_input_shapes(self.node, self.onnx_model).index(shape_of_w)] + try: + node_id_W = next( + node_id + for node_id, outputs in self.nodes_outputs.items() + if broadcast_input in outputs and node_id in predecessors + ) + node_id_I = ( + node_id_W + if predecessors[0] == predecessors[1] + else next(i for i in predecessors if i != node_id_W) + ) + return {"W": node_id_W, "I": node_id_I} + except StopIteration: + raise ValueError(f"Cannot find correct inputs of {self.node.name}") + case _: + raise ValueError("No more than 2 layer predecessors expected") + def get_layer_node_user_format(self, input_shape: list[int], output_shape: list[int]): """ Generate the necessary dictionary items required for the LayerNode creation. @@ -42,7 +73,7 @@ def get_layer_node_user_format(self, input_shape: list[int], output_shape: list[ data["id"] = self.node_id data["name"] = self.node.name data["operator_type"] = self.node.op_type - data["operand_source"] = self.get_operand_source_input_format() + data["operand_source"] = self.get_operand_source_input_format(shape_of_w=broadcast_shape) data["operand_precision"] = self.get_operand_precision_user_format() data["dimension_relations"] = [] data["loop_sizes"] = common_shape From a38309a511832e75091a51e945dbb1eb1a2e29e0 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Fri, 8 Nov 2024 10:09:04 +0100 Subject: [PATCH 06/49] fix bug in mulparser: don't remove dims of size 1 --- iismodel.ilp | 123 -------------------------------------- stream/parser/onnx/mul.py | 6 +- 2 files changed, 1 insertion(+), 128 deletions(-) delete mode 100644 iismodel.ilp diff --git a/iismodel.ilp b/iismodel.ilp deleted file mode 100644 index 813ee161..00000000 --- a/iismodel.ilp +++ /dev/null @@ -1,123 +0,0 @@ -\ Model scheduling_copy -\ LP format - for model browsing. Use MPS format to capture full model detail.
-Minimize - -Subject To - R26: core_assignments[Core_5,7000] <= 0 - R65: core_assignments[Core_5,7000] - assignments[Core_5,0,7000] - - assignments[Core_5,1,7000] - assignments[Core_5,2,7000] = 0 - R89: - node_assignments[1,7000] - 2 node_assignments[2,7000] - + slot_per_id[7000] = 0 - R91: - slot_per_id[5000] + slot_per_id[7000] >= 1 - R92: weights_per_core[Core_0] <= 1.6777216e+07 - R93: weights_per_core[Core_1] <= 1.6777216e+07 - R94: weights_per_core[Core_2] <= 1.6777216e+07 - R95: weights_per_core[Core_3] <= 1.6777216e+07 - R96: weights_per_core[Core_4] <= 1.048576e+06 - qc0: [ - k_splits[7000] * node_assignments[1,7000] - + node_assignments[1,7000] * assignments[Core_0,1,7000] - + node_assignments[1,7000] * assignments[Core_1,1,7000] - + node_assignments[1,7000] * assignments[Core_2,1,7000] - + node_assignments[1,7000] * assignments[Core_3,1,7000] - + node_assignments[1,7000] * assignments[Core_4,1,7000] - + node_assignments[1,7000] * assignments[Core_5,1,7000] ] = 0 - qc1: [ - k_splits[7000] * node_assignments[2,7000] - + node_assignments[2,7000] * assignments[Core_0,2,7000] - + node_assignments[2,7000] * assignments[Core_1,2,7000] - + node_assignments[2,7000] * assignments[Core_2,2,7000] - + node_assignments[2,7000] * assignments[Core_3,2,7000] - + node_assignments[2,7000] * assignments[Core_4,2,7000] - + node_assignments[2,7000] * assignments[Core_5,2,7000] ] = 0 - qc2: [ k_splits[7000] * weights_per_split[7000] ] >= 2.12992e+08 - qc3: weights_per_core[Core_0] + [ - - assignments[Core_0,0,2000] * weights_per_split[2000] - - assignments[Core_0,0,5000] * weights_per_split[5000] - - assignments[Core_0,0,7000] * weights_per_split[7000] - - assignments[Core_0,1,2000] * weights_per_split[2000] - - assignments[Core_0,1,5000] * weights_per_split[5000] - - assignments[Core_0,1,7000] * weights_per_split[7000] - - assignments[Core_0,2,2000] * weights_per_split[2000] - - assignments[Core_0,2,5000] * weights_per_split[5000] - - assignments[Core_0,2,7000] * weights_per_split[7000] ] >= 0 - qc4: weights_per_core[Core_1] + [ - - assignments[Core_1,0,2000] * weights_per_split[2000] - - assignments[Core_1,0,5000] * weights_per_split[5000] - - assignments[Core_1,0,7000] * weights_per_split[7000] - - assignments[Core_1,1,2000] * weights_per_split[2000] - - assignments[Core_1,1,5000] * weights_per_split[5000] - - assignments[Core_1,1,7000] * weights_per_split[7000] - - assignments[Core_1,2,2000] * weights_per_split[2000] - - assignments[Core_1,2,5000] * weights_per_split[5000] - - assignments[Core_1,2,7000] * weights_per_split[7000] ] >= 0 - qc5: weights_per_core[Core_2] + [ - - assignments[Core_2,0,2000] * weights_per_split[2000] - - assignments[Core_2,0,5000] * weights_per_split[5000] - - assignments[Core_2,0,7000] * weights_per_split[7000] - - assignments[Core_2,1,2000] * weights_per_split[2000] - - assignments[Core_2,1,5000] * weights_per_split[5000] - - assignments[Core_2,1,7000] * weights_per_split[7000] - - assignments[Core_2,2,2000] * weights_per_split[2000] - - assignments[Core_2,2,5000] * weights_per_split[5000] - - assignments[Core_2,2,7000] * weights_per_split[7000] ] >= 0 - qc6: weights_per_core[Core_3] + [ - - assignments[Core_3,0,2000] * weights_per_split[2000] - - assignments[Core_3,0,5000] * weights_per_split[5000] - - assignments[Core_3,0,7000] * weights_per_split[7000] - - assignments[Core_3,1,2000] * weights_per_split[2000] - - assignments[Core_3,1,5000] * weights_per_split[5000] - - assignments[Core_3,1,7000] * weights_per_split[7000] - - assignments[Core_3,2,2000] * 
weights_per_split[2000] - - assignments[Core_3,2,5000] * weights_per_split[5000] - - assignments[Core_3,2,7000] * weights_per_split[7000] ] >= 0 - qc7: weights_per_core[Core_4] + [ - - assignments[Core_4,0,2000] * weights_per_split[2000] - - assignments[Core_4,0,5000] * weights_per_split[5000] - - assignments[Core_4,0,7000] * weights_per_split[7000] - - assignments[Core_4,1,2000] * weights_per_split[2000] - - assignments[Core_4,1,5000] * weights_per_split[5000] - - assignments[Core_4,1,7000] * weights_per_split[7000] - - assignments[Core_4,2,2000] * weights_per_split[2000] - - assignments[Core_4,2,5000] * weights_per_split[5000] - - assignments[Core_4,2,7000] * weights_per_split[7000] ] >= 0 -Bounds - k_splits[7000] free - slot_per_id[7000] free - weights_per_split[7000] free - weights_per_core[Core_0] free - weights_per_core[Core_1] free - weights_per_core[Core_2] free - weights_per_core[Core_3] free - weights_per_core[Core_4] free -Binaries - core_assignments[Core_5,7000] node_assignments[1,7000] - node_assignments[2,7000] assignments[Core_0,0,2000] - assignments[Core_0,0,5000] assignments[Core_0,0,7000] - assignments[Core_0,1,2000] assignments[Core_0,1,5000] - assignments[Core_0,1,7000] assignments[Core_0,2,2000] - assignments[Core_0,2,5000] assignments[Core_0,2,7000] - assignments[Core_1,0,2000] assignments[Core_1,0,5000] - assignments[Core_1,0,7000] assignments[Core_1,1,2000] - assignments[Core_1,1,5000] assignments[Core_1,1,7000] - assignments[Core_1,2,2000] assignments[Core_1,2,5000] - assignments[Core_1,2,7000] assignments[Core_2,0,2000] - assignments[Core_2,0,5000] assignments[Core_2,0,7000] - assignments[Core_2,1,2000] assignments[Core_2,1,5000] - assignments[Core_2,1,7000] assignments[Core_2,2,2000] - assignments[Core_2,2,5000] assignments[Core_2,2,7000] - assignments[Core_3,0,2000] assignments[Core_3,0,5000] - assignments[Core_3,0,7000] assignments[Core_3,1,2000] - assignments[Core_3,1,5000] assignments[Core_3,1,7000] - assignments[Core_3,2,2000] assignments[Core_3,2,5000] - assignments[Core_3,2,7000] assignments[Core_4,0,2000] - assignments[Core_4,0,5000] assignments[Core_4,0,7000] - assignments[Core_4,1,2000] assignments[Core_4,1,5000] - assignments[Core_4,1,7000] assignments[Core_4,2,2000] - assignments[Core_4,2,5000] assignments[Core_4,2,7000] - assignments[Core_5,0,7000] assignments[Core_5,1,7000] - assignments[Core_5,2,7000] -Generals - k_splits[7000] slot_per_id[5000] slot_per_id[7000] weights_per_split[2000] - weights_per_split[5000] weights_per_split[7000] weights_per_core[Core_0] - weights_per_core[Core_1] weights_per_core[Core_2] weights_per_core[Core_3] - weights_per_core[Core_4] -End diff --git a/stream/parser/onnx/mul.py b/stream/parser/onnx/mul.py index 8cb43702..303d0769 100644 --- a/stream/parser/onnx/mul.py +++ b/stream/parser/onnx/mul.py @@ -1,7 +1,5 @@ from typing import Any -from numpy import broadcast, broadcast_shapes - from stream.onnx_utils import get_onnx_input_shapes, get_onnx_output_shapes from stream.parser.onnx.operator_parser import OnnxComputeOperatorParser @@ -90,9 +88,7 @@ def get_layer_node_user_format(self, input_shape: list[int], output_shape: list[ case _: raise NotImplementedError - loop_dims_broadcast = reversed( - [dim for dim, size in zip(reversed(loop_dims), reversed(broadcast_shape)) if size > 1] - ) + loop_dims_broadcast = reversed([dim for dim, _ in zip(reversed(loop_dims), reversed(broadcast_shape))]) equation_dims_common = "".join([f"[{dim.lower()}]" for dim in loop_dims]) equation_dims_broadcast = "".join([f"[{dim.lower()}]" for 
dim in loop_dims_broadcast]) From d1db0d9133bc1b5f960d5a21ef78573f8151ef4a Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Fri, 8 Nov 2024 10:51:00 +0100 Subject: [PATCH 07/49] fix bug in conv: dont remove dims of size 1, except K and C (not present in some 1D convs) --- stream/parser/onnx/conv.py | 31 ++++++++------------- stream/parser/onnx/operator_parser.py | 8 ++++++ stream/workload/computation/pooling_node.py | 4 +++ 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/stream/parser/onnx/conv.py b/stream/parser/onnx/conv.py index d286e4c5..d22dafc0 100644 --- a/stream/parser/onnx/conv.py +++ b/stream/parser/onnx/conv.py @@ -83,8 +83,7 @@ def get_layer_node_user_format( if is_1d_conv: # No FY, OY, IY - data["loop_sizes"] = [B, K, G, OX, C, FX] - data["loop_dims"] = ["B", "K", "G", "OX", "C", "FX"] + loop_size_dict = {"B": B, "K": K, "G": G, "OX": OX, "C": C, "FX": FX} data["equation"] = f"O[b][g][k][ox]+=W[{weight_dim}][c][fx]*I[b][g][c][ix]" data["pr_loop_dims"] = ["IX"] data["pr_loop_sizes"] = [IX] @@ -99,8 +98,7 @@ def get_layer_node_user_format( FY = kernel_shape[1] # TODO is kernel_shape in (FX, FY) format or (FY, FX)? (I assumed the former) IY = input_shape[3] OY = output_shape[3] - data["loop_sizes"] = [B, K, G, OX, C, FX, OY, FY] - data["loop_dims"] = ["B", "K", "G", "OX", "C", "FX", "OY", "FY"] + loop_size_dict = {"B": B, "K": K, "G": G, "OX": OX, "C": C, "FX": FX, "OY": OY, "FY": FY} data["equation"] = f"O[b][g][k][oy][ox]+=W[{weight_dim}][c][fy][fx]*I[b][g][c][iy][ix]" data["pr_loop_dims"] = ["IX", "IY"] data["pr_loop_sizes"] = [IX, IY] @@ -113,22 +111,15 @@ def get_layer_node_user_format( [padding[1], padding[3]], ] - # Remove dims with size 1, except batch - dim_sizes_larger_than_1 = { - dim: size for dim, size in zip(data["loop_dims"], data["loop_sizes"]) if size > 1 or dim == "B" - } - dims_with_size_1 = [dim for dim in data["loop_dims"] if dim not in dim_sizes_larger_than_1] - data["loop_dims"] = list(dim_sizes_larger_than_1.keys()) - data["loop_sizes"] = list(dim_sizes_larger_than_1.values()) - for dim in dims_with_size_1: - data["equation"] = data["equation"].replace(f"[{dim.lower()}]", "") - - # Filter out loops with size 1 - # loop_sizes = {"B": B, "K": K, "G": G, "OX": OX, "OY": OY, "C": C, "FX": FX, "FY": FY} - # dims_with_size_1 = [k for k, v in loop_sizes.items() if v == 1] - # loop_sizes = {k: v for k, v in loop_sizes.items() if v > 1} - # data["loop_dims"] = list(loop_sizes.keys()) - # data["loop_sizes"] = list(loop_sizes.values()) + # Remove C/K if they have size 1 + for dim in ["C", "K"]: + if loop_size_dict[dim] == 1: + del loop_size_dict[dim] + # Remove from equation + data["equation"] = data["equation"].replace(f"[{dim.lower()}]", "") + + data["loop_dims"] = list(loop_size_dict.keys()) + data["loop_sizes"] = list(loop_size_dict.values()) return data diff --git a/stream/parser/onnx/operator_parser.py b/stream/parser/onnx/operator_parser.py index a4345895..78d5e99c 100644 --- a/stream/parser/onnx/operator_parser.py +++ b/stream/parser/onnx/operator_parser.py @@ -68,6 +68,14 @@ def get_operand_precision_user_format(self) -> dict[str, int]: intermediate_output_precision: int = self.get_intermediate_output_precision() predecessors = self.get_node_predecessors() match len(predecessors): + case 0: + # e.g. 
the first node in the network -> assume only one variable input + return { + "W": weight_precision, + "I": act_precision, + "O_final": act_precision, + "O": intermediate_output_precision, + } case 1: # One source operand, one constant return { diff --git a/stream/workload/computation/pooling_node.py b/stream/workload/computation/pooling_node.py index 0c4151c5..74a27169 100644 --- a/stream/workload/computation/pooling_node.py +++ b/stream/workload/computation/pooling_node.py @@ -5,12 +5,15 @@ class PoolingNode(ComputationNode): + """TODO this node can be replaced by instantiating ComputationNode directly""" + def __init__( self, node_id: int, node_name: str, node_attr: LayerNodeAttributes, mapping_attr: InterCoreMappingAttributes, + input_names: list[str] = [], ): super().__init__( node_id=node_id, @@ -18,4 +21,5 @@ def __init__( node_attr=node_attr, mapping_attr=mapping_attr, op_type="pooling", + input_names=input_names, ) From 5485268a95b72be477c1e719a5fed2f42074ebaf Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Fri, 8 Nov 2024 17:41:20 +0100 Subject: [PATCH 08/49] bugfix in reduce_1d: explicitly manage the keep_dim option --- stream/parser/onnx/conv.py | 12 ------ stream/parser/onnx/model.py | 9 +++- stream/parser/onnx/mul.py | 3 ++ stream/parser/onnx/operator_parser.py | 3 ++ stream/parser/onnx/reduce_1d.py | 42 ++++++++++++++++--- .../generation/tiled_workload_generation.py | 1 + 6 files changed, 52 insertions(+), 18 deletions(-) diff --git a/stream/parser/onnx/conv.py b/stream/parser/onnx/conv.py index d22dafc0..1181c176 100644 --- a/stream/parser/onnx/conv.py +++ b/stream/parser/onnx/conv.py @@ -49,18 +49,6 @@ def get_layer_node_user_format( # 1D Conv case: append dimensions of size 1 so equation holds. Conv in FY dimension is_1d_conv = len(kernel_shape) == 1 - # if len(kernel_shape) == 1: - # kernel_shape.insert(0, 1) - # input_shape.append(1) - # output_shape.append(1) - # strides.append(1) - # dilations.append(1) - # assert len(input_shape) == 4 - # assert len(output_shape) == 4 - - # if len(padding) == 2: - # padding = 2 * padding - # Get dimension sizes from input parameters assert input_shape[0] == output_shape[0], "Batch size is different for input and output activations." 
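        # Illustrative example (hypothetical shapes, assuming stride 1 and no padding): a 1D Conv
        # with input (B=1, C=16, IX=128) and kernel_shape=[3] producing K=32 channels sets
        # is_1d_conv=True, so loop_size_dict becomes {"B": 1, "K": 32, "G": 1, "OX": 126, "C": 16, "FX": 3}
        # and the equation template "O[b][g][k][ox]+=W[{weight_dim}][c][fx]*I[b][g][c][ix]" is used
        # (no FY/OY/IY dims); C and K are only dropped from the equation when their size is 1.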
B = output_shape[0] diff --git a/stream/parser/onnx/model.py b/stream/parser/onnx/model.py index 7ec7f96a..7109471a 100644 --- a/stream/parser/onnx/model.py +++ b/stream/parser/onnx/model.py @@ -17,6 +17,7 @@ from stream.parser.onnx.mul import MulParser from stream.parser.onnx.operator_parser import OnnxOperatorParser from stream.parser.onnx.pooling import PoolingParser +from stream.parser.onnx.reduce_1d import Reduce1DParser from stream.parser.onnx.reshape import ReshapeParser from stream.parser.onnx.simd import SimdParser from stream.parser.onnx.slice import SliceParser @@ -34,6 +35,7 @@ class ONNXModelParser: # Map the node's op_type to the corresponding Parser class OP_TYPE_TO_PARSER: dict[str, Type[OnnxOperatorParser]] = { + # General "QLinearConv": ConvParser, "Conv": ConvParser, "MatMul": MatMulParser, @@ -46,10 +48,15 @@ class ONNXModelParser: "Add": MulParser, "Mul": MulParser, "Softmax": SoftmaxParser, - # Activations + # Single-input element-wise + "ReduceMean": Reduce1DParser, "Relu": SimdParser, "Gelu": SimdParser, "Silu": SimdParser, + "Sqrt": SimdParser, + "Div": SimdParser, + "Pow": SimdParser, + "Reciprocal": SimdParser, # Div with 1 as numerator # Dependency propagation "LpNormalization": LpNormalizationParser, "Gather": GatherParser, diff --git a/stream/parser/onnx/mul.py b/stream/parser/onnx/mul.py index 303d0769..612a9406 100644 --- a/stream/parser/onnx/mul.py +++ b/stream/parser/onnx/mul.py @@ -37,6 +37,9 @@ def get_operand_source_input_format(self, shape_of_w: list[int]): shape is always at `W`""" predecessors = self.get_node_predecessors() match len(predecessors): + case 0: + # e.g. first node of graph + return {"W": self.node_id, "I": self.node_id} case 1: # One source operand, one constant return {"W": self.node_id, "I": predecessors[0]} diff --git a/stream/parser/onnx/operator_parser.py b/stream/parser/onnx/operator_parser.py index 78d5e99c..e288d18d 100644 --- a/stream/parser/onnx/operator_parser.py +++ b/stream/parser/onnx/operator_parser.py @@ -40,6 +40,9 @@ def generate_node(self) -> Node: ... def get_operand_source_input_format(self): predecessors = self.get_node_predecessors() match len(predecessors): + case 0: + # e.g. first node of graph + return {"W": self.node_id, "I": self.node_id} case 1: # One source operand, one constant return {"W": self.node_id, "I": predecessors[0]} diff --git a/stream/parser/onnx/reduce_1d.py b/stream/parser/onnx/reduce_1d.py index b34289b0..26f8d7ff 100644 --- a/stream/parser/onnx/reduce_1d.py +++ b/stream/parser/onnx/reduce_1d.py @@ -8,12 +8,36 @@ class Reduce1DParser(OnnxComputeOperatorParser): e.g. 
sum over one row or max of a single row """ + def get_reduction_dim(self, input_shape: list[int], output_shape: list[int]): + """Returns the axis in which the dimension is reduced""" + + # The case that keepdim=True: the reduced dimension is kept with size 1 + if len(input_shape) == len(output_shape): + different_size = [a != b for a, b in zip(input_shape, output_shape)] + if sum(different_size) != 1: + raise ValueError(f"Input and output shapes {input_shape}, {output_shape} should only differ in one dim") + reduction_dim = different_size.index(True) + if output_shape[reduction_dim] != 1: + raise ValueError(f"The reduced dimension at axis {reduction_dim} in {output_shape} is larger than 1") + return reduction_dim + + # Other: assume that the reduction is at axis=-1 + if not all(a == b for a, b in zip(input_shape, output_shape)): + raise NotImplementedError("Reduce node with reduction axis other than -1 not implemented yet.") + reduction_dim = len(input_shape) - 1 # Last dimension + def get_layer_node_user_format(self, input_shape: list[int], output_shape: list[int]): """ Generate the necessary dictionary items required for the LayerNode creation. """ - # TODO check the output shape as well? - assert len(self.get_node_predecessors()) == 1 + if len(self.get_node_predecessors()) != 1: + raise NotImplementedError + + if self.get_reduction_dim(input_shape, output_shape) != len(input_shape) - 1: + raise NotImplementedError("Only reduction in axis=-1 is supported") + + # This is a ONNX node property but can be inferred from the shapes + keep_dim = len(input_shape) == len(output_shape) data: dict[str, Any] = {} data["id"] = self.node_id @@ -24,17 +48,25 @@ def get_layer_node_user_format(self, input_shape: list[int], output_shape: list[ data["dimension_relations"] = [] data["loop_sizes"] = input_shape + # C is always the reduction dim + # If keep_dim: add an arbitrary dim of size 1 + reduced_dim_output = "CR" # C reduced to 1 + eq_part_CR = f"[{reduced_dim_output}]" if keep_dim else "" match len(input_shape): case 2: - data["equation"] = "O[k]+=I[k][c]*W[]" + data["equation"] = f"O[k]{eq_part_CR}+=I[k][c]*W[]" data["loop_dims"] = ["K", "C"] case 3: - data["equation"] = "O[b][k]+=I[b][k][c]*W[]" + data["equation"] = f"O[b][k]{eq_part_CR}+=I[b][k][c]*W[]" data["loop_dims"] = ["B", "K", "C"] case 4: - data["equation"] = "O[b][h][k]+=I[b][h][k][c]*W[]" + data["equation"] = f"O[b][h][k]{eq_part_CR}+=I[b][h][k][c]*W[]" data["loop_dims"] = ["B", "H", "K", "C"] case _: raise NotImplementedError + if keep_dim: + data["loop_dims"] += [reduced_dim_output] + data["loop_sizes"] += [1] + return data diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index 36560dad..5bb772f3 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -376,6 +376,7 @@ def get_bounding_box_dimensions( # where the onnx tensors are always flattened back to 4D (merging the G+C or G+K into one channel dimension) dimensions, loop_ranges = self.flatten_grouped_convolution_ranges(producer, consumer, dimensions, loop_ranges) bounding_box = [loop_ranges[dim] for dim in dimensions] + # TODO can bounding box have size 1? 
Will probably crash if so if not interleaved: bounding_box_flat = tuple([item for sublist in bounding_box for item in sublist]) From a6421e9045cc9281b8f1bf6e67ab8d42ab2a2bb4 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Wed, 18 Dec 2024 11:58:10 +0100 Subject: [PATCH 09/49] support changes for mamba mapping --- stream/onnx_utils.py | 6 ++++- stream/parser/accelerator_validator.py | 2 +- stream/parser/onnx/model.py | 2 ++ stream/parser/onnx/reduce_1d.py | 21 ++++++++++++---- .../zigzag_core_mapping_estimation.py | 25 +++++++++++++++++++ .../generation/tiled_workload_generation.py | 1 + stream/utils.py | 8 +----- stream/visualization/memory_usage.py | 4 ++- 8 files changed, 54 insertions(+), 15 deletions(-) diff --git a/stream/onnx_utils.py b/stream/onnx_utils.py index dfa9b434..3f0371cf 100644 --- a/stream/onnx_utils.py +++ b/stream/onnx_utils.py @@ -80,8 +80,12 @@ def get_constant_tensor_int(onnx_model: ModelProto, constant_output_name: str): def get_axis_attribute(node: NodeProto): """Find the value of the axis associated with this ONNX node""" ATTR_NAME = "axis" + DEFAULT = -1 - value = get_attribute_as_ints(node, ATTR_NAME) + try: + value = get_attribute_as_ints(node, ATTR_NAME) + except ValueError: + return DEFAULT if not isinstance(value, int): raise ValueError(f"{ATTR_NAME} attribute as list of ints not supported") return value diff --git a/stream/parser/accelerator_validator.py b/stream/parser/accelerator_validator.py index 573fd842..d8d2a7ff 100644 --- a/stream/parser/accelerator_validator.py +++ b/stream/parser/accelerator_validator.py @@ -15,7 +15,7 @@ class AcceleratorValidator: INPUT_DIR_LOCATION = "stream/inputs/" GRAPH_TYPES = ["2d_mesh", "bus"] FILENAME_REGEX = r"^(?:[a-zA-Z0-9_\-]+|[a-zA-Z0-9_\-\///]+(\.yaml|\.yml))$" - CORE_IDS_REGEX = r"^\d+\s*,\s*\d+$" + CORE_IDS_REGEX = r"^(\d+\s*,\s*)+\d+$" SCHEMA: dict[str, Any] = { "name": {"type": "string", "required": True}, diff --git a/stream/parser/onnx/model.py b/stream/parser/onnx/model.py index 7109471a..925143c5 100644 --- a/stream/parser/onnx/model.py +++ b/stream/parser/onnx/model.py @@ -49,10 +49,12 @@ class ONNXModelParser: "Mul": MulParser, "Softmax": SoftmaxParser, # Single-input element-wise + "Exp": SimdParser, "ReduceMean": Reduce1DParser, "Relu": SimdParser, "Gelu": SimdParser, "Silu": SimdParser, + "Sigmoid": SimdParser, "Sqrt": SimdParser, "Div": SimdParser, "Pow": SimdParser, diff --git a/stream/parser/onnx/reduce_1d.py b/stream/parser/onnx/reduce_1d.py index 26f8d7ff..cef5bfe7 100644 --- a/stream/parser/onnx/reduce_1d.py +++ b/stream/parser/onnx/reduce_1d.py @@ -14,11 +14,22 @@ def get_reduction_dim(self, input_shape: list[int], output_shape: list[int]): # The case that keepdim=True: the reduced dimension is kept with size 1 if len(input_shape) == len(output_shape): different_size = [a != b for a, b in zip(input_shape, output_shape)] - if sum(different_size) != 1: - raise ValueError(f"Input and output shapes {input_shape}, {output_shape} should only differ in one dim") - reduction_dim = different_size.index(True) - if output_shape[reduction_dim] != 1: - raise ValueError(f"The reduced dimension at axis {reduction_dim} in {output_shape} is larger than 1") + match sum(different_size): + case 0: + # Input and output size are the same: can happen with when a Reduce1D node is inferred but + # not present in ONNX -> default to -1 + reduction_dim = len(input_shape) - 1 + case 1: + reduction_dim = different_size.index(True) + if output_shape[reduction_dim] != 1: + raise ValueError( + f"The reduced dimension at axis 
{reduction_dim} in {output_shape} is larger than 1" + ) + case _: + # More than 1 dimension has different size + raise ValueError( + f"Input and output shapes {input_shape}, {output_shape} should only differ in one dim" + ) return reduction_dim # Other: assume that the reduction is at axis=-1 diff --git a/stream/stages/estimation/zigzag_core_mapping_estimation.py b/stream/stages/estimation/zigzag_core_mapping_estimation.py index ba6d3860..0e0c7d91 100644 --- a/stream/stages/estimation/zigzag_core_mapping_estimation.py +++ b/stream/stages/estimation/zigzag_core_mapping_estimation.py @@ -103,6 +103,12 @@ def run(self): # It's possible this node might not fully fit within the core's top level memories. # If so, we update the core too_large_operands_for_cme = self.check_core_capacity_for_node(core, node_duplicate) + # ! --- ensure all constant weights are accessed via blocking behavior i.s.o. transfer + for layer_op in node.constant_operands: + mem_op = node.memory_operand_links.layer_to_mem_op(layer_op) + if mem_op not in too_large_operands_for_cme: + too_large_operands_for_cme.append(mem_op) + # ! --- node_duplicate.set_chosen_core_allocation(core_id) # Attempt to override the node's spatial mapping based on the core's dataflow @@ -119,6 +125,8 @@ def run(self): answers = main_stage.run() assert len(answers) == 1, "ZigZagCoreMappingEstimationStage's subflow returned more than one CME" cme: CostModelEvaluation = answers[0][0] # type: ignore + cme = self.increase_cc_per_op(cme, node.type) + node_duplicate.set_chosen_core_allocation(None) # Reset the node's chosen core allocation self.cost_lut.add_cme(node, core, cme, allow_overwrite=False) self.cost_lut.save() @@ -134,6 +142,23 @@ def run(self): for cme, extra_info in sub_stage.run(): yield cme, extra_info + def increase_cc_per_op(self, cme: CostModelEvaluation, op_type: str): + match op_type: + case "silu": + factor = 4 + case "sigmoid": + factor = 4 + case "exp": + factor = 4 + case _: + factor = 1 + + if factor > 1: + logger.warning(f"Setting cycles per mac of {op_type} node to {factor}") + + cme.calc_overall_latency(cycles_per_mac=factor) + return cme + def visualize_cost_lut(self): # Get the scale factors scale_factors = { diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index 5bb772f3..822a2e08 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -563,6 +563,7 @@ def get_inter_edges_numpy( producer: ComputationNode, consumer: ComputationNode, ): + numpy_tensors: dict[ComputationNode, dict[LayerOperand, NodeTensor]] = {} all_inter_edges: list[tuple[ComputationNode, ComputationNode, dict[str, Any]]] = [] diff --git a/stream/utils.py b/stream/utils.py index 06328b57..a4994134 100644 --- a/stream/utils.py +++ b/stream/utils.py @@ -89,13 +89,7 @@ def get_unique_nodes(workload: "ComputationNodeWorkload") -> list["ComputationNo """! 
Get the unique nodes from a workload.""" unique_nodes: list[ComputationNode] = [] for node in workload.node_list: - equal_nodes = list( - ( - unique_node - for unique_node in unique_nodes - if node.has_same_performance(unique_node) and node.group == unique_node.group - ) - ) + equal_nodes = list((unique_node for unique_node in unique_nodes if node.has_same_performance(unique_node))) if not equal_nodes: unique_nodes.append(node) return unique_nodes diff --git a/stream/visualization/memory_usage.py b/stream/visualization/memory_usage.py index 3ebf0870..3b69f1c7 100644 --- a/stream/visualization/memory_usage.py +++ b/stream/visualization/memory_usage.py @@ -111,7 +111,9 @@ def plot_memory_usage( peak_usages_bytes[ti] = peak_usage_bytes if not peak_usage_bytes > 0: continue # Happens for weight memory on pooling core because it's encoded as zero bit - assert min(stored_bytes) >= 0, f"We used negative amount of memory on top instance {ti}." + # assert min(stored_bytes) >= 0, f"We used negative amount of memory on top instance {ti}." + if min(stored_bytes) < 0: + logger.warn(f"We used negative amount of memory on top instance {ti}.") ax.plot(timesteps, stored_bytes, drawstyle="steps-post") # Plot the timesteps and used memory through time ax.axhline( y=peak_usage_bytes, From e45c6746d35b45b342f0843cd02c636d2683096a Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Wed, 18 Dec 2024 18:02:21 +0100 Subject: [PATCH 10/49] CommunicationLinkEvent can only contain 1 tensor --- stream/cost_model/communication_manager.py | 90 ++++++++++--------- .../architecture/noc/communication_link.py | 39 ++++---- .../genetic_algorithm/fitness_evaluator.py | 4 +- .../set_fixed_allocation_performance.py | 14 ++- stream/utils.py | 16 +++- stream/visualization/schedule.py | 21 ++--- stream/workload/node.py | 12 +-- 7 files changed, 114 insertions(+), 82 deletions(-) diff --git a/stream/cost_model/communication_manager.py b/stream/cost_model/communication_manager.py index e9ba79fb..94793be5 100644 --- a/stream/cost_model/communication_manager.py +++ b/stream/cost_model/communication_manager.py @@ -1,5 +1,4 @@ import itertools -from math import ceil from typing import TYPE_CHECKING, Any from zigzag.datatypes import Constants, MemoryOperand @@ -43,25 +42,25 @@ class CommunicationLinkEvent: - a type, e.g. 
"transfer" or "block" - a start time - an end time - - a list of tensors relevant for the event: + - a tensors relevant for the event: * the tensor being transferred - * the tensor(s) for which we are blocking + * the tensor for which we are blocking - an activity: * the bits per clock cycle used of the link bandwidth """ - def __init__(self, type: str, start: int, end: int, tensors: list[Tensor], energy: float, activity: float) -> None: + def __init__(self, type: str, start: int, end: int, tensor: Tensor, energy: float, activity: float) -> None: self.type = type self.start = start self.end = end self.duration = self.end - self.start - self.tensors = tensors + self.tensor = tensor self.energy = energy self.activity = activity def __str__(self) -> str: return ( - f"CommunicationLinkEvent(type={self.type}, start={self.start}, end={self.end}, tensors={self.tensors}, " + f"CommunicationLinkEvent(type={self.type}, start={self.start}, end={self.end}, tensor={self.tensor}, " f"energy={self.energy:.2e}, activity={self.activity:.2f})" ) @@ -69,12 +68,10 @@ def __repr__(self) -> str: return str(self) def get_operands(self): - return [tensor.layer_operand for tensor in self.tensors] + return self.tensor.layer_operand def get_origin(self): - origins = [tensor.origin for tensor in self.tensors] - assert all([origin == origins[0] for origin in origins]) - return origins[0] + return self.tensor.origin class CommunicationManager: @@ -139,8 +136,8 @@ def update_links( sender (Core): The sending core. receiver (Core): The receiving core. receiver_memory_operand (str): The memory operand storing the tensor on the receiving end of the transfer. - start_timestep (int): The timestep at which to start the data transfer. - duration (int): Duration of the transfer + start_timestep: The timestep at which to start the data transfer. + duration: Duration of the transfer Returns: tuple: A tuple containing the link and memory energy costs associated with this transfer. @@ -159,7 +156,7 @@ def update_links( type="transfer", start=start_timestep, end=end_timestep, - tensors=[tensor], + tensor=tensor, energy=duration * link.unit_energy_cost, activity=link.bandwidth, ) @@ -192,52 +189,63 @@ def block_offchip_links( core_id: int, start_timestep: int, duration: int, - cn: ComputationNode, + node: ComputationNode, ) -> int: """Block the communication link between 'core' and the offchip core starting at timestep 'start_timestep' for duration 'duration'. Args: - too_large_operands (list): List of insufficient memory operands. This decides which links to block - core_id (int): The core id. - start_timestep (int): The ideal start timestep of the blocking. - duration (int): The duration of the blocking in cycles. - cn (ComputationNode): The computational node for which we are blocking the links. + too_large_operands: List of insufficient memory operands. This decides which links to block + core_id: The core id. + start_timestep: The ideal start timestep of the blocking. + duration: The duration of the blocking in cycles. + node: The computational node for which we are blocking the links. 
""" + + def get_inst_bw(op: MemoryOperand) -> int: + assert op in node.offchip_bandwidth_per_op + if op == Constants.OUTPUT_MEM_OP: + return node.offchip_bandwidth_per_op[op].wr_in_by_low + return node.offchip_bandwidth_per_op[op].rd_out_to_low + if not too_large_operands: return start_timestep - links_to_block: dict["CommunicationLink", int] = {} core = self.accelerator.get_core(core_id) assert self.accelerator.offchip_core_id is not None, "Off-chip core id is not set." offchip_core = self.accelerator.get_core(self.accelerator.offchip_core_id) tensors_per_link: dict["CommunicationLink", list[Tensor]] = {} + + # Output operand if Constants.OUTPUT_MEM_OP in too_large_operands: links_to_offchip = set(self.get_links_for_pair(core, offchip_core)) - req_bw_to_offchip = cn.offchip_bw.wr_in_by_low + for link in links_to_offchip: - links_to_block[link] = links_to_block.get(link, 0) + req_bw_to_offchip - # Add tensors for which this link will be blocked - if not tensors_per_link.get(link): - tensors_per_link[link] = [] - tensors_per_link[link].append(cn.operand_tensors[Constants.OUTPUT_LAYER_OP]) + tensors_per_link[link] = [(node.operand_tensors[Constants.OUTPUT_LAYER_OP])] + + # Input operands non_output_mem_ops = [op for op in too_large_operands if op != Constants.OUTPUT_MEM_OP] if non_output_mem_ops: links_from_offchip = set(self.get_links_for_pair(offchip_core, core)) - req_bw_from_offchip = cn.offchip_bw.rd_out_to_low for link in links_from_offchip: - links_to_block[link] = links_to_block.get(link, 0) + req_bw_from_offchip - # Add tensors for which this link will be blocked - if not tensors_per_link.get(link): - tensors_per_link[link] = [] - tensors_per_link[link] += [ - cn.operand_tensors[cn.memory_operand_links.mem_to_layer_op(op)] for op in non_output_mem_ops + tensors_per_link[link] = [ + node.operand_tensors[node.memory_operand_links.mem_to_layer_op(op)] for op in non_output_mem_ops ] + + # Sum the required bandwidth for all tensors on each link + total_required_bw_per_link = { + link: sum([get_inst_bw(tensor.memory_operand) for tensor in tensors]) + for link, tensors in tensors_per_link.items() + } + # Get idle window of the involved links - block_start = self.get_links_idle_window(links_to_block, start_timestep, duration, tensors_per_link) - # Block them - for link, req_bw in links_to_block.items(): - req_bw = ceil(req_bw) - link.block(block_start, duration, tensors_per_link[link], activity=req_bw) + block_start = self.get_links_idle_window(total_required_bw_per_link, start_timestep, duration, tensors_per_link) + + # # Block them + for link, tensors in tensors_per_link.items(): + operands = [tensor.memory_operand for tensor in tensors] + bandwidths = [get_inst_bw(op) for op in operands] + link.block(block_start, duration, tensors, bandwidth_per_tensor=bandwidths) + return block_start def get_links_idle_window( @@ -253,9 +261,9 @@ def get_links_idle_window( Args: links (dict): CommunicationLinks involved in the transfer and their required bandwidth. - best_case_start (int): The best case start timestep of the transfer. - duration (int): The required duration of the idle window. - tensors (list): The tensors to be transferred. Used to broadcast from previous transfer. + best_case_start: The best case start timestep of the transfer. + duration: The required duration of the idle window. + tensors: The tensors to be transferred. Used to broadcast from previous transfer. 
""" assert len(links) > 0 idle_intersections: list[tuple[int, int]] = [] diff --git a/stream/hardware/architecture/noc/communication_link.py b/stream/hardware/architecture/noc/communication_link.py index 57280f3f..c89d91de 100644 --- a/stream/hardware/architecture/noc/communication_link.py +++ b/stream/hardware/architecture/noc/communication_link.py @@ -107,7 +107,7 @@ def block( start: int, duration: int, tensors: list["Tensor"], - activity: int = 100, + bandwidth_per_tensor: list[int], ): """Block this communication link from start timestep for a given duration. @@ -117,17 +117,19 @@ def block( tensors: A list of tensors for which we are blocking the link. activity: The percentage of the link bandwidth used """ + assert len(tensors) == len(bandwidth_per_tensor) end = start + duration - # Create a CLEvent - event = CommunicationLinkEvent( - type="block", - start=start, - end=end, - tensors=tensors, - energy=tensors[0].origin.get_offchip_energy(), - activity=activity, - ) - self.update_activity(event) + # Create a CLEvent per tensor + for tensor, bandwidth in zip(tensors, bandwidth_per_tensor): + event = CommunicationLinkEvent( + type="block", + start=start, + end=end, + tensor=tensor, + energy=tensor.origin.get_offchip_energy(), + activity=bandwidth, + ) + self.update_activity(event) return def update_activity(self, event: CommunicationLinkEvent): @@ -136,11 +138,12 @@ def update_activity(self, event: CommunicationLinkEvent): activity = event.activity if start == end: return + # Check if this is a duplicate event for broadcast - for tensor in event.tensors: - previous_events = self.tensors.get(tensor, []) - if any((previous_event.start == event.start for previous_event in previous_events)): - return + previous_events = self.tensors.get(event.tensor, []) + if any((previous_event.start == event.start for previous_event in previous_events)): + return + idx_start = np.searchsorted(self.active_ts, start) if self.active_ts[idx_start] == start: self.active_deltas[idx_start] += activity @@ -153,9 +156,9 @@ def update_activity(self, event: CommunicationLinkEvent): else: self.active_ts = np.insert(self.active_ts, idx_end, end) self.active_deltas = np.insert(self.active_deltas, idx_end, -activity) - # Track that this link has transferred the tensors of this event for future broadcasts - for tensor in event.tensors: - self.tensors[tensor] = self.tensors.get(tensor, []) + [event] + # Track that this link has transferred the tensors of this event for future broadcasts + + self.tensors[event.tensor] = self.tensors.get(event.tensor, []) + [event] self.events.append(event) def get_idle_window(self, activity: float, duration: int, earliest_t: int, tensors: list["Tensor"]): diff --git a/stream/opt/allocation/genetic_algorithm/fitness_evaluator.py b/stream/opt/allocation/genetic_algorithm/fitness_evaluator.py index 59c9d814..f2d84dcf 100644 --- a/stream/opt/allocation/genetic_algorithm/fitness_evaluator.py +++ b/stream/opt/allocation/genetic_algorithm/fitness_evaluator.py @@ -3,7 +3,7 @@ from stream.cost_model.cost_model import StreamCostModelEvaluation from stream.hardware.architecture.accelerator import Accelerator -from stream.utils import CostModelEvaluationLUT, get_required_offchip_bandwidth, get_too_large_operands +from stream.utils import CostModelEvaluationLUT, get_too_large_operands, get_total_required_offchip_bandwidth from stream.workload.computation.computation_node import ComputationNode from stream.workload.onnx_workload import ComputationNodeWorkload @@ -100,7 +100,7 @@ def 
set_node_core_allocations(self, core_allocations: list[int]): offchip_energy += layer_operand_offchip_energy onchip_energy -= layer_operand_offchip_energy # If there was offchip memory added for too_large_operands, get the offchip bandwidth - required_offchip_bandwidth = get_required_offchip_bandwidth(cme, too_large_operands) + required_offchip_bandwidth = get_total_required_offchip_bandwidth(cme, too_large_operands) node.set_onchip_energy(onchip_energy) node.set_offchip_energy(offchip_energy) node.set_runtime(int(latency)) diff --git a/stream/stages/set_fixed_allocation_performance.py b/stream/stages/set_fixed_allocation_performance.py index 89388330..1454a916 100644 --- a/stream/stages/set_fixed_allocation_performance.py +++ b/stream/stages/set_fixed_allocation_performance.py @@ -3,10 +3,15 @@ from zigzag.cost_model.cost_model import CostModelEvaluation from zigzag.datatypes import MemoryOperand +from zigzag.mapping.data_movement import MemoryAccesses from stream.hardware.architecture.accelerator import Accelerator from stream.stages.stage import Stage, StageCallable -from stream.utils import CostModelEvaluationLUT, get_required_offchip_bandwidth, get_too_large_operands +from stream.utils import ( + CostModelEvaluationLUT, + get_too_large_operands, + get_top_level_inst_bandwidth, +) from stream.workload.computation.computation_node import ComputationNode from stream.workload.onnx_workload import ComputationNodeWorkload @@ -62,11 +67,14 @@ def set_fixed_allocation_performance(self): latency = getattr(cme, self.latency_attr) too_large_operands = get_too_large_operands(cme, self.accelerator, core_id=core_id) onchip_energy, offchip_energy = self.get_energy_distribution(cme, too_large_operands) + # Get the required offchip bandwidth during the execution of the node for all directions - offchip_bandwidth = get_required_offchip_bandwidth(cme, too_large_operands) + offchip_bandwidth_per_op: dict[MemoryOperand, MemoryAccesses] = { + mem_op: get_top_level_inst_bandwidth(cme, mem_op) for mem_op in too_large_operands + } self.set_hw_performance_node(node, onchip_energy, offchip_energy, latency, core_id) node.set_too_large_operands(too_large_operands.copy()) - node.set_offchip_bandwidth(offchip_bandwidth) + node.set_offchip_bandwidth(offchip_bandwidth_per_op) def get_energy_distribution( self, cme: CostModelEvaluation, too_large_operands: list[MemoryOperand] diff --git a/stream/utils.py b/stream/utils.py index a4994134..d9d8591a 100644 --- a/stream/utils.py +++ b/stream/utils.py @@ -6,7 +6,7 @@ from numpy.typing import NDArray from zigzag.cost_model.cost_model import CostModelEvaluation from zigzag.datatypes import MemoryOperand -from zigzag.mapping.data_movement import FourWayDataMoving +from zigzag.mapping.data_movement import FourWayDataMoving, MemoryAccesses from stream.hardware.architecture.core import Core from stream.workload.mapping import TILING_T @@ -95,7 +95,19 @@ def get_unique_nodes(workload: "ComputationNodeWorkload") -> list["ComputationNo return unique_nodes -def get_required_offchip_bandwidth( +def get_top_level_inst_bandwidth(cme: CostModelEvaluation, mem_op: MemoryOperand) -> MemoryAccesses: + """Given a cost model evaluation and a memory instance, compute the memory's total instantaneous bandwidth + required throughout the execution of the layer that corresponds to this CME. Returns empty bandwidth + requirements if the given memory instance is not included in this CME's memory hierarchy. 
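    Illustrative usage (hypothetical operand): for a CME whose weights do not fit on-chip,
        bw = get_top_level_inst_bandwidth(cme, MemoryOperand("I2"))
    returns a MemoryAccesses object whose rd_out_to_low field gives the instantaneous read
    bandwidth the top-level memory of that operand must provide while this layer runs.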
+ NOTE: this function is used in Stream + """ + assert mem_op in cme.mem_hierarchy_dict + layer_op = cme.layer.memory_operand_links.mem_to_layer_op(mem_op) + inst_bw_4way = cme.mapping.unit_mem_data_movement[layer_op][-1].req_mem_bw_inst + return inst_bw_4way + + +def get_total_required_offchip_bandwidth( cme: CostModelEvaluation, too_large_operands: list[MemoryOperand] ) -> FourWayDataMoving: if not too_large_operands: diff --git a/stream/visualization/schedule.py b/stream/visualization/schedule.py index a48567da..b5a55fb4 100644 --- a/stream/visualization/schedule.py +++ b/stream/visualization/schedule.py @@ -15,6 +15,7 @@ from plotly.express.colors import sample_colorscale from zigzag.datatypes import LayerOperand +from stream.hardware.architecture.noc.communication_link import CommunicationLink from stream.utils import CostModelEvaluationLUT from stream.workload.computation.computation_node import ComputationNode from stream.workload.tensor import Tensor @@ -182,7 +183,7 @@ def plot_timeline_brokenaxes( if plot_data_transfer: # First get all used and unique communication links - used_cl_collect = [] + used_cl_collect: list[CommunicationLink] = [] for ky, pair_link in accelerator.communication_manager.pair_links.items(): if pair_link: for link in pair_link: @@ -201,13 +202,13 @@ def plot_timeline_brokenaxes( start = event.start end = event.end runtime = end - start - tensors = event.tensors - weight_transfer = task_type.lower() == "transfer" and tensors[0].layer_operand in [ + tensor = event.tensor + weight_transfer = task_type.lower() == "transfer" and tensor.layer_operand in [ LayerOperand("W"), LayerOperand("B"), ] - layer_id = tensors[0].origin.id - node_id = tensors[0].origin.sub_id + layer_id = tensor.origin.id + node_id = tensor.origin.sub_id if layer_id not in layer_ids_seen: color = next(layer_colors) colors_seen.append(color) @@ -401,8 +402,8 @@ def add_dependencies(fig, scme, colors, layer_ids): def get_communication_dicts(scme): dicts = [] - accelerator = scme.accelerator - active_links = set() + accelerator: Accelerator = scme.accelerator + active_links: set[CommunicationLink] = set() for ky, link_pair in accelerator.communication_manager.pair_links.items(): if link_pair: for link in link_pair: @@ -418,8 +419,8 @@ def get_communication_dicts(scme): end = event.end runtime = end - start energy = event.energy - tensors = event.tensors - node = event.tensors[0].origin + tensor = event.tensor + node = tensor.origin layer_id = node.id activity = event.activity if runtime == 0: @@ -433,7 +434,7 @@ def get_communication_dicts(scme): Resource=resource, Layer=layer_id, Runtime=runtime, - Tensors=tensors, + Tensors=[tensor], Type=task_type, Activity=activity, Energy=energy, diff --git a/stream/workload/node.py b/stream/workload/node.py index c720ef88..68eb64f1 100644 --- a/stream/workload/node.py +++ b/stream/workload/node.py @@ -1,6 +1,7 @@ from abc import ABCMeta -from zigzag.mapping.data_movement import FourWayDataMoving +from zigzag.datatypes import MemoryOperand +from zigzag.mapping.data_movement import MemoryAccesses from zigzag.workload.layer_node_abc import LayerNodeABC @@ -9,6 +10,8 @@ class Node(LayerNodeABC, metaclass=ABCMeta): Example: ComputationNode, etc. 
""" + offchip_bandwidth_per_op: dict[MemoryOperand, MemoryAccesses] + def __init__( self, node_id: int, @@ -53,9 +56,6 @@ def __init__( # number of data (in bits) only this node produces (not produced by any other node) self.data_produced_unique = 0 - # will be set together with the core allocation - self.offchip_bw = FourWayDataMoving(0, 0, 0, 0) - def get_total_energy(self) -> float: """Get the total energy of running this node, including off-chip energy.""" return self.onchip_energy + self.offchip_energy @@ -134,8 +134,8 @@ def has_end(self) -> bool: """ return self.end is not None - def set_offchip_bandwidth(self, offchip_bw: FourWayDataMoving): - self.offchip_bw = offchip_bw + def set_offchip_bandwidth(self, offchip_bandwidth_per_op: dict[MemoryOperand, MemoryAccesses]): + self.offchip_bandwidth_per_op = offchip_bandwidth_per_op def __str__(self): return self.name From 78039156bc0acc5ad6711ef9ba2be0773f36b1ad Mon Sep 17 00:00:00 2001 From: Arne Symons Date: Wed, 18 Dec 2024 23:54:20 +0100 Subject: [PATCH 11/49] enable true layer-by-layer scheduling depending on scheduling_order --- stream/cost_model/scheduler.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/stream/cost_model/scheduler.py b/stream/cost_model/scheduler.py index 095d43c8..c1089c85 100644 --- a/stream/cost_model/scheduler.py +++ b/stream/cost_model/scheduler.py @@ -221,6 +221,31 @@ def check_for_removal( ) +def sync_cores_idle_from( + cores_idle_from: dict[int, int], + G: ComputationNodeWorkload, + best_candidate: ComputationNode, + scheduling_order: list[tuple[int, int]], +): + """ + Sync the cores_idle_from dict values if the best candidate is the first node of a layer and we detect layer-by-layer execution. + The layer-by-layer execution is detected through the scheduling_order. 
+ """ + # Get the predecessor ids of the best_candidate from the workload graph G + predecessor_ids = [pred.id for pred in G.predecessors(best_candidate) if pred.id != best_candidate.id] + predecessor_idxs = [i for i in range(len(scheduling_order)) if scheduling_order[i][0] in predecessor_ids] + + best_candidate_idx = scheduling_order.index((best_candidate.id, best_candidate.sub_id)) + if scheduling_order[best_candidate_idx - 1][0] in predecessor_ids and all( + (i < best_candidate_idx for i in predecessor_idxs) + ): + # If the best_candidate is the first node of a layer and all nodes of predecessor layers have been scheduled + # Sync the cores_idle_from dict + max_idle_time = max(cores_idle_from.values()) + for core_id in cores_idle_from: + cores_idle_from[core_id] = max_idle_time + + def schedule_graph( G: ComputationNodeWorkload, accelerator: "Accelerator", @@ -301,7 +326,8 @@ def schedule_graph( while not done: # Get the best candidate given the selection priority best_candidate, preds_end = get_best_candidate(candidates, scheduling_order) - + # Sync cores_idle_from variable for layer-by-layer scheduling + sync_cores_idle_from(cores_idle_from, G, best_candidate, scheduling_order) # Get the core this candidate will be scheduled on core_id = best_candidate.chosen_core_allocation assert core_id is not None From 52b42748736f618ded785e3df880b7e6d9aa785d Mon Sep 17 00:00:00 2001 From: Arne Symons Date: Wed, 18 Dec 2024 23:55:56 +0100 Subject: [PATCH 12/49] use latency_attr to determine latency value to use from cme; add perfetto waco visualization --- .../constraint_optimization/allocation.py | 9 +- .../constraint_optimization/utils.py | 103 +++++++++++- .../genetic_algorithm/fitness_evaluator.py | 4 +- .../constraint_optimization_allocation.py | 19 ++- .../genetic_algorithm_allocation.py | 3 + .../set_fixed_allocation_performance.py | 2 +- .../visualization/constraint_optimization.py | 151 ++++++++++-------- 7 files changed, 220 insertions(+), 71 deletions(-) diff --git a/stream/opt/allocation/constraint_optimization/allocation.py b/stream/opt/allocation/constraint_optimization/allocation.py index 5a59ea13..37b89247 100644 --- a/stream/opt/allocation/constraint_optimization/allocation.py +++ b/stream/opt/allocation/constraint_optimization/allocation.py @@ -26,6 +26,7 @@ def get_optimal_allocations( iterations: int, gap: float = 0.5, time_limit: int = 600, + latency_attr: str = "latency_total1", ) -> ALLOCATION_T: core_ids = sorted((core.id for core in accelerator.cores.node_list if core.id != accelerator.offchip_core_id)) core_capacities = get_core_capacities(accelerator, MemoryOperand("I2"), core_ids) @@ -34,7 +35,13 @@ def get_optimal_allocations( ids = convert_ids(nodes) latencies, possible_allocation_splits = get_latencies( - nodes, core_ids, accelerator, cost_lut, impossible_lat=0, ids=ids + nodes, + core_ids, + accelerator, + cost_lut, + impossible_lat=0, + ids=ids, + latency_attr=latency_attr, ) energies = get_energies(nodes, core_ids, accelerator, cost_lut, impossible_energy=0, ids=ids) output_operand = LayerOperand("O") diff --git a/stream/opt/allocation/constraint_optimization/utils.py b/stream/opt/allocation/constraint_optimization/utils.py index 263e62a2..96da754e 100644 --- a/stream/opt/allocation/constraint_optimization/utils.py +++ b/stream/opt/allocation/constraint_optimization/utils.py @@ -3,6 +3,7 @@ from zigzag.datatypes import LayerDim, LayerOperand, UnrollFactor from stream.hardware.architecture.accelerator import Accelerator +from stream.hardware.architecture.core 
import Core from stream.utils import CostModelEvaluationLUT from stream.workload.computation.computation_node import ComputationNode @@ -50,6 +51,7 @@ def get_latencies( cost_lut: CostModelEvaluationLUT, impossible_lat: float = 1e11, ids: dict[ComputationNode, int] = {}, + latency_attr: str = "latency_total1", ) -> tuple[dict[tuple[int, str, int], int], dict]: if not ids: ids = {node: node.id for node in nodes} @@ -74,7 +76,7 @@ def get_latencies( inter_core_tiling_dims = [layer_dim for layer_dim, _ in node.inter_core_tiling] inter_core_tiling_size = get_loop_size(temporal_loops, inter_core_tiling_dims) inter_core_tiling_sizes[(node_id, core_name)] = inter_core_tiling_size - lat = cme.latency_total1 + lat = getattr(cme, latency_attr) possible_allocations[node_id].append(core_name) except ValueError: lat = impossible_lat @@ -130,3 +132,102 @@ def get_energies( energies[(ids[node], core_name)] = en return energies + + +def get_k_splits(allocation): + k_splits: dict[int, list[Core]] = {} + for _, core, id in allocation: + k_splits[id] = k_splits.get(id, []) + [core] + return k_splits + + +def get_node_latencies(allocation, cost_lut, accelerator, k_splits, latency_attr): + node_latencies = {} + core_names = sorted(set([a for _, a, _ in allocation])) + core_ids = [int(core_name.split(" ")[-1]) for core_name in core_names] + for _, a, id in allocation: + node = next(n for n in cost_lut.get_nodes() if n.id == id[0]) + latencies, _ = get_latencies([node], core_ids, accelerator, cost_lut, latency_attr=latency_attr) + nb_k_splits = len(k_splits[id]) + lat = latencies[(node.id, a, nb_k_splits)] + node_latencies[id, a] = lat + return node_latencies + + +def get_layer_ids(allocation): + layer_ids: set[int] = set() + for _, _, id in allocation: + layer_ids.add(id[0]) + layer_ids = sorted(layer_ids) + return layer_ids + + +def get_timesteps(allocation) -> list[int]: + return [item[0] for item in allocation] + + +def get_resources(allocation) -> set[int]: + return set(item[1] for item in allocation) + + +def get_node_timesteps(allocation): + node_timesteps = {} + for t, a, id in allocation: + node_timesteps[id, a] = t + return node_timesteps + + +def get_timestep_latencies(allocation, node_latencies, timesteps): + timestep_latencies = {t: 0 for t in range(max(timesteps) + 1)} + for t, a, id in allocation: + timestep_latencies[t] = max(timestep_latencies.get(t, 0), node_latencies[id, a]) + return timestep_latencies + + +def get_node_start_timesteps(k_splits, node_timesteps, timestep_latencies): + starts = {} + for id, allocations in k_splits.items(): + for a in allocations: + start = get_start_time_of_node(id, a, node_timesteps, timestep_latencies) + starts[id, a] = start + return starts + + +def get_start_time_of_node(id, a, timesteps, timestep_latencies, t_start=0): + node_timestep = timesteps[id, a] + for t in range(node_timestep): + t_end = t_start + timestep_latencies[t] + t_start = t_end + return t_start + + +def calculate_total_latency(allocation, cost_lut, accelerator, iterations, latency_attr) -> tuple[int, str]: + k_splits = get_k_splits(allocation) + timesteps = get_timesteps(allocation) + node_latencies = get_node_latencies(allocation, cost_lut, accelerator, k_splits, latency_attr) + timestep_latencies = get_timestep_latencies(allocation, node_latencies, timesteps) + node_timesteps = get_node_timesteps(allocation) + starts = get_node_start_timesteps(k_splits, node_timesteps, timestep_latencies) + total_timestep_latency = sum(timestep_latencies.values()) + cores = sorted(set(k[1] for k in 
starts)) + overlap = compute_iterations_overlap(timestep_latencies, node_timesteps, starts, total_timestep_latency, cores) + total_lat = iterations * total_timestep_latency - (iterations - 1) * overlap + total_lat_str = f"total_lat = N * T - (N - 1) * overlap --> {total_lat} = {iterations} * {total_timestep_latency} - {iterations-1} * {overlap}" + return total_lat, total_lat_str + + +def compute_iterations_overlap(timestep_latencies, node_timesteps, starts, T, cores): + slacks = {} + for core in cores: + relevant_starts = [v for k, v in starts.items() if k[1] == core] + earliest_start = min(relevant_starts) + latest_start = max(relevant_starts) + latest_id_core = next((k for k, v in starts.items() if v == latest_start and k[1] == core)) + latest_timestep = node_timesteps[latest_id_core] + timestep_latency = timestep_latencies[latest_timestep] + latest_end = latest_start + timestep_latency + slack = T - latest_end + earliest_start + assert slack >= 0 + slacks[core] = slack + overlap = min(slacks.values()) + return overlap diff --git a/stream/opt/allocation/genetic_algorithm/fitness_evaluator.py b/stream/opt/allocation/genetic_algorithm/fitness_evaluator.py index f2d84dcf..d0b63c10 100644 --- a/stream/opt/allocation/genetic_algorithm/fitness_evaluator.py +++ b/stream/opt/allocation/genetic_algorithm/fitness_evaluator.py @@ -35,6 +35,7 @@ def __init__( layer_groups_flexible, operands_to_prefetch: list[LayerOperand], scheduling_order: list[tuple[int, int]], + latency_attr: str, ) -> None: super().__init__(workload, accelerator, cost_lut) @@ -44,6 +45,7 @@ def __init__( self.layer_groups_flexible = layer_groups_flexible self.operands_to_prefetch = operands_to_prefetch self.scheduling_order = scheduling_order + self.latency_attr = latency_attr def get_fitness(self, core_allocations: list[int], return_scme: bool = False): """Get the fitness of the given core_allocations @@ -88,7 +90,7 @@ def set_node_core_allocations(self, core_allocations: list[int]): assert equal_unique_node is not None, "Node not found in CostModelEvaluationLUT" cme = self.cost_lut.get_cme(equal_unique_node, core) onchip_energy = cme.energy_total # Initialize on-chip energy as total energy - latency = cme.latency_total1 + latency = getattr(cme, self.latency_attr) too_large_operands = get_too_large_operands(cme, self.accelerator, core_id=core_allocation) # If there is a too_large_operand, we separate the off-chip energy. 
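            # (e.g. when the weight operand spills off-chip, its access energy is subtracted from
            # onchip_energy and accumulated into offchip_energy in the loop below)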
offchip_energy = 0 diff --git a/stream/stages/allocation/constraint_optimization_allocation.py b/stream/stages/allocation/constraint_optimization_allocation.py index 3ae46e5b..096aba43 100644 --- a/stream/stages/allocation/constraint_optimization_allocation.py +++ b/stream/stages/allocation/constraint_optimization_allocation.py @@ -12,6 +12,7 @@ from stream.cost_model.cost_model import StreamCostModelEvaluation from stream.hardware.architecture.accelerator import Accelerator from stream.opt.allocation.constraint_optimization.allocation import ALLOCATION_T, get_optimal_allocations +from stream.opt.allocation.constraint_optimization.utils import calculate_total_latency from stream.stages.estimation.stream_cost_model_evaluation import StreamCostModelEvaluationStage from stream.stages.estimation.zigzag_core_mapping_estimation import ZigZagCoreMappingEstimationStage from stream.stages.generation.tiled_workload_generation import ( @@ -20,7 +21,7 @@ from stream.stages.set_fixed_allocation_performance import SetFixedAllocationPerformanceStage from stream.stages.stage import MainStage, Stage, StageCallable from stream.utils import CostModelEvaluationLUT -from stream.visualization.constraint_optimization import visualize_waco +from stream.visualization.constraint_optimization import to_perfetto_json, visualize_waco from stream.workload.computation.computation_node import ComputationNode from stream.workload.dnn_workload import DNNWorkloadStream from stream.workload.mapping import TILING_T @@ -192,13 +193,21 @@ def extract_steady_state_per_stack(self): logger.info(f"Percentage of steady state macs: {nb_steady_state_macs}/{nb_macs} = {percentage_macs:.2f}%") def find_best_allocation_per_stack(self): + total_ss_latency = 0 for stack, to_compute in self.ss_to_computes.items(): iterations = self.ss_iterations_per_stack[stack] t_start = time() optimal_allocation = self.find_best_allocation(to_compute, iterations, stack, self.co_time_limit) + ss_latency, _ = calculate_total_latency( + optimal_allocation, self.cost_lut, self.accelerator, iterations, self.latency_attr + ) t_end = time() - logger.info(f"Stack {stack}: {t_end - t_start:.3f} seconds") + logger.info( + f"Stack {stack}: Optimization took {t_end - t_start:.3f} seconds; Predicted steady-state latency: {ss_latency} cycles" + ) self.optimal_allocation_per_stack[stack] = optimal_allocation + total_ss_latency += ss_latency + logger.info(f"Total steady-state latency across stacks: {total_ss_latency} cycles") def find_best_allocation( self, to_compute: set[ComputationNode], iterations: int, stack: STACK_T = (0,), time_limit: int = 600 @@ -218,10 +227,14 @@ def find_best_allocation( self.cost_lut, iterations, time_limit=time_limit, + latency_attr=self.latency_attr, ) pickle_save(allocation, stack_allocations_path) fig_path = stack_allocations_path.replace(".pickle", ".html") - visualize_waco(allocation, self.cost_lut, self.accelerator, fig_path, iterations) + visualize_waco(allocation, self.cost_lut, self.accelerator, iterations, self.latency_attr, fig_path) + json_path = stack_allocations_path.replace(".pickle", ".json") + to_perfetto_json(allocation, self.cost_lut, self.accelerator, iterations, self.latency_attr, json_path) + return allocation def get_scheduling_order(self, unpartitioned_workload: DNNWorkloadStream) -> SCHEDULE_ORDER_T: diff --git a/stream/stages/allocation/genetic_algorithm_allocation.py b/stream/stages/allocation/genetic_algorithm_allocation.py index 1b55c776..09320bc0 100644 --- 
a/stream/stages/allocation/genetic_algorithm_allocation.py +++ b/stream/stages/allocation/genetic_algorithm_allocation.py @@ -39,6 +39,7 @@ def __init__( nb_ga_individuals: int, operands_to_prefetch: list[LayerOperand], scheduling_order: list[tuple[int, int]], + latency_attr: str, **kwargs: Any, ): """Initialize the InterCoreMappingStage. @@ -59,6 +60,7 @@ def __init__( self.nb_individuals = nb_ga_individuals self.operands_to_prefetch = operands_to_prefetch self.scheduling_order = scheduling_order + self.latency_attr = latency_attr # Determine the set of all (layer, group) combinations to be allocated separately self.layer_groups: list[tuple[int, int]] = sorted(set((n.id, n.group) for n in self.workload.node_list)) @@ -97,6 +99,7 @@ def __init__( self.layer_groups_flexible, self.operands_to_prefetch, self.scheduling_order, + self.latency_attr, ) # Extract the length of an individual. diff --git a/stream/stages/set_fixed_allocation_performance.py b/stream/stages/set_fixed_allocation_performance.py index 1454a916..ae1b0387 100644 --- a/stream/stages/set_fixed_allocation_performance.py +++ b/stream/stages/set_fixed_allocation_performance.py @@ -32,7 +32,7 @@ def __init__( self.accelerator = accelerator self.workload = workload self.cost_lut = cost_lut - self.latency_attr = kwargs.get("latency_attr", "latency_total2") + self.latency_attr = kwargs.get("latency_attr", "latency_total1") def run(self): logger.info("Start SetFixedAllocationPerformanceStage.") diff --git a/stream/visualization/constraint_optimization.py b/stream/visualization/constraint_optimization.py index 5571513a..4eafa38e 100644 --- a/stream/visualization/constraint_optimization.py +++ b/stream/visualization/constraint_optimization.py @@ -1,3 +1,4 @@ +import json import logging import os from itertools import cycle @@ -9,11 +10,22 @@ from stream.hardware.architecture.accelerator import Accelerator from stream.opt.allocation.constraint_optimization.allocation import ALLOCATION_T -from stream.opt.allocation.constraint_optimization.utils import get_latencies +from stream.opt.allocation.constraint_optimization.utils import ( + calculate_total_latency, + compute_iterations_overlap, + get_k_splits, + get_layer_ids, + get_node_latencies, + get_node_start_timesteps, + get_node_timesteps, + get_resources, + get_timestep_latencies, + get_timesteps, +) from stream.utils import CostModelEvaluationLUT if TYPE_CHECKING: - from stream.hardware.architecture.core import Core + pass logger = logging.getLogger(__name__) @@ -22,49 +34,23 @@ def visualize_waco( allocation: ALLOCATION_T, cost_lut: CostModelEvaluationLUT, accelerator: Accelerator, - fig_path: str, iterations: int, + latency_attr: str, + fig_path: str, ): """ Allocation is a list of tuples, with each tuple being of form (timestep, allocation, node_id). Allocation is a core. cost_lut is a CostModelEvaluationLUT storing for each node and each core the hardware performance. 
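    Example (illustrative, hypothetical values):

        allocation = [(0, "Core 0", (0, 0)), (0, "Core 1", (0, 1)), (1, "Core 0", (1, 0))]
        # timestep 0 runs two splits of layer 0 on Core 0 and Core 1, timestep 1 runs layer 1 on
        # Core 0; the (layer_id, sub_id) form of the node ids is assumed here for illustration.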
""" - # Extract the number of allocations (k splits) of all nodes - k_splits: dict[int, list[Core]] = {} - for _, core, id in allocation: - k_splits[id] = k_splits.get(id, []) + [core] - # Extract the latencies of all nodes - node_latencies = {} - layer_ids: set[int] = set() - ids: list[int] = [] - timesteps: list[int] = [] - resources: set[int] = set() - core_names = sorted(set([a for _, a, _ in allocation])) - core_ids = [int(core_name.split(" ")[-1]) for core_name in core_names] - for t, a, id in allocation: - timesteps.append(t) - layer_ids.add(id[0]) - ids.append(id) - resources.add(a) - node = next(n for n in cost_lut.get_nodes() if n.id == id[0]) - latencies, _ = get_latencies([node], core_ids, accelerator, cost_lut) - nb_k_splits = len(k_splits[id]) - lat = latencies[(node.id, a, nb_k_splits)] - node_latencies[id, a] = lat - layer_ids = sorted(layer_ids) - # Extract the timesteps of all nodes and the latency per timestep - node_timesteps = {} - timestep_latencies = {t: 0 for t in range(max(timesteps) + 1)} - for t, a, id in allocation: - node_timesteps[id, a] = t - timestep_latencies[t] = max(timestep_latencies.get(t, 0), node_latencies[id, a]) - # Extract start of each node - starts = {} - for id, allocations in k_splits.items(): - for a in allocations: - start = get_start_time_of_node(id, a, node_timesteps, timestep_latencies) - starts[id, a] = start - _, total_lat_str = calculate_total_latency(starts, timestep_latencies, node_timesteps, iterations) + k_splits = get_k_splits(allocation) + layer_ids = get_layer_ids(allocation) + timesteps = get_timesteps(allocation) + resources = get_resources(allocation) + node_timesteps = get_node_timesteps(allocation) + node_latencies = get_node_latencies(allocation, cost_lut, accelerator, k_splits, latency_attr) + timestep_latencies = get_timestep_latencies(allocation, node_latencies, timesteps) + starts = get_node_start_timesteps(k_splits, node_timesteps, timestep_latencies) + _, total_lat_str = calculate_total_latency(allocation, cost_lut, accelerator, iterations, latency_attr) # Plot the nodes using Plotly rectangles color_cycle = cycle(sample_colorscale("rainbow", np.linspace(0, 1, len(cost_lut.get_nodes())))) colors = {layer_id: c for (layer_id, c) in zip(layer_ids, color_cycle)} @@ -122,30 +108,67 @@ def visualize_waco( logger.info(f"Plotted WACO result to {fig_path}") -def get_start_time_of_node(id, a, timesteps, timestep_latencies, t_start=0): - node_timestep = timesteps[id, a] - for t in range(node_timestep): - t_end = t_start + timestep_latencies[t] - t_start = t_end - return t_start +def to_perfetto_json( + allocation: ALLOCATION_T, + cost_lut: CostModelEvaluationLUT, + accelerator: Accelerator, + iterations: int, + latency_attr: str, + json_path: str, +): + """ + Allocation is a list of tuples, with each tuple being of form (timestep, allocation, node_id). Allocation is a core. + cost_lut is a CostModelEvaluationLUT storing for each node and each core the hardware performance. 
+ """ + k_splits = get_k_splits(allocation) + timesteps = get_timesteps(allocation) + resources = get_resources(allocation) + node_latencies = get_node_latencies(allocation, cost_lut, accelerator, k_splits, latency_attr) + node_timesteps = get_node_timesteps(allocation) + timestep_latencies = get_timestep_latencies(allocation, node_latencies, timesteps) + starts = get_node_start_timesteps(k_splits, node_timesteps, timestep_latencies) + total_timestep_latency = sum(timestep_latencies.values()) + cores = sorted(set(k[1] for k in starts)) + overlap = compute_iterations_overlap(timestep_latencies, node_timesteps, starts, total_timestep_latency, cores) + offset = total_timestep_latency - overlap + + # Prepare JSON data for Perfetto + perfetto_data = [] + + # Add thread names (cores) + for core in resources: + thread_name_event = { + "name": "thread_name", + "ph": "M", + "pid": "waco", + "tid": core, + "args": {"name": f"Core {core}"}, + "cname": "blue", + } + perfetto_data.append(thread_name_event) + # Add events for each iteration + for iteration in range(iterations): + iteration_offset = iteration * offset + for id, allocations in k_splits.items(): + for a in allocations: + start = starts[id, a] + iteration_offset + runtime = node_latencies[id, a] + event = { + "name": f"Node {id}", + "cat": "compute", + "ph": "X", + "ts": start, + "dur": runtime, + "pid": "waco", + "tid": a, + "cname": "blue", + "args": {"Runtime": runtime, "NodeID": id, "Iteration": iteration}, + } + perfetto_data.append(event) -def calculate_total_latency(starts, timestep_latencies, node_timesteps, N) -> tuple[int, str]: - T = sum(timestep_latencies.values()) - cores = sorted(set(k[1] for k in starts)) - slacks = {} - for core in cores: - relevant_starts = [v for k, v in starts.items() if k[1] == core] - earliest_start = min(relevant_starts) - latest_start = max(relevant_starts) - latest_id_core = next((k for k, v in starts.items() if v == latest_start and k[1] == core)) - latest_timestep = node_timesteps[latest_id_core] - timestep_latency = timestep_latencies[latest_timestep] - latest_end = latest_start + timestep_latency - slack = T - latest_end + earliest_start - assert slack >= 0 - slacks[core] = slack - min_slack = min(slacks.values()) - total_lat = N * T - (N - 1) * min_slack - total_lat_str = f"total_lat = N * T - (N - 1) * slack --> {total_lat} = {N} * {T} - {N-1} * {min_slack}" - return total_lat, total_lat_str + # Write JSON data to file + with open(json_path, "w") as f: + json.dump(perfetto_data, f, indent=2) + + logger.info(f"Plotted WACO result to {json_path}") From dc86f671647370f7c5540ef520c8ac2758dfb775 Mon Sep 17 00:00:00 2001 From: Arne Symons Date: Thu, 19 Dec 2024 11:33:46 +0100 Subject: [PATCH 13/49] add tensor size to visualizations --- stream/visualization/schedule.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stream/visualization/schedule.py b/stream/visualization/schedule.py index b5a55fb4..167cf497 100644 --- a/stream/visualization/schedule.py +++ b/stream/visualization/schedule.py @@ -434,7 +434,7 @@ def get_communication_dicts(scme): Resource=resource, Layer=layer_id, Runtime=runtime, - Tensors=[tensor], + Tensors={tensor: tensor.size}, Type=task_type, Activity=activity, Energy=energy, @@ -526,7 +526,7 @@ def get_dataframe_from_scme( Runtime=runtime, SpatialUtilization=su_perfect_temporal, SpatialUtilizationWithTemporal=su_nonperfect_temporal, - Tensors=tensors, + Tensors={tensor: tensor.size for tensor in tensors}, Type=task_type, Activity=np.nan, Energy=energy, From 
03901a0f2bc6b79cb869f282a07e8e679a193321 Mon Sep 17 00:00:00 2001 From: Arne Symons Date: Thu, 19 Dec 2024 12:29:00 +0100 Subject: [PATCH 14/49] add caching of tiled_worload --- stream/api.py | 6 + .../constraint_optimization_allocation.py | 3 + .../generation/tiled_workload_generation.py | 252 ++++++++++-------- 3 files changed, 148 insertions(+), 113 deletions(-) diff --git a/stream/api.py b/stream/api.py index 7c3f37d8..30c2eadf 100644 --- a/stream/api.py +++ b/stream/api.py @@ -72,6 +72,7 @@ def optimize_allocation_ga( os.makedirs(f"{output_path}/{experiment_id}", exist_ok=True) # Output paths + tiled_workload_path = f"{output_path}/{experiment_id}/tiled_workload.pickle" cost_lut_path = f"{output_path}/{experiment_id}/cost_lut.pickle" scme_path = f"{output_path}/{experiment_id}/scme.pickle" @@ -103,6 +104,7 @@ def optimize_allocation_ga( nb_ga_individuals=nb_ga_individuals, # number of individuals in each ga generation mode=mode, layer_stacks=layer_stacks, + tiled_workload_path=tiled_workload_path, cost_lut_path=cost_lut_path, operands_to_prefetch=[], # required by GeneticAlgorithmAllocationStage ) @@ -130,8 +132,10 @@ def optimize_allocation_co( os.makedirs(f"{output_path}/{experiment_id}", exist_ok=True) # Output paths + tiled_workload_path = f"{output_path}/{experiment_id}/tiled_workload.pickle" cost_lut_path = f"{output_path}/{experiment_id}/cost_lut.pickle" allocations_path = f"{output_path}/{experiment_id}/waco/" + tiled_workload_post_co_path = f"{output_path}/{experiment_id}/tiled_workload_post_co.pickle" cost_lut_post_co_path = f"outputs/{experiment_id}/cost_lut_post_co.pickle" scme_path = f"{output_path}/{experiment_id}/scme.pickle" @@ -161,8 +165,10 @@ def optimize_allocation_co( loma_lpf_limit=6, # required by LomaEngine mode=mode, layer_stacks=layer_stacks, + tiled_workload_path=tiled_workload_path, cost_lut_path=cost_lut_path, allocations_path=allocations_path, + tiled_workload_post_co_path=tiled_workload_post_co_path, cost_lut_post_co_path=cost_lut_post_co_path, operands_to_prefetch=[], # required by ConstraintOptimizationAllocationStage ) diff --git a/stream/stages/allocation/constraint_optimization_allocation.py b/stream/stages/allocation/constraint_optimization_allocation.py index 096aba43..dbcd20d5 100644 --- a/stream/stages/allocation/constraint_optimization_allocation.py +++ b/stream/stages/allocation/constraint_optimization_allocation.py @@ -51,6 +51,7 @@ def __init__( cost_lut: CostModelEvaluationLUT, layer_stacks: list[tuple[int, ...]], allocations_path: str, + tiled_workload_post_co_path: str, cost_lut_post_co_path: str, **kwargs: Any, ): @@ -75,6 +76,7 @@ def __init__( self.allocations_path = allocations_path os.makedirs(self.allocations_path, exist_ok=True) + self.tiled_workload_post_co_path = tiled_workload_post_co_path self.cost_lut_post_co_path = cost_lut_post_co_path self.co_time_limit: int = kwargs.get("co_time_limit", self.CO_TIME_LIMIT) @@ -409,6 +411,7 @@ def schedule_allocation(self, allocation: ALLOCATION_T) -> StreamCostModelEvalua kwargs["accelerator"] = self.accelerator kwargs["workload"] = unpartitioned_sub_workload kwargs["scheduling_order"] = scheduling_order + kwargs["tiled_workload_path"] = self.tiled_workload_post_co_path kwargs["cost_lut_path"] = self.cost_lut_post_co_path kwargs["latency_attr"] = self.latency_attr diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index 822a2e08..b420b90c 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ 
b/stream/stages/generation/tiled_workload_generation.py @@ -1,11 +1,12 @@ import logging +import os from copy import deepcopy from math import ceil, prod from typing import Any from rtree import index from zigzag.datatypes import Constants, LayerDim, LayerOperand -from zigzag.utils import pickle_deepcopy +from zigzag.utils import pickle_deepcopy, pickle_load, pickle_save from stream.cost_model.group_allocation import GroupIdManager from stream.hardware.architecture.accelerator import Accelerator @@ -35,7 +36,7 @@ class TensorDimensionMismatchException(Exception): class TiledWorkloadGenerationStage(Stage): """ - Class that transforms the layer-by-layer workload into finer CN workload graph. + Class that transforms the layer-by-layer workload into tiled workload graph. """ def __init__( @@ -44,6 +45,7 @@ def __init__( *, workload: ONNXWorkload, accelerator: Accelerator, + tiled_workload_path: str, **kwargs: Any, ): """ @@ -54,69 +56,93 @@ def __init__( self.workload = workload self.accelerator = accelerator - # Save for each of the workload's nodes the finer nodes that will be generated - self.finer_nodes_dict: dict[ComputationNode, list[ComputationNode]] = {} + # Save for each of the workload's nodes the tiles that will be generated + self.tiles_dict: dict[ComputationNode, list[ComputationNode]] = {} # Memoize the numpy tensors for dependency generation self.numpy_tensors = {} + self.tiled_workload_path = tiled_workload_path + def run(self): - unique_finer_nodes: list[ComputationNode] = [] - # For each node get all the finer nodes and the edges between them - all_finer_nodes = [] - all_finer_edges = [] + all_unique_tiles: list[ComputationNode] = [] + # For each node get all the tiles and the edges between them + all_tiles = [] + all_edges = [] for node in self.workload.topological_sort(): - # If other node types shouldn't be included in finer node graph, add here + # If other node types shouldn't be included in tiled workload graph, add here if not isinstance(node, ComputationNode): continue outer_temporal_loops = self.get_outer_tmap_loop_dimensions(node) - finer_nodes, unique_nodes = self.get_finer_nodes(node, outer_temporal_loops) + tiles, unique_tiles = self.get_tiles(node, outer_temporal_loops) logger.info(f"{node}: Outer loops {outer_temporal_loops}.") - logger.info(f"{node}: Generated {len(finer_nodes)} finer nodes.") - self.finer_nodes_dict[node] = finer_nodes - unique_finer_nodes += unique_nodes - intra_edges = self.get_intra_edges(finer_nodes) - # Add the finer nodes and intra edges to the lists - all_finer_nodes += finer_nodes - all_finer_edges += intra_edges - - # Get all pairs of nodes that we have to extract inter edges for - all_pairs = self.get_all_node_pairs(self.workload) - for producer, consumer, is_complex in all_pairs: - finer_producers = self.finer_nodes_dict[producer] - finer_consumers = self.finer_nodes_dict[consumer] - if is_complex: - inter_edges = self.get_inter_edges_numpy(producer, consumer) - else: - inter_edges = self.get_inter_edges_rtree(producer, consumer, finer_producers, finer_consumers) - all_finer_edges += inter_edges - - # Set the base_priority value of all nodes - self.set_base_priority_of_nodes(all_finer_nodes, all_finer_edges) - - # Set the number of real predecessors of all nodes - self.set_nb_real_predecessors(all_finer_nodes, all_finer_edges) - - # Construct the new finer workload graph - # The graph construction needs to happen after the base priority and nb_real_predecessors are set - partitioned_workload = ComputationNodeWorkload() - 
partitioned_workload.add_edges_from(all_finer_edges) - - logger.info(f"Finer graph: {partitioned_workload}.") + logger.info(f"{node}: Generated {len(tiles)} tile(s).") + self.tiles_dict[node] = tiles + all_unique_tiles += unique_tiles + intra_edges = self.get_intra_edges(tiles) + # Add the tiles and intra edges to the lists + all_tiles += tiles + all_edges += intra_edges + + # Load in cached tiles and reuse cached tiled_workload if they match + cached_workload = self.load_cached_tiled_workload() + if cached_workload and self.match(all_tiles, cached_workload): + tiled_workload = cached_workload + logger.info("Tiled workload loaded from cache.") + else: + # Get all pairs of nodes that we have to extract inter edges for + all_pairs = self.get_all_node_pairs(self.workload) + for producer, consumer, is_complex in all_pairs: + producer_tiles = self.tiles_dict[producer] + consumer_tiles = self.tiles_dict[consumer] + if is_complex: + inter_edges = self.get_inter_edges_numpy(producer, consumer) + else: + inter_edges = self.get_inter_edges_rtree(producer, consumer, producer_tiles, consumer_tiles) + all_edges += inter_edges + + # Set the base_priority value of all nodes + self.set_base_priority_of_nodes(all_tiles, all_edges) + + # Set the number of real predecessors of all nodes + self.set_nb_real_predecessors(all_tiles, all_edges) + + # Construct the new tiled workload graph + # The graph construction needs to happen after the base priority and nb_real_predecessors are set + tiled_workload = ComputationNodeWorkload() + tiled_workload.add_edges_from(all_edges) + + # Save the tiled workload + pickle_save(tiled_workload, self.tiled_workload_path) + logger.info(f"Saved tiled workload to {self.tiled_workload_path}.") + + logger.info(f"Finer graph: {tiled_workload}.") kwargs = self.kwargs.copy() kwargs["original_workload"] = pickle_deepcopy(self.workload) - kwargs["workload"] = partitioned_workload + kwargs["workload"] = tiled_workload kwargs["accelerator"] = self.accelerator if "scheduling_order" not in kwargs: - kwargs["scheduling_order"] = self.get_scheduling_order(partitioned_workload) + kwargs["scheduling_order"] = self.get_scheduling_order(tiled_workload) sub_stage = self.list_of_callables[0](self.list_of_callables[1:], **kwargs) for cme, extra_info in sub_stage.run(): yield cme, extra_info yield None, None + def match(self, tiles: list[ComputationNode], tiled_workload: ComputationNodeWorkload) -> bool: + """Check if the tiles match the cached tiled workload. 
+ Can't use 'has_same_performance' because nb_real_predecessors is not set yet for tiles.""" + for tile in tiles: + if not [ + t + for t in tiled_workload.node_list + if t.id == tile.id and t.sub_id == tile.sub_id and t.layer_dim_sizes == tile.layer_dim_sizes + ]: + return False + return True + @staticmethod def get_scheduling_order(workload: ComputationNodeWorkload): return sorted(((n.id, n.sub_id) for n in workload.node_list), reverse=True) @@ -198,29 +224,29 @@ def get_non_type_predecessors(self, node: Node, types: list[type]) -> list[Node] return preds @staticmethod - def get_finer_nodes( + def get_tiles( original_node: ComputationNode, outer_temporal_loops: list[TemporalLoop] ) -> tuple[list[ComputationNode], list[ComputationNode]]: original_node_id = original_node.id - # Take away the outer_temporal_loops to create finer CNs for this node - finer_node_attrs = original_node.extract_node_attr() - finer_node_mapping = original_node.extract_inter_core_mapping_attr() + # Take away the outer_temporal_loops to create tiled CNs for this node + tile_attrs = original_node.extract_node_attr() + tile_mapping = original_node.extract_inter_core_mapping_attr() for outer_tl in outer_temporal_loops: outer_dim = outer_tl.dimension outer_size = outer_tl.size # Check if this node's "dim" size is divisible by the outer-cn loop size - node_dim_size = finer_node_attrs.layer_dim_sizes[outer_dim] + node_dim_size = tile_attrs.layer_dim_sizes[outer_dim] q, rem = divmod(node_dim_size, outer_size) # returns x//y, x%y assert rem == 0, ( f"Node {original_node} dim {outer_dim} of size {node_dim_size} is not divisible by outer-cn temporal " f"loop {outer_tl}" ) - finer_node_attrs.layer_dim_sizes[outer_dim] = q + tile_attrs.layer_dim_sizes[outer_dim] = q - # Loop dimension + size of the finer nodes (called span here) - finer_span = finer_node_attrs.layer_dim_sizes + # Loop dimension + size of the tiles (called span here) + tile_span = tile_attrs.layer_dim_sizes loop_dims = original_node.layer_dims stop_values = [temporal_loop.size for temporal_loop in outer_temporal_loops] nb_cns = int(prod(stop_values)) @@ -235,7 +261,7 @@ def get_finer_nodes( for i, outer_loop in enumerate(outer_temporal_loops): loop_dim = outer_loop.dimension stop_value = outer_loop.size - inner_span = finer_span[loop_dim] if loop_dim in finer_span else 1 + inner_span = tile_span[loop_dim] if loop_dim in tile_span else 1 lower_outer_cn_loops = outer_temporal_loops[:i] # Returns 1 if empty list outer_span = prod( @@ -243,7 +269,7 @@ def get_finer_nodes( ) mult_factors.append(int(inner_span * outer_span)) - finer_nodes: list[ComputationNode] = [] + tiles: list[ComputationNode] = [] tensors: list[Tensor] = [] group_id_manager = GroupIdManager(original_node) for n in range(nb_cns): @@ -268,10 +294,9 @@ def get_finer_nodes( mult_factor = mult_factors[i] dim_min += loop_val * mult_factor # max value is exclusive - dim_max = dim_min + (finer_span[loop_dim] if loop_dim in finer_span else 1) + dim_max = dim_min + (tile_span[loop_dim] if loop_dim in tile_span else 1) dim_min_max[loop_dim] = (dim_min, dim_max) - # finer_node_mapping_copy = deepcopy(original_node.extract_mapping_attr()) group_id = group_id_manager.get_group_id(dim_min_max) # Create the computation node object with the computed ranges of the loop dimensions @@ -285,43 +310,43 @@ def get_finer_nodes( [dim_min_max[dim][1] >= original_node.layer_dim_sizes[dim] for dim in original_node_output_ir_dims] ) - finer_node = ComputationNode( + tile = ComputationNode( node_id=original_node_id, sub_id=n, 
node_name=node_name, - node_attr=finer_node_attrs, - mapping_attr=finer_node_mapping, + node_attr=tile_attrs, + mapping_attr=tile_mapping, op_type=original_node.type, produces_final_output=produces_final_output, group_id=group_id, ) # Override loop_ranges property - finer_node.update_loop_ranges(dim_min_max) + tile.update_loop_ranges(dim_min_max) # Re-calculate pr loop ranges based on new loop_ranges - finer_node.calculate_pr_loop_ranges() + tile.calculate_pr_loop_ranges() # Re-set the operand tensors for the new loop_ranges - finer_node.set_operand_tensors() + tile.set_operand_tensors() - # Initialize the priorities (total inter-CN data reuse factor) for the constant operands of this finer_node - for constant_operand in finer_node.constant_operands: - tensor = finer_node.operand_tensors[constant_operand] + # Initialize the priorities (total inter-CN data reuse factor) for the constant operands of this tile + for constant_operand in tile.constant_operands: + tensor = tile.operand_tensors[constant_operand] tensor.set_base_priorities(tensor_reuse_factors[constant_operand][n]) - # Replace any of the tensors with identical tensors of previous finer nodes - for op, tensor in finer_node.operand_tensors.items(): + # Replace any of the tensors with identical tensors of previous tiles + for op, tensor in tile.operand_tensors.items(): replaced = False for previous_tensor in tensors: if tensor.equality_hash() == previous_tensor.equality_hash(): - finer_node.operand_tensors[op] = previous_tensor + tile.operand_tensors[op] = previous_tensor replaced = True if not replaced: tensors.append(tensor) - # Compute the output data produced by each finer node, assuming that all the data produced by different CNs + # Compute the output data produced by each tile, assuming that all the data produced by different CNs # are unique - finer_node.data_produced_unique = int( - finer_node.operand_size_elem[Constants.OUTPUT_LAYER_OP] - * finer_node.operand_precision[Constants.FINAL_OUTPUT_LAYER_OP] + tile.data_produced_unique = int( + tile.operand_size_elem[Constants.OUTPUT_LAYER_OP] + * tile.operand_precision[Constants.FINAL_OUTPUT_LAYER_OP] ) # If the core allocation is fixed, we need to set the chosen core allocation. @@ -332,14 +357,14 @@ def get_finer_nodes( original_node.possible_core_allocation ), f"Group id {group_id} too large for core allocation list {original_node.core_allocation}" chosen_core_allocation = original_node.possible_core_allocation[group_id] - finer_node.set_chosen_core_allocation(chosen_core_allocation) + tile.set_chosen_core_allocation(chosen_core_allocation) - finer_nodes.append(finer_node) + tiles.append(tile) # NOTE We take the first node as only unique one as they are all generated equally now. - unique_finer_nodes = [finer_nodes[0]] + unique_tiles = [tiles[0]] - return finer_nodes, unique_finer_nodes + return tiles, unique_tiles @staticmethod def get_intra_edges(nodes: list[ComputationNode]): @@ -481,18 +506,16 @@ def get_inter_edges_rtree( self, producer: ComputationNode, consumer: ComputationNode, - finer_producers: list[ComputationNode], - finer_consumers: list[ComputationNode], + producer_tiles: list[ComputationNode], + consumer_tiles: list[ComputationNode], ): - """Function that finds the edges between a producer and consumer node, - more specifically their finer counterparts producer_finer and consumer_finer. - A communication node is inserted between each producer and consumer node. + """Function that finds the edges between producer and consumer tiles. 
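The detection itself leans on the rtree package imported at the top of this file: consumer tiles are inserted as bounding boxes built from their loop ranges, and each producer tile's output window is queried against that index. A minimal, stand-alone sketch with invented 2D boxes:

```python
from rtree import index

# Invented 2D boxes (xmin, ymin, xmax, ymax) standing in for inclusive tile loop ranges.
consumer_tree = index.Index()
consumer_tree.insert(0, (0, 0, 7, 15))    # consumer tile 0
consumer_tree.insert(1, (8, 0, 15, 15))   # consumer tile 1

producer_box = (4, 0, 11, 15)             # one producer tile's output window
print(sorted(consumer_tree.intersection(producer_box)))  # [0, 1] -> edges to both consumer tiles
```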
Args: producer: the producer node consumer: the consumer node - finer_producers: list of finer producer nodes - finer_consumers: list of finer consumer nodes + producer_tiles: list of the producer tiles + consumer_tiles: list of the consumer tiles """ # Check all the different input operands of the consumer node that stem from the producer node # The direct predecessor of an input operand might be a DummyNode so we need to propagate back @@ -511,8 +534,8 @@ def get_inter_edges_rtree( edges: list[tuple[ComputationNode, ComputationNode, dict[str, Any]]] = [] for input_operand in dependent_input_operands: - # Build the tree of all finer consumer nodes for this operand - consumer_tree = self.build_rtree(producer, consumer, finer_consumers, input_operand) + # Build the tree of all consumer tiles for this operand + consumer_tree = self.build_rtree(producer, consumer, consumer_tiles, input_operand) # As long as we haven't iterated through all of the output's operand's irrelevant dimensions, # we shouldn't add an edge to the consumer layer's nodes, as this would create unnecessary graph complexity @@ -525,33 +548,33 @@ def get_inter_edges_rtree( # Iterate through all the producer nodes and get the consumer nodes that require its outputs, # taking into account that we only want an edge if the producer's irrelevant loops are at a max - for finer_producer in finer_producers: + for producer_tile in producer_tiles: # Get the output irrelevant loop ranges and check if they are at least at the max ir_dims_not_at_max = [ - finer_producer.loop_ranges[ir_dim][1] < producer.loop_ranges[ir_dim][1] + producer_tile.loop_ranges[ir_dim][1] < producer.loop_ranges[ir_dim][1] for ir_dim in producer_ir_dims_output ] if any(ir_dims_not_at_max): - continue # to the next finer producer + continue # to the next producer tile - p_inclusive_ranges = self.convert_to_inclusive_data_range(finer_producer.loop_ranges) + p_inclusive_ranges = self.convert_to_inclusive_data_range(producer_tile.loop_ranges) p_bounding_box = self.get_bounding_box_dimensions( producer, consumer, producer_r_dims_output, p_inclusive_ranges ) - # Get the finer consumer node ids that intersect with this finer producer node + # Get the consumer tile ids that intersect with this producer tile intersecting_consumer_node_ids = consumer_tree.intersection(p_bounding_box) for intersecting_consumer_node_id in intersecting_consumer_node_ids: - intersecting_consumer = finer_consumers[intersecting_consumer_node_id] + intersecting_consumer = consumer_tiles[intersecting_consumer_node_id] # Create a new communication node that will reside between the producer and consumer node edges += [ ( - finer_producer, + producer_tile, intersecting_consumer, { "operand": input_operand, - "bits": finer_producer.data_produced_unique, + "bits": producer_tile.data_produced_unique, }, ) ] @@ -572,8 +595,8 @@ def get_tensor_cn_for_op(node: ComputationNode, dependent_operand: LayerOperand) if node in numpy_tensors: tensor_cns = numpy_tensors[node] else: - finer_nodes = self.finer_nodes_dict[node] - tensor_cns = self.get_tensor_cns(node, finer_nodes) + tiles = self.tiles_dict[node] + tensor_cns = self.get_tensor_cns(node, tiles) # Store result for later use numpy_tensors[node] = tensor_cns tensor = tensor_cns[dependent_operand] @@ -676,7 +699,7 @@ def _get_shape_inferred_propagated_tensor(tensor: NodeTensor, final_tensor: Node @staticmethod def get_inter_edges_tensor_based(producer_output_tensor: NodeTensor, consumer_input_tensor: NodeTensor): """This method obtains the edges between a 
producer and consumer. - This is done by iterating through all finer consumer nodes, + This is done by iterating through all consumer tiles, for each consumer node we create a window and get all the producer nodes that produced this data window. Args: @@ -699,9 +722,7 @@ def get_inter_edges_tensor_based(producer_output_tensor: NodeTensor, consumer_in inter_edges.add((producer, consumer)) return inter_edges - def get_tensor_cns( - self, node: ComputationNode, finer_nodes: list[ComputationNode] - ) -> dict[LayerOperand, NodeTensor]: + def get_tensor_cns(self, node: ComputationNode, tiles: list[ComputationNode]) -> dict[LayerOperand, NodeTensor]: is_source_node = len(self.get_non_type_predecessors(node, [DummyNode])) == 0 variable_operands = [op for op in node.input_operands if op not in node.constant_operands] + [ node.output_operand @@ -715,52 +736,52 @@ def get_tensor_cns( op: NodeTensor.initialize_empty(shape) for (op, shape) in tensor_shapes.items() } - # For each input operand iterate through the finer_nodes in reverse order + # For each input operand iterate through the tiles in reverse order # because we want the first cn with a dependency saved in the tensor - # For the output operand iterate through the finer_nodes in regular order + # For the output operand iterate through the tiles in regular order # because we want the last CN that handles an output tensor window to be saved for op, dims in tensor_dims.items(): if op == node.output_operand: ir_dims_output = node.loop_relevancy_info.get_ir_layer_dims(Constants.OUTPUT_LAYER_OP) - finer_nodes_list = finer_nodes # list in regular order + tile_list = tiles # list in regular order should_add_to_tensor_list = [ - all(finer_node.loop_ranges[ir_dim][1] >= node.loop_ranges[ir_dim][1] for ir_dim in ir_dims_output) - for finer_node in finer_nodes_list + all(tile.loop_ranges[ir_dim][1] >= node.loop_ranges[ir_dim][1] for ir_dim in ir_dims_output) + for tile in tile_list ] attr_to_add_to = "data_produced_unique" precision = node.operand_precision[Constants.FINAL_OUTPUT_LAYER_OP] else: - finer_nodes_list = list(reversed(finer_nodes)) # list in reversed order - should_add_to_tensor_list = [True for _ in finer_nodes_list] + tile_list = list(reversed(tiles)) # list in reversed order + should_add_to_tensor_list = [True for _ in tile_list] attr_to_add_to = "data_consumed_unique" # if this layer is the first layer, we assume the inputs are streamed and "free" precision = node.operand_precision[op] * (not is_source_node) nb_unique_data_seen = 0 - for finer_node, should_add_to_tensor in zip(finer_nodes_list, should_add_to_tensor_list): + for tile, should_add_to_tensor in zip(tile_list, should_add_to_tensor_list): if not should_add_to_tensor: continue # Skip if we're not at the max ir loop value for output - op_dim_ranges = [finer_node.loop_ranges[loop_dim] for loop_dim in dims] + op_dim_ranges = [tile.loop_ranges[loop_dim] for loop_dim in dims] op_dim_ranges_max_stop = tuple(tensor_shapes[op]) # start can be negative for padding which, makes np flip window = tuple([slice(max(0, start), stop) for (start, stop) in op_dim_ranges]) # Count how many nans we have in this window, as this is the amount of unique data consumed/produced by - # this finer_node + # this tile nb_unique_data_bits = tensors_cns[op].get_nb_empty_elements(window) * precision nb_unique_data_seen += nb_unique_data_bits # Add this amount of unique data to the "data_consumed_unique" or "data_produced_unique" depending on # input/output operand setattr( - finer_node, + tile, attr_to_add_to, 
- getattr(finer_node, attr_to_add_to) + nb_unique_data_bits, + getattr(tile, attr_to_add_to) + nb_unique_data_bits, ) - # Set this window of the tensor to indicate it will be consumed/produced by this finer node + # Set this window of the tensor to indicate it will be consumed/produced by this tile bounded_op_dim_ranges = tuple( slice(max(0, start), min(max_stop, stop)) for ((start, stop), max_stop) in zip(op_dim_ranges, op_dim_ranges_max_stop) ) - tensors_cns[op] = tensors_cns[op].extend_with_node(bounded_op_dim_ranges, finer_node) + tensors_cns[op] = tensors_cns[op].extend_with_node(bounded_op_dim_ranges, tile) if nb_unique_data_seen != (prod(tensor_shapes[op]) * precision): logger.warning(f"Downsampling node detected: {node}, operand= {op}.") @@ -779,7 +800,7 @@ def get_tensor_cns( @staticmethod def set_base_priority_of_nodes(nodes: list[ComputationNode], edges: list[EDGE_T]): - """Set the base_priority of all stored tensors of variable operands in every node in finer_nodes + """Set the base_priority of all stored tensors of the variable operands of all nodes based on the amount of real (excluding same layer edges) edges. Args: @@ -848,6 +869,11 @@ def get_layer_split_factors_k(self): split_factors[node] = split_factor return split_factors + def load_cached_tiled_workload(self): + if os.path.exists(self.tiled_workload_path): + return pickle_load(self.tiled_workload_path) + return None + def deduce_tensor_reuse_factors( original_node: ComputationNode, outer_temporal_loops: list[TemporalLoop] From b17adb7f67f5cfc93be4e24e8247f548528d6d60 Mon Sep 17 00:00:00 2001 From: Arne Symons Date: Thu, 19 Dec 2024 15:39:13 +0100 Subject: [PATCH 15/49] only block constant operands if their precision is greater than 0 --- stream/stages/estimation/zigzag_core_mapping_estimation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stream/stages/estimation/zigzag_core_mapping_estimation.py b/stream/stages/estimation/zigzag_core_mapping_estimation.py index 0e0c7d91..146cc633 100644 --- a/stream/stages/estimation/zigzag_core_mapping_estimation.py +++ b/stream/stages/estimation/zigzag_core_mapping_estimation.py @@ -106,7 +106,7 @@ def run(self): # ! --- ensure all constant weights are accessed via blocking behavior i.s.o. transfer for layer_op in node.constant_operands: mem_op = node.memory_operand_links.layer_to_mem_op(layer_op) - if mem_op not in too_large_operands_for_cme: + if mem_op not in too_large_operands_for_cme and node.operand_precision[layer_op] > 0: too_large_operands_for_cme.append(mem_op) # ! 
--- node_duplicate.set_chosen_core_allocation(core_id) From e2a77d9bdea93d626e9a287a6f1d0463f5646e9b Mon Sep 17 00:00:00 2001 From: Arne Symons Date: Thu, 19 Dec 2024 15:40:12 +0100 Subject: [PATCH 16/49] add sender and receiver to communication link events --- stream/cost_model/communication_manager.py | 32 ++++++++++++++++--- .../architecture/noc/communication_link.py | 9 +++++- stream/visualization/perfetto.py | 2 ++ stream/visualization/schedule.py | 4 +++ 4 files changed, 41 insertions(+), 6 deletions(-) diff --git a/stream/cost_model/communication_manager.py b/stream/cost_model/communication_manager.py index 94793be5..062746c4 100644 --- a/stream/cost_model/communication_manager.py +++ b/stream/cost_model/communication_manager.py @@ -16,7 +16,7 @@ class CommunicationEvent: """Represents a communication event involving one or more CommunicationLinks.""" - def __init__(self, id: int, tasks: list["CommunicationLinkEvent"]) -> None: + def __init__(self, id: int, tasks: list["CommunicationLinkEvent"], sender: Core, receiver: Core) -> None: # Sanity checks assert len(tasks) > 0 assert all([t.type == tasks[0].type] for t in tasks) @@ -28,9 +28,11 @@ def __init__(self, id: int, tasks: list["CommunicationLinkEvent"]) -> None: self.start = tasks[0].start self.end = tasks[0].end self.energy = sum([t.energy for t in tasks]) + self.sender = sender + self.receiver = receiver def __str__(self) -> str: - return f"CommunicationEvent(id={self.id})" + return f"CommunicationEvent(id={self.id}, sender={self.sender}, receiver={self.receiver})" def __repr__(self) -> str: return str(self) @@ -49,7 +51,17 @@ class CommunicationLinkEvent: * the bits per clock cycle used of the link bandwidth """ - def __init__(self, type: str, start: int, end: int, tensor: Tensor, energy: float, activity: float) -> None: + def __init__( + self, + type: str, + start: int, + end: int, + tensor: Tensor, + energy: float, + activity: float, + sender: Core, + receiver: Core, + ) -> None: self.type = type self.start = start self.end = end @@ -57,11 +69,13 @@ def __init__(self, type: str, start: int, end: int, tensor: Tensor, energy: floa self.tensor = tensor self.energy = energy self.activity = activity + self.sender = sender + self.receiver = receiver def __str__(self) -> str: return ( f"CommunicationLinkEvent(type={self.type}, start={self.start}, end={self.end}, tensor={self.tensor}, " - f"energy={self.energy:.2e}, activity={self.activity:.2f})" + f"energy={self.energy:.2e}, activity={self.activity:.2f}, sender={self.sender}, receiver={self.receiver})" ) def __repr__(self) -> str: @@ -159,12 +173,16 @@ def update_links( tensor=tensor, energy=duration * link.unit_energy_cost, activity=link.bandwidth, + sender=sender, + receiver=receiver, ) for link in links ] event = CommunicationEvent( id=self.event_id, tasks=cles, + sender=sender, + receiver=receiver, ) self.events.append(event) self.event_id += 1 @@ -244,7 +262,11 @@ def get_inst_bw(op: MemoryOperand) -> int: for link, tensors in tensors_per_link.items(): operands = [tensor.memory_operand for tensor in tensors] bandwidths = [get_inst_bw(op) for op in operands] - link.block(block_start, duration, tensors, bandwidth_per_tensor=bandwidths) + senders = [core if operand == Constants.OUTPUT_MEM_OP else offchip_core for operand in operands] + receivers = [offchip_core if operand == Constants.OUTPUT_MEM_OP else core for operand in operands] + link.block( + block_start, duration, tensors, bandwidth_per_tensor=bandwidths, senders=senders, receivers=receivers + ) return block_start diff 
--git a/stream/hardware/architecture/noc/communication_link.py b/stream/hardware/architecture/noc/communication_link.py index c89d91de..3857808f 100644 --- a/stream/hardware/architecture/noc/communication_link.py +++ b/stream/hardware/architecture/noc/communication_link.py @@ -108,6 +108,8 @@ def block( duration: int, tensors: list["Tensor"], bandwidth_per_tensor: list[int], + senders: list["Core"], + receivers: list["Core"], ): """Block this communication link from start timestep for a given duration. @@ -116,11 +118,14 @@ def block( duration: The duration of the blocking. tensors: A list of tensors for which we are blocking the link. activity: The percentage of the link bandwidth used + bandwidth_per_tensor: The bandwidth used by each tensor in the list. + senders: The cores sending the tensors. + receivers: The cores receiving the tensors. """ assert len(tensors) == len(bandwidth_per_tensor) end = start + duration # Create a CLEvent per tensor - for tensor, bandwidth in zip(tensors, bandwidth_per_tensor): + for tensor, bandwidth, sender, receiver in zip(tensors, bandwidth_per_tensor, senders, receivers): event = CommunicationLinkEvent( type="block", start=start, @@ -128,6 +133,8 @@ def block( tensor=tensor, energy=tensor.origin.get_offchip_energy(), activity=bandwidth, + sender=sender, + receiver=receiver, ) self.update_activity(event) return diff --git a/stream/visualization/perfetto.py b/stream/visualization/perfetto.py index 751de35b..d5a94bbb 100644 --- a/stream/visualization/perfetto.py +++ b/stream/visualization/perfetto.py @@ -37,6 +37,8 @@ def parse_non_base_attrs(row: pd.Series, base_attrs: list[str]) -> dict: else: new_v[str(k2)] = str(v2) v = new_v + else: + v = str(v) args[k] = v return args diff --git a/stream/visualization/schedule.py b/stream/visualization/schedule.py index 167cf497..a7d930e3 100644 --- a/stream/visualization/schedule.py +++ b/stream/visualization/schedule.py @@ -423,6 +423,8 @@ def get_communication_dicts(scme): node = tensor.origin layer_id = node.id activity = event.activity + sender = event.sender + receiver = event.receiver if runtime == 0: continue d = dict( @@ -439,6 +441,8 @@ def get_communication_dicts(scme): Activity=activity, Energy=energy, LinkBandwidth=cl.bandwidth, + Sender=sender, + Receiver=receiver, ) dicts.append(d) return dicts From 6e99484ed78520f3edd9ded8c301b37c2a940080 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Thu, 19 Dec 2024 16:32:33 +0100 Subject: [PATCH 17/49] allow transfers at fraction of links bandwidth --- stream/cost_model/communication_manager.py | 8 +-- stream/hardware/architecture/accelerator.py | 55 ++++++++++++++++----- 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/stream/cost_model/communication_manager.py b/stream/cost_model/communication_manager.py index 94793be5..7eef18db 100644 --- a/stream/cost_model/communication_manager.py +++ b/stream/cost_model/communication_manager.py @@ -1,4 +1,5 @@ import itertools +from math import ceil from typing import TYPE_CHECKING, Any from zigzag.datatypes import Constants, MemoryOperand @@ -105,7 +106,7 @@ def get_links_for_all_core_pairs(self): ] return communication_links - def get_links_for_pair(self, sender: Core, receiver: Core): + def get_links_for_pair(self, sender: Core, receiver: Core) -> list["CommunicationLink"]: """Return the list of traversed CommunicationLinks for sending data from sender core to receiver core. 
Args: @@ -126,6 +127,7 @@ def update_links( receiver_memory_operand: MemoryOperand, start_timestep: int, duration: int, + link_bw_fraction: float = 1.0, ) -> tuple[float, float]: """Update the links for transfer of a tensor between sender and receiver core at a given timestep. A CommunicationEvent is created containing one or more CommunicationLinkEvents, @@ -157,8 +159,8 @@ def update_links( start=start_timestep, end=end_timestep, tensor=tensor, - energy=duration * link.unit_energy_cost, - activity=link.bandwidth, + energy=duration * link.unit_energy_cost * link_bw_fraction, + activity=ceil(link_bw_fraction * link.bandwidth), ) for link in links ] diff --git a/stream/hardware/architecture/accelerator.py b/stream/hardware/architecture/accelerator.py index 970c2b8d..f7ba93a1 100644 --- a/stream/hardware/architecture/accelerator.py +++ b/stream/hardware/architecture/accelerator.py @@ -9,6 +9,7 @@ from stream.cost_model.communication_manager import CommunicationManager from stream.cost_model.memory_manager import MemoryManager from stream.hardware.architecture.core import Core +from stream.hardware.architecture.noc.communication_link import CommunicationLink from stream.workload.computation.computation_node import ComputationNode from stream.workload.tensor import Tensor @@ -311,21 +312,52 @@ def transfer_tensor_to_core( ) ################################# STEP 4 ################################# # The links between sender and receiver have a long enough idle window. - sender_cores = self.memory_manager.cores_per_top_instance[storing_instance] + + def find_transfer_start_and_end_time(links_bw: dict[CommunicationLink, int]): + """ + Given the links to transfer across and corresponding available bandwidths, return the earliest transfer + start and end time. + + Args: + links_bw: link and corresponding transfer bandwidth + """ + slowest_bw = min(links_bw.values()) + transfer_duration = ceil(tensor.size / slowest_bw) + transfer_start = self.communication_manager.get_links_idle_window( + links=links_bw, + best_case_start=evictions_complete_timestep, + duration=transfer_duration, + tensors_per_link={link: [tensor] for link in links_bw}, + ) + transfer_end = transfer_start + transfer_duration + return transfer_start, transfer_end + + def find_earliest_time_for_transfer(links: list[CommunicationLink], nb_iterations: int = 10): + """Find the earliest time at which a tensor transfer between 2 cores can happen. Iterate over the used + bandwidth to find the transfer bandwidth at which the finish time is earliest""" + windows: list[tuple[int, int]] = [] + bandwidth_fractions = [i / nb_iterations for i in range(1, nb_iterations + 1)] + + for frac in bandwidth_fractions: + links_with_bw = {link: ceil(frac * link.bandwidth) for link in links} + start, end = find_transfer_start_and_end_time(links_with_bw) + windows.append((start, end)) + + ends = [end for _, end in windows] + best_idx = ends.index(min(ends)) + best_window = windows[best_idx] + best_fraction = bandwidth_fractions[best_idx] + return best_window, best_fraction + # TODO If the storing_instance is a shared instance across more than one core, # TODO there will be multiple possible cores to transfer between. 
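find_earliest_time_for_transfer above trades transfer bandwidth against start time: claiming a smaller fraction of the link stretches the transfer, but may fit into an earlier idle window. A condensed sketch of that trade-off, with a made-up idle-window lookup standing in for the real get_links_idle_window call:

```python
from math import ceil

def earliest_finish(tensor_size, link_bandwidth, idle_start_for, fractions):
    # Made-up stand-in: idle_start_for(frac) returns the earliest idle start
    # time when claiming that fraction of the link bandwidth.
    best = None
    for frac in fractions:
        duration = ceil(tensor_size / (frac * link_bandwidth))
        start = idle_start_for(frac)
        end = start + duration
        if best is None or end < best[1]:
            best = (start, end, frac)
    return best

# Toy numbers: full bandwidth only frees up at t=500, half bandwidth is free at t=0.
print(earliest_finish(1024, 32, lambda f: 0 if f <= 0.5 else 500, [0.5, 1.0]))
# (0, 64, 0.5): the half-bandwidth transfer finishes before the full-bandwidth one even starts.
```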
# TODO For now, we take the first one + sender_cores = self.memory_manager.cores_per_top_instance[storing_instance] sender_core = sender_cores[0] links = self.communication_manager.get_links_for_pair(sender_core, receiving_core) - links = {link: link.bandwidth for link in links} - transfer_duration = max([ceil(tensor.size / link.bandwidth) for link in links]) - transfer_start = self.communication_manager.get_links_idle_window( - links, - evictions_complete_timestep, - transfer_duration, - {link: [tensor] for link in links}, - ) - transfer_end = transfer_start + transfer_duration + (transfer_start, transfer_end), link_bw_fraction = find_earliest_time_for_transfer(links) + transfer_duration = transfer_end - transfer_start + ################################# STEP 5 ################################# # Spawn the tensor on the receiving core self.spawn(tensor, receiving_core, tensor_operand, transfer_start, transfer_end) @@ -341,9 +373,10 @@ def transfer_tensor_to_core( tensor_operand, transfer_start, transfer_duration, + link_bw_fraction=link_bw_fraction, ) ################################# STEP 7 ################################# - # Remove the transfered tensor from the sender core (excluding DRAM) + # Remove the transferred tensor from the sender core (excluding DRAM) # if it is no longer needed. if sender_core.id == self.offchip_core_id: pass From a8baaebbf8101a84590e65b2108810ca689b764a Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Thu, 19 Dec 2024 18:08:38 +0100 Subject: [PATCH 18/49] fix bug in block offchip links: output links was overwritten --- stream/cost_model/communication_manager.py | 8 +++++--- stream/hardware/architecture/accelerator.py | 3 +++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/stream/cost_model/communication_manager.py b/stream/cost_model/communication_manager.py index 23e65c17..2c2c566b 100644 --- a/stream/cost_model/communication_manager.py +++ b/stream/cost_model/communication_manager.py @@ -59,7 +59,7 @@ def __init__( end: int, tensor: Tensor, energy: float, - activity: float, + activity: int, sender: Core, receiver: Core, ) -> None: @@ -240,14 +240,16 @@ def get_inst_bw(op: MemoryOperand) -> int: links_to_offchip = set(self.get_links_for_pair(core, offchip_core)) for link in links_to_offchip: - tensors_per_link[link] = [(node.operand_tensors[Constants.OUTPUT_LAYER_OP])] + tensors_per_link[link] = tensors_per_link.get(link, []) + [ + (node.operand_tensors[Constants.OUTPUT_LAYER_OP]) + ] # Input operands non_output_mem_ops = [op for op in too_large_operands if op != Constants.OUTPUT_MEM_OP] if non_output_mem_ops: links_from_offchip = set(self.get_links_for_pair(offchip_core, core)) for link in links_from_offchip: - tensors_per_link[link] = [ + tensors_per_link[link] = tensors_per_link.get(link, []) + [ node.operand_tensors[node.memory_operand_links.mem_to_layer_op(op)] for op in non_output_mem_ops ] diff --git a/stream/hardware/architecture/accelerator.py b/stream/hardware/architecture/accelerator.py index f7ba93a1..5f6c5985 100644 --- a/stream/hardware/architecture/accelerator.py +++ b/stream/hardware/architecture/accelerator.py @@ -338,6 +338,9 @@ def find_earliest_time_for_transfer(links: list[CommunicationLink], nb_iteration windows: list[tuple[int, int]] = [] bandwidth_fractions = [i / nb_iterations for i in range(1, nb_iterations + 1)] + # ! 
+ bandwidth_fractions = [1 / len(self.core_list)] + for frac in bandwidth_fractions: links_with_bw = {link: ceil(frac * link.bandwidth) for link in links} start, end = find_transfer_start_and_end_time(links_with_bw) From 52b1c90c4ee523132d8b3d7382c4d984fdb6d857 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Fri, 20 Dec 2024 14:08:25 +0100 Subject: [PATCH 19/49] fix bug: multiple simultaneous blockings at full link capacity --- stream/cost_model/communication_manager.py | 46 +++-- stream/hardware/architecture/accelerator.py | 26 ++- .../architecture/noc/communication_link.py | 178 ++++++++++++++---- .../constraint_optimization/utils.py | 4 +- 4 files changed, 187 insertions(+), 67 deletions(-) diff --git a/stream/cost_model/communication_manager.py b/stream/cost_model/communication_manager.py index 2c2c566b..fa63f2ce 100644 --- a/stream/cost_model/communication_manager.py +++ b/stream/cost_model/communication_manager.py @@ -1,5 +1,5 @@ import itertools -from math import ceil +from math import ceil, floor from typing import TYPE_CHECKING from zigzag.datatypes import Constants, MemoryOperand @@ -253,33 +253,32 @@ def get_inst_bw(op: MemoryOperand) -> int: node.operand_tensors[node.memory_operand_links.mem_to_layer_op(op)] for op in non_output_mem_ops ] - # Sum the required bandwidth for all tensors on each link - total_required_bw_per_link = { - link: sum([get_inst_bw(tensor.memory_operand) for tensor in tensors]) - for link, tensors in tensors_per_link.items() + tensor_bw_per_link = { + link: [(tensor, get_inst_bw(tensor.memory_operand)) for tensor in tensors_this_link] + for link, tensors_this_link in tensors_per_link.items() } + # TODO Should the bandwidth be capped at the link BW? + # Get idle window of the involved links - block_start = self.get_links_idle_window(total_required_bw_per_link, start_timestep, duration, tensors_per_link) + block_start = self.get_links_idle_window(tensor_bw_per_link, start_timestep, duration) # # Block them - for link, tensors in tensors_per_link.items(): + for link, tensor_bws in tensor_bw_per_link.items(): + tensors = [tensor for tensor, _ in tensor_bws] + bandwidths = [bw for _, bw in tensor_bws] operands = [tensor.memory_operand for tensor in tensors] - bandwidths = [get_inst_bw(op) for op in operands] senders = [core if operand == Constants.OUTPUT_MEM_OP else offchip_core for operand in operands] receivers = [offchip_core if operand == Constants.OUTPUT_MEM_OP else core for operand in operands] - link.block( - block_start, duration, tensors, bandwidth_per_tensor=bandwidths, senders=senders, receivers=receivers - ) + link.block(block_start, duration, tensors, bandwidths=bandwidths, senders=senders, receivers=receivers) return block_start def get_links_idle_window( self, - links: dict["CommunicationLink", int], + tensor_bw_per_link: dict["CommunicationLink", list[tuple[Tensor, int]]], best_case_start: int, duration: int, - tensors_per_link: dict["CommunicationLink", list[Tensor]], ) -> int: """Return the timestep at which tensor can be transfered across the links. Both links must have an idle window large enough for the transfer. @@ -291,14 +290,25 @@ def get_links_idle_window( duration: The required duration of the idle window. tensors: The tensors to be transferred. Used to broadcast from previous transfer. 
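Each link first reports its own idle windows, and the transfer can only start where those windows overlap on every traversed link. A stand-alone sketch of the interval intersection this relies on (intersect_windows is an illustrative stand-in for the imported intersections helper, with invented windows):

```python
def intersect_windows(a: list[tuple[int, int]], b: list[tuple[int, int]]) -> list[tuple[int, int]]:
    # Keep the overlap of every pair of idle windows from two links.
    out = []
    for s1, e1 in a:
        for s2, e2 in b:
            s, e = max(s1, s2), min(e1, e2)
            if s < e:
                out.append((s, e))
    return out

link_a = [(0, 40), (100, 200)]   # invented idle windows of one link
link_b = [(30, 120)]             # invented idle windows of a second link
print(intersect_windows(link_a, link_b))  # [(30, 40), (100, 120)]
```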
""" - assert len(links) > 0 + assert len(tensor_bw_per_link) > 0 idle_intersections: list[tuple[int, int]] = [] - for i, (link, req_bw) in enumerate(links.items()): - req_bw = min(req_bw, link.bandwidth) # ceil the bw - windows = link.get_idle_window(req_bw, duration, best_case_start, tensors_per_link[link]) + for i, (link, bandwidth_per_tensor) in enumerate(tensor_bw_per_link.items()): + + # Make sure total bandwidth <= link bandwidth + total_req_bw = sum([bw for _, bw in bandwidth_per_tensor]) + if total_req_bw > link.bandwidth: + normalization_factor = link.bandwidth / total_req_bw + bandwidth_per_tensor = [ + (tensor, floor(normalization_factor * bw)) for tensor, bw in bandwidth_per_tensor + ] + + windows = link.get_idle_window(bandwidth_per_tensor, duration, best_case_start) if i == 0: idle_intersections = windows else: idle_intersections = intersections(idle_intersections, windows) idle_intersections = [period for period in idle_intersections if period[1] - period[0] >= duration] - return idle_intersections[0][0] + + earliest_window = idle_intersections[0] # TODO is this the earliest + start_time, _ = earliest_window + return start_time diff --git a/stream/hardware/architecture/accelerator.py b/stream/hardware/architecture/accelerator.py index 5f6c5985..1f4f08bf 100644 --- a/stream/hardware/architecture/accelerator.py +++ b/stream/hardware/architecture/accelerator.py @@ -323,23 +323,29 @@ def find_transfer_start_and_end_time(links_bw: dict[CommunicationLink, int]): """ slowest_bw = min(links_bw.values()) transfer_duration = ceil(tensor.size / slowest_bw) + + tensor_bw_per_link = {link: [(tensor, link_bw)] for link, link_bw in links_bw.items()} + transfer_start = self.communication_manager.get_links_idle_window( - links=links_bw, + tensor_bw_per_link=tensor_bw_per_link, best_case_start=evictions_complete_timestep, duration=transfer_duration, - tensors_per_link={link: [tensor] for link in links_bw}, ) transfer_end = transfer_start + transfer_duration return transfer_start, transfer_end - def find_earliest_time_for_transfer(links: list[CommunicationLink], nb_iterations: int = 10): + def find_earliest_time_for_transfer( + links: list[CommunicationLink], nb_iterations: int = 1, default_fraction: float = 1 + ): """Find the earliest time at which a tensor transfer between 2 cores can happen. Iterate over the used bandwidth to find the transfer bandwidth at which the finish time is earliest""" windows: list[tuple[int, int]] = [] - bandwidth_fractions = [i / nb_iterations for i in range(1, nb_iterations + 1)] - # ! - bandwidth_fractions = [1 / len(self.core_list)] + if nb_iterations == 1: + bandwidth_fractions = [default_fraction] + else: + # Iterate over linearly spaced fractions of the bandwidth + bandwidth_fractions = [i / nb_iterations for i in range(1, nb_iterations + 1)] for frac in bandwidth_fractions: links_with_bw = {link: ceil(frac * link.bandwidth) for link in links} @@ -358,7 +364,13 @@ def find_earliest_time_for_transfer(links: list[CommunicationLink], nb_iteration sender_cores = self.memory_manager.cores_per_top_instance[storing_instance] sender_core = sender_cores[0] links = self.communication_manager.get_links_for_pair(sender_core, receiving_core) - (transfer_start, transfer_end), link_bw_fraction = find_earliest_time_for_transfer(links) + # ! 
By default, transfers only take a fraction of the total bandwidth + default_bandwidth_fraction = 1 / len(self.core_list) + (transfer_start, transfer_end), link_bw_fraction = find_earliest_time_for_transfer( + links, + nb_iterations=1, + default_fraction=default_bandwidth_fraction, + ) transfer_duration = transfer_end - transfer_start ################################# STEP 5 ################################# diff --git a/stream/hardware/architecture/noc/communication_link.py b/stream/hardware/architecture/noc/communication_link.py index 3857808f..75f0f30b 100644 --- a/stream/hardware/architecture/noc/communication_link.py +++ b/stream/hardware/architecture/noc/communication_link.py @@ -1,3 +1,4 @@ +from itertools import combinations, product from typing import TYPE_CHECKING, Literal import numpy as np @@ -62,7 +63,7 @@ def __init__( self.active_periods = [(0, float("inf"), 0)] self.active_ts = np.array([0, float("inf")]) self.active_deltas = np.array([0, 0]) - self.tensors: dict[Tensor, list[CommunicationLinkEvent]] = {} + self.previously_seen_tensors: dict[Tensor, list[CommunicationLinkEvent]] = {} def __str__(self) -> str: return f"CommunicationLink({self.sender}, {self.receiver}, bw={self.bandwidth})" @@ -107,7 +108,7 @@ def block( start: int, duration: int, tensors: list["Tensor"], - bandwidth_per_tensor: list[int], + bandwidths: list[int], senders: list["Core"], receivers: list["Core"], ): @@ -118,14 +119,14 @@ def block( duration: The duration of the blocking. tensors: A list of tensors for which we are blocking the link. activity: The percentage of the link bandwidth used - bandwidth_per_tensor: The bandwidth used by each tensor in the list. + bandwidths: The bandwidth used by each tensor in the list. senders: The cores sending the tensors. receivers: The cores receiving the tensors. """ - assert len(tensors) == len(bandwidth_per_tensor) + assert len(tensors) == len(bandwidths) end = start + duration # Create a CLEvent per tensor - for tensor, bandwidth, sender, receiver in zip(tensors, bandwidth_per_tensor, senders, receivers): + for tensor, bandwidth, sender, receiver in zip(tensors, bandwidths, senders, receivers): event = CommunicationLinkEvent( type="block", start=start, @@ -147,7 +148,7 @@ def update_activity(self, event: CommunicationLinkEvent): return # Check if this is a duplicate event for broadcast - previous_events = self.tensors.get(event.tensor, []) + previous_events = self.previously_seen_tensors.get(event.tensor, []) if any((previous_event.start == event.start for previous_event in previous_events)): return @@ -165,49 +166,146 @@ def update_activity(self, event: CommunicationLinkEvent): self.active_deltas = np.insert(self.active_deltas, idx_end, -activity) # Track that this link has transferred the tensors of this event for future broadcasts - self.tensors[event.tensor] = self.tensors.get(event.tensor, []) + [event] + self.previously_seen_tensors[event.tensor] = self.previously_seen_tensors.get(event.tensor, []) + [event] self.events.append(event) - def get_idle_window(self, activity: float, duration: int, earliest_t: int, tensors: list["Tensor"]): + def get_idle_window( + self, + bandwidth_per_tensor: list[tuple["Tensor", int]], + # activity: float, + duration: int, + earliest_t: int, + ): """ Get the earliest time window of duration `duration` from `earliest_t` with at least `activity` percent available. 
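The search works on the link's activity profile: active_ts holds the timesteps at which the occupied bandwidth changes and active_deltas the corresponding increments, so a cumulative sum gives the occupied bandwidth per segment. A small numpy sketch with invented numbers of how a segment with enough headroom is found:

```python
import numpy as np

bandwidth = 32
active_ts = np.array([0, 50, np.inf])       # invented profile: 24 units busy until t=50
active_deltas = np.array([24, -24, 0])

required_bw = 16
occupied = np.cumsum(active_deltas)          # [24, 0, 0] occupied bandwidth per segment
fits = occupied + required_bw <= bandwidth   # [False, True, True]
print(active_ts[fits][0])                    # 50.0 -> earliest segment with enough headroom
```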
""" - valid_windows: list[tuple[int, int]] = [] - ## Check if this tensor has already been transferred on this link before - # If so, check duration and earliest timestep requirements of this call - for tensor in tensors: - if tensor in self.tensors: - previous_events = self.tensors[tensor] + + def find_valid_window_for_given_bw(required_bandwidth: int): + valid_windows: list[tuple[int, int]] = [] + + ## Check other possible periods given the activity + activities = np.cumsum(self.active_deltas) + earliest_t_index = np.searchsorted(self.active_ts, earliest_t, side="right") + relevant_ts = self.active_ts[earliest_t_index:] + updated_ts = relevant_ts.copy() + relevant_activities = activities[earliest_t_index:] + # Insert the earliest timestep and the activity at that timestep + updated_ts = np.insert(updated_ts, 0, earliest_t) + updated_activities = np.insert(relevant_activities, 0, activities[earliest_t_index - 1]) + updated_activities = updated_activities + required_bandwidth + idxs = np.argwhere(updated_activities > self.bandwidth) + idxs = [idx[0] for idx in idxs] + idxs.append(len(updated_ts) - 1) + start = earliest_t + for idx in idxs: + end: int = updated_ts[idx] + if end - start >= duration: + valid_windows.append((start, end)) + try: + start: int = updated_ts[idx + 1] + except IndexError: + break + + if not valid_windows: + raise ValueError( + f"There are no valid windows of activity {required_bandwidth} and duration {duration} for {self}." + ) + return valid_windows + + def get_previous_valid_windows(tensor: "Tensor"): + windows: list[tuple[int, int]] = [] + if tensor in self.previously_seen_tensors: + previous_events = self.previously_seen_tensors[tensor] for previous_event in previous_events: # Previous event needs to be long enough duration_valid = previous_event.duration >= duration # Previous event needs to have happened at late enough time earliest_t_valid = previous_event.start >= earliest_t if duration_valid and earliest_t_valid: - valid_windows.append((previous_event.start, previous_event.end)) - ## Check other possible periods given the activity - activities = np.cumsum(self.active_deltas) - earliest_t_index = np.searchsorted(self.active_ts, earliest_t, side="right") - relevant_ts = self.active_ts[earliest_t_index:] - updated_ts = relevant_ts.copy() - relevant_activities = activities[earliest_t_index:] - # Insert the earliest timestep and the activity at that timestep - updated_ts = np.insert(updated_ts, 0, earliest_t) - updated_activities = np.insert(relevant_activities, 0, activities[earliest_t_index - 1]) - updated_activities = updated_activities + activity - idxs = np.argwhere(updated_activities > self.bandwidth) - idxs = [idx[0] for idx in idxs] - idxs.append(len(updated_ts) - 1) - start = earliest_t - for idx in idxs: - end: int = updated_ts[idx] - if end - start >= duration: - valid_windows.append((start, end)) - try: - start: int = updated_ts[idx + 1] - except IndexError: - break - if not valid_windows: - raise ValueError(f"There are no valid windows of activity {activity} and duration {duration} for {self}.") - return valid_windows + windows.append((previous_event.start, previous_event.end)) + return windows + + def window_has_bandwidth_left(window: tuple[int, int], remaining_req_bw: int): + if remaining_req_bw == 0: + return True + + start, end = window + assert start in self.active_ts and end in self.active_ts + start_idx = np.where(self.active_ts == start)[0] + end_idx = np.where(self.active_ts == end)[0] + activities = np.cumsum(self.active_deltas) + 
activities_in_window = activities[start_idx:end_idx] + return all(activities_in_window + remaining_req_bw <= self.bandwidth) + + tensors = [tensor for tensor, _ in bandwidth_per_tensor] + valid_windows_per_tensor = { + tensor: get_previous_valid_windows(tensor) for tensor in tensors if get_previous_valid_windows(tensor) != [] + } + + # # Case 1: previous windows for 1 tensor found + # # NOTE functionality fully covered by case 2 + # if len(valid_windows_per_tensor) == 1: + # previously_seen_tensor, previous_windows = next(iter(valid_windows_per_tensor.items())) + # remaining_req_bw = sum([bw for tensor, bw in bandwidth_per_tensor if tensor != previously_seen_tensor]) + # all_valid_windows = [ + # previous_window + # for previous_window in previous_windows + # if window_has_bandwidth_left(previous_window, remaining_req_bw) + # ] + # # If valid windows are found, return those + # if all_valid_windows: + # return + + # Case 2: check all previously seen window combinations: + all_valid_windows: list[tuple[int, int]] = [] + for r in range(1, len(valid_windows_per_tensor) + 1, -1)[::-1]: + # e.g. if 3 tensors have been seen before, check the windows for tensors (1,2,3), (1,2), (2,3), (1,3), ... + for tensor_combination in combinations(valid_windows_per_tensor, r): + # Bandwidth that needs to be allocated, for tensors not in the previously registered window + remaining_req_bw = sum([bw for tensor, bw in bandwidth_per_tensor if tensor not in tensor_combination]) + all_window_combinations = product(*[valid_windows_per_tensor[tensor] for tensor in tensor_combination]) + for window_combination in all_window_combinations: + curr_window = window_combination[0] + # Windows must overlap exactly and have bandwidth left + if all(window == curr_window for window in window_combination[1::]) and window_has_bandwidth_left( + curr_window, remaining_req_bw + ): + all_valid_windows.append(curr_window) + + # If valid windows have been found in previously registered windows, return those + if all_valid_windows: + return all_valid_windows + + # Base case: don't assume previous transfers and find new window for all tensors + total_req_bw = sum([bw for _, bw in bandwidth_per_tensor]) + return find_valid_window_for_given_bw(total_req_bw) + + # NOTE checks if all tensors are previously seen + # valid_windows: list[tuple[int, int]] = [] + # ## Check if the tensors have already been transferred on this link before + # # If so, check duration and earliest timestep requirements of this call + # if all(tensor in self.previously_seen_tensors for tensor in tensors): + # previous_event_per_tensor = [self.previously_seen_tensors[tensor] for tensor in tensors] + + # # We need to find a window where all required tensors were simultaneously transferred + # for previous_event_combination in product(*previous_event_per_tensor): + # duration_valid = all(event.duration >= duration for event in previous_event_combination) + # earliest_t_valid = all(event.start >= earliest_t for event in previous_event_combination) + + # # TODO should all events of this combination overlap exactly? 
+ + # if duration_valid and earliest_t_valid: + # valid_windows.append((previous_event_combination[0].start, previous_event_combination[0].end)) + + # NOTE original code + # for tensor in tensors: + # if tensor in self.previously_seen_tensors: + # previous_events = self.previously_seen_tensors[tensor] + # for previous_event in previous_events: + # # Previous event needs to be long enough + # duration_valid = previous_event.duration >= duration + # # Previous event needs to have happened at late enough time + # earliest_t_valid = previous_event.start >= earliest_t + # if duration_valid and earliest_t_valid: + # valid_windows.append((previous_event.start, previous_event.end)) diff --git a/stream/opt/allocation/constraint_optimization/utils.py b/stream/opt/allocation/constraint_optimization/utils.py index 96da754e..c9e2c461 100644 --- a/stream/opt/allocation/constraint_optimization/utils.py +++ b/stream/opt/allocation/constraint_optimization/utils.py @@ -84,7 +84,7 @@ def get_latencies( latencies_with_split = {} possible_allocation_splits = {} - p_max = len(core_names) # maximum parallalization factor + p_max = len(core_names) # maximum parallelization factor for node_id in ids.values(): possible_allocation_splits[node_id] = {} @@ -93,7 +93,7 @@ def get_latencies( if core_name in possible_allocations[node_id]: p_t = int(inter_core_tiling_sizes[node_id, core_name]) for p in range(1, p_max + 1): - if divmod(p_t, p)[1] == 0 and p <= len(possible_allocations[node_id]): + if p <= len(possible_allocations[node_id]): lat = int(latencies[(node_id, core_name)] / min(p_t, p)) possible_allocation_splits[node_id][core_name][p] = 1 else: From 9b08f2a7b770561da0a755a3cf9b5ed962d37314 Mon Sep 17 00:00:00 2001 From: Arne Symons Date: Fri, 20 Dec 2024 15:14:06 +0100 Subject: [PATCH 20/49] update get_top_level_inst_bandwidth with scaling, requires zigzag 3.8.1 --- .../genetic_algorithm/fitness_evaluator.py | 6 +++-- .../zigzag_core_mapping_estimation.py | 6 +++++ .../set_fixed_allocation_performance.py | 4 +++- stream/utils.py | 23 ++++--------------- 4 files changed, 18 insertions(+), 21 deletions(-) diff --git a/stream/opt/allocation/genetic_algorithm/fitness_evaluator.py b/stream/opt/allocation/genetic_algorithm/fitness_evaluator.py index d0b63c10..a9de475e 100644 --- a/stream/opt/allocation/genetic_algorithm/fitness_evaluator.py +++ b/stream/opt/allocation/genetic_algorithm/fitness_evaluator.py @@ -3,7 +3,7 @@ from stream.cost_model.cost_model import StreamCostModelEvaluation from stream.hardware.architecture.accelerator import Accelerator -from stream.utils import CostModelEvaluationLUT, get_too_large_operands, get_total_required_offchip_bandwidth +from stream.utils import CostModelEvaluationLUT, get_too_large_operands, get_top_level_inst_bandwidth from stream.workload.computation.computation_node import ComputationNode from stream.workload.onnx_workload import ComputationNodeWorkload @@ -102,7 +102,9 @@ def set_node_core_allocations(self, core_allocations: list[int]): offchip_energy += layer_operand_offchip_energy onchip_energy -= layer_operand_offchip_energy # If there was offchip memory added for too_large_operands, get the offchip bandwidth - required_offchip_bandwidth = get_total_required_offchip_bandwidth(cme, too_large_operands) + required_offchip_bandwidth = sum( + get_top_level_inst_bandwidth(cme, mem_op) for mem_op in too_large_operands + ) node.set_onchip_energy(onchip_energy) node.set_offchip_energy(offchip_energy) node.set_runtime(int(latency)) diff --git 
a/stream/stages/estimation/zigzag_core_mapping_estimation.py b/stream/stages/estimation/zigzag_core_mapping_estimation.py index 146cc633..19e33bcd 100644 --- a/stream/stages/estimation/zigzag_core_mapping_estimation.py +++ b/stream/stages/estimation/zigzag_core_mapping_estimation.py @@ -109,6 +109,12 @@ def run(self): if mem_op not in too_large_operands_for_cme and node.operand_precision[layer_op] > 0: too_large_operands_for_cme.append(mem_op) # ! --- + # # ! --- FOR TESTING ONLY enforce blocking for all operands always + # for layer_op in node.input_operands + [node.output_operand]: + # mem_op = node.memory_operand_links.layer_to_mem_op(layer_op) + # if mem_op not in too_large_operands_for_cme: + # too_large_operands_for_cme.append(mem_op) + # # ! --- node_duplicate.set_chosen_core_allocation(core_id) # Attempt to override the node's spatial mapping based on the core's dataflow diff --git a/stream/stages/set_fixed_allocation_performance.py b/stream/stages/set_fixed_allocation_performance.py index ae1b0387..2827299f 100644 --- a/stream/stages/set_fixed_allocation_performance.py +++ b/stream/stages/set_fixed_allocation_performance.py @@ -69,8 +69,10 @@ def set_fixed_allocation_performance(self): onchip_energy, offchip_energy = self.get_energy_distribution(cme, too_large_operands) # Get the required offchip bandwidth during the execution of the node for all directions + bandwidth_scaling = cme.ideal_temporal_cycle / latency offchip_bandwidth_per_op: dict[MemoryOperand, MemoryAccesses] = { - mem_op: get_top_level_inst_bandwidth(cme, mem_op) for mem_op in too_large_operands + mem_op: get_top_level_inst_bandwidth(cme, mem_op, bandwidth_scaling) + for mem_op in too_large_operands } self.set_hw_performance_node(node, onchip_energy, offchip_energy, latency, core_id) node.set_too_large_operands(too_large_operands.copy()) diff --git a/stream/utils.py b/stream/utils.py index d9d8591a..621ec838 100644 --- a/stream/utils.py +++ b/stream/utils.py @@ -6,7 +6,7 @@ from numpy.typing import NDArray from zigzag.cost_model.cost_model import CostModelEvaluation from zigzag.datatypes import MemoryOperand -from zigzag.mapping.data_movement import FourWayDataMoving, MemoryAccesses +from zigzag.mapping.data_movement import MemoryAccesses from stream.hardware.architecture.core import Core from stream.workload.mapping import TILING_T @@ -95,27 +95,14 @@ def get_unique_nodes(workload: "ComputationNodeWorkload") -> list["ComputationNo return unique_nodes -def get_top_level_inst_bandwidth(cme: CostModelEvaluation, mem_op: MemoryOperand) -> MemoryAccesses: +def get_top_level_inst_bandwidth(cme: CostModelEvaluation, mem_op: MemoryOperand, scaling: float) -> MemoryAccesses: """Given a cost model evaluation and a memory instance, compute the memory's total instantaneous bandwidth required throughout the execution of the layer that corresponds to this CME. Returns empty bandwidth requirements if the given memory instance is not included in this CME's memory hierarchy. - NOTE: this function is used in Stream + The scaling factor can be used to scale the returned bandwidth. 
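Assuming the new `scaling` argument is meant to spread the ideal-cycle traffic over the node's real latency (that is how `set_fixed_allocation_performance` above computes it, as `ideal_temporal_cycle / latency`), the effect is a simple proportional rescaling. A toy calculation with made-up numbers:

ideal_temporal_cycles = 1_000       # stall-free cycle count reported by the CME
actual_latency = 1_250              # cycles the node really occupies the core
required_bw_ideal = 16              # words/cycle needed if the node ran stall-free

scaling = ideal_temporal_cycles / actual_latency
print(required_bw_ideal * scaling)  # 12.8 words/cycle of effective off-chip bandwidth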
""" - assert mem_op in cme.mem_hierarchy_dict - layer_op = cme.layer.memory_operand_links.mem_to_layer_op(mem_op) - inst_bw_4way = cme.mapping.unit_mem_data_movement[layer_op][-1].req_mem_bw_inst - return inst_bw_4way - - -def get_total_required_offchip_bandwidth( - cme: CostModelEvaluation, too_large_operands: list[MemoryOperand] -) -> FourWayDataMoving: - if not too_large_operands: - return FourWayDataMoving(0, 0, 0, 0) - # If there was offchip memory added for some operands, get the offchip bandwidth required - offchip_level = cme.accelerator.get_memory_level(too_large_operands[0], -1) - req_offchip_bw = cme.get_total_inst_bandwidth(offchip_level) - return req_offchip_bw + memory_level = cme.accelerator.get_memory_level(mem_op, -1) + return cme.get_inst_bandwidth(memory_level=memory_level, memory_operand=mem_op, scaling=scaling) def contains_wildcard(tiling: TILING_T): From 55220251a3e6a4db3934aef6095c15c4edde71af Mon Sep 17 00:00:00 2001 From: Arne Symons Date: Fri, 20 Dec 2024 18:58:02 +0100 Subject: [PATCH 21/49] fix true layer-by-layer scheduling --- stream/cost_model/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stream/cost_model/scheduler.py b/stream/cost_model/scheduler.py index c1089c85..b36572cb 100644 --- a/stream/cost_model/scheduler.py +++ b/stream/cost_model/scheduler.py @@ -236,7 +236,7 @@ def sync_cores_idle_from( predecessor_idxs = [i for i in range(len(scheduling_order)) if scheduling_order[i][0] in predecessor_ids] best_candidate_idx = scheduling_order.index((best_candidate.id, best_candidate.sub_id)) - if scheduling_order[best_candidate_idx - 1][0] in predecessor_ids and all( + if scheduling_order[best_candidate_idx - 1][0] != best_candidate.id and all( (i < best_candidate_idx for i in predecessor_idxs) ): # If the best_candidate is the first node of a layer and all nodes of predecessor layers have been scheduled From e774520329dc2154ee29f9471e2ae2e604dc0434 Mon Sep 17 00:00:00 2001 From: Arne Symons Date: Fri, 20 Dec 2024 18:58:35 +0100 Subject: [PATCH 22/49] use all CommunicationEvents for visualization --- stream/visualization/schedule.py | 33 +++++++++++++------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/stream/visualization/schedule.py b/stream/visualization/schedule.py index 3116c9cc..628d8ffa 100644 --- a/stream/visualization/schedule.py +++ b/stream/visualization/schedule.py @@ -403,28 +403,21 @@ def add_dependencies(fig, scme, colors, layer_ids): def get_communication_dicts(scme: "StreamCostModelEvaluation"): dicts = [] accelerator: Accelerator = scme.accelerator - active_links: set[CommunicationLink] = set() - for _, link_pair in accelerator.communication_manager.pair_links.items(): - if link_pair: - for link in link_pair: - if link.events: - active_links.add(link) - link_labels = [] - for pair_link_id, cl in enumerate(active_links): - resource = cl.get_name_for_schedule_plot() - link_labels.append(resource) - for event in cl.events: - task_type = event.type - start = event.start - end = event.end + for c_event in accelerator.communication_manager.events: + for cl_event in c_event.tasks: + # This assumes there is only one CommunicationLink between a pair of cores + cl = accelerator.communication_manager.get_links_for_pair(cl_event.sender, cl_event.receiver)[0] + task_type = cl_event.type + start = cl_event.start + end = cl_event.end runtime = end - start - energy = event.energy - tensor = event.tensor + energy = cl_event.energy + tensor = cl_event.tensor node = tensor.origin layer_id = node.id 
- activity = event.activity - sender = event.sender - receiver = event.receiver + activity = cl_event.activity + sender = cl_event.sender + receiver = cl_event.receiver if runtime == 0: continue d = dict( @@ -433,7 +426,7 @@ def get_communication_dicts(scme: "StreamCostModelEvaluation"): Sub_id=np.nan, Start=start, End=end, - Resource=resource, + Resource=cl.get_name_for_schedule_plot(), Layer=layer_id, Runtime=runtime, Tensors={tensor: tensor.size}, From f5fcbc2ef634261395110f955fc4521dbf9e0903 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Fri, 20 Dec 2024 20:21:29 +0100 Subject: [PATCH 23/49] make sure all bus connections use same CL instance --- .../architecture/noc/communication_link.py | 71 ++++++++++++------- stream/parser/accelerator_factory.py | 5 ++ stream/visualization/schedule.py | 11 +-- 3 files changed, 58 insertions(+), 29 deletions(-) diff --git a/stream/hardware/architecture/noc/communication_link.py b/stream/hardware/architecture/noc/communication_link.py index 75f0f30b..129f5b9d 100644 --- a/stream/hardware/architecture/noc/communication_link.py +++ b/stream/hardware/architecture/noc/communication_link.py @@ -16,30 +16,46 @@ def get_bidirectional_edges( bandwidth: float, unit_energy_cost: float, link_type: Literal["bus"] | Literal["link"], + bus_instance: "CommunicationLink | None" = None, ) -> list[tuple["Core", "Core", dict[str, "CommunicationLink"]]]: """Create a list with two edges: from A to B and B to A.""" - bus = CommunicationLink("Any", "Any", bandwidth, unit_energy_cost) - link_a_to_b = CommunicationLink(core_a, core_b, bandwidth, unit_energy_cost) - link_b_to_a = CommunicationLink(core_b, core_a, bandwidth, unit_energy_cost) - - # if have_shared_memory(core_a, core_b): - # # No edge if the cores have a shared memory - # return [] - - return [ - # A -> B - ( - core_a, - core_b, - {"cl": bus if link_type == "bus" else link_a_to_b}, - ), - # B -> A - ( - core_b, - core_a, - {"cl": bus if link_type == "bus" else link_b_to_a}, - ), - ] + + match link_type: + case "link": + link_a_to_b = CommunicationLink(core_a, core_b, bandwidth, unit_energy_cost) + link_b_to_a = CommunicationLink(core_b, core_a, bandwidth, unit_energy_cost) + return [ + # A -> B + ( + core_a, + core_b, + {"cl": link_a_to_b}, + ), + # B -> A + ( + core_b, + core_a, + {"cl": link_b_to_a}, + ), + ] + + case "bus": + assert bus_instance is not None + + return [ + # A -> B + ( + core_a, + core_b, + {"cl": bus_instance}, + ), + # B -> A + ( + core_b, + core_a, + {"cl": bus_instance}, + ), + ] class CommunicationLink: @@ -52,12 +68,14 @@ def __init__( bandwidth: int | float, unit_energy_cost: float, bidirectional: bool = False, + bus_id: int = -1, ) -> None: self.sender = sender self.receiver = receiver self.bandwidth = bandwidth self.unit_energy_cost = unit_energy_cost - self.bidirectional = bidirectional # TODO this property is not in use? 
+ self.bidirectional = bidirectional + self.bus_id = bus_id # Distinguishes links representing disconnected busses self.events: list[CommunicationLinkEvent] = [] self.active_periods = [(0, float("inf"), 0)] @@ -72,13 +90,16 @@ def __repr__(self) -> str: return str(self) def __hash__(self) -> int: - return hash((self.sender, self.receiver, self.bandwidth, self.unit_energy_cost, self.bidirectional)) + return hash( + (self.sender, self.receiver, self.bandwidth, self.unit_energy_cost, self.bidirectional, self.bus_id) + ) def __eq__(self, other: object) -> bool: - return isinstance(other, CommunicationLink) and (self.sender, self.receiver, self.bandwidth) == ( + return isinstance(other, CommunicationLink) and (self.sender, self.receiver, self.bandwidth, self.bus_id) == ( other.sender, other.receiver, other.bandwidth, + other.bus_id, ) def get_name_for_schedule_plot(self) -> str: diff --git a/stream/parser/accelerator_factory.py b/stream/parser/accelerator_factory.py index 5ad319b3..aa8754c7 100644 --- a/stream/parser/accelerator_factory.py +++ b/stream/parser/accelerator_factory.py @@ -101,6 +101,7 @@ def create_core_graph(self, cores: list[Core], offchip_core: Core | None): unit_energy_cost = self.data["unit_energy_cost"] connections: list[tuple[int, ...]] = self.data["core_connectivity"] edges: list[tuple[Core, Core, dict[str, CommunicationLink]]] = [] + current_bus_id = 0 # All links between cores for connection in connections: @@ -117,6 +118,9 @@ def create_core_graph(self, cores: list[Core], offchip_core: Core | None): ) else: # Connect cores to bus, edge by edge + # Make sure all links refer to the same `CommunicationLink` instance + bus_instance = CommunicationLink("Any", "Any", bandwidth, unit_energy_cost, bus_id=current_bus_id) + current_bus_id += 1 pairs_this_connection = [ (a, b) for idx, a in enumerate(connected_cores) for b in connected_cores[idx + 1 :] ] @@ -127,6 +131,7 @@ def create_core_graph(self, cores: list[Core], offchip_core: Core | None): bandwidth=bandwidth, unit_energy_cost=unit_energy_cost, link_type="bus", + bus_instance=bus_instance, ) # All links between cores and offchip core diff --git a/stream/visualization/schedule.py b/stream/visualization/schedule.py index 628d8ffa..254ccf2d 100644 --- a/stream/visualization/schedule.py +++ b/stream/visualization/schedule.py @@ -403,10 +403,13 @@ def add_dependencies(fig, scme, colors, layer_ids): def get_communication_dicts(scme: "StreamCostModelEvaluation"): dicts = [] accelerator: Accelerator = scme.accelerator - for c_event in accelerator.communication_manager.events: - for cl_event in c_event.tasks: - # This assumes there is only one CommunicationLink between a pair of cores - cl = accelerator.communication_manager.get_links_for_pair(cl_event.sender, cl_event.receiver)[0] + + active_links: set["CommunicationLink"] = set( + link for link_pair in accelerator.communication_manager.pair_links.values() for link in link_pair if link.events + ) + + for cl in active_links: + for cl_event in cl.events: task_type = cl_event.type start = cl_event.start end = cl_event.end From b3ce3cbf28d81c22b583cbe77530141d6f85d33d Mon Sep 17 00:00:00 2001 From: Arne Symons Date: Sun, 22 Dec 2024 01:07:39 +0100 Subject: [PATCH 24/49] fix tensor storing_instances call and CommunicationEvent creation --- stream/cost_model/communication_manager.py | 51 ++++++++++------ stream/hardware/architecture/accelerator.py | 7 +-- .../architecture/noc/communication_link.py | 60 ++++++++++--------- 3 files changed, 67 insertions(+), 51 deletions(-) diff --git 
a/stream/cost_model/communication_manager.py b/stream/cost_model/communication_manager.py index fa63f2ce..20165664 100644 --- a/stream/cost_model/communication_manager.py +++ b/stream/cost_model/communication_manager.py @@ -33,7 +33,7 @@ def __init__(self, id: int, tasks: list["CommunicationLinkEvent"], sender: Core, self.receiver = receiver def __str__(self) -> str: - return f"CommunicationEvent(id={self.id}, sender={self.sender}, receiver={self.receiver})" + return f"CommunicationEvent(id={self.id}, sender={self.sender}, receiver={self.receiver}, tensor={self.tasks[0].tensor}, energy={self.energy:.2e})" def __repr__(self) -> str: return str(self) @@ -180,19 +180,24 @@ def update_links( ) for link in links ] - event = CommunicationEvent( - id=self.event_id, - tasks=cles, - sender=sender, - receiver=receiver, - ) - self.events.append(event) - self.event_id += 1 link_energy_cost = 0 + is_new_event_across_all_links = True for link, cle in zip(links, cles): - transfer_energy_cost = link.transfer(cle) - link_energy_cost += transfer_energy_cost + transfer_energy_cost, is_new_event = link.transfer(cle) + if is_new_event: + link_energy_cost += transfer_energy_cost + else: + is_new_event_across_all_links = False + if is_new_event_across_all_links: + event = CommunicationEvent( + id=self.event_id, + tasks=cles, + sender=sender, + receiver=receiver, + ) + self.events.append(event) + self.event_id += 1 # Energy cost of memory reads/writes on sender/receiver # For this we need to know the memory operand in order to know where in the sender/receiver the tensor is stored # We assume the tensor to be sent is defined from the sender perspective, so we take its operand as the sender @@ -265,13 +270,23 @@ def get_inst_bw(op: MemoryOperand) -> int: # # Block them for link, tensor_bws in tensor_bw_per_link.items(): - tensors = [tensor for tensor, _ in tensor_bws] - bandwidths = [bw for _, bw in tensor_bws] - operands = [tensor.memory_operand for tensor in tensors] - senders = [core if operand == Constants.OUTPUT_MEM_OP else offchip_core for operand in operands] - receivers = [offchip_core if operand == Constants.OUTPUT_MEM_OP else core for operand in operands] - link.block(block_start, duration, tensors, bandwidths=bandwidths, senders=senders, receivers=receivers) - + for tensor, bandwidth in tensor_bws: + operand = tensor.memory_operand + sender = core if operand == Constants.OUTPUT_MEM_OP else offchip_core + receiver = offchip_core if operand == Constants.OUTPUT_MEM_OP else core + cle, is_new_event = link.block( + block_start, duration, tensor, bandwidth=bandwidth, sender=sender, receiver=receiver + ) + # TODO: Group multiple CommunicationLinkEvents into a single CommunicationEvent as opposed to one event per link and tensor + if is_new_event: + event = CommunicationEvent( + id=self.event_id, + tasks=[cle], + sender=sender, + receiver=receiver, + ) + self.events.append(event) + self.event_id += 1 return block_start def get_links_idle_window( diff --git a/stream/hardware/architecture/accelerator.py b/stream/hardware/architecture/accelerator.py index 1f4f08bf..cab94c80 100644 --- a/stream/hardware/architecture/accelerator.py +++ b/stream/hardware/architecture/accelerator.py @@ -397,10 +397,9 @@ def find_earliest_time_for_transfer( pass # Don't remove it from the producing core else: - not_on_producing_core = sender_core.id != tensor.origin.core_allocation - if (storing_instance not in tensor.instance_priorities) or ( - not_on_producing_core and tensor.instance_priorities[storing_instance] == 0 - ): + 
not_on_producing_core = sender_core.id != tensor.origin.chosen_core_allocation + tensor_priority = tensor.get_instance_priority(storing_instance, self.memory_manager) + if not_on_producing_core and tensor_priority == 0: self.remove( tensor, sender_core, diff --git a/stream/hardware/architecture/noc/communication_link.py b/stream/hardware/architecture/noc/communication_link.py index 129f5b9d..c14dd786 100644 --- a/stream/hardware/architecture/noc/communication_link.py +++ b/stream/hardware/architecture/noc/communication_link.py @@ -121,57 +121,58 @@ def transfer(self, link_event: CommunicationLinkEvent) -> float: int: The end time when communication on this link is finished """ energy_cost = link_event.energy - self.update_activity(link_event) - return energy_cost + is_new_event = self.update_activity(link_event) + return energy_cost, is_new_event def block( self, start: int, duration: int, - tensors: list["Tensor"], - bandwidths: list[int], - senders: list["Core"], - receivers: list["Core"], + tensor: "Tensor", + bandwidth: int, + sender: list["Core"], + receiver: list["Core"], ): """Block this communication link from start timestep for a given duration. Args: start: The timestep at which the blocking starts. duration: The duration of the blocking. - tensors: A list of tensors for which we are blocking the link. - activity: The percentage of the link bandwidth used - bandwidths: The bandwidth used by each tensor in the list. - senders: The cores sending the tensors. - receivers: The cores receiving the tensors. + tensor: The tensor for which we are blocking the link. + activity: The percentage of the link bandwidth used. + bandwidth: The bandwidth used. + sender: The sending core. + receiver: The receiving core. """ - assert len(tensors) == len(bandwidths) end = start + duration # Create a CLEvent per tensor - for tensor, bandwidth, sender, receiver in zip(tensors, bandwidths, senders, receivers): - event = CommunicationLinkEvent( - type="block", - start=start, - end=end, - tensor=tensor, - energy=tensor.origin.get_offchip_energy(), - activity=bandwidth, - sender=sender, - receiver=receiver, - ) - self.update_activity(event) - return - - def update_activity(self, event: CommunicationLinkEvent): + event = CommunicationLinkEvent( + type="block", + start=start, + end=end, + tensor=tensor, + energy=tensor.origin.get_offchip_energy(), + activity=bandwidth, + sender=sender, + receiver=receiver, + ) + is_new_event = self.update_activity(event) + return event, is_new_event + + def update_activity(self, event: CommunicationLinkEvent) -> bool: start = event.start end = event.end activity = event.activity + is_new_event = True if start == end: - return + is_new_event = False + return is_new_event # Check if this is a duplicate event for broadcast previous_events = self.previously_seen_tensors.get(event.tensor, []) if any((previous_event.start == event.start for previous_event in previous_events)): - return + is_new_event = False + return is_new_event idx_start = np.searchsorted(self.active_ts, start) if self.active_ts[idx_start] == start: @@ -189,6 +190,7 @@ def update_activity(self, event: CommunicationLinkEvent): self.previously_seen_tensors[event.tensor] = self.previously_seen_tensors.get(event.tensor, []) + [event] self.events.append(event) + return is_new_event def get_idle_window( self, From 47224e7b702256f896a00e9cb95bfc75078170f3 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Sun, 22 Dec 2024 16:14:43 +0100 Subject: [PATCH 25/49] transfer tensors with earlest possible timestep --- 
stream/cost_model/memory_manager.py | 17 ++- stream/cost_model/scheduler.py | 4 +- stream/hardware/architecture/accelerator.py | 132 +++++++++++--------- 3 files changed, 81 insertions(+), 72 deletions(-) diff --git a/stream/cost_model/memory_manager.py b/stream/cost_model/memory_manager.py index 66454c53..dbbfbd8b 100644 --- a/stream/cost_model/memory_manager.py +++ b/stream/cost_model/memory_manager.py @@ -124,10 +124,10 @@ def add_tensor_to_core( An error is raised if there is not enough space to add it. Args: - tensor (Tensor): The tensor to be added. + tensor: The tensor to be added. core (Core): The core to add it to. - timestep (int): The timestep at which space should be reserved for the tensor. - timestep_end (int): The timestep at which the tensor is available. + timestep: The timestep at which space should be reserved for the tensor. + timestep_ The timestep at which the tensor is available. memory_op: The memory operand where the tensor will be stored. Defaults to None. """ @@ -183,13 +183,13 @@ def get_timestep_for_tensor_addition( If there is never enough space, the latest timestep is returned. Args: - tensor (Tensor): The tensor to be added to the core. - core_id (int): The core id that is going to receive the tensor. - timestep (int): The timestep from which to start considering make this tensor data transfer. - memory_op (str): The memory operand storing the tensor on the receiving end of the transfer. + tensor: The tensor to be added to the core. + core_id: The core id that is going to receive the tensor. + timestep: The timestep from which to start considering make this tensor data transfer. + memory_op: The memory operand storing the tensor on the receiving end of the transfer. Returns: - can_add_from_timestep (int): The earliest timestep at which the transfer can actually start. + The earliest timestep at which the transfer can actually start. 
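The lookup this docstring describes can be summarized by the following standalone sketch (illustrative only; the actual implementation below scans the cumulative-usage profile with a reversed argmax rather than this naive loop):

import numpy as np

def earliest_fit(timesteps, usages, tensor_size, capacity, from_t):
    # usages[i] is the memory occupation that holds from timesteps[i] onwards
    idx = np.searchsorted(timesteps, from_t, side="right")
    for i in range(max(idx - 1, 0), len(timesteps)):
        # The tensor fits from this change point only if it also fits at every later one
        if np.max(usages[i:]) + tensor_size <= capacity:
            return max(from_t, int(timesteps[i]))
    return int(timesteps[-1])  # never enough space: fall back to the latest known timestep

# Capacity 100, usage 50 from t=0, 90 from t=10, 30 from t=20; a tensor of size 40 fits from t=20
print(earliest_fit(np.array([0, 10, 20]), np.array([50, 90, 30]), tensor_size=40, capacity=100, from_t=0))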
""" core = self.accelerator.get_core(core_id) top_level_idx = self.get_top_level_idx(core, memory_op) @@ -205,7 +205,6 @@ def get_timestep_for_tensor_addition( relevant_usages_reversed = relevant_usages[::-1] max_usage = np.max(relevant_usages_reversed) last_max_usage_idx = len(relevant_usages_reversed) - np.argmax(relevant_usages_reversed) - 1 - # abs_last_max_usage_idx = relevant_start_idx + last_max_usage_idx if max_usage + tensor.size <= top_instance_capacity: can_add_from_timestep = timestep return can_add_from_timestep diff --git a/stream/cost_model/scheduler.py b/stream/cost_model/scheduler.py index b36572cb..ec550ae0 100644 --- a/stream/cost_model/scheduler.py +++ b/stream/cost_model/scheduler.py @@ -333,7 +333,8 @@ def schedule_graph( assert core_id is not None core = accelerator.get_core(core_id) # Earliest start time is when core is available or predecessors finished - start = max(cores_idle_from[core_id], preds_end) + core_idle_from = cores_idle_from[core_id] + start = max(core_idle_from, preds_end) # Step 0 tensors_this_candidate_needs, tensors_operands = get_tensors_needed_for_node(best_candidate, G) # Step 1 @@ -372,6 +373,7 @@ def schedule_graph( core_id, tensor_operand, tensors_this_candidate_needs, + earliest_t=core_idle_from, ) # Update the possible start time of this node timestep = max(timestep, transfer_complete_timestep) diff --git a/stream/hardware/architecture/accelerator.py b/stream/hardware/architecture/accelerator.py index cab94c80..f1780ed8 100644 --- a/stream/hardware/architecture/accelerator.py +++ b/stream/hardware/architecture/accelerator.py @@ -90,10 +90,10 @@ def remove( """Remove tensor from core. If required, transfer to offchip before removal. Args: - tensor (Tensor): The tensor to remove. + tensor: The tensor to remove. core (Core): The Core to remove the tensor from. - memory_op (str): The memory operand of the tensor. - timestep (int): The timestep to remove the tensor at. + memory_op: The memory operand of the tensor. + timestep: The timestep to remove the tensor at. write_back_to_offchip (bool, optional): Write the tensor to offchip before removal. Defaults to False. """ assert self.offchip_core_id is not None @@ -152,9 +152,9 @@ def remove_all( Args: core (Core): The Core to remove the tensor from - memory_operand (str): The memory operand for which all tensors should be evicted. - timestep (int): The timestep to remove the tensor at. - exceptions (list): A list of tensors that should not be evicted. + memory_operand: The memory operand for which all tensors should be evicted. + timestep: The timestep to remove the tensor at. + exceptions: A list of tensors that should not be evicted. write_back_to_offchip (bool, optional): Write the tensor to offchip before removal. Defaults to False. """ total_link_energy_cost = 0 @@ -182,10 +182,10 @@ def make_space_for( """Make space for the given tensor on the given core by evicting already stored tensors if necessary. Args: - tensor (Tensor): The tensor to make space for. + tensor: The tensor to make space for. core (Core): The core where the tensor will be stored. - memory_operand (str): The memory operand on the core. - timestep (int): The timestep at which to make space for. + memory_operand: The memory operand on the core. + timestep: The timestep at which to make space for. 
""" total_eviction_link_energy_cost = 0 total_eviction_memory_energy_cost = 0 @@ -238,6 +238,7 @@ def transfer_tensor_to_core( tensor_operand: MemoryOperand, non_evictable_tensors: list[Tensor], sending_core_id: int | None = None, + earliest_t: int = 0, ) -> tuple[int, float, float, float, float, bool]: """ Transfer a tensor to a given core id. @@ -255,12 +256,58 @@ def transfer_tensor_to_core( If one of the links already transferred the tensor, we broadcast if possible. Args: - tensor (Tensor): The tensor to transfer. - receiving_core_id (int): The id of the core that needs to receive the tensor. - tensor_operand (str): The memory operand where the tensor needs to be stored. - non_evictable_tensors (list): the stored tensor that cannot be evicted - sending_core_id (int, optional): The id of the core that should transfer the tensor. + tensor: The tensor to transfer. + receiving_core_id: The id of the core that needs to receive the tensor. + tensor_operand: The memory operand where the tensor needs to be stored. + non_evictable_tensors: the stored tensor that cannot be evicted + sending_core_id: The id of the core that should transfer the tensor. + earliest_t: Earliest timestep at which transfer can happen """ + + def find_transfer_start_and_end_time(links_bw: dict[CommunicationLink, int]): + """ + Given the links to transfer across and corresponding available bandwidths, return the earliest transfer + start and end time. + + Args: + links_bw: link and corresponding transfer bandwidth + """ + slowest_bw = min(links_bw.values()) + transfer_duration = ceil(tensor.size / slowest_bw) + tensor_bw_per_link = {link: [(tensor, link_bw)] for link, link_bw in links_bw.items()} + transfer_start = self.communication_manager.get_links_idle_window( + tensor_bw_per_link=tensor_bw_per_link, + best_case_start=evictions_complete_timestep, + duration=transfer_duration, + ) + transfer_end = transfer_start + transfer_duration + return transfer_start, transfer_end + + def find_earliest_time_for_transfer( + links: list[CommunicationLink], nb_iterations: int = 1, default_fraction: float = 1 + ): + """Find the earliest time at which a tensor transfer between 2 cores can happen. 
Iterate over the used + bandwidth to find the transfer bandwidth at which the finish time is earliest""" + windows: list[tuple[int, int]] = [] + + # Either use the default fraction, or linearly space the fractions to try out + if nb_iterations == 1: + bandwidth_fractions = [default_fraction] + else: + # Iterate over linearly spaced fractions of the bandwidth + bandwidth_fractions = [i / nb_iterations for i in range(1, nb_iterations + 1)] + + for frac in bandwidth_fractions: + links_with_bw = {link: ceil(frac * link.bandwidth) for link in links} + start, end = find_transfer_start_and_end_time(links_with_bw) + windows.append((start, end)) + + ends = [end for _, end in windows] + best_idx = ends.index(min(ends)) + best_window = windows[best_idx] + best_fraction = bandwidth_fractions[best_idx] + return best_window, best_fraction + ################################# STEP 0 ################################# # Check if the tensor is already on the receiving core # Get the top instance where the tensor will be transferred to @@ -268,6 +315,7 @@ def transfer_tensor_to_core( receiving_top_instance = self.get_top_instance_of_core(receiving_core_id, tensor_operand) if self.memory_manager.contains(tensor, receiving_top_instance): return -1, 0, 0, 0, 0, False + ################################# STEP 1 ################################# # Get the top instance storing the tensor # If a sending core id is provided, we get the instance of that core. @@ -289,14 +337,17 @@ def transfer_tensor_to_core( if timestep == available_since_timestep ) ) + ################################# STEP 2 ################################# # The receiver core has enough space to store the tensor. + earliest_tensor_addition_t = max(earliest_t, available_since_timestep) enough_space_timestep = self.memory_manager.get_timestep_for_tensor_addition( tensor, receiving_core_id, - available_since_timestep, + earliest_tensor_addition_t, memory_op=tensor_operand, ) + ################################# STEP 3 ################################# # Make space on the receiving core by evicting tensors if there was never enough space. ( @@ -310,54 +361,9 @@ def transfer_tensor_to_core( enough_space_timestep, non_evictable_tensors, ) + ################################# STEP 4 ################################# # The links between sender and receiver have a long enough idle window. - - def find_transfer_start_and_end_time(links_bw: dict[CommunicationLink, int]): - """ - Given the links to transfer across and corresponding available bandwidths, return the earliest transfer - start and end time. - - Args: - links_bw: link and corresponding transfer bandwidth - """ - slowest_bw = min(links_bw.values()) - transfer_duration = ceil(tensor.size / slowest_bw) - - tensor_bw_per_link = {link: [(tensor, link_bw)] for link, link_bw in links_bw.items()} - - transfer_start = self.communication_manager.get_links_idle_window( - tensor_bw_per_link=tensor_bw_per_link, - best_case_start=evictions_complete_timestep, - duration=transfer_duration, - ) - transfer_end = transfer_start + transfer_duration - return transfer_start, transfer_end - - def find_earliest_time_for_transfer( - links: list[CommunicationLink], nb_iterations: int = 1, default_fraction: float = 1 - ): - """Find the earliest time at which a tensor transfer between 2 cores can happen. 
Iterate over the used - bandwidth to find the transfer bandwidth at which the finish time is earliest""" - windows: list[tuple[int, int]] = [] - - if nb_iterations == 1: - bandwidth_fractions = [default_fraction] - else: - # Iterate over linearly spaced fractions of the bandwidth - bandwidth_fractions = [i / nb_iterations for i in range(1, nb_iterations + 1)] - - for frac in bandwidth_fractions: - links_with_bw = {link: ceil(frac * link.bandwidth) for link in links} - start, end = find_transfer_start_and_end_time(links_with_bw) - windows.append((start, end)) - - ends = [end for _, end in windows] - best_idx = ends.index(min(ends)) - best_window = windows[best_idx] - best_fraction = bandwidth_fractions[best_idx] - return best_window, best_fraction - # TODO If the storing_instance is a shared instance across more than one core, # TODO there will be multiple possible cores to transfer between. # TODO For now, we take the first one @@ -376,6 +382,7 @@ def find_earliest_time_for_transfer( ################################# STEP 5 ################################# # Spawn the tensor on the receiving core self.spawn(tensor, receiving_core, tensor_operand, transfer_start, transfer_end) + ################################# STEP 6 ################################# # Update the links involved in the communication and get the transfer energy cost ( @@ -390,6 +397,7 @@ def find_earliest_time_for_transfer( transfer_duration, link_bw_fraction=link_bw_fraction, ) + ################################# STEP 7 ################################# # Remove the transferred tensor from the sender core (excluding DRAM) # if it is no longer needed. @@ -407,10 +415,10 @@ def find_earliest_time_for_transfer( transfer_end, write_back_to_offchip=False, ) + ################################# STEP 8 ################################# # Give back flag that signals if the tensor came from offchip came_from_offchip = sender_core.id == self.offchip_core_id - return ( transfer_end, transfer_link_energy_cost, From 30e6ff04672eff634a67ef72006110a5f6d43cd2 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Mon, 23 Dec 2024 11:45:11 +0100 Subject: [PATCH 26/49] fix off-by-1 error in get_timestep_for_tensor_addition --- stream/cost_model/memory_manager.py | 6 +++--- stream/hardware/architecture/accelerator.py | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/stream/cost_model/memory_manager.py b/stream/cost_model/memory_manager.py index dbbfbd8b..25bc484d 100644 --- a/stream/cost_model/memory_manager.py +++ b/stream/cost_model/memory_manager.py @@ -197,11 +197,11 @@ def get_timestep_for_tensor_addition( top_instance_capacity = self.top_instance_capacities[top_instance] all_timesteps = self.top_instance_stored_cumsum[top_instance][:, 0] all_usages = self.top_instance_stored_cumsum[top_instance][:, 1] - relevant_start_idx = np.searchsorted(all_timesteps, timestep, "right") - 1 + relevant_start_idx = np.searchsorted(all_timesteps, timestep, "right") if relevant_start_idx == len(all_timesteps): return timestep - relevant_timesteps = all_timesteps[relevant_start_idx:] - relevant_usages = all_usages[relevant_start_idx:] + relevant_timesteps = all_timesteps[relevant_start_idx - 1:] + relevant_usages = all_usages[relevant_start_idx - 1:] relevant_usages_reversed = relevant_usages[::-1] max_usage = np.max(relevant_usages_reversed) last_max_usage_idx = len(relevant_usages_reversed) - np.argmax(relevant_usages_reversed) - 1 diff --git a/stream/hardware/architecture/accelerator.py 
b/stream/hardware/architecture/accelerator.py index f1780ed8..e4447a59 100644 --- a/stream/hardware/architecture/accelerator.py +++ b/stream/hardware/architecture/accelerator.py @@ -31,10 +31,7 @@ def __init__( cores: CoreGraph, offchip_core_id: int | None = None, ): - """ - Args: - core_ids_with_shared_mem: indicate which cores (identified by core id) have a shared top level memory - """ + """ """ self.name = name self.cores = cores self.offchip_core_id = offchip_core_id @@ -371,7 +368,19 @@ def find_earliest_time_for_transfer( sender_core = sender_cores[0] links = self.communication_manager.get_links_for_pair(sender_core, receiving_core) # ! By default, transfers only take a fraction of the total bandwidth - default_bandwidth_fraction = 1 / len(self.core_list) + nb_shared_mems = ( + len( + set( + ( + instance.shared_memory_group_id + for instances in self.memory_manager.top_instances_per_core.values() + for instance in instances + ) + ) + ) + - 1 + ) + default_bandwidth_fraction = 1 / nb_shared_mems (transfer_start, transfer_end), link_bw_fraction = find_earliest_time_for_transfer( links, nb_iterations=1, From 89762b24916f208c1f92723361f7d172007e105c Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Mon, 23 Dec 2024 14:20:02 +0100 Subject: [PATCH 27/49] store nb_shared_mem_groups in accelerator instance --- stream/cost_model/memory_manager.py | 6 +++--- stream/hardware/architecture/accelerator.py | 16 +++------------- stream/parser/accelerator_factory.py | 11 ++++++++++- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/stream/cost_model/memory_manager.py b/stream/cost_model/memory_manager.py index 25bc484d..b38de46a 100644 --- a/stream/cost_model/memory_manager.py +++ b/stream/cost_model/memory_manager.py @@ -197,11 +197,11 @@ def get_timestep_for_tensor_addition( top_instance_capacity = self.top_instance_capacities[top_instance] all_timesteps = self.top_instance_stored_cumsum[top_instance][:, 0] all_usages = self.top_instance_stored_cumsum[top_instance][:, 1] - relevant_start_idx = np.searchsorted(all_timesteps, timestep, "right") + relevant_start_idx = np.searchsorted(all_timesteps, timestep, "right") if relevant_start_idx == len(all_timesteps): return timestep - relevant_timesteps = all_timesteps[relevant_start_idx - 1:] - relevant_usages = all_usages[relevant_start_idx - 1:] + relevant_timesteps = all_timesteps[relevant_start_idx - 1 :] + relevant_usages = all_usages[relevant_start_idx - 1 :] relevant_usages_reversed = relevant_usages[::-1] max_usage = np.max(relevant_usages_reversed) last_max_usage_idx = len(relevant_usages_reversed) - np.argmax(relevant_usages_reversed) - 1 diff --git a/stream/hardware/architecture/accelerator.py b/stream/hardware/architecture/accelerator.py index e4447a59..4f0ab3b1 100644 --- a/stream/hardware/architecture/accelerator.py +++ b/stream/hardware/architecture/accelerator.py @@ -29,12 +29,14 @@ def __init__( self, name: str, cores: CoreGraph, + nb_shared_mem_groups: int, offchip_core_id: int | None = None, ): """ """ self.name = name self.cores = cores self.offchip_core_id = offchip_core_id + self.nb_shared_mem_groups = nb_shared_mem_groups self.memory_manager = MemoryManager(self) self.communication_manager = CommunicationManager(self) @@ -368,19 +370,7 @@ def find_earliest_time_for_transfer( sender_core = sender_cores[0] links = self.communication_manager.get_links_for_pair(sender_core, receiving_core) # ! 
By default, transfers only take a fraction of the total bandwidth - nb_shared_mems = ( - len( - set( - ( - instance.shared_memory_group_id - for instances in self.memory_manager.top_instances_per_core.values() - for instance in instances - ) - ) - ) - - 1 - ) - default_bandwidth_fraction = 1 / nb_shared_mems + default_bandwidth_fraction = 1 / self.nb_shared_mem_groups (transfer_start, transfer_end), link_bw_fraction = find_earliest_time_for_transfer( links, nb_iterations=1, diff --git a/stream/parser/accelerator_factory.py b/stream/parser/accelerator_factory.py index aa8754c7..693e2e7d 100644 --- a/stream/parser/accelerator_factory.py +++ b/stream/parser/accelerator_factory.py @@ -19,10 +19,13 @@ def __init__(self, data: dict[str, Any]): def create(self) -> Accelerator: """! Create an Accelerator instance from the user-provided data.""" cores: list[Core] = [] + unique_shared_mem_group_ids: set[int] = set() + for core_id, core_data in self.data["cores"].items(): shared_mem_group_id = self.get_shared_mem_group_id(core_id) core = self.create_core(core_data, core_id, shared_mem_group_id) cores.append(core) + unique_shared_mem_group_ids.add(shared_mem_group_id) # Extra check on shared memory if self.have_non_identical_shared_memory(cores): @@ -41,9 +44,15 @@ def create(self) -> Accelerator: offchip_core = self.create_core(self.data["offchip_core"], offchip_core_id) cores_graph = self.create_core_graph(cores, offchip_core) + nb_shared_mem_groups = len(unique_shared_mem_group_ids) # Take next available core id - return Accelerator(name=self.data["name"], cores=cores_graph, offchip_core_id=offchip_core_id) + return Accelerator( + name=self.data["name"], + cores=cores_graph, + offchip_core_id=offchip_core_id, + nb_shared_mem_groups=nb_shared_mem_groups, + ) def create_core(self, core_data: dict[str, Any], core_id: int, shared_mem_group_id: int | None = None): core_factory = ZigZagCoreFactory(core_data) From ae399e60506f72eab1e9ee7a58ec540f25d10131 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Mon, 23 Dec 2024 14:20:42 +0100 Subject: [PATCH 28/49] no longer block constant operands that fit in memory --- .../estimation/zigzag_core_mapping_estimation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/stream/stages/estimation/zigzag_core_mapping_estimation.py b/stream/stages/estimation/zigzag_core_mapping_estimation.py index 19e33bcd..dd50a624 100644 --- a/stream/stages/estimation/zigzag_core_mapping_estimation.py +++ b/stream/stages/estimation/zigzag_core_mapping_estimation.py @@ -103,12 +103,12 @@ def run(self): # It's possible this node might not fully fit within the core's top level memories. # If so, we update the core too_large_operands_for_cme = self.check_core_capacity_for_node(core, node_duplicate) - # ! --- ensure all constant weights are accessed via blocking behavior i.s.o. transfer - for layer_op in node.constant_operands: - mem_op = node.memory_operand_links.layer_to_mem_op(layer_op) - if mem_op not in too_large_operands_for_cme and node.operand_precision[layer_op] > 0: - too_large_operands_for_cme.append(mem_op) - # ! --- + # # ! --- ensure all constant weights are accessed via blocking behavior i.s.o. transfer + # for layer_op in node.constant_operands: + # mem_op = node.memory_operand_links.layer_to_mem_op(layer_op) + # if mem_op not in too_large_operands_for_cme and node.operand_precision[layer_op] > 0: + # too_large_operands_for_cme.append(mem_op) + # # ! --- # # ! 
--- FOR TESTING ONLY enforce blocking for all operands always # for layer_op in node.input_operands + [node.output_operand]: # mem_op = node.memory_operand_links.layer_to_mem_op(layer_op) From 21e087b3cf58d45283be2872b9fdc5f93cde3566 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Mon, 23 Dec 2024 21:46:04 +0100 Subject: [PATCH 29/49] update to new zigzag cycles_per_op feature --- stream/cost_model/scheduler.py | 68 ++++++++++--------- .../zigzag_core_mapping_estimation.py | 27 +++++--- 2 files changed, 53 insertions(+), 42 deletions(-) diff --git a/stream/cost_model/scheduler.py b/stream/cost_model/scheduler.py index ec550ae0..3dd54ec6 100644 --- a/stream/cost_model/scheduler.py +++ b/stream/cost_model/scheduler.py @@ -122,8 +122,8 @@ def get_tensors_needed_for_node(node: ComputationNode, G: ComputationNodeWorkloa for pred, node, edge_data in sorted(G.in_edges(node, data=True), key=itemgetter(0)): if pred.id == node.id: continue # Skip if predecessor was from the same layer (intra-edge) - consumer_layer_op = edge_data["operand"] - consumer_memory_op = node.memory_operand_links[consumer_layer_op] + consumer_layer_op: LayerOperand = edge_data["operand"] + consumer_memory_op = node.memory_operand_links.layer_to_mem_op(consumer_layer_op) if consumer_memory_op in node.too_large_operands: continue # Skip if tensor will be fetched fromm offchip throughout computation pred_output_tensor = pred.operand_tensors[pred.output_operand] @@ -178,7 +178,6 @@ def decrease_priority( def check_for_removal( tensors: list[Tensor], accelerator: "Accelerator", - node: ComputationNode, G: ComputationNodeWorkload, timestep: int, ): @@ -249,9 +248,9 @@ def sync_cores_idle_from( def schedule_graph( G: ComputationNodeWorkload, accelerator: "Accelerator", + scheduling_order: list[tuple[int, int]], cores_idle_from: dict[int, int] | None = None, operands_to_prefetch: list[LayerOperand] = [], - scheduling_order: list[tuple[int, int]] | None = None, ) -> tuple[int, float, float, float, float, float, float, float, float, float]: """Schedule the nodes of graph G across the cores in the system. Each node should have a core_allocation and runtime set. 
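Since `scheduling_order` is now a required positional argument, a call to `schedule_graph` looks roughly like the hedged sketch below. The workload and accelerator objects are assumed to come from the earlier Stream stages, and the order shown is just one naive choice: a lexicographically sorted list of `(node_id, sub_id)` pairs, matching the tuples that `sync_cores_idle_from` looks up above.

from stream.cost_model.scheduler import schedule_graph

# `workload` (a ComputationNodeWorkload) and `accelerator` are assumed to exist already
scheduling_order = sorted((node.id, node.sub_id) for node in workload.nodes())
results = schedule_graph(
    workload,
    accelerator,
    scheduling_order,            # now required and positional
    operands_to_prefetch=[],     # no constant operands prefetched in this sketch
)
total_latency = results[0]       # first entry of the returned tuple, assumed here to be the makespan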
@@ -332,29 +331,34 @@ def schedule_graph( core_id = best_candidate.chosen_core_allocation assert core_id is not None core = accelerator.get_core(core_id) + # Earliest start time is when core is available or predecessors finished core_idle_from = cores_idle_from[core_id] start = max(core_idle_from, preds_end) + timestep = start + # Step 0 tensors_this_candidate_needs, tensors_operands = get_tensors_needed_for_node(best_candidate, G) + # Step 1 # There could be operands that are too large to store in the highest memory on the core # The tensors stored in these memories should be evicted and potentially written back to off-chip # Clear these memories (this might delay the potential start time if things have to written to off-chip) - timestep = start ( clear_link_energy, clear_memory_energy, - timestep, + memory_cleared_timestep, ) = clear_memories( - accelerator, - core, - best_candidate.too_large_operands, - timestep, + accelerator=accelerator, + core=core, + memory_operands=best_candidate.too_large_operands, + timestep=timestep, exceptions=tensors_this_candidate_needs, ) total_eviction_to_offchip_link_energy += clear_link_energy total_eviction_to_offchip_memory_energy += clear_memory_energy + timestep = memory_cleared_timestep + # Step 2 # The computation might need tensors that are currently not present in the core's memories # We need to fetch these tensors from either off-chip or from the core where they are present @@ -388,28 +392,13 @@ def schedule_graph( total_eviction_to_offchip_memory_energy += eviction_memory_energy_cost # Step 3 - # Check if we had any operands that were too large to store in the core's memory, block the relevant off-chip - # link for the duration - # This might again delay the execution if the offchip link was already blocked by another core - timestep = accelerator.block_offchip_links( - best_candidate.too_large_operands, - core_id, - timestep, - best_candidate.get_runtime(), - best_candidate, - ) - - # Step 4 # Make space for the output tensor of this computation node and spawn it when evictions are complete # If the output operand is in the too large operands, add it to off-chip, otherwise add it to this core's # output memory output_layer_operand = best_candidate.output_operand - output_memory_operand = best_candidate.memory_operand_links[output_layer_operand] + output_memory_operand = best_candidate.memory_operand_links.layer_to_mem_op(output_layer_operand) output_tensor = best_candidate.operand_tensors[output_layer_operand] - if output_memory_operand in best_candidate.too_large_operands: - core_to_add_output_to = offchip_core - else: - core_to_add_output_to = core + core_to_add_output_to = offchip_core if output_memory_operand in best_candidate.too_large_operands else core ( evictions_complete_timestep, eviction_link_energy_cost, @@ -423,7 +412,24 @@ def schedule_graph( ) total_eviction_to_offchip_link_energy += eviction_link_energy_cost total_eviction_to_offchip_memory_energy += eviction_memory_energy_cost - start = evictions_complete_timestep + timestep = evictions_complete_timestep + + # Step 4 + # Check if we had any operands that were too large to store in the core's memory, block the relevant off-chip + # link for the duration + # This might again delay the execution if the offchip link was already blocked by another core + blocking_can_start_timestep = accelerator.block_offchip_links( + too_large_operands=best_candidate.too_large_operands, + core_id=core_id, + start_timestep=timestep, + duration=best_candidate.get_runtime(), + cn=best_candidate, + ) 
+ timestep = blocking_can_start_timestep + + # Step 5 + # Spawn the output tensor and update the start and end time of the node + start = timestep end = start + best_candidate.get_runtime() accelerator.spawn( output_tensor, @@ -432,9 +438,6 @@ def schedule_graph( initial_timestep=start, available_timestep=end, ) - - # Step 5 - # Update the start and end time of the node best_candidate.set_start(start) best_candidate.set_end(end) cores_idle_from[core_id] = end @@ -454,7 +457,6 @@ def schedule_graph( check_for_removal( tensors_this_candidate_needs, accelerator, - best_candidate, G, end, ) @@ -465,7 +467,7 @@ def schedule_graph( # outputs to offchip if best_candidate in sink_layer_nodes: # Only push back sink node outputs if they're generated and stored on the core - if best_candidate.output_operand not in best_candidate.too_large_operands: + if Constants.OUTPUT_MEM_OP not in best_candidate.too_large_operands: ( _, link_energy_cost, diff --git a/stream/stages/estimation/zigzag_core_mapping_estimation.py b/stream/stages/estimation/zigzag_core_mapping_estimation.py index dd50a624..4cc6f795 100644 --- a/stream/stages/estimation/zigzag_core_mapping_estimation.py +++ b/stream/stages/estimation/zigzag_core_mapping_estimation.py @@ -151,19 +151,28 @@ def run(self): def increase_cc_per_op(self, cme: CostModelEvaluation, op_type: str): match op_type: case "silu": - factor = 4 + cc_per_op = 4 case "sigmoid": - factor = 4 + cc_per_op = 4 case "exp": - factor = 4 + cc_per_op = 4 case _: - factor = 1 - - if factor > 1: - logger.warning(f"Setting cycles per mac of {op_type} node to {factor}") + cc_per_op = 1 + + if cc_per_op > 1: + logger.warning(f"Setting cycles per mac of {op_type} node to {cc_per_op}") + + new_cme = CostModelEvaluation( + accelerator=cme.accelerator, + layer=cme.layer, + spatial_mapping=cme.spatial_mapping, + spatial_mapping_int=cme.spatial_mapping_int, + temporal_mapping=cme.temporal_mapping, + access_same_data_considered_as_no_access=cme.access_same_data_considered_as_no_access, + cycles_per_op=cc_per_op, + ) - cme.calc_overall_latency(cycles_per_mac=factor) - return cme + return new_cme def visualize_cost_lut(self): # Get the scale factors From 6a5f5b4b42bc3b1a5d3d18cbea2dce7976c4bb60 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Tue, 24 Dec 2024 14:15:54 +0100 Subject: [PATCH 30/49] change NodeTensor extend_with_node to prevent too large memory usage (>160GiB) --- stream/node_tensor.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/stream/node_tensor.py b/stream/node_tensor.py index 26dbb22e..980c88fa 100644 --- a/stream/node_tensor.py +++ b/stream/node_tensor.py @@ -34,6 +34,9 @@ def initialize_empty(shape: tuple[int, ...], pre_allocation_size: int = 8): ComputationNodes are accumulated in the last dimension and space is pre-allocated in memory for performance""" return NodeTensor(np.zeros(shape + (pre_allocation_size,), dtype=object), pre_allocation_size) + def _get_pointer(self): + return self.__node_count + def _get_and_increment_pointer(self): """Get the index pointer in the last dimension. which points to the next free spot to allocate nodes. Automatically increments the pointer after each use. 
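The allocation strategy being changed here can be illustrated with a small standalone toy (plain Python lists instead of the object-dtype numpy array a NodeTensor wraps): slots are pre-allocated, the write pointer only advances when its current slot is already taken, and the buffer doubles when it runs out, which is what keeps repeated extensions from blowing up memory.

class GrowingSlots:
    # Toy stand-in for the node slots accumulated in a NodeTensor's last dimension
    def __init__(self, pre_allocation_size: int = 8):
        self.slots: list[object | None] = [None] * pre_allocation_size
        self.pointer = 0

    def add(self, node: object) -> None:
        # Case 1: the slot under the current pointer is still free -> reuse it
        if self.slots[self.pointer] is None:
            self.slots[self.pointer] = node
            return
        # Case 2: advance the pointer to the next slot
        self.pointer += 1
        # Case 3: out of pre-allocated space -> double the allocation first
        if self.pointer >= len(self.slots):
            self.slots.extend([None] * len(self.slots))
        self.slots[self.pointer] = node

slots = GrowingSlots(pre_allocation_size=2)
for i in range(5):
    slots.add(f"node{i}")
print(slots.slots)  # ['node0', 'node1', 'node2', 'node3', 'node4', None, None, None]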
If the index exceeds the allocated space, an error is @@ -85,11 +88,21 @@ def get_nb_empty_elements(self, slices: tuple[slice, ...]): def extend_with_node(self, slices: tuple[slice, ...], node: object) -> "NodeTensor": assert self.is_valid_shape_dimension(slices), "Last dimension of tensor is reserved for CNs" + # Case 1: Try to assign at the current pointer for given slices + idx = self._get_pointer() + extended_slices = slices + (slice(idx, idx + 1),) + # Slice is all 0 + if not np.any(self[extended_slices]): + self[extended_slices] = node + return self + + # Case 2: increment pointer and assign at empty slice try: idx = self._get_and_increment_pointer() extended_slices = slices + (slice(idx, idx + 1),) self[extended_slices] = node return self + # Case 3: pointer exceeds the tensor's accumulation dimension -> increase tensor size except IndexError: # Happens when all allocated space has been used up. Create new one and double allocated space new_pre_alloc_size = 2 * self.__pre_allocation_size From 09c3255b7db192d70e2c89410bf26091e8ada026 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Tue, 24 Dec 2024 14:17:14 +0100 Subject: [PATCH 31/49] fix error mesasge in group id core allocation --- stream/stages/generation/tiled_workload_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index a9a08b1b..22bf28cd 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -360,7 +360,7 @@ def get_tiles( if original_node.core_allocation_is_fixed: assert group_id < len( original_node.possible_core_allocation - ), f"Group id {group_id} too large for core allocation list {original_node.core_allocation}" + ), f"Group id {group_id} too large for core allocation list {original_node.possible_core_allocation}" chosen_core_allocation = original_node.possible_core_allocation[group_id] tile.set_chosen_core_allocation(chosen_core_allocation) From 94c29b2b920107e7499c46f60b128648c30e1e58 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Tue, 24 Dec 2024 16:35:54 +0100 Subject: [PATCH 32/49] fix group id manager issue in case loop_size % inter_core_tiling != 0 --- stream/cost_model/group_allocation.py | 38 +++++++++++-------- stream/opt/partitioning/TemporalLoop.py | 4 ++ .../generation/tiled_workload_generation.py | 32 ++++++++-------- 3 files changed, 44 insertions(+), 30 deletions(-) diff --git a/stream/cost_model/group_allocation.py b/stream/cost_model/group_allocation.py index 93c98e6f..ba53575f 100644 --- a/stream/cost_model/group_allocation.py +++ b/stream/cost_model/group_allocation.py @@ -4,7 +4,8 @@ from zigzag.datatypes import LayerDim from stream.utils import contains_wildcard -from stream.workload.computation.computation_node import ComputationNode, LoopRanges +from stream.workload.computation.computation_node import LoopRanges +from stream.workload.mapping import TILING_T logger = logging.getLogger(__name__) @@ -12,11 +13,18 @@ class GroupIdManager: - def __init__(self, node: ComputationNode): + def __init__( + self, + layer_dim_sizes: dict[LayerDim, int], + intra_core_tiling: TILING_T, + inter_core_tiling: TILING_T, + ): self.__id_count = 0 self.groups: GroupAllocation = {} - self.node = node - self.inter_core_tiled_dims = [layer_dim for layer_dim, _ in node.inter_core_tiling] + self.layer_dim_sizes = layer_dim_sizes + self.intra_core_tiling: list[tuple[LayerDim, int]] = intra_core_tiling + 
self.inter_core_tiling = inter_core_tiling + self.inter_core_tiled_dims = [layer_dim for layer_dim, _ in inter_core_tiling] def __get_and_raise_id(self): curr_id = self.__id_count @@ -33,24 +41,24 @@ def __get_range_identifier_single_dim(self, inter_core_layer_dim: LayerDim, curr given range modulo the size of the N equal parts. """ nb_intra_core_splits = next( - (split for layer_dim, split in self.node.intra_core_tiling if layer_dim == inter_core_layer_dim), 1 + (split for layer_dim, split in self.intra_core_tiling if layer_dim == inter_core_layer_dim), 1 ) - range_size_per_intra_split = self.node.layer_dim_sizes[inter_core_layer_dim] // nb_intra_core_splits + range_size_per_intra_split = self.layer_dim_sizes[inter_core_layer_dim] // nb_intra_core_splits range_adjusted_to_intra_split = tuple(i % range_size_per_intra_split for i in current_range) return range_adjusted_to_intra_split def __get_range_identifier(self, tile_loop_ranges: LoopRanges): """Given the loop ranges of a tile, return a hashable identifier that can be used to determine wether this tile belongs on the same core as other tiles.""" - if not all(layer_dim in tile_loop_ranges for layer_dim, _ in self.node.inter_core_tiling): + if not all(layer_dim in tile_loop_ranges for layer_dim, _ in self.inter_core_tiling): raise ValueError( - f"Given inter core tiling {self.node.inter_core_tiling} contains layer dims that are not " + f"Given inter core tiling {self.inter_core_tiling} contains layer dims that are not " f"part of the tile's loop ranges {tile_loop_ranges}" ) return tuple( self.__get_range_identifier_single_dim(layer_dim, tile_loop_ranges[layer_dim]) - for layer_dim, _ in self.node.inter_core_tiling + for layer_dim, _ in self.inter_core_tiling ) def get_group_id(self, tile_loop_ranges: LoopRanges) -> int: @@ -68,15 +76,15 @@ def get_group_id(self, tile_loop_ranges: LoopRanges) -> int: Returns: int: The group id for the given loop ranges """ - if contains_wildcard(self.node.inter_core_tiling): + if contains_wildcard(self.inter_core_tiling): # In this case, the tiles should not be split between cores yet return 0 - if not self.node.constant_operands and len(self.node.core_allocation) == 1: - # If the node can only be assigned to a single core, we give all nodes the same group id - # This is to prevent the CostModelEvaluationLUT from identifying each node as unique - # This is the case for e.g. 'Add' nodes if there is only a single 'Add' core - return 0 + # if not self.node.constant_operands and len(self.node.core_allocation) == 1: + # # If the node can only be assigned to a single core, we give all nodes the same group id + # # This is to prevent the CostModelEvaluationLUT from identifying each node as unique + # # This is the case for e.g. 
'Add' nodes if there is only a single 'Add' core + # return 0 # Differentiate based on node's inter core tiling range_identifier = self.__get_range_identifier(tile_loop_ranges) diff --git a/stream/opt/partitioning/TemporalLoop.py b/stream/opt/partitioning/TemporalLoop.py index 54d279eb..68805819 100644 --- a/stream/opt/partitioning/TemporalLoop.py +++ b/stream/opt/partitioning/TemporalLoop.py @@ -12,3 +12,7 @@ def __str__(self): def __repr__(self): return str(self) + + def unpack(self): + """Unpack `dimension` and `size`""" + return (self.dimension, self.size) diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index 22bf28cd..9ca2341f 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -228,27 +228,26 @@ def get_tiles( original_node: ComputationNode, outer_temporal_loops: list[TemporalLoop] ) -> tuple[list[ComputationNode], list[ComputationNode]]: original_node_id = original_node.id + # If the node's loop size is not divisible by the outer temporal loops, extend it + extended_node_dim_sizes: dict[LayerDim, int] = {} # Take away the outer_temporal_loops to create tiled CNs for this node tile_attrs = original_node.extract_node_attr() tile_mapping = original_node.extract_inter_core_mapping_attr() - for outer_tl in outer_temporal_loops: - outer_dim = outer_tl.dimension - outer_size = outer_tl.size + for loop in outer_temporal_loops: + outer_dim, outer_size = loop.unpack() # Check if this node's "dim" size is divisible by the outer-cn loop size - node_dim_size = tile_attrs.layer_dim_sizes[outer_dim] + node_dim_size: int = tile_attrs.layer_dim_sizes[outer_dim] q, rem = divmod(node_dim_size, outer_size) # returns x//y, x%y if rem != 0: # Make sure that the outer_dim is divisible by the outer_size # Pad the dimension to a multiple of outer_size - node_dim_size = q * outer_size - q, rem = divmod(node_dim_size, outer_size) - assert rem == 0, ( - f"Node {original_node} dim {outer_dim} of size {node_dim_size} is not divisible by outer-cn temporal " - f"loop {outer_tl}" - ) + node_dim_size = (q + 1) * outer_size + q += 1 + tile_attrs.layer_dim_sizes[outer_dim] = q + extended_node_dim_sizes[outer_dim] = node_dim_size # Loop dimension + size of the tiles (called span here) tile_span = tile_attrs.layer_dim_sizes @@ -263,9 +262,8 @@ def get_tiles( # This is to convert from the relative loop value which goes from 0, 1, ..., stop_value - 1 # to the absolute value of that dimension (if there is another lower loop of the same type or spatial loop) mult_factors: list[int] = [] - for i, outer_loop in enumerate(outer_temporal_loops): - loop_dim = outer_loop.dimension - stop_value = outer_loop.size + for i, loop in enumerate(outer_temporal_loops): + loop_dim, stop_value = loop.unpack() inner_span = tile_span[loop_dim] if loop_dim in tile_span else 1 lower_outer_cn_loops = outer_temporal_loops[:i] # Returns 1 if empty list @@ -276,7 +274,11 @@ def get_tiles( tiles: list[ComputationNode] = [] tensors: list[Tensor] = [] - group_id_manager = GroupIdManager(original_node) + group_id_manager = GroupIdManager( + layer_dim_sizes=extended_node_dim_sizes, + intra_core_tiling=original_node.intra_core_tiling, + inter_core_tiling=original_node.inter_core_tiling, + ) for n in range(nb_cns): outer_loop_values: list[int] = [] for i, outer_loop in enumerate(outer_temporal_loops): @@ -925,7 +927,7 @@ def deduce_tensor_reuse_factors( nb_nodes = prod([tl.size for tl in 
outer_temporal_loops]) # tensor reuse factor will be set to the total reuse factor for each node - # whenveer a cn will be scheduled, the tensor reuse factor will decrease + # whenever a cn will be scheduled, the tensor reuse factor will decrease tensor_reuse_factors: dict[LayerOperand, list[int]] = {} for op, total_reuse_factor in total_reuse_factors.items(): tensor_reuse_factors[op] = [total_reuse_factor] * nb_nodes From 5d8ca714855f29f5bf79691eb2211b2fb6c6ce7a Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Sun, 29 Dec 2024 19:54:49 +0100 Subject: [PATCH 33/49] sort scheduling order, sub_id = n_inter * i_intra + i_inter --- stream/cost_model/scheduler.py | 9 ++- .../constraint_optimization_allocation.py | 58 ++++++++++--------- .../generation/scheduling_order_generation.py | 13 ++--- .../generation/tiled_workload_generation.py | 2 +- 4 files changed, 42 insertions(+), 40 deletions(-) diff --git a/stream/cost_model/scheduler.py b/stream/cost_model/scheduler.py index 3dd54ec6..12315ac8 100644 --- a/stream/cost_model/scheduler.py +++ b/stream/cost_model/scheduler.py @@ -257,10 +257,10 @@ def schedule_graph( Args: G : Graph containing the nodes to be scheduled. - accelerator (Accelerator): The accelerator to schedule the nodes on. - cores_start_offset (dict, optional): A dict containing for each core_id its start offset. Defaults to None. - operands_to_prefetch (list, optional): The layer operands that should be prefetched at the start of the - schedule. + accelerator: The accelerator to schedule the nodes on. + scheduling_order: + cores_idle_from: A dict containing for each core_id its start offset. Defaults to None. + operands_to_prefetch: The layer operands that should be prefetched at the start of the schedule. """ # Initialize total link energy cost and memory energy costs total_cn_onchip_energy = 0 @@ -490,7 +490,6 @@ def schedule_graph( (predecessor.end for predecessor in G.predecessors(successor)), default=0, ) - # core_candidates[successor.core_allocation].append((preds_end, successor)) candidates.append((preds_end, successor)) # Increment the number of scheduled nodes diff --git a/stream/stages/allocation/constraint_optimization_allocation.py b/stream/stages/allocation/constraint_optimization_allocation.py index dbcd20d5..68dad683 100644 --- a/stream/stages/allocation/constraint_optimization_allocation.py +++ b/stream/stages/allocation/constraint_optimization_allocation.py @@ -251,13 +251,14 @@ def get_scheduling_order(self, unpartitioned_workload: DNNWorkloadStream) -> SCH """ scheduling_order: SCHEDULE_ORDER_T = [] - for stack, compute in self.compute_per_sink_node.items(): + for stack in sorted(self.compute_per_sink_node): + compute_this_stack = self.compute_per_sink_node[stack] hash_steady_state = self.steady_state_hashes[stack] allocation_steady_state = self.optimal_allocation_per_stack[stack] hashes_per_sink_node = self.hashes_per_sink_node[stack] order = self.get_cn_order( allocation=allocation_steady_state, - compute_per_sink_node=compute, + compute_per_sink_node=compute_this_stack, hashes_per_sink_node=hashes_per_sink_node, memoization_hash_ss=hash_steady_state, ) @@ -272,7 +273,13 @@ def adjust_order_to_inter_core_tiling( ): """Given an allocation order for a given stack, extend the order to extra outer loops that result from the inter core tiling. This method anticipates the fact that later on, CNs will be split further to allow for inter- - core tiling, and adjusts the scheduling beforehand + core tiling, and adjusts the scheduling beforehand. 
+ + Example: [(0, 12), (0, 13)] and inter_core_tiling = 4 + -> [(0, 4*12+0), (0, 49), (0, 50), (0, 51), (0, 4*13+0), ...] + <------intra-core partition 12-------> <---- partition 13 ----> + + NOTE The ordering given by this method must match the order in which tiles are generated in `get_tiles` Args: stack: CN stack for which the order applies @@ -281,36 +288,33 @@ def adjust_order_to_inter_core_tiling( core tiling loops """ + adjusted_order = order.copy() - for node in self.get_computation_nodes(stack, unpartitioned_workload): + for curr_node in self.get_computation_nodes(stack, unpartitioned_workload): # NOTE this uses `inter_core_tiling`, because the inter core tiling is added to the intra core tiling # in `schedule_allocation` in order to alter the workload - outer_loops = node.inter_core_tiling - - # try: - # outer_loops = hint_loops[(layer_id,)] - # except KeyError: - # # If the layer_id is not present it means it was not in the allocation. - # # This happens if all nodes of the layer were not in the steady state - # outer_loops = [] - - for _, factor in outer_loops: - assert isinstance(factor, int), "tiling options `*` and `all` should be replaced by now" - if factor == 1: + outer_loops = curr_node.inter_core_tiling + + for _, inter_core_split_factor in outer_loops: + assert isinstance( + inter_core_split_factor, int + ), "tiling options `*` and `all` should be replaced by now" + if inter_core_split_factor == 1: # In case CO decides to not split up the node across cores continue - inserted = 0 - for i, ids_in_stack in enumerate(order.copy()): - layer_id, node_id = ids_in_stack - if layer_id == node.id: - nb_nodes_this_layer = self.get_nb_nodes_for_layer(layer_id) - for scale in range(1, factor): - new_node_id = scale * nb_nodes_this_layer + node_id - order.insert(i + inserted + 1, (layer_id, new_node_id)) - inserted += 1 - - return order + i = 0 + while i < len(adjusted_order): + layer_id, sub_id = adjusted_order[i] + if layer_id == curr_node.id: + adjusted_order[i : i + 1] = [ + (layer_id, sub_id * inter_core_split_factor + j) for j in range(inter_core_split_factor) + ] + i += inter_core_split_factor + else: + i += 1 + + return adjusted_order def get_nb_nodes_for_layer(self, layer_id: int): return len(list(n for n in self.workload.node_list if n.id == layer_id)) diff --git a/stream/stages/generation/scheduling_order_generation.py b/stream/stages/generation/scheduling_order_generation.py index 1534ea04..959d8b3f 100644 --- a/stream/stages/generation/scheduling_order_generation.py +++ b/stream/stages/generation/scheduling_order_generation.py @@ -21,20 +21,19 @@ def __init__( super().__init__(list_of_callables, **kwargs) self.accelerator = accelerator self.workload = workload - self.layer_stacks = kwargs.get("layer_stacks", None) # optional - self.scheduling_order = None + self.layer_stacks: list[tuple[int, ...]] | None = kwargs.get("layer_stacks", None) # type: ignore # optional def run(self): if self.layer_stacks: # All nodes of earlier stacks should be scheduled before later stacks - self.scheduling_order = [] - for layer_stack in self.layer_stacks: - nodes = [n for n in self.workload.nodes() if n.id in layer_stack] - self.scheduling_order.extend(sorted(((n.id, n.sub_id) for n in nodes), reverse=True)) + self.scheduling_order: list[tuple[int, int]] = [] + for layer_stack in sorted(self.layer_stacks): + nodes_this_stack = [n for n in self.workload.node_list if n.id in layer_stack] + self.scheduling_order.extend(sorted(((n.id, n.sub_id) for n in nodes_this_stack), reverse=True)) 
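# [Editor's sketch — not part of the patch] The sub-id expansion documented in
# adjust_order_to_inter_core_tiling above can be reproduced standalone. The helper name is
# hypothetical; the arithmetic (sub_id * factor + j) mirrors the patch and its docstring example.
def _expand_inter_core(order: list[tuple[int, int]], layer_id: int, factor: int) -> list[tuple[int, int]]:
    expanded: list[tuple[int, int]] = []
    for lid, sub_id in order:
        if lid == layer_id:
            # one intra-core partition becomes `factor` consecutive inter-core sub-ids
            expanded.extend((lid, sub_id * factor + j) for j in range(factor))
        else:
            expanded.append((lid, sub_id))
    return expanded

# [(0, 12), (0, 13)] with an inter-core split of 4, as in the docstring example:
assert _expand_inter_core([(0, 12), (0, 13)], layer_id=0, factor=4) == [
    (0, 48), (0, 49), (0, 50), (0, 51),  # intra-core partition 12
    (0, 52), (0, 53), (0, 54), (0, 55),  # intra-core partition 13
]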
else: # Generate a list of node ids from highest priority to lowest # We give higher priority to nodes deeper in the graph - self.scheduling_order = sorted(((n.id, n.sub_id) for n in self.workload.nodes()), reverse=True) + self.scheduling_order = sorted(((n.id, n.sub_id) for n in self.workload.node_list), reverse=True) self.kwargs["accelerator"] = self.accelerator self.kwargs["workload"] = self.workload diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index 9ca2341f..f283a795 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -145,7 +145,7 @@ def match(self, tiles: list[ComputationNode], tiled_workload: ComputationNodeWor @staticmethod def get_scheduling_order(workload: ComputationNodeWorkload): - return sorted(((n.id, n.sub_id) for n in workload.node_list), reverse=True) + return sorted(((n.id, n.sub_id) for n in workload.node_list)) @staticmethod def get_all_node_pairs(G: ONNXWorkload) -> tuple[tuple[ComputationNode, ComputationNode, bool], ...]: From 2e8d92fd46e67769317b570da22e870e663618a2 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Mon, 30 Dec 2024 11:39:53 +0100 Subject: [PATCH 34/49] fix dumb dumb bug in group id manager --- stream/cost_model/group_allocation.py | 3 ++- .../generation/tiled_workload_generation.py | 25 ++++++++----------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/stream/cost_model/group_allocation.py b/stream/cost_model/group_allocation.py index ba53575f..aa3364ca 100644 --- a/stream/cost_model/group_allocation.py +++ b/stream/cost_model/group_allocation.py @@ -2,6 +2,7 @@ from typing import TypeAlias from zigzag.datatypes import LayerDim +from zigzag.workload.layer_attributes import LayerDimSizes from stream.utils import contains_wildcard from stream.workload.computation.computation_node import LoopRanges @@ -15,7 +16,7 @@ class GroupIdManager: def __init__( self, - layer_dim_sizes: dict[LayerDim, int], + layer_dim_sizes: LayerDimSizes, intra_core_tiling: TILING_T, inter_core_tiling: TILING_T, ): diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index f283a795..86bf3af9 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -227,27 +227,26 @@ def get_non_type_predecessors(self, node: Node, types: list[type]) -> list[Node] def get_tiles( original_node: ComputationNode, outer_temporal_loops: list[TemporalLoop] ) -> tuple[list[ComputationNode], list[ComputationNode]]: - original_node_id = original_node.id - # If the node's loop size is not divisible by the outer temporal loops, extend it - extended_node_dim_sizes: dict[LayerDim, int] = {} # Take away the outer_temporal_loops to create tiled CNs for this node tile_attrs = original_node.extract_node_attr() - tile_mapping = original_node.extract_inter_core_mapping_attr() - for loop in outer_temporal_loops: outer_dim, outer_size = loop.unpack() - # Check if this node's "dim" size is divisible by the outer-cn loop size node_dim_size: int = tile_attrs.layer_dim_sizes[outer_dim] q, rem = divmod(node_dim_size, outer_size) # returns x//y, x%y + # Make sure that the outer_dim is divisible by the outer_size if rem != 0: - # Make sure that the outer_dim is divisible by the outer_size # Pad the dimension to a multiple of outer_size node_dim_size = (q + 1) * outer_size q += 1 
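                # [Editor's note — illustrative] e.g. a layer dim of size 10 with an outer loop
                # of size 4: divmod(10, 4) == (2, 2), so the dim is padded to (2 + 1) * 4 = 12
                # and the per-tile size stored just below becomes q = 3.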
tile_attrs.layer_dim_sizes[outer_dim] = q - extended_node_dim_sizes[outer_dim] = node_dim_size + + # Reconstruct the total, padded layer_dim_sizes as padded (reduced) tile size * outer_sizes + extended_layer_dim_sizes = deepcopy(tile_attrs.layer_dim_sizes) + for loop in outer_temporal_loops: + outer_dim, outer_size = loop.unpack() + extended_layer_dim_sizes[outer_dim] *= outer_size # Loop dimension + size of the tiles (called span here) tile_span = tile_attrs.layer_dim_sizes @@ -275,7 +274,7 @@ def get_tiles( tiles: list[ComputationNode] = [] tensors: list[Tensor] = [] group_id_manager = GroupIdManager( - layer_dim_sizes=extended_node_dim_sizes, + layer_dim_sizes=extended_layer_dim_sizes, intra_core_tiling=original_node.intra_core_tiling, inter_core_tiling=original_node.inter_core_tiling, ) @@ -306,8 +305,6 @@ def get_tiles( group_id = group_id_manager.get_group_id(dim_min_max) - # Create the computation node object with the computed ranges of the loop dimensions - node_name = original_node.name # If all the output irrelevant loops are at a max, this is producing a final output, so set a flag original_node_output_ir_dims = original_node.loop_relevancy_info.get_ir_layer_dims( Constants.OUTPUT_LAYER_OP @@ -318,11 +315,11 @@ def get_tiles( ) tile = ComputationNode( - node_id=original_node_id, + node_id=original_node.id, sub_id=n, - node_name=node_name, + node_name=original_node.name, node_attr=tile_attrs, - mapping_attr=tile_mapping, + mapping_attr=original_node.extract_inter_core_mapping_attr(), op_type=original_node.type, produces_final_output=produces_final_output, group_id=group_id, From 2e1107b24855823d36731919857a046438dbd847 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Mon, 30 Dec 2024 14:09:42 +0100 Subject: [PATCH 35/49] ALL (i.s.o. any) nodes in a sink layer must have out_degree == 0 --- stream/cost_model/scheduler.py | 4 ++-- .../constraint_optimization_allocation.py | 6 ++---- .../stages/generation/tiled_workload_generation.py | 2 +- stream/workload/computation/computation_node.py | 3 +++ stream/workload/onnx_workload.py | 13 +++++++++++++ 5 files changed, 21 insertions(+), 7 deletions(-) diff --git a/stream/cost_model/scheduler.py b/stream/cost_model/scheduler.py index 12315ac8..8b5114df 100644 --- a/stream/cost_model/scheduler.py +++ b/stream/cost_model/scheduler.py @@ -296,8 +296,8 @@ def schedule_graph( candidates.append((cores_idle_from[core_allocation], source_node)) # type: ignore # Get all the nodes with no successors that produce final outputs, used for off-loading final outputs - sink_layers = sorted(set(n.id for n, d in G.out_degree() if d == 0)) - sink_layer_nodes = set((n for n in G.node_list if (n.id in sink_layers) and n.produces_final_output)) + sink_layer_ids = G.get_sink_layer_ids() + sink_layer_nodes = set((n for n in G.node_list if (n.id in sink_layer_ids) and n.produces_final_output)) # Get the offchip core id and core offchip_core_id = accelerator.offchip_core_id diff --git a/stream/stages/allocation/constraint_optimization_allocation.py b/stream/stages/allocation/constraint_optimization_allocation.py index 68dad683..aebb61eb 100644 --- a/stream/stages/allocation/constraint_optimization_allocation.py +++ b/stream/stages/allocation/constraint_optimization_allocation.py @@ -6,7 +6,6 @@ import networkx as nx import numpy as np -from networkx import DiGraph from zigzag.utils import pickle_deepcopy, pickle_load, pickle_save from stream.cost_model.cost_model import StreamCostModelEvaluation @@ -125,12 +124,11 @@ def extract_steady_state_per_stack(self): 
logger.warning(f"Stack {i} is empty.") continue - # TODO initialize the subgraph of type DiGraphWrapper[ComputationNode] - sg: DiGraph = self.workload.subgraph(nodes) + sg = self.workload.get_subgraph(nodes) sink_nodes: list[ComputationNode] = sorted( n for n in sg.nodes() if len(get_real_successors(n, sg)) == 0 # type: ignore ) - sink_layer_ids = sorted(set(n.id for n in sink_nodes)) + sink_layer_ids = sg.get_sink_layer_ids() sink_layer_nodes = [tuple(sorted(n for n in sink_nodes if n.id == layer_id)) for layer_id in sink_layer_ids] interlaced = [tuple(filter(lambda x: x is not None, t)) for t in itertools.zip_longest(*sink_layer_nodes)] computed: set[ComputationNode] = set() diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index 86bf3af9..abaeba64 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -242,7 +242,7 @@ def get_tiles( tile_attrs.layer_dim_sizes[outer_dim] = q - # Reconstruct the total, padded layer_dim_sizes as padded (reduced) tile size * outer_sizes + # Reconstruct the total, padded layer_dim_sizes as padded tile size * outer_sizes extended_layer_dim_sizes = deepcopy(tile_attrs.layer_dim_sizes) for loop in outer_temporal_loops: outer_dim, outer_size = loop.unpack() diff --git a/stream/workload/computation/computation_node.py b/stream/workload/computation/computation_node.py index 2cdc51af..427fbe44 100644 --- a/stream/workload/computation/computation_node.py +++ b/stream/workload/computation/computation_node.py @@ -274,3 +274,6 @@ def nb_real_predecessors(self): def nb_real_predecessors(self, nb_real_predecessors: int | None): self.__nb_real_predecessors = nb_real_predecessors self._static_hash_value = self.__compute_static_hash() + + def __repr__(self): + return f"{self.name} ({self.id},{self.sub_id})" diff --git a/stream/workload/onnx_workload.py b/stream/workload/onnx_workload.py index 5526d72d..24b70601 100644 --- a/stream/workload/onnx_workload.py +++ b/stream/workload/onnx_workload.py @@ -31,3 +31,16 @@ def add(self, node_id: int, node_obj: Node): class ComputationNodeWorkload(DiGraphWrapper[ComputationNode]): """Workload graph with only ComputationNodes""" + + def get_sink_layer_ids(self): + """Return the ids of layers where ALL sub-nodes have out-degree 0""" + out_degrees = self.out_degree() + layer_ids = set(n.id for n, _ in out_degrees) + # x: (node, out_degree) + sink_layer_ids = [ + all(filter(lambda x: (x[0].id == curr_id and x[1] == 0), out_degrees)) for curr_id in layer_ids + ] + return sink_layer_ids + + def get_subgraph(self, nodes: list[ComputationNode]) -> "ComputationNodeWorkload": + return self.subgraph(nodes) # type: ignore From a9701c667d9ff7a4d5aa27fcb04e24742c1332c5 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Thu, 2 Jan 2025 09:01:58 +0100 Subject: [PATCH 36/49] outer_tmap_loop_dimensions order must match scheduling order --- .../constraint_optimization_allocation.py | 3 +- .../generation/scheduling_order_generation.py | 1 + .../generation/tiled_workload_generation.py | 144 ++++++++++++++---- stream/workload/onnx_workload.py | 12 +- 4 files changed, 125 insertions(+), 35 deletions(-) diff --git a/stream/stages/allocation/constraint_optimization_allocation.py b/stream/stages/allocation/constraint_optimization_allocation.py index aebb61eb..215d8b6f 100644 --- a/stream/stages/allocation/constraint_optimization_allocation.py +++ b/stream/stages/allocation/constraint_optimization_allocation.py @@ 
-128,7 +128,7 @@ def extract_steady_state_per_stack(self): sink_nodes: list[ComputationNode] = sorted( n for n in sg.nodes() if len(get_real_successors(n, sg)) == 0 # type: ignore ) - sink_layer_ids = sg.get_sink_layer_ids() + sink_layer_ids = set(n.id for n in sink_nodes) sink_layer_nodes = [tuple(sorted(n for n in sink_nodes if n.id == layer_id)) for layer_id in sink_layer_ids] interlaced = [tuple(filter(lambda x: x is not None, t)) for t in itertools.zip_longest(*sink_layer_nodes)] computed: set[ComputationNode] = set() @@ -413,6 +413,7 @@ def schedule_allocation(self, allocation: ALLOCATION_T) -> StreamCostModelEvalua kwargs["accelerator"] = self.accelerator kwargs["workload"] = unpartitioned_sub_workload kwargs["scheduling_order"] = scheduling_order + kwargs["layer_stacks"] = self.layer_stacks kwargs["tiled_workload_path"] = self.tiled_workload_post_co_path kwargs["cost_lut_path"] = self.cost_lut_post_co_path kwargs["latency_attr"] = self.latency_attr diff --git a/stream/stages/generation/scheduling_order_generation.py b/stream/stages/generation/scheduling_order_generation.py index 959d8b3f..f83555ee 100644 --- a/stream/stages/generation/scheduling_order_generation.py +++ b/stream/stages/generation/scheduling_order_generation.py @@ -38,6 +38,7 @@ def run(self): self.kwargs["accelerator"] = self.accelerator self.kwargs["workload"] = self.workload self.kwargs["scheduling_order"] = self.scheduling_order + # self.kwargs["layer_stacks"] = self.layer_stacks # TODO is already in kwargs sub_stage = self.list_of_callables[0]( self.list_of_callables[1:], **self.kwargs, diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index abaeba64..f9a4b873 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -1,3 +1,4 @@ +from collections import defaultdict import logging import os from copy import deepcopy @@ -7,6 +8,7 @@ from rtree import index from zigzag.datatypes import Constants, LayerDim, LayerOperand from zigzag.utils import pickle_deepcopy, pickle_load, pickle_save +from zigzag.workload.layer_attributes import LayerDimSizes from stream.cost_model.group_allocation import GroupIdManager from stream.hardware.architecture.accelerator import Accelerator @@ -55,26 +57,26 @@ def __init__( super().__init__(list_of_callables, **kwargs) self.workload = workload self.accelerator = accelerator + self.layer_stacks = kwargs.get("layer_stacks", []) # Save for each of the workload's nodes the tiles that will be generated self.tiles_dict: dict[ComputationNode, list[ComputationNode]] = {} - # Memoize the numpy tensors for dependency generation self.numpy_tensors = {} - self.tiled_workload_path = tiled_workload_path def run(self): all_unique_tiles: list[ComputationNode] = [] # For each node get all the tiles and the edges between them - all_tiles = [] - all_edges = [] + all_tiles: list[ComputationNode] = [] + all_edges: list[tuple[ComputationNode, ComputationNode, dict[str, int]]] = [] for node in self.workload.topological_sort(): # If other node types shouldn't be included in tiled workload graph, add here if not isinstance(node, ComputationNode): continue outer_temporal_loops = self.get_outer_tmap_loop_dimensions(node) - tiles, unique_tiles = self.get_tiles(node, outer_temporal_loops) + mandatory_divisors = self.get_mandatory_divisors(node) + tiles, unique_tiles = self.get_tiles(node, outer_temporal_loops, mandatory_divisors) logger.info(f"{node}: Outer loops 
{outer_temporal_loops}.") logger.info(f"{node}: Generated {len(tiles)} tile(s).") self.tiles_dict[node] = tiles @@ -180,6 +182,11 @@ def get_all_node_pairs(G: ONNXWorkload) -> tuple[tuple[ComputationNode, Computat def get_outer_tmap_loop_dimensions(self, node: ComputationNode) -> list[TemporalLoop]: """Get the temporal loops that are outside a CN for this node. + NOTE the order of this list matters! The order in which sub-tiles are generated should match the scheduling + order. First generate all tiles within the same intra-core split (by splitting inter-core). + i.e. tiles with sub-id 0, 1, ..., (nb_inter_tiles - 1) should have the same intra-core split and allocated + to different cores + Args: node: node for which to return outer-cn loops @@ -190,9 +197,8 @@ def get_outer_tmap_loop_dimensions(self, node: ComputationNode) -> list[Temporal # inter core tiling is not set by CO yet tiling_to_split = node.intra_core_tiling else: - # inter core tiling is ok, also split into these tiles - tiling_to_split = node.intra_core_tiling + node.inter_core_tiling - + # inter core tiling is ok, also split into these tiles. NOTE: this list is ordered + tiling_to_split = node.inter_core_tiling + node.intra_core_tiling outer_loops = convert_outer_cn_loops(tiling_to_split, node) # In case no valid intra core tiling is found: add an arbitrary tiling of size 1 @@ -223,34 +229,109 @@ def get_non_type_predecessors(self, node: Node, types: list[type]) -> list[Node] preds += skip_node_preds return preds - @staticmethod + def get_mandatory_divisors(self, node: ComputationNode) -> dict[LayerDim, set[int]]: + """Get the factors by which the smaller tiles' dimensions must be divisible. + Tile dimensions must be divisible by all the inter-core tiling factors of the nodes within the same layer stack. + This ensures dependencies between tiles within the stack do not cross the layer stack boundaries. + # TODO can nodes within the same stack have different intra-core tiling? This is not accounted for + """ + # # These divisors accumulate: e.g. if a dim must be divisible by 2 and 4, it must be divisible by 8 + # divisors_multiplicative: dict[LayerDim, int] = defaultdict(lambda: 1) + + divisors: dict[LayerDim, set[int]] = defaultdict(lambda: set()) + + # # Must be divisible by inter- and intra-core tiling factors (multiplicative) + # for dim, factor in node.intra_core_tiling + node.inter_core_tiling: + # if isinstance(factor, int): + # divisors_multiplicative[dim] *= factor + + # # Multiplied divisors become one lcm divisor + # for dim, factor in divisors_multiplicative.items(): + # divisors_lcm[dim].add(factor) + + # Must be divisible by inter-core tiling factors of all nodes in the same layer stack (least common multiple) + # Find nodes in stack + try: + curr_stack = next(stack for stack in self.layer_stacks if node.id in stack) + except StopIteration: + # No stack found + return divisors + if len(curr_stack) == 1: + return divisors + other_nodes_in_stack = [ + n + for n in self.workload.node_list + if n.id in curr_stack and n.id != node.id and isinstance(n, ComputationNode) + ] + + for curr_node in other_nodes_in_stack: + assert len(curr_node.inter_core_tiling) == len( + set(dim for dim, _ in curr_node.inter_core_tiling) + ), "Inter-core tiling contains duplicate dimensions. 
The divisors for this node must be multiplied" + + for layer_dim, factor in curr_node.inter_core_tiling: + if isinstance(factor, int): + divisors[layer_dim].add(factor) + return divisors + def get_tiles( - original_node: ComputationNode, outer_temporal_loops: list[TemporalLoop] + self, + original_node: ComputationNode, + outer_temporal_loops: list[TemporalLoop], + mandatory_divisors: dict[LayerDim, set[int]] = {}, ) -> tuple[list[ComputationNode], list[ComputationNode]]: - # Take away the outer_temporal_loops to create tiled CNs for this node + def get_total_outer_size(dim: LayerDim): + return prod([loop.size for loop in outer_temporal_loops if loop.dimension == dim]) + + def get_lcm(n: int, divisors: set[int]) -> int: + """Make n divisible by all the divisors in the set.""" + for divisor in divisors: + if n % divisor != 0: + n = ceil(n / divisor) * divisor + return n + + def pad_until_divisible(layer_dim: LayerDim, n: int) -> int: + """Return x >= n such that x is divisible by `total_outer_size`, and `x // total_outer_size` divisible by + all mandatory divisors (coming from the inter-core tiling of other nodes within the same stack)""" + total_outer_size = get_total_outer_size(layer_dim) + inner_size = ceil(n / total_outer_size) + inner_size_padded = get_lcm(inner_size, mandatory_divisors[layer_dim]) + x = inner_size_padded * total_outer_size + return x + + # Pad the layer_dim_sizes to be divisible by the mandatory divisors (coming from the outer_temporal_loops) tile_attrs = original_node.extract_node_attr() + for dim, size in tile_attrs.layer_dim_sizes.items(): + new_size = pad_until_divisible(dim, size) + if size != new_size: + tile_attrs.layer_dim_sizes[dim] = new_size + logger.warning(f"Padded layer dimension {dim}: {size} -> {new_size} to be divisible by tiling factors") + + # Save these extended sizes for later + extended_layer_dim_sizes = deepcopy(tile_attrs.layer_dim_sizes) + + # Take away the outer_temporal_loops to create tiled CNs for this node for loop in outer_temporal_loops: outer_dim, outer_size = loop.unpack() node_dim_size: int = tile_attrs.layer_dim_sizes[outer_dim] q, rem = divmod(node_dim_size, outer_size) # returns x//y, x%y - # Make sure that the outer_dim is divisible by the outer_size - if rem != 0: - # Pad the dimension to a multiple of outer_size - node_dim_size = (q + 1) * outer_size - q += 1 - + assert rem == 0, "Should be guaranteed through mandatory divisors" + # # Make sure that the outer_dim is divisible by the outer_size + # if rem != 0: + # # Pad the dimension to a multiple of outer_size + # node_dim_size = (q + 1) * outer_size + # q += 1 tile_attrs.layer_dim_sizes[outer_dim] = q - # Reconstruct the total, padded layer_dim_sizes as padded tile size * outer_sizes - extended_layer_dim_sizes = deepcopy(tile_attrs.layer_dim_sizes) - for loop in outer_temporal_loops: - outer_dim, outer_size = loop.unpack() - extended_layer_dim_sizes[outer_dim] *= outer_size + # # Reconstruct the total, padded layer_dim_sizes as padded tile size * outer_sizes + # extended_layer_dim_sizes = deepcopy(tile_attrs.layer_dim_sizes) + # for loop in outer_temporal_loops: + # outer_dim, outer_size = loop.unpack() + # extended_layer_dim_sizes[outer_dim] *= outer_size # Loop dimension + size of the tiles (called span here) tile_span = tile_attrs.layer_dim_sizes - loop_dims = original_node.layer_dims stop_values = [temporal_loop.size for temporal_loop in outer_temporal_loops] nb_cns = int(prod(stop_values)) @@ -281,18 +362,16 @@ def get_tiles( for n in range(nb_cns): outer_loop_values: 
list[int] = [] for i, outer_loop in enumerate(outer_temporal_loops): - loop_dim = outer_loop.dimension stop_value = outer_loop.size m = prod(stop_values[:i]) - outer_loop_values.append(int((n // m) % stop_value)) + outer_loop_values.append((n // m) % stop_value) + dim_min_max: LoopRanges = {} - for loop_dim in loop_dims: - # find all outer-cn loops that iterate over this loop_dim - # and multiply their loop values by their mult_factor + for loop_dim in original_node.layer_dims: + # multiply all outer-cn loop values that iterate over this loop_dim by their mult_factor dim_min = 0 for i, outer_loop in enumerate(outer_temporal_loops): - dim = outer_loop.dimension - stop_value = outer_loop.size + dim, stop_value = outer_loop.unpack() if dim == loop_dim: # current loop value of this outer-cn loop loop_val = outer_loop_values[i] @@ -372,6 +451,9 @@ def get_tiles( @staticmethod def get_intra_edges(nodes: list[ComputationNode]): + """ + # TODO Why do we need this? + """ # Get all the group ids group_ids = sorted(set([n.group for n in nodes])) intra_edges: list[tuple[ComputationNode, ComputationNode, dict[str, int]]] = [] @@ -873,7 +955,7 @@ def get_layer_split_factors_k(self): split_factors[node] = split_factor return split_factors - def load_cached_tiled_workload(self): + def load_cached_tiled_workload(self) -> ComputationNodeWorkload | None: if os.path.exists(self.tiled_workload_path): return pickle_load(self.tiled_workload_path) return None diff --git a/stream/workload/onnx_workload.py b/stream/workload/onnx_workload.py index 24b70601..d0b4bbd5 100644 --- a/stream/workload/onnx_workload.py +++ b/stream/workload/onnx_workload.py @@ -33,12 +33,18 @@ class ComputationNodeWorkload(DiGraphWrapper[ComputationNode]): """Workload graph with only ComputationNodes""" def get_sink_layer_ids(self): - """Return the ids of layers where ALL sub-nodes have out-degree 0""" + """Return the ids of layers where ALL sub-nodes have out-degree 0 + # TODO this might nog work yet! When there is intra-core tiling, edges between nodes in the same layer + # TODO (with bits==0) are added, meaning some nodes have an out-degree > 0 + # TODO -> use get_real_nb_predecessors instead? or remove the empty intra-core edges? + """ out_degrees = self.out_degree() layer_ids = set(n.id for n, _ in out_degrees) - # x: (node, out_degree) sink_layer_ids = [ - all(filter(lambda x: (x[0].id == curr_id and x[1] == 0), out_degrees)) for curr_id in layer_ids + curr_id + for curr_id in layer_ids + # x: (node, out_degree). 
Filter by id -> map to out_degree == 0 -> check if all are 0 + if all(map(lambda x: x[1] == 0, filter(lambda x: x[0].id == curr_id, out_degrees))) ] return sink_layer_ids From c3072fed631b20b5c13eaf80ee67b28960eccc5d Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Thu, 2 Jan 2025 09:04:08 +0100 Subject: [PATCH 37/49] remove commented code --- .../generation/tiled_workload_generation.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index f9a4b873..481b63c6 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -1,6 +1,6 @@ -from collections import defaultdict import logging import os +from collections import defaultdict from copy import deepcopy from math import ceil, prod from typing import Any @@ -8,7 +8,6 @@ from rtree import index from zigzag.datatypes import Constants, LayerDim, LayerOperand from zigzag.utils import pickle_deepcopy, pickle_load, pickle_save -from zigzag.workload.layer_attributes import LayerDimSizes from stream.cost_model.group_allocation import GroupIdManager from stream.hardware.architecture.accelerator import Accelerator @@ -103,13 +102,10 @@ def run(self): inter_edges = self.get_inter_edges_rtree(producer, consumer, producer_tiles, consumer_tiles) all_edges += inter_edges - # Set the base_priority value of all nodes + # Set the base_priority and number of real predecessors of all nodes self.set_base_priority_of_nodes(all_tiles, all_edges) - - # Set the number of real predecessors of all nodes self.set_nb_real_predecessors(all_tiles, all_edges) - # Construct the new tiled workload graph # The graph construction needs to happen after the base priority and nb_real_predecessors are set tiled_workload = ComputationNodeWorkload() tiled_workload.add_edges_from(all_edges) @@ -317,19 +313,8 @@ def pad_until_divisible(layer_dim: LayerDim, n: int) -> int: node_dim_size: int = tile_attrs.layer_dim_sizes[outer_dim] q, rem = divmod(node_dim_size, outer_size) # returns x//y, x%y assert rem == 0, "Should be guaranteed through mandatory divisors" - # # Make sure that the outer_dim is divisible by the outer_size - # if rem != 0: - # # Pad the dimension to a multiple of outer_size - # node_dim_size = (q + 1) * outer_size - # q += 1 tile_attrs.layer_dim_sizes[outer_dim] = q - # # Reconstruct the total, padded layer_dim_sizes as padded tile size * outer_sizes - # extended_layer_dim_sizes = deepcopy(tile_attrs.layer_dim_sizes) - # for loop in outer_temporal_loops: - # outer_dim, outer_size = loop.unpack() - # extended_layer_dim_sizes[outer_dim] *= outer_size - # Loop dimension + size of the tiles (called span here) tile_span = tile_attrs.layer_dim_sizes stop_values = [temporal_loop.size for temporal_loop in outer_temporal_loops] From c83dd3a47111932c01a88b306bd19e791203b2a3 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Thu, 2 Jan 2025 16:13:52 +0100 Subject: [PATCH 38/49] bw fraction used for transfer is (nb_inter_cores_split)^-1 --- stream/cost_model/communication_manager.py | 1 + stream/cost_model/scheduler.py | 36 +++++++--- stream/hardware/architecture/accelerator.py | 66 ++++++++++--------- .../workload/computation/computation_node.py | 8 +++ 4 files changed, 72 insertions(+), 39 deletions(-) diff --git a/stream/cost_model/communication_manager.py b/stream/cost_model/communication_manager.py index 20165664..26a5f8bd 100644 --- 
a/stream/cost_model/communication_manager.py +++ b/stream/cost_model/communication_manager.py @@ -158,6 +158,7 @@ def update_links( Returns: tuple: A tuple containing the link and memory energy costs associated with this transfer. """ + assert 0 <= link_bw_fraction <= 1 end_timestep = start_timestep + duration if isinstance(sender, int): sender = self.accelerator.get_core(sender) diff --git a/stream/cost_model/scheduler.py b/stream/cost_model/scheduler.py index 8b5114df..290907bc 100644 --- a/stream/cost_model/scheduler.py +++ b/stream/cost_model/scheduler.py @@ -143,6 +143,7 @@ def clear_memories( memory_operands: list[MemoryOperand], timestep: int, exceptions: list[Tensor] = [], + transfer_bandwidth_fraction: float = 1, ): total_eviction_to_offchip_link_energy = 0 total_eviction_to_offchip_memory_energy = 0 @@ -151,7 +152,14 @@ def clear_memories( timestep, eviction_link_energy_cost, eviction_memory_energy_cost, - ) = accelerator.remove_all(core, too_large_operand, timestep, exceptions, write_back_to_offchip=True) + ) = accelerator.remove_all( + core=core, + memory_operand=too_large_operand, + timestep=timestep, + exceptions=exceptions, + transfer_bandwidth_fraction=transfer_bandwidth_fraction, + write_back_to_offchip=True, + ) total_eviction_to_offchip_link_energy += eviction_link_energy_cost total_eviction_to_offchip_memory_energy += eviction_memory_energy_cost return ( @@ -180,6 +188,7 @@ def check_for_removal( accelerator: "Accelerator", G: ComputationNodeWorkload, timestep: int, + transfer_bandwidth_fraction: float = 1, ): offchip_core_id = accelerator.offchip_core_id for tensor_used_by_node in tensors: @@ -217,6 +226,7 @@ def check_for_removal( core, tensor_used_by_node.memory_operand, timestep_for_removal, + transfer_bandwidth_fraction=transfer_bandwidth_fraction, ) @@ -332,6 +342,9 @@ def schedule_graph( assert core_id is not None core = accelerator.get_core(core_id) + # Fraction of the off-chip bandwidth to be used for the tensor transfers related to this node + transfer_bandwidth_fraction = 1 / best_candidate.get_total_inter_core_splits() + # Earliest start time is when core is available or predecessors finished core_idle_from = cores_idle_from[core_id] start = max(core_idle_from, preds_end) @@ -354,6 +367,7 @@ def schedule_graph( memory_operands=best_candidate.too_large_operands, timestep=timestep, exceptions=tensors_this_candidate_needs, + transfer_bandwidth_fraction=transfer_bandwidth_fraction, ) total_eviction_to_offchip_link_energy += clear_link_energy total_eviction_to_offchip_memory_energy += clear_memory_energy @@ -378,6 +392,7 @@ def schedule_graph( tensor_operand, tensors_this_candidate_needs, earliest_t=core_idle_from, + transfer_bandwidth_fraction=transfer_bandwidth_fraction, ) # Update the possible start time of this node timestep = max(timestep, transfer_complete_timestep) @@ -409,6 +424,7 @@ def schedule_graph( output_memory_operand, timestep, tensors_this_candidate_needs, + transfer_bandwidth_fraction=transfer_bandwidth_fraction, ) total_eviction_to_offchip_link_energy += eviction_link_energy_cost total_eviction_to_offchip_memory_energy += eviction_memory_energy_cost @@ -455,10 +471,11 @@ def schedule_graph( decrease_priority(tensors_this_candidate_needs, tensors_operands, accelerator, best_candidate) # Remove the tensor if the priority is zero check_for_removal( - tensors_this_candidate_needs, - accelerator, - G, - end, + tensors=tensors_this_candidate_needs, + accelerator=accelerator, + G=G, + timestep=end, + 
transfer_bandwidth_fraction=transfer_bandwidth_fraction, ) # Step 7 @@ -473,10 +490,11 @@ def schedule_graph( link_energy_cost, memory_energy_cost, ) = accelerator.remove( - output_tensor, - core, - output_tensor.memory_operand, - end, + tensor=output_tensor, + core=core, + memory_op=output_tensor.memory_operand, + timestep=end, + transfer_bandwidth_fraction=transfer_bandwidth_fraction, write_back_to_offchip=True, ) total_sink_layer_output_offchip_link_energy += link_energy_cost diff --git a/stream/hardware/architecture/accelerator.py b/stream/hardware/architecture/accelerator.py index 4f0ab3b1..681e960e 100644 --- a/stream/hardware/architecture/accelerator.py +++ b/stream/hardware/architecture/accelerator.py @@ -84,7 +84,13 @@ def spawn( self.memory_manager.add_tensor_to_core(tensor, core, initial_timestep, available_timestep, memory_op) def remove( - self, tensor: Tensor, core: Core, memory_op: MemoryOperand, timestep: int, write_back_to_offchip: bool = False + self, + tensor: Tensor, + core: Core, + memory_op: MemoryOperand, + timestep: int, + transfer_bandwidth_fraction: float = 1, + write_back_to_offchip: bool = False, ): """Remove tensor from core. If required, transfer to offchip before removal. @@ -93,6 +99,7 @@ def remove( core (Core): The Core to remove the tensor from. memory_op: The memory operand of the tensor. timestep: The timestep to remove the tensor at. + transfer_bandwidth_fraction: Fraction of the bandwidth to use for the transfer. write_back_to_offchip (bool, optional): Write the tensor to offchip before removal. Defaults to False. """ assert self.offchip_core_id is not None @@ -118,6 +125,7 @@ def remove( memory_op, non_evictable_tensors=[], sending_core_id=core.id, + transfer_bandwidth_fraction=transfer_bandwidth_fraction, ) # There should be no evictions as we are writing to offchip assert eviction_link_energy_cost == 0 @@ -144,6 +152,7 @@ def remove_all( memory_operand: MemoryOperand, timestep: int, exceptions: list[Tensor] = [], + transfer_bandwidth_fraction: float = 1, write_back_to_offchip: bool = False, ): """Remove all tensors from a core's memory with the given memory operand. @@ -154,6 +163,7 @@ def remove_all( memory_operand: The memory operand for which all tensors should be evicted. timestep: The timestep to remove the tensor at. exceptions: A list of tensors that should not be evicted. + transfer_bandwidth_fraction: Fraction of the bandwidth to use for the transfers. write_back_to_offchip (bool, optional): Write the tensor to offchip before removal. Defaults to False. """ total_link_energy_cost = 0 @@ -164,7 +174,12 @@ def remove_all( for tensor in self.memory_manager.get_tensors_stored_at_timestep(top_instance, timestep): if tensor not in exceptions: t, link_energy_cost, memory_energy_cost = self.remove( - tensor, core, memory_operand, t, write_back_to_offchip + tensor=tensor, + core=core, + memory_op=memory_operand, + timestep=t, + transfer_bandwidth_fraction=transfer_bandwidth_fraction, + write_back_to_offchip=write_back_to_offchip, ) total_link_energy_cost += link_energy_cost total_memory_energy_cost += memory_energy_cost @@ -177,6 +192,7 @@ def make_space_for( memory_op: MemoryOperand, timestep: int, tensors_to_avoid_evicting: list[Tensor] = [], + transfer_bandwidth_fraction: float = 1, ): """Make space for the given tensor on the given core by evicting already stored tensors if necessary. @@ -185,6 +201,7 @@ def make_space_for( core (Core): The core where the tensor will be stored. memory_operand: The memory operand on the core. 
timestep: The timestep at which to make space for. + transfer_bandwidth_fraction: Fraction of the bandwidth to use for the transfer. """ total_eviction_link_energy_cost = 0 total_eviction_memory_energy_cost = 0 @@ -219,6 +236,7 @@ def make_space_for( memory_op, timestep, write_back_to_offchip=True, + transfer_bandwidth_fraction=transfer_bandwidth_fraction, ) t_evictions_complete = max(t_evictions_complete, t_eviction_complete) total_eviction_link_energy_cost += eviction_link_energy_cost @@ -238,6 +256,7 @@ def transfer_tensor_to_core( non_evictable_tensors: list[Tensor], sending_core_id: int | None = None, earliest_t: int = 0, + transfer_bandwidth_fraction: float = 1, ) -> tuple[int, float, float, float, float, bool]: """ Transfer a tensor to a given core id. @@ -261,6 +280,7 @@ def transfer_tensor_to_core( non_evictable_tensors: the stored tensor that cannot be evicted sending_core_id: The id of the core that should transfer the tensor. earliest_t: Earliest timestep at which transfer can happen + transfer_bandwidth_fraction: Fraction of the bandwidth to use for the transfer """ def find_transfer_start_and_end_time(links_bw: dict[CommunicationLink, int]): @@ -282,30 +302,19 @@ def find_transfer_start_and_end_time(links_bw: dict[CommunicationLink, int]): transfer_end = transfer_start + transfer_duration return transfer_start, transfer_end - def find_earliest_time_for_transfer( - links: list[CommunicationLink], nb_iterations: int = 1, default_fraction: float = 1 - ): - """Find the earliest time at which a tensor transfer between 2 cores can happen. Iterate over the used - bandwidth to find the transfer bandwidth at which the finish time is earliest""" + def find_earliest_time_for_transfer(links: list[CommunicationLink], bandwidth_fraction: float = 1): + """Find the earliest time at which a tensor transfer between 2 cores can happen.""" + assert 0 < bandwidth_fraction <= 1 windows: list[tuple[int, int]] = [] - # Either use the default fraction, or linearly space the fractions to try out - if nb_iterations == 1: - bandwidth_fractions = [default_fraction] - else: - # Iterate over linearly spaced fractions of the bandwidth - bandwidth_fractions = [i / nb_iterations for i in range(1, nb_iterations + 1)] - - for frac in bandwidth_fractions: - links_with_bw = {link: ceil(frac * link.bandwidth) for link in links} - start, end = find_transfer_start_and_end_time(links_with_bw) - windows.append((start, end)) + links_with_bw = {link: ceil(bandwidth_fraction * link.bandwidth) for link in links} + start, end = find_transfer_start_and_end_time(links_with_bw) + windows.append((start, end)) ends = [end for _, end in windows] best_idx = ends.index(min(ends)) best_window = windows[best_idx] - best_fraction = bandwidth_fractions[best_idx] - return best_window, best_fraction + return best_window ################################# STEP 0 ################################# # Check if the tensor is already on the receiving core @@ -359,22 +368,18 @@ def find_earliest_time_for_transfer( tensor_operand, enough_space_timestep, non_evictable_tensors, + transfer_bandwidth_fraction=transfer_bandwidth_fraction, ) ################################# STEP 4 ################################# # The links between sender and receiver have a long enough idle window. - # TODO If the storing_instance is a shared instance across more than one core, - # TODO there will be multiple possible cores to transfer between. 
- # TODO For now, we take the first one + # TODO If the storing_instance is a shared instance across more than one core, there will be multiple possible + # TODO cores to transfer between. For now, we take the first one sender_cores = self.memory_manager.cores_per_top_instance[storing_instance] sender_core = sender_cores[0] links = self.communication_manager.get_links_for_pair(sender_core, receiving_core) - # ! By default, transfers only take a fraction of the total bandwidth - default_bandwidth_fraction = 1 / self.nb_shared_mem_groups - (transfer_start, transfer_end), link_bw_fraction = find_earliest_time_for_transfer( - links, - nb_iterations=1, - default_fraction=default_bandwidth_fraction, + transfer_start, transfer_end = find_earliest_time_for_transfer( + links, bandwidth_fraction=transfer_bandwidth_fraction ) transfer_duration = transfer_end - transfer_start @@ -394,7 +399,7 @@ def find_earliest_time_for_transfer( tensor_operand, transfer_start, transfer_duration, - link_bw_fraction=link_bw_fraction, + link_bw_fraction=transfer_bandwidth_fraction, ) ################################# STEP 7 ################################# @@ -412,6 +417,7 @@ def find_earliest_time_for_transfer( sender_core, tensor.memory_operand, transfer_end, + transfer_bandwidth_fraction=transfer_bandwidth_fraction, write_back_to_offchip=False, ) diff --git a/stream/workload/computation/computation_node.py b/stream/workload/computation/computation_node.py index 427fbe44..fe2dc2fc 100644 --- a/stream/workload/computation/computation_node.py +++ b/stream/workload/computation/computation_node.py @@ -11,6 +11,7 @@ from zigzag.workload.layer_node import LayerNode, LayerNodeAttributes from stream.node_tensor import NodeTensor +from stream.utils import contains_wildcard from stream.workload.mapping import INTRA_CORE_MAPPING_DEFAULT, InterCoreMappingAttributes from stream.workload.node import Node from stream.workload.tensor import Tensor @@ -132,6 +133,13 @@ def get_operand_tensor_reshape_default(self) -> OperandTensorReshape | None: except KeyError: return None + def get_total_inter_core_splits(self) -> int: + """Return the total number of inter-core splits for this node, i.e. 
over how many cores this node is split""" + if contains_wildcard(self.inter_core_tiling): + return 1 + assert all(isinstance(factor, int) for _, factor in self.inter_core_tiling) + return prod([factor for _, factor in self.inter_core_tiling]) + @property def short_name(self) -> str: return shorten_onnx_layer_name(self.name) From 22622d949e51fa1778b46eb50c0047d39c7e6427 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Fri, 3 Jan 2025 10:11:12 +0100 Subject: [PATCH 39/49] ZigZag minimal latency -> minimal EDP --- stream/cost_model/communication_manager.py | 2 +- stream/cost_model/group_allocation.py | 2 +- stream/cost_model/scheduler.py | 4 +- .../constraint_optimization/allocation.py | 12 +- .../constraint_optimization/utils.py | 8 +- stream/opt/partitioning/utils.py | 2 +- .../genetic_algorithm_allocation.py | 4 +- .../zigzag_core_mapping_estimation.py | 117 +++++++++--------- .../generation/tiled_workload_generation.py | 2 +- .../set_fixed_allocation_performance.py | 4 +- stream/utils.py | 4 +- stream/workload/node.py | 6 +- stream/workload/tensor.py | 2 +- 13 files changed, 84 insertions(+), 85 deletions(-) diff --git a/stream/cost_model/communication_manager.py b/stream/cost_model/communication_manager.py index 26a5f8bd..621dc451 100644 --- a/stream/cost_model/communication_manager.py +++ b/stream/cost_model/communication_manager.py @@ -301,7 +301,7 @@ def get_links_idle_window( The timestep must be greater than or equal to best_case_start. Args: - links (dict): CommunicationLinks involved in the transfer and their required bandwidth. + links: CommunicationLinks involved in the transfer and their required bandwidth. best_case_start: The best case start timestep of the transfer. duration: The required duration of the idle window. tensors: The tensors to be transferred. Used to broadcast from previous transfer. diff --git a/stream/cost_model/group_allocation.py b/stream/cost_model/group_allocation.py index aa3364ca..bad7cc00 100644 --- a/stream/cost_model/group_allocation.py +++ b/stream/cost_model/group_allocation.py @@ -72,7 +72,7 @@ def get_group_id(self, tile_loop_ranges: LoopRanges) -> int: Args: node (ComputationNode): The original (layer) CN. - loop_ranges (dict): A dictionary containing the loop range for each dimension + loop_ranges: A dictionary containing the loop range for each dimension Returns: int: The group id for the given loop ranges diff --git a/stream/cost_model/scheduler.py b/stream/cost_model/scheduler.py index 290907bc..e82f25ea 100644 --- a/stream/cost_model/scheduler.py +++ b/stream/cost_model/scheduler.py @@ -237,8 +237,8 @@ def sync_cores_idle_from( scheduling_order: list[tuple[int, int]], ): """ - Sync the cores_idle_from dict values if the best candidate is the first node of a layer and we detect layer-by-layer execution. - The layer-by-layer execution is detected through the scheduling_order. + Sync the cores_idle_from dict values if the best candidate is the first node of a layer and we detect layer-by-layer + execution. The layer-by-layer execution is detected through the scheduling_order. 
""" # Get the predecessor ids of the best_candidate from the workload graph G predecessor_ids = [pred.id for pred in G.predecessors(best_candidate) if pred.id != best_candidate.id] diff --git a/stream/opt/allocation/constraint_optimization/allocation.py b/stream/opt/allocation/constraint_optimization/allocation.py index 37b89247..39d34d37 100644 --- a/stream/opt/allocation/constraint_optimization/allocation.py +++ b/stream/opt/allocation/constraint_optimization/allocation.py @@ -107,12 +107,12 @@ def constraint_allocation_optimization( The timeline is divided into a number of slots. Each node will be assigned to one slot. Args: - latencies (dict): Latency for each node in form {id: latency} - weights_per_id (dict): Weights (in bits) for each node in form {id: weights} - dependencies (dict): Dependencies between nodes in form {(producer_id, consumer_id): tensor_size} - core_capacities (dict): Weight capacity (in bits) of each core in form {core: capacity} - allocations (dict): TODO: Add fixed allocation constraints - N (int): The number of iterations of the steady state + latencies: Latency for each node in form {id: latency} + weights_per_id: Weights (in bits) for each node in form {id: weights} + dependencies: Dependencies between nodes in form {(producer_id, consumer_id): tensor_size} + core_capacities: Weight capacity (in bits) of each core in form {core: capacity} + allocations: TODO: Add fixed allocation constraints + N: The number of iterations of the steady state """ node_core_k_ids, lats = gp.multidict(latencies) node_ids = sorted(set([node_core_id[0] for node_core_id in node_core_k_ids])) diff --git a/stream/opt/allocation/constraint_optimization/utils.py b/stream/opt/allocation/constraint_optimization/utils.py index c9e2c461..2047984c 100644 --- a/stream/opt/allocation/constraint_optimization/utils.py +++ b/stream/opt/allocation/constraint_optimization/utils.py @@ -1,4 +1,4 @@ -from math import ceil, log10, prod +from math import prod from zigzag.datatypes import LayerDim, LayerOperand, UnrollFactor @@ -7,11 +7,7 @@ from stream.utils import CostModelEvaluationLUT from stream.workload.computation.computation_node import ComputationNode -MODULATION_NUMBER = 10**3 # Must be higher than any node's sub id - - -def nearest_power_of_10(x: int): - return 10 ** ceil(log10(x)) +MODULATION_NUMBER = 1 << 20 # Must be higher than any node's sub id def convert_id(i: int, j: int) -> int: diff --git a/stream/opt/partitioning/utils.py b/stream/opt/partitioning/utils.py index dd1492d3..4a05b189 100644 --- a/stream/opt/partitioning/utils.py +++ b/stream/opt/partitioning/utils.py @@ -136,7 +136,7 @@ def convert_outer_cn_loops_with_k(outer_cn_loops: TILING_T, layer: ComputationNo Args: outer_cn_loops (list): The list of string-defined outer-cn loops. layer (ComputationNode): The original layer. - split_factor (int): The number of output channel splits that will be added. + split_factor: The number of output channel splits that will be added. 
""" raise DeprecationWarning("Still uses string representation for LayerOperand") if not isinstance(split_factor, int): diff --git a/stream/stages/allocation/genetic_algorithm_allocation.py b/stream/stages/allocation/genetic_algorithm_allocation.py index 09320bc0..2dd457f3 100644 --- a/stream/stages/allocation/genetic_algorithm_allocation.py +++ b/stream/stages/allocation/genetic_algorithm_allocation.py @@ -49,8 +49,8 @@ def __init__( workload (DiGraph): The NetworkX DiGraph representing the workload to be scheduled accelerator (Accelerator): The hardware accelerator onto which we schedule the workload cost_lut (CostModelEvaluationLUT): A LUT of CMEs for each unique node and their valid cores - nb_ga_generations (int): The number of generations considered by the genetic algorithm - nb_ga_individuals (int): The number of individuals in each genetic algorithm generation + nb_ga_generations: The number of generations considered by the genetic algorithm + nb_ga_individuals: The number of individuals in each genetic algorithm generation """ super().__init__(list_of_callables, **kwargs) self.workload = workload diff --git a/stream/stages/estimation/zigzag_core_mapping_estimation.py b/stream/stages/estimation/zigzag_core_mapping_estimation.py index 4cc6f795..587b78d5 100644 --- a/stream/stages/estimation/zigzag_core_mapping_estimation.py +++ b/stream/stages/estimation/zigzag_core_mapping_estimation.py @@ -10,7 +10,7 @@ from zigzag.stages.main import MainStage from zigzag.stages.mapping.spatial_mapping_generation import SpatialMappingGeneratorStage from zigzag.stages.mapping.temporal_mapping_generator_stage import TemporalMappingGeneratorStage -from zigzag.stages.results.reduce_stages import MinimalLatencyStage +from zigzag.stages.results.reduce_stages import MinimalEDPStage from zigzag.utils import pickle_deepcopy from stream.hardware.architecture.accelerator import Accelerator @@ -57,25 +57,34 @@ def __init__( # Extract all unique nodes that will have to be evaluated self.unique_nodes = get_unique_nodes(self.workload) - # Initialize the valid node-core allocations. - self.valid_allocations: dict[ComputationNode, list[int]] = {} - for node in self.unique_nodes: - assert isinstance( - node, ComputationNode - ), f"ZigZagCoreMappingEstimationStage received node {node} of type {type(node)}." - assert isinstance(node.possible_core_allocation, list), f"Core allocation is not a list for node {node}." - self.valid_allocations[node] = node.possible_core_allocation + assert all( + isinstance(node, ComputationNode) for node in self.unique_nodes + ), "ZigZagCoreMappingEstimationStage received a non-ComputationNode." + assert all( + isinstance(node.possible_core_allocation, list) for node in self.unique_nodes + ), "ZigZagCoreMappingEstimationStage received a node with a non-list core allocation." 
- # Initialize CostModelEvaluationLUT + self.valid_allocations: dict[ComputationNode, list[int]] = { + node: node.possible_core_allocation for node in self.unique_nodes + } self.cost_lut = CostModelEvaluationLUT(self.cost_lut_path) def run(self): logger.info("Start ZigZagCoreMappingEstimationStage.") - for node in self.unique_nodes: - # TODO This should never evaluate to true: enforce core_allocation as list everywhere - if isinstance(self.valid_allocations[node], tuple): - raise ValueError + self.update_cost_lut() + self.visualize_cost_lut() + logger.info("Finished ZigZagCoreMappingEstimationStage.") + kwargs = self.kwargs.copy() + kwargs["workload"] = self.workload + kwargs["accelerator"] = self.accelerator + kwargs["cost_lut"] = self.cost_lut + sub_stage = self.list_of_callables[0](self.list_of_callables[1:], **kwargs) + for cme, extra_info in sub_stage.run(): + yield cme, extra_info + + def update_cost_lut(self): + for node in self.unique_nodes: core_ids = self.valid_allocations[node] for core_id in core_ids: core = self.accelerator.get_core(core_id) @@ -121,44 +130,29 @@ def run(self): if core.dataflows: node_duplicate.spatial_mapping = core.dataflows - # Initialize the flow that will be followed to extract the optimal HW performance of every - # unique node-core allocation - main_stage = self.get_intra_core_mapping_flow( - node=node_duplicate, - too_large_operands=too_large_operands_for_cme, - core_id=core_id, - ) - answers = main_stage.run() - assert len(answers) == 1, "ZigZagCoreMappingEstimationStage's subflow returned more than one CME" - cme: CostModelEvaluation = answers[0][0] # type: ignore + cme = self.run_zigzag(node_duplicate, too_large_operands_for_cme, core_id) cme = self.increase_cc_per_op(cme, node.type) node_duplicate.set_chosen_core_allocation(None) # Reset the node's chosen core allocation self.cost_lut.add_cme(node, core, cme, allow_overwrite=False) self.cost_lut.save() - self.visualize_cost_lut() - kwargs = self.kwargs.copy() - kwargs["workload"] = self.workload - kwargs["accelerator"] = self.accelerator - kwargs["cost_lut"] = self.cost_lut - - logger.info("Finished ZigZagCoreMappingEstimationStage.") - sub_stage = self.list_of_callables[0](self.list_of_callables[1:], **kwargs) - for cme, extra_info in sub_stage.run(): - yield cme, extra_info - - def increase_cc_per_op(self, cme: CostModelEvaluation, op_type: str): + def get_cc_per_op(self, op_type: str): + """Return the number of cycles that the operational units need to finish the given operation.""" match op_type: case "silu": - cc_per_op = 4 + return 4 case "sigmoid": - cc_per_op = 4 + return 4 case "exp": - cc_per_op = 4 + return 4 case _: - cc_per_op = 1 + return 1 + def increase_cc_per_op(self, cme: CostModelEvaluation, op_type: str): + """Given a ZigZag that assumes each operation takes one cycle, generate a new one that takes into account that + the operation might take more than one cycle.""" + cc_per_op = self.get_cc_per_op(op_type) if cc_per_op > 1: logger.warning(f"Setting cycles per mac of {op_type} node to {cc_per_op}") @@ -175,17 +169,26 @@ def increase_cc_per_op(self, cme: CostModelEvaluation, op_type: str): return new_cme def visualize_cost_lut(self): - # Get the scale factors scale_factors = { n: len([cn for cn in self.workload.node_list if cn.has_same_performance(n)]) for n in self.cost_lut.get_nodes() } - # Run the visualization visualize_cost_lut_pickle(self.cost_lut, scale_factors, self.visualize_cost_lut_path) - def get_intra_core_mapping_flow(self, node: ComputationNode, too_large_operands: 
list[MemoryOperand], core_id: int): + def run_zigzag( + self, node: ComputationNode, too_large_operands: list[MemoryOperand], core_id: int + ) -> CostModelEvaluation: + """Run the ZigZag flow to estimate performance of a given node on a core.""" + + main_stage = self.instantiate_zigzag_flow(node, too_large_operands, core_id) logger.info(f"Launching intra-core mapping optimization for {node} -> core {core_id} ...") + answers = main_stage.run() + assert len(answers) == 1, "ZigZagCoreMappingEstimationStage's subflow returned more than one CME" + cme: CostModelEvaluation = answers[0][0] # type: ignore + return cme + def instantiate_zigzag_flow(self, node: ComputationNode, too_large_operands: list[MemoryOperand], core_id: int): + """Instantiate a runnable ZigZag mainstage""" core = self.accelerator.get_core(core_id) if too_large_operands: @@ -193,9 +196,9 @@ def get_intra_core_mapping_flow(self, node: ComputationNode, too_large_operands: main_stage = MainStage( [ # Initializes the MainStage as entry point - MinimalLatencyStage, + MinimalEDPStage, SpatialMappingGeneratorStage, # Generates multiple spatial mappings (SM) - MinimalLatencyStage, # Reduces all CMEs, returning minimal latency one + MinimalEDPStage, # Reduces all CMEs, returning minimal EDP one TemporalMappingGeneratorStage, # Generates multiple temporal mappings (TM) CostModelStage, # Evaluates generated SM and TM through cost model ], @@ -212,11 +215,11 @@ def check_core_capacity_for_node(self, core: Core, node: ComputationNode) -> lis and the stored operands inside each memory. Args: - core (Core): The core onto which we want to map the node - node (ComputationNode): The node we want to map onto the core + core: The core onto which we want to map the node + node: The node we want to map onto the core Returns: - list: A list of memory operands for which the capacity on the core is insufficient. + A list of memory operands for which the capacity on the core is insufficient. 
""" too_large_operands_for_cme: list[MemoryOperand] = [] @@ -326,12 +329,12 @@ def get_top_level_memory_rest_capacity( """Calculate the remaining capacity in the top level core memory after storing the operands_stored_in_top_level Args: - operands_stored_in_top_level (list): list of operands that can fit in the top memory level of the core - bits_to_be_stored_in_top_level (dict): the data size in bit for each variable operands - top_level_capacity_bits (int): the total capacity of the top level core memory + operands_stored_in_top_level: list of operands that can fit in the top memory level of the core + bits_to_be_stored_in_top_level: the data size in bit for each variable operands + top_level_capacity_bits: the total capacity of the top level core memory Returns: - int: the memory capacity left after storing the operands_stored_in_top_level + The memory capacity left after storing the operands_stored_in_top_level """ rest_capacity = top_level_capacity_bits for mem_operand in operands_stored_in_top_level: @@ -347,12 +350,12 @@ def get_too_large_operands_minimal_required_capacity_in_top_level_memory( unrolling Args: - operands_stored_in_offchip (list): list of operands that cannot fit in the top memory level of the core + operands_stored_in_offchip: list of operands that cannot fit in the top memory level of the core dataflows (list of dict): the dataflows (spatial mappings) that current core supports node (ComputationNode): The computational node we want to map onto the core Returns: - int: the required memory capacity in the top memory of the core for operands_stored_in_offchip + The required memory capacity in the top memory of the core for operands_stored_in_offchip """ def get_lowest_level_unrolled_memory_capacity(memory_operand: MemoryOperand): @@ -369,9 +372,9 @@ def add_offchip_to_core(self, core: Core, too_large_operands: list[MemoryOperand """Add the offchip memory as the top level memory of the core with core_id in a copy of the accelerator Args: - core_id (int): The id of the core to which we want to add the off-chip memory for cost evaluation. - too_large_operands (list): The memory operands the off-chip memory should store. - layer_idx (int): workload layer index. + core_id: The id of the core to which we want to add the off-chip memory for cost evaluation. + too_large_operands: The memory operands the off-chip memory should store. + layer_idx: workload layer index. """ assert self.accelerator.offchip_core_id is not None logger.warning(f"Adding offchip memory for {core}, layer={layer_idx}, memory_operands={too_large_operands}.") diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index 481b63c6..cfe0fffb 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -531,7 +531,7 @@ def flatten_grouped_convolution_ranges( Args: dimensions (list): list of the different tensor dimensions - loop_ranges (dict): dict of the loop ranges for the current node. + loop_ranges: dict of the loop ranges for the current node. 
""" # TODO these should be constants dim_G = LayerDim("G") diff --git a/stream/stages/set_fixed_allocation_performance.py b/stream/stages/set_fixed_allocation_performance.py index 2827299f..06861e66 100644 --- a/stream/stages/set_fixed_allocation_performance.py +++ b/stream/stages/set_fixed_allocation_performance.py @@ -105,8 +105,8 @@ def set_hw_performance_node( node (Node): The node of which to set the onchip_energy (float): on-chip energy of executing this node offchip_energy (float): off-chip energy of executing this node - runtime (int): runtime of executing this node - core_allocation (int): the core_id on which this node will be ran + runtime: runtime of executing this node + core_allocation: the core_id on which this node will be ran """ node.set_onchip_energy(onchip_energy) node.set_offchip_energy(offchip_energy) diff --git a/stream/utils.py b/stream/utils.py index 621ec838..2b09a95f 100644 --- a/stream/utils.py +++ b/stream/utils.py @@ -25,7 +25,7 @@ def get_too_large_operands(cme: CostModelEvaluation, accelerator: "Accelerator", Args: cme (CostModelEvaluation): The CostModelEvaluation containing information wrt the memory utilization. accelerator (Accelerator): The accelerator object containing the different cores. - core_id (int): The id of the core of which we wish to get the too large operands. + core_id: The id of the core of which we wish to get the too large operands. """ too_large_operands: list[MemoryOperand] = [] core = accelerator.get_core(core_id) @@ -51,7 +51,7 @@ def save_core_allocation( type (str, optional): The type of core allocation: fixed or flexible. Returns: - allocations (dict): The dictionary containing core allocations for each node name + allocations: The dictionary containing core allocations for each node name """ node_allocations = {} node_allocations_grouped = {} diff --git a/stream/workload/node.py b/stream/workload/node.py index 68eb64f1..3ec269da 100644 --- a/stream/workload/node.py +++ b/stream/workload/node.py @@ -100,7 +100,7 @@ def set_runtime(self, runtime: int): """Set the runtime of running this node. Args: - runtime (int): runtime in cycles + runtime: runtime in cycles """ self.runtime = runtime @@ -108,7 +108,7 @@ def set_start(self, start: int): """Set the start time in cycles of this node. Args: - start (int): start time in cycles + start: start time in cycles """ self.start = start @@ -116,7 +116,7 @@ def set_end(self, end: int): """Set the end time in cycles of this node. Args: - end (int): end time in cycles + end: end time in cycles """ self.end = end diff --git a/stream/workload/tensor.py b/stream/workload/tensor.py index a9129fea..e4ad4049 100644 --- a/stream/workload/tensor.py +++ b/stream/workload/tensor.py @@ -27,7 +27,7 @@ def __init__( """Initialize the Tensor instance. 
Args: - size (int): the size of the tensor in bits + size: the size of the tensor in bits origin (ComputationNode): The computation node that consumes/produces this tensor layer_operand (str, optional): The layer operand to which this tensor belongs loop_dimensions (tuple, optional): The loop dimensions for this tensor From 829f82d0d4b0b06287ff74ef574e40c1e25b418e Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Fri, 3 Jan 2025 15:49:58 +0100 Subject: [PATCH 40/49] fix pointer issue in NodeTensor --- stream/node_tensor.py | 12 +++++++----- .../generation/tiled_workload_generation.py | 15 +-------------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/stream/node_tensor.py b/stream/node_tensor.py index 980c88fa..9a87b0af 100644 --- a/stream/node_tensor.py +++ b/stream/node_tensor.py @@ -37,14 +37,14 @@ def initialize_empty(shape: tuple[int, ...], pre_allocation_size: int = 8): def _get_pointer(self): return self.__node_count - def _get_and_increment_pointer(self): + def _increment_and_get_pointer(self): """Get the index pointer in the last dimension. which points to the next free spot to allocate nodes. Automatically increments the pointer after each use. If the index exceeds the allocated space, an error is raised.""" + self.__node_count += 1 pointer = self.__node_count if pointer >= self.__pre_allocation_size: raise IndexError - self.__node_count += 1 return pointer @property @@ -80,7 +80,7 @@ def convert_to_full_shape(self, tensor_shape: tuple[int, ...]): def get_nb_empty_elements(self, slices: tuple[slice, ...]): """Returns the number of points for which there are no ComputationNodes.""" assert self.is_valid_shape_dimension(slices), "Last dimension of tensor is reserved for CNs" - extended_slices = slices + (slice(0, self.__node_count),) + extended_slices = slices + (slice(0, self._get_pointer() + 1),) tensor_slice = self.as_ndarray()[extended_slices] all_empty_mask = np.logical_and.reduce(tensor_slice == 0, axis=-1) return int(np.sum(all_empty_mask)) @@ -91,6 +91,8 @@ def extend_with_node(self, slices: tuple[slice, ...], node: object) -> "NodeTens # Case 1: Try to assign at the current pointer for given slices idx = self._get_pointer() extended_slices = slices + (slice(idx, idx + 1),) + assert all(s.stop <= self.full_shape[i] for i, s in enumerate(extended_slices)), "Index out of bounds" + # Slice is all 0 if not np.any(self[extended_slices]): self[extended_slices] = node @@ -98,7 +100,7 @@ def extend_with_node(self, slices: tuple[slice, ...], node: object) -> "NodeTens # Case 2: increment pointer and assign at empty slice try: - idx = self._get_and_increment_pointer() + idx = self._increment_and_get_pointer() extended_slices = slices + (slice(idx, idx + 1),) self[extended_slices] = node return self @@ -167,7 +169,7 @@ def concat_with_empty(self, shape: tuple[int, ...], axis: int, variable_input_fi return np.concat((self.as_ndarray(), empty_tensor), axis=axis).view(NodeTensor) def __repr__(self): - return f"TensorNode{self.tensor_shape}[depth={self.__node_count}]" + return f"NodeTensor{self.tensor_shape}[depth={self.__node_count}]" def __reduce__(self): return self.as_ndarray().__reduce__() diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index cfe0fffb..03ce17fc 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -231,21 +231,8 @@ def get_mandatory_divisors(self, node: ComputationNode) -> dict[LayerDim, set[in This 
ensures dependencies between tiles within the stack do not cross the layer stack boundaries. # TODO can nodes within the same stack have different intra-core tiling? This is not accounted for """ - # # These divisors accumulate: e.g. if a dim must be divisible by 2 and 4, it must be divisible by 8 - # divisors_multiplicative: dict[LayerDim, int] = defaultdict(lambda: 1) - divisors: dict[LayerDim, set[int]] = defaultdict(lambda: set()) - # # Must be divisible by inter- and intra-core tiling factors (multiplicative) - # for dim, factor in node.intra_core_tiling + node.inter_core_tiling: - # if isinstance(factor, int): - # divisors_multiplicative[dim] *= factor - - # # Multiplied divisors become one lcm divisor - # for dim, factor in divisors_multiplicative.items(): - # divisors_lcm[dim].add(factor) - - # Must be divisible by inter-core tiling factors of all nodes in the same layer stack (least common multiple) # Find nodes in stack try: curr_stack = next(stack for stack in self.layer_stacks if node.id in stack) @@ -854,7 +841,7 @@ def get_tensor_cns(self, node: ComputationNode, tiles: list[ComputationNode]) -> ) tensors_cns[op] = tensors_cns[op].extend_with_node(bounded_op_dim_ranges, tile) - if nb_unique_data_seen != (prod(tensor_shapes[op]) * precision): + if nb_unique_data_seen < (prod(tensor_shapes[op]) * precision): logger.warning(f"Downsampling node detected: {node}, operand= {op}.") # The dimensionality order of this input/output operand might include From 081ebe86bc969840217de18259208ee8f5c92eda Mon Sep 17 00:00:00 2001 From: Arne Symons Date: Fri, 3 Jan 2025 16:15:00 +0100 Subject: [PATCH 41/49] update zigzag-dse to 3.8.1 in requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5cc3b68c..15cabe86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -zigzag-dse==3.8.0 +zigzag-dse==3.8.1 rtree deap matplotlib From c16c009a26647c4236f5e37674d66080e3902c45 Mon Sep 17 00:00:00 2001 From: Arne Symons Date: Fri, 3 Jan 2025 16:57:06 +0100 Subject: [PATCH 42/49] improve dependency generation time complexity; not yet implemented for all PropagationNodes --- .../generation/tiled_workload_generation.py | 232 +++++++++--------- .../dependency_propagation/slice_node.py | 6 +- .../dependency_propagation/split_node.py | 7 +- .../dependency_propagation/transpose_node.py | 11 +- 4 files changed, 129 insertions(+), 127 deletions(-) diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index 481b63c6..3fd225f1 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -1,5 +1,6 @@ import logging import os +import time from collections import defaultdict from copy import deepcopy from math import ceil, prod @@ -19,7 +20,6 @@ from stream.stages.stage import Stage, StageCallable from stream.utils import contains_wildcard from stream.workload.computation.computation_node import ComputationNode, LoopRanges -from stream.workload.dependency_propagation.concat_node import ConcatNode from stream.workload.dependency_propagation.dummy_node import DummyNode from stream.workload.dependency_propagation.propagation_node import PropagationNode from stream.workload.node import Node @@ -658,19 +658,18 @@ def get_inter_edges_numpy( consumer: ComputationNode, ): - numpy_tensors: dict[ComputationNode, dict[LayerOperand, NodeTensor]] = {} + numpy_tensors: 
dict[tuple[ComputationNode, LayerOperand], NodeTensor] = {} all_inter_edges: list[tuple[ComputationNode, ComputationNode, dict[str, Any]]] = [] def get_tensor_cn_for_op(node: ComputationNode, dependent_operand: LayerOperand): """And update the known tensors of computation nodes""" - if node in numpy_tensors: - tensor_cns = numpy_tensors[node] + if (node, dependent_operand) in numpy_tensors: + tensor = numpy_tensors[(node, dependent_operand)] else: tiles = self.tiles_dict[node] - tensor_cns = self.get_tensor_cns(node, tiles) + tensor = self.get_node_tensor(node, tiles, dependent_operand) # Store result for later use - numpy_tensors[node] = tensor_cns - tensor = tensor_cns[dependent_operand] + numpy_tensors[(node, dependent_operand)] = tensor return tensor paths_between = list(self.workload.all_simple_paths(producer, consumer)) @@ -683,76 +682,33 @@ def get_tensor_cn_for_op(node: ComputationNode, dependent_operand: LayerOperand) ), "No paths between producer and consumer found without ComputationNode in intermediates." for path_between in paths_between: + ts = [time.time()] # First node in the path is a ComputationNode, of which we extract the output operand dependency tensor first_node = path_between[0] assert isinstance(first_node, ComputationNode), "First node in path should be ComputationNode" tensor = get_tensor_cn_for_op(first_node, dependent_operand=Constants.OUTPUT_LAYER_OP) + ts.append(time.time()) # Propagate through intermediate, non-computation nodes + relevant_axes = [False] * len(tensor.tensor_shape) for i, node in enumerate(path_between[1:-1], start=1): assert isinstance(node, PropagationNode), "Intermediate nodes should not be of type ComputationNode" next_node = path_between[i + 1] - tensor = node.propagate(tensor, next_node) + tensor, relevant_axes = node.propagate(tensor, next_node, relevant_axes) # Final node: Computation node final_node: ComputationNode = path_between[-1] # type: ignore assert isinstance(final_node, ComputationNode), "Last node in path should be ComputationNode" - # Find the operand for which this last node connects to its predecessor dependent_operand = next( op for op, dependent_node_id in final_node.input_operand_source.items() if dependent_node_id == node.id ) + inter_edges = self.get_inter_edges_hybrid(tensor, final_node, dependent_operand, relevant_axes) - # Error handling of shape mismatches in tensor propagation - def _get_final_tensor_alt_operand(): - """Error handling case 1: sources for `W` and `I` operand are swapped for this node - -> try the other one""" - try: - alt_operand = next(op for op in final_node.input_operand_source if op != dependent_operand) - except StopIteration: - # No alt operand was found -> we're still in trouble - raise TensorDimensionMismatchException - return get_tensor_cn_for_op(final_node, alt_operand) - - def _get_shape_inferred_propagated_tensor(tensor: NodeTensor, final_tensor: NodeTensor): - """Error handling case 2: dimensions of ComputationNode (`final_tensor`) were altered by stream - (e.g. to be properly divisible) but this is not reflected in `ConcatNode` with constant shape. 
- -> manually fix shape""" - if not any(isinstance(node, ConcatNode) for node in path_between[1:-1]): - raise TensorDimensionMismatchException( - "This function only solves the case of errors due to constant shapes in ConcatNode" - ) - - target_shape = final_tensor.tensor_shape - propagated_shape = tensor.tensor_shape - extension_axis = next(i for i in range(len(target_shape)) if target_shape[i] != propagated_shape[i]) - extension_value = target_shape[extension_axis] - propagated_shape[extension_axis] - if extension_value <= 0: - raise TensorDimensionMismatchException( - "Propagated shape cannot be larger than (extended) found shape" - ) - extension_shape = tuple( - val if i != extension_axis else extension_value for i, val in enumerate(target_shape) - ) - return tensor.concat_with_empty(extension_shape, extension_axis, variable_input_first=False) - - try: # Regular case - final_tensor = get_tensor_cn_for_op(final_node, dependent_operand) - inter_edges = self.get_inter_edges_tensor_based(tensor, final_tensor) - except TensorDimensionMismatchException: - try: # Error case 1 - final_tensor = _get_final_tensor_alt_operand() - inter_edges = self.get_inter_edges_tensor_based(tensor, final_tensor) - except TensorDimensionMismatchException: - try: # Error case 2 - final_tensor = get_tensor_cn_for_op(final_node, dependent_operand) - tensor = _get_shape_inferred_propagated_tensor(tensor, final_tensor) - inter_edges = self.get_inter_edges_tensor_based(tensor, final_tensor) - except TensorDimensionMismatchException: - # Error case 1 and 2 combined - final_tensor = _get_final_tensor_alt_operand() - tensor = _get_shape_inferred_propagated_tensor(tensor, final_tensor) - inter_edges = self.get_inter_edges_tensor_based(tensor, final_tensor) + ts.append(time.time()) + ts_deltas = [ts[i] - ts[i - 1] for i in range(1, len(ts))] + ts_deltas_str = ", ".join([f"{delta:.3f}" for delta in ts_deltas]) + logger.info(f"Path {path_between} time deltas: {ts_deltas_str}") for producer, cons in inter_edges: all_inter_edges.append( @@ -767,6 +723,39 @@ def _get_shape_inferred_propagated_tensor(tensor: NodeTensor, final_tensor: Node ) return all_inter_edges + def get_inter_edges_hybrid( + self, tensor: NodeTensor, final_node: ComputationNode, op: LayerOperand, relevant_axes: list[bool] + ): + """This method obtains the tile dependencies between producers in tensor and the consumer final_node. + This is done by iterating through all consumer tiles, + for each consumer node we create a window and get all the producer nodes that produced this data window. 
+ + Args: + tensor (NodeTensor): A tensor containing for each position which CNs will produce it + final_node (ComputationNode): The node for which to get the inter-edges + operand (LayerOperand): The input operand of final_node for which to get the inter-edges + relevant_axes (list): A list of boolean values indicating which axes are relevant for the final_node + """ + inter_edges: set[tuple[ComputationNode, ComputationNode]] = [] + dims = final_node.operand_dimensionality_order[op] + assert len(dims) == len(relevant_axes) + for consumer_tile in self.tiles_dict[final_node]: + relevant_loop_ranges = [consumer_tile.loop_ranges[dim] for dim in dims] + # Override loop ranges of irrelevant axes to only include a single slice + for i, relevant in enumerate(relevant_axes): + if not relevant: + relevant_loop_ranges[i] = (0, 1) + # Ellipsis adds the entire last axis for the extra dimension in NodeTensor + slices = tuple(slice(start, stop) for start, stop in relevant_loop_ranges) + (Ellipsis,) + sliced_tensor = tensor[slices] + producer_tiles = set( + prod + for prod in (elem for elem in sliced_tensor.flat.flat if elem and isinstance(elem, ComputationNode)) + ) + for producer_tile in producer_tiles: + inter_edges.append((producer_tile, consumer_tile)) + return inter_edges + @staticmethod def get_inter_edges_tensor_based(producer_output_tensor: NodeTensor, consumer_input_tensor: NodeTensor): """This method obtains the edges between a producer and consumer. @@ -793,69 +782,62 @@ def get_inter_edges_tensor_based(producer_output_tensor: NodeTensor, consumer_in inter_edges.add((producer, consumer)) return inter_edges - def get_tensor_cns(self, node: ComputationNode, tiles: list[ComputationNode]) -> dict[LayerOperand, NodeTensor]: + def get_node_tensor( + self, + node: ComputationNode, + tiles: list[ComputationNode], + op: LayerOperand, + ) -> NodeTensor: is_source_node = len(self.get_non_type_predecessors(node, [DummyNode])) == 0 - variable_operands = [op for op in node.input_operands if op not in node.constant_operands] + [ - node.output_operand - ] - tensor_dims = {op: node.operand_dimensionality_order[op] for op in variable_operands} + tensor_dims = node.operand_dimensionality_order[op] all_loop_dim_sizes = node.layer_dim_sizes + node.pr_layer_dim_sizes # union - tensor_shapes = {op: tuple(all_loop_dim_sizes[dim] for dim in dims) for (op, dims) in tensor_dims.items()} - - # Initial arrays. 
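# --- Editorial sketch (not part of the patch): the idea behind get_inter_edges_hybrid,
# reduced to plain numpy. Each cell of `producer_grid` records which producer tile
# wrote that element; for a consumer tile we slice the grid with the consumer's loop
# ranges (irrelevant axes collapsed to a single index, as in the patch) and the unique
# values in the slice are the producer tiles it depends on. The real NodeTensor carries
# an extra trailing axis for stacked nodes, omitted here; names and shapes are invented.
import numpy as np

producer_grid = np.empty((8, 8), dtype=object)          # filled with None
for k_start in range(0, 8, 4):                          # producer tiled over axis 0
    producer_grid[k_start:k_start + 4, :] = f"prod_tile_{k_start // 4}"

consumer_loop_ranges = [(2, 6), (0, 8)]  # (start, stop) per axis for one consumer tile
relevant_axes = [True, False]            # axis 1 is irrelevant -> single slice
slices = tuple(
    slice(start, stop) if relevant else slice(0, 1)
    for (start, stop), relevant in zip(consumer_loop_ranges, relevant_axes)
)
producers = {p for p in producer_grid[slices].flat if p is not None}
print(producers)  # {'prod_tile_0', 'prod_tile_1'} -> two producer->consumer edges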
- tensors_cns: dict[LayerOperand, NodeTensor] = { - op: NodeTensor.initialize_empty(shape) for (op, shape) in tensor_shapes.items() - } - - # For each input operand iterate through the tiles in reverse order - # because we want the first cn with a dependency saved in the tensor - # For the output operand iterate through the tiles in regular order - # because we want the last CN that handles an output tensor window to be saved - for op, dims in tensor_dims.items(): - if op == node.output_operand: - ir_dims_output = node.loop_relevancy_info.get_ir_layer_dims(Constants.OUTPUT_LAYER_OP) - tile_list = tiles # list in regular order - should_add_to_tensor_list = [ - all(tile.loop_ranges[ir_dim][1] >= node.loop_ranges[ir_dim][1] for ir_dim in ir_dims_output) - for tile in tile_list - ] - attr_to_add_to = "data_produced_unique" - precision = node.operand_precision[Constants.FINAL_OUTPUT_LAYER_OP] - else: - tile_list = list(reversed(tiles)) # list in reversed order - should_add_to_tensor_list = [True for _ in tile_list] - attr_to_add_to = "data_consumed_unique" - # if this layer is the first layer, we assume the inputs are streamed and "free" - precision = node.operand_precision[op] * (not is_source_node) - - nb_unique_data_seen = 0 - for tile, should_add_to_tensor in zip(tile_list, should_add_to_tensor_list): - if not should_add_to_tensor: - continue # Skip if we're not at the max ir loop value for output - op_dim_ranges = [tile.loop_ranges[loop_dim] for loop_dim in dims] - op_dim_ranges_max_stop = tuple(tensor_shapes[op]) - # start can be negative for padding which, makes np flip - window = tuple([slice(max(0, start), stop) for (start, stop) in op_dim_ranges]) - # Count how many nans we have in this window, as this is the amount of unique data consumed/produced by - # this tile - nb_unique_data_bits = tensors_cns[op].get_nb_empty_elements(window) * precision - nb_unique_data_seen += nb_unique_data_bits - # Add this amount of unique data to the "data_consumed_unique" or "data_produced_unique" depending on - # input/output operand - setattr( - tile, - attr_to_add_to, - getattr(tile, attr_to_add_to) + nb_unique_data_bits, - ) - # Set this window of the tensor to indicate it will be consumed/produced by this tile - bounded_op_dim_ranges = tuple( - slice(max(0, start), min(max_stop, stop)) - for ((start, stop), max_stop) in zip(op_dim_ranges, op_dim_ranges_max_stop) - ) - tensors_cns[op] = tensors_cns[op].extend_with_node(bounded_op_dim_ranges, tile) + tensor_shapes = tuple(all_loop_dim_sizes[dim] for dim in tensor_dims) + + if op == node.output_operand: + ir_dims_output = node.loop_relevancy_info.get_ir_layer_dims(Constants.OUTPUT_LAYER_OP) + tile_list = tiles # list in regular order + should_add_to_tensor_list = [ + all(tile.loop_ranges[ir_dim][1] >= node.loop_ranges[ir_dim][1] for ir_dim in ir_dims_output) + for tile in tile_list + ] + attr_to_add_to = "data_produced_unique" + precision = node.operand_precision[Constants.FINAL_OUTPUT_LAYER_OP] + else: + tile_list = list(reversed(tiles)) # list in reversed order + should_add_to_tensor_list = [True for _ in tile_list] + attr_to_add_to = "data_consumed_unique" + # if this layer is the first layer, we assume the inputs are streamed and "free" + precision = node.operand_precision[op] * (not is_source_node) + + nb_unique_data_seen = 0 + node_tensor = NodeTensor.initialize_empty(tensor_shapes) + for tile, should_add_to_tensor in zip(tile_list, should_add_to_tensor_list): + if not should_add_to_tensor: + continue # Skip if we're not at the max ir loop value 
for output + op_dim_ranges = [tile.loop_ranges[loop_dim] for loop_dim in tensor_dims] + op_dim_ranges_max_stop = tuple(tensor_shapes) + # start can be negative for padding which, makes np flip + window = tuple([slice(max(0, start), stop) for (start, stop) in op_dim_ranges]) + # Count how many nans we have in this window, as this is the amount of unique data consumed/produced by + # this tile + nb_unique_data_bits = node_tensor.get_nb_empty_elements(window) * precision + nb_unique_data_seen += nb_unique_data_bits + # Add this amount of unique data to the "data_consumed_unique" or "data_produced_unique" depending on + # input/output operand + setattr( + tile, + attr_to_add_to, + getattr(tile, attr_to_add_to) + nb_unique_data_bits, + ) + # Set this window of the tensor to indicate it will be consumed/produced by this tile + bounded_op_dim_ranges = tuple( + slice(max(0, start), min(max_stop, stop)) + for ((start, stop), max_stop) in zip(op_dim_ranges, op_dim_ranges_max_stop) + ) + node_tensor = node_tensor.extend_with_node(bounded_op_dim_ranges, tile) - if nb_unique_data_seen != (prod(tensor_shapes[op]) * precision): - logger.warning(f"Downsampling node detected: {node}, operand= {op}.") + if nb_unique_data_seen != (prod(tensor_shapes) * precision): + logger.warning(f"Downsampling node detected: {node}, operand= {op}.") # The dimensionality order of this input/output operand might include # both a G and C/K dimension because the ComputationNode gets the group as an extra @@ -864,9 +846,17 @@ def get_tensor_cns(self, node: ComputationNode, tiles: list[ComputationNode]) -> # input operand with dimensionality_order = ['B', 'G', 'C', 'IY', 'IX'] # -> gets reduced to dimensionality_order = ['B', 'CH', 'IY', 'IX'] # (in this case the 'CH' represents the absolute "channel" dimension) - for op, tensor in tensors_cns.items(): - tensors_cns[op] = node.reshape_operand_tensor(tensor, operand=op) + node_tensor = node.reshape_operand_tensor(node_tensor, operand=op) + + return node_tensor + def get_node_tensors(self, node: ComputationNode, tiles: list[ComputationNode]) -> dict[LayerOperand, NodeTensor]: + variable_operands = [op for op in node.input_operands if op not in node.constant_operands] + [ + node.output_operand + ] + tensors_cns: dict[LayerOperand, NodeTensor] = {} + for op in variable_operands: + tensors_cns[op] = self.get_node_tensor(node, tiles, op) return tensors_cns @staticmethod diff --git a/stream/workload/dependency_propagation/slice_node.py b/stream/workload/dependency_propagation/slice_node.py index 49de89d5..2d76c705 100644 --- a/stream/workload/dependency_propagation/slice_node.py +++ b/stream/workload/dependency_propagation/slice_node.py @@ -39,7 +39,9 @@ def __init__( self.input_operand_source = {Constants.LAYER_OP_I: predecessor} self.output_names = output_names - def propagate(self, tensor: NodeTensor, next_node: Node | None = None): + def propagate(self, tensor: NodeTensor, next_node: Node | None = None, relevant_axes: list[bool] = []): """Slice the tensor. 
Currently assumes only one slice is created.""" - return tensor.slice(starts=self.starts[0], ends=self.ends[0], axis=self.axes[0], steps=self.steps[0]) + sliced_tensor = tensor.slice(starts=self.starts[0], ends=self.ends[0], axis=self.axes[0], steps=self.steps[0]) + relevant_axes[self.axes[0]] = True + return sliced_tensor, relevant_axes diff --git a/stream/workload/dependency_propagation/split_node.py b/stream/workload/dependency_propagation/split_node.py index 631be9f0..c01c6aae 100644 --- a/stream/workload/dependency_propagation/split_node.py +++ b/stream/workload/dependency_propagation/split_node.py @@ -37,7 +37,7 @@ def __init__( self.input_operand_source = {Constants.LAYER_OP_I: predecessor} self.output_names = output_names - def propagate(self, tensor: NodeTensor, next_node: Node): + def propagate(self, tensor: NodeTensor, next_node: Node, relevant_axes: list[bool]): """Split the tensor back to the representation needed for producer/consumer.""" # Numpy requires the indices where to split instead of the sizes of the resulting splits @@ -52,5 +52,8 @@ def propagate(self, tensor: NodeTensor, next_node: Node): f"Cannot find this nodes' ({self.name}) outputs {self.output_names} in next nodes' inputs {next_node.input_names}" ) + # Update the relevant_dims with the axis involved in the split + relevant_axes[self.axis] = True + output_tensor = output_tensors[index] - return output_tensor + return output_tensor, relevant_axes diff --git a/stream/workload/dependency_propagation/transpose_node.py b/stream/workload/dependency_propagation/transpose_node.py index d2d2fb23..53cd4301 100644 --- a/stream/workload/dependency_propagation/transpose_node.py +++ b/stream/workload/dependency_propagation/transpose_node.py @@ -22,6 +22,13 @@ def __init__( self.permute_axes = permute_axes self.input_operand_source = {LayerOperand("I"): predecessor} - def propagate(self, tensor: NodeTensor, next_node: Node | None = None) -> NodeTensor: + def propagate( + self, tensor: NodeTensor, next_node: Node | None = None, relevant_axes: list[bool] = [] + ) -> NodeTensor: """Transpose an input tensor.""" - return tensor.transpose(axes=self.permute_axes) + transposed_tensor = tensor.transpose(axes=self.permute_axes) + + for axis in self.permute_axes: + relevant_axes[axis] = True + + return transposed_tensor, relevant_axes From 17d716b0db7a0e87db11a60fe5723e4e1d3bec01 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Sat, 4 Jan 2025 11:40:46 +0100 Subject: [PATCH 43/49] in NodeTensor: if slice > tensor.shape, assign dependency at last size-1 slice --- .../generation/tiled_workload_generation.py | 29 ++++++++++--------- .../workload/computation/computation_node.py | 3 ++ 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index 03ce17fc..a0463866 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -292,7 +292,7 @@ def pad_until_divisible(layer_dim: LayerDim, n: int) -> int: logger.warning(f"Padded layer dimension {dim}: {size} -> {new_size} to be divisible by tiling factors") # Save these extended sizes for later - extended_layer_dim_sizes = deepcopy(tile_attrs.layer_dim_sizes) + original_node.extended_layer_dim_sizes = deepcopy(tile_attrs.layer_dim_sizes) # Take away the outer_temporal_loops to create tiled CNs for this node for loop in outer_temporal_loops: @@ -327,7 +327,7 @@ def pad_until_divisible(layer_dim: LayerDim, n: 
int) -> int: tiles: list[ComputationNode] = [] tensors: list[Tensor] = [] group_id_manager = GroupIdManager( - layer_dim_sizes=extended_layer_dim_sizes, + layer_dim_sizes=original_node.extended_layer_dim_sizes, intra_core_tiling=original_node.intra_core_tiling, inter_core_tiling=original_node.inter_core_tiling, ) @@ -806,12 +806,12 @@ def get_tensor_cns(self, node: ComputationNode, tiles: list[ComputationNode]) -> all(tile.loop_ranges[ir_dim][1] >= node.loop_ranges[ir_dim][1] for ir_dim in ir_dims_output) for tile in tile_list ] - attr_to_add_to = "data_produced_unique" + # attr_to_add_to = "data_produced_unique" precision = node.operand_precision[Constants.FINAL_OUTPUT_LAYER_OP] else: tile_list = list(reversed(tiles)) # list in reversed order should_add_to_tensor_list = [True for _ in tile_list] - attr_to_add_to = "data_consumed_unique" + # attr_to_add_to = "data_consumed_unique" # if this layer is the first layer, we assume the inputs are streamed and "free" precision = node.operand_precision[op] * (not is_source_node) @@ -824,19 +824,20 @@ def get_tensor_cns(self, node: ComputationNode, tiles: list[ComputationNode]) -> # start can be negative for padding which, makes np flip window = tuple([slice(max(0, start), stop) for (start, stop) in op_dim_ranges]) # Count how many nans we have in this window, as this is the amount of unique data consumed/produced by - # this tile + # this tile # TODO this call takes a loooong time, can we optimize this? nb_unique_data_bits = tensors_cns[op].get_nb_empty_elements(window) * precision nb_unique_data_seen += nb_unique_data_bits - # Add this amount of unique data to the "data_consumed_unique" or "data_produced_unique" depending on - # input/output operand - setattr( - tile, - attr_to_add_to, - getattr(tile, attr_to_add_to) + nb_unique_data_bits, - ) - # Set this window of the tensor to indicate it will be consumed/produced by this tile + if op == node.output_operand: + tile.data_produced_unique += nb_unique_data_bits + else: + tile.data_consumed_unique += nb_unique_data_bits + + # This is not guaranteed: tiles of nodes whose ranges have been extended can exceed the NodeTensor shape + # assert all(start < max_stop for (start, _), max_stop in zip(op_dim_ranges, op_dim_ranges_max_stop)) + + # Slices that exceed the max stop are reduced to a size-1 slice at `max_stop-1` bounded_op_dim_ranges = tuple( - slice(max(0, start), min(max_stop, stop)) + slice(max(0, min(max_stop - 1, start)), min(max_stop, stop)) for ((start, stop), max_stop) in zip(op_dim_ranges, op_dim_ranges_max_stop) ) tensors_cns[op] = tensors_cns[op].extend_with_node(bounded_op_dim_ranges, tile) diff --git a/stream/workload/computation/computation_node.py b/stream/workload/computation/computation_node.py index fe2dc2fc..6395f1ab 100644 --- a/stream/workload/computation/computation_node.py +++ b/stream/workload/computation/computation_node.py @@ -6,6 +6,7 @@ from zigzag.utils import hash_sha512 from zigzag.visualization.results.plot_cme import shorten_onnx_layer_name from zigzag.workload.layer_attributes import ( + LayerDimSizes, LayerPadding, ) from zigzag.workload.layer_node import LayerNode, LayerNodeAttributes @@ -83,6 +84,8 @@ def __init__( self.operand_dimensionality_order: dict[LayerOperand, list[LayerDim]] = { layer_op: self.equation.get_r_layer_dims(layer_op) for layer_op in self.equation.get_contained_operands() } + # Sizes can be extended to fit division factors + self.extended_layer_dim_sizes: LayerDimSizes = deepcopy(self.layer_dim_sizes) # adds pr dimensions loop ranges to 
self.loop_ranges self.calculate_pr_loop_ranges() From 06acaaf9e91b26e41d220d79b4c3648e2829e576 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Sat, 4 Jan 2025 14:10:29 +0100 Subject: [PATCH 44/49] refactor scheduler into class --- stream/cost_model/cost_model.py | 32 +- stream/cost_model/scheduler.py | 974 ++++++++++---------- stream/hardware/architecture/accelerator.py | 10 +- 3 files changed, 514 insertions(+), 502 deletions(-) diff --git a/stream/cost_model/cost_model.py b/stream/cost_model/cost_model.py index a7af4694..428910ae 100644 --- a/stream/cost_model/cost_model.py +++ b/stream/cost_model/cost_model.py @@ -1,6 +1,6 @@ from zigzag.datatypes import LayerOperand -from stream.cost_model.scheduler import schedule_graph +from stream.cost_model.scheduler import Schedule from stream.hardware.architecture.accelerator import Accelerator from stream.visualization.memory_usage import plot_memory_usage from stream.visualization.schedule import plot_timeline_brokenaxes @@ -49,22 +49,24 @@ def run(self): manager. This assumes each node in the graph has an energy and runtime of the core to which they are allocated to. """ - results = schedule_graph( - self.workload, - self.accelerator, - operands_to_prefetch=self.operands_to_prefetch, + schedule = Schedule( + G=self.workload, + accelerator=self.accelerator, scheduling_order=self.scheduling_order, + operands_to_prefetch=self.operands_to_prefetch, ) - self.latency = results[0] - self.total_cn_onchip_energy = results[1] - self.total_cn_offchip_link_energy = results[2] - self.total_cn_offchip_memory_energy = results[3] - self.total_eviction_to_offchip_link_energy = results[4] - self.total_eviction_to_offchip_memory_energy = results[5] - self.total_sink_layer_output_offchip_link_energy = results[6] - self.total_sink_layer_output_offchip_memory_energy = results[7] - self.total_core_to_core_link_energy = results[8] - self.total_core_to_core_memory_energy = results[9] + schedule.run() + + self.latency = schedule.latency + self.total_cn_onchip_energy = schedule.total_cn_onchip_energy + self.total_cn_offchip_link_energy = schedule.total_cn_offchip_link_energy + self.total_cn_offchip_memory_energy = schedule.total_cn_offchip_memory_energy + self.total_eviction_to_offchip_link_energy = schedule.total_eviction_to_offchip_link_energy + self.total_eviction_to_offchip_memory_energy = schedule.total_eviction_to_offchip_memory_energy + self.total_sink_layer_output_offchip_link_energy = schedule.total_sink_layer_output_offchip_link_energy + self.total_sink_layer_output_offchip_memory_energy = schedule.total_sink_layer_output_offchip_memory_energy + self.total_core_to_core_link_energy = schedule.total_core_to_core_link_energy + self.total_core_to_core_memory_energy = schedule.total_core_to_core_memory_energy self.energy = ( self.total_cn_onchip_energy diff --git a/stream/cost_model/scheduler.py b/stream/cost_model/scheduler.py index e82f25ea..32917a73 100644 --- a/stream/cost_model/scheduler.py +++ b/stream/cost_model/scheduler.py @@ -15,520 +15,524 @@ logger = logging.getLogger(__name__) -def initialize_priorities(workload: ComputationNodeWorkload, accelerator: "Accelerator"): - for n in workload.node_list: - for tensor in n.operand_tensors.values(): - tensor.initialize_instance_priorities(workload, n, accelerator) - - -def initialize_offchip_tensors(workload: ComputationNodeWorkload, accelerator: "Accelerator"): - offchip_core_id = accelerator.offchip_core_id - assert offchip_core_id is not None, "No offchip core found for this accelerator" - offchip_core = 
accelerator.get_core(offchip_core_id) - offchip_top_instances = accelerator.get_top_instances_of_core(offchip_core_id) - for n in workload.node_list: - for op, tensor in n.operand_tensors.items(): - # For constant operands or inputs of first node - if op in n.constant_operands or (op != Constants.OUTPUT_LAYER_OP and len(workload.in_edges(n)) == 0): - if not any( - ( - accelerator.contains_tensor(tensor, offchip_top_instance) - for offchip_top_instance in offchip_top_instances - ) - ): - memory_op = n.memory_operand_links.layer_to_mem_op(op) - accelerator.spawn( - tensor=tensor, - core=offchip_core, - memory_op=memory_op, - initial_timestep=0, - available_timestep=0, - ) - - -def prefetch_constant_operands( - G: ComputationNodeWorkload, accelerator: "Accelerator", operands_to_prefetch: list[LayerOperand] -): - total_cn_offchip_link_energy = 0 - total_cn_offchip_memory_energy = 0 - total_eviction_to_offchip_link_energy = 0 - total_eviction_to_offchip_memory_energy = 0 - for n in G.node_list: - for op, tensor in n.operand_tensors.items(): - if op in n.constant_operands and op in operands_to_prefetch: - core_allocation = n.chosen_core_allocation - assert core_allocation is not None, "Core should be allocated" - memory_op = n.memory_operand_links.layer_to_mem_op(op) - if not accelerator.contains_tensor(tensor, core_allocation): - ( - _, - transfer_link_energy_cost, - transfer_memory_energy_cost, - eviction_link_energy_cost, - eviction_memory_energy_cost, - came_from_offchip, - ) = accelerator.transfer_tensor_to_core(tensor, core_allocation, memory_op, []) - assert came_from_offchip - total_cn_offchip_link_energy += transfer_link_energy_cost - total_cn_offchip_memory_energy += transfer_memory_energy_cost - total_eviction_to_offchip_link_energy += eviction_link_energy_cost - total_eviction_to_offchip_memory_energy += eviction_memory_energy_cost - return ( - total_cn_offchip_link_energy, - total_cn_offchip_memory_energy, - total_eviction_to_offchip_link_energy, - total_eviction_to_offchip_memory_energy, - ) - - -def get_best_candidate( - candidates: list[tuple[int, ComputationNode]], scheduling_order: list[tuple[int, int]] -) -> tuple[ComputationNode, int]: - # If this core doesn't have any candidates, continue to the next core - if not candidates: - raise ValueError("There are no candidates to schedule.") - preds_ends, cn_candidates = zip(*candidates) - cn_candidates: list[ComputationNode] - idxs = [scheduling_order.index((n.id, n.sub_id)) for n in cn_candidates] - best_candidate_idx = idxs.index(min(idxs)) - best_candidate = cn_candidates[best_candidate_idx] - preds_end = preds_ends[best_candidate_idx] - # Remove the candidate from the list of candidates - del candidates[best_candidate_idx] - return best_candidate, preds_end - - -def get_tensors_needed_for_node(node: ComputationNode, G: ComputationNodeWorkload): - """Determine all the tensors needed to compute a node. - The node might need multiple outputs from previous nodes, depending on the graph. - - Args: - node (ComputationNode): The node to be computed. - G : The graph of all nodes. - - Returns: - tuple: A tuple of tensors and a tuple of memory operands for the node. 
- """ - tensors_this_candidate_needs: list[Tensor] = [] - tensors_operands: list[MemoryOperand] = [] - # Constant operands - for layer_op in node.constant_operands: - memory_op = node.memory_operand_links.layer_to_mem_op(layer_op) - if memory_op in node.too_large_operands: - continue - tensors_this_candidate_needs.append(node.operand_tensors[layer_op]) - tensors_operands.append(memory_op) - # Non-constant operands - for pred, node, edge_data in sorted(G.in_edges(node, data=True), key=itemgetter(0)): - if pred.id == node.id: - continue # Skip if predecessor was from the same layer (intra-edge) - consumer_layer_op: LayerOperand = edge_data["operand"] - consumer_memory_op = node.memory_operand_links.layer_to_mem_op(consumer_layer_op) - if consumer_memory_op in node.too_large_operands: - continue # Skip if tensor will be fetched fromm offchip throughout computation - pred_output_tensor = pred.operand_tensors[pred.output_operand] - tensors_this_candidate_needs.append(pred_output_tensor) - tensors_operands.append(consumer_memory_op) - if tensors_this_candidate_needs: - # Sort these tensors based on their earliest possible transfer time - tensors_this_candidate_needs, tensors_operands = zip( - *sorted(zip(tensors_this_candidate_needs, tensors_operands)) - ) - return tensors_this_candidate_needs, tensors_operands - - -def clear_memories( - accelerator: "Accelerator", - core: Core, - memory_operands: list[MemoryOperand], - timestep: int, - exceptions: list[Tensor] = [], - transfer_bandwidth_fraction: float = 1, -): - total_eviction_to_offchip_link_energy = 0 - total_eviction_to_offchip_memory_energy = 0 - for too_large_operand in memory_operands: - ( - timestep, - eviction_link_energy_cost, - eviction_memory_energy_cost, - ) = accelerator.remove_all( - core=core, - memory_operand=too_large_operand, - timestep=timestep, - exceptions=exceptions, - transfer_bandwidth_fraction=transfer_bandwidth_fraction, - write_back_to_offchip=True, - ) - total_eviction_to_offchip_link_energy += eviction_link_energy_cost - total_eviction_to_offchip_memory_energy += eviction_memory_energy_cost - return ( - total_eviction_to_offchip_link_energy, - total_eviction_to_offchip_memory_energy, - timestep, - ) - - -def decrease_priority( - tensors: list[Tensor], - tensors_operands: list[MemoryOperand], - accelerator: "Accelerator", - node: ComputationNode, -): - for tensor_used_by_node, tensor_memory_operand in zip(tensors, tensors_operands): - # TODO: tensor_memory_operand will be 'O' for activation tensors. - # TODO: If the memory between input and output is not shared, this will give a wrong instance. 
- assert node.chosen_core_allocation is not None - top_instance = accelerator.get_top_instance_of_core(node.chosen_core_allocation, tensor_memory_operand) - tensor_used_by_node.instance_priorities[top_instance] -= 1 - - -def check_for_removal( - tensors: list[Tensor], - accelerator: "Accelerator", - G: ComputationNodeWorkload, - timestep: int, - transfer_bandwidth_fraction: float = 1, -): - offchip_core_id = accelerator.offchip_core_id - for tensor_used_by_node in tensors: - if tensor_used_by_node.get_total_priority() == 0: - instances_storing_tensor, _ = accelerator.memory_manager.find_tensor_in_top_instances(tensor_used_by_node) - for instance_storing_tensor in instances_storing_tensor: - core_ids_of_instance = [ - core.id for core in accelerator.memory_manager.cores_per_top_instance[instance_storing_tensor] - ] - # If this tensor is an output tensor, find all nodes that needed it - # to get an accurate timestep at which it can be removed - timestep_for_removal = timestep - if tensor_used_by_node.layer_operand == tensor_used_by_node.origin.output_operand: - origin = tensor_used_by_node.origin - if offchip_core_id in core_ids_of_instance: - # If wanting to discard it from offchip, look at the max end time across all successors - nodes_that_needed_tensor = [n for n in G.successors(origin) if n.id != origin.id] - else: - # If discarding it from a regular core, look at the max end time successors that used it from - # that instance - nodes_that_needed_tensor = [ - n - for n in G.successors(origin) - if n.chosen_core_allocation in core_ids_of_instance and n.id != origin.id - ] - end_times = [n.end for n in nodes_that_needed_tensor if n.end is not None] - max_end_time = max(end_times, default=timestep_for_removal) - # assert max_end_time != -1, "There should be at least one successor." - timestep_for_removal = max_end_time - - # Get a core tied to the top_instance we want to remove it on. - core = accelerator.memory_manager.cores_per_top_instance[instance_storing_tensor][0] - accelerator.remove( - tensor_used_by_node, - core, - tensor_used_by_node.memory_operand, - timestep_for_removal, - transfer_bandwidth_fraction=transfer_bandwidth_fraction, +class Schedule: + def __init__( + self, + G: ComputationNodeWorkload, + accelerator: "Accelerator", + scheduling_order: list[tuple[int, int]], + cores_idle_from: dict[int, int] | None = None, + operands_to_prefetch: list[LayerOperand] = [], + ): + """ + Args: + G: Graph containing the nodes to be scheduled. + accelerator: The accelerator to schedule the nodes on. + scheduling_order: + cores_idle_from: A dict containing for each core_id its start offset. Defaults to None. + operands_to_prefetch: The layer operands that should be prefetched at the start of the schedule. + """ + self.G = G + self.accelerator = accelerator + self.scheduling_order = scheduling_order + self.operands_to_prefetch = operands_to_prefetch + + core_ids = set(n.chosen_core_allocation for n in G.node_list) + assert None not in core_ids, "Not all nodes have core allocation. Insert SetFixedAllocationPerformanceStage." 
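# --- Editorial sketch (not part of the patch): the priority bookkeeping that
# decrease_priority / check_for_removal rely on, reduced to plain dicts. Each tensor
# tracks, per top memory instance, how many not-yet-scheduled consumers still need it;
# once the total reaches zero the tensor may be evicted from the instances holding it.
# Instance names below are invented for illustration.
instance_priorities = {"core0_sram": 2, "offchip_dram": 1}

def decrease_priority(instance: str) -> None:
    instance_priorities[instance] -= 1

def total_priority() -> int:
    return sum(instance_priorities.values())

decrease_priority("core0_sram")    # first consumer on core 0 scheduled
decrease_priority("core0_sram")    # second consumer on core 0 scheduled
decrease_priority("offchip_dram")  # last consumer reading from off-chip
assert total_priority() == 0       # -> the tensor can now be removed everywhere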
+ self.all_core_ids: list[int] = sorted(list(core_ids)) # type: ignore + self.cores_idle_from = cores_idle_from if cores_idle_from else {core_id: 0 for core_id in self.all_core_ids} + + # Initialize the schedule results + self.latency = 0 + self.total_cn_onchip_energy = 0 + self.total_cn_offchip_link_energy = 0 + self.total_cn_offchip_memory_energy = 0 + self.total_eviction_to_offchip_link_energy = 0 + self.total_eviction_to_offchip_memory_energy = 0 + self.total_sink_layer_output_offchip_link_energy = 0 + self.total_sink_layer_output_offchip_memory_energy = 0 + self.total_core_to_core_link_energy = 0 + self.total_core_to_core_memory_energy = 0 + + # Remains constant throughout the scheduling + self.sink_layer_nodes = self.get_sink_layer_nodes() + self.offchip_core = accelerator.get_offchip_core() + self.nb_graph_nodes = G.number_of_nodes() + + # Initialize bookkeeping + self.nb_scheduled_nodes = 0 + self.scheduled_nodes: set[ComputationNode] = set() + self.candidates = self.get_initial_candidates() + self.initialize_tensor_priorities() + self.initialize_offchip_tensors() + + def get_initial_candidates(self): + """Put the very first nodes of a layer that doesn't have any incoming edges as the first candidates""" + candidates: list[tuple[int, ComputationNode]] = [] + for source_node in (n for n, d in self.G.in_degree() if d == 0): + core_allocation = source_node.chosen_core_allocation + candidates.append((self.cores_idle_from[core_allocation], source_node)) # type: ignore + return candidates + + def get_sink_layer_nodes(self): + """Get all the nodes with no successors that produce final outputs, used for off-loading final outputs""" + sink_layer_ids = self.G.get_sink_layer_ids() + sink_layer_nodes = set((n for n in self.G.node_list if (n.id in sink_layer_ids) and n.produces_final_output)) + return sink_layer_nodes + + def initialize_tensor_priorities(self): + """Initialize the memory instance priorities for each tensor in the workload.""" + for n in self.G.node_list: + for tensor in n.operand_tensors.values(): + tensor.initialize_instance_priorities(self.G, n, self.accelerator) + + def initialize_offchip_tensors(self): + """Add the constant operand tensors of all nodes to the off-chip initially.""" + offchip_top_instances = self.accelerator.get_top_instances_of_core(self.offchip_core) + for n in self.G.node_list: + for op, tensor in n.operand_tensors.items(): + # For constant operands or inputs of first node + if op in n.constant_operands or (op != Constants.OUTPUT_LAYER_OP and len(self.G.in_edges(n)) == 0): + if not any( + ( + self.accelerator.contains_tensor(tensor, offchip_top_instance) + for offchip_top_instance in offchip_top_instances + ) + ): + memory_op = n.memory_operand_links.layer_to_mem_op(op) + self.accelerator.spawn( + tensor=tensor, + core=self.offchip_core, + memory_op=memory_op, + initial_timestep=0, + available_timestep=0, + ) + + def run(self): + nb_scheduled_nodes = 0 + done = False + + self.prefetch_constant_operands() + + while not done: + best_candidate, preds_end = self.pop_best_candidate() + tensors_this_candidate_needs, tensors_operands = self.get_tensors_needed_for_node(best_candidate) + core = self.get_core_for_node(best_candidate) + transfer_bw_fraction = self.get_transfer_bandwidth_fraction(best_candidate) + + # Step 0: get the start time: when core is available or predecessors finished + self.sync_cores_idle_from(best_candidate) + core_idle_from = self.cores_idle_from[core.id] + timestep = max(core_idle_from, preds_end) + + # Step 1: for operands that are too 
large to store in the core's memory, clear the memory so ZigZag can + # optimize the loop ordering using the full memory size + transfer_complete_timestep = self.clear_memories( + core=core, + memory_operands=best_candidate.too_large_operands, + timestep=timestep, + exceptions=tensors_this_candidate_needs, + transfer_bandwidth_fraction=transfer_bw_fraction, + ) + timestep = transfer_complete_timestep + + # Step 2: Transfer the tensors needed for this node to the core (from off-chip or from another core) + for tensor, tensor_operand in zip(tensors_this_candidate_needs, tensors_operands): + transfer_complete_timestep = self.transfer_tensor_to_core( + tensor=tensor, + tensor_operand=tensor_operand, + receiving_core=core, + non_evictable_tensors=tensors_this_candidate_needs, + earliest_t=core_idle_from, + transfer_bandwidth_fraction=transfer_bw_fraction, ) + timestep = max(timestep, transfer_complete_timestep) + + # Step 3: make space for the output tensor of this node + output_layer_operand = best_candidate.output_operand + output_memory_operand = best_candidate.memory_operand_links.layer_to_mem_op(output_layer_operand) + output_tensor = best_candidate.operand_tensors[output_layer_operand] + core_to_add_output_to = ( + self.offchip_core if output_memory_operand in best_candidate.too_large_operands else core + ) + transfer_complete_timestep = self.make_space_for_tensor( + output_tensor, + core_to_add_output_to, + output_memory_operand, + timestep, + tensors_this_candidate_needs, + transfer_bandwidth_fraction=transfer_bw_fraction, + ) + timestep = transfer_complete_timestep + + # Step 4: If any operands are too large to store in memory, find a window and block off-chip links for the + # runtime duration + blocking_can_start_timestep = self.accelerator.block_offchip_links( + too_large_operands=best_candidate.too_large_operands, + core_id=core.id, + start_timestep=timestep, + duration=best_candidate.get_runtime(), + cn=best_candidate, + ) + timestep = blocking_can_start_timestep + + # Step 5: Register the scheduling decision for this node and spawn the output tensor + node_end_timestep = self.register_scheduled_node( + node=best_candidate, + start_time=timestep, + output_tensor=output_tensor, + output_memory_operand=output_memory_operand, + core_to_add_output_to=core_to_add_output_to, + core_to_run_on=core, + ) + timestep = node_end_timestep + + # Step 6: manage memory usage when the node ends + self.decrease_priority(tensors_this_candidate_needs, tensors_operands, best_candidate) + self.check_for_removal(tensors_this_candidate_needs, timestep, transfer_bw_fraction) + self.remove_sink_node_tensor( + node=best_candidate, + tensor_to_remove=output_tensor, + core_to_remove_from=core, + timestep=timestep, + transfer_bandwidth_fraction=transfer_bw_fraction, + ) - -def sync_cores_idle_from( - cores_idle_from: dict[int, int], - G: ComputationNodeWorkload, - best_candidate: ComputationNode, - scheduling_order: list[tuple[int, int]], -): - """ - Sync the cores_idle_from dict values if the best candidate is the first node of a layer and we detect layer-by-layer - execution. The layer-by-layer execution is detected through the scheduling_order. 
- """ - # Get the predecessor ids of the best_candidate from the workload graph G - predecessor_ids = [pred.id for pred in G.predecessors(best_candidate) if pred.id != best_candidate.id] - predecessor_idxs = [i for i in range(len(scheduling_order)) if scheduling_order[i][0] in predecessor_ids] - - best_candidate_idx = scheduling_order.index((best_candidate.id, best_candidate.sub_id)) - if scheduling_order[best_candidate_idx - 1][0] != best_candidate.id and all( - (i < best_candidate_idx for i in predecessor_idxs) + # Step 7: finish this round + self.extend_candidates(best_candidate) + nb_scheduled_nodes += 1 + done = nb_scheduled_nodes == self.nb_graph_nodes + + self.latency = self.get_total_latency() + return self.latency + + def get_transfer_bandwidth_fraction(self, node: ComputationNode): + """Get the fraction of the off-chip bandwidth to be used for the tensor transfers related to this node""" + return 1 / node.get_total_inter_core_splits() + + def prefetch_constant_operands(self): + """Load the `operands_to_prefetch` to the cores they belong to and log the energy costs.""" + for n in self.G.node_list: + for op, tensor in n.operand_tensors.items(): + if op in n.constant_operands and op in self.operands_to_prefetch: + core_allocation = n.chosen_core_allocation + assert core_allocation is not None, "Core should be allocated" + memory_op = n.memory_operand_links.layer_to_mem_op(op) + if not self.accelerator.contains_tensor(tensor, core_allocation): + ( + _, + transfer_link_energy_cost, + transfer_memory_energy_cost, + eviction_link_energy_cost, + eviction_memory_energy_cost, + came_from_offchip, + ) = self.accelerator.transfer_tensor_to_core(tensor, core_allocation, memory_op, []) + assert came_from_offchip + self.total_cn_offchip_link_energy += transfer_link_energy_cost + self.total_cn_offchip_memory_energy += transfer_memory_energy_cost + self.total_eviction_to_offchip_link_energy += eviction_link_energy_cost + self.total_eviction_to_offchip_memory_energy += eviction_memory_energy_cost + + def pop_best_candidate(self) -> tuple[ComputationNode, int]: + """Get the best candidate node to schedule next, given the selection priority. Remove that candidate from the + list of candidates and return it.""" + if not self.candidates: + raise ValueError("There are no candidates to schedule.") + preds_ends, cn_candidates = zip(*self.candidates) + cn_candidates: list[ComputationNode] + idxs = [self.scheduling_order.index((n.id, n.sub_id)) for n in cn_candidates] + best_candidate_idx = idxs.index(min(idxs)) + best_candidate = cn_candidates[best_candidate_idx] + preds_end = preds_ends[best_candidate_idx] + # Remove the candidate from the list of candidates + del self.candidates[best_candidate_idx] + return best_candidate, preds_end + + def sync_cores_idle_from( + self, + best_candidate: ComputationNode, ): - # If the best_candidate is the first node of a layer and all nodes of predecessor layers have been scheduled - # Sync the cores_idle_from dict - max_idle_time = max(cores_idle_from.values()) - for core_id in cores_idle_from: - cores_idle_from[core_id] = max_idle_time - - -def schedule_graph( - G: ComputationNodeWorkload, - accelerator: "Accelerator", - scheduling_order: list[tuple[int, int]], - cores_idle_from: dict[int, int] | None = None, - operands_to_prefetch: list[LayerOperand] = [], -) -> tuple[int, float, float, float, float, float, float, float, float, float]: - """Schedule the nodes of graph G across the cores in the system. - Each node should have a core_allocation and runtime set. 
- - Args: - G : Graph containing the nodes to be scheduled. - accelerator: The accelerator to schedule the nodes on. - scheduling_order: - cores_idle_from: A dict containing for each core_id its start offset. Defaults to None. - operands_to_prefetch: The layer operands that should be prefetched at the start of the schedule. - """ - # Initialize total link energy cost and memory energy costs - total_cn_onchip_energy = 0 - total_cn_offchip_link_energy = 0 - total_cn_offchip_memory_energy = 0 - total_eviction_to_offchip_link_energy = 0 - total_eviction_to_offchip_memory_energy = 0 - total_sink_layer_output_offchip_link_energy = 0 - total_sink_layer_output_offchip_memory_energy = 0 - total_core_to_core_link_energy = 0 - total_core_to_core_memory_energy = 0 - - core_ids = set(n.chosen_core_allocation for n in G.node_list) - assert ( - None not in core_ids - ), "Make sure all nodes have a core allocation. Insert SetFixedAllocationPerformanceStage." - all_core_ids: list[int] = sorted(list(core_ids)) # type: ignore - - if cores_idle_from is None: - # Make it 0 for all cores - cores_idle_from = {core_allocation: 0 for core_allocation in all_core_ids} - - nb_graph_nodes = G.number_of_nodes() - nb_scheduled_nodes = 0 - scheduled_nodes: set[ComputationNode] = set() - - # List that keeps all possible candidate nodes for each core. - candidates: list[tuple[int, ComputationNode]] = [] - - # Put the very first nodes of a layer that doesn't have any incoming edges as the first candidates - for source_node in (n for n, d in G.in_degree() if d == 0): - core_allocation = source_node.chosen_core_allocation - candidates.append((cores_idle_from[core_allocation], source_node)) # type: ignore - - # Get all the nodes with no successors that produce final outputs, used for off-loading final outputs - sink_layer_ids = G.get_sink_layer_ids() - sink_layer_nodes = set((n for n in G.node_list if (n.id in sink_layer_ids) and n.produces_final_output)) - - # Get the offchip core id and core - offchip_core_id = accelerator.offchip_core_id - assert offchip_core_id is not None - offchip_core = accelerator.get_core(offchip_core_id) - - # Schedule preparation: - # 1. Initialize the memory instance priorities for each tensor - initialize_priorities(G, accelerator) - # 2. Add the constant operand tensors of all nodes to the off-chip initially - initialize_offchip_tensors(G, accelerator) - # 3. 
Prefetch the constant operands that should be prefetched to their core - ( - prefetch_cn_offchip_link_energy, - prefetch_cn_offchip_memory_energy, - prefetch_eviction_to_offchip_link_energy, - prefetch_eviction_to_offchip_memory_energy, - ) = prefetch_constant_operands(G, accelerator, operands_to_prefetch) - total_cn_offchip_link_energy += prefetch_cn_offchip_link_energy - total_cn_offchip_memory_energy += prefetch_cn_offchip_memory_energy - total_eviction_to_offchip_link_energy += prefetch_eviction_to_offchip_link_energy - total_eviction_to_offchip_memory_energy += prefetch_eviction_to_offchip_memory_energy - - done = False - while not done: - # Get the best candidate given the selection priority - best_candidate, preds_end = get_best_candidate(candidates, scheduling_order) - # Sync cores_idle_from variable for layer-by-layer scheduling - sync_cores_idle_from(cores_idle_from, G, best_candidate, scheduling_order) - # Get the core this candidate will be scheduled on - core_id = best_candidate.chosen_core_allocation + """ + Sync the cores_idle_from dict values if the best candidate is the first node of a layer and we detect + layer-by-layer execution. The layer-by-layer execution is detected through the scheduling_order. + """ + # Get the predecessor ids of the best_candidate from the workload graph G + predecessor_ids = [pred.id for pred in self.G.predecessors(best_candidate) if pred.id != best_candidate.id] + predecessor_idxs = [ + i for i in range(len(self.scheduling_order)) if self.scheduling_order[i][0] in predecessor_ids + ] + + best_candidate_idx = self.scheduling_order.index((best_candidate.id, best_candidate.sub_id)) + if self.scheduling_order[best_candidate_idx - 1][0] != best_candidate.id and all( + (i < best_candidate_idx for i in predecessor_idxs) + ): + # If the best_candidate is the first node of a layer and all nodes of predecessor layers have been scheduled + # Sync the cores_idle_from dict + max_idle_time = max(self.cores_idle_from.values()) + for core_id in self.cores_idle_from: + self.cores_idle_from[core_id] = max_idle_time + + def get_core_for_node(self, node: ComputationNode): + """Get the core this candidate will be scheduled on""" + core_id = node.chosen_core_allocation assert core_id is not None - core = accelerator.get_core(core_id) - - # Fraction of the off-chip bandwidth to be used for the tensor transfers related to this node - transfer_bandwidth_fraction = 1 / best_candidate.get_total_inter_core_splits() - - # Earliest start time is when core is available or predecessors finished - core_idle_from = cores_idle_from[core_id] - start = max(core_idle_from, preds_end) - timestep = start - - # Step 0 - tensors_this_candidate_needs, tensors_operands = get_tensors_needed_for_node(best_candidate, G) - - # Step 1 - # There could be operands that are too large to store in the highest memory on the core - # The tensors stored in these memories should be evicted and potentially written back to off-chip - # Clear these memories (this might delay the potential start time if things have to written to off-chip) - ( - clear_link_energy, - clear_memory_energy, - memory_cleared_timestep, - ) = clear_memories( - accelerator=accelerator, - core=core, - memory_operands=best_candidate.too_large_operands, - timestep=timestep, - exceptions=tensors_this_candidate_needs, - transfer_bandwidth_fraction=transfer_bandwidth_fraction, - ) - total_eviction_to_offchip_link_energy += clear_link_energy - total_eviction_to_offchip_memory_energy += clear_memory_energy - timestep = 
memory_cleared_timestep
-
-        # Step 2
-        # The computation might need tensors that are currently not present in the core's memories
-        # We need to fetch these tensors from either off-chip or from the core where they are present
-        # Transfer these tensors from wherever they are currently residing to this core
-        for tensor, tensor_operand in zip(tensors_this_candidate_needs, tensors_operands):
-            # Transfer the tensor
+        return self.accelerator.get_core(core_id)
+
+    def get_tensors_needed_for_node(self, node: ComputationNode):
+        """Determine all the tensors needed to compute a node.
+        The node might need multiple outputs from previous nodes, depending on the workload graph `self.G`.
+
+        Args:
+            node (ComputationNode): The node to be computed.
+
+        Returns:
+            A tuple of tensors and a tuple of memory operands for the node.
+        """
+        tensors_this_candidate_needs: list[Tensor] = []
+        tensors_operands: list[MemoryOperand] = []
+        # Constant operands
+        for layer_op in node.constant_operands:
+            memory_op = node.memory_operand_links.layer_to_mem_op(layer_op)
+            if memory_op in node.too_large_operands:
+                continue
+            tensors_this_candidate_needs.append(node.operand_tensors[layer_op])
+            tensors_operands.append(memory_op)
+        # Non-constant operands
+        for pred, node, edge_data in sorted(self.G.in_edges(node, data=True), key=itemgetter(0)):
+            if pred.id == node.id:
+                continue  # Skip if predecessor was from the same layer (intra-edge)
+            consumer_layer_op: LayerOperand = edge_data["operand"]
+            consumer_memory_op = node.memory_operand_links.layer_to_mem_op(consumer_layer_op)
+            if consumer_memory_op in node.too_large_operands:
+                continue  # Skip if tensor will be fetched from offchip throughout computation
+            pred_output_tensor = pred.operand_tensors[pred.output_operand]
+            tensors_this_candidate_needs.append(pred_output_tensor)
+            tensors_operands.append(consumer_memory_op)
+        if tensors_this_candidate_needs:
+            # Sort these tensors based on their earliest possible transfer time
+            tensors_this_candidate_needs, tensors_operands = zip(
+                *sorted(zip(tensors_this_candidate_needs, tensors_operands))
+            )
+        return tensors_this_candidate_needs, tensors_operands
+
+    def clear_memories(
+        self,
+        core: Core,
+        memory_operands: list[MemoryOperand],
+        timestep: int,
+        exceptions: list[Tensor] = [],
+        transfer_bandwidth_fraction: float = 1,
+    ):
+        for too_large_operand in memory_operands:
             (
-                transfer_complete_timestep,
-                transfer_link_energy_cost,
-                transfer_memory_energy_cost,
+                timestep,
                 eviction_link_energy_cost,
                 eviction_memory_energy_cost,
-                came_from_offchip,
-            ) = accelerator.transfer_tensor_to_core(
-                tensor,
-                core_id,
-                tensor_operand,
-                tensors_this_candidate_needs,
-                earliest_t=core_idle_from,
+            ) = self.accelerator.remove_all(
+                core=core,
+                memory_operand=too_large_operand,
+                timestep=timestep,
+                exceptions=exceptions,
                 transfer_bandwidth_fraction=transfer_bandwidth_fraction,
+                write_back_to_offchip=True,
             )
-            # Update the possible start time of this node
-            timestep = max(timestep, transfer_complete_timestep)
-            # Add the energy costs to their respective trackers
-            if came_from_offchip:
-                total_cn_offchip_link_energy += transfer_link_energy_cost
-                total_cn_offchip_memory_energy += transfer_memory_energy_cost
-            else:
-                total_core_to_core_link_energy += transfer_link_energy_cost
-                total_core_to_core_memory_energy += transfer_memory_energy_cost
-            total_eviction_to_offchip_link_energy += eviction_link_energy_cost
-            total_eviction_to_offchip_memory_energy += eviction_memory_energy_cost
-
-        # Step 3
-        # Make space for 
the output tensor of this computation node and spawn it when evictions are complete - # If the output operand is in the too large operands, add it to off-chip, otherwise add it to this core's - # output memory - output_layer_operand = best_candidate.output_operand - output_memory_operand = best_candidate.memory_operand_links.layer_to_mem_op(output_layer_operand) - output_tensor = best_candidate.operand_tensors[output_layer_operand] - core_to_add_output_to = offchip_core if output_memory_operand in best_candidate.too_large_operands else core + self.total_eviction_to_offchip_link_energy += eviction_link_energy_cost + self.total_eviction_to_offchip_memory_energy += eviction_memory_energy_cost + return timestep + + def transfer_tensor_to_core( + self, + tensor: Tensor, + tensor_operand: MemoryOperand, + receiving_core: Core, + non_evictable_tensors: list[Tensor], + sending_core_id: int | None = None, + earliest_t: int = 0, + transfer_bandwidth_fraction: float = 1, + ): ( - evictions_complete_timestep, + transfer_complete_timestep, + transfer_link_energy_cost, + transfer_memory_energy_cost, eviction_link_energy_cost, eviction_memory_energy_cost, - ) = accelerator.make_space_for( - output_tensor, - core_to_add_output_to, - output_memory_operand, - timestep, - tensors_this_candidate_needs, + came_from_offchip, + ) = self.accelerator.transfer_tensor_to_core( + tensor, + receiving_core.id, + tensor_operand, + non_evictable_tensors, + earliest_t=earliest_t, transfer_bandwidth_fraction=transfer_bandwidth_fraction, ) - total_eviction_to_offchip_link_energy += eviction_link_energy_cost - total_eviction_to_offchip_memory_energy += eviction_memory_energy_cost - timestep = evictions_complete_timestep - - # Step 4 - # Check if we had any operands that were too large to store in the core's memory, block the relevant off-chip - # link for the duration - # This might again delay the execution if the offchip link was already blocked by another core - blocking_can_start_timestep = accelerator.block_offchip_links( - too_large_operands=best_candidate.too_large_operands, - core_id=core_id, - start_timestep=timestep, - duration=best_candidate.get_runtime(), - cn=best_candidate, + + # Add the energy costs to their respective trackers + if came_from_offchip: + self.total_cn_offchip_link_energy += transfer_link_energy_cost + self.total_cn_offchip_memory_energy += transfer_memory_energy_cost + else: + self.total_core_to_core_link_energy += transfer_link_energy_cost + self.total_core_to_core_memory_energy += transfer_memory_energy_cost + self.total_eviction_to_offchip_link_energy += eviction_link_energy_cost + self.total_eviction_to_offchip_memory_energy += eviction_memory_energy_cost + return transfer_complete_timestep + + def make_space_for_tensor( + self, + tensor: Tensor, + core: Core, + memory_operand: MemoryOperand, + timestep: int, + tensors_to_avoid_evicting: list[Tensor] = [], + transfer_bandwidth_fraction: float = 1, + ): + ( + evictions_complete_timestep, + eviction_link_energy_cost, + eviction_memory_energy_cost, + ) = self.accelerator.make_space_for( + tensor=tensor, + core=core, + memory_op=memory_operand, + timestep=timestep, + tensors_to_avoid_evicting=tensors_to_avoid_evicting, + transfer_bandwidth_fraction=transfer_bandwidth_fraction, ) - timestep = blocking_can_start_timestep + self.total_eviction_to_offchip_link_energy += eviction_link_energy_cost + self.total_eviction_to_offchip_memory_energy += eviction_memory_energy_cost + return evictions_complete_timestep + + def register_scheduled_node( + self, 
+ node: ComputationNode, + start_time: int, + output_tensor: Tensor, + output_memory_operand: MemoryOperand, + core_to_add_output_to: Core, + core_to_run_on: Core, + ): + """Spawn the output tensor and register the runtimes and energies of the node.""" - # Step 5 - # Spawn the output tensor and update the start and end time of the node - start = timestep - end = start + best_candidate.get_runtime() - accelerator.spawn( + end_time = start_time + node.get_runtime() + self.accelerator.spawn( output_tensor, core_to_add_output_to, output_memory_operand, - initial_timestep=start, - available_timestep=end, - ) - best_candidate.set_start(start) - best_candidate.set_end(end) - cores_idle_from[core_id] = end - - # Add the computation energy of running this node - total_cn_onchip_energy += best_candidate.get_onchip_energy() - total_cn_offchip_memory_energy += best_candidate.get_offchip_energy() - - # Add this node to the scheduled nodes - scheduled_nodes.add(best_candidate) - - # Step 6 - # Memory usage: When the node ends: - # Decrease the priority of all the tensors this node used - decrease_priority(tensors_this_candidate_needs, tensors_operands, accelerator, best_candidate) - # Remove the tensor if the priority is zero - check_for_removal( - tensors=tensors_this_candidate_needs, - accelerator=accelerator, - G=G, - timestep=end, - transfer_bandwidth_fraction=transfer_bandwidth_fraction, + initial_timestep=start_time, + available_timestep=end_time, ) - - # Step 7 - # Memory usage: When the node ends: - # If this node is a sink node (node that has no successors and that produces a final output), transfer final - # outputs to offchip - if best_candidate in sink_layer_nodes: + node.set_start(start_time) + node.set_end(end_time) + self.cores_idle_from[core_to_run_on.id] = end_time + self.scheduled_nodes.add(node) + + self.total_cn_onchip_energy += node.get_onchip_energy() + self.total_cn_offchip_memory_energy += node.get_offchip_energy() + return end_time + + def remove_sink_node_tensor( + self, + node: ComputationNode, + tensor_to_remove: Tensor, + core_to_remove_from: Core, + timestep: int, + transfer_bandwidth_fraction: float, + ): + """If this node is a sink node (node that has no successors and that produces a final output), transfer final + outputs to offchip + """ + if node in self.sink_layer_nodes: # Only push back sink node outputs if they're generated and stored on the core - if Constants.OUTPUT_MEM_OP not in best_candidate.too_large_operands: + if Constants.OUTPUT_MEM_OP not in node.too_large_operands: ( _, link_energy_cost, memory_energy_cost, - ) = accelerator.remove( - tensor=output_tensor, - core=core, - memory_op=output_tensor.memory_operand, - timestep=end, + ) = self.accelerator.remove( + tensor=tensor_to_remove, + core=core_to_remove_from, + memory_op=tensor_to_remove.memory_operand, + timestep=timestep, transfer_bandwidth_fraction=transfer_bandwidth_fraction, write_back_to_offchip=True, ) - total_sink_layer_output_offchip_link_energy += link_energy_cost - total_sink_layer_output_offchip_memory_energy += memory_energy_cost + self.total_sink_layer_output_offchip_link_energy += link_energy_cost + self.total_sink_layer_output_offchip_memory_energy += memory_energy_cost + + def decrease_priority( + self, + tensors: list[Tensor], + tensors_operands: list[MemoryOperand], + node: ComputationNode, + ): + for tensor_used_by_node, tensor_memory_operand in zip(tensors, tensors_operands): + # TODO: tensor_memory_operand will be 'O' for activation tensors. 
+            # TODO: If the memory between input and output is not shared, this will give a wrong instance.
+            assert node.chosen_core_allocation is not None
+            top_instance = self.accelerator.get_top_instance_of_core(node.chosen_core_allocation, tensor_memory_operand)
+            tensor_used_by_node.instance_priorities[top_instance] -= 1
+
+    def check_for_removal(
+        self,
+        tensors: list[Tensor],
+        timestep: int,
+        transfer_bandwidth_fraction: float = 1,
+    ):
+        """Remove the tensor from the core if its priority is zero."""
+        for tensor_used_by_node in tensors:
+            if tensor_used_by_node.get_total_priority() == 0:
+                instances_storing_tensor, _ = self.accelerator.memory_manager.find_tensor_in_top_instances(
+                    tensor_used_by_node
+                )
+                for instance_storing_tensor in instances_storing_tensor:
+                    core_ids_of_instance = [
+                        core.id
+                        for core in self.accelerator.memory_manager.cores_per_top_instance[instance_storing_tensor]
+                    ]
+                    # If this tensor is an output tensor, find all nodes that needed it
+                    # to get an accurate timestep at which it can be removed
+                    timestep_for_removal = timestep
+                    if tensor_used_by_node.layer_operand == tensor_used_by_node.origin.output_operand:
+                        origin = tensor_used_by_node.origin
+                        if self.offchip_core.id in core_ids_of_instance:
+                            # When discarding it from off-chip, look at the max end time across all successors
+                            nodes_that_needed_tensor = [n for n in self.G.successors(origin) if n.id != origin.id]
+                        else:
+                            # When discarding it from a regular core, look at the max end time of the successors that
+                            # used it from that instance
+                            nodes_that_needed_tensor = [
+                                n
+                                for n in self.G.successors(origin)
+                                if n.chosen_core_allocation in core_ids_of_instance and n.id != origin.id
+                            ]
+                        end_times = [n.end for n in nodes_that_needed_tensor if n.end is not None]
+                        max_end_time = max(end_times, default=timestep_for_removal)
+                        # assert max_end_time != -1, "There should be at least one successor."
+                        timestep_for_removal = max_end_time
+
+                    # Get a core tied to the top_instance we want to remove it from.
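+                    # (Any core attached to this top instance maps to the same memory, so the first one suffices.)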
+ core = self.accelerator.memory_manager.cores_per_top_instance[instance_storing_tensor][0] + self.accelerator.remove( + tensor_used_by_node, + core, + tensor_used_by_node.memory_operand, + timestep_for_removal, + transfer_bandwidth_fraction=transfer_bandwidth_fraction, + ) - # Step 8 - # For each successor of this node, check if all of its predecessors have been scheduled - for successor in sorted(G.successors(best_candidate)): - if all((pred in scheduled_nodes for pred in G.predecessors(successor))): + def extend_candidates(self, node: ComputationNode): + """For each successor of this node, check if all of its predecessors have been scheduled""" + for successor in sorted(self.G.successors(node)): + if all((pred in self.scheduled_nodes for pred in self.G.predecessors(successor))): preds_end = max( - (predecessor.end for predecessor in G.predecessors(successor)), + (predecessor.end for predecessor in self.G.predecessors(successor)), default=0, ) - candidates.append((preds_end, successor)) - - # Increment the number of scheduled nodes - nb_scheduled_nodes += 1 - done = nb_scheduled_nodes == nb_graph_nodes - - # Step 9 - # The total schedule latency is the max of all CN end times and the link end times - cns_end_time = max((n.end for n in G.node_list)) - links_end_time = max([event.end for event in accelerator.communication_manager.events], default=0) - latency = max(cns_end_time, links_end_time) - - return ( - latency, - total_cn_onchip_energy, - total_cn_offchip_link_energy, - total_cn_offchip_memory_energy, - total_eviction_to_offchip_link_energy, - total_eviction_to_offchip_memory_energy, - total_sink_layer_output_offchip_link_energy, - total_sink_layer_output_offchip_memory_energy, - total_core_to_core_link_energy, - total_core_to_core_memory_energy, - ) + self.candidates.append((preds_end, successor)) + + def get_total_latency(self): + """The total schedule latency is the max of all CN end times and the link end times""" + cns_end_time = max((n.end for n in self.G.node_list)) + links_end_time = max([event.end for event in self.accelerator.communication_manager.events], default=0) + return max(cns_end_time, links_end_time) diff --git a/stream/hardware/architecture/accelerator.py b/stream/hardware/architecture/accelerator.py index 681e960e..c95f28df 100644 --- a/stream/hardware/architecture/accelerator.py +++ b/stream/hardware/architecture/accelerator.py @@ -59,6 +59,11 @@ def get_core(self, core_id: int) -> Core: """ return self.cores.get_node_with_id(core_id) + def get_offchip_core(self) -> Core: + """Return the offchip core.""" + assert self.offchip_core_id, "This accelerator has no offchip core id." 
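+        # The scheduler uses this core as the home of off-chip memory: constant tensors are initially spawned
+        # here, and evicted tensors and final outputs are written back to it.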
+        return self.get_core(self.offchip_core_id)
+
     @property
     def core_list(self) -> list[Core]:
         return list(self.cores.node_list)
@@ -509,8 +514,9 @@ def has_shared_memory(self, core_id_a: int, core_id_b: int, mem_op_a: MemoryOper
         )
         return top_memory_instance_a is top_memory_instance_b
 
-    def get_top_instances_of_core(self, core_id: int):
-        core = self.get_core(core_id)
+    def get_top_instances_of_core(self, core: int | Core) -> list[MemoryInstance]:
+        if isinstance(core, int):
+            core = self.get_core(core)
         top_instances = self.memory_manager.top_instances_per_core[core]
         return top_instances

From 6ec30c8220e79ac19c9b138d2f92654148e4ab01 Mon Sep 17 00:00:00 2001
From: RobinGeens
Date: Sat, 4 Jan 2025 17:15:13 +0100
Subject: [PATCH 45/49] refactor tensor movement functions from accelerator to
 scheduler

---
 outputs/custom_ssm.onnx                              | Bin 10998009 -> 0 bytes
 stream/cost_model/memory_manager.py                  |   5 +-
 stream/cost_model/scheduler.py                       | 415 +++++++++----
 stream/hardware/architecture/accelerator.py          | 581 ++++++------------
 .../generation/tiled_workload_generation.py          |   2 +-
 .../workload/computation/computation_node.py         |   3 +
 6 files changed, 489 insertions(+), 517 deletions(-)
 delete mode 100644 outputs/custom_ssm.onnx

diff --git a/outputs/custom_ssm.onnx b/outputs/custom_ssm.onnx
deleted file mode 100644
index 97da8bdb06facb9bb63f254d0aadd3a0dffa129e..0000000000000000000000000000000000000000
[binary patch data for outputs/custom_ssm.onnx (10998009 -> 0 bytes) omitted]
z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! 
z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? 
z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! 
z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB) z95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c z2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*= zfddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede z;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQq zIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n? z4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj! z0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^` zz<~n?4jede;J|?c2M!!KaNxj!0|yQqIB?*=fddB)95`^`z<~n?4jede;J|?c2M!!K zaNxj!1KP2>-nNqS0KldtTlDxt^0C%Yl#Qb@i^f*Gr6qZ@seu-o%?1b-aM4{~t$V@H z5@VB;4xrHoHJ)Wo_Xgz zb4Y*y0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly 
zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ 
z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N 
z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7csf&Vjsw~giSI7!;S91Ty#-Ql=1IvpRMjyn%FujFN|W_ESE z6)t_#Jv{Ag-)>%9TlsvgnGHNl|45trO-S0q(aWRur@0gbH!iGC<=)JGZ_Pwg$-i%eGv^R${UxjI zY1i2#>jgXYg5Ax_(_31XCUjGgvww|rW?v=i#nSB*D|NM0dd~c^2+8Kx zD9vv_)CRp17WS3g%A%rE8bBxt&kL6S`tDA72D=Mquv?nJ@4mZ}tH?F=57tw_eaBT zb`JjJRj>d2#W?wX`{?wrla{rA)gPXm9u=Le)i1R!PpfSIAT)blpLWOn(Xe&x*}<#( z2Zygd{PXz7+s0BDhO^sD-fzF`9h~m>7Wa5-!PUIe+oeudVqa;8X@7qg&TTYYolW>z zzk8BgZ~x-ppqS!v`mwbXe{3h=>Xf8&(*L%%(TslEtD)K5AD0SZrP)$VDy^l+e-^5Z zChxW5P3_J^Ti-wAYz1{9c)4%C__IiSw=gaGr@^=*eqMRun z=ACbrGR4DErf6jkpWDrg@!_+UWs2+}lsE>$f$~5QxRr14fdG-JkF`jUKzsZlvH-0%ekv;BphsD5dFRsuEMdkE#^w+)K@j?ISWMgvN{Hoe+KHbtY zrqXUvp)}?7Y|72G#^=lb`SCw}luq_P8sQI(xmuxUJD*f_`O<8YZ-~nB#VhZ)B_3B- zrrcFLaF^EX$=5GSr~0+}rPgv)&i%5lRL zPukKZsF7#YbGKx(Jg)R~QCjMobJwPuq8rY+t9~SVFyBRy$v5Y^h);_+itxL}d>_Sa z*%R?`cbpFSX9=@0 zJ5pT;(?8N}aug%wtf`c9XVGgZcNPh?lsg}umpi{Mg-vmHxYQrUx7p-lvBgViPxqewxE@jQKoMfbj@_gq zi9gk`*y@Z;=;JVO^d zGau>QY^0Ucxv05&F$5=bs-)6nQPugLi&Qt?^Jo2U&YwuGnI@6kdg0n^X8|%hy6yZt z`CfQ0(%hWUY^5>Mh#x)~Y{jYPG0Svf8kDUgllow>8?%x zZ^dRxubm4+ZfUSx$~vVgW%+FDMuk#)r9fLyd1>&I`LD|=yJObs^58*?;iI_f`wPA{ zTYN|R{4c5It+qn#Tcw)#y}{kHzA2iknFdVyl~g5A=W5krVDD#@7)aZwzqHp!eK&6I zxmr85HU4B#rQNwoKZx_Me2-JgmxJ_TdmA*)n29RRVwjZN;k!RuEz5jAsV;FBtr~#&;L@PmiM+M8D($whIlalt-{%9X!p!_ z__ObcX|DDXe|Ft;uhm{;D?2BjaeFrUMen!eU7X+W;aM+_i`%cPP;rPU1^>mtAD1#t zX%TB%=`!OU@7Y^$^|4#8JU8t5SBUy8AI=}$OCz3BTu{BVhvur^kE4&z%>T(_F+M{pu~9lR W7b{cUy`}J}hUWB0`su~B_5TKE)quPJ diff --git a/stream/cost_model/memory_manager.py 
index b38de46a..6a144cac 100644
--- a/stream/cost_model/memory_manager.py
+++ b/stream/cost_model/memory_manager.py
@@ -174,7 +174,7 @@ def add_tensor_to_core(
    def get_timestep_for_tensor_addition(
        self,
        tensor: Tensor,
-        core_id: int,
+        core: Core,
        timestep: int,
        memory_op: MemoryOperand,
    ) -> int:
@@ -191,7 +191,6 @@ def get_timestep_for_tensor_addition(
        Returns:
            The earliest timestep at which the transfer can actually start.
        """
-        core = self.accelerator.get_core(core_id)
        top_level_idx = self.get_top_level_idx(core, memory_op)
        top_instance = self.top_instances_per_core[core][top_level_idx]
        top_instance_capacity = self.top_instance_capacities[top_instance]
@@ -211,7 +211,7 @@ def get_timestep_for_tensor_addition(
        if last_max_usage_idx == len(relevant_usages_reversed) - 1:
            return relevant_timesteps[last_max_usage_idx]
        new_timestep = relevant_timesteps[last_max_usage_idx + 1]
-        return self.get_timestep_for_tensor_addition(tensor, core_id, new_timestep, memory_op)
+        return self.get_timestep_for_tensor_addition(tensor, core, new_timestep, memory_op)

    def find_best_tensor_combination_to_evict_fast(
        self,
diff --git a/stream/cost_model/scheduler.py b/stream/cost_model/scheduler.py
index 32917a73..e4b84093 100644
--- a/stream/cost_model/scheduler.py
+++ b/stream/cost_model/scheduler.py
@@ -1,4 +1,6 @@
 import logging
+from collections import defaultdict
+from enum import Enum, auto
 from operator import itemgetter
 from typing import TYPE_CHECKING
@@ -15,6 +17,16 @@
 logger = logging.getLogger(__name__)

+class TransferCause(Enum):
+    """Log transfer energies in different categories"""
+
+    SINK_LAYER = auto()
+    EVICTION = auto()
+    OFF_CHIP = auto()
+    CORE_TO_CORE = auto()
+    NO_LOG = auto()
+
+
 class Schedule:
    def __init__(
        self,
@@ -39,20 +51,23 @@ def __init__(
        core_ids = set(n.chosen_core_allocation for n in G.node_list)
        assert None not in core_ids, "Not all nodes have core allocation. Insert SetFixedAllocationPerformanceStage."
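The memory_manager hunk above now threads a Core object through the recursive retry instead of re-resolving it from a core_id on every call. The retry itself, finding the earliest timestep at which the tensor still fits next to the recorded memory usage, can be sketched in isolation as follows; the function name and the (timestep, usage) sample layout are illustrative assumptions, not the real MemoryManager API.

def earliest_timestep_with_space(
    samples: list[tuple[int, int]],  # (timestep, used capacity), sorted by timestep
    capacity: int,
    tensor_size: int,
    timestep: int,
) -> int:
    """Earliest t >= timestep from which tensor_size fits on top of the recorded usage."""
    relevant = [(t, u) for t, u in samples if t >= timestep] or [samples[-1]]
    for i, (t, _) in enumerate(relevant):
        # The tensor can be added at t only if it keeps fitting at every later sample point
        if all(u + tensor_size <= capacity for _, u in relevant[i:]):
            return max(t, timestep)
    # Never enough space within the recorded window: report the last sampled timestep
    return relevant[-1][0]


# Example: usage peaks at t=10, so a 30-unit tensor can only be added from t=20 onwards
print(earliest_timestep_with_space([(0, 50), (10, 90), (20, 40)], capacity=100, tensor_size=30, timestep=0))  # -> 20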
-        self.all_core_ids: list[int] = sorted(list(core_ids))  # type: ignore
-        self.cores_idle_from = cores_idle_from if cores_idle_from else {core_id: 0 for core_id in self.all_core_ids}
+        all_core_ids: list[int] = sorted(list(core_ids))  # type: ignore
+        self.cores_idle_from = cores_idle_from if cores_idle_from else {core_id: 0 for core_id in all_core_ids}

        # Initialize the schedule results
        self.latency = 0
        self.total_cn_onchip_energy = 0
-        self.total_cn_offchip_link_energy = 0
-        self.total_cn_offchip_memory_energy = 0
-        self.total_eviction_to_offchip_link_energy = 0
-        self.total_eviction_to_offchip_memory_energy = 0
-        self.total_sink_layer_output_offchip_link_energy = 0
-        self.total_sink_layer_output_offchip_memory_energy = 0
-        self.total_core_to_core_link_energy = 0
-        self.total_core_to_core_memory_energy = 0
+        self.link_energy: dict[TransferCause, float] = defaultdict(lambda: 0)
+        self.memory_energy: dict[TransferCause, float] = defaultdict(lambda: 0)
+
+        # self.total_cn_offchip_link_energy = 0
+        # self.total_cn_offchip_memory_energy = 0
+        # self.total_eviction_to_offchip_link_energy = 0
+        # self.total_eviction_to_offchip_memory_energy = 0
+        # self.total_sink_layer_output_offchip_link_energy = 0
+        # self.total_sink_layer_output_offchip_memory_energy = 0
+        # self.total_core_to_core_link_energy = 0
+        # self.total_core_to_core_memory_energy = 0

        # Remains constant throughout the scheduling
        self.sink_layer_nodes = self.get_sink_layer_nodes()
@@ -127,18 +142,19 @@ def run(self):
            # Step 1: for operands that are too large to store in the core's memory, clear the memory so ZigZag can
            # optimize the loop ordering using the full memory size
-            transfer_complete_timestep = self.clear_memories(
-                core=core,
-                memory_operands=best_candidate.too_large_operands,
-                timestep=timestep,
-                exceptions=tensors_this_candidate_needs,
-                transfer_bandwidth_fraction=transfer_bw_fraction,
-            )
-            timestep = transfer_complete_timestep
+            if best_candidate.too_large_operands:
+                transfer_complete_timestep = self.clear_memories(
+                    core=core,
+                    memory_operands=best_candidate.too_large_operands,
+                    timestep=timestep,
+                    exceptions=tensors_this_candidate_needs,
+                    transfer_bandwidth_fraction=transfer_bw_fraction,
+                )
+                timestep = transfer_complete_timestep

            # Step 2: Transfer the tensors needed for this node to the core (from off-chip or from another core)
            for tensor, tensor_operand in zip(tensors_this_candidate_needs, tensors_operands):
-                transfer_complete_timestep = self.transfer_tensor_to_core(
+                transfer_complete_timestep = self.schedule_tensor_transfer(
                    tensor=tensor,
                    tensor_operand=tensor_operand,
                    receiving_core=core,
@@ -149,9 +165,8 @@ def run(self):
                timestep = max(timestep, transfer_complete_timestep)

            # Step 3: make space for the output tensor of this node
-            output_layer_operand = best_candidate.output_operand
-            output_memory_operand = best_candidate.memory_operand_links.layer_to_mem_op(output_layer_operand)
-            output_tensor = best_candidate.operand_tensors[output_layer_operand]
+            output_tensor = best_candidate.get_output_tensor()
+            output_memory_operand = output_tensor.memory_operand
            core_to_add_output_to = (
                self.offchip_core if output_memory_operand in best_candidate.too_large_operands else core
            )
@@ -211,27 +226,19 @@ def get_transfer_bandwidth_fraction(self, node: ComputationNode):
        return 1 / node.get_total_inter_core_splits()

    def prefetch_constant_operands(self):
-        """Load the `operands_to_prefetch` to the cores they belong to and log the energy costs."""
+        """Load the `operands_to_prefetch` to the cores they belong to."""
        for n in self.G.node_list:
            for op, tensor in n.operand_tensors.items():
                if op in n.constant_operands and op in self.operands_to_prefetch:
-                    core_allocation = n.chosen_core_allocation
-                    assert core_allocation is not None, "Core should be allocated"
+                    core = self.get_core_for_node(n)
                    memory_op = n.memory_operand_links.layer_to_mem_op(op)
-                    if not self.accelerator.contains_tensor(tensor, core_allocation):
-                        (
-                            _,
-                            transfer_link_energy_cost,
-                            transfer_memory_energy_cost,
-                            eviction_link_energy_cost,
-                            eviction_memory_energy_cost,
-                            came_from_offchip,
-                        ) = self.accelerator.transfer_tensor_to_core(tensor, core_allocation, memory_op, [])
-                        assert came_from_offchip
-                        self.total_cn_offchip_link_energy += transfer_link_energy_cost
-                        self.total_cn_offchip_memory_energy += transfer_memory_energy_cost
-                        self.total_eviction_to_offchip_link_energy += eviction_link_energy_cost
-                        self.total_eviction_to_offchip_memory_energy += eviction_memory_energy_cost
+                    if not self.accelerator.core_contains_tensor(tensor, core):
+                        self.schedule_tensor_transfer(
+                            tensor=tensor,
+                            tensor_operand=memory_op,
+                            receiving_core=core,
+                            non_evictable_tensors=[],
+                        )

    def pop_best_candidate(self) -> tuple[ComputationNode, int]:
        """Get the best candidate node to schedule next, given the selection priority. Remove that candidate from the
@@ -325,11 +332,7 @@ def clear_memories(
        transfer_bandwidth_fraction: float = 1,
    ):
        for too_large_operand in memory_operands:
-            (
-                timestep,
-                eviction_link_energy_cost,
-                eviction_memory_energy_cost,
-            ) = self.accelerator.remove_all(
+            timestep = self.remove_all(
                core=core,
                memory_operand=too_large_operand,
                timestep=timestep,
@@ -337,71 +340,239 @@ def clear_memories(
                transfer_bandwidth_fraction=transfer_bandwidth_fraction,
                write_back_to_offchip=True,
            )
-            self.total_eviction_to_offchip_link_energy += eviction_link_energy_cost
-            self.total_eviction_to_offchip_memory_energy += eviction_memory_energy_cost
        return timestep

-    def transfer_tensor_to_core(
+    def remove_all(
+        self,
+        core: Core,
+        memory_operand: MemoryOperand,
+        timestep: int,
+        exceptions: list[Tensor] = [],
+        transfer_bandwidth_fraction: float = 1,
+        write_back_to_offchip: bool = False,
+    ):
+        """Remove all tensors from a core's memory with the given memory operand.
+        If required, the tensors are written back to offchip before removal.
+
+        Args:
+            core (Core): The Core to remove the tensor from
+            memory_operand: The memory operand for which all tensors should be evicted.
+            timestep: The timestep to remove the tensor at.
+            exceptions: A list of tensors that should not be evicted.
+            transfer_bandwidth_fraction: Fraction of the bandwidth to use for the transfers.
+            write_back_to_offchip (bool, optional): Write the tensor to offchip before removal. Defaults to False.
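The scheduler changes above collapse the eight per-category energy counters into two dicts keyed by a TransferCause enum; further down in this patch, the old attribute names are re-exposed as read-only properties. A stripped-down sketch of that bookkeeping pattern, independent of the Stream classes, might look like this:

from collections import defaultdict
from enum import Enum, auto


class TransferCause(Enum):
    SINK_LAYER = auto()
    EVICTION = auto()
    OFF_CHIP = auto()
    CORE_TO_CORE = auto()
    NO_LOG = auto()


class EnergyLog:
    def __init__(self):
        # One running total per cause, created lazily on first use
        self.link_energy: dict[TransferCause, float] = defaultdict(float)
        self.memory_energy: dict[TransferCause, float] = defaultdict(float)

    def register(self, cause: TransferCause, link_e: float, mem_e: float) -> None:
        self.link_energy[cause] += link_e
        self.memory_energy[cause] += mem_e

    # Old attribute name kept alive for downstream code
    @property
    def total_eviction_to_offchip_link_energy(self) -> float:
        return self.link_energy[TransferCause.EVICTION]


log = EnergyLog()
log.register(TransferCause.EVICTION, link_e=1.5, mem_e=3.0)
print(log.total_eviction_to_offchip_link_energy)  # -> 1.5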
+        """
+        stored_tensors = self.accelerator.get_tensors_stored_in_core(core, memory_operand, timestep)
+
+        for tensor in stored_tensors:
+            if tensor not in exceptions:
+                timestep = self.schedule_tensor_removal(
+                    tensor_to_remove=tensor,
+                    core_to_remove_from=core,
+                    memory_op=memory_operand,
+                    timestep=timestep,
+                    transfer_bandwidth_fraction=transfer_bandwidth_fraction,
+                    write_back_to_offchip=write_back_to_offchip,
+                    transfer_cause=TransferCause.EVICTION,
+                )
+
+        return timestep
+
+    def schedule_tensor_removal(
+        self,
+        tensor_to_remove: Tensor,
+        core_to_remove_from: Core,
+        memory_op: MemoryOperand,
+        timestep: int,
+        transfer_bandwidth_fraction: float = 1,
+        write_back_to_offchip: bool = False,
+        transfer_cause: TransferCause = TransferCause.EVICTION,
+    ):
+        """Remove tensor from core. If required, transfer to offchip before removal.
+
+        Args:
+            tensor: The tensor to remove.
+            core: The Core to remove the tensor from.
+            memory_op: The memory operand of the tensor.
+            timestep: The timestep to remove the tensor at.
+            transfer_bandwidth_fraction: Fraction of the bandwidth to use for the transfer.
+            write_back_to_offchip: Write the tensor to offchip before removal. Defaults to False.
+        """
+        should_be_written_to_offchip = write_back_to_offchip and not self.accelerator.core_contains_tensor(
+            tensor_to_remove, self.offchip_core
+        )
+        if should_be_written_to_offchip:
+            transfer_end = self.schedule_tensor_transfer(
+                tensor=tensor_to_remove,
+                receiving_core=self.offchip_core,
+                tensor_operand=memory_op,
+                sending_core=core_to_remove_from,
+                transfer_bandwidth_fraction=transfer_bandwidth_fraction,
+                transfer_cause=transfer_cause,
+            )
+
+            timestep = max(timestep, transfer_end)
+
+        self.accelerator.remove_tensor(
+            tensor=tensor_to_remove, core=core_to_remove_from, memory_op=memory_op, timestep=timestep
+        )
+
+        return timestep
+
+    def schedule_tensor_transfer(
        self,
        tensor: Tensor,
-        tensor_operand: MemoryOperand,
        receiving_core: Core,
-        non_evictable_tensors: list[Tensor],
-        sending_core_id: int | None = None,
+        tensor_operand: MemoryOperand,
        earliest_t: int = 0,
+        non_evictable_tensors: list[Tensor] = [],
+        sending_core: Core | None = None,
        transfer_bandwidth_fraction: float = 1,
+        transfer_cause: TransferCause | None = None,
    ):
-        (
-            transfer_complete_timestep,
-            transfer_link_energy_cost,
-            transfer_memory_energy_cost,
-            eviction_link_energy_cost,
-            eviction_memory_energy_cost,
-            came_from_offchip,
-        ) = self.accelerator.transfer_tensor_to_core(
-            tensor,
-            receiving_core.id,
-            tensor_operand,
-            non_evictable_tensors,
-            earliest_t=earliest_t,
+        """Find the earliest time to transfer the tensor to the receiving core, and register the transfer.
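remove_all and schedule_tensor_removal above walk the tensors stored on a core, optionally write each one back to the off-chip core first, and only then register the removal. A toy version of that evict-with-optional-write-back loop, using stand-in data structures rather than the scheduler's real types:

def evict_all(
    stored: dict[str, int],        # tensor name -> size currently on the core
    exceptions: set[str],
    timestep: int,
    writeback_latency: int,
) -> tuple[dict[str, int], int]:
    """Write every non-excepted tensor back (one after the other) and drop it from the core."""
    remaining = dict(stored)
    for name in list(stored):
        if name in exceptions:
            continue
        timestep += writeback_latency  # removal is only registered once the write-back ends
        del remaining[name]
    return remaining, timestep


left, t = evict_all({"A": 64, "B": 32, "C": 16}, exceptions={"B"}, timestep=0, writeback_latency=10)
print(left, t)  # -> {'B': 32} 20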
+        Evictions of older tensors might be necessary.
+
+        Args:
+            tensor
+            receiving_core
+            tensor_operand
+            transfer_cause
+            non_evictable_tensors
+            sending_core
+            earliest_t
+            transfer_bandwidth_fraction
+        """
+
+        if self.accelerator.core_contains_tensor(tensor, receiving_core):
+            return earliest_t
+
+        tensor_available_since_timestep = self.accelerator.get_available_timestep(tensor, sending_core)
+        earliest_tensor_addition_t = max(earliest_t, tensor_available_since_timestep)
+
+        # Evict older tensors if given tensor doesn't fit yet
+        evictions_complete_timestep = self.make_space_for_tensor(
+            tensor=tensor,
+            core=receiving_core,
+            memory_op=tensor_operand,
+            timestep=earliest_tensor_addition_t,
+            tensors_to_avoid_evicting=non_evictable_tensors,
            transfer_bandwidth_fraction=transfer_bandwidth_fraction,
        )
-        # Add the energy costs to their respective trackers
-        if came_from_offchip:
-            self.total_cn_offchip_link_energy += transfer_link_energy_cost
-            self.total_cn_offchip_memory_energy += transfer_memory_energy_cost
-        else:
-            self.total_core_to_core_link_energy += transfer_link_energy_cost
-            self.total_core_to_core_memory_energy += transfer_memory_energy_cost
-        self.total_eviction_to_offchip_link_energy += eviction_link_energy_cost
-        self.total_eviction_to_offchip_memory_energy += eviction_memory_energy_cost
-        return transfer_complete_timestep
+        # Find idle window between sender and receiver cores
+        # TODO If the storing_instance is a shared instance across more than one core, there will be multiple possible
+        # TODO cores to transfer between. For now, we take the first one
+        sending_cores = self.accelerator.get_storing_cores(tensor, sending_core)
+        sending_core = sending_cores[0]
+
+        transfer_start, transfer_end = self.accelerator.find_earliest_time_for_transfer(
+            tensor=tensor,
+            sending_core=sending_core,
+            receiving_core=receiving_core,
+            earliest_t=evictions_complete_timestep,
+            bandwidth_fraction=transfer_bandwidth_fraction,
+        )
+
+        # Spawn the tensor on the receiving core, remove from sending core and update communication links
+        transfer_link_energy_cost, transfer_memory_energy_cost = self.accelerator.register_tensor_transfer(
+            tensor=tensor,
+            tensor_operand=tensor_operand,
+            sending_core=sending_core,
+            receiving_core=receiving_core,
+            transfer_start=transfer_start,
+            transfer_end=transfer_end,
+            transfer_bandwidth_fraction=transfer_bandwidth_fraction,
+        )
+
+        # Register energy
+        if not transfer_cause:
+            came_from_offchip = sending_core == self.offchip_core
+            transfer_cause = TransferCause.OFF_CHIP if came_from_offchip else TransferCause.CORE_TO_CORE
+
+        self.link_energy[transfer_cause] += transfer_link_energy_cost
+        self.memory_energy[transfer_cause] += transfer_memory_energy_cost
+
+        return transfer_end

    def make_space_for_tensor(
        self,
        tensor: Tensor,
        core: Core,
-        memory_operand: MemoryOperand,
+        memory_op: MemoryOperand,
        timestep: int,
        tensors_to_avoid_evicting: list[Tensor] = [],
        transfer_bandwidth_fraction: float = 1,
    ):
-        (
-            evictions_complete_timestep,
-            eviction_link_energy_cost,
-            eviction_memory_energy_cost,
-        ) = self.accelerator.make_space_for(
+        """Make space for the given tensor on the given core by evicting already stored tensors if necessary.
+
+        Args:
+            tensor: The tensor to make space for.
+            core (Core): The core where the tensor will be stored.
+            memory_operand: The memory operand on the core.
+            timestep: The timestep at which to make space for.
+            transfer_bandwidth_fraction: Fraction of the bandwidth to use for the transfer.
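schedule_tensor_transfer asks the accelerator for the earliest window in which the link between the two cores is idle for long enough, where the duration follows from the tensor size and the bandwidth fraction granted to this transfer. A self-contained sketch of such a window search over a list of busy intervals (illustrative only; the real logic lives in the accelerator and its CommunicationManager):

from math import ceil


def earliest_transfer_window(
    busy: list[tuple[int, int]],   # sorted, non-overlapping (start, end) busy intervals on the link
    earliest_t: int,
    tensor_size: int,
    link_bandwidth: float,
    bandwidth_fraction: float = 1.0,
) -> tuple[int, int]:
    duration = ceil(tensor_size / (bandwidth_fraction * link_bandwidth))
    start = earliest_t
    for b_start, b_end in busy:
        if start + duration <= b_start:
            break                      # the gap before this busy interval is long enough
        start = max(start, b_end)      # otherwise, try right after it
    return start, start + duration


print(earliest_transfer_window([(0, 50), (60, 100)], earliest_t=10, tensor_size=160, link_bandwidth=8, bandwidth_fraction=0.5))
# -> (100, 140): the 40-cycle transfer does not fit in the 10-cycle gap between 50 and 60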
+        """
+
+        top_instance = self.accelerator.get_top_instance_of_core(core, memory_op)
+
+        # Earliest timestep when the core has enough space, or the latest timestep if this is never the case
+        enough_space_timestep = self.accelerator.memory_manager.get_timestep_for_tensor_addition(
            tensor=tensor,
            core=core,
-            memory_op=memory_operand,
            timestep=timestep,
-            tensors_to_avoid_evicting=tensors_to_avoid_evicting,
-            transfer_bandwidth_fraction=transfer_bandwidth_fraction,
+            memory_op=tensor.memory_operand,
        )
-        self.total_eviction_to_offchip_link_energy += eviction_link_energy_cost
-        self.total_eviction_to_offchip_memory_energy += eviction_memory_energy_cost
-        return evictions_complete_timestep
+
+        tensors_to_evict = self.accelerator.memory_manager.find_best_tensor_combination_to_evict_fast(
+            top_instance,
+            tensor,
+            enough_space_timestep,
+            exceptions=tensors_to_avoid_evicting,
+        )
+
+        if core == self.offchip_core and tensors_to_evict:
+            raise ValueError("Evictions required in offchip memory. Consider making offchip larger.")
+
+        for tensor_to_evict in tensors_to_evict:
+            t_eviction_complete = self.schedule_tensor_removal(
+                tensor_to_remove=tensor_to_evict,
+                core_to_remove_from=core,
+                memory_op=memory_op,
+                timestep=timestep,
+                transfer_bandwidth_fraction=transfer_bandwidth_fraction,
+                write_back_to_offchip=True,
+                transfer_cause=TransferCause.EVICTION,
+            )
+            timestep = max(timestep, t_eviction_complete)
+
+        t_evictions_complete = max(enough_space_timestep, timestep)
+        return t_evictions_complete
+
+    def remove_sink_node_tensor(
+        self,
+        node: ComputationNode,
+        tensor_to_remove: Tensor,
+        core_to_remove_from: Core,
+        timestep: int,
+        transfer_bandwidth_fraction: float,
+    ):
+        """If this node is a sink node (node that has no successors and that produces a final output), transfer final
+        outputs to offchip
+        """
+        if node in self.sink_layer_nodes:
+            # Only push back sink node outputs if they're generated and stored on the core
+            if Constants.OUTPUT_MEM_OP not in node.too_large_operands:
+                self.schedule_tensor_removal(
+                    tensor_to_remove=tensor_to_remove,
+                    core_to_remove_from=core_to_remove_from,
+                    memory_op=tensor_to_remove.memory_operand,
+                    timestep=timestep,
+                    transfer_bandwidth_fraction=transfer_bandwidth_fraction,
+                    write_back_to_offchip=True,
+                    transfer_cause=TransferCause.SINK_LAYER,
+                )
+                # TODO I think no timestep is being passed on here!
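make_space_for_tensor delegates the choice of victims to find_best_tensor_combination_to_evict_fast and refuses to evict from the off-chip core. A greedy stand-in for that victim selection, assuming a smallest-first policy (the actual heuristic in the MemoryManager may differ):

def pick_victims(
    stored: dict[str, int],        # tensor name -> size
    free_capacity: int,
    needed: int,
    exceptions: set[str],
) -> list[str]:
    """Evict the smallest non-excepted tensors first until `needed` units fit."""
    victims: list[str] = []
    for name, size in sorted(stored.items(), key=lambda kv: kv[1]):
        if free_capacity >= needed:
            break
        if name in exceptions:
            continue
        victims.append(name)
        free_capacity += size
    if free_capacity < needed:
        raise ValueError("Not enough evictable capacity on this core.")
    return victims


print(pick_victims({"A": 64, "B": 32, "C": 16}, free_capacity=20, needed=60, exceptions={"A"}))  # -> ['C', 'B']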
    def register_scheduled_node(
@@ -428,38 +599,9 @@ def register_scheduled_node(
        self.scheduled_nodes.add(node)

        self.total_cn_onchip_energy += node.get_onchip_energy()
-        self.total_cn_offchip_memory_energy += node.get_offchip_energy()
+        self.memory_energy[TransferCause.OFF_CHIP] += node.get_offchip_energy()
        return end_time

-    def remove_sink_node_tensor(
-        self,
-        node: ComputationNode,
-        tensor_to_remove: Tensor,
-        core_to_remove_from: Core,
-        timestep: int,
-        transfer_bandwidth_fraction: float,
-    ):
-        """If this node is a sink node (node that has no successors and that produces a final output), transfer final
-        outputs to offchip
-        """
-        if node in self.sink_layer_nodes:
-            # Only push back sink node outputs if they're generated and stored on the core
-            if Constants.OUTPUT_MEM_OP not in node.too_large_operands:
-                (
-                    _,
-                    link_energy_cost,
-                    memory_energy_cost,
-                ) = self.accelerator.remove(
-                    tensor=tensor_to_remove,
-                    core=core_to_remove_from,
-                    memory_op=tensor_to_remove.memory_operand,
-                    timestep=timestep,
-                    transfer_bandwidth_fraction=transfer_bandwidth_fraction,
-                    write_back_to_offchip=True,
-                )
-                self.total_sink_layer_output_offchip_link_energy += link_energy_cost
-                self.total_sink_layer_output_offchip_memory_energy += memory_energy_cost
-
    def decrease_priority(
        self,
        tensors: list[Tensor],
@@ -513,12 +655,13 @@ def check_for_removal(
                # Get a core tied to the top_instance we want to remove it on.
                core = self.accelerator.memory_manager.cores_per_top_instance[instance_storing_tensor][0]
-                self.accelerator.remove(
-                    tensor_used_by_node,
-                    core,
-                    tensor_used_by_node.memory_operand,
-                    timestep_for_removal,
+                self.schedule_tensor_removal(
+                    tensor_to_remove=tensor_used_by_node,
+                    core_to_remove_from=core,
+                    memory_op=tensor_used_by_node.memory_operand,
+                    timestep=timestep_for_removal,
                    transfer_bandwidth_fraction=transfer_bandwidth_fraction,
+                    transfer_cause=TransferCause.NO_LOG,
                )

    def extend_candidates(self, node: ComputationNode):
@@ -536,3 +679,35 @@ def get_total_latency(self):
        cns_end_time = max((n.end for n in self.G.node_list))
        links_end_time = max([event.end for event in self.accelerator.communication_manager.events], default=0)
        return max(cns_end_time, links_end_time)
+
+    @property
+    def total_cn_offchip_link_energy(self):
+        return self.link_energy[TransferCause.OFF_CHIP]
+
+    @property
+    def total_cn_offchip_memory_energy(self):
+        return self.memory_energy[TransferCause.OFF_CHIP]
+
+    @property
+    def total_eviction_to_offchip_link_energy(self):
+        return self.link_energy[TransferCause.EVICTION]
+
+    @property
+    def total_eviction_to_offchip_memory_energy(self):
+        return self.memory_energy[TransferCause.EVICTION]
+
+    @property
+    def total_sink_layer_output_offchip_link_energy(self):
+        return self.link_energy[TransferCause.SINK_LAYER]
+
+    @property
+    def total_sink_layer_output_offchip_memory_energy(self):
+        return self.memory_energy[TransferCause.SINK_LAYER]
+
+    @property
+    def total_core_to_core_link_energy(self):
+        return self.link_energy[TransferCause.CORE_TO_CORE]
+
+    @property
+    def total_core_to_core_memory_energy(self):
+        return self.memory_energy[TransferCause.CORE_TO_CORE]
diff --git a/stream/hardware/architecture/accelerator.py b/stream/hardware/architecture/accelerator.py
index c95f28df..406ed2f0 100644
--- a/stream/hardware/architecture/accelerator.py
+++ b/stream/hardware/architecture/accelerator.py
@@ -40,18 +40,6 @@ def __init__(
        self.memory_manager = MemoryManager(self)
        self.communication_manager = CommunicationManager(self)

-    def __str__(self) -> str:
-        return f"Accelerator({self.name})"
-
-    def __repr__(self) -> str:
-        return str(self)
-
-    def __jsonrepr__(self) -> dict[str, Any]:
-        """
-        JSON representation used for saving this object to a json file.
-        """
-        return {"name": self.name, "cores": self.cores}
-
    def get_core(self, core_id: int) -> Core:
        """s
        Return the core with id 'core_id'.
@@ -64,277 +52,72 @@ def get_offchip_core(self) -> Core:
        assert self.offchip_core_id, "This accelerator has no offchip core id."
        return self.get_core(self.offchip_core_id)

-    @property
-    def core_list(self) -> list[Core]:
-        return list(self.cores.node_list)
-
-    def spawn(
-        self,
-        tensor: Tensor,
-        core: Core,
-        memory_op: MemoryOperand,
-        initial_timestep: int,
-        available_timestep: int,
-    ):
-        """Spawns a tensor on a core.
-
-        Args:
-            tensor: The tensor to be spawned.
-            core: The core on which to spawn the tensor.
-            memory_op: The memory operand on the core where the tensor will spawn.
-            initial_timestep: The timestep at which space will be reserved for the tensor.
-            available_timestep: The timestep at which the tensor will become available. Different from
-                initial_timestep when it is transferred.
-        """
-        self.memory_manager.add_tensor_to_core(tensor, core, initial_timestep, available_timestep, memory_op)
-
-    def remove(
-        self,
-        tensor: Tensor,
-        core: Core,
-        memory_op: MemoryOperand,
-        timestep: int,
-        transfer_bandwidth_fraction: float = 1,
-        write_back_to_offchip: bool = False,
-    ):
-        """Remove tensor from core. If required, transfer to offchip before removal.
+    def get_top_instances_of_core(self, core: int | Core) -> list[MemoryInstance]:
+        if isinstance(core, int):
+            core = self.get_core(core)
+        top_instances = self.memory_manager.top_instances_per_core[core]
+        return top_instances

-        Args:
-            tensor: The tensor to remove.
-            core (Core): The Core to remove the tensor from.
-            memory_op: The memory operand of the tensor.
-            timestep: The timestep to remove the tensor at.
-            transfer_bandwidth_fraction: Fraction of the bandwidth to use for the transfer.
-            write_back_to_offchip (bool, optional): Write the tensor to offchip before removal. Defaults to False.
-        """
-        assert self.offchip_core_id is not None
-        ################################# STEP 1 #################################
-        # Transfer the tensor to off-chip if required and not present there
-        link_energy_cost = 0
-        memory_energy_cost = 0
-        offchip_instance = self.get_top_instance_of_core(self.offchip_core_id, memory_op)
-        should_be_written_to_offchip = write_back_to_offchip and not self.contains_tensor(tensor, offchip_instance)
-        current_timestep = timestep
-        if should_be_written_to_offchip:
-            assert self.offchip_core_id is not None
-            (
-                transfer_end,
-                transfer_link_energy_cost,
-                transfer_memory_energy_cost,
-                eviction_link_energy_cost,
-                eviction_memory_energy_cost,
-                came_from_offchip,
-            ) = self.transfer_tensor_to_core(
-                tensor,
-                self.offchip_core_id,
-                memory_op,
-                non_evictable_tensors=[],
-                sending_core_id=core.id,
-                transfer_bandwidth_fraction=transfer_bandwidth_fraction,
-            )
-            # There should be no evictions as we are writing to offchip
-            assert eviction_link_energy_cost == 0
-            assert eviction_memory_energy_cost == 0
-            assert not came_from_offchip
-            link_energy_cost = transfer_link_energy_cost
-            memory_energy_cost = transfer_memory_energy_cost
-            current_timestep = max(current_timestep, transfer_end)
-
-        ################################# STEP 2 #################################
-        # Remove the tensor from the memory manager's attributes
-        top_instance = self.get_top_instance_of_core(core, memory_op)
-        self.memory_manager.remove_tensor_from_top_instance(
-            top_instance,
-            tensor,
-            timestep,
-        )
+    def get_top_instance_of_core(self, core: Core | int, mem_op: MemoryOperand) -> MemoryInstance:
+        if isinstance(core, int):
+            core = self.get_core(core)
+        top_instances = self.memory_manager.top_instances_per_core[core]
+        for instance in top_instances:
+            core_idx = self.memory_manager.cores_per_top_instance[instance].index(core)
+            instance_mem_ops = self.memory_manager.memory_operands_per_top_instance[instance][core_idx]
+            if mem_op in instance_mem_ops:
+                return instance
+        raise ValueError(f"No top instance for {core} with memory operand {mem_op}.")

-        return current_timestep, link_energy_cost, memory_energy_cost
+    def get_spatial_mapping_from_core(self, core_allocation: list[int]) -> SpatialMapping:
+        """Iff the dataflows of all given cores are the same, return that dataflow. Otherwise, throw an error"""
+        all_dataflows = [self.get_core(core_id).dataflows for core_id in core_allocation]
+        some_dataflow = all_dataflows.pop()

-    def remove_all(
-        self,
-        core: Core,
-        memory_operand: MemoryOperand,
-        timestep: int,
-        exceptions: list[Tensor] = [],
-        transfer_bandwidth_fraction: float = 1,
-        write_back_to_offchip: bool = False,
-    ):
-        """Remove all tensors from a core's memory with the given memory operand.
-        If required, the tensors are written back to offchip before removal.
+        # All cores have same dataflow
+        if some_dataflow is not None and all(some_dataflow == dataflow for dataflow in all_dataflows):
+            return some_dataflow

-        Args:
-            core (Core): The Core to remove the tensor from
-            memory_operand: The memory operand for which all tensors should be evicted.
-            timestep: The timestep to remove the tensor at.
-            exceptions: A list of tensors that should not be evicted.
-            transfer_bandwidth_fraction: Fraction of the bandwidth to use for the transfers.
-            write_back_to_offchip (bool, optional): Write the tensor to offchip before removal. Defaults to False.
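get_spatial_mapping_from_core above only succeeds when every core in the allocation reports the same dataflow. Reduced to plain Python, the all-equal-or-fail check looks like this (a hedged sketch with strings standing in for SpatialMapping objects):

def common_dataflow(dataflows: list[str]) -> str:
    """Return the dataflow shared by all cores, or raise if they disagree."""
    if not dataflows:
        raise ValueError("No cores given.")
    candidate = dataflows[0]
    if all(df == candidate for df in dataflows):
        return candidate
    raise ValueError("Unclear which dataflow to return or no valid dataflow found.")


print(common_dataflow(["output_stationary", "output_stationary"]))  # -> output_stationary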
-        """
-        total_link_energy_cost = 0
-        total_memory_energy_cost = 0
-        top_instance = self.get_top_instance_of_core(core, memory_operand)
-        # stored_tensors = self.stored_tensors[core][top_level_idx]
-        t = timestep
-        for tensor in self.memory_manager.get_tensors_stored_at_timestep(top_instance, timestep):
-            if tensor not in exceptions:
-                t, link_energy_cost, memory_energy_cost = self.remove(
-                    tensor=tensor,
-                    core=core,
-                    memory_op=memory_operand,
-                    timestep=t,
-                    transfer_bandwidth_fraction=transfer_bandwidth_fraction,
-                    write_back_to_offchip=write_back_to_offchip,
-                )
-                total_link_energy_cost += link_energy_cost
-                total_memory_energy_cost += memory_energy_cost
-        return t, total_link_energy_cost, total_memory_energy_cost
+        raise ValueError("Unclear which dataflow to return or no valid dataflow found.")

-    def make_space_for(
-        self,
-        tensor: Tensor,
-        core: Core,
-        memory_op: MemoryOperand,
-        timestep: int,
-        tensors_to_avoid_evicting: list[Tensor] = [],
-        transfer_bandwidth_fraction: float = 1,
-    ):
-        """Make space for the given tensor on the given core by evicting already stored tensors if necessary.
+    def has_shared_memory(self, core_id_a: int, core_id_b: int, mem_op_a: MemoryOperand, mem_op_b: MemoryOperand):
+        """Check whether two cores have a shared top level memory instance for a given memory operand.

        Args:
-            tensor: The tensor to make space for.
-            core (Core): The core where the tensor will be stored.
-            memory_operand: The memory operand on the core.
-            timestep: The timestep at which to make space for.
-            transfer_bandwidth_fraction: Fraction of the bandwidth to use for the transfer.
+            core_id_a : The first core id.
+            core_id_b : The second core id.
+            mem_op_a : The memory operand for the tensor in core a.
+            mem_op_b : The memory operand for the tensor in core b.
        """
-        total_eviction_link_energy_cost = 0
-        total_eviction_memory_energy_cost = 0
-
-        top_instance = self.get_top_instance_of_core(core, memory_op)
-
-        # Get the timestep at which there's enough space for this tensor
-        enough_space_timestep = self.memory_manager.get_timestep_for_tensor_addition(
-            tensor,
-            core.id,
-            timestep,
-            memory_op=tensor.memory_operand,
-        )
-
-        tensors_to_evict = self.memory_manager.find_best_tensor_combination_to_evict_fast(
-            top_instance,
-            tensor,
-            enough_space_timestep,
-            exceptions=tensors_to_avoid_evicting,
+        core_a = self.get_core(core_id_a)
+        core_b = self.get_core(core_id_b)
+        top_memory_instance_a = next(
+            (
+                ml.memory_instance
+                for ml, out_degree in core_a.memory_hierarchy.out_degree()
+                if out_degree == 0 and mem_op_a in ml.operands
+            )
        )
-        if core.id == self.offchip_core_id and tensors_to_evict:
-            raise ValueError("Evictions required in offchip memory. Consider making offchip larger.")
-        t_evictions_complete = timestep
-        for tensor_to_evict in tensors_to_evict:
+        top_memory_instance_b = next(
            (
-                t_eviction_complete,
-                eviction_link_energy_cost,
-                eviction_memory_energy_cost,
-            ) = self.remove(
-                tensor_to_evict,
-                core,
-                memory_op,
-                timestep,
-                write_back_to_offchip=True,
-                transfer_bandwidth_fraction=transfer_bandwidth_fraction,
+                ml.memory_instance
+                for ml, out_degree in core_b.memory_hierarchy.out_degree()
+                if out_degree == 0 and mem_op_b in ml.operands
            )
-            t_evictions_complete = max(t_evictions_complete, t_eviction_complete)
-            total_eviction_link_energy_cost += eviction_link_energy_cost
-            total_eviction_memory_energy_cost += eviction_memory_energy_cost
-        t_evictions_complete = max(enough_space_timestep, t_evictions_complete)
-        return (
-            t_evictions_complete,
-            total_eviction_link_energy_cost,
-            total_eviction_memory_energy_cost,
        )
+        return top_memory_instance_a is top_memory_instance_b

-    def transfer_tensor_to_core(
-        self,
-        tensor: Tensor,
-        receiving_core_id: int,
-        tensor_operand: MemoryOperand,
-        non_evictable_tensors: list[Tensor],
-        sending_core_id: int | None = None,
-        earliest_t: int = 0,
-        transfer_bandwidth_fraction: float = 1,
-    ) -> tuple[int, float, float, float, float, bool]:
-        """
-        Transfer a tensor to a given core id.
-        If the tensor is already present on the receiving core, nothing happens.
-
-        This function computes when the transfer can take place based on three factors:
-        1) The tensor is available for transfer on a sender core.
-        2) The receiver core has enough space to store the tensor.
-        3) The links between sender and receiver have a long enough idle window.
-
-        TODO: The transfer is scheduled as close as possible to the computation
-
-        The tensor is then added to the memory. Evictions are still possible if
-        there wasn't enough space on the receiver core at any earlier timestep.
-        If one of the links already transferred the tensor, we broadcast if possible.
+    def get_storing_memory_instance_and_timestep(self, tensor: Tensor, suggested_core: Core | None):
+        """Get the top instance storing the given tensor, and the timestep since which it was available.
+        If a core id is provided, we get the instance of that core. Else, we find the instance where the tensor has
+        been stored the longest.

        Args:
-            tensor: The tensor to transfer.
-            receiving_core_id: The id of the core that needs to receive the tensor.
-            tensor_operand: The memory operand where the tensor needs to be stored.
-            non_evictable_tensors: the stored tensor that cannot be evicted
-            sending_core_id: The id of the core that should transfer the tensor.
-            earliest_t: Earliest timestep at which transfer can happen
-            transfer_bandwidth_fraction: Fraction of the bandwidth to use for the transfer
+            tensor: The tensor to find the storing instance for.
+            suggested_core_id: The core id to suggest for the storing instance.
        """
-
-        def find_transfer_start_and_end_time(links_bw: dict[CommunicationLink, int]):
-            """
-            Given the links to transfer across and corresponding available bandwidths, return the earliest transfer
-            start and end time.
- - Args: - links_bw: link and corresponding transfer bandwidth - """ - slowest_bw = min(links_bw.values()) - transfer_duration = ceil(tensor.size / slowest_bw) - tensor_bw_per_link = {link: [(tensor, link_bw)] for link, link_bw in links_bw.items()} - transfer_start = self.communication_manager.get_links_idle_window( - tensor_bw_per_link=tensor_bw_per_link, - best_case_start=evictions_complete_timestep, - duration=transfer_duration, - ) - transfer_end = transfer_start + transfer_duration - return transfer_start, transfer_end - - def find_earliest_time_for_transfer(links: list[CommunicationLink], bandwidth_fraction: float = 1): - """Find the earliest time at which a tensor transfer between 2 cores can happen.""" - assert 0 < bandwidth_fraction <= 1 - windows: list[tuple[int, int]] = [] - - links_with_bw = {link: ceil(bandwidth_fraction * link.bandwidth) for link in links} - start, end = find_transfer_start_and_end_time(links_with_bw) - windows.append((start, end)) - - ends = [end for _, end in windows] - best_idx = ends.index(min(ends)) - best_window = windows[best_idx] - return best_window - - ################################# STEP 0 ################################# - # Check if the tensor is already on the receiving core - # Get the top instance where the tensor will be transferred to - receiving_core = self.get_core(receiving_core_id) - receiving_top_instance = self.get_top_instance_of_core(receiving_core_id, tensor_operand) - if self.memory_manager.contains(tensor, receiving_top_instance): - return -1, 0, 0, 0, 0, False - - ################################# STEP 1 ################################# - # Get the top instance storing the tensor - # If a sending core id is provided, we get the instance of that core. - # Else, we find the instance where the tensor has been stored the longest - if sending_core_id is not None: - storing_instance = self.get_top_instance_of_core(sending_core_id, tensor.memory_operand) + if suggested_core is not None: + storing_instance = self.get_top_instance_of_core(suggested_core, tensor.memory_operand) assert self.contains_tensor(tensor, storing_instance) available_since_timestep = self.memory_manager.top_instance_available_since_timestep[storing_instance][ tensor.equality_hash() @@ -351,92 +134,156 @@ def find_earliest_time_for_transfer(links: list[CommunicationLink], bandwidth_fr ) ) - ################################# STEP 2 ################################# - # The receiver core has enough space to store the tensor. - earliest_tensor_addition_t = max(earliest_t, available_since_timestep) - enough_space_timestep = self.memory_manager.get_timestep_for_tensor_addition( - tensor, - receiving_core_id, - earliest_tensor_addition_t, - memory_op=tensor_operand, - ) + return storing_instance, available_since_timestep - ################################# STEP 3 ################################# - # Make space on the receiving core by evicting tensors if there was never enough space. 
- ( - evictions_complete_timestep, - eviction_link_energy_cost, - eviction_memory_energy_cost, - ) = self.make_space_for( + def get_available_timestep(self, tensor: Tensor, suggested_core: Core | None): + _, available_since_timestep = self.get_storing_memory_instance_and_timestep(tensor, suggested_core) + return available_since_timestep + + def get_storing_memory_instance(self, tensor: Tensor, suggested_core: Core | None): + storing_instance, _ = self.get_storing_memory_instance_and_timestep(tensor, suggested_core) + return storing_instance + + def get_storing_cores(self, tensor: Tensor, suggested_core: Core | None): + storing_instance, _ = self.get_storing_memory_instance_and_timestep(tensor, suggested_core) + storing_cores = self.memory_manager.cores_per_top_instance[storing_instance] + return storing_cores + + def get_tensors_stored_in_core(self, core: Core, memory_operand: MemoryOperand, timestep: int): + top_instance = self.get_top_instance_of_core(core, memory_operand) + tensors = self.memory_manager.get_tensors_stored_at_timestep(top_instance, timestep) + return tensors + + def core_contains_tensor(self, tensor: Tensor, core: int | Core): + memory_op = tensor.memory_operand + top_instance = self.get_top_instance_of_core(core, memory_op) + assert isinstance(top_instance, MemoryInstance) + return self.memory_manager.contains(tensor, top_instance) + + def contains_tensor(self, tensor: Tensor, top_instance: int | MemoryInstance): + if isinstance(top_instance, int): # assume core id + return self.core_contains_tensor(tensor, top_instance) + assert isinstance(top_instance, MemoryInstance) + return self.memory_manager.contains(tensor, top_instance) + + def find_tensor(self, tensor: Tensor): + return self.memory_manager.find_tensor(tensor) + + def find_tensor_in_top_instances(self, tensor: Tensor): + return self.memory_manager.find_tensor_in_top_instances(tensor) + + def remove_tensor( + self, + tensor: Tensor, + core: Core, + memory_op: MemoryOperand, + timestep: int, + ): + """Remove the tensor from the memory manager's attributes""" + top_instance = self.get_top_instance_of_core(core, memory_op) + self.memory_manager.remove_tensor_from_top_instance( + top_instance, tensor, - receiving_core, - tensor_operand, - enough_space_timestep, - non_evictable_tensors, - transfer_bandwidth_fraction=transfer_bandwidth_fraction, + timestep, ) - ################################# STEP 4 ################################# - # The links between sender and receiver have a long enough idle window. - # TODO If the storing_instance is a shared instance across more than one core, there will be multiple possible - # TODO cores to transfer between. For now, we take the first one - sender_cores = self.memory_manager.cores_per_top_instance[storing_instance] - sender_core = sender_cores[0] - links = self.communication_manager.get_links_for_pair(sender_core, receiving_core) - transfer_start, transfer_end = find_earliest_time_for_transfer( - links, bandwidth_fraction=transfer_bandwidth_fraction - ) + def spawn( + self, + tensor: Tensor, + core: Core, + memory_op: MemoryOperand, + initial_timestep: int, + available_timestep: int, + ): + """Spawns a tensor on a core. + + Args: + tensor: The tensor to be spawned. + core: The core on which to spawn the tensor. + memory_op: The memory operand on the core where the tensor will spawn. + initial_timestep: The timestep at which space will be reserved for the tensor. + available_timestep: The timestep at which the tensor will become available. 
Different from + initial_timestep when it is transferred. + """ + self.memory_manager.add_tensor_to_core(tensor, core, initial_timestep, available_timestep, memory_op) + + def register_tensor_transfer( + self, + tensor: Tensor, + tensor_operand: MemoryOperand, + sending_core: Core, + receiving_core: Core, + transfer_start: int, + transfer_end: int, + transfer_bandwidth_fraction: float, + ): + """Register a tensor transfer between two cores: spawn the tensor on the receiving core, remove it form the + sending core and update the communication links.""" transfer_duration = transfer_end - transfer_start - ################################# STEP 5 ################################# - # Spawn the tensor on the receiving core + # Spawn at the receiving core self.spawn(tensor, receiving_core, tensor_operand, transfer_start, transfer_end) - ################################# STEP 6 ################################# - # Update the links involved in the communication and get the transfer energy cost + # Register transfer sending core -> receiving core ( transfer_link_energy_cost, transfer_memory_energy_cost, ) = self.communication_manager.update_links( tensor, - sender_core.id, - receiving_core_id, + sending_core.id, + receiving_core.id, tensor_operand, transfer_start, transfer_duration, link_bw_fraction=transfer_bandwidth_fraction, ) - ################################# STEP 7 ################################# - # Remove the transferred tensor from the sender core (excluding DRAM) - # if it is no longer needed. - if sender_core.id == self.offchip_core_id: - pass - # Don't remove it from the producing core - else: - not_on_producing_core = sender_core.id != tensor.origin.chosen_core_allocation + # Remove from sending core (except if it is offchip) + if sending_core.id != self.offchip_core_id: + not_on_producing_core = sending_core.id != tensor.origin.chosen_core_allocation + storing_instance = self.get_storing_memory_instance(tensor, sending_core) tensor_priority = tensor.get_instance_priority(storing_instance, self.memory_manager) if not_on_producing_core and tensor_priority == 0: - self.remove( - tensor, - sender_core, - tensor.memory_operand, - transfer_end, - transfer_bandwidth_fraction=transfer_bandwidth_fraction, - write_back_to_offchip=False, - ) + self.remove_tensor(tensor, sending_core, memory_op=tensor.memory_operand, timestep=transfer_end) - ################################# STEP 8 ################################# - # Give back flag that signals if the tensor came from offchip - came_from_offchip = sender_core.id == self.offchip_core_id - return ( - transfer_end, - transfer_link_energy_cost, - transfer_memory_energy_cost, - eviction_link_energy_cost, - eviction_memory_energy_cost, - came_from_offchip, + return transfer_link_energy_cost, transfer_memory_energy_cost + + def find_earliest_time_for_transfer( + self, tensor: Tensor, sending_core: Core, receiving_core: Core, earliest_t: int, bandwidth_fraction: float = 1 + ): + """Find the earliest time >= `earliest_t` at which a tensor transfer between 2 cores can happen.""" + assert 0 < bandwidth_fraction <= 1 + windows: list[tuple[int, int]] = [] + + links = self.communication_manager.get_links_for_pair(sending_core, receiving_core) + links_with_bw = {link: ceil(bandwidth_fraction * link.bandwidth) for link in links} + start, end = self.find_transfer_start_and_end_time(tensor, links_with_bw, earliest_t) + windows.append((start, end)) + + ends = [end for _, end in windows] + best_idx = ends.index(min(ends)) + best_window = windows[best_idx] + 
return best_window + + def find_transfer_start_and_end_time(self, tensor: Tensor, links_bw: dict[CommunicationLink, int], earliest_t: int): + """ + Given the links to transfer across and corresponding available bandwidths, return the earliest transfer start + and end time for this tensor. + + Args: + tensor: The tensor to transfer + links_bw: link and corresponding transfer bandwidth + """ + slowest_bw = min(links_bw.values()) + transfer_duration = ceil(tensor.size / slowest_bw) + tensor_bw_per_link = {link: [(tensor, link_bw)] for link, link_bw in links_bw.items()} + transfer_start = self.communication_manager.get_links_idle_window( + tensor_bw_per_link=tensor_bw_per_link, + best_case_start=earliest_t, + duration=transfer_duration, ) + transfer_end = transfer_start + transfer_duration + return transfer_start, transfer_end def get_memory_energy_cost_of_transfer( self, @@ -474,70 +321,18 @@ def block_offchip_links( ) -> int: return self.communication_manager.block_offchip_links(too_large_operands, core_id, start_timestep, duration, cn) - def contains_tensor(self, tensor: Tensor, top_instance: int | MemoryInstance): - if isinstance(top_instance, int): # assume core id - memory_op = tensor.memory_operand - top_instance = self.get_top_instance_of_core(top_instance, memory_op) - assert isinstance(top_instance, MemoryInstance) - return self.memory_manager.contains(tensor, top_instance) - - def find_tensor(self, tensor: Tensor): - return self.memory_manager.find_tensor(tensor) + @property + def core_list(self) -> list[Core]: + return list(self.cores.node_list) - def find_tensor_in_top_instances(self, tensor: Tensor): - return self.memory_manager.find_tensor_in_top_instances(tensor) + def __str__(self) -> str: + return f"Accelerator({self.name})" - def has_shared_memory(self, core_id_a: int, core_id_b: int, mem_op_a: MemoryOperand, mem_op_b: MemoryOperand): - """Check whether two cores have a shared top level memory instance for a given memory operand. + def __repr__(self) -> str: + return str(self) - Args: - core_id_a : The first core id. - core_id_b : The second core id. - mem_op_a : The memory operand for the tensor in core a. - mem_op_b : The memory operand for the tensor in core b. 
+ def __jsonrepr__(self) -> dict[str, Any]: """ - core_a = self.get_core(core_id_a) - core_b = self.get_core(core_id_b) - top_memory_instance_a = next( - ( - ml.memory_instance - for ml, out_degree in core_a.memory_hierarchy.out_degree() - if out_degree == 0 and mem_op_a in ml.operands - ) - ) - top_memory_instance_b = next( - ( - ml.memory_instance - for ml, out_degree in core_b.memory_hierarchy.out_degree() - if out_degree == 0 and mem_op_b in ml.operands - ) - ) - return top_memory_instance_a is top_memory_instance_b - - def get_top_instances_of_core(self, core: int | Core) -> list[MemoryInstance]: - if isinstance(core, int): - core = self.get_core(core) - top_instances = self.memory_manager.top_instances_per_core[core] - return top_instances - - def get_top_instance_of_core(self, core: Core | int, mem_op: MemoryOperand) -> MemoryInstance: - if isinstance(core, int): - core = self.get_core(core) - top_instances = self.memory_manager.top_instances_per_core[core] - for instance in top_instances: - core_idx = self.memory_manager.cores_per_top_instance[instance].index(core) - instance_mem_ops = self.memory_manager.memory_operands_per_top_instance[instance][core_idx] - if mem_op in instance_mem_ops: - return instance - raise ValueError(f"No top instance for {core} with memory operand {mem_op}.") - - def get_spatial_mapping_from_core(self, core_allocation: list[int]) -> SpatialMapping: - """Iff the dataflows of all given cores is the same, return that dataflow. Otherwise, throw an error""" - all_dataflows = [self.get_core(core_id).dataflows for core_id in core_allocation] - some_dataflow = all_dataflows.pop() - - # All cores have same dataflow - if some_dataflow is not None and all(some_dataflow == dataflow for dataflow in all_dataflows): - return some_dataflow - - raise ValueError("Unclear which dataflow to return or no valid dataflow found.") + JSON representation used for saving this object to a json file. + """ + return {"name": self.name, "cores": self.cores} diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index 08955802..724787bb 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -826,7 +826,7 @@ def get_node_tensor( ) node_tensor = node_tensor.extend_with_node(bounded_op_dim_ranges, tile) - if nb_unique_data_seen < (prod(tensor_shapes[op]) * precision): + if nb_unique_data_seen < (prod(tensor_shapes) * precision): logger.warning(f"Downsampling node detected: {node}, operand= {op}.") # The dimensionality order of this input/output operand might include diff --git a/stream/workload/computation/computation_node.py b/stream/workload/computation/computation_node.py index 6395f1ab..a8a3e504 100644 --- a/stream/workload/computation/computation_node.py +++ b/stream/workload/computation/computation_node.py @@ -136,6 +136,9 @@ def get_operand_tensor_reshape_default(self) -> OperandTensorReshape | None: except KeyError: return None + def get_output_tensor(self) -> Tensor: + return self.operand_tensors[self.output_operand] + def get_total_inter_core_splits(self) -> int: """Return the total number of inter-core splits for this node, i.e. 
over how many cores this node is split""" if contains_wildcard(self.inter_core_tiling): From fec0f5c7ffd9f1862644dc1ffd1a1802f87cb65e Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Sat, 4 Jan 2025 19:52:52 +0100 Subject: [PATCH 46/49] use different bw fraction for evictions --- stream/cost_model/scheduler.py | 143 ++++++++------------ stream/hardware/architecture/accelerator.py | 12 ++ stream/workload/node.py | 9 +- 3 files changed, 73 insertions(+), 91 deletions(-) diff --git a/stream/cost_model/scheduler.py b/stream/cost_model/scheduler.py index e4b84093..92491a05 100644 --- a/stream/cost_model/scheduler.py +++ b/stream/cost_model/scheduler.py @@ -60,15 +60,6 @@ def __init__( self.link_energy: dict[TransferCause, float] = defaultdict(lambda: 0) self.memory_energy: dict[TransferCause, float] = defaultdict(lambda: 0) - # self.total_cn_offchip_link_energy = 0 - # self.total_cn_offchip_memory_energy = 0 - # self.total_eviction_to_offchip_link_energy = 0 - # self.total_eviction_to_offchip_memory_energy = 0 - # self.total_sink_layer_output_offchip_link_energy = 0 - # self.total_sink_layer_output_offchip_memory_energy = 0 - # self.total_core_to_core_link_energy = 0 - # self.total_core_to_core_memory_energy = 0 - # Remains constant throughout the scheduling self.sink_layer_nodes = self.get_sink_layer_nodes() self.offchip_core = accelerator.get_offchip_core() @@ -77,6 +68,7 @@ def __init__( # Initialize bookkeeping self.nb_scheduled_nodes = 0 self.scheduled_nodes: set[ComputationNode] = set() + self.bw_fraction_to_use_for_tensor: dict[Tensor, float] = {} self.candidates = self.get_initial_candidates() self.initialize_tensor_priorities() self.initialize_offchip_tensors() @@ -132,7 +124,7 @@ def run(self): while not done: best_candidate, preds_end = self.pop_best_candidate() tensors_this_candidate_needs, tensors_operands = self.get_tensors_needed_for_node(best_candidate) - core = self.get_core_for_node(best_candidate) + core = self.get_allocated_core(best_candidate) transfer_bw_fraction = self.get_transfer_bandwidth_fraction(best_candidate) # Step 0: get the start time: when core is available or predecessors finished @@ -176,7 +168,6 @@ def run(self): output_memory_operand, timestep, tensors_this_candidate_needs, - transfer_bandwidth_fraction=transfer_bw_fraction, ) timestep = transfer_complete_timestep @@ -214,6 +205,7 @@ def run(self): ) # Step 7: finish this round + self.bw_fraction_to_use_for_tensor[output_tensor] = transfer_bw_fraction self.extend_candidates(best_candidate) nb_scheduled_nodes += 1 done = nb_scheduled_nodes == self.nb_graph_nodes @@ -221,16 +213,12 @@ def run(self): self.latency = self.get_total_latency() return self.latency - def get_transfer_bandwidth_fraction(self, node: ComputationNode): - """Get the fraction of the off-chip bandwidth to be used for the tensor transfers related to this node""" - return 1 / node.get_total_inter_core_splits() - def prefetch_constant_operands(self): """Load the `operands_to_prefetch` to the cores they belong to.""" for n in self.G.node_list: for op, tensor in n.operand_tensors.items(): if op in n.constant_operands and op in self.operands_to_prefetch: - core = self.get_core_for_node(n) + core = self.get_allocated_core(n) memory_op = n.memory_operand_links.layer_to_mem_op(op) if not self.accelerator.core_contains_tensor(tensor, core): self.schedule_tensor_transfer( @@ -279,12 +267,6 @@ def sync_cores_idle_from( for core_id in self.cores_idle_from: self.cores_idle_from[core_id] = max_idle_time - def get_core_for_node(self, node: 
ComputationNode): - """Get the core this candidate will be scheduled on""" - core_id = node.chosen_core_allocation - assert core_id is not None - return self.accelerator.get_core(core_id) - def get_tensors_needed_for_node(self, node: ComputationNode): """Determine all the tensors needed to compute a node. The node might need multiple outputs from previous nodes, depending on the graph. @@ -331,51 +313,29 @@ def clear_memories( exceptions: list[Tensor] = [], transfer_bandwidth_fraction: float = 1, ): - for too_large_operand in memory_operands: - timestep = self.remove_all( - core=core, - memory_operand=too_large_operand, - timestep=timestep, - exceptions=exceptions, - transfer_bandwidth_fraction=transfer_bandwidth_fraction, - write_back_to_offchip=True, - ) - return timestep - - def remove_all( - self, - core: Core, - memory_operand: MemoryOperand, - timestep: int, - exceptions: list[Tensor] = [], - transfer_bandwidth_fraction: float = 1, - write_back_to_offchip: bool = False, - ): - """Remove all tensors from a core's memory with the given memory operand. - If required, the tensors are written back to offchip before removal. + """Remove all tensors from a core's memory for the given memory operands. + All tensors are written back to offchip before removal. Args: - core (Core): The Core to remove the tensor from + core: The Core to remove the tensor from memory_operand: The memory operand for which all tensors should be evicted. timestep: The timestep to remove the tensor at. exceptions: A list of tensors that should not be evicted. transfer_bandwidth_fraction: Fraction of the bandwidth to use for the transfers. - write_back_to_offchip (bool, optional): Write the tensor to offchip before removal. Defaults to False. """ - stored_tensors = self.accelerator.get_tensors_stored_in_core(core, memory_operand, timestep) - - for tensor in stored_tensors: - if tensor not in exceptions: - timestep = self.schedule_tensor_removal( - tensor_to_remove=tensor, - core_to_remove_from=core, - memory_op=memory_operand, - timestep=timestep, - transfer_bandwidth_fraction=transfer_bandwidth_fraction, - write_back_to_offchip=write_back_to_offchip, - transfer_cause=TransferCause.EVICTION, - ) - + for memory_operand in memory_operands: + stored_tensors = self.accelerator.get_tensors_stored_in_core(core, memory_operand, timestep) + for tensor in stored_tensors: + if tensor not in exceptions: + timestep = self.schedule_tensor_removal( + tensor_to_remove=tensor, + core_to_remove_from=core, + memory_op=memory_operand, + timestep=timestep, + transfer_bandwidth_fraction=transfer_bandwidth_fraction, + write_back_to_offchip=True, + transfer_cause=TransferCause.EVICTION, + ) return timestep def schedule_tensor_removal( @@ -432,16 +392,6 @@ def schedule_tensor_transfer( ): """Find the earliest time to transfer the tensor to the receiving core, and register the transfer. 
Evictions of older tensors might be necessary - - Args: - tensor - receiving_core - tensor_operand - transfer_cause - non_evictable_tensors - sending_core - earliest_t - transfer_bandwidth_fraction """ if self.accelerator.core_contains_tensor(tensor, receiving_core): @@ -457,7 +407,6 @@ def schedule_tensor_transfer( memory_op=tensor_operand, timestep=earliest_tensor_addition_t, tensors_to_avoid_evicting=non_evictable_tensors, - transfer_bandwidth_fraction=transfer_bandwidth_fraction, ) # Find idle window between sender and receiver cores @@ -502,32 +451,28 @@ def make_space_for_tensor( memory_op: MemoryOperand, timestep: int, tensors_to_avoid_evicting: list[Tensor] = [], - transfer_bandwidth_fraction: float = 1, ): """Make space for the given tensor on the given core by evicting already stored tensors if necessary. Args: tensor: The tensor to make space for. - core (Core): The core where the tensor will be stored. - memory_operand: The memory operand on the core. + core: The core where the tensor will be stored. + memory_op: The memory operand on the core. timestep: The timestep at which to make space for. - transfer_bandwidth_fraction: Fraction of the bandwidth to use for the transfer. + tensors_to_avoid_evicting: A list of tensors that should not be evicted. """ - - top_instance = self.accelerator.get_top_instance_of_core(core, memory_op) - # Earliest timestep when the core has enough space, or the latest timestep if this is never the case enough_space_timestep = self.accelerator.memory_manager.get_timestep_for_tensor_addition( tensor=tensor, core=core, timestep=timestep, - memory_op=tensor.memory_operand, + memory_op=memory_op, ) - tensors_to_evict = self.accelerator.memory_manager.find_best_tensor_combination_to_evict_fast( - top_instance, - tensor, - enough_space_timestep, + tensors_to_evict = self.accelerator.find_best_tensor_combination_to_evict_fast( + tensor=tensor, + core=core, + timestep=enough_space_timestep, exceptions=tensors_to_avoid_evicting, ) @@ -535,11 +480,12 @@ def make_space_for_tensor( raise ValueError("Evictions required in offchip memory. Consider making offchip larger.") for tensor_to_evict in tensors_to_evict: + transfer_bandwidth_fraction = self.get_transfer_bandwidth_fraction_for_eviction(tensor_to_evict, timestep) t_eviction_complete = self.schedule_tensor_removal( tensor_to_remove=tensor_to_evict, core_to_remove_from=core, memory_op=memory_op, - timestep=timestep, + timestep=timestep, # TODO should this be `enough_space_timestep`? transfer_bandwidth_fraction=transfer_bandwidth_fraction, write_back_to_offchip=True, transfer_cause=TransferCause.EVICTION, @@ -648,7 +594,7 @@ def check_for_removal( for n in self.G.successors(origin) if n.chosen_core_allocation in core_ids_of_instance and n.id != origin.id ] - end_times = [n.end for n in nodes_that_needed_tensor if n.end is not None] + end_times = [n.end for n in nodes_that_needed_tensor if n.end >= 0] max_end_time = max(end_times, default=timestep_for_removal) # assert max_end_time != -1, "There should be at least one successor." 
timestep_for_removal = max_end_time @@ -680,6 +626,33 @@ def get_total_latency(self): links_end_time = max([event.end for event in self.accelerator.communication_manager.events], default=0) return max(cns_end_time, links_end_time) + def get_allocated_core(self, node: ComputationNode): + """Get the core this candidate will be scheduled on""" + core_id = node.chosen_core_allocation + assert core_id is not None + return self.accelerator.get_core(core_id) + + def get_transfer_bandwidth_fraction(self, node: ComputationNode): + """Get the fraction of the off-chip bandwidth to be used for the tensor transfers related to this node""" + return 1 / node.get_total_inter_core_splits() + + def get_transfer_bandwidth_fraction_for_eviction(self, tensor: Tensor, timestep: int): + """Get the fraction of the off-chip bandwidth to be used to evict this tensor at the given timestep. + Instead of using the total inter-core splits of the current node, we use the number of cores that store a tensor + of the same layer and memory operand at the given timestep. + # TODO check for given timestep + """ + + def contains_related_tensor(tensors: list[Tensor]): + return any(t.origin.id == tensor.origin.id and t.memory_operand == tensor.memory_operand for t in tensors) + + instances_storing_related_tensor = [ + instance + for instance, tensors in self.accelerator.memory_manager.top_instance_stored_tensors.items() + if contains_related_tensor(tensors) + ] + return 1 / len(instances_storing_related_tensor) + @property def total_cn_offchip_link_energy(self): return self.link_energy[TransferCause.OFF_CHIP] diff --git a/stream/hardware/architecture/accelerator.py b/stream/hardware/architecture/accelerator.py index 406ed2f0..a8b1ef3d 100644 --- a/stream/hardware/architecture/accelerator.py +++ b/stream/hardware/architecture/accelerator.py @@ -172,6 +172,18 @@ def find_tensor(self, tensor: Tensor): def find_tensor_in_top_instances(self, tensor: Tensor): return self.memory_manager.find_tensor_in_top_instances(tensor) + def find_best_tensor_combination_to_evict_fast( + self, tensor: Tensor, core: Core, timestep: int, exceptions: list[Tensor] = [] + ): + top_instance = self.get_top_instance_of_core(core, tensor.memory_operand) + tensors_to_evict = self.memory_manager.find_best_tensor_combination_to_evict_fast( + top_instance=top_instance, + tensor_to_add=tensor, + timestep=timestep, + exceptions=exceptions, + ) + return tensors_to_evict + def remove_tensor( self, tensor: Tensor, diff --git a/stream/workload/node.py b/stream/workload/node.py index 3ec269da..1ef3c201 100644 --- a/stream/workload/node.py +++ b/stream/workload/node.py @@ -47,13 +47,10 @@ def __init__( self.core_allocation_is_fixed = core_allocation_is_fixed self.chosen_core_allocation = chosen_core_allocation self.input_names = input_names - # will be set by the scheduler - self.start = None - # will be set by the scheduler - self.end = None - # number of data (in bits) only this node consumes (not consumed by any other node) + self.start = -1 + self.end = -1 + # number of data (in bits) only this node produces/consumes (not produced/consumed by any other node) self.data_consumed_unique = 0 - # number of data (in bits) only this node produces (not produced by any other node) self.data_produced_unique = 0 def get_total_energy(self) -> float: From 765de8a432f38cc8b12a4cd80fb5ed050e91178b Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Sat, 4 Jan 2025 19:55:00 +0100 Subject: [PATCH 47/49] fix indentation error in downsampling warning --- 
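
The re-indented warning below is triggered when the tiles generated for a node together touch fewer unique bits than the full operand tensor holds. A minimal sketch of that check, with made-up numbers standing in for the values the real code derives from the node tensors:

    from math import prod

    # Hypothetical operand: a 4x4 tensor at 8-bit precision.
    tensor_shapes = (4, 4)
    precision = 8  # bits per element

    # Suppose the generated tiles only ever cover 12 of the 16 elements.
    nb_unique_data_seen = 12 * precision

    if nb_unique_data_seen < prod(tensor_shapes) * precision:
        print("Downsampling node detected: some elements are never touched by any tile")
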
stream/stages/generation/tiled_workload_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index 724787bb..54ecf4f9 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -826,8 +826,8 @@ def get_node_tensor( ) node_tensor = node_tensor.extend_with_node(bounded_op_dim_ranges, tile) - if nb_unique_data_seen < (prod(tensor_shapes) * precision): - logger.warning(f"Downsampling node detected: {node}, operand= {op}.") + if nb_unique_data_seen < (prod(tensor_shapes) * precision): + logger.warning(f"Downsampling node detected: {node}, operand= {op}.") # The dimensionality order of this input/output operand might include # both a G and C/K dimension because the ComputationNode gets the group as an extra From 609a601b6ebdea46396ba6122db897ed7e0175bb Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Sun, 5 Jan 2025 12:27:13 +0100 Subject: [PATCH 48/49] fix bug in padding tile sizes --- .../generation/tiled_workload_generation.py | 51 +++++++++---------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index 54ecf4f9..d5810c16 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -203,6 +203,10 @@ def get_outer_tmap_loop_dimensions(self, node: ComputationNode) -> list[Temporal return outer_loops + def get_total_outer_size(self, outer_temporal_loops: list[TemporalLoop], dim: LayerDim): + """Return the total outer temporal size for the given dim.""" + return prod([loop.size for loop in outer_temporal_loops if loop.dimension == dim]) + def get_non_type_predecessors(self, node: Node, types: list[type]) -> list[Node]: """Find all self.workload nodes that are not of any type in types. If a node of any type in types is a predecessor, we cascade back through the graph until only non-types type @@ -226,10 +230,9 @@ def get_non_type_predecessors(self, node: Node, types: list[type]) -> list[Node] return preds def get_mandatory_divisors(self, node: ComputationNode) -> dict[LayerDim, set[int]]: - """Get the factors by which the smaller tiles' dimensions must be divisible. - Tile dimensions must be divisible by all the inter-core tiling factors of the nodes within the same layer stack. - This ensures dependencies between tiles within the stack do not cross the layer stack boundaries. - # TODO can nodes within the same stack have different intra-core tiling? This is not accounted for + """Get the factors by which the (padded) dimensions of this node must be divisible. The dimensions must be + divisible by the outer loops of all nodes in the same layer stack. This ensures dependencies between tiles + within the stack do not cross the layer stack boundaries. """ divisors: dict[LayerDim, set[int]] = defaultdict(lambda: set()) @@ -247,14 +250,11 @@ def get_mandatory_divisors(self, node: ComputationNode) -> dict[LayerDim, set[in if n.id in curr_stack and n.id != node.id and isinstance(n, ComputationNode) ] - for curr_node in other_nodes_in_stack: - assert len(curr_node.inter_core_tiling) == len( - set(dim for dim, _ in curr_node.inter_core_tiling) - ), "Inter-core tiling contains duplicate dimensions. 
The divisors for this node must be multiplied" - - for layer_dim, factor in curr_node.inter_core_tiling: - if isinstance(factor, int): - divisors[layer_dim].add(factor) + for other_node in other_nodes_in_stack: + outer_sizes = self.get_outer_tmap_loop_dimensions(other_node) + for layer_dim in other_node.layer_dims: + total_outer_size = self.get_total_outer_size(outer_sizes, layer_dim) + divisors[layer_dim].add(total_outer_size) return divisors def get_tiles( @@ -264,25 +264,24 @@ def get_tiles( mandatory_divisors: dict[LayerDim, set[int]] = {}, ) -> tuple[list[ComputationNode], list[ComputationNode]]: - def get_total_outer_size(dim: LayerDim): - return prod([loop.size for loop in outer_temporal_loops if loop.dimension == dim]) + # def get_lcm(n: int, divisors: set[int]) -> int: + # """Make n divisible by all the divisors in the set.""" + # for divisor in divisors: + # if n % divisor != 0: + # n = ceil(n / divisor) * divisor + # return n + + def pad_until_divisible(layer_dim: LayerDim, n: int) -> int: + """Return x >= n such that x is divisible by `total_outer_size`, as well as by all `mandatory_divisors` + (coming from the inter-core tiling of other nodes within the same stack)""" + total_outer_size = self.get_total_outer_size(outer_temporal_loops, layer_dim) + all_divisors = list(mandatory_divisors[layer_dim]) + [total_outer_size] - def get_lcm(n: int, divisors: set[int]) -> int: - """Make n divisible by all the divisors in the set.""" - for divisor in divisors: + for divisor in all_divisors: if n % divisor != 0: n = ceil(n / divisor) * divisor return n - def pad_until_divisible(layer_dim: LayerDim, n: int) -> int: - """Return x >= n such that x is divisible by `total_outer_size`, and `x // total_outer_size` divisible by - all mandatory divisors (coming from the inter-core tiling of other nodes within the same stack)""" - total_outer_size = get_total_outer_size(layer_dim) - inner_size = ceil(n / total_outer_size) - inner_size_padded = get_lcm(inner_size, mandatory_divisors[layer_dim]) - x = inner_size_padded * total_outer_size - return x - # Pad the layer_dim_sizes to be divisible by the mandatory divisors (coming from the outer_temporal_loops) tile_attrs = original_node.extract_node_attr() for dim, size in tile_attrs.layer_dim_sizes.items(): From 6261e17dd7d9981cb0e81fd5b582966dd3703e22 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Sun, 5 Jan 2025 13:00:23 +0100 Subject: [PATCH 49/49] fix bug in eviction BW fraction: don't count DRAM --- stream/cost_model/scheduler.py | 6 +++--- stream/stages/generation/tiled_workload_generation.py | 7 ------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/stream/cost_model/scheduler.py b/stream/cost_model/scheduler.py index 92491a05..e2155f05 100644 --- a/stream/cost_model/scheduler.py +++ b/stream/cost_model/scheduler.py @@ -63,6 +63,7 @@ def __init__( # Remains constant throughout the scheduling self.sink_layer_nodes = self.get_sink_layer_nodes() self.offchip_core = accelerator.get_offchip_core() + self.offchip_top_instances = self.accelerator.get_top_instances_of_core(self.offchip_core) self.nb_graph_nodes = G.number_of_nodes() # Initialize bookkeeping @@ -95,7 +96,6 @@ def initialize_tensor_priorities(self): def initialize_offchip_tensors(self): """Add the constant operand tensors of all nodes to the off-chip initially.""" - offchip_top_instances = self.accelerator.get_top_instances_of_core(self.offchip_core) for n in self.G.node_list: for op, tensor in n.operand_tensors.items(): # For constant operands or inputs of first node @@ 
-103,7 +103,7 @@ def initialize_offchip_tensors(self): if not any( ( self.accelerator.contains_tensor(tensor, offchip_top_instance) - for offchip_top_instance in offchip_top_instances + for offchip_top_instance in self.offchip_top_instances ) ): memory_op = n.memory_operand_links.layer_to_mem_op(op) @@ -649,7 +649,7 @@ def contains_related_tensor(tensors: list[Tensor]): instances_storing_related_tensor = [ instance for instance, tensors in self.accelerator.memory_manager.top_instance_stored_tensors.items() - if contains_related_tensor(tensors) + if contains_related_tensor(tensors) and instance not in self.offchip_top_instances ] return 1 / len(instances_storing_related_tensor) diff --git a/stream/stages/generation/tiled_workload_generation.py b/stream/stages/generation/tiled_workload_generation.py index d5810c16..c09972e8 100644 --- a/stream/stages/generation/tiled_workload_generation.py +++ b/stream/stages/generation/tiled_workload_generation.py @@ -264,13 +264,6 @@ def get_tiles( mandatory_divisors: dict[LayerDim, set[int]] = {}, ) -> tuple[list[ComputationNode], list[ComputationNode]]: - # def get_lcm(n: int, divisors: set[int]) -> int: - # """Make n divisible by all the divisors in the set.""" - # for divisor in divisors: - # if n % divisor != 0: - # n = ceil(n / divisor) * divisor - # return n - def pad_until_divisible(layer_dim: LayerDim, n: int) -> int: """Return x >= n such that x is divisible by `total_outer_size`, as well as by all `mandatory_divisors` (coming from the inter-core tiling of other nodes within the same stack)"""
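
The padding fix of patch 48, whose leftover get_lcm comment is removed just above, amounts to rounding a layer dimension up until it is divisible by its own total outer temporal size and by the outer sizes of every other node in the same layer stack. One way to sketch that rule, combining the divisors with math.lcm instead of the per-divisor rounding loop used in pad_until_divisible (sizes are made up):

    from math import ceil, lcm

    def pad_until_divisible(size: int, divisors: list[int]) -> int:
        # Round `size` up to the nearest multiple of lcm(divisors), so the padded
        # dimension is divisible by every divisor at once (divisors assumed non-empty).
        multiple = lcm(*divisors)
        return ceil(size / multiple) * multiple

    # e.g. a dimension of 30 that must split into 4 outer tiles here and 7 outer
    # tiles in a neighbouring node of the same stack (hypothetical numbers):
    print(pad_until_divisible(30, [4, 7]))  # -> 56
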
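For the headline change of patch 49: with the off-chip instances excluded, the bandwidth fraction used to evict a tensor becomes one over the number of on-chip top memory instances that currently hold a tensor from the same layer with the same memory operand. A small self-contained sketch of that rule, using hypothetical stand-ins for Stream's Tensor and memory-manager bookkeeping rather than the real classes:

    from dataclasses import dataclass

    @dataclass(frozen=True)
    class FakeTensor:
        origin_id: int       # id of the layer that produced the tensor
        memory_operand: str  # e.g. "O", "I1", "I2"

    def eviction_bw_fraction(tensor, stored_per_instance, offchip_instances):
        # On-chip top instances that hold a tensor of the same layer and operand;
        # each of them will evict a copy, so the off-chip link bandwidth is split.
        related = [
            inst
            for inst, tensors in stored_per_instance.items()
            if inst not in offchip_instances
            and any(t.origin_id == tensor.origin_id and t.memory_operand == tensor.memory_operand
                    for t in tensors)
        ]
        return 1 / len(related)

    tensor = FakeTensor(origin_id=3, memory_operand="O")
    stored = {
        "core0_l1": [FakeTensor(3, "O")],
        "core1_l1": [FakeTensor(3, "O")],
        "dram":     [FakeTensor(3, "O")],
    }
    print(eviction_bw_fraction(tensor, stored, offchip_instances={"dram"}))  # -> 0.5
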