diff --git a/stream/classes/cost_model/scheduler.py b/stream/classes/cost_model/scheduler.py
index eb8505a3..f10acf00 100644
--- a/stream/classes/cost_model/scheduler.py
+++ b/stream/classes/cost_model/scheduler.py
@@ -81,7 +81,7 @@ def get_best_candidate(candidates: list[ComputationNode], scheduling_order: list
     if not candidates:
         raise ValueError("There are no candidates to schedule.")
     preds_ends, cn_candidates = zip(*candidates)
-    idxs = [scheduling_order.index(n.id) for n in cn_candidates]
+    idxs = [scheduling_order.index((n.id, n.sub_id)) for n in cn_candidates]
     best_candidate_idx = idxs.index(min(idxs))
     best_candidate = cn_candidates[best_candidate_idx]
     preds_end = preds_ends[best_candidate_idx]
@@ -264,7 +264,7 @@ def schedule_graph(
     offchip_core_id = accelerator.offchip_core_id
     offchip_core = accelerator.get_core(offchip_core_id)

-    ## Schedule preparation:
+    # Schedule preparation:
     # 1. Initialize the memory instance priorities for each tensor
     initialize_priorities(G, accelerator)
     # 2. Add the constant operand tensors of all nodes to the off-chip initially
@@ -292,9 +292,9 @@ def schedule_graph(
         core = accelerator.get_core(core_id)
         # Earliest start time is when core is available or predecessors finished
         start = max(cores_idle_from[core_id], preds_end)
-        ## Step 0
+        # Step 0
         tensors_this_candidate_needs, tensors_operands = get_tensors_needed_for_node(best_candidate, G)
-        ## Step 1
+        # Step 1
         # There could be operands that are too large to store in the highest memory on the core
         # The tensors stored in these memories should be evicted and potentially written back to off-chip
         # Clear these memories (this might delay the potential start time if things have to written to off-chip)
@@ -312,7 +312,7 @@
         )
         total_eviction_to_offchip_link_energy += clear_link_energy
         total_eviction_to_offchip_memory_energy += clear_memory_energy
-        ## Step 2
+        # Step 2
         # The computation might need tensors that are currently not present in the core's memories
         # We need to fetch these tensors from either off-chip or from the core where they are present
         # Transfer these tensors from wherever they are currently residing to this core
@@ -343,7 +343,7 @@
             total_eviction_to_offchip_link_energy += eviction_link_energy_cost
             total_eviction_to_offchip_memory_energy += eviction_memory_energy_cost

-        ## Step 3
+        # Step 3
         # Check if we had any operands that were too large to store in the core's memory, block the relevant off-chip link for the duration
         # This might again delay the execution if the offchip link was already blocked by another core
         timestep = accelerator.block_offchip_links(
@@ -354,7 +354,7 @@
             best_candidate,
         )

-        ## Step 4
+        # Step 4
         # Make space for the output tensor of this computation node and spawn it when evictions are complete
         # If the output operand is in the too large operands, add it to off-chip, otherwise add it to this core's output memory
         output_layer_operand = best_candidate.output_operand
@@ -387,7 +387,7 @@
             available_timestep=end,
         )

-        ## Step 5
+        # Step 5
         # Update the start and end time of the node
         best_candidate.set_start(start)
         best_candidate.set_end(end)
@@ -400,7 +400,7 @@
         # Add this node to the scheduled nodes
         scheduled_nodes.add(best_candidate)

-        ## Step 6
+        # Step 6
         # Memory usage: When the node ends:
         # Decrease the priority of all the tensors this node used
         decrease_priority(tensors_this_candidate_needs, tensors_operands, accelerator, best_candidate)
@@ -413,7 +413,7 @@
             end,
         )

-        ## Step 7
+        # Step 7
         # Memory usage: When the node ends:
         # If this node is a sink node (node that has no successors and that produces a final output), transfer final outputs to offchip
         if best_candidate in sink_layer_nodes:
@@ -433,7 +433,7 @@
             total_sink_layer_output_offchip_link_energy += link_energy_cost
             total_sink_layer_output_offchip_memory_energy += memory_energy_cost

-        ## Step 8
+        # Step 8
         # For each successor of this node, check if all of its predecessors have been scheduled
         for successor in sorted(G.successors(best_candidate)):
             if all((pred in scheduled_nodes for pred in G.predecessors(successor))):
@@ -448,7 +448,7 @@
         nb_scheduled_nodes += 1
         done = nb_scheduled_nodes == nb_graph_nodes

-    ## Step 9
+    # Step 9
     # The total schedule latency is the max of all CN end times and the link end times
     cns_end_time = max((n.end for n in G.nodes()))
     links_end_time = max([event.end for event in accelerator.communication_manager.events], default=0)
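Note on the scheduler change above: the scheduling order is now keyed on (id, sub_id) pairs instead of bare node ids, so finer CNs that were split from the same original layer (and therefore share an id) each resolve to a distinct position. A minimal sketch of the lookup, using SimpleNamespace as a hypothetical stand-in for ComputationNode:

    # Illustrative sketch only; SimpleNamespace stands in for ComputationNode.
    from types import SimpleNamespace

    cn_candidates = [SimpleNamespace(id=2, sub_id=1), SimpleNamespace(id=2, sub_id=0)]
    # Mirrors get_scheduling_order in GenerateCNWorkloadHybridStage further down.
    scheduling_order = sorted(((n.id, n.sub_id) for n in cn_candidates), reverse=True)
    # With bare ids, both candidates would map to the same index; the pair gives
    # each finer node its own position in the order.
    idxs = [scheduling_order.index((n.id, n.sub_id)) for n in cn_candidates]
    assert idxs == [0, 1]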
diff --git a/stream/classes/opt/allocation/genetic_algorithm/fitness_evaluator.py b/stream/classes/opt/allocation/genetic_algorithm/fitness_evaluator.py
index ad189f2c..327ca38a 100644
--- a/stream/classes/opt/allocation/genetic_algorithm/fitness_evaluator.py
+++ b/stream/classes/opt/allocation/genetic_algorithm/fitness_evaluator.py
@@ -102,7 +102,7 @@ def set_node_core_allocations(self, core_allocations: list[int]):
             offchip_energy = 0
             for too_large_operand in too_large_operands:
                 layer_operand = next(
-                    (k for (k, v) in cme.layer.memory_operand_links.items() if v == too_large_operand)
+                    (k for (k, v) in cme.layer.memory_operand_links.data.items() if v == too_large_operand)
                 )
                 layer_operand_offchip_energy = cme.mem_energy_breakdown[layer_operand][-1]
                 offchip_energy += layer_operand_offchip_energy
diff --git a/stream/classes/stages/GenerateCNWorkloadHybridStage.py b/stream/classes/stages/GenerateCNWorkloadHybridStage.py
index 58497b8e..d1c57408 100644
--- a/stream/classes/stages/GenerateCNWorkloadHybridStage.py
+++ b/stream/classes/stages/GenerateCNWorkloadHybridStage.py
@@ -135,7 +135,7 @@ def run(self):

     @staticmethod
     def get_scheduling_order(workload: Workload):
-        return sorted((n.id for n in workload.nodes()), reverse=True)
+        return sorted(((n.id, n.sub_id) for n in workload.nodes()), reverse=True)

     @staticmethod
     def get_all_node_pairs(G: Workload) -> tuple[tuple[ComputationNode, ComputationNode, bool], ...]:
@@ -367,8 +367,12 @@ def get_finer_nodes(
                 produces_final_output=produces_final_output,
                 group_id=group_id,
             )
-            # Override property
+            # Override loop_ranges property
             finer_node.update_loop_ranges(dim_min_max)
+            # Re-calculate pr loop ranges based on new loop_ranges
+            finer_node.calculate_pr_loop_ranges()
+            # Re-set the operand tensors for the new loop_ranges
+            finer_node.set_operand_tensors()

             # Initialize the priorities (total inter-CN data reuse factor) for the constant operands of this finer_node
             for constant_operand in finer_node.constant_operands:
@@ -631,7 +635,7 @@ def get_inter_edges_numpy(
         all_inter_edges: list[tuple[ComputationNode, ComputationNode, dict[str, Any]]] = []
         for path_between in paths_between_generator:
             dependent_operand = Constants.OUTPUT_LAYER_OP
-            ## FIRST NODE
+            # FIRST NODE
             # First node in the path is a ComputationNode, of which we extract the output operand dependency tensor
             node = path_between[0]
             assert isinstance(node, ComputationNode), "First node in path should be ComputationNode"
@@ -642,12 +646,12 @@ def get_inter_edges_numpy(
             tensor_cns = self.get_tensor_cns(node, finer_nodes)
             numpy_tensors[node] = tensor_cns
             tensor = tensor_cns[Constants.OUTPUT_LAYER_OP]
-            ## INTERMEDIATE NON-COMPUTATION NODES
+            # INTERMEDIATE NON-COMPUTATION NODES
             for _, node in enumerate(path_between[1:-1], start=1):
                 if isinstance(node, ComputationNode):
                     raise ValueError("Intermediate nodes should not be of type ComputationNode.")
                 tensor = self.propagate_cn_production_for_non_cn(node, tensor)
-            ## LAST NODE IN PATH
+            # LAST NODE IN PATH
             last_node: Node = path_between[-1]

             # Find the operand for which this last node connects to its predecessor
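The three calls now made in get_finer_nodes form an ordered re-derivation chain: overriding a finer node's loop_ranges invalidates both the pr (partially relevant) loop ranges derived from them and the operand Tensors that captured loop_ranges at construction time. A hedged sketch of that dependency, assuming only the methods visible in this patch:

    # Sketch of the re-derivation order; finer_node is a ComputationNode and
    # dim_min_max the per-dimension ranges, both as used in the hunk above.
    def refresh_after_range_override(finer_node, dim_min_max):
        finer_node.update_loop_ranges(dim_min_max)  # 1. overwrite loop_ranges
        finer_node.calculate_pr_loop_ranges()       # 2. pr ranges derive from loop_ranges
        finer_node.set_operand_tensors()            # 3. Tensors snapshot loop_ranges when built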
diff --git a/stream/classes/stages/IntraCoreMappingStage.py b/stream/classes/stages/IntraCoreMappingStage.py
index 7759a0d0..0c7bdcd6 100644
--- a/stream/classes/stages/IntraCoreMappingStage.py
+++ b/stream/classes/stages/IntraCoreMappingStage.py
@@ -144,7 +144,7 @@ def run(self):
                 # Compute this (node, core) combination's optimal mapping
                 else:
                     # Set the node's core allocation to the core_id we want to extract hw performance for
-                    node.set_chosen_core_allocation(core_id)
+                    node.set_core_allocation(core_id)
                     # Set the node's spatial mapping to the possible spatial mappings of the current core
                     node.spatial_mapping = core.dataflows if core.dataflows is not None else SpatialMapping.empty()
                     # Initialize the flow that will be followed to extract the optimal HW performance of every unique node-core allocation
@@ -156,7 +156,7 @@ def run(self):
                     answers = main_stage.run()
                     assert len(answers) == 1, "IntraCoreMappingStage's subflow returned more than one CME"
                     cme = answers[0][0]
-                    node.chosen_core_allocation = None  # Reset the node's core allocation
+                    node.core_allocation = None  # Reset the node's core allocation
                     self.node_hw_performances[node][core] = cme
                     self.save_node_hw_performances()  # Save the hw performances dict after every node is finished
                     self.visualize_node_hw_performances()
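For context on the setter swap above: node.py (further down) adds set_core_allocation, which wraps the single core id in a list, whereas set_chosen_core_allocation stores a scalar. A small self-contained sketch of the two shapes, with a hypothetical stripped-down Node:

    # Hypothetical minimal Node; only the two setters from this patch are real.
    class Node:
        def __init__(self):
            self.core_allocation: list[int] = []            # candidate core ids
            self.chosen_core_allocation: int | None = None  # final pick

        def set_core_allocation(self, core_allocation: int):
            self.core_allocation = [core_allocation]

        def set_chosen_core_allocation(self, core_allocation: int):
            self.chosen_core_allocation = core_allocation

    node = Node()
    node.set_core_allocation(3)  # what IntraCoreMappingStage now calls
    assert node.core_allocation == [3]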
diff --git a/stream/classes/workload/computation_node.py b/stream/classes/workload/computation_node.py
index aac3a68a..dbc97889 100644
--- a/stream/classes/workload/computation_node.py
+++ b/stream/classes/workload/computation_node.py
@@ -82,6 +82,13 @@ def __init__(
         # Each ComputationNode will save a tensor for all its defined operands.
         # For example, a conv layer will have an I tensor, W tensor and O tensor.
         self.operand_tensors: dict[LayerOperand, Tensor] = {}
+        self.set_operand_tensors()
+
+        # Will be set by the InterCoreMappingStage or by the FitnessEvaluator
+        self.too_large_operands = None
+        self.nb_real_predecessors = None
+
+    def set_operand_tensors(self):
         for op in self.layer_operands:
             if op == Constants.OUTPUT_LAYER_OP:
                 precision = self.operand_precision.final_output_precision
@@ -99,10 +106,6 @@ def __init__(
                 loop_ranges=ranges,
             )

-        # Will be set by the InterCoreMappingStage or by the FitnessEvaluator
-        self.too_large_operands = None
-        self.nb_real_predecessors = None
-
     def get_operand_tensor_reshape_default(self) -> OperandTensorReshape | None:
         try:
             size_B = self.layer_dim_sizes[LayerDim("B")]
diff --git a/stream/classes/workload/node.py b/stream/classes/workload/node.py
index 2fa406ac..524448fb 100644
--- a/stream/classes/workload/node.py
+++ b/stream/classes/workload/node.py
@@ -123,6 +123,9 @@ def set_end(self, end: int):
         """
         self.end = end

+    def set_core_allocation(self, core_allocation: int):
+        self.core_allocation = [core_allocation]
+
     def set_chosen_core_allocation(self, core_allocation: int):
         self.chosen_core_allocation = core_allocation

diff --git a/stream/classes/workload/tensor.py b/stream/classes/workload/tensor.py
index a90f4820..17dd7e79 100644
--- a/stream/classes/workload/tensor.py
+++ b/stream/classes/workload/tensor.py
@@ -18,7 +18,7 @@ def __init__(
         origin: "ComputationNode",
         layer_operand: LayerOperand,
         loop_dimensions: list[LayerDim],
-        loop_ranges: tuple[int, int],
+        loop_ranges: tuple[tuple[int, int], ...],
     ):
         """Initialize the Tensor instance.

@@ -61,7 +61,7 @@ def __lt__(self, __o: object) -> bool:
         # self.loop_ranges == __o.loop_ranges

     def equality_hash(self):
-        return hash((self.origin.id, self.origin.sub_id, self.layer_operand, self.loop_ranges))
+        return hash((self.origin.id, self.layer_operand, self.loop_ranges))

     def set_base_priorities(self, base_priority):
         self.base_priority = base_priority
diff --git a/stream/utils.py b/stream/utils.py
index 7b594e62..f2e18789 100644
--- a/stream/utils.py
+++ b/stream/utils.py
@@ -22,7 +22,7 @@ def get_too_large_operands(cme: CostModelEvaluation, accelerator: Accelerator, c
     core = accelerator.get_core(core_id)
     core_nb_memory_levels = core.memory_hierarchy.nb_levels
     for layer_operand, l in cme.mapping.data_elem_per_level.items():
-        memory_operand = cme.layer.memory_operand_links[layer_operand]
+        memory_operand = cme.layer.memory_operand_links.layer_to_mem_op(layer_operand)
         if len(l) > core_nb_memory_levels[memory_operand] + 1:  # +1 because of spatial level
             too_large_operands.append(memory_operand)
     return too_large_operands
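The fitness_evaluator.py and utils.py hunks together suggest that memory_operand_links is no longer a plain dict but a small wrapper exposing the underlying mapping via .data and a forward lookup via layer_to_mem_op. A hedged reconstruction of such a wrapper; the real MemoryOperandLinks class may differ:

    # Hypothetical reconstruction implied by `.data.items()` and
    # `.layer_to_mem_op(...)`; strings stand in for LayerOperand and
    # MemoryOperand objects.
    class MemoryOperandLinks:
        def __init__(self, data: dict[str, str]):
            self.data = data  # layer operand -> memory operand

        def layer_to_mem_op(self, layer_operand: str) -> str:
            return self.data[layer_operand]

    links = MemoryOperandLinks({"I": "I1", "W": "I2", "O": "O"})
    assert links.layer_to_mem_op("W") == "I2"  # forward lookup, as in utils.py
    layer_op = next(k for (k, v) in links.data.items() if v == "I2")
    assert layer_op == "W"  # reverse lookup, as in fitness_evaluator.py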