update group manager to deal with inter core tiles

KULeuven-MICAS · Oct 15, 2024 · 8cb4aaa · 8cb4aaa
1 parent ad7a568
commit 8cb4aaa
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 25 deletions.
diff --git a/stream/cost_model/group_allocation.py b/stream/cost_model/group_allocation.py
@@ -9,16 +9,34 @@
 
 
 class GroupIdManager:
-    def __init__(self):
+    def __init__(self, node: ComputationNode):
         self.__id_count = 0
         self.groups: GroupAllocation = {}
+        self.node = node
+        self.inter_core_tiled_dims = [layer_dim for layer_dim, _ in node.inter_core_tiling]
+        # self.seen_inter_core_tiles_and_ids: dict[frozenset[tuple[LayerDim, int]], int] = {}
 
     def __get_and_raise_id(self):
         curr_id = self.__id_count
         self.__id_count += 1
         return curr_id
 
-    def get_group_id(self, node: ComputationNode, loop_ranges: LoopRanges) -> int:
+    def __extract_inter_core_ranges(self, tile_loop_ranges: LoopRanges):
+        """Given the loop ranges of a tile, return a hashable identifier that can be used to determine whether this
+        tile belongs on the same core as other tiles. Tiles with different ranges for the inter core tile dimensions
+        belong on different cores"""
+        return tuple([tile_loop_ranges[dim] for dim in self.inter_core_tiled_dims])
+        # inter_core_tile_loop_ranges = [
+        #     (dim, loop_range) for dim, loop_range in tile_loop_ranges.items() if dim in self.inter_core_tiled_dims
+        # ]
+        # return frozenset(inter_core_tile_loop_ranges)
+
+    def __extract_relevant_ranges(self, tile_loop_ranges: LoopRanges):
+        constant_operand = self.node.constant_operands[-1]
+        relevant_dims = self.node.loop_relevancy_info.get_r_layer_dims(constant_operand)
+        return tuple([tile_loop_ranges[dim] for dim in relevant_dims])
+
+    def get_group_id(self, tile_loop_ranges: LoopRanges) -> int:
         """Return the group id for the given loop ranges.
         The group id is determined based on the relevant constant operand dimension loop ranges.
         If there is no constant operand, we return 0.
@@ -33,24 +51,25 @@ def get_group_id(self, node: ComputationNode, loop_ranges: LoopRanges) -> int:
         Returns:
             int: The group id for the given loop ranges
         """
-        # No constant operand
-        if not node.constant_operands:
+
+        if not self.node.constant_operands and len(self.node.core_allocation) == 1:
             # If the node can only be assigned to a single core, we give all nodes the same group id
             # This is to prevent the CostModelEvaluationLUT from identifying each node as unique
             # This is the case for e.g. 'Add' nodes if there is only a single 'Add' core
-            if len(node.core_allocation) == 1:
-                return 0
-            else:
-                return self.__get_and_raise_id()
-
-        # Constant operand and known ranges
-        constant_operand = node.constant_operands[-1]
-        relevant_dims = node.loop_relevancy_info.get_r_layer_dims(constant_operand)
-        relevant_ranges = tuple([loop_ranges[dim] for dim in relevant_dims])
-        if relevant_ranges in self.groups:
-            return self.groups[relevant_ranges]
-
-        # Constant operand and new ranges
+            return 0
+
+        if not self.node.constant_operands:
+            # No constant operands -> differentiate based on node's inter core tiling
+            range_identifier = self.__extract_inter_core_ranges(tile_loop_ranges)
+        else:
+            # Constant operands -> differentiate based on relevant layer dims
+            range_identifier = self.__extract_relevant_ranges(tile_loop_ranges)
+
+        # This tile belongs together with previously seen tiles
+        if range_identifier in self.groups:
+            return self.groups[range_identifier]
+
+        # New group
         new_group_id = self.__get_and_raise_id()
-        self.groups[relevant_ranges] = new_group_id
+        self.groups[range_identifier] = new_group_id
         return new_group_id
diff --git a/stream/inputs/examples/mapping/tpu_like_quad_core.yaml b/stream/inputs/examples/mapping/tpu_like_quad_core.yaml
@@ -1,14 +1,14 @@
 - name: default
   core_allocation: [0, 1, 2, 3]
   intra_core_tiling:
-    - K, 4
+    - D, 7
   inter_core_tiling:
     - K, *
 
 - name: Conv
   core_allocation: [0, 1, 2, 3]
   intra_core_tiling:
-    - OY, 4
+    - OY, 5
   inter_core_tiling:
     - K, *
 

diff --git a/stream/stages/allocation/constraint_optimization_allocation.py b/stream/stages/allocation/constraint_optimization_allocation.py
@@ -87,7 +87,7 @@ def __init__(
         self.ss_to_computes: dict[STACK_T, set[ComputationNode]] = {}
         self.hashes_per_sink_node: dict[STACK_T, dict[ComputationNode, int]] = {}
         self.steady_state_hashes: dict[STACK_T, int] = {}
-        self.compute_per_sink_node: dict[STACK_T, set[ComputationNode]] = {}
+        self.compute_per_sink_node: dict[STACK_T, dict[ComputationNode, set[ComputationNode]]] = {}
         self.ss_iterations_per_stack: dict[STACK_T, int] = {}
         self.optimal_allocation_per_stack: dict[STACK_T, ALLOCATION_T] = {}
         self.nb_macs_per_stack: dict[STACK_T, int] = {}
@@ -325,7 +325,7 @@ def get_cn_order(
         """
         order: SCHEDULE_ORDER_T = []
         allocation = sorted(allocation, key=lambda x: (x[0], x[2], x[1]))
-        allocation_adjusted: ALLOCATION_T = []  # will hold allocation with removed k splits
+        allocation_adjusted: ALLOCATION_T = []  # allocation with removed inter core splits (which have same sub id)
         seen_ids: set[tuple[int, int]] = set()
         for t, c, id in allocation:
             if id not in seen_ids:
@@ -446,7 +446,7 @@ def remove_invalid_entries_from_inter_core_tiling(self, node: ComputationNode):
                 )
                 factor = layer_size
 
-            valid_tiling.append((layer_dim, layer_size))
+            valid_tiling.append((layer_dim, factor))
         node.inter_core_tiling = valid_tiling
 
     def replace_wildcard_in_tiling(self, tiling: TILING_T, nb_cores_split: int):

diff --git a/stream/stages/generation/hint_loops_partitioned_workload_generation.py b/stream/stages/generation/hint_loops_partitioned_workload_generation.py
@@ -234,7 +234,7 @@ def get_finer_nodes(
 
         finer_nodes: list[ComputationNode] = []
         tensors: list[Tensor] = []
-        group_id_manager = GroupIdManager()
+        group_id_manager = GroupIdManager(original_node)
         for n in range(nb_cns):
             outer_loop_values: list[int] = []
             for i, outer_loop in enumerate(outer_temporal_loops):
@@ -261,7 +261,7 @@ def get_finer_nodes(
                 dim_min_max[loop_dim] = (dim_min, dim_max)
 
             # finer_node_mapping_copy = deepcopy(original_node.extract_mapping_attr())
-            group_id = group_id_manager.get_group_id(original_node, dim_min_max)
+            group_id = group_id_manager.get_group_id(dim_min_max)
 
             # Create the computation node object with the computed ranges of the loop dimensions
             node_name = original_node.name