From 8cb4aaa9ac53185e07bea2472cc85ca7b441ac76 Mon Sep 17 00:00:00 2001 From: RobinGeens Date: Tue, 15 Oct 2024 17:23:57 +0200 Subject: [PATCH] update group manager to deal with inter core tiles --- stream/cost_model/group_allocation.py | 55 +++++++++++++------ .../examples/mapping/tpu_like_quad_core.yaml | 4 +- .../constraint_optimization_allocation.py | 6 +- ...t_loops_partitioned_workload_generation.py | 4 +- 4 files changed, 44 insertions(+), 25 deletions(-) diff --git a/stream/cost_model/group_allocation.py b/stream/cost_model/group_allocation.py index df90d787..8f4fc2da 100644 --- a/stream/cost_model/group_allocation.py +++ b/stream/cost_model/group_allocation.py @@ -9,16 +9,34 @@ class GroupIdManager: - def __init__(self): + def __init__(self, node: ComputationNode): self.__id_count = 0 self.groups: GroupAllocation = {} + self.node = node + self.inter_core_tiled_dims = [layer_dim for layer_dim, _ in node.inter_core_tiling] + # self.seen_inter_core_tiles_and_ids: dict[frozenset[tuple[LayerDim, int]], int] = {} def __get_and_raise_id(self): curr_id = self.__id_count self.__id_count += 1 return curr_id - def get_group_id(self, node: ComputationNode, loop_ranges: LoopRanges) -> int: + def __extract_inter_core_ranges(self, tile_loop_ranges: LoopRanges): + """Given the loop ranges of a tile, return a hashable identifier that can be used to determine whether this + tile belongs on the same core as other tiles.
Tiles with different ranges for the inter core tile dimensions + belong on different cores""" + return tuple([tile_loop_ranges[dim] for dim in self.inter_core_tiled_dims]) + # inter_core_tile_loop_ranges = [ + # (dim, loop_range) for dim, loop_range in tile_loop_ranges.items() if dim in self.inter_core_tiled_dims + # ] + # return frozenset(inter_core_tile_loop_ranges) + + def __extract_relevant_ranges(self, tile_loop_ranges: LoopRanges): + constant_operand = self.node.constant_operands[-1] + relevant_dims = self.node.loop_relevancy_info.get_r_layer_dims(constant_operand) + return tuple([tile_loop_ranges[dim] for dim in relevant_dims]) + + def get_group_id(self, tile_loop_ranges: LoopRanges) -> int: """Return the group id for the given loop ranges. The group id is determined based on the relevant constant operand dimension loop ranges. If there is no constant operand, we return 0. @@ -33,24 +51,25 @@ def get_group_id(self, node: ComputationNode, loop_ranges: LoopRanges) -> int: Returns: int: The group id for the given loop ranges """ - # No constant operand - if not node.constant_operands: + + if not self.node.constant_operands and len(self.node.core_allocation) == 1: # If the node can only be assigned to a single core, we give all nodes the same group id # This is to prevent the CostModelEvaluationLUT from identifying each node as unique # This is the case for e.g. 
'Add' nodes if there is only a single 'Add' core - if len(node.core_allocation) == 1: - return 0 - else: - return self.__get_and_raise_id() - - # Constant operand and known ranges - constant_operand = node.constant_operands[-1] - relevant_dims = node.loop_relevancy_info.get_r_layer_dims(constant_operand) - relevant_ranges = tuple([loop_ranges[dim] for dim in relevant_dims]) - if relevant_ranges in self.groups: - return self.groups[relevant_ranges] - - # Constant operand and new ranges + return 0 + + if not self.node.constant_operands: + # No constant operands -> differentiate based on node's inter core tiling + range_identifier = self.__extract_inter_core_ranges(tile_loop_ranges) + else: + # Constant operands -> differentiate based on relevant layer dims + range_identifier = self.__extract_relevant_ranges(tile_loop_ranges) + + # This tile belongs together with previously seen tiles + if range_identifier in self.groups: + return self.groups[range_identifier] + + # New group new_group_id = self.__get_and_raise_id() - self.groups[relevant_ranges] = new_group_id + self.groups[range_identifier] = new_group_id return new_group_id diff --git a/stream/inputs/examples/mapping/tpu_like_quad_core.yaml b/stream/inputs/examples/mapping/tpu_like_quad_core.yaml index 47bebec2..c959f828 100644 --- a/stream/inputs/examples/mapping/tpu_like_quad_core.yaml +++ b/stream/inputs/examples/mapping/tpu_like_quad_core.yaml @@ -1,14 +1,14 @@ - name: default core_allocation: [0, 1, 2, 3] intra_core_tiling: - - K, 4 + - D, 7 inter_core_tiling: - K, * - name: Conv core_allocation: [0, 1, 2, 3] intra_core_tiling: - - OY, 4 + - OY, 5 inter_core_tiling: - K, * diff --git a/stream/stages/allocation/constraint_optimization_allocation.py b/stream/stages/allocation/constraint_optimization_allocation.py index e8665c85..5cd9ca4f 100644 --- a/stream/stages/allocation/constraint_optimization_allocation.py +++ b/stream/stages/allocation/constraint_optimization_allocation.py @@ -87,7 +87,7 @@ def __init__( 
self.ss_to_computes: dict[STACK_T, set[ComputationNode]] = {} self.hashes_per_sink_node: dict[STACK_T, dict[ComputationNode, int]] = {} self.steady_state_hashes: dict[STACK_T, int] = {} - self.compute_per_sink_node: dict[STACK_T, set[ComputationNode]] = {} + self.compute_per_sink_node: dict[STACK_T, dict[ComputationNode, set[ComputationNode]]] = {} self.ss_iterations_per_stack: dict[STACK_T, int] = {} self.optimal_allocation_per_stack: dict[STACK_T, ALLOCATION_T] = {} self.nb_macs_per_stack: dict[STACK_T, int] = {} @@ -325,7 +325,7 @@ def get_cn_order( """ order: SCHEDULE_ORDER_T = [] allocation = sorted(allocation, key=lambda x: (x[0], x[2], x[1])) - allocation_adjusted: ALLOCATION_T = [] # will hold allocation with removed k splits + allocation_adjusted: ALLOCATION_T = [] # allocation with removed inter core splits (which have same sub id) seen_ids: set[tuple[int, int]] = set() for t, c, id in allocation: if id not in seen_ids: @@ -446,7 +446,7 @@ def remove_invalid_entries_from_inter_core_tiling(self, node: ComputationNode): ) factor = layer_size - valid_tiling.append((layer_dim, layer_size)) + valid_tiling.append((layer_dim, factor)) node.inter_core_tiling = valid_tiling def replace_wildcard_in_tiling(self, tiling: TILING_T, nb_cores_split: int): diff --git a/stream/stages/generation/hint_loops_partitioned_workload_generation.py b/stream/stages/generation/hint_loops_partitioned_workload_generation.py index bfbd145b..45847c28 100644 --- a/stream/stages/generation/hint_loops_partitioned_workload_generation.py +++ b/stream/stages/generation/hint_loops_partitioned_workload_generation.py @@ -234,7 +234,7 @@ def get_finer_nodes( finer_nodes: list[ComputationNode] = [] tensors: list[Tensor] = [] - group_id_manager = GroupIdManager() + group_id_manager = GroupIdManager(original_node) for n in range(nb_cns): outer_loop_values: list[int] = [] for i, outer_loop in enumerate(outer_temporal_loops): @@ -261,7 +261,7 @@ def get_finer_nodes( dim_min_max[loop_dim] = (dim_min, 
dim_max) # finer_node_mapping_copy = deepcopy(original_node.extract_mapping_attr()) - group_id = group_id_manager.get_group_id(original_node, dim_min_max) + group_id = group_id_manager.get_group_id(dim_min_max) # Create the computation node object with the computed ranges of the loop dimensions node_name = original_node.name