fix pr loop_ranges and operand_tensors recalculation for finer nodes
asyms committed May 21, 2024
1 parent b385915 commit 66c3b26
Showing 8 changed files with 38 additions and 27 deletions.
24 changes: 12 additions & 12 deletions stream/classes/cost_model/scheduler.py
@@ -81,7 +81,7 @@ def get_best_candidate(candidates: list[ComputationNode], scheduling_order: list
if not candidates:
raise ValueError("There are no candidates to schedule.")
preds_ends, cn_candidates = zip(*candidates)
- idxs = [scheduling_order.index(n.id) for n in cn_candidates]
+ idxs = [scheduling_order.index((n.id, n.sub_id)) for n in cn_candidates]
best_candidate_idx = idxs.index(min(idxs))
best_candidate = cn_candidates[best_candidate_idx]
preds_end = preds_ends[best_candidate_idx]
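The lookup now keys the scheduling order on (node id, sub-node id) instead of the id alone, so every finer node of a layer gets its own slot in the order. A minimal sketch of the behaviour with stand-in tuples in place of Stream's ComputationNode objects (everything below is hypothetical except the indexing scheme):

```python
# Stand-ins: candidates are (predecessors_end, (id, sub_id)) pairs and
# scheduling_order is the reverse-sorted list of (id, sub_id) keys.
scheduling_order = [(1, 1), (1, 0), (0, 1), (0, 0)]
candidates = [(5, (0, 0)), (3, (1, 1))]

def get_best_candidate(candidates, scheduling_order):
    """Pick the candidate that appears earliest in the scheduling order."""
    if not candidates:
        raise ValueError("There are no candidates to schedule.")
    preds_ends, keys = zip(*candidates)
    idxs = [scheduling_order.index(key) for key in keys]
    best = idxs.index(min(idxs))
    return keys[best], preds_ends[best]

print(get_best_candidate(candidates, scheduling_order))  # ((1, 1), 3)
```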
@@ -264,7 +264,7 @@ def schedule_graph(
offchip_core_id = accelerator.offchip_core_id
offchip_core = accelerator.get_core(offchip_core_id)

- ## Schedule preparation:
+ # Schedule preparation:
# 1. Initialize the memory instance priorities for each tensor
initialize_priorities(G, accelerator)
# 2. Add the constant operand tensors of all nodes to the off-chip initially
@@ -292,9 +292,9 @@ def schedule_graph(
core = accelerator.get_core(core_id)
# Earliest start time is when core is available or predecessors finished
start = max(cores_idle_from[core_id], preds_end)
- ## Step 0
+ # Step 0
tensors_this_candidate_needs, tensors_operands = get_tensors_needed_for_node(best_candidate, G)
- ## Step 1
+ # Step 1
# There could be operands that are too large to store in the highest memory on the core
# The tensors stored in these memories should be evicted and potentially written back to off-chip
# Clear these memories (this might delay the potential start time if things have to be written to off-chip)
@@ -312,7 +312,7 @@
)
total_eviction_to_offchip_link_energy += clear_link_energy
total_eviction_to_offchip_memory_energy += clear_memory_energy
- ## Step 2
+ # Step 2
# The computation might need tensors that are currently not present in the core's memories
# We need to fetch these tensors from either off-chip or from the core where they are present
# Transfer these tensors from wherever they are currently residing to this core
@@ -343,7 +343,7 @@
total_eviction_to_offchip_link_energy += eviction_link_energy_cost
total_eviction_to_offchip_memory_energy += eviction_memory_energy_cost

- ## Step 3
+ # Step 3
# Check if we had any operands that were too large to store in the core's memory, block the relevant off-chip link for the duration
# This might again delay the execution if the offchip link was already blocked by another core
timestep = accelerator.block_offchip_links(
@@ -354,7 +354,7 @@
best_candidate,
)

- ## Step 4
+ # Step 4
# Make space for the output tensor of this computation node and spawn it when evictions are complete
# If the output operand is in the too large operands, add it to off-chip, otherwise add it to this core's output memory
output_layer_operand = best_candidate.output_operand
@@ -387,7 +387,7 @@
available_timestep=end,
)

- ## Step 5
+ # Step 5
# Update the start and end time of the node
best_candidate.set_start(start)
best_candidate.set_end(end)
@@ -400,7 +400,7 @@
# Add this node to the scheduled nodes
scheduled_nodes.add(best_candidate)

- ## Step 6
+ # Step 6
# Memory usage: When the node ends:
# Decrease the priority of all the tensors this node used
decrease_priority(tensors_this_candidate_needs, tensors_operands, accelerator, best_candidate)
@@ -413,7 +413,7 @@
end,
)

- ## Step 7
+ # Step 7
# Memory usage: When the node ends:
# If this node is a sink node (node that has no successors and that produces a final output), transfer final outputs to offchip
if best_candidate in sink_layer_nodes:
@@ -433,7 +433,7 @@
total_sink_layer_output_offchip_link_energy += link_energy_cost
total_sink_layer_output_offchip_memory_energy += memory_energy_cost

- ## Step 8
+ # Step 8
# For each successor of this node, check if all of its predecessors have been scheduled
for successor in sorted(G.successors(best_candidate)):
if all((pred in scheduled_nodes for pred in G.predecessors(successor))):
@@ -448,7 +448,7 @@
nb_scheduled_nodes += 1
done = nb_scheduled_nodes == nb_graph_nodes

- ## Step 9
+ # Step 9
# The total schedule latency is the max of all CN end times and the link end times
cns_end_time = max((n.end for n in G.nodes()))
links_end_time = max([event.end for event in accelerator.communication_manager.events], default=0)
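Step 9 takes the schedule latency as the later of the last computation-node end time and the last communication-link event end time. A tiny worked example with hypothetical end times:

```python
# Hypothetical end times to illustrate Step 9.
cn_end_times = [120, 340, 275]        # stand-ins for (n.end for n in G.nodes())
link_event_end_times = [360, 180]     # stand-ins for communication_manager.events

cns_end_time = max(cn_end_times)
links_end_time = max(link_event_end_times, default=0)  # default=0 when no link events exist
latency = max(cns_end_time, links_end_time)
print(latency)  # 360: here a link transfer outlives the last compute node
```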
@@ -102,7 +102,7 @@ def set_node_core_allocations(self, core_allocations: list[int]):
offchip_energy = 0
for too_large_operand in too_large_operands:
layer_operand = next(
- (k for (k, v) in cme.layer.memory_operand_links.items() if v == too_large_operand)
+ (k for (k, v) in cme.layer.memory_operand_links.data.items() if v == too_large_operand)
)
layer_operand_offchip_energy = cme.mem_energy_breakdown[layer_operand][-1]
offchip_energy += layer_operand_offchip_energy
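The hunk above (the changed file's name is not shown in this view) now iterates memory_operand_links.data.items() instead of treating memory_operand_links as a plain dict, which suggests the links object wraps its mapping in a .data attribute. A hedged sketch of the reverse lookup under that assumption, with string stand-ins for the real operand types:

```python
# Reverse lookup layer-operand <- memory-operand; only the `.data` attribute
# name is taken from the diff, everything else here is a stand-in.
class MemoryOperandLinks:
    def __init__(self, data: dict[str, str]):
        self.data = data  # layer operand -> memory operand

links = MemoryOperandLinks({"I": "I1", "W": "I2", "O": "O"})
too_large_operand = "I2"

layer_operand = next(k for k, v in links.data.items() if v == too_large_operand)
print(layer_operand)  # "W"
```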
15 changes: 10 additions & 5 deletions stream/classes/stages/GenerateCNWorkloadHybridStage.py
@@ -135,7 +135,7 @@ def run(self):

@staticmethod
def get_scheduling_order(workload: Workload):
- return sorted((n.id for n in workload.nodes()), reverse=True)
+ return sorted(((n.id, n.sub_id) for n in workload.nodes()), reverse=True)

@staticmethod
def get_all_node_pairs(G: Workload) -> tuple[tuple[ComputationNode, ComputationNode, bool], ...]:
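The scheduling order is now built from the same (id, sub_id) keys that get_best_candidate looks up. Since Python sorts tuples lexicographically, the reverse sort keeps all finer nodes of a layer grouped and ordered by sub_id within it; a small illustration with hypothetical ids:

```python
# Hypothetical (id, sub_id) pairs for two finer nodes of layers 0 and 1 plus layer 2.
node_ids = [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0)]
print(sorted(node_ids, reverse=True))
# [(2, 0), (1, 1), (1, 0), (0, 1), (0, 0)]
```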
@@ -367,8 +367,12 @@ def get_finer_nodes(
produces_final_output=produces_final_output,
group_id=group_id,
)
- # Override property
+ # Override loop_ranges property
finer_node.update_loop_ranges(dim_min_max)
+ # Re-calculate pr loop ranges based on new loop_ranges
+ finer_node.calculate_pr_loop_ranges()
+ # Re-set the operand tensors for the new loop_ranges
+ finer_node.set_operand_tensors()

# Initialize the priorities (total inter-CN data reuse factor) for the constant operands of this finer_node
for constant_operand in finer_node.constant_operands:
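This is the fix named in the commit message: overriding a finer node's loop ranges is not enough on its own, because the partially-relevant (pr) loop ranges and the operand tensors were derived from the original layer's ranges. The added calls rebuild both, in this order (method names come from the diff; the wrapper function name is a hypothetical stand-in):

```python
def refresh_finer_node(finer_node, dim_min_max):
    """Re-derive everything that depends on this finer node's loop ranges."""
    finer_node.update_loop_ranges(dim_min_max)   # 1. override loop_ranges with this slice
    finer_node.calculate_pr_loop_ranges()        # 2. re-derive the pr (partially-relevant) ranges
    finer_node.set_operand_tensors()             # 3. rebuild the operand tensors for the new ranges
```

set_operand_tensors is the helper that this commit factors out of ComputationNode.__init__ further down.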
@@ -436,6 +440,7 @@ def get_bounding_box_dimensions(
# where the onnx tensors are always flattened back to 4D (merging the G+C or G+K into one channel dimension)
dimensions, loop_ranges = self.flatten_grouped_convolution_ranges(producer, consumer, dimensions, loop_ranges)
bounding_box = [loop_ranges[dim] for dim in dimensions]
+ print(bounding_box)

if not interleaved:
bounding_box_flat = tuple([item for sublist in bounding_box for item in sublist])
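For context on the flattening branch just below the added print: the bounding box is a list of (start, stop) ranges, one per dimension, and the non-interleaved case flattens those pairs into a single tuple. A tiny example with hypothetical ranges:

```python
# Hypothetical bounding box of two dimensions, e.g. the OY and OX ranges of one tile.
bounding_box = [(0, 4), (8, 16)]

# Non-interleaved: flatten the (start, stop) pairs into one tuple, as in the diff line above.
bounding_box_flat = tuple(item for sublist in bounding_box for item in sublist)
print(bounding_box_flat)  # (0, 4, 8, 16)
```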
@@ -631,7 +636,7 @@ def get_inter_edges_numpy(
all_inter_edges: list[tuple[ComputationNode, ComputationNode, dict[str, Any]]] = []
for path_between in paths_between_generator:
dependent_operand = Constants.OUTPUT_LAYER_OP
- ## FIRST NODE
+ # FIRST NODE
# First node in the path is a ComputationNode, of which we extract the output operand dependency tensor
node = path_between[0]
assert isinstance(node, ComputationNode), "First node in path should be ComputationNode"
@@ -642,12 +647,12 @@ def get_inter_edges_numpy(
tensor_cns = self.get_tensor_cns(node, finer_nodes)
numpy_tensors[node] = tensor_cns
tensor = tensor_cns[Constants.OUTPUT_LAYER_OP]
- ## INTERMEDIATE NON-COMPUTATION NODES
+ # INTERMEDIATE NON-COMPUTATION NODES
for _, node in enumerate(path_between[1:-1], start=1):
if isinstance(node, ComputationNode):
raise ValueError("Intermediate nodes should not be of type ComputationNode.")
tensor = self.propagate_cn_production_for_non_cn(node, tensor)
- ## LAST NODE IN PATH
+ # LAST NODE IN PATH
last_node: Node = path_between[-1]
# Find the operand for which this last node connects to its predecessor

4 changes: 2 additions & 2 deletions stream/classes/stages/IntraCoreMappingStage.py
@@ -144,7 +144,7 @@ def run(self):
# Compute this (node, core) combination's optimal mapping
else:
# Set the node's core allocation to the core_id we want to extract hw performance for
- node.set_chosen_core_allocation(core_id)
+ node.set_core_allocation(core_id)
# Set the node's spatial mapping to the possible spatial mappings of the current core
node.spatial_mapping = core.dataflows if core.dataflows is not None else SpatialMapping.empty()
# Initialize the flow that will be followed to extract the optimal HW performance of every unique node-core allocation
@@ -156,7 +156,7 @@
answers = main_stage.run()
assert len(answers) == 1, "IntraCoreMappingStage's subflow returned more than one CME"
cme = answers[0][0]
- node.chosen_core_allocation = None  # Reset the node's core allocation
+ node.core_allocation = None  # Reset the node's core allocation
self.node_hw_performances[node][core] = cme
self.save_node_hw_performances() # Save the hw performances dict after every node is finished
self.visualize_node_hw_performances()
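Both hunks touch the same per-(node, core) evaluation loop: the node is temporarily pinned to the core with the new set_core_allocation, the intra-core mapping sub-flow produces a single CME, and the trial allocation is reset afterwards. A condensed sketch of that flow with stand-in collaborators (the function name, loop iterable, and main-stage builder are assumptions, not the stage's real API):

```python
def evaluate_node_core_pairs(node_core_pairs, build_main_stage, node_hw_performances):
    """One CME per unique (node, core) pair; setter and reset names follow the diff."""
    for node, core_id in node_core_pairs:
        node.set_core_allocation(core_id)                 # trial allocation: a one-element list
        answers = build_main_stage(node, core_id).run()   # stand-in for the intra-core mapping flow
        assert len(answers) == 1, "subflow returned more than one CME"
        cme = answers[0][0]
        node.core_allocation = None                       # reset so the trial does not persist
        node_hw_performances[node][core_id] = cme
```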
11 changes: 7 additions & 4 deletions stream/classes/workload/computation_node.py
@@ -82,6 +82,13 @@ def __init__(
# Each ComputationNode will save a tensor for all its defined operands.
# For example, a conv layer will have an I tensor, W tensor and O tensor.
self.operand_tensors: dict[LayerOperand, Tensor] = {}
+ self.set_operand_tensors()
+
+ # Will be set by the InterCoreMappingStage or by the FitnessEvaluator
+ self.too_large_operands = None
+ self.nb_real_predecessors = None
+
+ def set_operand_tensors(self):
for op in self.layer_operands:
if op == Constants.OUTPUT_LAYER_OP:
precision = self.operand_precision.final_output_precision
@@ -99,10 +106,6 @@
loop_ranges=ranges,
)
-
- # Will be set by the InterCoreMappingStage or by the FitnessEvaluator
- self.too_large_operands = None
- self.nb_real_predecessors = None

def get_operand_tensor_reshape_default(self) -> OperandTensorReshape | None:
try:
size_B = self.layer_dim_sizes[LayerDim("B")]
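Pulling the tensor construction out of __init__ into set_operand_tensors is what makes the GenerateCNWorkloadHybridStage fix above possible: once a finer node's loop ranges are overridden, calling the method again rebuilds the per-operand tensors for the new ranges. A simplified, self-contained stand-in (not the real ComputationNode/Tensor classes) showing the effect:

```python
from dataclasses import dataclass, field

@dataclass
class MiniTensor:
    layer_operand: str
    loop_ranges: tuple[tuple[int, int], ...]

@dataclass
class MiniNode:
    layer_operands: list[str]
    loop_ranges: dict[str, tuple[int, int]]
    operand_tensors: dict[str, MiniTensor] = field(default_factory=dict)

    def set_operand_tensors(self) -> None:
        # One tensor per operand, built from the *current* loop ranges.
        for op in self.layer_operands:
            ranges = tuple(self.loop_ranges.values())
            self.operand_tensors[op] = MiniTensor(op, ranges)

node = MiniNode(["I", "O"], {"OY": (0, 56), "OX": (0, 56)})
node.set_operand_tensors()
node.loop_ranges = {"OY": (0, 8), "OX": (0, 56)}   # finer slice of the layer
node.set_operand_tensors()                         # rebuild -> tensors now match the slice
print(node.operand_tensors["O"].loop_ranges)       # ((0, 8), (0, 56))
```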
3 changes: 3 additions & 0 deletions stream/classes/workload/node.py
@@ -123,6 +123,9 @@ def set_end(self, end: int):
"""
self.end = end

+ def set_core_allocation(self, core_allocation: int):
+ self.core_allocation = [core_allocation]
+
def set_chosen_core_allocation(self, core_allocation: int):
self.chosen_core_allocation = core_allocation

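For reference, the new setter wraps the core id in a one-element list stored in core_allocation (the candidate core set), whereas the existing set_chosen_core_allocation records the single core that was finally picked. A hypothetical usage sketch with a stripped-down Node (the attribute defaults are assumptions):

```python
class Node:
    def __init__(self):
        self.core_allocation = None          # candidate core(s) for this node (assumed default)
        self.chosen_core_allocation = None   # the single core finally picked (assumed default)

    def set_core_allocation(self, core_allocation: int):
        self.core_allocation = [core_allocation]

    def set_chosen_core_allocation(self, core_allocation: int):
        self.chosen_core_allocation = core_allocation

n = Node()
n.set_core_allocation(2)          # trial allocation, as IntraCoreMappingStage now does
print(n.core_allocation)          # [2]
n.set_chosen_core_allocation(2)   # final decision, e.g. after inter-core mapping
print(n.chosen_core_allocation)   # 2
```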
4 changes: 2 additions & 2 deletions stream/classes/workload/tensor.py
@@ -18,7 +18,7 @@ def __init__(
origin: "ComputationNode",
layer_operand: LayerOperand,
loop_dimensions: list[LayerDim],
- loop_ranges: tuple[int, int],
+ loop_ranges: tuple[tuple[int, int], ...],
):
"""Initialize the Tensor instance.
@@ -61,7 +61,7 @@ def __lt__(self, __o: object) -> bool:
# self.loop_ranges == __o.loop_ranges

def equality_hash(self):
- return hash((self.origin.id, self.origin.sub_id, self.layer_operand, self.loop_ranges))
+ return hash((self.origin.id, self.layer_operand, self.loop_ranges))

def set_base_priorities(self, base_priority):
self.base_priority = base_priority
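Two related changes here: the loop_ranges annotation is corrected to a tuple of (min, max) pairs, and equality_hash no longer includes the origin's sub_id. The latter means tensors with the same layer id, operand, and loop ranges hash as the same data even when they belong to different finer nodes, presumably so identical slices (e.g. shared constant operands) are recognized as already present. Illustration with hypothetical stand-in values:

```python
# sub_id no longer distinguishes tensors: identical slices produced for different
# finer nodes of the same layer are treated as the same data.
def equality_hash(origin_id: int, layer_operand: str, loop_ranges: tuple) -> int:
    return hash((origin_id, layer_operand, loop_ranges))

ranges = ((0, 8), (0, 56))
h1 = equality_hash(3, "W", ranges)   # weight slice needed by finer node (3, 0)
h2 = equality_hash(3, "W", ranges)   # same slice needed by finer node (3, 1)
print(h1 == h2)                      # True: one tensor can serve both finer nodes
```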
2 changes: 1 addition & 1 deletion stream/utils.py
@@ -22,7 +22,7 @@ def get_too_large_operands(cme: CostModelEvaluation, accelerator: Accelerator, c
core = accelerator.get_core(core_id)
core_nb_memory_levels = core.memory_hierarchy.nb_levels
for layer_operand, l in cme.mapping.data_elem_per_level.items():
- memory_operand = cme.layer.memory_operand_links[layer_operand]
+ memory_operand = cme.layer.memory_operand_links.layer_to_mem_op(layer_operand)
if len(l) > core_nb_memory_levels[memory_operand] + 1: # +1 because of spatial level
too_large_operands.append(memory_operand)
return too_large_operands
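The utility now resolves the memory operand through layer_to_mem_op(...) rather than dict-style indexing on memory_operand_links, consistent with the .data change earlier in this commit. The check itself flags an operand as too large when the mapping occupies more memory levels than the core provides for that operand (plus one for the spatial level); a sketch with hypothetical numbers, where layer_to_mem_op is a stand-in lookup:

```python
core_nb_memory_levels = {"I1": 2, "I2": 2, "O": 2}           # per memory operand on this core
data_elem_per_level = {"I": [64, 4096, 1_000_000, 1],        # stand-in for cme.mapping data
                       "W": [64, 8192, 1]}

def layer_to_mem_op(layer_operand: str) -> str:              # assumption: simple lookup table
    return {"I": "I1", "W": "I2", "O": "O"}[layer_operand]

too_large = []
for layer_operand, levels in data_elem_per_level.items():
    mem_op = layer_to_mem_op(layer_operand)
    if len(levels) > core_nb_memory_levels[mem_op] + 1:       # +1 because of the spatial level
        too_large.append(mem_op)
print(too_large)  # ['I1']: the I operand needs more levels than the core offers
```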
