#15171: Better parallelization strategy
Pavle Josipovic committed Nov 27, 2024
1 parent effbfe0 commit 8ccf99e
Showing 27 changed files with 517 additions and 359 deletions.
@@ -4,10 +4,12 @@

import ttnn
import torch
from models.demos.ttnn_resnet.tt.ttnn_functional_resnet50_model_utils import get_conv_input_memory_config
from models.utility_functions import (
is_grayskull,
is_wormhole_b0,
pad_and_fold_conv_activation_for_unity_stride,
nearest_y,
)
from typing import List

@@ -391,8 +393,25 @@ def __init__(
self.conv1_bias_tensor = parameters.conv1.bias
self.conv1_input_channels = self.conv1_weight_tensor.shape[1]
self.conv1_output_channels = self.conv1_weight_tensor.shape[0]
self.conv1_input_height = 259
self.conv1_input_width = 259
self.conv1_output_height = ttnn.get_conv_output_dim(self.conv1_input_height, 4, 1, 0)
self.conv1_output_width = ttnn.get_conv_output_dim(self.conv1_input_width, 4, 1, 0)
assert self.conv1_weight_tensor.shape[2] == 4

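# Precompute the height-sharded input memory config used to place the folded conv1 activation
# on device for Grayskull (channels aligned to 16, override_num_cores=True so the core count
# evenly divides the NHW tile count).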
self.grayskull_conv1_input_memory_config = get_conv_input_memory_config(
self.batch_size,
self.conv1_input_channels,
self.conv1_input_height,
self.conv1_input_width,
self.conv1_output_channels,
self.conv1_output_height,
self.conv1_output_width,
device.compute_with_storage_grid_size(),
16,
True,
)

self.layer1 = self._make_layer(
parameters=parameters.layer1,
planes=64,
@@ -522,6 +541,11 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt
else:
act_block_h_override = 0

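# On Grayskull, move the host input onto the device directly in the precomputed sharded layout,
# bypassing the default conv sharding path that can pad the NHW tile count.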
if is_grayskull():
input_tensor = ttnn.to_device(
input_tensor, device=device, memory_config=self.grayskull_conv1_input_memory_config
)

x, x_height, x_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d(
input_tensor=input_tensor,
weight_tensor=self.conv1_weight_tensor,
@@ -533,8 +557,8 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt
stride=(1, 1),
padding=(0, 0),
batch_size=self.batch_size,
input_height=259,
input_width=259,
input_height=self.conv1_input_height,
input_width=self.conv1_input_width,
conv_config=ttnn.Conv2dConfig(
dtype=self.model_config["ACTIVATIONS_DTYPE"],
weights_dtype=self.model_config["WEIGHTS_DTYPE"],
@@ -828,6 +852,11 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c
else:
act_block_h_override = 0

if is_grayskull():
input_tensor = ttnn.to_device(
input_tensor, device=device, memory_config=self.grayskull_conv1_input_memory_config
)

x, x_height, x_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d(
input_tensor=input_tensor,
weight_tensor=self.conv1_weight_tensor,
@@ -839,8 +868,8 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c
stride=(1, 1),
padding=(0, 0),
batch_size=self.batch_size,
input_height=259,
input_width=259,
input_height=self.conv1_input_height,
input_width=self.conv1_input_width,
conv_config=ttnn.Conv2dConfig(
dtype=self.model_config["ACTIVATIONS_DTYPE"],
weights_dtype=self.model_config["WEIGHTS_DTYPE"],
@@ -0,0 +1,89 @@
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

import math
import ttnn
from models.utility_functions import nearest_y


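# Build a CoreRangeSet covering `num_cores` cores on a grid_rows x grid_cols grid:
# one range for the fully used rows, plus a partial row for any remainder.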
def get_core_grid_from_num_cores(num_cores: int, grid_rows: int, grid_cols: int):
columns = num_cores // grid_rows
assert columns <= grid_cols, "Not enough cores for specified core grid"
ranges = []
if columns != 0:
ranges.append(
ttnn.CoreRange(
ttnn.CoreCoord(0, 0),
ttnn.CoreCoord(grid_rows - 1, columns - 1),
)
)
remainder = num_cores % grid_rows
if remainder != 0:
assert columns + 1 <= grid_cols, "Not enough cores for specified core grid"
ranges.append(
ttnn.CoreRange(
ttnn.CoreCoord(0, columns),
ttnn.CoreCoord(remainder - 1, columns),
)
)
return ttnn.CoreRangeSet({*ranges})
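# Example (illustrative numbers, not from this commit): 10 cores on an 8x8 grid become one
# full row of 8 cores plus a partial second row of 2 cores:
#   get_core_grid_from_num_cores(10, 8, 8)
#   -> CoreRangeSet({CoreRange((0, 0), (7, 0)), CoreRange((0, 1), (1, 1))})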


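# Return the largest divisor of `num` that does not exceed start_divisor (walks downward from start_divisor).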
def find_closest_largest_divisor(num: int, start_divisor: int) -> int:
divisor = start_divisor
while num % divisor != 0:
divisor -= 1
return divisor
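# Example (illustrative numbers): find_closest_largest_divisor(98, 64) == 49, since 49 is the largest
# divisor of 98 not exceeding 64; using a true divisor keeps every core's shard the same size.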


# Determines the input memory config for a height-sharded conv operation.
# If override_num_cores is True, the number of cores is overridden to the closest largest divisor of the number of tiles.
# This avoids the default conv codepath, which can pad up the NHW tile count and produce padded output,
# which in turn can lead to issues with data-movement ops that do not handle padding correctly.
def get_conv_input_memory_config(
batch_size: int,
input_channels: int,
input_height: int,
input_width: int,
output_channels: int,
output_height: int,
output_width: int,
compute_grid: ttnn.CoreGrid,
input_channels_alignment: int,
override_num_cores: bool,
) -> ttnn.MemoryConfig:
parallel_config = ttnn._ttnn.operations.conv.determine_parallel_config(
shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
batch_size=batch_size,
input_channels=input_channels,
output_height=output_height,
output_width=output_width,
output_channels=output_channels,
compute_grid_size=compute_grid,
block_shard_orientation=ttnn.ShardOrientation.ROW_MAJOR,
enable_channels_padding=True,
is_out_tiled=True,
)

if override_num_cores:
nhw_ntiles = math.ceil(batch_size * output_height * output_width / 32)
num_cores_nwh = find_closest_largest_divisor(nhw_ntiles, compute_grid.x * compute_grid.y)
parallel_config.grid = get_core_grid_from_num_cores(num_cores_nwh, compute_grid.x, compute_grid.y)

memory_config = ttnn._ttnn.operations.conv.create_sharded_memory_config_from_parallel_config(
tensor_shape=ttnn.Shape(
[
1,
1,
input_width * input_height * batch_size,
nearest_y(
input_channels,
input_channels_alignment,
),
]
),
parallel_config=parallel_config,
tile_size=32,
)
return memory_config
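# Rough worked example of the override path (hypothetical shapes, not taken from this commit):
# batch_size=16 with a 256x256 conv output on a 12x9 compute grid (108 cores):
#   nhw_ntiles    = ceil(16 * 256 * 256 / 32) = 32768
#   num_cores_nwh = find_closest_largest_divisor(32768, 108) = 64   (32768 = 2^15, largest divisor <= 108)
# so the conv input is height-sharded across 64 cores with 512 tiles per core and no NHW padding.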
@@ -4,11 +4,11 @@

import ttnn
import torch
from models.demos.ttnn_resnet.tt.ttnn_functional_resnet50_model_utils import get_conv_input_memory_config
from models.utility_functions import (
is_grayskull,
is_wormhole_b0,
_nearest_y,
pad_and_fold_conv_activation_for_unity_stride,
)
from typing import List
from loguru import logger
@@ -632,15 +632,18 @@ def __init__(

conv_dummy_tensor = torch.rand((self.fold_output_shape), dtype=torch.bfloat16)
conv_dummy_tensor = ttnn.from_torch(conv_dummy_tensor, layout=ttnn.ROW_MAJOR_LAYOUT)
_, self.override_fold_mem_config, _, _ = ttnn.get_conv_padded_input_shape_and_mem_config(
device=device,
input_tensor=conv_dummy_tensor,
conv_config=self.conv1_config,
batch_size=self.batch_size,
height=self.conv1_output_height,
width=self.conv1_output_width,
in_channels=self.conv1_input_channels,
out_channels=self.conv1_output_channels,

self.override_fold_mem_config = get_conv_input_memory_config(
self.batch_size,
self.conv1_input_channels,
self.conv1_input_height,
self.conv1_input_width,
self.conv1_output_channels,
self.conv1_output_height,
self.conv1_output_width,
device.compute_with_storage_grid_size(),
self.conv1_config.input_channels_alignment,
is_grayskull(),
)

def __del__(self):
@@ -4,6 +4,7 @@

import ttnn
import torch
from models.demos.ttnn_resnet.tt.ttnn_functional_resnet50_model_utils import get_conv_input_memory_config
from models.utility_functions import (
is_grayskull,
is_wormhole_b0,
@@ -388,8 +389,25 @@ def __init__(
self.conv1_bias_tensor = parameters.conv1.bias
self.conv1_input_channels = self.conv1_weight_tensor.shape[1]
self.conv1_output_channels = self.conv1_weight_tensor.shape[0]
self.conv1_input_height = 451
self.conv1_input_width = 451
self.conv1_output_height = ttnn.get_conv_output_dim(self.conv1_input_height, 4, 1, 0)
self.conv1_output_width = ttnn.get_conv_output_dim(self.conv1_input_width, 4, 1, 0)
assert self.conv1_weight_tensor.shape[2] == 4

self.grayskull_conv1_input_memory_config = get_conv_input_memory_config(
self.batch_size,
self.conv1_input_channels,
self.conv1_input_height,
self.conv1_input_width,
self.conv1_output_channels,
self.conv1_output_height,
self.conv1_output_width,
device.compute_with_storage_grid_size(),
16,
True,
)

self.layer1 = self._make_layer(
parameters=parameters.layer1,
planes=64,
@@ -518,6 +536,11 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt
elif batch_size == 20:
act_block_h_override = 640

if is_grayskull():
input_tensor = ttnn.to_device(
input_tensor, device=device, memory_config=self.grayskull_conv1_input_memory_config
)

x, x_height, x_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d(
input_tensor=input_tensor,
weight_tensor=self.conv1_weight_tensor,
@@ -529,8 +552,8 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt
stride=(1, 1),
padding=(0, 0),
batch_size=self.batch_size,
input_height=451,
input_width=451,
input_height=self.conv1_input_height,
input_width=self.conv1_input_width,
conv_config=ttnn.Conv2dConfig(
dtype=self.model_config["ACTIVATIONS_DTYPE"],
weights_dtype=self.model_config["WEIGHTS_DTYPE"],
@@ -74,6 +74,33 @@ def __init__(

self.output_height = ttnn.get_conv_output_dim(input_height, 3, self.stride, 1)
self.output_width = ttnn.get_conv_output_dim(input_width, 3, self.stride, 1)
self.shard_layout = (
ttnn.TensorMemoryLayout.HEIGHT_SHARDED if self.in_channels < 320 else ttnn.TensorMemoryLayout.BLOCK_SHARDED
)

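# Precompute the sharded input memory config for this conv so __call__ can reshard to it explicitly
# (see the to_memory_config call below) instead of relying on reshard_if_not_optimal inside conv2d.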
self.input_memory_config = ttnn._ttnn.operations.conv.create_sharded_memory_config_from_parallel_config(
tensor_shape=ttnn.Shape(
[
1,
1,
self.batch_size * self.input_height * self.input_width,
self.out_channels,
]
),
parallel_config=ttnn._ttnn.operations.conv.determine_parallel_config(
shard_layout=self.shard_layout,
batch_size=self.batch_size,
input_channels=self.in_channels,
output_height=self.output_height,
output_width=self.output_width,
output_channels=self.out_channels,
compute_grid_size=self.device.compute_with_storage_grid_size(),
block_shard_orientation=ttnn.ShardOrientation.ROW_MAJOR,
enable_channels_padding=False,
is_out_tiled=True,
),
tile_size=32,
)

def __call__(
self,
@@ -104,13 +131,15 @@ def __call__(
math_approx_mode_enabled=True,
fp32_dest_acc_enabled=True,
packer_l1_accum_enabled=False,
shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED
if self.in_channels < 320
else ttnn.TensorMemoryLayout.BLOCK_SHARDED,
shard_layout=self.shard_layout,
input_channels_alignment=32,
transpose_shards=False,
reshard_if_not_optimal=True,
reshard_if_not_optimal=False,
)

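# Reshard the activations up front to the precomputed config; reshard_if_not_optimal is now disabled above.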
if hidden_states.memory_config() != self.input_memory_config:
hidden_states = ttnn.to_memory_config(hidden_states, self.input_memory_config)

if self.conv_config_override and "act_block_h" in self.conv_config_override:
conv_config.act_block_h_override = self.conv_config_override["act_block_h"]
