#15171: Better parallelization strategy
Pavle Josipovic committed Nov 27, 2024
1 parent effbfe0 commit 8ccf99e
Showing 27 changed files with 517 additions and 359 deletions.
@@ -4,10 +4,12 @@

import ttnn
import torch
from models.demos.ttnn_resnet.tt.ttnn_functional_resnet50_model_utils import get_conv_input_memory_config
from models.utility_functions import (
is_grayskull,
is_wormhole_b0,
pad_and_fold_conv_activation_for_unity_stride,
nearest_y,
)
from typing import List

@@ -391,8 +393,25 @@ def __init__(
self.conv1_bias_tensor = parameters.conv1.bias
self.conv1_input_channels = self.conv1_weight_tensor.shape[1]
self.conv1_output_channels = self.conv1_weight_tensor.shape[0]
self.conv1_input_height = 259
self.conv1_input_width = 259
self.conv1_output_height = ttnn.get_conv_output_dim(self.conv1_input_height, 4, 1, 0)
self.conv1_output_width = ttnn.get_conv_output_dim(self.conv1_input_width, 4, 1, 0)
assert self.conv1_weight_tensor.shape[2] == 4

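# Precompute the height-sharded input memory config used to place the folded conv1 activation
# on device for Grayskull (channels aligned to 16, override_num_cores=True so the core count
# evenly divides the NHW tile count).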
self.grayskull_conv1_input_memory_config = get_conv_input_memory_config(
self.batch_size,
self.conv1_input_channels,
self.conv1_input_height,
self.conv1_input_width,
self.conv1_output_channels,
self.conv1_output_height,
self.conv1_output_width,
device.compute_with_storage_grid_size(),
16,
True,
)

self.layer1 = self._make_layer(
parameters=parameters.layer1,
planes=64,
@@ -522,6 +541,11 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt
else:
act_block_h_override = 0

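# On Grayskull, move the host input onto the device directly in the precomputed sharded layout,
# bypassing the default conv sharding path that can pad the NHW tile count.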
if is_grayskull():
input_tensor = ttnn.to_device(
input_tensor, device=device, memory_config=self.grayskull_conv1_input_memory_config
)

x, x_height, x_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d(
input_tensor=input_tensor,
weight_tensor=self.conv1_weight_tensor,
@@ -533,8 +557,8 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt
stride=(1, 1),
padding=(0, 0),
batch_size=self.batch_size,
input_height=259,
input_width=259,
input_height=self.conv1_input_height,
input_width=self.conv1_input_width,
conv_config=ttnn.Conv2dConfig(
dtype=self.model_config["ACTIVATIONS_DTYPE"],
weights_dtype=self.model_config["WEIGHTS_DTYPE"],
@@ -828,6 +852,11 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c
else:
act_block_h_override = 0

if is_grayskull():
input_tensor = ttnn.to_device(
input_tensor, device=device, memory_config=self.grayskull_conv1_input_memory_config
)

x, x_height, x_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d(
input_tensor=input_tensor,
weight_tensor=self.conv1_weight_tensor,
@@ -839,8 +868,8 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c
stride=(1, 1),
padding=(0, 0),
batch_size=self.batch_size,
input_height=259,
input_width=259,
input_height=self.conv1_input_height,
input_width=self.conv1_input_width,
conv_config=ttnn.Conv2dConfig(
dtype=self.model_config["ACTIVATIONS_DTYPE"],
weights_dtype=self.model_config["WEIGHTS_DTYPE"],
@@ -0,0 +1,89 @@
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

import math
import ttnn
from models.utility_functions import nearest_y


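# Build a CoreRangeSet covering `num_cores` cores on a grid_rows x grid_cols grid:
# one range for the fully used rows, plus a partial row for any remainder.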
def get_core_grid_from_num_cores(num_cores: int, grid_rows: int, grid_cols: int):
columns = num_cores // grid_rows
assert columns <= grid_cols, "Not enough cores for specified core grid"
ranges = []
if columns != 0:
ranges.append(
ttnn.CoreRange(
ttnn.CoreCoord(0, 0),
ttnn.CoreCoord(grid_rows - 1, columns - 1),
)
)
remainder = num_cores % grid_rows
if remainder != 0:
assert columns + 1 <= grid_cols, "Not enough cores for specified core grid"
ranges.append(
ttnn.CoreRange(
ttnn.CoreCoord(0, columns),
ttnn.CoreCoord(remainder - 1, columns),
)
)
return ttnn.CoreRangeSet({*ranges})
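# Example (illustrative numbers, not from this commit): 10 cores on an 8x8 grid become one
# full row of 8 cores plus a partial second row of 2 cores:
#   get_core_grid_from_num_cores(10, 8, 8)
#   -> CoreRangeSet({CoreRange((0, 0), (7, 0)), CoreRange((0, 1), (1, 1))})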


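# Return the largest divisor of `num` that does not exceed start_divisor (walks downward from start_divisor).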
def find_closest_largest_divisor(num: int, start_divisor: int) -> int:
divisor = start_divisor
while num % divisor != 0:
divisor -= 1
return divisor
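# Example (illustrative numbers): find_closest_largest_divisor(98, 64) == 49, since 49 is the largest
# divisor of 98 not exceeding 64; using a true divisor keeps every core's shard the same size.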


# Determines the input memory config for a height-sharded conv operation.
# If override_num_cores is True, the number of cores is overridden to the closest largest divisor of the number of tiles.
# This avoids the default conv codepath, which can pad up the NHW tile count and produce padded output,
# which in turn can lead to issues with data-movement ops that do not handle padding correctly.
def get_conv_input_memory_config(
batch_size: int,
input_channels: int,
input_height: int,
input_width: int,
output_channels: int,
output_height: int,
output_width: int,
compute_grid: ttnn.CoreGrid,
input_channels_alignment: int,
override_num_cores: bool,
) -> ttnn.MemoryConfig:
parallel_config = ttnn._ttnn.operations.conv.determine_parallel_config(
shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
batch_size=batch_size,
input_channels=input_channels,
output_height=output_height,
output_width=output_width,
output_channels=output_channels,
compute_grid_size=compute_grid,
block_shard_orientation=ttnn.ShardOrientation.ROW_MAJOR,
enable_channels_padding=True,
is_out_tiled=True,
)

if override_num_cores:
nhw_ntiles = math.ceil(batch_size * output_height * output_width / 32)
num_cores_nwh = find_closest_largest_divisor(nhw_ntiles, compute_grid.x * compute_grid.y)
parallel_config.grid = get_core_grid_from_num_cores(num_cores_nwh, compute_grid.x, compute_grid.y)

memory_config = ttnn._ttnn.operations.conv.create_sharded_memory_config_from_parallel_config(
tensor_shape=ttnn.Shape(
[
1,
1,
input_width * input_height * batch_size,
nearest_y(
input_channels,
input_channels_alignment,
),
]
),
parallel_config=parallel_config,
tile_size=32,
)
return memory_config
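# Rough worked example of the override path (hypothetical shapes, not taken from this commit):
# batch_size=16 with a 256x256 conv output on a 12x9 compute grid (108 cores):
#   nhw_ntiles    = ceil(16 * 256 * 256 / 32) = 32768
#   num_cores_nwh = find_closest_largest_divisor(32768, 108) = 64   (32768 = 2^15, largest divisor <= 108)
# so the conv input is height-sharded across 64 cores with 512 tiles per core and no NHW padding.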
@@ -4,11 +4,11 @@

import ttnn
import torch
from models.demos.ttnn_resnet.tt.ttnn_functional_resnet50_model_utils import get_conv_input_memory_config
from models.utility_functions import (
is_grayskull,
is_wormhole_b0,
_nearest_y,
pad_and_fold_conv_activation_for_unity_stride,
)
from typing import List
from loguru import logger
@@ -632,15 +632,18 @@ def __init__(

conv_dummy_tensor = torch.rand((self.fold_output_shape), dtype=torch.bfloat16)
conv_dummy_tensor = ttnn.from_torch(conv_dummy_tensor, layout=ttnn.ROW_MAJOR_LAYOUT)
_, self.override_fold_mem_config, _, _ = ttnn.get_conv_padded_input_shape_and_mem_config(
device=device,
input_tensor=conv_dummy_tensor,
conv_config=self.conv1_config,
batch_size=self.batch_size,
height=self.conv1_output_height,
width=self.conv1_output_width,
in_channels=self.conv1_input_channels,
out_channels=self.conv1_output_channels,

self.override_fold_mem_config = get_conv_input_memory_config(
self.batch_size,
self.conv1_input_channels,
self.conv1_input_height,
self.conv1_input_width,
self.conv1_output_channels,
self.conv1_output_height,
self.conv1_output_width,
device.compute_with_storage_grid_size(),
self.conv1_config.input_channels_alignment,
is_grayskull(),
)

def __del__(self):
@@ -4,6 +4,7 @@

import ttnn
import torch
from models.demos.ttnn_resnet.tt.ttnn_functional_resnet50_model_utils import get_conv_input_memory_config
from models.utility_functions import (
is_grayskull,
is_wormhole_b0,
@@ -388,8 +389,25 @@ def __init__(
self.conv1_bias_tensor = parameters.conv1.bias
self.conv1_input_channels = self.conv1_weight_tensor.shape[1]
self.conv1_output_channels = self.conv1_weight_tensor.shape[0]
self.conv1_input_height = 451
self.conv1_input_width = 451
self.conv1_output_height = ttnn.get_conv_output_dim(self.conv1_input_height, 4, 1, 0)
self.conv1_output_width = ttnn.get_conv_output_dim(self.conv1_input_width, 4, 1, 0)
assert self.conv1_weight_tensor.shape[2] == 4

self.grayskull_conv1_input_memory_config = get_conv_input_memory_config(
self.batch_size,
self.conv1_input_channels,
self.conv1_input_height,
self.conv1_input_width,
self.conv1_output_channels,
self.conv1_output_height,
self.conv1_output_width,
device.compute_with_storage_grid_size(),
16,
True,
)

self.layer1 = self._make_layer(
parameters=parameters.layer1,
planes=64,
@@ -518,6 +536,11 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt
elif batch_size == 20:
act_block_h_override = 640

if is_grayskull():
input_tensor = ttnn.to_device(
input_tensor, device=device, memory_config=self.grayskull_conv1_input_memory_config
)

x, x_height, x_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d(
input_tensor=input_tensor,
weight_tensor=self.conv1_weight_tensor,
@@ -529,8 +552,8 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt
stride=(1, 1),
padding=(0, 0),
batch_size=self.batch_size,
input_height=451,
input_width=451,
input_height=self.conv1_input_height,
input_width=self.conv1_input_width,
conv_config=ttnn.Conv2dConfig(
dtype=self.model_config["ACTIVATIONS_DTYPE"],
weights_dtype=self.model_config["WEIGHTS_DTYPE"],
@@ -74,6 +74,33 @@ def __init__(

self.output_height = ttnn.get_conv_output_dim(input_height, 3, self.stride, 1)
self.output_width = ttnn.get_conv_output_dim(input_width, 3, self.stride, 1)
self.shard_layout = (
ttnn.TensorMemoryLayout.HEIGHT_SHARDED if self.in_channels < 320 else ttnn.TensorMemoryLayout.BLOCK_SHARDED
)

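# Precompute the sharded input memory config for this conv so __call__ can reshard to it explicitly
# (see the to_memory_config call below) instead of relying on reshard_if_not_optimal inside conv2d.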
self.input_memory_config = ttnn._ttnn.operations.conv.create_sharded_memory_config_from_parallel_config(
tensor_shape=ttnn.Shape(
[
1,
1,
self.batch_size * self.input_height * self.input_width,
self.out_channels,
]
),
parallel_config=ttnn._ttnn.operations.conv.determine_parallel_config(
shard_layout=self.shard_layout,
batch_size=self.batch_size,
input_channels=self.in_channels,
output_height=self.output_height,
output_width=self.output_width,
output_channels=self.out_channels,
compute_grid_size=self.device.compute_with_storage_grid_size(),
block_shard_orientation=ttnn.ShardOrientation.ROW_MAJOR,
enable_channels_padding=False,
is_out_tiled=True,
),
tile_size=32,
)

def __call__(
self,
@@ -104,13 +131,15 @@ def __call__(
math_approx_mode_enabled=True,
fp32_dest_acc_enabled=True,
packer_l1_accum_enabled=False,
shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED
if self.in_channels < 320
else ttnn.TensorMemoryLayout.BLOCK_SHARDED,
shard_layout=self.shard_layout,
input_channels_alignment=32,
transpose_shards=False,
reshard_if_not_optimal=True,
reshard_if_not_optimal=False,
)

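# Reshard the activations up front to the precomputed config; reshard_if_not_optimal is now disabled above.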
if hidden_states.memory_config() != self.input_memory_config:
hidden_states = ttnn.to_memory_config(hidden_states, self.input_memory_config)

if self.conv_config_override and "act_block_h" in self.conv_config_override:
conv_config.act_block_h_override = self.conv_config_override["act_block_h"]
