Remove some usages of ttnn::Shape from the codebase (#17062)
### Ticket

### Problem description
`ttnn::Shape` carries both logical and tile-padded dimensions, but many call sites only need the logical dimensions. Those call sites should use the purely logical shape type instead.

### What's changed
Replaced usages of `ttnn::Shape` with `ttnn::SimpleShape` across tests, graph-trace utilities, CCL ops, and conv2d utilities, dropping the now-redundant `.logical_shape()` and `.with_tile_padding()` calls at the affected call sites. The before/after pattern is sketched below.
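A minimal sketch of the migration pattern, distilled from the hunks below (illustrative only; variable names are not taken verbatim from the diff):

```cpp
// Before: ttnn::Shape carries logical + padded dims, so callers that only
// need the logical dims had to unwrap them explicitly.
const ttnn::Shape old_shape = ttnn::Shape{1, 1, 32, 128};
const auto num_elems_old = old_shape.logical_shape().volume();

// After: ttnn::SimpleShape holds just the logical dims, so the extra
// .logical_shape() hop disappears.
const ttnn::SimpleShape new_shape({1, 1, 32, 128});
const auto num_elems_new = new_shape.volume();
```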

### Checklist
- [x] [Post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/12960342700)
- [x] [Model regression CI testing passes](https://github.com/tenstorrent/tt-metal/actions/runs/12960346094)
- [x] [Device performance regression CI testing passes](https://github.com/tenstorrent/tt-metal/actions/runs/12960344180)
- [x] [T3K frequent CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/12958453052)
- [x] [T3K unit tests CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/12958455364)
- [x] [Nightly model and ttnn CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/12960348968)
- [x] New/Existing tests provide coverage for changes
sminakov-tt authored Jan 25, 2025
1 parent ad0b806 commit f952174
Showing 26 changed files with 185 additions and 204 deletions.
8 changes: 4 additions & 4 deletions tests/tt_eager/ops/test_sliding_window_ops.cpp
@@ -377,7 +377,7 @@ int main() {
{config.batch_size,
config.input_hw.first + 2 * config.pad_hw.first,
config.input_hw.second + 2 * config.pad_hw.second});
-auto output_tensor_shape = config.get_output_shape().value;
+auto output_tensor_shape = config.get_output_shape();
ttnn::SimpleShape filter_tensor_shape({config.window_hw.first, config.window_hw.second});

Tensor input_padded_tensor =
@@ -390,12 +390,12 @@ int main() {
vector<float> filter_vector = create_filter_vec(filter_tensor_buf, tc.filter_h, tc.filter_w);
owned_buffer::Buffer<bfloat16> out_golden_tensor_buf = ref_conv_op(
input_padded_tensor,
-ttnn::Shape(input_tensor_shape),
+input_tensor_shape,
tc.stride_h,
tc.stride_w,
filter_vector,
-ttnn::Shape(filter_tensor_shape),
-ttnn::Shape(output_tensor_shape));
+filter_tensor_shape,
+output_tensor_shape);

auto failed_tests = validate_generate_functions(
device,
Changed file (name not captured in this view):
@@ -1646,8 +1646,8 @@ TEST(WorkerFabricEdmDatapath, LineFabricMcast_ManyMessages_SingleSource_Persiste
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////

-ttnn::ccl::Shape4D<uint32_t> shape_to_shape_in_tiles(ttnn::Shape const& shape) {
-auto logical_shape = shape.logical_shape();
+ttnn::ccl::Shape4D<uint32_t> shape_to_shape_in_tiles(const ttnn::SimpleShape& shape) {
+auto logical_shape = shape;
logical_shape[-2] /= tt::constants::TILE_HEIGHT;
logical_shape[-1] /= tt::constants::TILE_WIDTH;
EXPECT_TRUE(logical_shape.size() == 4);
@@ -2727,20 +2727,19 @@ TEST(CclAsyncOp, ReduceScatterSmall_PersistentFabric) {
"Expected {} devices but got {}",
test_expected_num_devices,
num_devices);
-const ttnn::Shape input_shape = ttnn::Shape{1, 1, 32, 32 * num_devices};
+const ttnn::SimpleShape input_shape({1, 1, 32, 32 * num_devices});
const MemoryConfig in_memory_config = MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM);
-auto const logical_shape = input_shape.logical_shape();
-const auto num_elems = logical_shape.volume();
+const auto num_elems = input_shape.volume();

// INPUT TENSOR setup
size_t page_size = tile_size(DataFormat::Float16);
std::vector<Tensor> device_input_tensors;
for (size_t i = 0; i < num_devices; i++) {
// host_input_tensors.push_back(ttnn::numpy::random::uniform(bfloat16(-1.0f), bfloat16(1.0f) ,
-// {logical_shape[0],logical_shape[1],logical_shape[2],logical_shape[3]}, layout).to(devices[i]));
+// {input_shape[0],input_shape[1],input_shape[2],input_shape[3]}, layout).to(devices[i]));
auto t = ttnn::experimental::view(ttnn::arange(0, num_elems, 1, DataType::BFLOAT16), input_shape).to(layout);
t.set_tensor_spec(TensorSpec(
-logical_shape, TensorLayout(DataType::BFLOAT16, PageConfig(layout, tt_metal::Tile()), in_memory_config)));
+input_shape, TensorLayout(DataType::BFLOAT16, PageConfig(layout, tt_metal::Tile()), in_memory_config)));

device_input_tensors.push_back(t.to(devices[i]));
}
@@ -2814,7 +2813,8 @@ TEST(CclAsyncOp, ReduceScatterSmall_PersistentFabric) {
}

#include "ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.hpp"
-void run_all_gather_with_persistent_fabric(const size_t dim, const size_t num_links, ttnn::Shape const& input_shape) {
+void run_all_gather_with_persistent_fabric(
+    const size_t dim, const size_t num_links, ttnn::SimpleShape const& input_shape) {
log_info(tt::LogTest, "entering test");
constexpr auto layout = Layout::TILE;
// DEVICES setuip
@@ -2841,8 +2841,7 @@ void run_all_gather_with_persistent_fabric(const size_t dim, const size_t num_li
test_expected_num_devices,
num_devices);
const MemoryConfig in_memory_config = MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM);
-auto const logical_shape = input_shape.logical_shape();
-const auto num_elems = logical_shape.volume();
+const auto num_elems = input_shape.volume();

// INPUT TENSOR setup
log_info(tt::LogTest, "setting up input tensors");
@@ -2851,7 +2850,7 @@ void run_all_gather_with_persistent_fabric(const size_t dim, const size_t num_li
for (size_t i = 0; i < num_devices; i++) {
auto t = ttnn::experimental::view(ttnn::arange(0, num_elems, 1), input_shape).to(layout);
t.set_tensor_spec(TensorSpec(
-logical_shape, TensorLayout(DataType::BFLOAT16, PageConfig(layout, tt_metal::Tile()), in_memory_config)));
+input_shape, TensorLayout(DataType::BFLOAT16, PageConfig(layout, tt_metal::Tile()), in_memory_config)));

device_input_tensors.push_back(t.to(devices[i]));
}
@@ -2915,16 +2914,16 @@ void run_all_gather_with_persistent_fabric(const size_t dim, const size_t num_li
}

TEST(CclAsyncOp, AllGather_PersistentFabric_Dim3_Links1_Shape1_1_32_128) {
-run_all_gather_with_persistent_fabric(3, 1, ttnn::Shape{1, 1, 32, 128});
+run_all_gather_with_persistent_fabric(3, 1, ttnn::SimpleShape({1, 1, 32, 128}));
}
TEST(CclAsyncOp, AllGather_PersistentFabric_Dim3_Links1_Shape1_1_32_8192) {
-run_all_gather_with_persistent_fabric(3, 1, ttnn::Shape{1, 1, 32, 8192});
+run_all_gather_with_persistent_fabric(3, 1, ttnn::SimpleShape({1, 1, 32, 8192}));
}
// Mesh device setup seems to not provide the correct configuration for multi-link? To be investigated
TEST(CclAsyncOp, DISABLED_AllGather_PersistentFabric_Dim3_Links2_Shape1_1_32_128) {
-run_all_gather_with_persistent_fabric(3, 2, ttnn::Shape{1, 1, 32, 128});
+run_all_gather_with_persistent_fabric(3, 2, ttnn::SimpleShape({1, 1, 32, 128}));
}
// Mesh device setup seems to not provide the correct configuration for multi-link? To be investigated
TEST(CclAsyncOp, DISABLED_AllGather_PersistentFabric_Dim3_Links2_Shape1_1_32_8192) {
-run_all_gather_with_persistent_fabric(3, 2, ttnn::Shape{1, 1, 32, 8192});
+run_all_gather_with_persistent_fabric(3, 2, ttnn::SimpleShape({1, 1, 32, 8192}));
}
36 changes: 18 additions & 18 deletions tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp
@@ -86,7 +86,7 @@ std::ostream& operator<<(std::ostream& os, const tt::tt_metal::DataType& value)
}

using CombinationInputParams =
-std::tuple<ttnn::Shape, tt::tt_metal::DataType, tt::tt_metal::Layout, tt::tt_metal::MemoryConfig>;
+std::tuple<ttnn::SimpleShape, tt::tt_metal::DataType, tt::tt_metal::Layout, tt::tt_metal::MemoryConfig>;
class EmptyTensorTest : public ttnn::TTNNFixtureWithDevice,
public ::testing::WithParamInterface<CombinationInputParams> {};

@@ -108,36 +108,36 @@ TEST_P(EmptyTensorTest, Combinations) {

// Ignoring too large single bank allocations
if (memory_config.memory_layout == TensorMemoryLayout::SINGLE_BANK) {
-if (tensor_layout.compute_page_size_bytes(shape.logical_shape()) >= 500 * 1024) {
+if (tensor_layout.compute_page_size_bytes(shape) >= 500 * 1024) {
GTEST_SKIP() << "Skipping test with page size exceeding single bank size of 500 kB!";
}
}

auto tensor = tt::tt_metal::create_device_tensor(shape, dtype, layout, device_, memory_config);
-EXPECT_EQ(tensor.get_logical_shape(), shape.logical_shape());
+EXPECT_EQ(tensor.get_logical_shape(), shape);

-test_utils::test_tensor_on_device(shape.logical_shape(), tensor_layout, device_);
+test_utils::test_tensor_on_device(shape, tensor_layout, device_);
}

INSTANTIATE_TEST_SUITE_P(
EmptyTensorTestWithShape,
EmptyTensorTest,
::testing::Combine(
::testing::Values(
-ttnn::Shape({}),
-ttnn::Shape({0}),
-ttnn::Shape({1}),
-ttnn::Shape({1, 2}),
-ttnn::Shape({1, 2, 3}),
-ttnn::Shape({1, 2, 3, 4}),
-// ttnn::Shape({0, 0, 0, 0}), fails with width sharded case
-ttnn::Shape({1, 1, 1, 1}),
-// ttnn::Shape({0, 1, 32, 32}), fails with width sharded case
-ttnn::Shape({1, 1, 32, 32}),
-ttnn::Shape({2, 1, 32, 32}),
-ttnn::Shape({64, 1, 256, 1}),
-ttnn::Shape({1, 1, 21120, 16}),
-ttnn::Shape({1, 2, 3, 4, 5})),
+ttnn::SimpleShape({}),
+ttnn::SimpleShape({0}),
+ttnn::SimpleShape({1}),
+ttnn::SimpleShape({1, 2}),
+ttnn::SimpleShape({1, 2, 3}),
+ttnn::SimpleShape({1, 2, 3, 4}),
+// ttnn::SimpleShape({0, 0, 0, 0}), fails with width sharded case
+ttnn::SimpleShape({1, 1, 1, 1}),
+// ttnn::SimpleShape({0, 1, 32, 32}), fails with width sharded case
+ttnn::SimpleShape({1, 1, 32, 32}),
+ttnn::SimpleShape({2, 1, 32, 32}),
+ttnn::SimpleShape({64, 1, 256, 1}),
+ttnn::SimpleShape({1, 1, 21120, 16}),
+ttnn::SimpleShape({1, 2, 3, 4, 5})),

::testing::Values(
tt::tt_metal::DataType::BFLOAT16,
6 changes: 3 additions & 3 deletions tests/ttnn/unit_tests/gtests/test_graph_add.cpp
@@ -145,7 +145,7 @@ INSTANTIATE_TEST_SUITE_P(
.expected_l1_output_per_core = 2048,
.expected_l1_peak_per_core = 2048,
.expected_output_info = {graph::TensorInfo{
-.shape = ttnn::Shape(tt::tt_metal::Array4D{1, 3, 32, 32}),
+.shape = ttnn::SimpleShape(tt::tt_metal::Array4D{1, 3, 32, 32}),
.size = 6144,
.type = tt::tt_metal::BufferType::L1}}},
AddOpGraphTestParam{
@@ -167,7 +167,7 @@ INSTANTIATE_TEST_SUITE_P(
.expected_l1_output_per_core = 2048,
.expected_l1_peak_per_core = 2 * 2048,
.expected_output_info = {graph::TensorInfo{
-.shape = ttnn::Shape(tt::tt_metal::Array4D{4, 3, 32, 32}),
+.shape = ttnn::SimpleShape(tt::tt_metal::Array4D{4, 3, 32, 32}),
.size = 24576,
.type = tt::tt_metal::BufferType::L1}},
},
@@ -190,7 +190,7 @@ INSTANTIATE_TEST_SUITE_P(
.expected_l1_output_per_core = 2 * (3 * 32 * 32 * 32 * 32) / 16,
.expected_l1_peak_per_core = 2 * (3 * 32 * 32 * 32 * 32) / 16,
.expected_output_info = {graph::TensorInfo{
-.shape = ttnn::Shape(tt::tt_metal::Array4D{3, 1, 32 * 32, 32 * 32}),
+.shape = ttnn::SimpleShape(tt::tt_metal::Array4D{3, 1, 32 * 32, 32 * 32}),
.size = 2 * (3 * 32 * 32 * 32 * 32),
.type = tt::tt_metal::BufferType::L1}}}),
::testing::Values(
4 changes: 2 additions & 2 deletions ttnn/cpp/ttnn/graph/graph_trace_utils.cpp
@@ -15,7 +15,7 @@
namespace ttnn::graph {

namespace {
-ttnn::Shape parse_shape(std::string_view shape_string) {
+ttnn::SimpleShape parse_shape(std::string_view shape_string) {
// Extract shape values from string like "ttnn.Shape([1, 3, 32, 32])"
auto start = shape_string.find('[') + 1;
auto end = shape_string.find(']');
@@ -42,7 +42,7 @@ ttnn::Shape parse_shape(std::string_view shape_string) {
}
}

-return ttnn::Shape(shape);
+return ttnn::SimpleShape(std::move(shape));
}
} // namespace

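A side note on the `std::move` in the new `parse_shape` return: it hands the freshly parsed dimension container to `ttnn::SimpleShape` rather than copying it. A standalone sketch of the same idiom, using `std::vector<uint32_t>` as a stand-in since the actual container type isn't shown in this hunk:

```cpp
#include <cstdint>
#include <utility>
#include <vector>

// Stand-in for ttnn::SimpleShape: a type that owns its dimension storage.
struct ShapeLike {
    explicit ShapeLike(std::vector<uint32_t> dims) : dims_(std::move(dims)) {}
    std::vector<uint32_t> dims_;
};

ShapeLike parse_dims() {
    std::vector<uint32_t> shape = {1, 3, 32, 32};  // ...values parsed from a string...
    // Moving transfers the buffer instead of copying it; `shape` is a local
    // that is about to go out of scope anyway.
    return ShapeLike(std::move(shape));
}
```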
2 changes: 1 addition & 1 deletion ttnn/cpp/ttnn/graph/graph_trace_utils.hpp
@@ -27,7 +27,7 @@ std::vector<std::string> extract_calltrace(const nlohmann::json& trace);
std::unordered_set<uint32_t> extract_output_tensors(const nlohmann::json& trace);

struct TensorInfo {
-ttnn::Shape shape;
+ttnn::SimpleShape shape;
uint32_t size = 0;
tt::tt_metal::BufferType type = tt::tt_metal::BufferType::DRAM;

Changed file (name not captured in this view):
@@ -88,23 +88,6 @@ std::vector<std::vector<ttnn::ccl::v2::TensorSlice>> split_tensor_slices_across_
return worker_slices_streams;
};

-Shape4D<uint32_t> from_tensor_shape(ttnn::Shape const& shape) {
-constexpr size_t max_rank = 4;
-TT_FATAL(
-shape.size() <= max_rank,
-"Reduce scatter device code only supports tensors up to rank 4. Current tensor rank is {}. The host code "
-"calling the program factory must reduce the dimensionality",
-shape.size());
-
-Shape4D<uint32_t> shape4d = {1, 1, 1, 1};
-size_t output_index = max_rank - 1;
-for (int i = shape.size() - 1; i >= 0; --i) {
-shape4d[output_index] = shape[i];
-output_index--;
-}
-return shape4d;
-}

static ttnn::ccl::Shape4D<uint32_t> shape_to_shape_in_tiles(const SimpleShape& shape) {
TT_FATAL(shape.rank() == 4, "Expected 4D shape but got {}", shape.rank());
ttnn::ccl::Shape4D<uint32_t> shape_in_tiles = {
40 changes: 22 additions & 18 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
@@ -242,7 +242,7 @@ uint32_t get_num_cores_channels_from_parallel_config(const ParallelConfig& pconf
}

MemoryConfig create_sharded_memory_config_from_parallel_config(
-const ttnn::Shape& tensor_shape, const ParallelConfig& parallel_config, uint32_t tile_size) {
+const ttnn::SimpleShape& tensor_shape, const ParallelConfig& parallel_config, uint32_t tile_size) {
log_debug(
tt::LogOp,
"create_sharded_memory_config_from_parallel_config: tensor_shape: {}, parallel_config: {}, tile_size: {}",
@@ -252,7 +252,7 @@ MemoryConfig create_sharded_memory_config_from_parallel_config(
// tensor_shape is [N, H, W, C]
TT_ASSERT(tensor_shape[0] == 1 && tensor_shape[1] == 1); // todo: add support for generic non-2d shapes
// uint32_t channels = tensor_shape[3];
-uint32_t channels = tensor_shape.with_tile_padding()[3];
+uint32_t channels = tensor_shape[3];
uint32_t num_cores_nhw = get_num_cores_nhw_from_parallel_config(parallel_config);
uint32_t num_cores_channels = get_num_cores_channels_from_parallel_config(parallel_config);
auto shard_scheme = parallel_config.shard_scheme;
@@ -480,7 +480,7 @@ static TensorMemoryLayout select_shard_spec(
}

template <typename T>
-static std::tuple<ttnn::Shape, ttnn::MemoryConfig, bool, bool> get_conv_padded_input_shape_and_mem_config(
+static std::tuple<ttnn::SimpleShape, ttnn::MemoryConfig, bool, bool> get_conv_padded_input_shape_and_mem_config(
T* device,
const ttnn::Tensor& input_tensor_,
const Conv2dConfig& conv_config,
@@ -616,22 +616,26 @@ static std::tuple<ttnn::Shape, ttnn::MemoryConfig, bool, bool> get_conv_padded_i
tt::round_up(input_shape[3], conv_config.input_channels_alignment);
}

-auto input_padded_shape = ttnn::Shape(std::array<uint32_t, 4>{
-1,
-1,
-input_tensor_height_snapped_to_tile,
-input_tensor_width_snapped_to_channels_alignment}); // TODO: resolve ttnn::types::Shape and
-// tt::tt_metal::LegacyShape issue to clean up next
-// line
+auto input_padded_shape = ttnn::SimpleShape(
+{1,
+1,
+input_tensor_height_snapped_to_tile,
+input_tensor_width_snapped_to_channels_alignment}); // TODO: resolve ttnn::types::Shape and
+// tt::tt_metal::LegacyShape issue to clean up next
+// line
MemoryConfig input_tensor_sharded_memory_config = create_sharded_memory_config_from_parallel_config(
-ttnn::Shape(std::array<uint32_t, 4>{
-input_padded_shape[0], input_padded_shape[1], input_padded_shape[2], input_padded_shape[3]}),
+ttnn::SimpleShape(
+{input_padded_shape[0], input_padded_shape[1], input_padded_shape[2], input_padded_shape[3]}),
parallel_config,
round_up_size);

return {input_padded_shape, input_tensor_sharded_memory_config, needs_shard_or_reshard, use_non_tile_height};
} else {
-return {input_tensor.shape(), input_tensor.memory_config(), needs_shard_or_reshard, use_non_tile_height};
+return {
+input_tensor.get_logical_shape(),
+input_tensor.memory_config(),
+needs_shard_or_reshard,
+use_non_tile_height};
}
}

@@ -888,9 +892,7 @@ std::tuple<OptimizedConvParallelizationConfig, OptimizedConvBlockConfig, MemoryC
out_channels_padded = tt::round_up(out_channels, 32);
}
MemoryConfig conv_out_memory_config = create_sharded_memory_config_from_parallel_config(
-ttnn::Shape(std::array<uint32_t, 4>{1, 1, nhw_out, out_channels_padded}),
-output_parallel_config,
-round_up_size);
+ttnn::SimpleShape({1, 1, nhw_out, out_channels_padded}), output_parallel_config, round_up_size);
ParallelConfig largest_parallel_config =
output_parallel_config.grid.num_cores() > input_parallel_config.grid.num_cores() ? output_parallel_config
: input_parallel_config;
@@ -1256,7 +1258,8 @@ template bool check_non_tile_mul_width<IDevice>(
template bool check_non_tile_mul_width<MeshDevice>(
MeshDevice* device, const Conv2dConfig& conv_config, const uint32_t in_channels);

-template std::tuple<ttnn::Shape, ttnn::MemoryConfig, bool, bool> get_conv_padded_input_shape_and_mem_config<IDevice>(
+template std::tuple<ttnn::SimpleShape, ttnn::MemoryConfig, bool, bool>
+get_conv_padded_input_shape_and_mem_config<IDevice>(
IDevice* device,
const ttnn::Tensor& input_tensor_,
const Conv2dConfig& conv_config,
Expand All @@ -1268,7 +1271,8 @@ template std::tuple<ttnn::Shape, ttnn::MemoryConfig, bool, bool> get_conv_padded
bool is_mm_conv,
bool is_non_tile_mul_width);

-template std::tuple<ttnn::Shape, ttnn::MemoryConfig, bool, bool> get_conv_padded_input_shape_and_mem_config<MeshDevice>(
+template std::tuple<ttnn::SimpleShape, ttnn::MemoryConfig, bool, bool>
+get_conv_padded_input_shape_and_mem_config<MeshDevice>(
MeshDevice* device,
const ttnn::Tensor& input_tensor_,
const Conv2dConfig& conv_config,
4 changes: 2 additions & 2 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp
@@ -64,7 +64,7 @@ uint32_t get_num_cores_nhw_from_parallel_config(const sliding_window::ParallelCo
uint32_t get_num_cores_channels_from_parallel_config(const sliding_window::ParallelConfig& pconfig);

MemoryConfig create_sharded_memory_config_from_parallel_config(
-const ttnn::Shape& tensor_shape, const sliding_window::ParallelConfig& parallel_config, uint32_t tile_size);
+const ttnn::SimpleShape& tensor_shape, const sliding_window::ParallelConfig& parallel_config, uint32_t tile_size);

OptimizedConvParallelizationConfig determine_conv_op_parallel_config_from_conv_output_mem_config(
const MemoryConfig& conv_output_mem_config, uint32_t num_cores_nhw, uint32_t num_cores_c);
@@ -104,7 +104,7 @@ std::tuple<OptimizedConvParallelizationConfig, OptimizedConvBlockConfig, MemoryC
T* device);

template <typename T>
-static std::tuple<ttnn::Shape, ttnn::MemoryConfig, bool, bool> get_conv_padded_input_shape_and_mem_config(
+static std::tuple<ttnn::SimpleShape, ttnn::MemoryConfig, bool, bool> get_conv_padded_input_shape_and_mem_config(
T* device,
const ttnn::Tensor& input_tensor_,
const Conv2dConfig& conv_config,
