
Commit

moved Silicon to Dialect test because of the L1 Interleaved hardware incompatibility with compiler
fbajraktariTT committed Nov 18, 2024
1 parent 57c92ed commit 4d1c249
Showing 12 changed files with 215 additions and 162 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -10,3 +10,4 @@ ttrt-artifacts/*
query_results.json
run_results.json
ttrt_report.xml
cluster_description.yaml
24 changes: 24 additions & 0 deletions cluster_descriptor.yaml
@@ -0,0 +1,24 @@
arch: {
0: Wormhole,
}

chips: {
0: [0,0,0,0],
}

ethernet_connections: [
]

chips_with_mmio: [
0: 1,
]

# harvest_mask is a bitmask indicating which tensix rows are harvested: bit 0 = first tensix row, bit 1 = second tensix row, etc.
harvesting: {
0: {noc_translation: true, harvest_mask: 1},
}

# This value will be null if the boardtype is unknown. That should never happen in practice, but to be defensive it would be useful to throw an error in this case.
boardtype: {
0: n150,
}
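
As a hedged illustration of the harvest_mask encoding described in the comment above (the helper below is hypothetical, not part of this repository), each set bit marks one harvested tensix row:

// Hypothetical helper illustrating the harvest_mask bit layout:
// bit 0 = first tensix row, bit 1 = second tensix row, and so on.
#include <cstdint>
#include <vector>

std::vector<int> harvestedRows(uint32_t harvestMask) {
  std::vector<int> rows;
  for (int bit = 0; harvestMask != 0; ++bit, harvestMask >>= 1) {
    if (harvestMask & 1) {
      rows.push_back(bit); // row `bit` is harvested
    }
  }
  return rows;
}

For the n150 entry above, harvestedRows(1) returns {0}: harvest_mask: 1 means only the first tensix row is harvested.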
7 changes: 5 additions & 2 deletions lib/Dialect/TTNN/Analysis/L1InterleavedPolicy.cpp
@@ -93,6 +93,7 @@ L1InterleavedPolicy::getOptimalOpConfig(
llvm::DenseMap<Operation *, tt::LayoutAttr> optimalConfig;

optimalL1Usage = 0;
optimalConfigMask = 0;
n = opsL1Usage.size();
for (i = 0; i < (1 << n); i++) {

@@ -108,10 +109,12 @@
j <<= 1;
}

// Check if the current configuration is optimal.
// This constant was determined from execution data; it will eventually
// be replaced with an API query.
//
constexpr float tensorL1UsageCap = 0.75;
if (optimalL1Usage < currentL1Usage &&
currentL1Usage <= usableL1CacheSize) {
currentL1Usage <= tensorL1UsageCap * usableL1CacheSize) {
optimalL1Usage = currentL1Usage;
optimalConfigMask = i;
}
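For context, here is a minimal standalone sketch of the subset enumeration used by getOptimalOpConfig (the function name comes from the diff, but this signature and the std::vector input are assumptions for illustration): each bit of the mask i selects one op's L1 usage, and the best mask is the one that maximizes total L1 usage while staying under the capped budget.

#include <cstdint>
#include <vector>

uint64_t pickOptimalConfigMask(const std::vector<uint64_t> &opsL1Usage,
                               uint64_t usableL1CacheSize) {
  constexpr float tensorL1UsageCap = 0.75f; // empirical cap from the diff
  uint64_t optimalL1Usage = 0;
  uint64_t optimalConfigMask = 0;
  const size_t n = opsL1Usage.size();
  for (uint64_t i = 0; i < (1ULL << n); ++i) {
    // Sum the L1 usage of every op selected by the set bits of mask i.
    uint64_t currentL1Usage = 0;
    for (size_t k = 0; k < n; ++k) {
      if (i & (1ULL << k)) {
        currentL1Usage += opsL1Usage[k];
      }
    }
    // Keep the subset that uses the most L1 without exceeding the cap.
    if (optimalL1Usage < currentL1Usage &&
        currentL1Usage <= tensorL1UsageCap * usableL1CacheSize) {
      optimalL1Usage = currentL1Usage;
      optimalConfigMask = i;
    }
  }
  return optimalConfigMask;
}

The enumeration is O(2^n), which is acceptable here because n is the small number of ops whose layouts are being decided together.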
124 changes: 0 additions & 124 deletions out.mlir

This file was deleted.

@@ -1,19 +1,30 @@
// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s > %t.mlir
// RUN: FileCheck %s --input-file=%t.mlir
// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
//
// A B
// \ /
// C
// |
// D
//
// (A > L1) AND (B > L1) AND (C > L1)
// =>
// DRAM: ABC; L1: None
//
#any_device = #tt.operand_constraint<dram|l1|scalar|tile|any_device|any_device_tile>
module attributes {} {
func.func @forward(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x8192xbf16>, %arg2: tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> {
// CHECK: #[[LAYOUT_2:layout2]] = #tt.layout<{{.*}}, memref<{{.*}}, #dram>, {{.*}}>
func.func @forward(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x8192xbf16>, %arg2: tensor<8192x8192xbf16>, %arg3: tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> {
// CHECK: #[[LAYOUT_2:layout2]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <{{.*}}>, memref<{{.*}}, #dram>, interleaved>
%0 = tensor.empty() : tensor<8192x8192xbf16>
// CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]>
// CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]>
%1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16>
%2 = tensor.empty() : tensor<8192x8192xbf16>
// CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]>
%3 = "ttir.add"(%1, %arg2, %2) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16>
// CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]>
%3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16>
%4 = tensor.empty() : tensor<8192x8192xbf16>
// CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]>
%7 = "ttir.relu"(%3, %4) <{operandSegmentSizes = array<i32: 1, 1>, operand_constraints = [#any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16>
return %7 : tensor<8192x8192xbf16>
// CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]>
%5 = "ttir.matmul"(%1, %3, %4) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16>
return %5 : tensor<8192x8192xbf16>
}
}
}
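
As a back-of-the-envelope check of why every tensor lands in DRAM in the test above (the ~1.5 MiB per-core L1 size and 8x8 core grid are illustrative assumptions, not values read from the system descriptor):

#include <cstdint>
#include <cstdio>

int main() {
  constexpr uint64_t tensorBytes = 8192ULL * 8192ULL * 2;     // bf16 = 2 bytes
  constexpr uint64_t cores = 8 * 8;                           // assumed grid
  constexpr uint64_t perCoreBytes = tensorBytes / cores;      // 2 MiB per core
  constexpr double l1BudgetBytes = 0.75 * 1.5 * 1024 * 1024;  // capped L1
  std::printf("%llu bytes per core vs %.0f byte budget\n",
              static_cast<unsigned long long>(perCoreBytes), l1BudgetBytes);
  return 0;
}

This prints 2097152 bytes per core against a 1179648 byte budget, so no 8192x8192 bf16 tensor fits in L1 and the policy keeps A, B, and C in DRAM.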
@@ -0,0 +1,33 @@
// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s > %t.mlir
// RUN: FileCheck %s --input-file=%t.mlir
// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
//
// A B
// \ /
// C
// |
// D
//
// (A + C > L1) AND (B + C > L1) AND (A + B > L1) AND (A < C) AND (B < C) AND (C <= L1)
// =>
// DRAM: AB; L1: C
//
#any_device = #tt.operand_constraint<dram|l1|scalar|tile|any_device|any_device_tile>
module attributes {} {
func.func @forward(%arg0: tensor<5120x4096xbf16>, %arg1: tensor<5120x4096xbf16>, %arg2: tensor<4096x5120xbf16>, %arg3: tensor<4096x5120xbf16>) -> tensor<5120x5120xbf16> {
// CHECK: #[[L1_:.*]] = #tt.memory_space<l1>
// CHECK: #[[LAYOUT_4:layout4]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <{{.*}}>, memref<{{.*}}, #dram>, interleaved>
// CHECK: #[[LAYOUT_6:layout6]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <{{.*}}>, memref<{{.*}}, #dram>, interleaved>
// CHECK: #[[LAYOUT_7:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <{{.*}}>, memref<{{.*}}, #l1_>, interleaved>
%0 = tensor.empty() : tensor<5120x4096xbf16>
// CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<5120x4096xbf16, #[[LAYOUT_4]]>
%1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<5120x4096xbf16>, tensor<5120x4096xbf16>, tensor<5120x4096xbf16>) -> tensor<5120x4096xbf16>
%2 = tensor.empty() : tensor<4096x5120xbf16>
// CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<4096x5120xbf16, #[[LAYOUT_6]]>
%3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<4096x5120xbf16>, tensor<4096x5120xbf16>, tensor<4096x5120xbf16>) -> tensor<4096x5120xbf16>
%4 = tensor.empty() : tensor<5120x5120xbf16>
// CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<5120x5120xbf16, #[[LAYOUT_7]]>
%5 = "ttir.matmul"(%1, %3, %4) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<5120x4096xbf16>, tensor<4096x5120xbf16>, tensor<5120x5120xbf16>) -> tensor<5120x5120xbf16>
return %5 : tensor<5120x5120xbf16>
}
}
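
Under the same illustrative assumptions (64 cores, 0.75 × 1.5 MiB ≈ 1.125 MiB of usable L1 per core): A and B are each 5120 × 4096 × 2 B = 40 MiB, about 640 KiB per core, while C is 5120 × 5120 × 2 B = 50 MiB, about 800 KiB per core. Each tensor fits on its own, but any pair (e.g. A + C at roughly 1.41 MiB per core) exceeds the budget, so the policy keeps only the largest consumer, C, in L1.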
@@ -0,0 +1,32 @@
// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s > %t.mlir
// RUN: FileCheck %s --input-file=%t.mlir
// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
//
// A B
// \ /
// C
// |
// D
//
// (A + C > L1) AND (B + C > L1) AND (A + B > L1) AND (A < B) AND (C < B) AND (B <= L1)
// =>
// DRAM: AC; L1: B
//
#any_device = #tt.operand_constraint<dram|l1|scalar|tile|any_device|any_device_tile>
module attributes {} {
func.func @forward(%arg0: tensor<4096x5120xbf16>, %arg1: tensor<4096x5120xbf16>, %arg2: tensor<5120x5120xbf16>, %arg3: tensor<5120x5120xbf16>) -> tensor<4096x5120xbf16> {
// CHECK: #[[L1_:.*]] = #tt.memory_space<l1>
// CHECK: #[[LAYOUT_3:layout3]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <{{.*}}>, memref<{{.*}}, #dram>, interleaved>
// CHECK: #[[LAYOUT_5:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <{{.*}}>, memref<{{.*}}, #l1_>, interleaved>
%0 = tensor.empty() : tensor<4096x5120xbf16>
// CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<4096x5120xbf16, #[[LAYOUT_3]]>
%1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<4096x5120xbf16>, tensor<4096x5120xbf16>, tensor<4096x5120xbf16>) -> tensor<4096x5120xbf16>
%2 = tensor.empty() : tensor<5120x5120xbf16>
// CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<5120x5120xbf16, #[[LAYOUT_5]]>
%3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<5120x5120xbf16>, tensor<5120x5120xbf16>, tensor<5120x5120xbf16>) -> tensor<5120x5120xbf16>
%4 = tensor.empty() : tensor<4096x5120xbf16>
// CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<4096x5120xbf16, #[[LAYOUT_3]]>
%5 = "ttir.matmul"(%1, %3, %4) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<4096x5120xbf16>, tensor<5120x5120xbf16>, tensor<4096x5120xbf16>) -> tensor<4096x5120xbf16>
return %5 : tensor<4096x5120xbf16>
}
}
@@ -0,0 +1,32 @@
// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s > %t.mlir
// RUN: FileCheck %s --input-file=%t.mlir
// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
//
// A B
// \ /
// C
// |
// D
//
// (A + B + C > L1) AND (A + C < B + C) AND (A + B < B + C) AND (B + C <= L1)
// =>
// DRAM: A; L1: BC
//
#any_device = #tt.operand_constraint<dram|l1|scalar|tile|any_device|any_device_tile>
module attributes {} {
func.func @forward(%arg0: tensor<2048x2048xbf16>, %arg1: tensor<2048x2048xbf16>, %arg2: tensor<2048x8192xbf16>, %arg3: tensor<2048x8192xbf16>) -> tensor<2048x8192xbf16> {
// CHECK: #[[L1_:.*]] = #tt.memory_space<l1>
// CHECK: #[[LAYOUT_3:layout3]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <{{.*}}>, memref<{{.*}}, #dram>, interleaved>
// CHECK: #[[LAYOUT_5:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <{{.*}}>, memref<{{.*}}, #l1_>, interleaved>
%0 = tensor.empty() : tensor<2048x2048xbf16>
// CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<2048x2048xbf16, #[[LAYOUT_3]]>
%1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x2048xbf16>, tensor<2048x2048xbf16>, tensor<2048x2048xbf16>) -> tensor<2048x2048xbf16>
%2 = tensor.empty() : tensor<2048x8192xbf16>
// CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<2048x8192xbf16, #[[LAYOUT_5]]>
%3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x8192xbf16>, tensor<2048x8192xbf16>, tensor<2048x8192xbf16>) -> tensor<2048x8192xbf16>
%4 = tensor.empty() : tensor<2048x8192xbf16>
// CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<2048x8192xbf16, #[[LAYOUT_5]]>
%5 = "ttir.matmul"(%1, %3, %4) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x2048xbf16>, tensor<2048x8192xbf16>, tensor<2048x8192xbf16>) -> tensor<2048x8192xbf16>
return %5 : tensor<2048x8192xbf16>
}
}
@@ -0,0 +1,33 @@
// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s > %t.mlir
// RUN: FileCheck %s --input-file=%t.mlir
// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
//
// A B
// \ /
// C
// |
// D
//
// (A + B + C > L1) AND (A + C < A + B) AND (B + C < A + B) AND (A + B <= L1)
// =>
// DRAM: C; L1: AB
//
#any_device = #tt.operand_constraint<dram|l1|scalar|tile|any_device|any_device_tile>
module attributes {} {
func.func @forward(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x8192xbf16>, %arg2: tensor<8192x2048xbf16>, %arg3: tensor<8192x2048xbf16>) -> tensor<2048x2048xbf16> {
// CHECK: #[[L1_:.*]] = #tt.memory_space<l1>
// CHECK: #[[LAYOUT_4:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <{{.*}}>, memref<{{.*}}, #l1_>, interleaved>
// CHECK: #[[LAYOUT_6:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <{{.*}}>, memref<{{.*}}, #l1_>, interleaved>
// CHECK: #[[LAYOUT_7:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <{{.*}}>, memref<{{.*}}, #dram>, interleaved>
%0 = tensor.empty() : tensor<2048x8192xbf16>
// CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<2048x8192xbf16, #[[LAYOUT_4]]>
%1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x8192xbf16>, tensor<2048x8192xbf16>, tensor<2048x8192xbf16>) -> tensor<2048x8192xbf16>
%2 = tensor.empty() : tensor<8192x2048xbf16>
// CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x2048xbf16, #[[LAYOUT_6]]>
%3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x2048xbf16>, tensor<8192x2048xbf16>, tensor<8192x2048xbf16>) -> tensor<8192x2048xbf16>
%4 = tensor.empty() : tensor<2048x2048xbf16>
// CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<2048x2048xbf16, #[[LAYOUT_7]]>
%5 = "ttir.matmul"(%1, %3, %4) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x8192xbf16>, tensor<8192x2048xbf16>, tensor<2048x2048xbf16>) -> tensor<2048x2048xbf16>
return %5 : tensor<2048x2048xbf16>
}
}
@@ -0,0 +1,31 @@
// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s > %t.mlir
// RUN: FileCheck %s --input-file=%t.mlir
// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
//
// A B
// \ /
// C
// |
// D
//
// (A + B + C <= L1)
// =>
// DRAM: None; L1: ABC
//
#any_device = #tt.operand_constraint<dram|l1|scalar|tile|any_device|any_device_tile>
module attributes {} {
func.func @forward(%arg0: tensor<32x32xbf16>, %arg1: tensor<32x32xbf16>, %arg2: tensor<32x32xbf16>, %arg3: tensor<32x32xbf16>) -> tensor<32x32xbf16> {
// CHECK: #[[L1_:.*]] = #tt.memory_space<l1>
// CHECK: #[[LAYOUT_2:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <{{.*}}>, memref<{{.*}}, #l1_>, interleaved>
%0 = tensor.empty() : tensor<32x32xbf16>
// CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<32x32xbf16, #[[LAYOUT_2]]>
%1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32x32xbf16>, tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16>
%2 = tensor.empty() : tensor<32x32xbf16>
// CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<32x32xbf16, #[[LAYOUT_2]]>
%3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32x32xbf16>, tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16>
%4 = tensor.empty() : tensor<32x32xbf16>
// CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<32x32xbf16, #[[LAYOUT_2]]>
%5 = "ttir.add"(%1, %3, %4) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32x32xbf16>, tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16>
return %5 : tensor<32x32xbf16>
}
}
23 changes: 0 additions & 23 deletions test/ttmlir/Silicon/TTNN/l1_interleaved_policy/simple_join.mlir

This file was deleted.

8 changes: 4 additions & 4 deletions test/ttmlir/Silicon/TTNN/l1_interleaved_policy/single_op.mlir
@@ -4,9 +4,9 @@
// UNSUPPORTED: true
#any_device_tile = #tt.operand_constraint<dram|l1|tile|any_device_tile>
module attributes {} {
func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x96xbf16>) -> tensor<64x96xbf16> {
%0 = tensor.empty() : tensor<64x96xbf16>
%1 = "ttir.matmul"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<128x96xbf16>, tensor<64x96xbf16>) -> tensor<64x96xbf16>
return %1 : tensor<64x96xbf16>
func.func @forward(%arg0: tensor<5120x5120xbf16>) -> tensor<5120x5120xbf16> {
%0 = tensor.empty() : tensor<5120x5120xbf16>
%1 = "ttir.relu"(%arg0, %0) <{operandSegmentSizes = array<i32: 1, 1>, operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<5120x5120xbf16>, tensor<5120x5120xbf16>) -> tensor<5120x5120xbf16>
return %1 : tensor<5120x5120xbf16>
}
}
