From 7ab66ff2252b85034aa6b2b7e2eef032a97cdad3 Mon Sep 17 00:00:00 2001
From: Quinn Dawkins
Date: Thu, 8 Aug 2024 10:41:18 -0400
Subject: [PATCH] [Codegen][GPU] Move conversion to multi_mma to PackToIntrinsics (#18141)

Now that `iree_gpu.multi_mma` has a tiling interface implementation, the
conversion from linalg to it can happen before the other levels of tiling.
This allows the inner dimensions to be reshaped freely before reduction
tiling, and the reshapes can then be propagated to nearby ops without
needing to hoist them out of tiling constructs. Additionally, this is
closer to the flow required for data tiling, where the
`iree_gpu.multi_mma` op needs to be generated before any tiling.
---
 .../GPU/Transforms/DistributeMmaToLanes.cpp   | 32 +----------------
 .../GPU/Transforms/PackToIntrinsics.cpp       | 35 ++++++++++++++++++-
 .../Codegen/Dialect/GPU/Transforms/Passes.td  |  5 ++-
 .../Dialect/GPU/Transforms/Transforms.cpp     |  7 ++++
 .../test/distribute_mma_to_lanes.mlir         | 27 ++++++--------
 .../Transforms/test/pack_to_intrinsics.mlir   | 27 +++++++-------
 6 files changed, 70 insertions(+), 63 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/DistributeMmaToLanes.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/DistributeMmaToLanes.cpp
index ef6d5cc745..577e005a7a 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/DistributeMmaToLanes.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/DistributeMmaToLanes.cpp
@@ -35,26 +35,6 @@ struct DistributeMmaToLanesPass final
 };
 } // namespace
 
-struct ConvertToMultiMma final : OpInterfaceRewritePattern<linalg::LinalgOp> {
-  using OpInterfaceRewritePattern::OpInterfaceRewritePattern;
-  LogicalResult matchAndRewrite(linalg::LinalgOp linalgOp,
-                                PatternRewriter &rewriter) const override {
-    auto loweringConfig =
-        getLoweringConfig<IREE::GPU::LoweringConfigAttr>(linalgOp);
-    if (!loweringConfig) {
-      return failure();
-    }
-    IREE::GPU::MmaInterfaceAttr kind = loweringConfig.getMmaKind();
-    if (!kind) {
-      return failure();
-    }
-    if (failed(convertContractionToMultiMma(rewriter, linalgOp, kind))) {
-      return failure();
-    }
-    return success();
-  }
-};
-
 LogicalResult fuseProducersGreedily(RewriterBase &rewriter,
                                     scf::ForallOp laneForall) {
 
@@ -100,17 +80,7 @@ void DistributeMmaToLanesPass::runOnOperation() {
   MLIRContext *context = &getContext();
   auto funcOp = getOperation();
 
-  // Step 1. Convert configured linalg ops to multi_mma.
-  {
-    RewritePatternSet patterns(context);
-    patterns.add<ConvertToMultiMma>(context);
-    if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
-      funcOp.emitError() << "failed to convert linalg to multi_mma";
-      return signalPassFailure();
-    }
-  }
-
-  // Step 2. Distribute multi_mma ops to lanes and greedily fuse producers.
+  // Distribute multi_mma ops to lanes and greedily fuse producers.
   SmallVector<IREE::GPU::MultiMmaOp> mmaOps;
   funcOp.walk([&](IREE::GPU::MultiMmaOp mmaOp) { mmaOps.push_back(mmaOp); });
   IRRewriter rewriter(funcOp);
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/PackToIntrinsics.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/PackToIntrinsics.cpp
index df0ff1c73e..f79365c7be 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/PackToIntrinsics.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/PackToIntrinsics.cpp
@@ -70,9 +70,31 @@ LogicalResult packToIntrinsic(linalg::LinalgOp linalgOp,
   return success();
 }
 
+struct ConvertToMultiMma final : OpInterfaceRewritePattern<linalg::LinalgOp> {
+  using OpInterfaceRewritePattern::OpInterfaceRewritePattern;
+  LogicalResult matchAndRewrite(linalg::LinalgOp linalgOp,
+                                PatternRewriter &rewriter) const override {
+    auto loweringConfig =
+        getLoweringConfig<IREE::GPU::LoweringConfigAttr>(linalgOp);
+    if (!loweringConfig) {
+      return failure();
+    }
+    IREE::GPU::MmaInterfaceAttr kind = loweringConfig.getMmaKind();
+    if (!kind) {
+      return failure();
+    }
+    if (failed(convertContractionToMultiMma(rewriter, linalgOp, kind))) {
+      return failure();
+    }
+    return success();
+  }
+};
+
 void PackToIntrinsicsPass::runOnOperation() {
   MLIRContext *context = &getContext();
   auto funcOp = getOperation();
+
+  // Step 1. Pack candidate linalg ops to specified shapes.
   IRRewriter rewriter(funcOp);
   SmallVector<linalg::LinalgOp> packingCandidates;
   funcOp->walk([&](linalg::LinalgOp linalgOp) {
@@ -95,7 +117,18 @@ void PackToIntrinsicsPass::runOnOperation() {
     }
   }
 
-  // Run layout propagation patterns to pull in adjacent un-configured ops.
+  // Step 2. Convert configured linalg ops to multi_mma.
+  {
+    RewritePatternSet patterns(context);
+    patterns.add<ConvertToMultiMma>(context);
+    if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
+      funcOp.emitError() << "failed to convert linalg to multi_mma";
+      return signalPassFailure();
+    }
+  }
+
+  // Step 3. Run layout propagation patterns to pull in adjacent un-configured
+  // ops.
   RewritePatternSet patterns(context);
   linalg::ControlPropagationFn control = [](OpOperand *opOperand) -> bool {
     Operation *producer = opOperand->get().getDefiningOp();
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td
index 6cc7f11e6f..a882b835e4 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td
@@ -11,12 +11,11 @@ include "mlir/Pass/PassBase.td"
 
 def DistributeMmaToLanesPass :
     InterfacePass<"iree-gpu-distribute-mma-to-lanes", "mlir::FunctionOpInterface"> {
-  let summary = "Converts and distributes linalg ops with mma kinds to lanes";
+  let summary = "Distributes iree_gpu.multi_mma ops to lanes";
   let dependentDialects = [
     "::mlir::arith::ArithDialect",
    "::mlir::affine::AffineDialect",
     "::mlir::scf::SCFDialect",
-    "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect",
   ];
 }
 
@@ -58,7 +57,7 @@ def LowerIREEGPUOpsPass :
 
 def PackToIntrinsicsPass :
     InterfacePass<"iree-gpu-pack-to-intrinsics", "mlir::FunctionOpInterface"> {
-  let summary = "Packs matmul like operations to specified intrinsic shapes";
+  let summary = "Packs matmul like operations and converts to iree_gpu.multi_mma";
   let dependentDialects = [
     "::mlir::tensor::TensorDialect",
     "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect"
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp
index 389ffc3a76..45d8ad188c 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp
@@ -6,6 +6,7 @@
 
 #include "iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h"
 
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -442,10 +443,16 @@ convertContractionToMultiMma(RewriterBase &rewriter, linalg::LinalgOp linalgOp,
     accPerm = accInnerPerm;
   }
 
+  IREE::Codegen::LoweringConfigAttrInterface maybeLoweringConfig =
+      getLoweringConfig(linalgOp);
+
   auto newMmaOp = rewriter.replaceOpWithNewOp<IREE::GPU::MultiMmaOp>(
       linalgOp, inputs[0], inputs[1], inputs[2],
       ArrayRef<AffineMap>{outerLhsMap, outerRhsMap, outerAccMap}, iteratorTypes,
       mmaKind, lhsPerm, rhsPerm, accPerm);
+  if (maybeLoweringConfig) {
+    setLoweringConfig(newMmaOp, maybeLoweringConfig);
+  }
   return newMmaOp;
 }
 
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir
index b98ff4223a..214b432f65 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir
@@ -1,25 +1,20 @@
 // RUN: iree-opt %s --pass-pipeline='builtin.module(func.func(iree-gpu-distribute-mma-to-lanes, canonicalize, cse))' --split-input-file | FileCheck %s
 
-#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
-#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d4, d5)>
-#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
+#contraction_accesses = [
+ affine_map<(i, j, k) -> (i, k)>,
+ affine_map<(i, j, k) -> (k, j)>,
+ affine_map<(i, j, k) -> (i, j)>
+]
 module {
   func.func @matmul_16x16x16(%arg0: tensor<8x2x16x16xf16>, %arg1: tensor<8x2x16x16xf16>, %arg2: tensor<2x2x16x16xf32>) -> tensor<2x2x16x16xf32> {
     %empty = tensor.empty() : tensor<2x8x16x16xf16>
     %lhs_transpose = linalg.transpose ins(%arg0: tensor<8x2x16x16xf16>) outs(%empty: tensor<2x8x16x16xf16>) permutation = [1, 0, 2, 3]
-    %mm = linalg.generic {
-      indexing_maps = [#map, #map1, #map2],
-      iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]}
-      ins(%lhs_transpose, %arg1 : tensor<2x8x16x16xf16>, tensor<8x2x16x16xf16>)
-      outs(%arg2 : tensor<2x2x16x16xf32>)
-      attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>}>} {
-    ^bb0(%in: f16, %in_2: f16, %out: f32):
-      %4 = arith.extf %in : f16 to f32
-      %5 = arith.extf %in_2 : f16 to f32
-      %6 = arith.mulf %4, %5 : f32
-      %7 = arith.addf %out, %6 : f32
-      linalg.yield %7 : f32
-    } -> tensor<2x2x16x16xf32>
+    %mm = iree_gpu.multi_mma %lhs_transpose, %arg1, %arg2 {
+      indexing_maps = #contraction_accesses,
+      iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>],
+      kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
+      rhs_permutation = array<i64: 1, 0>
+    } : tensor<2x8x16x16xf16>, tensor<8x2x16x16xf16> into tensor<2x2x16x16xf32>
     return %mm : tensor<2x2x16x16xf32>
   }
 }
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/pack_to_intrinsics.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/pack_to_intrinsics.mlir
index 7da25abd2c..30153f306d 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/pack_to_intrinsics.mlir
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/pack_to_intrinsics.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt %s --pass-pipeline='builtin.module(func.func(iree-gpu-pack-to-intrinsics, canonicalize, cse))' --split-input-file | FileCheck %s
+// RUN: iree-opt %s --mlir-print-local-scope --pass-pipeline='builtin.module(func.func(iree-gpu-pack-to-intrinsics, canonicalize, cse))' --split-input-file | FileCheck %s
 
 #config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>}>
 module {
@@ -15,10 +15,15 @@ module {
 // CHECK-DAG:     %[[A_PACK:.+]] = tensor.pack %[[A]] inner_dims_pos = [0, 1] inner_tiles = [32, 8]
 // CHECK-DAG:     %[[B_PACK:.+]] = tensor.pack %[[B]] inner_dims_pos = [1, 0] inner_tiles = [32, 8]
 // CHECK-DAG:     %[[C_PACK:.+]] = tensor.pack %[[C]] inner_dims_pos = [0, 1] inner_tiles = [32, 32]
-// CHECK:         %[[PACKED_MM:.+]] = linalg.generic
-// CHECK-SAME:      ins(%[[A_PACK]], %[[B_PACK]] : tensor<2x8x32x8xf16>, tensor<8x2x32x8xf16>)
-// CHECK-SAME:      outs(%[[C_PACK]] : tensor<2x2x32x32xf32>)
+// CHECK:         iree_gpu.multi_mma %[[A_PACK]], %[[B_PACK]], %[[C_PACK]]
+// CHECK-SAME:      indexing_maps =
+// CHECK-SAME:        affine_map<(d0, d1, d2) -> (d0, d2)>
+// CHECK-SAME:        affine_map<(d0, d1, d2) -> (d2, d1)>
+// CHECK-SAME:        affine_map<(d0, d1, d2) -> (d0, d1)>
+// CHECK-SAME:      iterator_types = {{.*}}parallel{{.*}}parallel{{.*}}reduction
+// CHECK-SAME:      kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>
 // CHECK-SAME:      lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>}>
+// CHECK-SAME:      rhs_permutation = array<i64: 1, 0>
 
 // -----
 
@@ -45,13 +50,11 @@ module {
   }
 }
 
-// CHECK: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d3, d4, d5, d7)>
-// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d2, d0, d3, d4, d6, d7)>
-// CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6)>
-
 // CHECK-LABEL: func.func @matmul_16x16x16
-// CHECK:         %[[PACKED_MM:.+]] = linalg.generic
-// CHECK-SAME:      indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
-// CHECK-SAME:      ins({{.*}} : tensor, tensor)
-// CHECK-SAME:      outs({{.*}} : tensor)
+// CHECK:         iree_gpu.multi_mma
+// CHECK-SAME:      indexing_maps =
+// CHECK-SAME:        affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4)>
+// CHECK-SAME:        affine_map<(d0, d1, d2, d3, d4) -> (d2, d0, d3, d4)>
+// CHECK-SAME:        affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>
 // CHECK-SAME:      lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>}>
+// CHECK-SAME:      : tensor, tensor into tensor
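
For illustration (not part of the diff): the sketch below shows the form of IR
that PackToIntrinsicsPass now hands off to reduction tiling and
DistributeMmaToLanesPass. Shapes, permutation, and the MFMA_F32_16x16x16_F16
kind are taken from the updated distribute_mma_to_lanes.mlir test above; the
function name is made up for the example.

// Illustrative sketch only; shapes and kind mirror the updated test above.
func.func @multi_mma_example(%lhs: tensor<2x8x16x16xf16>,
                             %rhs: tensor<8x2x16x16xf16>,
                             %acc: tensor<2x2x16x16xf32>) -> tensor<2x2x16x16xf32> {
  // The outer dims (2x8, 8x2, 2x2) are still untiled here; the 16x16 inner
  // dims correspond to the intrinsic and are carried inside the op.
  %0 = iree_gpu.multi_mma %lhs, %rhs, %acc {
    indexing_maps = [
     affine_map<(i, j, k) -> (i, k)>,
     affine_map<(i, j, k) -> (k, j)>,
     affine_map<(i, j, k) -> (i, j)>
    ],
    iterator_types = [#iree_gpu.iterator_type<parallel>,
                      #iree_gpu.iterator_type<parallel>,
                      #iree_gpu.iterator_type<reduction>],
    kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
    rhs_permutation = array<i64: 1, 0>
  } : tensor<2x8x16x16xf16>, tensor<8x2x16x16xf16> into tensor<2x2x16x16xf32>
  return %0 : tensor<2x2x16x16xf32>
}

Because the op implements the tiling interface, the later reduction tiling and
lane distribution apply to it directly, which is what allows the conversion to
move ahead of those steps.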