Lower tt.generic_reduce to LLVM IR

triton-lang · Mar 9, 2023 · 710a4bd
1 parent 39a7957
commit 710a4bd
Show file tree

Hide file tree

Showing 9 changed files with 422 additions and 12 deletions.
diff --git a/include/triton/Analysis/Utility.h b/include/triton/Analysis/Utility.h
@@ -11,9 +11,24 @@
 namespace mlir {
 
 class ReduceOpHelper {
+  ReduceOpHelper(Operation *op, int axis, bool withIndex)
+    : op(op), axis(axis), withIndex(withIndex) {
+    srcTy = op->getOperands().front().getType().cast<RankedTensorType>();
+  }
+
 public:
-  explicit ReduceOpHelper(triton::ReduceOp op) : op(op) {
-    srcTy = op.getOperand().getType().cast<RankedTensorType>();
+  explicit ReduceOpHelper(triton::ReduceOp op):
+    ReduceOpHelper(
+        op.getOperation(),
+        op.getAxis(),
+        triton::ReduceOp::withIndex(op.getRedOp())) {
+  }
+
+  explicit ReduceOpHelper(triton::GenericReduceOp op):
+    ReduceOpHelper(
+        op.getOperation(),
+        op.getAxis(),
+        /*withIndex*/false) {
   }
 
   ArrayRef<int64_t> getSrcShape() { return srcTy.getShape(); }
@@ -35,8 +50,10 @@ class ReduceOpHelper {
   unsigned getScratchSizeInBytes();
 
 private:
-  triton::ReduceOp op;
+  Operation *op;
   RankedTensorType srcTy{};
+  int axis;
+  bool withIndex;
 };
 
 bool isSharedEncoding(Value value);

diff --git a/lib/Analysis/Allocation.cpp b/lib/Analysis/Allocation.cpp
@@ -166,6 +166,10 @@ class AllocationAnalysis {
       ReduceOpHelper helper(reduceOp);
       unsigned bytes = helper.getScratchSizeInBytes();
       allocation->addBuffer<BufferT::BufferKind::Scratch>(op, bytes);
+    }  else if (auto reduceOp = dyn_cast<triton::GenericReduceOp>(op)) {
+      ReduceOpHelper helper(reduceOp);
+      unsigned bytes = helper.getScratchSizeInBytes();
+      allocation->addBuffer<BufferT::BufferKind::Scratch>(op, bytes);
     } else if (auto cvtLayout = dyn_cast<triton::gpu::ConvertLayoutOp>(op)) {
       auto srcTy = cvtLayout.getSrc().getType().cast<RankedTensorType>();
       auto dstTy = cvtLayout.getResult().getType().cast<RankedTensorType>();

diff --git a/lib/Analysis/Membar.cpp b/lib/Analysis/Membar.cpp
@@ -72,6 +72,9 @@ void MembarAnalysis::visitTerminator(Operation *op,
     }
     return;
   }
+  if (isa<triton::GenericReduceReturnOp>(op)) {
+    return;
+  }
   // Otherwise, it could be a return op
   assert(isa<func::ReturnOp>(op) && "Unknown terminator");
 }

diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp
@@ -11,14 +11,12 @@ namespace mlir {
 
 bool ReduceOpHelper::isFastReduction() {
   auto srcLayout = srcTy.getEncoding();
-  auto axis = op.getAxis();
   return axis == triton::gpu::getOrder(srcLayout)[0];
 }
 
 unsigned ReduceOpHelper::getInterWarpSize() {
   auto srcLayout = srcTy.getEncoding();
   auto srcShape = srcTy.getShape();
-  auto axis = op.getAxis();
   auto srcReduceDimSize = static_cast<unsigned>(srcShape[axis]);
   unsigned sizeIntraWarps = getIntraWarpSize();
   return std::min(srcReduceDimSize / sizeIntraWarps,
@@ -28,28 +26,24 @@ unsigned ReduceOpHelper::getInterWarpSize() {
 unsigned ReduceOpHelper::getIntraWarpSize() {
   auto srcLayout = srcTy.getEncoding();
   auto srcShape = srcTy.getShape();
-  auto axis = op.getAxis();
   auto srcReduceDimSize = static_cast<unsigned>(srcShape[axis]);
   return std::min(srcReduceDimSize,
                   triton::gpu::getThreadsPerWarp(srcLayout)[axis]);
 }
 
 unsigned ReduceOpHelper::getThreadsReductionAxis() {
   auto srcLayout = srcTy.getEncoding();
-  auto axis = op.getAxis();
   return triton::gpu::getThreadsPerWarp(srcLayout)[axis] *
          triton::gpu::getWarpsPerCTA(srcLayout)[axis];
 }
 
 SmallVector<unsigned> ReduceOpHelper::getScratchConfigBasic() {
-  auto axis = op.getAxis();
   auto smemShape = convertType<unsigned>(getSrcShape());
   smemShape[axis] = std::min(smemShape[axis], getThreadsReductionAxis());
   return smemShape;
 }
 
 SmallVector<SmallVector<unsigned>> ReduceOpHelper::getScratchConfigsFast() {
-  auto axis = op.getAxis();
   SmallVector<SmallVector<unsigned>> smemShapes(3);
 
   auto argLayout = srcTy.getEncoding();
@@ -64,7 +58,7 @@ SmallVector<SmallVector<unsigned>> ReduceOpHelper::getScratchConfigsFast() {
 
   /// FIXME(Qingyi): This size is actually larger than required.
   /// shared memory block1:
-  auto mod = op.getOperation()->getParentOfType<ModuleOp>();
+  auto mod = op->getParentOfType<ModuleOp>();
   unsigned numWarps = triton::gpu::TritonGPUDialect::getNumWarps(mod);
   smemShapes[1].push_back(numWarps * 32);
 
@@ -82,10 +76,10 @@ unsigned ReduceOpHelper::getScratchSizeInBytes() {
     elems = product<unsigned>(smemShape);
   }
 
-  auto tensorType = op.getOperand().getType().cast<RankedTensorType>();
+  auto tensorType = op->getOperand(0).getType().cast<RankedTensorType>();
   unsigned bytes = elems * tensorType.getElementTypeBitWidth() / 8;
 
-  if (triton::ReduceOp::withIndex(op.getRedOp()))
+  if (withIndex)
     bytes += elems * sizeof(int32_t);
 
   return bytes;

diff --git a/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt b/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt
@@ -8,6 +8,7 @@ add_mlir_conversion_library(TritonGPUToLLVM
     TritonGPUToLLVMPass.cpp
     PTXAsmFormat.cpp
     ReduceOpToLLVM.cpp
+    GenericReduceOpToLLVM.cpp
     Utility.cpp
     TypeConverter.cpp
     ViewOpToLLVM.cpp
diff --git a/lib/Analysis/Membar.cpp b/lib/Analysis/Membar.cpp
@@ -72,6 +72,9 @@ void MembarAnalysis::visitTerminator(Operation *op,
     }
     return;
   }
+  if (isa<triton::GenericReduceReturnOp>(op)) {
+    return;
+  }
   // Otherwise, it could be a return op
   assert(isa<func::ReturnOp>(op) && "Unknown terminator");
 }