ROCm warp shuffles and reductions #5727

Merged · 1 commit · Jun 5, 2020
52 changes: 52 additions & 0 deletions src/target/llvm/intrin_rule_rocm.cc
@@ -24,6 +24,7 @@

#include <tvm/runtime/registry.h>
#include <tvm/tir/expr.h>
+#include <tvm/tir/op.h>

#include <sstream>

@@ -40,8 +41,59 @@ inline void DispatchExternOCML(const TVMArgs& args, TVMRetValue* rv) {
*rv = CallNode::make(call->dtype, intrinsic_name.str(), call->args, CallNode::PureExtern);
}

+inline void DispatchShuffle(const TVMArgs& targs, TVMRetValue* rv) {
+  PrimExpr e_call = targs[0];
+  using namespace tir;
+  const CallNode* call = e_call.as<CallNode>();
+  CHECK(call != nullptr);
+  CHECK_EQ(call->args.size(), 5);  // mask, value, warp_id, width, warp_size
+  PrimExpr var = call->args[1];
+  CHECK_EQ(var.dtype().bits(), 32);
+
+  // get own lane in self (__lane_id)
+  PrimExpr minus_one = tir::make_const(DataType::Int(32), -1);
+  PrimExpr zero = tir::make_zero(DataType::Int(32));
+  PrimExpr lo = CallNode::make(DataType::Int(32), "llvm.amdgcn.mbcnt.lo", {minus_one, zero},
+                               CallNode::PureExtern);
+  PrimExpr self = CallNode::make(DataType::Int(32), "llvm.amdgcn.mbcnt.hi", {minus_one, lo},
+                                 CallNode::PureExtern);
+
+  // compute lane to get from
+  PrimExpr width = call->args[3];
+  PrimExpr index;
+  if (call->name == "tvm_warp_shuffle") {
+    PrimExpr src_lane = call->args[2];
+    index = src_lane + (self & ~(width - 1));
+  } else if (call->name == "tvm_warp_shuffle_up") {
+    PrimExpr delta = call->args[2];
+    index = self - delta;
+    index = SelectNode::make(index < (self & ~(width - 1)), self, index);
+  } else {
+    CHECK_EQ(call->name, "tvm_warp_shuffle_down");
+    PrimExpr delta = call->args[2];
+    index = self + delta;
+    index = SelectNode::make((self & (width - 1)) + delta >= width, self, index);
+  }
+  PrimExpr res = CallNode::make(var.dtype(), "llvm.amdgcn.ds.bpermute", {index << 2, var},
+                                CallNode::PureExtern);
+  *rv = res;
+}
+
namespace llvm {

+// dummy because we don't have the activemask
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.rocm.tvm_warp_activemask")
+    .set_body([](const TVMArgs& targs, TVMRetValue* rv) {
+      PrimExpr zero = tir::make_zero(DataType::Int(32));
+      *rv = zero;
+    });
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.rocm.tvm_warp_shuffle").set_body(DispatchShuffle);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.rocm.tvm_warp_shuffle_up").set_body(DispatchShuffle);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.rocm.tvm_warp_shuffle_down").set_body(DispatchShuffle);

TVM_REGISTER_GLOBAL("tvm.intrin.rule.rocm.floor").set_body(DispatchExternOCML);

TVM_REGISTER_GLOBAL("tvm.intrin.rule.rocm.ceil").set_body(DispatchExternOCML);
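A note on the lowering above: the `llvm.amdgcn.mbcnt.lo`/`llvm.amdgcn.mbcnt.hi` pair with an all-ones mask counts the active lanes below the caller, yielding its lane id, and `llvm.amdgcn.ds.bpermute` fetches a value by *byte* address, which is why the lane index is shifted left by 2. The index arithmetic can be modeled in plain Python; the sketch below is illustrative only (the function name `shuffle_src_lane` is made up, not part of the patch):

```python
def shuffle_src_lane(kind, self_lane, arg, width):
    """Model of the source-lane computation in DispatchShuffle.

    self_lane: the caller's lane id (what llvm.amdgcn.mbcnt.{lo,hi} yields).
    arg: source lane for a plain shuffle, delta for shuffle_up/down.
    width: shuffle width; lanes are grouped into segments of this size.
    """
    segment_start = self_lane & ~(width - 1)
    if kind == "shuffle":
        return segment_start + arg
    if kind == "shuffle_up":
        index = self_lane - arg
        # reading before the segment start returns the caller's own value
        return self_lane if index < segment_start else index
    assert kind == "shuffle_down"
    index = self_lane + arg
    # reading past the end of the segment returns the caller's own value
    return self_lane if (self_lane & (width - 1)) + arg >= width else index

# ds_bpermute addresses lanes in bytes, hence the (index << 2) in the backend.
```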
3 changes: 2 additions & 1 deletion src/target/target.cc
@@ -98,8 +98,9 @@ Target CreateTarget(const std::string& target_name, const std::vector<std::string>& options) {
// For now assume rocm schedule for opencl
if (target_name == "opencl") {
t->device_type = kDLOpenCL;
-  } else {
+  } else {  // rocm
t->device_type = kDLROCM;
+    t->thread_warp_size = 64;
}
t->keys_array.push_back(target_name);
t->keys_array.push_back("gpu");
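With this change a `rocm` target carries the 64-lane wavefront size, which the allreduce lowering below picks up instead of assuming 32. A quick sanity check from Python — assuming this TVM version's `tvm.target.create` API and attribute access on the target node:

```python
import tvm

# hypothetical check: rocm reports the 64-lane wavefront,
# while cuda keeps the default warp size of 32
assert tvm.target.create("rocm").thread_warp_size == 64
assert tvm.target.create("cuda").thread_warp_size == 32
```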
17 changes: 14 additions & 3 deletions src/tir/transforms/lower_thread_allreduce.cc
@@ -196,7 +196,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
//
// Allocate reduction vars v[i], i = 0..size-1
//
-  // for offset from 16 to 1 by 2
+  // for offset from WARP_SIZE to 1 by 2
//
// a <- load(v[i])
// b <- shuffle_down(load(v[i], offset))
@@ -244,7 +244,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
}

// Emit reductions within a warp.
-    for (int offset = 16; offset > 0; offset /= 2) {
+    for (int offset = warp_size_ / 2; offset > 0; offset /= 2) {
// Load reduction values, no synchronization needed.
Array<PrimExpr> a, b;
for (size_t i = 0; i < size; ++i) {
@@ -478,9 +478,20 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
// the warp size.
//
// TODO(tvm-team) reduction with a sub-warp of 8 or 16 threads.
+  // Note: The ROCm backend will only have warp reductions for now.
+  // Also, the warp/wavefront size differs (64 on rocm, 32 on cuda).
bool is_warp_reduction(const std::vector<DataType>& types) const {
-    // Only cuda target supports warp reductions.
-    if (target_->target_name != "cuda") return false;
+    if ((target_->target_name != "cuda") && (target_->target_name != "rocm")) return false;

+    // rocm only supports 32 bit operands for shuffling at the moment
+    if ((target_->target_name == "rocm") &&
+        (std::any_of(types.begin(), types.end(), [](DataType ty) {
+          if (ty.is_vector()) return true;
+          return ty.bits() != 32;
+        }))) {
+      return false;
+    }

// Supported types:
// {u}int, {u}long, {u}long long, float, double, half/half2
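The loop now starts at half the warp size instead of the hard-coded 16, so the same shuffle-down tree covers both 32-lane cuda warps and 64-lane rocm wavefronts. A minimal Python model of the tree for a sum reduction — illustrative only; a real `shuffle_down` past the warp boundary returns the caller's own value rather than wrapping, but those lanes never feed lane 0's result:

```python
def warp_reduce_sum(vals):
    """Model of the shuffle-down reduction tree emitted by the pass.

    vals: one entry per lane; len(vals) is the warp size (64 on rocm,
    32 on cuda). After the loop, lane 0 holds the reduction of all lanes.
    """
    warp_size = len(vals)
    offset = warp_size // 2
    while offset > 0:
        # each lane reads the value `offset` lanes above itself
        # (modular wrap stands in for the hardware's clamping; the
        # wrapped lanes do not contribute to lane 0)
        shifted = [vals[(lane + offset) % warp_size] for lane in range(warp_size)]
        # ... and combines it with its own value
        vals = [a + b for a, b in zip(vals, shifted)]
        offset //= 2
    return vals[0]

assert warp_reduce_sum(list(range(64))) == sum(range(64))
```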
12 changes: 9 additions & 3 deletions tests/python/integration/test_reduce.py
@@ -65,6 +65,7 @@ def check_device(device, host="llvm"):
check_device("vulkan")
check_device("cuda")
check_device("opencl")
check_device("rocm")
test_prim(te.sum, np.sum)
test_prim(tvm.te.min, np.amin)
test_prim(tvm.te.max, np.amax)
@@ -179,7 +180,7 @@ def check_target(device, host="stackvm"):
check_target("cuda")
check_target("metal")
check_target("opencl")

check_target("rocm")

def test_rfactor_elemwise_threads():
n = 1025
@@ -230,6 +231,7 @@ def check_target(device, host="stackvm"):
check_target("cuda")
check_target("metal")
check_target("opencl")
check_target("rocm")

def test_argmax():
def fcombine(x, y):
@@ -337,6 +339,7 @@ def check_target(device):

check_target("cuda")
check_target("vulkan")
check_target("rocm")

def test_warp_reduction1():
nthx = 32
@@ -365,10 +368,10 @@ def check_target(device, m, n):
s[B].bind(xi, thread_y)
s[B].bind(xo, block_x)

-        print(tvm.lower(s, [A, B], simple_mode=True))
+        tvm.lower(s, [A, B], simple_mode=True)

# validation
func = tvm.build(s, [A, B], "cuda", name="warp_reduction")
func = tvm.build(s, [A, B], device, name="warp_reduction")
a_np = np.random.uniform(size=(m,n)).astype(A.dtype)
b_np = np.zeros((m,), dtype=A.dtype)
a = tvm.nd.array(a_np, ctx)
@@ -379,6 +382,8 @@ def check_target(device, m, n):

check_target("cuda", m=32, n=256)
check_target("cuda", m=10, n=20)
check_target("rocm", m=32, n=256)
check_target("rocm", m=10, n=20)
# This is a bug in normal reduction.
# check_target("cuda", m=10, n=37)

@@ -437,6 +442,7 @@ def check_target(device):
tvm.testing.assert_allclose(t1.asnumpy(), t1_np, rtol=1e-3, atol=1e-3)

check_target("cuda")
check_target("rocm")

if __name__ == "__main__":
test_rfactor_elemwise_threads()