From f0075f825ca6ddbf2279c8f6c9f7a9d9760438cc Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Tue, 27 Apr 2021 16:25:44 -0700
Subject: [PATCH 1/4] [SPARSE] Improve sparse performance on ROCM

The current sparse dense gpu kernel uses warp level storage to handling
caching of data. Warp level storage uses shuffle intrinsics, which are
slow on rocm (because they actually read and write to shared memory).
Rocm does provide intrinsics to do the correct memory management, but
they are not available through tvm. Instead this PR switches to using
shared memory on rocm devices. Performance is about 2x faster.
---
 python/tvm/relay/op/strategy/cuda.py         |  2 +-
 python/tvm/topi/cuda/sparse.py               | 48 ++++++++++++++++----
 tests/python/topi/python/test_topi_sparse.py | 15 +++---
 3 files changed, 47 insertions(+), 18 deletions(-)

diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index e5aa8aa10620..a6775ae7bd20 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -787,7 +787,7 @@ def sparse_reshape_strategy_cuda(attrs, inputs, out_type, target):
     return strategy
 
 
-@sparse_dense_padded_strategy.register(["cuda", "gpu"])
+@sparse_dense_padded_strategy.register(["cuda", "gpu", "rocm"])
 def sparse_dense_padded_strategy_cuda(attrs, inputs, out_type, target):
     """sparse dense cuda strategy"""
     strategy = _op.OpStrategy()
diff --git a/python/tvm/topi/cuda/sparse.py b/python/tvm/topi/cuda/sparse.py
index f68b31ec30ef..70e74d876a3a 100644
--- a/python/tvm/topi/cuda/sparse.py
+++ b/python/tvm/topi/cuda/sparse.py
@@ -170,6 +170,16 @@ def gen_ir(data, w_data, w_indices, w_indptr, out):
         # TODO(tkonolige): seperate implementation for large block sizes
         ib = tvm.tir.ir_builder.create()
 
+        if tvm.target.Target.current(allow_none=False).kind.name == "rocm":
+            # TVMs warp shuffle intrinsics are slow on ROCM because they use
+            # LDS (shared memory) to do the shuffling. Instead, we could use
+            # ROCM's support for accessing neighboring threads memory, but we
+            # those intrinsics aren't accessible from TVM. For now, we just use
+            # shared memory.
+            use_warp_storage = False
+        else:
+            use_warp_storage = True
+
         warp_size = int(tvm.target.Target.current(allow_none=False).thread_warp_size)
         m = data.shape[1]
         nb = w_indptr.shape[0] - 1
@@ -218,11 +228,17 @@ def gen_ir(data, w_data, w_indices, w_indptr, out):
 
         # thread local storage for bs_m x bs_n block
         block = ib.allocate(data.dtype, (bs_m, bs_n), name="block", scope="local")
-        indices = ib.allocate(w_indices.dtype, (rowlength_bi,), name="indices", scope="warp")
         data_cache = ib.allocate(data.dtype, (mi, bs_m, bs_k), name="data_cache", scope="local")
-        w_data_cache = ib.allocate(
-            w_data.dtype, (rowlength_bi, bs_n, bs_k), name="w_data_cache", scope="warp"
-        )
+        if use_warp_storage:
+            indices = ib.allocate(w_indices.dtype, (rowlength_bi,), name="indices", scope="warp")
+            w_data_cache = ib.allocate(
+                w_data.dtype, (rowlength_bi, bs_n, bs_k), name="w_data_cache", scope="warp"
+            )
+        else:
+            indices = ib.allocate(ni, w_indices.dtype, (rowlength_bi,), name="indices", scope="shared")
+            w_data_cache = ib.allocate(
+                w_data.dtype, (ni, rowlength_bi, bs_n, bs_k), name="w_data_cache", scope="shared"
+            )
 
         # zero block
         with ib.for_range(0, bs_m, name="x", kind="unroll") as x:
@@ -232,7 +248,10 @@ def gen_ir(data, w_data, w_indices, w_indptr, out):
         with ib.for_range(0, rowlength_bo, name="bb") as bb:
             elem_idx = bb * rowlength_bi + tx
             # Cache indices. Guaranteed to be multiple of warp_size.
-            indices[elem_idx] = w_indices_ptr[row_start + elem_idx]
+            if use_warp_storage:
+                indices[tx] = w_indices_ptr[row_start + elem_idx]
+            else:
+                indices[warp, tx] = w_indices_ptr[row_start + elem_idx]
             # cache dense matrix
             # each thread has a row
             # TODO: ideally we could vectorize this
@@ -242,18 +261,29 @@ def gen_ir(data, w_data, w_indices, w_indptr, out):
                         # This memory acces should be out of bounds when
                         # m_index >= mb (which occurs when the dense matrix
                         # rows % 32 != 0), but it seems to work just fine...
-                        data_cache[bi, x, z] = data_ptr[indices[bi] * bs_k + z, m_index * bs_m + x]
+                        if use_warp_storage:
+                            ind = indices[bi]
+                        else:
+                            ind = indices[warp, bi]
+                        data_cache[bi, x, z] = data_ptr[ind * bs_k + z, m_index * bs_m + x]
             # cache w_data
             elem_idx = bb * rowlength_bi + tx
             with ib.for_range(0, bs_n, name="y", kind="unroll") as y:
                 with ib.for_range(0, bs_k, name="z", kind="unroll") as z:
-                    w_data_cache[tx, y, z] = w_data_ptr[row_start + elem_idx, y, z]
+                    if use_warp_storage:
+                        w_data_cache[tx, y, z] = w_data_ptr[row_start + elem_idx, y, z]
+                    else:
+                        w_data_cache[warp, tx, y, z] = w_data_ptr[row_start + elem_idx, y, z]
             with ib.for_range(0, mi, name="i") as i:
                 # thread local block matmul
                 with ib.for_range(0, bs_m, name="x", kind="unroll") as x:
                     with ib.for_range(0, bs_n, name="y", kind="unroll") as y:
                         with ib.for_range(0, bs_k, name="z", kind="unroll") as z:
-                            block[x, y] += data_cache[i, x, z] * w_data_cache[i, y, z]
+                            if use_warp_storage:
+                                w = w_data_cache[i, y, z]
+                            else:
+                                w = w_data_cache[warp, i, y, z]
+                            block[x, y] += data_cache[i, x, z] * w
         # store results
         with ib.for_range(0, bs_m, name="x", kind="unroll") as x:
             with ib.for_range(0, bs_n, name="y", kind="unroll") as y:
@@ -391,7 +421,7 @@ def pad_sparse_matrix(matrix, blocksize):
     return sp.bsr_matrix((data, indices, indptr), matrix.shape)
 
 
-@nn.sparse_dense_alter_layout.register(["cuda", "gpu"])
+@nn.sparse_dense_alter_layout.register(["cuda", "gpu", "rocm"])
 def _alter_sparse_dense_layout(_attrs, inputs, _tinfos, _out_type):
     """With cuda, we modify use alter_op_layout to swap the default
     sparse_dense implementation for one that operates on a padded matrix. We
diff --git a/tests/python/topi/python/test_topi_sparse.py b/tests/python/topi/python/test_topi_sparse.py
index 500384b23f2a..98a3ec86180c 100644
--- a/tests/python/topi/python/test_topi_sparse.py
+++ b/tests/python/topi/python/test_topi_sparse.py
@@ -463,8 +463,8 @@ def check_device(device):
             check_device(device)
 
 
-@tvm.testing.requires_cuda
-def test_sparse_dense_padded_cuda():
+@tvm.testing.parametrize_targets("cuda", "rocm")
+def test_sparse_dense_padded_gpu(target, dev):
     M = 128
     N = 1280
     K = 128
@@ -483,8 +483,7 @@ def test_sparse_dense_padded_cuda():
         shape=W_sp_np_padded.indptr.shape, dtype=str(W_sp_np_padded.indptr.dtype)
     )
     X = te.placeholder(shape=X_np.shape, dtype=str(X_np.dtype))
-    with tvm.target.Target("cuda"):
-        dev = tvm.device("gpu")
+    with tvm.target.Target(target):
         Y = topi.cuda.sparse_dense_padded(X, W_data, W_indices, W_indptr)
         s = topi.cuda.schedule_sparse_dense_padded([Y])
         func = tvm.build(s, [X, W_data, W_indices, W_indptr, Y])
@@ -499,9 +498,9 @@ def test_sparse_dense_padded_cuda():
         tvm.testing.assert_allclose(Y_tvm.asnumpy(), Y_np, atol=1e-5, rtol=1e-5)
 
 
-@tvm.testing.requires_cuda
-def test_sparse_dense_padded_alter_op():
-    with tvm.target.Target("cuda"):
+@tvm.testing.parametrize_targets("cuda", "rocm")
+def test_sparse_dense_padded_alter_op(target, dev):
+    with tvm.target.Target(target):
         M = 128
         N = 16
         K = 128
@@ -523,7 +522,7 @@ def test_sparse_dense_padded_alter_op():
 
         # build with cuda and AlterOpLayout to ensure that sparse_dense_padded is in action
         with tvm.transform.PassContext(opt_level=3, required_pass="AlterOpLayout"):
-            x = relay.build(tvm.IRModule.from_expr(f), target=tvm.target.Target("cuda"))
+            x = relay.build(tvm.IRModule.from_expr(f), target=target)
 
 
 def test_sparse_add_csr():

From bd246e9c344242d25b75cf92529dfc321f0e3ba3 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Wed, 28 Apr 2021 14:35:23 -0700
Subject: [PATCH 2/4] default to shared mem

---
 python/tvm/topi/cuda/sparse.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/python/tvm/topi/cuda/sparse.py b/python/tvm/topi/cuda/sparse.py
index 70e74d876a3a..14497b017064 100644
--- a/python/tvm/topi/cuda/sparse.py
+++ b/python/tvm/topi/cuda/sparse.py
@@ -170,15 +170,16 @@ def gen_ir(data, w_data, w_indices, w_indptr, out):
         # TODO(tkonolige): seperate implementation for large block sizes
         ib = tvm.tir.ir_builder.create()
 
-        if tvm.target.Target.current(allow_none=False).kind.name == "rocm":
+        if tvm.target.Target.current(allow_none=False).kind.name == "cuda":
+            use_warp_storage = True
+        else:
             # TVMs warp shuffle intrinsics are slow on ROCM because they use
             # LDS (shared memory) to do the shuffling. Instead, we could use
             # ROCM's support for accessing neighboring threads memory, but we
             # those intrinsics aren't accessible from TVM. For now, we just use
-            # shared memory.
+            # shared memory. We also default to shared memory on platforms
+            # where we do not know how warp storage performs.
             use_warp_storage = False
-        else:
-            use_warp_storage = True
 
         warp_size = int(tvm.target.Target.current(allow_none=False).thread_warp_size)
         m = data.shape[1]
@@ -425,7 +426,7 @@ def pad_sparse_matrix(matrix, blocksize):
 def _alter_sparse_dense_layout(_attrs, inputs, _tinfos, _out_type):
     """With cuda, we modify use alter_op_layout to swap the default
     sparse_dense implementation for one that operates on a padded matrix. We
-    also padd the matrix.
+    also pad the matrix.
     """
     # TODO(ANSHUMAN87): Handle for sparse_lhs case too
     if (

From 2a2f11551ddc490b079eba6c4b7b4566d8961eac Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Wed, 28 Apr 2021 15:38:30 -0700
Subject: [PATCH 3/4] formatting

---
 python/tvm/topi/cuda/sparse.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/tvm/topi/cuda/sparse.py b/python/tvm/topi/cuda/sparse.py
index 14497b017064..855a856accc6 100644
--- a/python/tvm/topi/cuda/sparse.py
+++ b/python/tvm/topi/cuda/sparse.py
@@ -236,7 +236,9 @@ def gen_ir(data, w_data, w_indices, w_indptr, out):
                 w_data.dtype, (rowlength_bi, bs_n, bs_k), name="w_data_cache", scope="warp"
             )
         else:
-            indices = ib.allocate(ni, w_indices.dtype, (rowlength_bi,), name="indices", scope="shared")
+            indices = ib.allocate(
+                ni, w_indices.dtype, (rowlength_bi,), name="indices", scope="shared"
+            )
             w_data_cache = ib.allocate(
                 w_data.dtype, (ni, rowlength_bi, bs_n, bs_k), name="w_data_cache", scope="shared"
             )

From 0136308769b885ac414f070eef95b1c575d87629 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Mon, 3 May 2021 09:18:23 -0700
Subject: [PATCH 4/4] formatting

---
 python/tvm/topi/cuda/sparse.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/topi/cuda/sparse.py b/python/tvm/topi/cuda/sparse.py
index 855a856accc6..1e846ebf5311 100644
--- a/python/tvm/topi/cuda/sparse.py
+++ b/python/tvm/topi/cuda/sparse.py
@@ -163,7 +163,7 @@ def sparse_dense_tir(data, w_data, w_indices, w_indptr):
     """
 
     def gen_ir(data, w_data, w_indices, w_indptr, out):
-        # pylint: disable=invalid-name
+        # pylint: disable=invalid-name, simplifiable-if-statement
         # TODO(tkonolige): use tensorcores for block multiply
         # TODO(tkonolige): use vectorize on loads
         # TODO(tkonolige): seperate implementation if M is small
@@ -237,7 +237,7 @@ def gen_ir(data, w_data, w_indices, w_indptr, out):
             )
         else:
             indices = ib.allocate(
-                ni, w_indices.dtype, (rowlength_bi,), name="indices", scope="shared"
+                w_indices.dtype, (ni, rowlength_bi), name="indices", scope="shared"
             )
             w_data_cache = ib.allocate(
                 w_data.dtype, (ni, rowlength_bi, bs_n, bs_k), name="w_data_cache", scope="shared"