From c3cac46465fadebc76011982b672883ce9bac028 Mon Sep 17 00:00:00 2001
From: masahi
Date: Sun, 15 Oct 2017 05:26:45 +0900
Subject: [PATCH] enable rocm target for topi/recipes. add timing util to gemm
 test. (#554)

---
 topi/recipe/conv/depthwise_conv2d_test.py | 30 ++++++++++++-----------
 topi/recipe/conv/test_conv2d_hwcn_map.py  |  8 +++---
 topi/recipe/gemm/cuda_gemm_square.py      | 19 ++++++++++----
 3 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/topi/recipe/conv/depthwise_conv2d_test.py b/topi/recipe/conv/depthwise_conv2d_test.py
index 4d3a20f5c2ed..8dca7e8ec1e1 100644
--- a/topi/recipe/conv/depthwise_conv2d_test.py
+++ b/topi/recipe/conv/depthwise_conv2d_test.py
@@ -69,7 +69,7 @@ def check_device(device):
         if not tvm.module.enabled(device):
             print("Skip because %s is not enabled" % device)
             return
-        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
+        ctx = tvm.context(device, 0)
         # Build the kernel
         f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
         f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device)
@@ -111,12 +111,13 @@ def check_device(device):
         np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
         print("success")
 
-    with tvm.build_config(auto_unroll_max_step=32,
-                          auto_unroll_min_depth=0,
-                          unroll_explicit=False,
-                          detect_global_barrier=False,
-                          restricted_func=True):
-        check_device("cuda")
+    for device in ['cuda', 'opencl', 'rocm']:
+        with tvm.build_config(auto_unroll_max_step=32,
+                              auto_unroll_min_depth=0,
+                              unroll_explicit=device == 'rocm',
+                              detect_global_barrier=False,
+                              restricted_func=True):
+            check_device(device)
 
 def test_depthwise_conv2d_nhwc():
     """You may test different settings."""
@@ -159,7 +160,7 @@ def check_device(device):
         if not tvm.module.enabled(device):
             print("Skip because %s is not enabled" % device)
             return
-        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
+        ctx = tvm.context(device, 0)
         # Build the kernel
         f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
         f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device)
@@ -200,12 +201,13 @@ def check_device(device):
         np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
         print("success")
 
-    with tvm.build_config(auto_unroll_max_step=32,
-                          auto_unroll_min_depth=0,
-                          unroll_explicit=False,
-                          detect_global_barrier=False,
-                          restricted_func=True):
-        check_device("cuda")
+    for device in ['cuda', 'opencl', 'rocm']:
+        with tvm.build_config(auto_unroll_max_step=32,
+                              auto_unroll_min_depth=0,
+                              unroll_explicit=device == 'rocm',
+                              detect_global_barrier=False,
+                              restricted_func=True):
+            check_device(device)
 
 if __name__ == "__main__":
     test_depthwise_conv2d_nchw()
diff --git a/topi/recipe/conv/test_conv2d_hwcn_map.py b/topi/recipe/conv/test_conv2d_hwcn_map.py
index f7cba0934627..fe973188df8b 100644
--- a/topi/recipe/conv/test_conv2d_hwcn_map.py
+++ b/topi/recipe/conv/test_conv2d_hwcn_map.py
@@ -5,7 +5,7 @@
 import tvm
 from tvm.contrib import nvcc
 import topi
-from topi.nn.util import get_const_tuple
+from topi.util import get_const_tuple
 
 TASK = "conv2d_hwcn_map"
 USE_MANUAL_CODE = False
@@ -55,14 +55,14 @@ def check_device(device):
         if not tvm.module.enabled(device):
             print("Skip because %s is not enabled" % device)
             return
-        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
+        ctx = tvm.context(device, 0)
         a = tvm.nd.array(a_np, ctx)
         w = tvm.nd.array(w_np, ctx)
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
         with tvm.build_config(auto_unroll_max_step=32,
                               auto_unroll_min_depth=0,
-                              unroll_explicit=False):
+                              unroll_explicit=device == 'rocm'):
             func1 = tvm.build(s1, [A, W, B], device)
             func1(a, w, b)
             np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
@@ -70,7 +70,7 @@ def check_device(device):
             func2(a, w, c)
             np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl']:
+    for device in ['cuda', 'opencl', 'rocm']:
         check_device(device)
diff --git a/topi/recipe/gemm/cuda_gemm_square.py b/topi/recipe/gemm/cuda_gemm_square.py
index 0c7ba71a86f6..8d9fedee0547 100644
--- a/topi/recipe/gemm/cuda_gemm_square.py
+++ b/topi/recipe/gemm/cuda_gemm_square.py
@@ -100,11 +100,12 @@ def test_gemm():
     s[BB].double_buffer()
     # correctness
     def check_device(device):
+        print("Device %s" % device)
         if not tvm.module.enabled(device):
             print("Skip because %s is not enabled" % device)
             return
         f = tvm.build(s, [A, B, C], device)
-        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
+        ctx = tvm.context(device, 0)
         # launch the kernel.
         n, m, l = nn, nn, nn
         a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
@@ -117,10 +118,18 @@ def check_device(device):
         np.testing.assert_allclose(
             c.asnumpy(), np.dot(b_np.T, a_np), rtol=1e-5)
 
-    with tvm.build_config(auto_unroll_max_step=32,
-                          auto_unroll_min_depth=0,
-                          unroll_explicit=False):
-        check_device("cuda")
+        num_flops = 2 * nn * nn * nn
+        num_runs = 10
+        timer_f = f.time_evaluator(f.entry_name, ctx, number=num_runs)
+        t = timer_f(a, b, c).mean
+        GFLOPS = num_flops / (t * 1e3) / 1e6
+        print("average time cost of %d runs = %g ms, %g GFLOPS." % (num_runs, t * 1e3, GFLOPS))
+
+    for device in ['cuda', 'opencl', 'rocm']:
+        with tvm.build_config(auto_unroll_max_step=32,
+                              auto_unroll_min_depth=0,
+                              unroll_explicit=device == 'rocm'):
+            check_device(device)
 
 if __name__ == "__main__":
     test_gemm()
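
Note following the patch: the recipes now share one device-agnostic pattern.
tvm.context(device, 0) replaces the hard-coded tvm.gpu(0)/tvm.cl(0) ternary,
and the gemm recipe times the built kernel with Module.time_evaluator. Below is
a minimal standalone sketch of that pattern using the same old-style TVM 0.x
API as the patch; the vector-add workload is illustrative only, not taken from
the patch itself.

    import numpy as np
    import tvm

    n = 1024
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute((n,), lambda i: A[i] + 1.0, name='B')
    s = tvm.create_schedule(B.op)
    # GPU targets require the loop to be bound to thread/block axes
    bx, tx = s[B].split(B.op.axis[0], factor=64)
    s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[B].bind(tx, tvm.thread_axis("threadIdx.x"))

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        ctx = tvm.context(device, 0)   # works for cuda, opencl and rocm alike
        f = tvm.build(s, [A, B], device)
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
        # time_evaluator runs the kernel `number` times and reports statistics
        timer_f = f.time_evaluator(f.entry_name, ctx, number=10)
        t = timer_f(a, b).mean         # mean wall time in seconds
        print("%s: %g ms per run" % (device, t * 1e3))

    for device in ['cuda', 'opencl', 'rocm']:
        check_device(device)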
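
On the GFLOPS arithmetic in cuda_gemm_square.py: a square nn x nn matmul costs
num_flops = 2 * nn**3 (one multiply plus one add per inner-loop step), so
GFLOPS = num_flops / (t * 1e3) / 1e6 is simply num_flops / t / 1e9, written so
that the millisecond figure t * 1e3 can be reused in the print statement. As a
worked example (nn = 1024 chosen for illustration): t = 1 ms gives
2 * 1024**3 / 1e-3 / 1e9, roughly 2147 GFLOPS.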