From c3cac46465fadebc76011982b672883ce9bac028 Mon Sep 17 00:00:00 2001
From: masahi
Date: Sun, 15 Oct 2017 05:26:45 +0900
Subject: [PATCH] enable rocm target for topi/recipes. add timing util to gemm
 test. (#554)

---
 topi/recipe/conv/depthwise_conv2d_test.py | 30 ++++++++++++-----------
 topi/recipe/conv/test_conv2d_hwcn_map.py  |  8 +++---
 topi/recipe/gemm/cuda_gemm_square.py      | 19 ++++++++++----
 3 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/topi/recipe/conv/depthwise_conv2d_test.py b/topi/recipe/conv/depthwise_conv2d_test.py
index 4d3a20f5c2ed..8dca7e8ec1e1 100644
--- a/topi/recipe/conv/depthwise_conv2d_test.py
+++ b/topi/recipe/conv/depthwise_conv2d_test.py
@@ -69,7 +69,7 @@ def check_device(device):
         if not tvm.module.enabled(device):
             print("Skip because %s is not enabled" % device)
             return
-        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
+        ctx = tvm.context(device, 0)
         # Build the kernel
         f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
         f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device)
@@ -111,12 +111,13 @@ def check_device(device):
         np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
         print("success")
 
-    with tvm.build_config(auto_unroll_max_step=32,
-                          auto_unroll_min_depth=0,
-                          unroll_explicit=False,
-                          detect_global_barrier=False,
-                          restricted_func=True):
-        check_device("cuda")
+    for device in ['cuda', 'opencl', 'rocm']:
+        with tvm.build_config(auto_unroll_max_step=32,
+                              auto_unroll_min_depth=0,
+                              unroll_explicit=device == 'rocm',
+                              detect_global_barrier=False,
+                              restricted_func=True):
+            check_device(device)
 
 def test_depthwise_conv2d_nhwc():
     """You may test different settings."""
@@ -159,7 +160,7 @@ def check_device(device):
         if not tvm.module.enabled(device):
             print("Skip because %s is not enabled" % device)
             return
-        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
+        ctx = tvm.context(device, 0)
         # Build the kernel
         f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
         f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device)
@@ -200,12 +201,13 @@ def check_device(device):
         np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
         print("success")
 
-    with tvm.build_config(auto_unroll_max_step=32,
-                          auto_unroll_min_depth=0,
-                          unroll_explicit=False,
-                          detect_global_barrier=False,
-                          restricted_func=True):
-        check_device("cuda")
+    for device in ['cuda', 'opencl', 'rocm']:
+        with tvm.build_config(auto_unroll_max_step=32,
+                              auto_unroll_min_depth=0,
+                              unroll_explicit=device == 'rocm',
+                              detect_global_barrier=False,
+                              restricted_func=True):
+            check_device(device)
 
 if __name__ == "__main__":
     test_depthwise_conv2d_nchw()
diff --git a/topi/recipe/conv/test_conv2d_hwcn_map.py b/topi/recipe/conv/test_conv2d_hwcn_map.py
index f7cba0934627..fe973188df8b 100644
--- a/topi/recipe/conv/test_conv2d_hwcn_map.py
+++ b/topi/recipe/conv/test_conv2d_hwcn_map.py
@@ -5,7 +5,7 @@
 import tvm
 from tvm.contrib import nvcc
 import topi
-from topi.nn.util import get_const_tuple
+from topi.util import get_const_tuple
 
 TASK = "conv2d_hwcn_map"
 USE_MANUAL_CODE = False
@@ -55,14 +55,14 @@ def check_device(device):
         if not tvm.module.enabled(device):
             print("Skip because %s is not enabled" % device)
             return
-        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
+        ctx = tvm.context(device, 0)
         a = tvm.nd.array(a_np, ctx)
         w = tvm.nd.array(w_np, ctx)
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
         with tvm.build_config(auto_unroll_max_step=32,
                               auto_unroll_min_depth=0,
-                              unroll_explicit=False):
+                              unroll_explicit=device == 'rocm'):
             func1 = tvm.build(s1, [A, W, B], device)
             func1(a, w, b)
             np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
@@ -70,7 +70,7 @@ def check_device(device):
             func2(a, w, c)
             np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl']:
+    for device in ['cuda', 'opencl', 'rocm']:
         check_device(device)
diff --git a/topi/recipe/gemm/cuda_gemm_square.py b/topi/recipe/gemm/cuda_gemm_square.py
index 0c7ba71a86f6..8d9fedee0547 100644
--- a/topi/recipe/gemm/cuda_gemm_square.py
+++ b/topi/recipe/gemm/cuda_gemm_square.py
@@ -100,11 +100,12 @@ def test_gemm():
     s[BB].double_buffer()
     # correctness
     def check_device(device):
+        print("Device %s" % device)
         if not tvm.module.enabled(device):
             print("Skip because %s is not enabled" % device)
             return
         f = tvm.build(s, [A, B, C], device)
-        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
+        ctx = tvm.context(device, 0)
         # launch the kernel.
         n, m, l = nn, nn, nn
         a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
@@ -117,10 +118,18 @@ def check_device(device):
         np.testing.assert_allclose(
             c.asnumpy(), np.dot(b_np.T, a_np), rtol=1e-5)
 
-    with tvm.build_config(auto_unroll_max_step=32,
-                          auto_unroll_min_depth=0,
-                          unroll_explicit=False):
-        check_device("cuda")
+        num_flops = 2 * nn * nn * nn
+        num_runs = 10
+        timer_f = f.time_evaluator(f.entry_name, ctx, number=num_runs)
+        t = timer_f(a, b, c).mean
+        GFLOPS = num_flops / (t * 1e3) / 1e6
+        print("average time cost of %d runs = %g ms, %g GFLOPS." % (num_runs, t * 1e3, GFLOPS))
+
+    for device in ['cuda', 'opencl', 'rocm']:
+        with tvm.build_config(auto_unroll_max_step=32,
+                              auto_unroll_min_depth=0,
+                              unroll_explicit=device == 'rocm'):
+            check_device(device)
 
 if __name__ == "__main__":
     test_gemm()
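
Note following the patch: the recipes now share one device-agnostic pattern.
tvm.context(device, 0) replaces the hard-coded tvm.gpu(0)/tvm.cl(0) ternary,
and the gemm recipe times the built kernel with Module.time_evaluator. Below is
a minimal standalone sketch of that pattern using the same old-style TVM 0.x
API as the patch; the vector-add workload is illustrative only, not taken from
the patch itself.

    import numpy as np
    import tvm

    n = 1024
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute((n,), lambda i: A[i] + 1.0, name='B')
    s = tvm.create_schedule(B.op)
    # GPU targets require the loop to be bound to thread/block axes
    bx, tx = s[B].split(B.op.axis[0], factor=64)
    s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[B].bind(tx, tvm.thread_axis("threadIdx.x"))

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        ctx = tvm.context(device, 0)   # works for cuda, opencl and rocm alike
        f = tvm.build(s, [A, B], device)
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
        # time_evaluator runs the kernel `number` times and reports statistics
        timer_f = f.time_evaluator(f.entry_name, ctx, number=10)
        t = timer_f(a, b).mean         # mean wall time in seconds
        print("%s: %g ms per run" % (device, t * 1e3))

    for device in ['cuda', 'opencl', 'rocm']:
        check_device(device)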
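
On the GFLOPS arithmetic in cuda_gemm_square.py: a square nn x nn matmul costs
num_flops = 2 * nn**3 (one multiply plus one add per inner-loop step), so
GFLOPS = num_flops / (t * 1e3) / 1e6 is simply num_flops / t / 1e9, written so
that the millisecond figure t * 1e3 can be reused in the print statement. As a
worked example (nn = 1024 chosen for illustration): t = 1 ms gives
2 * 1024**3 / 1e-3 / 1e9, roughly 2147 GFLOPS.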