From 5acf5513065d7e0a567c9b24b37bc3c2323b36e2 Mon Sep 17 00:00:00 2001
From: wpan11nv <60017475+wpan11nv@users.noreply.github.com>
Date: Thu, 13 Feb 2020 20:24:29 -0800
Subject: [PATCH] [TOPI][CUDA] Enable vectorization on fp16 type (#4867)

- This allows better utilization of the memory bandwidth.

- Note that not all cases are vectorized for the fp16 data type. For
  instance, when the size is not a multiple of 1024, the inner loop may
  be an expression that cannot be vectorized. In this case, a small
  inner loop is still beneficial for latency hiding.

Signed-off-by: Wei Pan
---
 topi/python/topi/cuda/injective.py    | 11 ++++++--
 topi/tests/python/test_topi_relu.py   | 21 +++++++++++----
 topi/tests/python/test_topi_tensor.py | 39 +++++++++++++++++++++++++--
 3 files changed, 62 insertions(+), 9 deletions(-)

diff --git a/topi/python/topi/cuda/injective.py b/topi/python/topi/cuda/injective.py
index b77a97924716..eb7019bd7654 100644
--- a/topi/python/topi/cuda/injective.py
+++ b/topi/python/topi/cuda/injective.py
@@ -40,13 +40,20 @@ def schedule_injective_from_existing(sch, out):
     num_thread = tvm.target.Target.current(allow_none=False).max_num_threads
     max_block = 256
 
+    # vectorize on fp16 data type. This allows to better utilize the memory
+    # bandwidth.
+    vector_width = 4 if out.dtype == "float16" else 1
+
     try:
         const_size = util.get_const_int(util.prod(out.shape))
-        max_block = 256
-        need_block_split = const_size > max_block * num_thread
+        need_block_split = const_size > max_block * num_thread * vector_width
     except ValueError:
         need_block_split = False
 
+    if vector_width > 1:
+        fused, v = sch[out].split(fused, vector_width)
+        sch[out].vectorize(v)
+
     if need_block_split:
         xo, xi = sch[out].split(fused, factor=num_thread * max_block)
         bx, tx = sch[out].split(xi, factor=num_thread)
diff --git a/topi/tests/python/test_topi_relu.py b/topi/tests/python/test_topi_relu.py
index 5c41647846cb..414edbca4f0f 100644
--- a/topi/tests/python/test_topi_relu.py
+++ b/topi/tests/python/test_topi_relu.py
@@ -20,11 +20,20 @@
 import tvm
 import topi
 from topi.util import get_const_tuple
-
+from tvm.contrib.nvcc import parse_compute_version
 from common import get_all_backend
 
-def verify_relu(m, n):
-    A = tvm.placeholder((m, n), name='A')
+def skip_test(dtype, device):
+    if dtype == "float16" and device == "cuda":
+        major, minor = parse_compute_version(tvm.gpu(0).compute_version)
+        # fp16 starts from 5.3
+        if major < 6 or (major == 5 and minor < 3):
+            print("skip because gpu does not support fp16")
+            return True
+    return False
+
+def verify_relu(m, n, dtype="float32"):
+    A = tvm.placeholder((m, n), name='A', dtype=dtype)
     B = topi.nn.relu(A)
 
     a_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(A.shape)).astype(A.dtype)
@@ -35,6 +44,8 @@ def check_device(device):
         if not ctx.exist:
             print("Skip because %s is not enabled" % device)
             return
+        if skip_test(dtype, device):
+            return
         print("Running on target: %s" % device)
         with tvm.target.create(device):
             s = topi.generic.schedule_elemwise(B)
@@ -87,12 +98,12 @@ def _prelu_numpy(x, W):
     tvm.testing.assert_allclose(b.asnumpy(), out_np, rtol=1e-5)
 
 def test_relu():
-    verify_relu(10, 128)
+    verify_relu(10, 128, "float32")
+    verify_relu(128, 64, "float16")
 
 def test_schedule_big_array():
     verify_relu(1024 * 100 , 512)
 
-
 def test_leaky_relu():
     verify_leaky_relu(100, 0.1)
 
diff --git a/topi/tests/python/test_topi_tensor.py b/topi/tests/python/test_topi_tensor.py
index 465d98e5f082..84718ff3a647 100644
--- a/topi/tests/python/test_topi_tensor.py
+++ b/topi/tests/python/test_topi_tensor.py
@@ -19,6 +19,16 @@
 import tvm
 import topi
 from tvm.contrib.pickle_memoize import memoize
+from tvm.contrib.nvcc import parse_compute_version
+
+def skip_test(dtype, device):
+    if dtype == "float16" and device == "cuda":
+        major, minor = parse_compute_version(tvm.gpu(0).compute_version)
+        # fp16 starts from 5.3
+        if major < 6 or (major == 5 and minor < 3):
+            print("skip because gpu does not support fp16")
+            return True
+    return False
 
 def verify_elemwise_sum(num_args, dtype):
     shape = (3,5,4)
@@ -84,18 +94,43 @@ def check_device(device):
     for device in ["llvm"]:
         check_device(device)
 
+def verify_vectorization(n, m, dtype):
+    def check_device(device):
+        if not tvm.runtime.enabled(device):
+            print("Skip because %s is not enabled" % device)
+            return
+        if skip_test(dtype, device):
+            return
+        with tvm.target.create(device):
+            ctx = tvm.context(device, 0)
+            A = tvm.placeholder((n, m), name='A', dtype=dtype)
+            B = tvm.compute((n, m), lambda i, j:
+                            A[i, j] + tvm.const(1, A.dtype), name='B')
+            S = topi.generic.schedule_elemwise(B)
+
+            fun = tvm.build(S, [A, B], device)
+            np_A = tvm.nd.empty((n, m), A.dtype, ctx).copyfrom(
+                                np.random.uniform(size=(n, m)))
+            np_B = tvm.nd.empty((n, m), B.dtype, ctx)
+            fun(np_A, np_B)
+            tvm.testing.assert_allclose(np_B.asnumpy(), np_A.asnumpy() + 1, rtol=1e-5)
+
+    for device in ["cuda"]:
+        check_device(device)
+
+def test_vectorization():
+    verify_vectorization(128, 64, "float16")
 
 def test_elemwise_sum():
     verify_elemwise_sum(1, "float32")
     verify_elemwise_sum(5, "float32")
     verify_elemwise_sum(4, "int32")
 
-
def test_full():
     verify_full((3,4,5), "float32", 3.14)
     verify_full((10,), "int32", 7)
 
-
 if __name__ == "__main__":
     test_elemwise_sum()
     test_full()
+    test_vectorization()
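A minimal standalone sketch of the split-and-vectorize pattern the schedule change above applies for fp16 outputs, written against the same pre-te TVM API used in this patch (tvm.placeholder / tvm.compute / tvm.create_schedule). The shape, thread count, and tensor names below are illustrative assumptions, not values taken from the patch.

import tvm

# Illustrative element-wise workload (names and shape are assumptions).
n, m = 128, 64
A = tvm.placeholder((n, m), name='A', dtype='float16')
B = tvm.compute((n, m), lambda i, j: A[i, j] + tvm.const(1, A.dtype), name='B')

s = tvm.create_schedule(B.op)
num_thread = 256    # stands in for target.max_num_threads
vector_width = 4    # fp16x4: each lane moves 64 bits per access

# Fuse all output axes, peel off `vector_width` inner lanes and vectorize
# them, then bind the remaining iterations to CUDA blocks and threads,
# mirroring what schedule_injective_from_existing does when out.dtype
# is "float16".
fused = s[B].fuse(*s[B].op.axis)
fused, v = s[B].split(fused, factor=vector_width)
s[B].vectorize(v)
bx, tx = s[B].split(fused, factor=num_thread)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))

# Inspect the lowered IR: the innermost loop becomes a vectorized (ramp) access.
print(tvm.lower(s, [A, B], simple_mode=True))

For other dtypes the patch keeps vector_width == 1, so the schedule reduces to the original fuse/split/bind structure.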