From 5f2733edccbef45a2799103e4e517f7008187c6f Mon Sep 17 00:00:00 2001
From: Tal Ben-Nun
Date: Mon, 16 Mar 2020 01:22:30 +0100
Subject: [PATCH] Workarounds for min/max half-precision operations in CUDA

---
 dace/runtime/include/dace/math.h | 40 ++++++++++++++++++
 tests/half_cudatest.py           | 69 ++++++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+)
 create mode 100644 tests/half_cudatest.py

diff --git a/dace/runtime/include/dace/math.h b/dace/runtime/include/dace/math.h
index e3ce2824b1..59c00ca7e8 100644
--- a/dace/runtime/include/dace/math.h
+++ b/dace/runtime/include/dace/math.h
@@ -84,6 +84,46 @@ static DACE_CONSTEXPR DACE_HDFI int sgn(T val) {
     return (T(0) < val) - (val < T(0));
 }
 
+
+// Workarounds for float16 in CUDA
+// NOTES: * Half precision types are not trivially convertible, so other types
+//          will be implicitly converted to it in min/max.
+//        * half comparisons are designated "device-only", so they must call
+//          device-only functions as well.
+#ifdef __CUDACC__
+template <typename... Ts>
+DACE_CONSTEXPR __device__ __forceinline__ dace::float16 min(const dace::float16& a, const dace::float16& b, const Ts&... c)
+{
+    return (a < b) ? min(a, c...) : min(b, c...);
+}
+template <typename T, typename... Ts>
+DACE_CONSTEXPR __device__ __forceinline__ dace::float16 min(const dace::float16& a, const T& b, const Ts&... c)
+{
+    return (a < dace::float16(b)) ? min(a, c...) : min(dace::float16(b), c...);
+}
+template <typename T, typename... Ts>
+DACE_CONSTEXPR __device__ __forceinline__ dace::float16 min(const T& a, const dace::float16& b, const Ts&... c)
+{
+    return (dace::float16(a) < b) ? min(dace::float16(a), c...) : min(b, c...);
+}
+template <typename... Ts>
+DACE_CONSTEXPR __device__ __forceinline__ dace::float16 max(const dace::float16& a, const dace::float16& b, const Ts&... c)
+{
+    return (a > b) ? max(a, c...) : max(b, c...);
+}
+template <typename T, typename... Ts>
+DACE_CONSTEXPR __device__ __forceinline__ dace::float16 max(const dace::float16& a, const T& b, const Ts&... c)
+{
+    return (a > dace::float16(b)) ? max(a, c...) : max(dace::float16(b), c...);
+}
+template <typename T, typename... Ts>
+DACE_CONSTEXPR __device__ __forceinline__ dace::float16 max(const T& a, const dace::float16& b, const Ts&... c)
+{
+    return (dace::float16(a) > b) ? max(dace::float16(a), c...) : max(b, c...);
+}
+#endif
+
+
 #ifndef DACE_SYNTHESIS
 
 // Computes integer floor, rounding the remainder towards negative infinity.
diff --git a/tests/half_cudatest.py b/tests/half_cudatest.py
new file mode 100644
index 0000000000..0550ee4c4d
--- /dev/null
+++ b/tests/half_cudatest.py
@@ -0,0 +1,69 @@
+""" Tests for half-precision syntax quirks. """
""" + +import dace +import numpy as np + +N = dace.symbol('N') + + +def test_relu(): + @dace.program + def halftest(A: dace.float16[N]): + out = np.ndarray([N], dace.float16) + for i in dace.map[0:N]: + with dace.tasklet: + a << A[i] + o >> out[i] + o = a if a > dace.float16(0) else dace.float16(0) + return out + + A = np.random.rand(20).astype(np.float16) + sdfg = halftest.to_sdfg() + sdfg.apply_gpu_transformations() + out = sdfg(A=A, N=20) + assert np.allclose(out, np.maximum(A, 0)) + + +def test_relu_2(): + @dace.program + def halftest(A: dace.float16[N]): + out = np.ndarray([N], dace.float16) + for i in dace.map[0:N]: + with dace.tasklet: + a << A[i] + o >> out[i] + o = max(a, 0) + return out + + A = np.random.rand(20).astype(np.float16) + sdfg = halftest.to_sdfg() + sdfg.apply_gpu_transformations() + out = sdfg(A=A, N=20) + assert np.allclose(out, np.maximum(A, 0)) + + +def test_dropout(): + @dace.program + def halftest(A: dace.float16[N], mask: dace.int32[N]): + out = np.ndarray([N], dace.float16) + for i in dace.map[0:N]: + with dace.tasklet: + a << A[i] + d << mask[i] + o >> out[i] + #o = a * dace.float16(d) + o = a if d else dace.float16(0) + return out + + A = np.random.rand(20).astype(np.float16) + mask = np.random.randint(0, 2, size=[20]).astype(np.int32) + sdfg = halftest.to_sdfg() + sdfg.apply_gpu_transformations() + out = sdfg(A=A, mask=mask, N=20) + assert np.allclose(out, A * mask) + + +if __name__ == '__main__': + test_relu() + test_relu_2() + test_dropout()