From e1563b42ac8b6a877003966859e3c0b8891650d3 Mon Sep 17 00:00:00 2001
From: Wei Pan
Date: Wed, 18 Mar 2020 13:40:19 -0700
Subject: [PATCH] [CodeGen][CUDA] Vectorization for intrinsics

- This allows emitting vectorized loads/stores for CUDA math intrinsics:
  a pure extern call on a vector type is scalarized into one intrinsic
  call per lane, while the argument loads and the result store remain
  vectorized.

- A few intrinsics (exp2, cosh, sinh) should be lowered as CUDAMath, not
  CUDAFastMath, since CUDA provides no fast-math variants for them.

---
 src/target/source/codegen_c.h                 |  23 ++++
 src/target/source/codegen_cuda.cc             |  49 +++++++
 src/target/source/intrin_rule_cuda.cc         |  26 ++--
 .../unittest/test_target_codegen_cuda.py      | 124 +++++++++++++++++-
 4 files changed, 207 insertions(+), 15 deletions(-)

diff --git a/src/target/source/codegen_c.h b/src/target/source/codegen_c.h
index a9da780876122..c1894a379ddb6 100644
--- a/src/target/source/codegen_c.h
+++ b/src/target/source/codegen_c.h
@@ -257,6 +257,29 @@ class CodeGenC :
   /*! \brief the data type of allocated buffers */
   std::unordered_map<const VarNode*, DataType> handle_data_type_;
 
+  /*!
+   * \brief A RAII utility class for emitting code in a scoped region.
+   */
+  class EnterScopeRAII {
+    // The codegen context.
+    CodeGenC* cg;
+
+    // The new scope level.
+    int scope;
+
+   public:
+    explicit EnterScopeRAII(CodeGenC* cg) : cg(cg) {
+      cg->PrintIndent();
+      cg->stream << "{\n";
+      scope = cg->BeginScope();
+    }
+    ~EnterScopeRAII() {
+      cg->EndScope(scope);
+      cg->PrintIndent();
+      cg->stream << "}\n";
+    }
+  };
+
  private:
   /*! \brief whether to print in SSA form */
   bool print_ssa_form_{false};
diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
index 2cc7b926287f8..3799adc493e34 100644
--- a/src/target/source/codegen_cuda.cc
+++ b/src/target/source/codegen_cuda.cc
@@ -24,6 +24,7 @@
 #include <tvm/runtime/registry.h>
 
 #include <cmath>
+#include <utility>
 #include <vector>
 #include <string>
 #include "literal/cuda_half_t.h"
@@ -418,6 +419,54 @@ void CodeGenCUDA::VisitExpr_(const CallNode *op, std::ostream& os) {
       this->PrintExpr(op->args[i * 2 + 1], os);
       os << "]" << ((i < 3) ? ", ": ")");
     }
+  } else if (op->call_type == CallNode::PureExtern && op->dtype.is_vector()) {
+    //
+    // Scalarize a pure extern vector call, since the extern function
+    // only has a scalar version. That is, emit
+    //
+    // v = intrin_f(((float4*)A)[0], ((float4*)B)[0])
+    //
+    // as
+    //
+    // float4 __ret;
+    // {
+    //   float4 __arg0 = ((float4*)A)[0];
+    //   float4 __arg1 = ((float4*)B)[0];
+    //   __ret.x = intrin_f(__arg0.x, __arg1.x);
+    //   __ret.y = intrin_f(__arg0.y, __arg1.y);
+    //   __ret.z = intrin_f(__arg0.z, __arg1.z);
+    //   __ret.w = intrin_f(__arg0.w, __arg1.w);
+    // }
+    // v = __ret;
+    //
+    // Declare the result vector.
+    std::string sret = GetUniqueName("_");
+    this->PrintIndent();
+    this->PrintType(op->dtype, stream);
+    stream << ' ' << sret << ";\n";
+    {
+      EnterScopeRAII scope(this);
+
+      // Load the arguments once and reuse them for every lane.
+      std::vector<std::string> sargs;
+      for (size_t i = 0; i < op->args.size(); ++i) {
+        std::string val = SSAGetID(PrintExpr(op->args[i]), op->args[i].dtype());
+        sargs.push_back(std::move(val));
+      }
+
+      // Emit a scalar call for each lane.
+      for (int i = 0; i < op->dtype.lanes(); ++i) {
+        std::ostringstream scall;
+        scall << op->name << "(";
+        for (size_t j = 0; j < op->args.size(); ++j) {
+          if (j > 0)
+            scall << ',';
+          PrintVecElemLoad(sargs[j], op->args[j].dtype(), i, scall);
+        }
+        scall << ")";
+        PrintVecElemStore(sret, op->dtype, i, scall.str());
+      }
+    }
+    os << sret;
   } else {
     CodeGenC::VisitExpr_(op, os);
   }
diff --git a/src/target/source/intrin_rule_cuda.cc b/src/target/source/intrin_rule_cuda.cc
index d009110e0a6cc..d9441203edc01 100644
--- a/src/target/source/intrin_rule_cuda.cc
+++ b/src/target/source/intrin_rule_cuda.cc
@@ -29,14 +29,12 @@ namespace intrin {
 // Add float suffix to the intrinsics, CUDA fast math.
 struct CUDAMath {
   std::string operator()(DataType t, std::string name) const {
-    if (t.lanes() == 1) {
-      if (t.is_float()) {
-        switch (t.bits()) {
-          case 64: return name;
-          case 32: return name + 'f';
-          case 16: return 'h' + name;
-          default: return "";
-        }
+    if (t.is_float()) {
+      switch (t.bits()) {
+        case 64: return name;
+        case 32: return name + 'f';
+        case 16: return 'h' + name;
+        default: return "";
       }
     }
     return "";
@@ -45,7 +43,7 @@ struct CUDAMath {
 
 struct CUDAFastMath : public CUDAMath {
   std::string operator()(DataType t, std::string name) const {
-    if (t.lanes() == 1 && t.is_float() && t.bits() == 32) {
+    if (t.is_float() && t.bits() == 32) {
       return "__" + name + 'f';
     } else {
       return CUDAMath::operator()(t, name);
@@ -56,7 +54,7 @@ struct CUDAFastMath : public CUDAMath {
 
 struct CUDAFastMathTan : public CUDAMath {
   std::string operator()(DataType t, std::string name) const {
-    if (t.lanes() == 1 && t.is_float()) {
+    if (t.is_float()) {
       switch (t.bits()) {
         case 64: return name;
         // `__tanf` seems to produce some values too deviant from numpy tan version.
@@ -72,7 +70,7 @@ struct CUDAFastMathTan : public CUDAMath {
 
 struct CUDAPopcount {
   std::string operator()(DataType t, std::string name) const {
-    if (t.lanes() == 1 && t.is_uint()) {
+    if (t.is_uint()) {
       switch (t.bits()) {
         case 32: return "__popc";
         case 64: return "__popcll";
@@ -108,7 +106,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp")
 .set_body(DispatchExtern<CUDAFastMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp2")
-.set_body(DispatchExtern<CUDAFastMath>);
+.set_body(DispatchExtern<CUDAMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp10")
 .set_body(DispatchExtern<CUDAFastMath>);
@@ -132,13 +130,13 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.cos")
 .set_body(DispatchExtern<CUDAFastMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.cosh")
-.set_body(DispatchExtern<CUDAFastMath>);
+.set_body(DispatchExtern<CUDAMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.sin")
 .set_body(DispatchExtern<CUDAFastMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.sinh")
-.set_body(DispatchExtern<CUDAFastMath>);
+.set_body(DispatchExtern<CUDAMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.atan")
 .set_body(DispatchExtern<CUDAMath>);
diff --git a/tests/python/unittest/test_target_codegen_cuda.py b/tests/python/unittest/test_target_codegen_cuda.py
index 083cedeaf068d..e8c6cd1925a89 100644
--- a/tests/python/unittest/test_target_codegen_cuda.py
+++ b/tests/python/unittest/test_target_codegen_cuda.py
@@ -348,6 +348,125 @@ def test_cuda_floordiv_with_vectorization():
         func(a_nd, b_nd)
         tvm.testing.assert_allclose(b_nd.asnumpy(), b_np, rtol=1e-3)
 
+def sched(B):
+    s = te.create_schedule(B.op)
+    io, ii = s[B].split(s[B].op.axis[0], nparts=1)
+    iio, iii = s[B].split(ii, nparts=32)
+    _, iiii = s[B].split(iii, factor=4)
+    s[B].vectorize(iiii)
+    s[B].bind(io, bx)
+    s[B].bind(iio, tx)
+    return s
+
+def test_vectorized_intrin1():
+    test_funcs = [
+        (tvm.tir.floor, lambda x : np.floor(x)),
+        (tvm.tir.ceil, lambda x : np.ceil(x)),
+        (tvm.tir.trunc, lambda x : np.trunc(x)),
+        (tvm.tir.abs, lambda x : np.fabs(x)),
+        (tvm.tir.round, lambda x : np.round(x)),
+        (tvm.tir.exp, lambda x : np.exp(x)),
+        (tvm.tir.exp2, lambda x : np.exp2(x)),
+        (tvm.tir.exp10, lambda x : np.power(10, x)),
+        (tvm.tir.log, lambda x : np.log(x)),
+        (tvm.tir.log2, lambda x : np.log2(x)),
+        (tvm.tir.log10, lambda x : np.log10(x)),
+        (tvm.tir.tan, lambda x : np.tan(x)),
+        (tvm.tir.cos, lambda x : np.cos(x)),
+        (tvm.tir.cosh, lambda x : np.cosh(x)),
+        (tvm.tir.sin, lambda x : np.sin(x)),
+        (tvm.tir.sinh, lambda x : np.sinh(x)),
+        (tvm.tir.atan, lambda x : np.arctan(x)),
+        (tvm.tir.tanh, lambda x : np.tanh(x)),
+        (tvm.tir.sqrt, lambda x : np.sqrt(x)),
+    ]
+    def run_test(tvm_intrin, np_func, dtype):
+        if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
+            print("Skip because cuda is not enabled.")
+            return
+        if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
+            print("Skip because gpu does not have fp16 support")
+            return
+        # Set of intrinsics that do not support fp16 yet.
+        skip_set = {tvm.tir.abs,
+                    tvm.tir.round,
+                    tvm.tir.tan,
+                    tvm.tir.atan,
+                    tvm.tir.tanh,
+                    tvm.tir.cosh,
+                    tvm.tir.sinh}
+        if dtype == "float16" and tvm_intrin in skip_set:
+            print("Skip because '{0}' does not support fp16 yet".format(tvm_intrin.__name__))
+            return
+
+        n = 128
+        A = te.placeholder((n,), dtype=dtype, name='A')
+        B = te.compute((n,), lambda *i: tvm_intrin(A(*i)), name='B')
+        s = sched(B)
+        f = tvm.build(s, [A, B], "cuda")
+        ctx = tvm.gpu(0)
+        a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), ctx)
+        f(a, b)
+        tvm.testing.assert_allclose(b.asnumpy(), np_func(a.asnumpy()), atol=1e-3, rtol=1e-3)
+
+    for func in test_funcs:
+        run_test(*func, "float32")
+        run_test(*func, "float16")
+
+def test_vectorized_intrin2(dtype="float32"):
+    c2 = tvm.tir.const(2, dtype=dtype)
+    test_funcs = [
+        (tvm.tir.power, lambda x : np.power(x, 2.0)),
+        (tvm.tir.fmod, lambda x : np.fmod(x, 2.0))
+    ]
+    def run_test(tvm_intrin, np_func):
+        if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
+            print("Skip because cuda is not enabled.")
+            return
+
+        n = 128
+        A = te.placeholder((n,), dtype=dtype, name='A')
+        B = te.compute((n,), lambda i: tvm_intrin(A[i], c2), name='B')
+        s = sched(B)
+        f = tvm.build(s, [A, B], "cuda")
+        ctx = tvm.gpu(0)
+        a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), ctx)
+        f(a, b)
+        tvm.testing.assert_allclose(b.asnumpy(), np_func(a.asnumpy()), atol=1e-3, rtol=1e-3)
+
+    for func in test_funcs:
+        run_test(*func)
+
+def test_vectorized_popcount():
+    def ref_popcount(x):
+        cnt = 0
+        while x:
+            x -= x & -x
+            cnt += 1
+        return cnt
+
+    def run_test(dtype):
+        if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
+            print("Skip because cuda is not enabled.")
+            return
+
+        n = 128
+        A = te.placeholder((n,), dtype=dtype, name='A')
+        B = te.compute((n,), lambda i: tvm.tir.popcount(A[i]), name='B')
+        s = sched(B)
+        f = tvm.build(s, [A, B], "cuda")
+        ctx = tvm.gpu(0)
+        a = tvm.nd.array(np.random.randint(0, 100000, size=n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.zeros(shape=(n,)).astype(B.dtype), ctx)
+        f(a, b)
+        ref = np.vectorize(ref_popcount)(a.asnumpy())
+        tvm.testing.assert_allclose(b.asnumpy(), ref)
+
+    run_test("uint32")
+    run_test("uint64")
+
 if __name__ == "__main__":
     test_cuda_vectorize_add()
     test_cuda_multiply_add()
@@ -359,4 +478,7 @@ def test_cuda_floordiv_with_vectorization():
     test_rfactor_predicates()
     test_cuda_const_float_to_half()
     test_cuda_reduction()
-    test_cuda_floordiv_with_vectorization()
\ No newline at end of file
+    test_cuda_floordiv_with_vectorization()
+    test_vectorized_intrin1()
+    test_vectorized_intrin2()
+    test_vectorized_popcount()
\ No newline at end of file
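
---

Reviewer note: the sketch below shows the shape of device code the new
scalarization path is expected to produce for a float32x4 `exp`. It is
hand-written for illustration, not actual codegen output: the kernel and
buffer names and the _arg/_ret identifiers are hypothetical (the real
codegen picks fresh names via GetUniqueName), and the launch geometry is
assumed. The pattern itself follows the comment added in codegen_cuda.cc:
one vector load per argument, one scalar intrinsic call per lane, and one
vector store for the result.

// Hypothetical CUDA kernel approximating the emitted code for
// B = exp(A) vectorized by 4 lanes: expf has no float4 overload,
// so each lane gets a scalar call while memory accesses stay vectorized.
extern "C" __global__ void hypothetical_vec_exp(const float* __restrict__ A,
                                                float* __restrict__ B) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  float4 _arg = ((const float4*)A)[i];  // one 16-byte vector load
  float4 _ret;
  {
    _ret.x = expf(_arg.x);  // scalar intrinsic call per lane
    _ret.y = expf(_arg.y);
    _ret.z = expf(_arg.z);
    _ret.w = expf(_arg.w);
  }
  ((float4*)B)[i] = _ret;  // one 16-byte vector store
}

The point of keeping loads/stores vectorized is memory traffic: the math
intrinsics stay scalar, but each thread still moves its four lanes in
single float4 transactions.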