diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc
index 444dc996b10f..ac13f8a50091 100644
--- a/src/target/source/codegen_c.cc
+++ b/src/target/source/codegen_c.cc
@@ -668,15 +668,7 @@ void CodeGenC::VisitExpr_(const LoadNode* op, std::ostream& os) { // NOLINT(*)
       std::string ref = GetVecLoad(op->dtype, op->buffer_var.get(), base);
       HandleVolatileLoads(ref, op, os);
     } else {
-      // The assignment below introduces side-effect, and the resulting value cannot
-      // be reused across multiple expression, thus a new scope is needed
-      int vec_scope = BeginScope();
-
-      // load seperately.
-      std::string svalue = GetUniqueName("_");
-      this->PrintIndent();
-      this->PrintType(op->dtype, stream);
-      stream << ' ' << svalue << ";\n";
+      std::ostringstream svalue_expr;
       std::string sindex = SSAGetID(PrintExpr(op->index), op->index.dtype());
       std::string vid = GetVarID(op->buffer_var.get());
       DataType elem_type = op->dtype.element_of();
@@ -699,10 +691,9 @@ void CodeGenC::VisitExpr_(const LoadNode* op, std::ostream& os) { // NOLINT(*)
         value_temp << '[';
         PrintVecElemLoad(sindex, op->index.dtype(), i, value_temp);
         value_temp << ']';
-        PrintVecElemStore(svalue, op->dtype, i, value_temp.str());
+        PrintVecElemLoadExpr(op->dtype, i, value_temp.str(), svalue_expr);
       }
-      os << svalue;
-      EndScope(vec_scope);
+      os << svalue_expr.str();
     }
   }
 }
@@ -955,5 +946,36 @@ void CodeGenC::VisitStmt_(const ProducerConsumerNode* op) {
   PrintStmt(op->body);
 }
 
+// Append the text for lane `i` of a vector load of type `t` to `os`.
+// The pieces written for i = 0 .. t.lanes() - 1 concatenate into a single
+// expression: 8-bit int/uint lanes are shifted into byte `i` and OR-ed
+// together; any other type becomes "((<vector type>)(v0,v1,...))".
+void CodeGenC::PrintVecElemLoadExpr(
+    DataType t, int i, const std::string& value, std::ostream& os) {
+  CHECK_GT(t.lanes(), 1);
+  if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
+    if (i != 0) {
+      os << "|";
+    }
+    os << "((0x000000ff << " << i * 8 << ") & (" << value << " << " << i * 8 << "))";
+    return;
+  }
+
+  if (i == 0) {
+    os << "((";
+    PrintType(t, os);
+    // t is already the full vector type, so PrintType is expected to emit the
+    // lane suffix itself; appending t.lanes() again would print e.g. "float44".
+    os << ")(";
+  }
+  os << value;
+  if (i != t.lanes() - 1) {
+    os << ",";
+  } else {
+    os << "))";
+  }
+  return;
+}
+
 } // namespace codegen
 } // namespace tvm
diff --git a/src/target/source/codegen_c.h b/src/target/source/codegen_c.h
index 30ad890c923d..49139de2fd1c 100644
--- a/src/target/source/codegen_c.h
+++ b/src/target/source/codegen_c.h
@@ -191,6 +191,8 @@ class CodeGenC :
       const std::string& vec, DataType t, int i, const std::string& value);
   // Get a cast type from to
   virtual std::string CastFromTo(std::string value, DataType from, DataType target);
+  // Get load of single element with expression
+  virtual void PrintVecElemLoadExpr(DataType t, int i, const std::string& value, std::ostream& os);
 
  protected:
   // Print reference to struct location
diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
index 9c4fc69a9d78..c7971cef1bf6 100644
--- a/src/target/source/codegen_cuda.cc
+++ b/src/target/source/codegen_cuda.cc
@@ -591,13 +591,17 @@ void CodeGenCUDA::VisitExpr_(const RampNode* op, std::ostream& os) {
 }
 
 void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) { // NOLINT(*)
-  if (op->dtype.is_int() && op->dtype.bits() == 8 && op->lanes == 4) {
+  if ((op->dtype.is_int() || op->dtype.is_uint()) && op->dtype.bits() == 8 && op->lanes == 4) {
     // make_int8x4
     const int64_t *p = as_const_int(op->value);
     CHECK(p);
     int64_t v = *p & 0xFF;
     v = (v << 24) | (v << 16) | (v << 8) | v;
-    os << "(int)" << v;
+    if (op->dtype.is_uint()) {
+      os << "(uint)" << v;
+    } else {
+      os << "(int)" << v;
+    }
     return;
   }
 
@@ -796,5 +800,49 @@ void CodeGenCUDA::HandleVolatileLoads(const std::string& value,
   }
 }
 
+void CodeGenCUDA::PrintVecElemLoadExpr(
+    DataType t, int i, const std::string& value, std::ostream& os) {
+  CHECK_GT(t.lanes(), 1);
+  if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
+    if (i != 0) {
+      os << "|";
+    }
+    os << "((0x000000ff << " << i * 8 << ") & (" << value << " << " << i * 8 << "))";
+    return;
+  }
+
+  if (t.is_float16()) {
+    if (i == 0) {
+      os << "make_";
+      PrintType(t, os);
+      os << '(';
+    }
+    if (i % 2 == 0) {
+      os << "__pack_half2(" << value;
+    } else {
+      os << "," << value << ")";
+      if (i != t.lanes() - 1) {
+        os << ",";
+      } else {
+        os << ")";
+      }
+    }
+    return;
+  }
+
+  if (i == 0) {
+    os << "make_";
+    PrintType(t, os);
+    os << "(";
+  }
+  os << value;
+  if (i != t.lanes() - 1) {
+    os << ",";
+  } else {
+    os << ")";
+  }
+  return;
+}
+
 } // namespace codegen
 } // namespace tvm
diff --git a/src/target/source/codegen_cuda.h b/src/target/source/codegen_cuda.h
index 6ba748755d5b..d1db7047b1b6 100644
--- a/src/target/source/codegen_cuda.h
+++ b/src/target/source/codegen_cuda.h
@@ -55,6 +55,7 @@ class CodeGenCUDA final : public CodeGenC {
   void PrintVecElemStore(
       const std::string& vec, DataType t, int i, const std::string& value) final;
   void BindThreadIndex(const IterVar& iv) final; // NOLINT(*)
+  void PrintVecElemLoadExpr(DataType t, int i, const std::string& value, std::ostream& os) final;
   // overload visitor
   void VisitExpr_(const RampNode* op, std::ostream& os) final; // NOLINT(*)
   void VisitExpr_(const ShuffleNode* op, std::ostream& os) final; // NOLINT(*)
diff --git a/src/target/source/literal/cuda_half_t.h b/src/target/source/literal/cuda_half_t.h
index fd0652afb0d4..858ac8572a08 100644
--- a/src/target/source/literal/cuda_half_t.h
+++ b/src/target/source/literal/cuda_half_t.h
@@ -291,7 +291,7 @@ static inline __device__ __host__ unsigned
 __pack_half2(const half x, const half y) {
   unsigned v0 = *((unsigned short *)&x);
   unsigned v1 = *((unsigned short *)&y);
-  return (v0 << 16) | v1;
+  return (v1 << 16) | v0;
 }
 )";
 
diff --git a/tests/python/unittest/test_target_codegen_cuda.py b/tests/python/unittest/test_target_codegen_cuda.py
index bb162f41861d..739fc6fda76d 100644
--- a/tests/python/unittest/test_target_codegen_cuda.py
+++ b/tests/python/unittest/test_target_codegen_cuda.py
@@ -543,6 +543,44 @@ def run_test(dtype):
     run_test("uint32")
     run_test("uint64")
 
+def test_cuda_vectorize_load_permute_pad():
+    def check_cuda(dtype, n, l, padding, lanes):
+        if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
+            print("skip because cuda is not enabled..")
+            return
+        if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
+            print("Skip because gpu does not have fp16 support")
+            return
+
+        ctx = tvm.gpu(0)
+        A = tvm.te.placeholder((n, l), name='A', dtype=dtype)
+        B = tvm.te.compute((n // lanes, l + 2 * padding, lanes),
+                           lambda i, j, k: tvm.te.if_then_else(
+            tvm.te.any(j < padding, j >= l + padding),
+            tvm.runtime.convert(0).astype(dtype), A[i * lanes + k, j - padding]),
+            name='B')
+        s = te.create_schedule(B.op)
+        block, thread, vectorize = s[B].op.axis
+        s[B].bind(block, bx)
+        s[B].bind(thread, tx)
+        s[B].vectorize(vectorize)
+        fun = tvm.build(s, [A, B], "cuda", name="vector_load_permute_pad")
+        np_a = np.random.randint(
+            low=-128, high=127, size=(n, l)).astype(A.dtype)
+        a = tvm.nd.empty((n, l), A.dtype, ctx).copyfrom(np_a)
+        b = tvm.nd.empty((n // lanes, l + padding * 2, lanes), B.dtype, ctx)
+        fun(a, b)
+        np_a_reshape = np_a.reshape(n // lanes, lanes, l).transpose(0, 2, 1)
+        ref = np.pad(np_a_reshape, ((0, 0), (padding, padding),
+                                    (0, 0)), mode='constant', constant_values=0)
+        tvm.testing.assert_allclose(b.asnumpy(), ref)
+
+    check_cuda("int8", 64, 16, 3, 4)
+    check_cuda("uint8", 64, 16, 3, 4)
+    check_cuda("int32", 64, 16, 3, 4)
+    check_cuda("float16", 64, 16, 3, 4)
+    check_cuda("float32", 64, 16, 3, 4)
+
 if __name__ == "__main__":
     test_cuda_vectorize_add()
     test_cuda_multiply_add()
@@ -560,3 +598,4 @@ def run_test(dtype):
     test_vectorized_intrin1()
     test_vectorized_intrin2()
     test_vectorized_popcount()
+    test_cuda_vectorize_load_permute_pad()