[CODEGEN][CUDA] Fix vector load #5226

Merged: 3 commits, Apr 14, 2020
40 changes: 28 additions & 12 deletions src/target/source/codegen_c.cc
@@ -668,15 +668,7 @@ void CodeGenC::VisitExpr_(const LoadNode* op, std::ostream& os) { // NOLINT(*)
std::string ref = GetVecLoad(op->dtype, op->buffer_var.get(), base);
HandleVolatileLoads(ref, op, os);
} else {
- // The assignment below introduces side-effect, and the resulting value cannot
- // be reused across multiple expression, thus a new scope is needed
- int vec_scope = BeginScope();
-
- // load seperately.
- std::string svalue = GetUniqueName("_");
- this->PrintIndent();
- this->PrintType(op->dtype, stream);
- stream << ' ' << svalue << ";\n";
+ std::ostringstream svalue_expr;
std::string sindex = SSAGetID(PrintExpr(op->index), op->index.dtype());
std::string vid = GetVarID(op->buffer_var.get());
DataType elem_type = op->dtype.element_of();
@@ -699,10 +691,9 @@ void CodeGenC::VisitExpr_(const LoadNode* op, std::ostream& os) { // NOLINT(*)
value_temp << '[';
PrintVecElemLoad(sindex, op->index.dtype(), i, value_temp);
value_temp << ']';
- PrintVecElemStore(svalue, op->dtype, i, value_temp.str());
+ PrintVecElemLoadExpr(op->dtype, i, value_temp.str(), svalue_expr);
}
- os << svalue;
- EndScope(vec_scope);
+ os << svalue_expr.str();
}
}
}
@@ -955,5 +946,30 @@ void CodeGenC::VisitStmt_(const ProducerConsumerNode* op) {
PrintStmt(op->body);
}

void CodeGenC::PrintVecElemLoadExpr(
DataType t, int i, const std::string& value, std::ostream& os) {
CHECK_GT(t.lanes(), 1);
if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
if (i != 0) {
os << "|";
}
os << "((0x000000ff << " << i * 8 << ") & (" << value << " << " << i * 8 << "))";
return;
}

if (i == 0) {
os << "((";
PrintType(t, os);
os << t.lanes() << ")(";
}
os << value;
if (i != t.lanes() - 1) {
os << ",";
} else {
os << "))";
}
return;
}

} // namespace codegen
} // namespace tvm
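
For context, a minimal sketch (not part of the diff) of the kind of expression the new 8-bit branch of PrintVecElemLoadExpr builds for a 4-lane int8 load; buf and base are hypothetical names standing in for the generated per-lane load expressions:

// Sketch only: the four byte loads are masked, shifted into their lane
// positions (lane i -> bits 8*i..8*i+7), and OR-ed into one 32-bit value.
// Because the whole load is now a single pure expression, the old temporary
// variable and the extra scope around it are no longer needed.
int packed = ((0x000000ff << 0)  & (buf[base + 0] << 0))
           | ((0x000000ff << 8)  & (buf[base + 1] << 8))
           | ((0x000000ff << 16) & (buf[base + 2] << 16))
           | ((0x000000ff << 24) & (buf[base + 3] << 24));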
2 changes: 2 additions & 0 deletions src/target/source/codegen_c.h
@@ -191,6 +191,8 @@ class CodeGenC :
const std::string& vec, DataType t, int i, const std::string& value);
// Get a cast type from to
virtual std::string CastFromTo(std::string value, DataType from, DataType target);
// Get load of single element with expression
virtual void PrintVecElemLoadExpr(DataType t, int i, const std::string& value, std::ostream& os);

protected:
// Print reference to struct location
52 changes: 50 additions & 2 deletions src/target/source/codegen_cuda.cc
@@ -591,13 +591,17 @@ void CodeGenCUDA::VisitExpr_(const RampNode* op, std::ostream& os) {
}

void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) { // NOLINT(*)
- if (op->dtype.is_int() && op->dtype.bits() == 8 && op->lanes == 4) {
+ if ((op->dtype.is_int() || op->dtype.is_uint()) && op->dtype.bits() == 8 && op->lanes == 4) {
// make_int8x4
const int64_t *p = as_const_int(op->value);
CHECK(p);
int64_t v = *p & 0xFF;
v = (v << 24) | (v << 16) | (v << 8) | v;
os << "(int)" << v;
if (op->dtype.is_uint()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we care the signedness? this just downcasts to 32 bits,.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TVM uses uint to store unit8x4 (in function PrintType). The care will generate code like unit x = (unit)y, instead of unit x = (int)y. And what is your further opinion?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we keep it as is? I do not see benefits from this change. Otherwise the entire PR LGTM. Thanks!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's not necessary to revert this change, if it's harmless. Consider that CodeGenCUDA::PrintType for uint8x4 generates "uint", this change somehow makes sense.

os << "(uint)" << v;
} else {
os << "(int)" << v;
}
return;
}
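
For illustration (not from the PR), broadcasting the constant 3 across four 8-bit lanes would now produce something like the following, where the carrier type matches what PrintType emits for the vector type:

// Hypothetical generated snippets for broadcasting the constant 3 to 4 lanes.
int  broadcast_int8x4  = (int)50529027;   // 0x03030303: the byte 0x03 replicated into all four lanes
uint broadcast_uint8x4 = (uint)50529027;  // same packed bytes, cast to the unsigned carrier type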

@@ -796,5 +800,49 @@ void CodeGenCUDA::HandleVolatileLoads(const std::string& value,
}
}

void CodeGenCUDA::PrintVecElemLoadExpr(
DataType t, int i, const std::string& value, std::ostream& os) {
CHECK_GT(t.lanes(), 1);
if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
if (i != 0) {
os << "|";
}
os << "((0x000000ff << " << i * 8 << ") & (" << value << " << " << i * 8 << "))";
return;
}

if (t.is_float16()) {
if (i == 0) {
os << "make_";
PrintType(t, os);
os << '(';
}
if (i % 2 == 0) {
os << "__pack_half2(" << value;
} else {
os << "," << value << ")";
if (i != t.lanes() - 1) {
os << ",";
} else {
os << ")";
}
}
return;
}

if (i == 0) {
os << "make_";
PrintType(t, os);
os << "(";
}
os << value;
if (i != t.lanes() - 1) {
os << ",";
} else {
os << ")";
}
return;
}

} // namespace codegen
} // namespace tvm
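
As a hedged sketch of the float16 branch above: consecutive lanes are paired with __pack_half2, and, assuming PrintType renders a 4-lane half vector with a uint2 carrier (as this codegen does elsewhere for fp16 vectors), the emitted load expression looks roughly like this, with a, b, c, d standing in for the four per-lane load expressions:

// Sketch only: lanes (a, b) form the first packed 32-bit word, (c, d) the second.
uint2 v = make_uint2(__pack_half2(a, b), __pack_half2(c, d));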
1 change: 1 addition & 0 deletions src/target/source/codegen_cuda.h
@@ -55,6 +55,7 @@ class CodeGenCUDA final : public CodeGenC {
void PrintVecElemStore(
const std::string& vec, DataType t, int i, const std::string& value) final;
void BindThreadIndex(const IterVar& iv) final; // NOLINT(*)
void PrintVecElemLoadExpr(DataType t, int i, const std::string& value, std::ostream& os) final;
// overload visitor
void VisitExpr_(const RampNode* op, std::ostream& os) final; // NOLINT(*)
void VisitExpr_(const ShuffleNode* op, std::ostream& os) final; // NOLINT(*)
2 changes: 1 addition & 1 deletion src/target/source/literal/cuda_half_t.h
@@ -291,7 +291,7 @@ static inline __device__ __host__ unsigned
__pack_half2(const half x, const half y) {
unsigned v0 = *((unsigned short *)&x);
unsigned v1 = *((unsigned short *)&y);
- return (v0 << 16) | v1;
+ return (v1 << 16) | v0;

Review comment:
Contributor: good catch!
}
)";

39 changes: 39 additions & 0 deletions tests/python/unittest/test_target_codegen_cuda.py
@@ -543,6 +543,44 @@ def run_test(dtype):
run_test("uint32")
run_test("uint64")

def test_cuda_vectorize_load_permute_pad():
def check_cuda(dtype, n, l, padding, lanes):
if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
print("skip because cuda is not enabled..")
return
if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
print("Skip because gpu does not have fp16 support")
return

Review thread:
Contributor: Check if float16 is supported.
Contributor (author): Already checked.

ctx = tvm.gpu(0)
A = tvm.te.placeholder((n, l), name='A', dtype=dtype)
B = tvm.te.compute((n // lanes, l + 2 * padding, lanes),
lambda i, j, k: tvm.te.if_then_else(
tvm.te.any(j < padding, j >= l + padding),
tvm.runtime.convert(0).astype(dtype), A[i * lanes + k, j - padding]),
name='B')
s = te.create_schedule(B.op)
block, thread, vectorize = s[B].op.axis
s[B].bind(block, bx)
s[B].bind(thread, tx)
s[B].vectorize(vectorize)
fun = tvm.build(s, [A, B], "cuda", name="vector_load_permute_pad")
np_a = np.random.randint(
low=-128, high=127, size=(n, l)).astype(A.dtype)
a = tvm.nd.empty((n, l), A.dtype, ctx).copyfrom(np_a)
b = tvm.nd.empty((n // lanes, l + padding * 2, lanes), B.dtype, ctx)
fun(a, b)
np_a_reshape = np_a.reshape(n // lanes, lanes, l).transpose(0, 2, 1)
ref = np.pad(np_a_reshape, ((0, 0), (padding, padding),
(0, 0)), mode='constant', constant_values=0)
tvm.testing.assert_allclose(b.asnumpy(), ref)

check_cuda("int8", 64, 16, 3, 4)
Review thread:
Contributor: uint8 test?
Contributor (author): Already added uint8 test.
check_cuda("uint8", 64, 16, 3, 4)
check_cuda("int32", 64, 16, 3, 4)
check_cuda("float16", 64, 16, 3, 4)
check_cuda("float32", 64, 16, 3, 4)

if __name__ == "__main__":
test_cuda_vectorize_add()
test_cuda_multiply_add()
@@ -560,3 +598,4 @@ def run_test(dtype):
test_vectorized_intrin1()
test_vectorized_intrin2()
test_vectorized_popcount()
test_cuda_vectorize_load_permute_pad()