From 6633c2dbbef6780fceb213d572007eaeec86af3d Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Fri, 19 Jun 2020 23:14:15 +0000 Subject: [PATCH 1/3] [OpenCL] Fix atomic add used by get_valid_counts --- src/target/source/codegen_opencl.cc | 23 +++++++++++++++++++++++ src/target/source/codegen_opencl.h | 1 + topi/python/topi/cuda/nms.py | 7 +++++++ 3 files changed, 31 insertions(+) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 8616853d8883..04f60a65264e 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -70,6 +70,10 @@ std::string CodeGenOpenCL::Finish() { "#endif\n\n"; } + // Enable atomic_add used by get_valid_counts. Only needed for OpenCL < 1.1. + decl_stream << "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n\n"; + return CodeGenC::Finish(); } @@ -224,6 +228,25 @@ std::string CodeGenOpenCL::CastFromTo(std::string value, DataType from, DataType return os.str(); } +void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { + if (op->is_intrinsic(intrinsic::tvm_address_of)) { + // Overload tvm_address_of to add storage scope. + const LoadNode* l = op->args[0].as(); + CHECK(op->args.size() == 1 && l); + os << "(("; + auto it = alloc_storage_scope_.find(l->buffer_var.get()); + if (it != alloc_storage_scope_.end()) { + PrintStorageScope(it->second, os); + } + this->PrintType(l->dtype.element_of(), os); + os << " *)" << this->GetVarID(l->buffer_var.get()) << " + "; + this->PrintExpr(l->index, os); + os << ')'; + } else { + CodeGenC::VisitExpr_(op, os); + } +} + void CodeGenOpenCL::VisitExpr_(const BroadcastNode* op, std::ostream& os) { // NOLINT(*) std::string v = PrintExpr(op->value); os << "(("; diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index 32a98e4d87ea..8520979811c1 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -54,6 +54,7 @@ class CodeGenOpenCL final : public CodeGenC { std::string CastFromTo(std::string value, DataType from, DataType target); // NOLINT(*) // overload visitor + void VisitExpr_(const CallNode* op, std::ostream& os) final; // NOLINT(*) void VisitExpr_(const BroadcastNode* op, std::ostream& os) final; // NOLINT(*) void VisitExpr_(const FloatImmNode* op, std::ostream& os) final; // NOLINT(*) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 9f46b95297c3..4772080a60eb 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -34,10 +34,17 @@ def cuda_atomic_add_rule(op): return tvm.tir.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1]) raise RuntimeError("only support int32, float32 and float64") +def opencl_atomic_add_rule(op): + if op.dtype == "int32": + return tvm.tir.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1]) + raise RuntimeError("only support int32") tvm.target.intrin.register_intrin_rule( "cuda", "atomic_add", cuda_atomic_add_rule, override=True) +tvm.target.intrin.register_intrin_rule( + "opencl", "atomic_add", opencl_atomic_add_rule, override=True) + tvm.ir.register_op_attr("tir.atomic_add", "TCallEffectKind", tvm.tir.CallEffectKind.Opaque) def atomic_add(x, y): From 0e8c38e70c8ef5cb09bac215ed1aa42235a861a1 Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Mon, 22 Jun 2020 21:03:32 +0000 Subject: [PATCH 2/3] Rename l -> load, add flag to enable atomics --- src/target/source/codegen_opencl.cc | 27 +++++++++++++++++---------- src/target/source/codegen_opencl.h | 2 ++ 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 04f60a65264e..55d9e70be25a 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -71,9 +71,10 @@ std::string CodeGenOpenCL::Finish() { } // Enable atomic_add used by get_valid_counts. Only needed for OpenCL < 1.1. - decl_stream << "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" - "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n\n"; - + if (enable_atomics_) { + decl_stream << "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n\n"; + } return CodeGenC::Finish(); } @@ -230,18 +231,24 @@ std::string CodeGenOpenCL::CastFromTo(std::string value, DataType from, DataType void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { if (op->is_intrinsic(intrinsic::tvm_address_of)) { - // Overload tvm_address_of to add storage scope. - const LoadNode* l = op->args[0].as(); - CHECK(op->args.size() == 1 && l); + // Overload tvm_address_of to add storage scope (e.g. __global). + const LoadNode* load = op->args[0].as(); + CHECK(op->args.size() == 1 && load); os << "(("; - auto it = alloc_storage_scope_.find(l->buffer_var.get()); + auto it = alloc_storage_scope_.find(load->buffer_var.get()); if (it != alloc_storage_scope_.end()) { PrintStorageScope(it->second, os); } - this->PrintType(l->dtype.element_of(), os); - os << " *)" << this->GetVarID(l->buffer_var.get()) << " + "; - this->PrintExpr(l->index, os); + this->PrintType(load->dtype.element_of(), os); + os << " *)" << this->GetVarID(load->buffer_var.get()) << " + "; + this->PrintExpr(load->index, os); os << ')'; + } else if (op->call_type == CallNode::Extern || op->call_type == CallNode::PureExtern) { + // Enable atomics extension if used. + if (op->name == "atomic_add") { + enable_atomics_ = true; + } + CodeGenC::VisitExpr_(op, os); } else { CodeGenC::VisitExpr_(op, os); } diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index 8520979811c1..32102fec22b9 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -62,6 +62,8 @@ class CodeGenOpenCL final : public CodeGenC { // whether enable fp16 and fp64 extension bool enable_fp16_{false}; bool enable_fp64_{false}; + // Whether to enable atomics extension. + bool enable_atomics_{false}; }; } // namespace codegen From 1db9f1ad041602d37a989da1b824d0456a3b66ef Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Mon, 22 Jun 2020 21:03:54 +0000 Subject: [PATCH 3/3] Opencl doesn't do data rearrangement --- src/target/source/codegen_opencl.cc | 7 ++++--- tests/python/relay/test_op_level5.py | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 55d9e70be25a..21e5ed66403f 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -230,7 +230,7 @@ std::string CodeGenOpenCL::CastFromTo(std::string value, DataType from, DataType } void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { - if (op->is_intrinsic(intrinsic::tvm_address_of)) { + if (op->op.same_as(builtin::address_of())) { // Overload tvm_address_of to add storage scope (e.g. __global). const LoadNode* load = op->args[0].as(); CHECK(op->args.size() == 1 && load); @@ -243,9 +243,10 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { os << " *)" << this->GetVarID(load->buffer_var.get()) << " + "; this->PrintExpr(load->index, os); os << ')'; - } else if (op->call_type == CallNode::Extern || op->call_type == CallNode::PureExtern) { + } else if (op->op.same_as(builtin_call_extern_)) { + auto func = Downcast(op->args[0]); // Enable atomics extension if used. - if (op->name == "atomic_add") { + if (func->value == "atomic_add") { enable_atomics_ = true; } CodeGenC::VisitExpr_(op, os); diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 265db43d9904..3a94fc69e001 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -270,8 +270,8 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): intrp = relay.create_executor("debug", ctx=ctx, target=target) out = intrp.evaluate(func)(np_data) tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04) - # get_valid_count for cuda doesn't do data rearrangement - if target == 'cuda': + # get_valid_count for cuda, opencl doesn't do data rearrangement + if target in ['cuda', 'opencl']: return tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04) tvm.testing.assert_allclose(out[2].asnumpy(), np_out3, rtol=1e-3, atol=1e-04)