From 82a67bfb702e9df499b41ad76f852f47d5cd7d31 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 3 Nov 2017 15:55:17 +0800 Subject: [PATCH 001/948] [DLPack] Upgrade dlpack to 0.2 (#609) --- apps/howto_deploy/cpp_deploy.cc | 4 +-- dlpack | 2 +- include/tvm/packed_func_ext.h | 4 +-- include/tvm/runtime/packed_func.h | 36 +++++++++---------- jvm/native/src/main/native/jni_helper_func.h | 6 ++-- .../main/native/ml_dmlc_tvm_native_c_api.cc | 4 +-- src/api/api_lang.cc | 4 +-- src/codegen/llvm/codegen_amdgpu.cc | 2 +- src/codegen/llvm/codegen_llvm.cc | 2 +- src/codegen/llvm/codegen_nvptx.cc | 2 +- src/codegen/stack_vm/stack_vm.h | 12 +++---- src/codegen/verilog/vpi_device_api.cc | 4 +-- src/contrib/cblas/cblas.cc | 6 ++-- src/contrib/cudnn/cudnn_utils.cc | 6 ++-- src/contrib/nnpack/convolution.cc | 16 ++++----- src/contrib/nnpack/fully_connected.cc | 12 +++---- src/pass/lower_tvm_builtin.cc | 2 +- src/pass/make_api.cc | 7 ++-- src/runtime/c_runtime_api.cc | 22 ++++++------ src/runtime/cpu_device_api.cc | 2 +- src/runtime/cuda/cuda_device_api.cc | 8 ++--- src/runtime/graph/graph_runtime.cc | 2 +- src/runtime/metal/metal_common.h | 8 ++--- src/runtime/metal/metal_device_api.mm | 8 ++--- src/runtime/opencl/opencl_common.h | 6 ++-- src/runtime/opencl/opencl_device_api.cc | 6 ++-- src/runtime/pack_args.h | 6 ++-- src/runtime/rocm/rocm_device_api.cc | 8 ++--- src/runtime/rpc/rpc_device_api.cc | 4 +-- src/runtime/rpc/rpc_session.cc | 30 ++++++++-------- tests/cpp/packed_func_test.cc | 2 +- 31 files changed, 122 insertions(+), 121 deletions(-) diff --git a/apps/howto_deploy/cpp_deploy.cc b/apps/howto_deploy/cpp_deploy.cc index e3a88550dc2b..1fd22e5f2b5f 100644 --- a/apps/howto_deploy/cpp_deploy.cc +++ b/apps/howto_deploy/cpp_deploy.cc @@ -28,10 +28,10 @@ void Verify(tvm::runtime::Module mod, std::string fname) { DLTensor* x; DLTensor* y; int ndim = 1; - int dtype_code = kFloat; + int dtype_code = kDLFloat; int dtype_bits = 32; int dtype_lanes = 1; - int device_type = kCPU; + int device_type = kDLCPU; int device_id = 0; int64_t shape[1] = {10}; TVMArrayAlloc(shape, ndim, dtype_code, dtype_bits, dtype_lanes, diff --git a/dlpack b/dlpack index 9422e98f3f4d..10892ac964f1 160000 --- a/dlpack +++ b/dlpack @@ -1 +1 @@ -Subproject commit 9422e98f3f4dafc6bc3473cf8484543ad376aab6 +Subproject commit 10892ac964f1af7c81aae145cd3fab78bbccd297 diff --git a/include/tvm/packed_func_ext.h b/include/tvm/packed_func_ext.h index 1f66232baacc..5242a057659b 100644 --- a/include/tvm/packed_func_ext.h +++ b/include/tvm/packed_func_ext.h @@ -105,10 +105,10 @@ inline TNodeRef TVMArgValue::AsNodeRef() const { inline TVMArgValue::operator Halide::Expr() const { if (type_code_ == kNull) return Expr(); - if (type_code_ == kInt) { + if (type_code_ == kDLInt) { return Expr(static_cast(value_.v_int64)); } - if (type_code_ == kFloat) { + if (type_code_ == kDLFloat) { return Expr(static_cast(value_.v_float64)); } TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle); diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index 41537760e46d..4c08ffa6077c 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -217,25 +217,25 @@ class ExtTypeVTable { class TVMPODValue_ { public: operator double() const { - TVM_CHECK_TYPE_CODE(type_code_, kFloat); + TVM_CHECK_TYPE_CODE(type_code_, kDLFloat); return value_.v_float64; } operator int64_t() const { - TVM_CHECK_TYPE_CODE(type_code_, kInt); + TVM_CHECK_TYPE_CODE(type_code_, kDLInt); return value_.v_int64; } operator uint64_t() const { - 
TVM_CHECK_TYPE_CODE(type_code_, kInt); + TVM_CHECK_TYPE_CODE(type_code_, kDLInt); return value_.v_int64; } operator int() const { - TVM_CHECK_TYPE_CODE(type_code_, kInt); + TVM_CHECK_TYPE_CODE(type_code_, kDLInt); CHECK_LE(value_.v_int64, std::numeric_limits::max()); return static_cast(value_.v_int64); } operator bool() const { - TVM_CHECK_TYPE_CODE(type_code_, kInt); + TVM_CHECK_TYPE_CODE(type_code_, kDLInt); return value_.v_int64 != 0; } operator void*() const { @@ -430,7 +430,7 @@ class TVMRetValue : public TVMPODValue_ { return *this; } TVMRetValue& operator=(double value) { - this->SwitchToPOD(kFloat); + this->SwitchToPOD(kDLFloat); value_.v_float64 = value; return *this; } @@ -445,12 +445,12 @@ class TVMRetValue : public TVMPODValue_ { return *this; } TVMRetValue& operator=(int64_t value) { - this->SwitchToPOD(kInt); + this->SwitchToPOD(kDLInt); value_.v_int64 = value; return *this; } TVMRetValue& operator=(int value) { - this->SwitchToPOD(kInt); + this->SwitchToPOD(kDLInt); value_.v_int64 = value; return *this; } @@ -460,7 +460,7 @@ class TVMRetValue : public TVMPODValue_ { return *this; } TVMRetValue& operator=(bool value) { - this->SwitchToPOD(kInt); + this->SwitchToPOD(kDLInt); value_.v_int64 = value; return *this; } @@ -609,9 +609,9 @@ class TVMRetValue : public TVMPODValue_ { // implementation details inline const char* TypeCode2Str(int type_code) { switch (type_code) { - case kInt: return "int"; - case kUInt: return "uint"; - case kFloat: return "float"; + case kDLInt: return "int"; + case kDLUInt: return "uint"; + case kDLFloat: return "float"; case kStr: return "str"; case kBytes: return "bytes"; case kHandle: return "handle"; @@ -648,11 +648,11 @@ inline TVMType String2TVMType(std::string s) { t.bits = 32; t.lanes = 1; const char* scan; if (s.substr(0, 3) == "int") { - t.code = kInt; scan = s.c_str() + 3; + t.code = kDLInt; scan = s.c_str() + 3; } else if (s.substr(0, 4) == "uint") { - t.code = kUInt; scan = s.c_str() + 4; + t.code = kDLUInt; scan = s.c_str() + 4; } else if (s.substr(0, 5) == "float") { - t.code = kFloat; scan = s.c_str() + 5; + t.code = kDLFloat; scan = s.c_str() + 5; } else if (s.substr(0, 6) == "handle") { t.code = kHandle; t.bits = 64; // handle uses 64 bit by default. 
@@ -724,17 +724,17 @@ class TVMArgsSetter { std::is_integral::value>::type> void operator()(size_t i, T value) const { values_[i].v_int64 = static_cast(value); - type_codes_[i] = kInt; + type_codes_[i] = kDLInt; } void operator()(size_t i, uint64_t value) const { values_[i].v_int64 = static_cast(value); CHECK_LE(value, static_cast(std::numeric_limits::max())); - type_codes_[i] = kInt; + type_codes_[i] = kDLInt; } void operator()(size_t i, double value) const { values_[i].v_float64 = value; - type_codes_[i] = kFloat; + type_codes_[i] = kDLFloat; } void operator()(size_t i, std::nullptr_t value) const { values_[i].v_handle = value; diff --git a/jvm/native/src/main/native/jni_helper_func.h b/jvm/native/src/main/native/jni_helper_func.h index db4224012354..dc04f4191d1a 100644 --- a/jvm/native/src/main/native/jni_helper_func.h +++ b/jvm/native/src/main/native/jni_helper_func.h @@ -161,10 +161,10 @@ void fromJavaContext(JNIEnv *env, jobject jctx, TVMContext *ctx) { jobject tvmRetValueToJava(JNIEnv *env, TVMValue value, int tcode) { switch (tcode) { - case kUInt: - case kInt: + case kDLUInt: + case kDLInt: return newTVMValueLong(env, static_cast(value.v_int64)); - case kFloat: + case kDLFloat: return newTVMValueDouble(env, static_cast(value.v_float64)); case kModuleHandle: return newModule(env, reinterpret_cast(value.v_handle)); diff --git a/jvm/native/src/main/native/ml_dmlc_tvm_native_c_api.cc b/jvm/native/src/main/native/ml_dmlc_tvm_native_c_api.cc index ea567f265f5b..615e4716d669 100644 --- a/jvm/native/src/main/native/ml_dmlc_tvm_native_c_api.cc +++ b/jvm/native/src/main/native/ml_dmlc_tvm_native_c_api.cc @@ -62,7 +62,7 @@ JNIEXPORT void JNICALL Java_ml_dmlc_tvm_LibInfo_tvmFuncPushArgLong( value.v_int64 = static_cast(arg); TVMFuncArgsThreadLocalEntry *e = TVMFuncArgsThreadLocalStore::Get(); e->tvmFuncArgValues.push_back(value); - e->tvmFuncArgTypes.push_back(kInt); + e->tvmFuncArgTypes.push_back(kDLInt); } JNIEXPORT void JNICALL Java_ml_dmlc_tvm_LibInfo_tvmFuncPushArgDouble( @@ -71,7 +71,7 @@ JNIEXPORT void JNICALL Java_ml_dmlc_tvm_LibInfo_tvmFuncPushArgDouble( value.v_float64 = static_cast(arg); TVMFuncArgsThreadLocalEntry *e = TVMFuncArgsThreadLocalStore::Get(); e->tvmFuncArgValues.push_back(value); - e->tvmFuncArgTypes.push_back(kFloat); + e->tvmFuncArgTypes.push_back(kDLFloat); } JNIEXPORT void JNICALL Java_ml_dmlc_tvm_LibInfo_tvmFuncPushArgString( diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc index 50531d73010f..85b7d92c6a25 100644 --- a/src/api/api_lang.cc +++ b/src/api/api_lang.cc @@ -27,9 +27,9 @@ TVM_REGISTER_API("_max_value") TVM_REGISTER_API("_const") .set_body([](TVMArgs args, TVMRetValue* ret) { - if (args[0].type_code() == kInt) { + if (args[0].type_code() == kDLInt) { *ret = make_const(args[1], args[0].operator int64_t()); - } else if (args[0].type_code() == kFloat) { + } else if (args[0].type_code() == kDLFloat) { *ret = make_const(args[1], args[0].operator double()); } else { LOG(FATAL) << "only accept int or float"; diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index 9b02f2bc299f..fdd2ec2e38d2 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -133,7 +133,7 @@ class CodeGenAMDGPU : public CodeGenLLVM { inline int DetectROCMComputeVersion() { TVMContext tvm_ctx; - tvm_ctx.device_type = kROCM; + tvm_ctx.device_type = kDLROCM; tvm_ctx.device_id = 0; TVMRetValue val; tvm::runtime::DeviceAPI::Get(tvm_ctx)->GetAttr( diff --git a/src/codegen/llvm/codegen_llvm.cc 
b/src/codegen/llvm/codegen_llvm.cc index 2654dee0f7e5..a4bb815b1b93 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -242,7 +242,7 @@ llvm::Type* CodeGenLLVM::LLVMType(const Type& t) const { CHECK_EQ(t.lanes(), 1); return t_void_p_; } - llvm::Type* etype; + llvm::Type* etype = nullptr; if (t.is_int() || t.is_uint()) { etype = llvm::Type::getIntNTy(*ctx_, t.bits()); } else if (t.is_float()) { diff --git a/src/codegen/llvm/codegen_nvptx.cc b/src/codegen/llvm/codegen_nvptx.cc index d147709ff1a2..c0002873d5fc 100644 --- a/src/codegen/llvm/codegen_nvptx.cc +++ b/src/codegen/llvm/codegen_nvptx.cc @@ -132,7 +132,7 @@ class CodeGenNVPTX : public CodeGenLLVM { inline int DetectCUDAComputeVersion() { TVMContext tvm_ctx; - tvm_ctx.device_type = kGPU; + tvm_ctx.device_type = kDLGPU; tvm_ctx.device_id = 0; TVMRetValue val; tvm::runtime::DeviceAPI::Get(tvm_ctx)->GetAttr( diff --git a/src/codegen/stack_vm/stack_vm.h b/src/codegen/stack_vm/stack_vm.h index 7663a4586274..54972d39a5df 100644 --- a/src/codegen/stack_vm/stack_vm.h +++ b/src/codegen/stack_vm/stack_vm.h @@ -340,16 +340,16 @@ class StackVM { static OpCode GetLoad(TVMType t) { CHECK_EQ(t.lanes, 1U); if (t.code == kHandle) return ARRAY_LOAD_HANDLE; - if (t.code == kInt) { + if (t.code == kDLInt) { switch (t.bits) { case 32 : return ARRAY_LOAD_INT32; case 64 : return ARRAY_LOAD_INT64; } - } else if (t.code == kUInt) { + } else if (t.code == kDLUInt) { switch (t.bits) { case 32 : return ARRAY_LOAD_UINT32; } - } else if (t.code == kFloat) { + } else if (t.code == kDLFloat) { switch (t.bits) { case 64 : return ARRAY_LOAD_FP64; } @@ -365,16 +365,16 @@ class StackVM { static OpCode GetStore(TVMType t) { CHECK_EQ(t.lanes, 1U); if (t.code == kHandle) return ARRAY_STORE_HANDLE; - if (t.code == kInt) { + if (t.code == kDLInt) { switch (t.bits) { case 32 : return ARRAY_STORE_INT32; case 64 : return ARRAY_STORE_INT64; } - } else if (t.code == kUInt) { + } else if (t.code == kDLUInt) { switch (t.bits) { case 32 : return ARRAY_STORE_UINT32; } - } else if (t.code == kFloat) { + } else if (t.code == kDLFloat) { switch (t.bits) { case 64 : return ARRAY_STORE_FP64; } diff --git a/src/codegen/verilog/vpi_device_api.cc b/src/codegen/verilog/vpi_device_api.cc index 2977a4d45ce5..4e0e73eb427b 100644 --- a/src/codegen/verilog/vpi_device_api.cc +++ b/src/codegen/verilog/vpi_device_api.cc @@ -91,10 +91,10 @@ class VPIDeviceAPI final : public runtime::DeviceAPI { TVMContext ctx_from, TVMContext ctx_to, TVMStreamHandle stream) final { - if (static_cast(ctx_from.device_type) == kVPI) { + if (static_cast(ctx_from.device_type) == kDLVPI) { from = RealAddr(static_cast(from) + from_offset, size); } - if (static_cast(ctx_to.device_type) == kVPI) { + if (static_cast(ctx_to.device_type) == kDLVPI) { to = RealAddr(static_cast(to) + to_offset, size); } memcpy(to, from, size); diff --git a/src/contrib/cblas/cblas.cc b/src/contrib/cblas/cblas.cc index 01ef12f0d04d..9ce85ae4fd95 100644 --- a/src/contrib/cblas/cblas.cc +++ b/src/contrib/cblas/cblas.cc @@ -30,9 +30,9 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cblas.matmul") CHECK(C->strides == nullptr); CHECK(B->strides == nullptr); CHECK(A->strides == nullptr); - CHECK(TypeMatch(A->dtype, kFloat, 32)); - CHECK(TypeMatch(B->dtype, kFloat, 32)); - CHECK(TypeMatch(C->dtype, kFloat, 32)); + CHECK(TypeMatch(A->dtype, kDLFloat, 32)); + CHECK(TypeMatch(B->dtype, kDLFloat, 32)); + CHECK(TypeMatch(C->dtype, kDLFloat, 32)); cblas_sgemm(CblasColMajor, transb ? CblasTrans : CblasNoTrans, transa ? 
CblasTrans : CblasNoTrans, diff --git a/src/contrib/cudnn/cudnn_utils.cc b/src/contrib/cudnn/cudnn_utils.cc index 5929e2ffbc0f..5ca558f7d8ba 100644 --- a/src/contrib/cudnn/cudnn_utils.cc +++ b/src/contrib/cudnn/cudnn_utils.cc @@ -13,17 +13,17 @@ namespace contrib { // CuDNN Data Type cudnnDataType_t CuDNNDataType::DLTypeToCuDNNType(const DLDataType &dtype) { switch (dtype.code) { - case kInt: + case kDLInt: if (dtype.bits == 8 && dtype.lanes == 1) return CUDNN_DATA_INT8; else if (dtype.bits == 32 && dtype.lanes == 1) return CUDNN_DATA_INT32; else if (dtype.bits == 8 && dtype.lanes == 4) return CUDNN_DATA_INT8x4; else LOG(FATAL) << "Unsupported type"; break; - case kUInt: + case kDLUInt: LOG(FATAL) << "Unsupported type"; break; - case kFloat: + case kDLFloat: if (dtype.bits == 32 && dtype.lanes == 1) return CUDNN_DATA_FLOAT; else if (dtype.bits == 64 && dtype.lanes == 1) return CUDNN_DATA_DOUBLE; else if (dtype.bits == 16 && dtype.lanes == 1) return CUDNN_DATA_HALF; diff --git a/src/contrib/nnpack/convolution.cc b/src/contrib/nnpack/convolution.cc index f7a8597f2de9..8480a100dfd7 100644 --- a/src/contrib/nnpack/convolution.cc +++ b/src/contrib/nnpack/convolution.cc @@ -44,10 +44,10 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference") CHECK(kernel->strides == nullptr); CHECK(bias->strides == nullptr); - CHECK(TypeMatch(input->dtype, kFloat, 32)); - CHECK(TypeMatch(kernel->dtype, kFloat, 32)); - CHECK(TypeMatch(bias->dtype, kFloat, 32)); - CHECK(TypeMatch(output->dtype, kFloat, 32)); + CHECK(TypeMatch(input->dtype, kDLFloat, 32)); + CHECK(TypeMatch(kernel->dtype, kDLFloat, 32)); + CHECK(TypeMatch(bias->dtype, kDLFloat, 32)); + CHECK(TypeMatch(output->dtype, kDLFloat, 32)); nnp_convolution_inference(nnp_convolution_algorithm_auto, nnp_convolution_transform_strategy_block_based, @@ -102,10 +102,10 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_output") CHECK(kernel->strides == nullptr); CHECK(bias->strides == nullptr); - CHECK(TypeMatch(input->dtype, kFloat, 32)); - CHECK(TypeMatch(kernel->dtype, kFloat, 32)); - CHECK(TypeMatch(bias->dtype, kFloat, 32)); - CHECK(TypeMatch(output->dtype, kFloat, 32)); + CHECK(TypeMatch(input->dtype, kDLFloat, 32)); + CHECK(TypeMatch(kernel->dtype, kDLFloat, 32)); + CHECK(TypeMatch(bias->dtype, kDLFloat, 32)); + CHECK(TypeMatch(output->dtype, kDLFloat, 32)); nnp_convolution_output(nnp_convolution_algorithm_auto, batch_size, diff --git a/src/contrib/nnpack/fully_connected.cc b/src/contrib/nnpack/fully_connected.cc index 2ae60d61bd51..6793ecaa36a7 100644 --- a/src/contrib/nnpack/fully_connected.cc +++ b/src/contrib/nnpack/fully_connected.cc @@ -29,9 +29,9 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.fully_connected_inference") CHECK(C->strides == nullptr); CHECK(B->strides == nullptr); CHECK(A->strides == nullptr); - CHECK(TypeMatch(A->dtype, kFloat, 32)); - CHECK(TypeMatch(B->dtype, kFloat, 32)); - CHECK(TypeMatch(C->dtype, kFloat, 32)); + CHECK(TypeMatch(A->dtype, kDLFloat, 32)); + CHECK(TypeMatch(B->dtype, kDLFloat, 32)); + CHECK(TypeMatch(C->dtype, kDLFloat, 32)); nnp_fully_connected_inference(B->shape[1], B->shape[0], @@ -58,9 +58,9 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.fully_connected_output") CHECK(C->strides == nullptr); CHECK(B->strides == nullptr); CHECK(A->strides == nullptr); - CHECK(TypeMatch(A->dtype, kFloat, 32)); - CHECK(TypeMatch(B->dtype, kFloat, 32)); - CHECK(TypeMatch(C->dtype, kFloat, 32)); + CHECK(TypeMatch(A->dtype, kDLFloat, 32)); + CHECK(TypeMatch(B->dtype, kDLFloat, 32)); + CHECK(TypeMatch(C->dtype, kDLFloat, 32)); 
nnp_fully_connected_output(A->shape[0], B->shape[1], diff --git a/src/pass/lower_tvm_builtin.cc b/src/pass/lower_tvm_builtin.cc index a12f96c2282e..105d58b95829 100644 --- a/src/pass/lower_tvm_builtin.cc +++ b/src/pass/lower_tvm_builtin.cc @@ -72,7 +72,7 @@ class BuiltinLower : public IRMutator { int64_t nbytes = GetVectorBytes(op->type); if (device_type_.defined()) { if (arith::GetConst(device_type_, &dev_type)) { - if (dev_type == kCPU) { + if (dev_type == kDLCPU) { int32_t constant_size = op->constant_allocation_size(); if (constant_size > 0 && constant_size * nbytes < runtime::kMaxStackAlloca) { return stmt; diff --git a/src/pass/make_api.cc b/src/pass/make_api.cc index 5f124e5690c6..6290f63e611d 100644 --- a/src/pass/make_api.cc +++ b/src/pass/make_api.cc @@ -107,12 +107,13 @@ LoweredFunc MakeAPI(Stmt body, } else if (t.is_int() || t.is_uint()) { std::ostringstream msg; msg << name << ": Expect arg[" << i << "] to be int"; - seq_check.emplace_back(AssertStmt::make(tcode == kInt, msg.str(), nop)); + seq_check.emplace_back(AssertStmt::make(tcode == kDLInt, msg.str(), nop)); } else { CHECK(t.is_float()); std::ostringstream msg; msg << name << ": Expect arg[" << i << "] to be float"; - seq_check.emplace_back(AssertStmt::make(tcode == kFloat, msg.str(), nop)); + seq_check.emplace_back( + AssertStmt::make(tcode == kDLFloat, msg.str(), nop)); } } else { args.push_back(v_arg); @@ -148,7 +149,7 @@ LoweredFunc MakeAPI(Stmt body, seq_check.push_back(AttrStmt::make( node, attr::device_context_type, device_type, nop)); Stmt set_device = IfThenElse::make( - device_type != kCPU, Evaluate::make(Call::make( + device_type != kDLCPU, Evaluate::make(Call::make( Int(32), intrinsic::tvm_call_packed, {StringImm::make(runtime::symbol::tvm_set_device), device_type, device_id}, Call::Intrinsic))); diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index ce4a65dc79e2..8fc9a16aa851 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -25,12 +25,12 @@ namespace runtime { */ inline std::string DeviceName(int type) { switch (type) { - case kCPU: return "cpu"; - case kGPU: return "gpu"; - case kOpenCL: return "opencl"; - case kMetal: return "metal"; - case kVPI: return "vpi"; - case kROCM: return "rocm"; + case kDLCPU: return "cpu"; + case kDLGPU: return "gpu"; + case kDLOpenCL: return "opencl"; + case kDLMetal: return "metal"; + case kDLVPI: return "vpi"; + case kDLROCM: return "rocm"; case kExtDev: return "ext_dev"; default: LOG(FATAL) << "unknown type =" << type; return "Unknown"; } @@ -126,7 +126,7 @@ inline void TVMArrayFree_(TVMArray* arr) { inline void VerifyType(int dtype_code, int dtype_bits, int dtype_lanes) { CHECK_GE(dtype_lanes, 1); - if (dtype_code == kFloat) { + if (dtype_code == kDLFloat) { CHECK_EQ(dtype_bits % 32, 0); } else { CHECK_EQ(dtype_bits % 8, 0); @@ -382,10 +382,10 @@ int TVMArrayCopyFromTo(TVMArrayHandle from, CHECK_EQ(from_size, to_size) << "TVMArrayCopyFromTo: The size must exactly match"; TVMContext ctx = from->ctx; - if (ctx.device_type == kCPU) { + if (ctx.device_type == kDLCPU) { ctx = to->ctx; } else { - CHECK(to->ctx.device_type == kCPU || + CHECK(to->ctx.device_type == kDLCPU || to->ctx.device_type == from->ctx.device_type) << "Can not copy across different ctx types directly"; } @@ -401,7 +401,7 @@ int TVMArrayCopyFromBytes(TVMArrayHandle handle, size_t nbytes) { API_BEGIN(); TVMContext cpu_ctx; - cpu_ctx.device_type = kCPU; + cpu_ctx.device_type = kDLCPU; cpu_ctx.device_id = 0; size_t arr_size = GetDataSize(handle); 
CHECK_EQ(arr_size, nbytes) @@ -418,7 +418,7 @@ int TVMArrayCopyToBytes(TVMArrayHandle handle, size_t nbytes) { API_BEGIN(); TVMContext cpu_ctx; - cpu_ctx.device_type = kCPU; + cpu_ctx.device_type = kDLCPU; cpu_ctx.device_id = 0; size_t arr_size = GetDataSize(handle); CHECK_EQ(arr_size, nbytes) diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc index 78947cd62cd5..1b2009e98e7f 100644 --- a/src/runtime/cpu_device_api.cc +++ b/src/runtime/cpu_device_api.cc @@ -68,7 +68,7 @@ class CPUDeviceAPI final : public DeviceAPI { struct CPUWorkspacePool : public WorkspacePool { CPUWorkspacePool() : - WorkspacePool(kCPU, CPUDeviceAPI::Global()) {} + WorkspacePool(kDLCPU, CPUDeviceAPI::Global()) {} }; void* CPUDeviceAPI::AllocWorkspace(TVMContext ctx, size_t size) { diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index 340b286d87ca..fd2c54ffd58d 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -79,7 +79,7 @@ class CUDADeviceAPI final : public DeviceAPI { cudaStream_t cu_stream = static_cast(stream); from = static_cast(from) + from_offset; to = static_cast(to) + to_offset; - if (ctx_from.device_type == kGPU && ctx_to.device_type == kGPU) { + if (ctx_from.device_type == kDLGPU && ctx_to.device_type == kDLGPU) { CUDA_CALL(cudaSetDevice(ctx_from.device_id)); if (ctx_from.device_id == ctx_to.device_id) { GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream); @@ -88,10 +88,10 @@ class CUDADeviceAPI final : public DeviceAPI { from, ctx_from.device_id, size, cu_stream); } - } else if (ctx_from.device_type == kGPU && ctx_to.device_type == kCPU) { + } else if (ctx_from.device_type == kDLGPU && ctx_to.device_type == kDLCPU) { CUDA_CALL(cudaSetDevice(ctx_from.device_id)); GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream); - } else if (ctx_from.device_type == kCPU && ctx_to.device_type == kGPU) { + } else if (ctx_from.device_type == kDLCPU && ctx_to.device_type == kDLGPU) { CUDA_CALL(cudaSetDevice(ctx_to.device_id)); GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream); } else { @@ -140,7 +140,7 @@ class CUDADeviceAPI final : public DeviceAPI { typedef dmlc::ThreadLocalStore CUDAThreadStore; CUDAThreadEntry::CUDAThreadEntry() - : pool(kGPU, CUDADeviceAPI::Global()) { + : pool(kDLGPU, CUDADeviceAPI::Global()) { } CUDAThreadEntry* CUDAThreadEntry::ThreadLocal() { diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 2cf6a1fb1330..d3f849d743dc 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -462,7 +462,7 @@ void GraphRuntime::SetupStorage() { int64_t shape[] = {static_cast(pool_entry_bytes[i] + 3) / 4}; DLTensor* tensor; TVM_CCALL(TVMArrayAlloc( - shape, 1, kFloat, 32, 1, ctx_.device_type, ctx_.device_id, &tensor)); + shape, 1, kDLFloat, 32, 1, ctx_.device_type, ctx_.device_id, &tensor)); storage_pool_.push_back(tensor); } // Assign the pooled entries. diff --git a/src/runtime/metal/metal_common.h b/src/runtime/metal/metal_common.h index 35f78db4e814..d7980e64c9a9 100644 --- a/src/runtime/metal/metal_common.h +++ b/src/runtime/metal/metal_common.h @@ -45,14 +45,14 @@ class MetalWorkspace final : public DeviceAPI { ~MetalWorkspace(); // Get command queue for given context. 
id GetCommandQueue(TVMContext ctx) { - CHECK_EQ(ctx.device_type, kMetal); + CHECK_EQ(ctx.device_type, kDLMetal); CHECK(ctx.device_id >= 0 && static_cast(ctx.device_id) < queues.size()) << "Invalid Metal device_id=" << ctx.device_id; return queues[ctx.device_id]; } // Get device for given context id GetDevice(TVMContext ctx) { - CHECK_EQ(ctx.device_type, kMetal); + CHECK_EQ(ctx.device_type, kDLMetal); CHECK(ctx.device_id >= 0 && static_cast(ctx.device_id) < devices.size()) << "Invalid Metal device_id=" << ctx.device_id; return devices[ctx.device_id]; @@ -91,9 +91,9 @@ class MetalThreadEntry { WorkspacePool pool; // constructor MetalThreadEntry() - : pool(static_cast(kMetal), MetalWorkspace::Global()) { + : pool(static_cast(kDLMetal), MetalWorkspace::Global()) { context.device_id = 0; - context.device_type = static_cast(kMetal); + context.device_type = static_cast(kDLMetal); } ~MetalThreadEntry(); // Get temp buffer with at least size under ctx. diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm index 4af274da98a3..f66d5b51e64a 100644 --- a/src/runtime/metal/metal_device_api.mm +++ b/src/runtime/metal/metal_device_api.mm @@ -150,13 +150,13 @@ int GetWarpSize(id dev) { this->Init(); CHECK(stream == nullptr); TVMContext ctx = ctx_from; - if (ctx_from.device_type == kCPU) ctx = ctx_to; + if (ctx_from.device_type == kDLCPU) ctx = ctx_to; id queue = GetCommandQueue(ctx); id cb = [queue commandBuffer]; int from_dev_type = static_cast(ctx_from.device_type); int to_dev_type = static_cast(ctx_to.device_type); - if (from_dev_type == kMetal && to_dev_type == kMetal) { + if (from_dev_type == kDLMetal && to_dev_type == kDLMetal) { CHECK_EQ(ctx_from.device_id, ctx_to.device_id) << "Metal disallow cross device copy."; id encoder = [cb blitCommandEncoder]; @@ -167,7 +167,7 @@ int GetWarpSize(id dev) { size:size]; [encoder endEncoding]; [cb commit]; - } else if (from_dev_type == kMetal && to_dev_type == kCPU) { + } else if (from_dev_type == kDLMetal && to_dev_type == kDLCPU) { // copy to a local buffer before get into global buffer. 
id from_buf = (__bridge id)(from); if (from_buf.storageMode != MTLStorageModeShared) { @@ -190,7 +190,7 @@ int GetWarpSize(id dev) { static_cast([from_buf contents]) + from_offset, size); } - } else if (from_dev_type == kCPU && to_dev_type == kMetal) { + } else if (from_dev_type == kDLCPU && to_dev_type == kDLMetal) { id to_buf = (__bridge id)(to); if (to_buf.storageMode != MTLStorageModeShared) { id temp = MetalThreadEntry::ThreadLocal() diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index ccbe155e6204..e990aeba6a3e 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -133,7 +133,7 @@ class OpenCLWorkspace final : public DeviceAPI { void Init(); // get the queue of the context cl_command_queue GetQueue(TVMContext ctx) { - CHECK_EQ(ctx.device_type, kOpenCL); + CHECK_EQ(ctx.device_type, kDLOpenCL); this->Init(); CHECK(ctx.device_id >= 0 && static_cast(ctx.device_id) < queues.size()) << "Invalid OpenCL device_id=" << ctx.device_id; @@ -178,9 +178,9 @@ class OpenCLThreadEntry { WorkspacePool pool; // constructor OpenCLThreadEntry() - : pool(kOpenCL, OpenCLWorkspace::Global()) { + : pool(kDLOpenCL, OpenCLWorkspace::Global()) { context.device_id = 0; - context.device_type = kOpenCL; + context.device_type = kDLOpenCL; } // get the global workspace static OpenCLThreadEntry* ThreadLocal(); diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index f70207ebe881..e95fddaa867c 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -76,13 +76,13 @@ void OpenCLWorkspace::CopyDataFromTo(const void* from, TVMStreamHandle stream) { this->Init(); CHECK(stream == nullptr); - if (ctx_from.device_type == kOpenCL && ctx_to.device_type == kOpenCL) { + if (ctx_from.device_type == kDLOpenCL && ctx_to.device_type == kDLOpenCL) { OPENCL_CALL(clEnqueueCopyBuffer( this->GetQueue(ctx_to), static_cast((void*)from), // NOLINT(*) static_cast(to), from_offset, to_offset, size, 0, nullptr, nullptr)); - } else if (ctx_from.device_type == kOpenCL && ctx_to.device_type == kCPU) { + } else if (ctx_from.device_type == kDLOpenCL && ctx_to.device_type == kDLCPU) { OPENCL_CALL(clEnqueueReadBuffer( this->GetQueue(ctx_from), static_cast((void*)from), // NOLINT(*) @@ -90,7 +90,7 @@ void OpenCLWorkspace::CopyDataFromTo(const void* from, static_cast(to) + to_offset, 0, nullptr, nullptr)); OPENCL_CALL(clFinish(this->GetQueue(ctx_from))); - } else if (ctx_from.device_type == kCPU && ctx_to.device_type == kOpenCL) { + } else if (ctx_from.device_type == kDLCPU && ctx_to.device_type == kDLOpenCL) { OPENCL_CALL(clEnqueueWriteBuffer( this->GetQueue(ctx_to), static_cast(to), diff --git a/src/runtime/pack_args.h b/src/runtime/pack_args.h index 3cb214161f22..0a00e79f07df 100644 --- a/src/runtime/pack_args.h +++ b/src/runtime/pack_args.h @@ -104,12 +104,12 @@ enum ArgConvertCode { inline ArgConvertCode GetArgConvertCode(TVMType t) { CHECK_EQ(t.lanes, 1U) << "Cannot pass vector type argument to devic function for now"; - if (t.code == kInt) { + if (t.code == kDLInt) { if (t.bits == 64U) return INT64_TO_INT64; if (t.bits == 32U) return INT64_TO_INT32; - } else if (t.code == kUInt) { + } else if (t.code == kDLUInt) { if (t.bits == 32U) return INT64_TO_UINT32; - } else if (t.code == kFloat) { + } else if (t.code == kDLFloat) { if (t.bits == 64U) return FLOAT64_TO_FLOAT64; if (t.bits == 32U) return FLOAT64_TO_FLOAT32; } else if (t.code == kHandle) { diff --git 
a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc index d7b4eabf01d4..1e6154163b35 100644 --- a/src/runtime/rocm/rocm_device_api.cc +++ b/src/runtime/rocm/rocm_device_api.cc @@ -77,7 +77,7 @@ class ROCMDeviceAPI final : public DeviceAPI { hipStream_t hip_stream = static_cast(stream); from = static_cast(from) + from_offset; to = static_cast(to) + to_offset; - if (ctx_from.device_type == kROCM && ctx_to.device_type == kROCM) { + if (ctx_from.device_type == kDLROCM && ctx_to.device_type == kDLROCM) { ROCM_CALL(hipSetDevice(ctx_from.device_id)); if (ctx_from.device_id == ctx_to.device_id) { GPUCopy(from, to, size, hipMemcpyDeviceToDevice, hip_stream); @@ -86,10 +86,10 @@ class ROCMDeviceAPI final : public DeviceAPI { from, ctx_from.device_id, size, hip_stream); } - } else if (ctx_from.device_type == kROCM && ctx_to.device_type == kCPU) { + } else if (ctx_from.device_type == kDLROCM && ctx_to.device_type == kDLCPU) { ROCM_CALL(hipSetDevice(ctx_from.device_id)); GPUCopy(from, to, size, hipMemcpyDeviceToHost, hip_stream); - } else if (ctx_from.device_type == kCPU && ctx_to.device_type == kROCM) { + } else if (ctx_from.device_type == kDLCPU && ctx_to.device_type == kDLROCM) { ROCM_CALL(hipSetDevice(ctx_to.device_id)); GPUCopy(from, to, size, hipMemcpyHostToDevice, hip_stream); } else { @@ -138,7 +138,7 @@ class ROCMDeviceAPI final : public DeviceAPI { typedef dmlc::ThreadLocalStore ROCMThreadStore; ROCMThreadEntry::ROCMThreadEntry() - : pool(kROCM, ROCMDeviceAPI::Global()) { + : pool(kDLROCM, ROCMDeviceAPI::Global()) { } ROCMThreadEntry* ROCMThreadEntry::ThreadLocal() { diff --git a/src/runtime/rpc/rpc_device_api.cc b/src/runtime/rpc/rpc_device_api.cc index e8cc5b94ad52..7674fa3e2334 100644 --- a/src/runtime/rpc/rpc_device_api.cc +++ b/src/runtime/rpc/rpc_device_api.cc @@ -55,12 +55,12 @@ class RPCDeviceAPI final : public DeviceAPI { static_cast(to)->data, to_offset, size, ctx_from, ctx_to, stream); } else if (from_dev_type > kRPCSessMask && - to_dev_type == kCPU) { + to_dev_type == kDLCPU) { GetSess(ctx_from)->CopyFromRemote( static_cast(from)->data, from_offset, to, to_offset, size, ctx_from); - } else if (from_dev_type == kCPU && + } else if (from_dev_type == kDLCPU && to_dev_type > kRPCSessMask) { GetSess(ctx_to)->CopyToRemote( (void*)from, from_offset, // NOLINT(*) diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc index 323faf4a9b1c..0fa021918ed2 100644 --- a/src/runtime/rpc/rpc_session.cc +++ b/src/runtime/rpc/rpc_session.cc @@ -162,9 +162,9 @@ class RPCSession::EventHandler { int tcode = type_codes[i]; TVMValue value = arg_values[i]; switch (tcode) { - case kInt: - case kUInt: - case kFloat: + case kDLInt: + case kDLUInt: + case kDLFloat: case kTVMType: { writer_->Write(&value, sizeof(TVMValue)); break; @@ -315,9 +315,9 @@ class RPCSession::EventHandler { int tcode = arg_buf_->tcode[arg_index_]; static_assert(sizeof(TVMValue) == sizeof(uint64_t), "invariant"); switch (tcode) { - case kInt: - case kUInt: - case kFloat: + case kDLInt: + case kDLUInt: + case kDLFloat: case kTVMType: case kHandle: case kStr: @@ -352,9 +352,9 @@ class RPCSession::EventHandler { TVMValue& value = arg_buf_->value[arg_index_]; if (arg_recv_stage_ == 0) { switch (tcode) { - case kInt: - case kUInt: - case kFloat: + case kDLInt: + case kDLUInt: + case kDLFloat: case kTVMType: case kTVMContext: { this->Read(&value, sizeof(TVMValue)); @@ -484,7 +484,7 @@ class RPCSession::EventHandler { this->Read(&offset, sizeof(offset)); this->Read(&size, sizeof(size)); 
this->Read(&ctx, sizeof(ctx)); - if (ctx.device_type == kCPU) { + if (ctx.device_type == kDLCPU) { RPCCode code = RPCCode::kCopyAck; writer_->Write(&code, sizeof(code)); writer_->Write(reinterpret_cast(handle) + offset, size); @@ -492,7 +492,7 @@ class RPCSession::EventHandler { temp_data_.resize(size + 1); try { TVMContext cpu_ctx; - cpu_ctx.device_type = kCPU; + cpu_ctx.device_type = kDLCPU; cpu_ctx.device_id = 0; DeviceAPI::Get(ctx)->CopyDataFromTo( reinterpret_cast(handle), offset, @@ -531,7 +531,7 @@ class RPCSession::EventHandler { int ret_tcode = kNull; RPCCode code = RPCCode::kReturn; std::string errmsg; - if (copy_ctx_.device_type == kCPU) { + if (copy_ctx_.device_type == kDLCPU) { this->Read( reinterpret_cast(copy_handle_) + copy_offset_, copy_size_); } else { @@ -539,7 +539,7 @@ class RPCSession::EventHandler { this->Read(&temp_data_[0], copy_size_); try { TVMContext cpu_ctx; - cpu_ctx.device_type = kCPU; + cpu_ctx.device_type = kDLCPU; cpu_ctx.device_id = 0; DeviceAPI::Get(copy_ctx_)->CopyDataFromTo( temp_data_.data(), 0, @@ -915,10 +915,10 @@ void RPCCopyAmongRemote(TVMArgs args, TVMRetValue *rv) { TVMContext ctx_to = args[6]; TVMStreamHandle stream = args[7]; TVMContext ctx = ctx_from; - if (ctx.device_type == kCPU) { + if (ctx.device_type == kDLCPU) { ctx = ctx_to; } else { - CHECK(ctx_to.device_type == kCPU || + CHECK(ctx_to.device_type == kDLCPU || ctx_to.device_type == ctx_from.device_type) << "Can not copy across different ctx types directly"; } diff --git a/tests/cpp/packed_func_test.cc b/tests/cpp/packed_func_test.cc index 015d95c90adb..00e428f258a9 100644 --- a/tests/cpp/packed_func_test.cc +++ b/tests/cpp/packed_func_test.cc @@ -14,7 +14,7 @@ TEST(PackedFunc, Basic) { Var v = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { CHECK(args.num_args == 3); CHECK(args.values[0].v_float64 == 1.0); - CHECK(args.type_codes[0] == kFloat); + CHECK(args.type_codes[0] == kDLFloat); CHECK(args.values[1].v_handle == &a); CHECK(args.type_codes[1] == kArrayHandle); CHECK(args.values[2].v_handle == &x); From 0fa4d97588dcd1d23b732cbcd08eec98943f2e4a Mon Sep 17 00:00:00 2001 From: Yuwei Hu Date: Mon, 6 Nov 2017 12:08:22 +0800 Subject: [PATCH 002/948] [TOPI] fix weight layout in conv2d_transpose (#616) --- topi/python/topi/nn/conv2d_transpose.py | 6 +++--- .../topi/testing/conv2d_transpose_nchw_python.py | 16 ++++++++++++---- .../python/test_topi_conv2d_transpose_nchw.py | 2 +- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/topi/python/topi/nn/conv2d_transpose.py b/topi/python/topi/nn/conv2d_transpose.py index 33f66d95c798..c783360b8808 100644 --- a/topi/python/topi/nn/conv2d_transpose.py +++ b/topi/python/topi/nn/conv2d_transpose.py @@ -18,7 +18,7 @@ def conv2d_transpose_nchw(Input, Filter, strides, padding): 4-D with shape [batch, in_channel, in_height, in_width] Filter : tvm.Tensor - 4-D with shape [num_filter, in_channel, filter_height, filter_width] + 4-D with shape [in_channel, num_filter, filter_height, filter_width] strides : tuple of two ints The spatial stride along height and width @@ -32,7 +32,7 @@ def conv2d_transpose_nchw(Input, Filter, strides, padding): 4-D with shape [batch, out_channel, out_height, out_width] """ batch, in_c, in_h, in_w = Input.shape - out_c, _, filter_h, filter_w = Filter.shape + _, out_c, filter_h, filter_w = Filter.shape stride_h, stride_w = strides # dilate stage DilatedInput = dilate(Input, [1, 1, stride_h, stride_w], name='DilatedInput') @@ -57,7 +57,7 @@ def conv2d_transpose_nchw(Input, Filter, strides, padding): Output = 
tvm.compute( (batch, out_c, out_h, out_w), lambda b, c, h, w: tvm.sum( - PaddedInput[b, dc, h+dh, w+dw] * Filter[c, dc, filter_h-1-dh, filter_w-1-dw], + PaddedInput[b, dc, h+dh, w+dw] * Filter[dc, c, filter_h-1-dh, filter_w-1-dw], axis=[dc, dh, dw]), tag="conv2d_transpose_nchw") return Output diff --git a/topi/python/topi/testing/conv2d_transpose_nchw_python.py b/topi/python/topi/testing/conv2d_transpose_nchw_python.py index 43af160e8038..2b78452b95cd 100644 --- a/topi/python/topi/testing/conv2d_transpose_nchw_python.py +++ b/topi/python/topi/testing/conv2d_transpose_nchw_python.py @@ -1,6 +1,7 @@ # pylint: disable=unused-variable """Transposed convolution in python""" import numpy as np +import scipy import topi from topi.nn.util import get_pad_tuple @@ -14,7 +15,7 @@ def conv2d_transpose_nchw_python(a_np, w_np, stride, padding): 4-D with shape [batch, in_channel, in_height, in_width] w_np : numpy.ndarray - 4-D with shape [num_filter, in_channel, filter_height, filter_width] + 4-D with shape [in_channel, num_filter, filter_height, filter_width] stride : int or a list/tuple of two ints Stride size, or [stride_height, stride_width] @@ -28,7 +29,7 @@ def conv2d_transpose_nchw_python(a_np, w_np, stride, padding): 4-D with shape [batch, out_channel, out_height, out_width] """ batch, in_c, in_h, in_w = a_np.shape - out_c, _, filter_h, filter_w = w_np.shape + _, out_c, filter_h, filter_w = w_np.shape if isinstance(stride, int): stride_h = stride_w = stride else: @@ -46,6 +47,13 @@ def conv2d_transpose_nchw_python(a_np, w_np, stride, padding): padded_a_np[:, :, bpad_top:dilated_a_np.shape[2]+bpad_top, \ bpad_left:dilated_a_np.shape[3]+bpad_left] = dilated_a_np # convolution stage - rotated_w_np = np.rot90(w_np, k=2, axes=(2, 3)) - b_np = topi.testing.conv2d_nchw_python(padded_a_np, rotated_w_np, stride=1, padding='VALID') + out_h = (in_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h + out_w = (in_w - 1) * stride_w - fpad_left - fpad_right + filter_w + b_np = np.zeros((batch, out_c, out_h, out_w)) + for n in range(batch): + for f in range(out_c): + for c in range(in_c): + out = scipy.signal.convolve2d( + padded_a_np[n, c], w_np[c, f], mode='valid') + b_np[n, f] += out return b_np diff --git a/topi/tests/python/test_topi_conv2d_transpose_nchw.py b/topi/tests/python/test_topi_conv2d_transpose_nchw.py index 02e085387253..8aa86e6f38be 100644 --- a/topi/tests/python/test_topi_conv2d_transpose_nchw.py +++ b/topi/tests/python/test_topi_conv2d_transpose_nchw.py @@ -10,7 +10,7 @@ def verify_conv2d_transpose_nchw(batch, in_channel, in_size, num_filter, kernel, in_height = in_width = in_size A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') - W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') + W = tvm.placeholder((in_channel, num_filter, kernel, kernel), name='W') B = topi.nn.conv2d_transpose_nchw(A, W, [stride, stride], padding) C = topi.nn.relu(B) From 033fd2666c14e035ade90cfd5543823d74007ae6 Mon Sep 17 00:00:00 2001 From: masahi Date: Tue, 7 Nov 2017 06:40:25 +0900 Subject: [PATCH 003/948] add tanh dispatch (#619) --- src/codegen/llvm/intrin_rule_rocm.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/codegen/llvm/intrin_rule_rocm.cc b/src/codegen/llvm/intrin_rule_rocm.cc index fa27701870f9..38211db0b9b1 100644 --- a/src/codegen/llvm/intrin_rule_rocm.cc +++ b/src/codegen/llvm/intrin_rule_rocm.cc @@ -43,6 +43,8 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.rocm.sqrt") TVM_REGISTER_GLOBAL("tvm.intrin.rule.rocm.pow") .set_body(DispatchExternOCML); 
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.rocm.tanh") +.set_body(DispatchExternOCML); } // namespace llvm } // namespace codegen } // namespace tvm From 7c508eb4a79ee696c5dc99cbdd4dccea58e2138d Mon Sep 17 00:00:00 2001 From: eqy Date: Mon, 6 Nov 2017 16:02:09 -0800 Subject: [PATCH 004/948] remove minimum 32-bit restriction (#621) Change minimum 32-bit restriction for floating point types to 8-bit. This change is to enable reduced precision types that may use vector operations underneath the hood (cases #lanes > 1 such as half4). --- src/runtime/c_runtime_api.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index 8fc9a16aa851..f036dccc381a 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -127,7 +127,7 @@ inline void TVMArrayFree_(TVMArray* arr) { inline void VerifyType(int dtype_code, int dtype_bits, int dtype_lanes) { CHECK_GE(dtype_lanes, 1); if (dtype_code == kDLFloat) { - CHECK_EQ(dtype_bits % 32, 0); + CHECK_EQ(dtype_bits % 8, 0); } else { CHECK_EQ(dtype_bits % 8, 0); } From a1363b9e3c28cb2ebb19f84e7328b5744e28cee3 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Wed, 8 Nov 2017 12:45:58 -0800 Subject: [PATCH 005/948] conv2d_56_64_128 mark==1 bug fixed (#624) --- topi/python/topi/cuda/conv2d_nchw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py index 81271938edea..fea0f542f448 100644 --- a/topi/python/topi/cuda/conv2d_nchw.py +++ b/topi/python/topi/cuda/conv2d_nchw.py @@ -80,7 +80,7 @@ def conv2d_56_64_128(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag): if mark % i == 0 and vthread_x > 0: num_thread_x = i break - if num_thread_x * vthread_x > 128: + if mark < 5 or num_thread_x * vthread_x > 128: num_thread_x = 8 vthread_x = 8 num_thread_y = 8 From e1854fea30bd61222a4eaf2c2217b72d96dc879a Mon Sep 17 00:00:00 2001 From: eqy Date: Wed, 8 Nov 2017 15:47:07 -0800 Subject: [PATCH 006/948] Support vector operations for AMD (llvm IR) (#623) * Support vector operations for AMD (llvm IR) * fix whitespace * update comments, docstring --- include/tvm/buffer.h | 4 +++- python/tvm/schedule.py | 9 +++++++-- src/api/api_lang.cc | 2 +- src/codegen/llvm/codegen_llvm.cc | 31 +++++++++++++++++++++++++++---- src/codegen/llvm/codegen_llvm.h | 1 + src/lang/buffer.cc | 15 ++++++++++++--- src/pass/ir_util.h | 1 + 7 files changed, 52 insertions(+), 11 deletions(-) diff --git a/include/tvm/buffer.h b/include/tvm/buffer.h index 610532e261a3..ad4872b8e4e0 100644 --- a/include/tvm/buffer.h +++ b/include/tvm/buffer.h @@ -51,8 +51,10 @@ class Buffer : public NodeRef { * \brief Get access ptr to the entire buffer. * \param access_mask The access mask * \param ptr_type The type of the pointer. + * \param content_lanes The number of lanes for the (data) type. */ - TVM_DLL Expr access_ptr(int access_mask, Type ptr_type = Handle()) const; + TVM_DLL Expr access_ptr(int access_mask, Type ptr_type = Handle(), + int content_lanes = 1) const; /*! * \brief Create an Expr that does a vector load at begin index. 
* \param begin The beginning index diff --git a/python/tvm/schedule.py b/python/tvm/schedule.py index 26be2de1a69a..6abe4aae2f6f 100644 --- a/python/tvm/schedule.py +++ b/python/tvm/schedule.py @@ -25,7 +25,7 @@ class Buffer(NodeBase): READ = 1 WRITE = 2 - def access_ptr(self, access_mask, ptr_type="handle"): + def access_ptr(self, access_mask, ptr_type="handle", content_lanes=1): """Get an access pointer to the head of buffer. This is the recommended method to get buffer data @@ -41,6 +41,10 @@ def access_ptr(self, access_mask, ptr_type="handle"): The data type of the result pointer. Do not specify unless we want to cast pointer to specific type. + content_lanes: int, optional + The number of lanes for the data type. This value + is greater than one for vector types. + Examples -------- .. code-block:: python @@ -63,7 +67,8 @@ def access_ptr(self, access_mask, ptr_type="handle"): else: raise ValueError("Unknown access_mask %s" % access_mask) access_mask = mask - return _api_internal._BufferAccessPtr(self, access_mask, ptr_type) + return _api_internal._BufferAccessPtr(self, access_mask, ptr_type, + content_lanes) def vload(self, begin, dtype=None): """Generate an Expr that loads dtype from begin index. diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc index 85b7d92c6a25..94075b6ec059 100644 --- a/src/api/api_lang.cc +++ b/src/api/api_lang.cc @@ -159,7 +159,7 @@ TVM_REGISTER_API("_Buffer") TVM_REGISTER_API("_BufferAccessPtr") .set_body([](TVMArgs args, TVMRetValue* ret) { *ret = args[0].operator Buffer() - .access_ptr(args[1], args[2]); + .access_ptr(args[1], args[2], args[3]); }); TVM_REGISTER_API("_BufferVLoad") diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index a4bb815b1b93..d274af73ed82 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -509,6 +509,18 @@ llvm::Value* CodeGenLLVM::CreateBufferPtr( return builder_->CreateInBoundsGEP(buffer, index); } +llvm::Value* CodeGenLLVM::CreateBufferVecPtr( + Type t, llvm::Value* buffer, llvm::Value* index) { + CHECK_GT(t.lanes(), 1); + llvm::PointerType* btype = llvm::dyn_cast(buffer->getType()); + CHECK(btype != nullptr); + llvm::PointerType* ptype = LLVMType(t)->getPointerTo(btype->getAddressSpace()); + if (btype != ptype) { + buffer = builder_->CreatePointerCast(buffer, ptype); + } + return builder_->CreateInBoundsGEP(buffer, index); +} + llvm::Value* CodeGenLLVM::GetVarValue(const Variable* v) const { auto it = var_map_.find(v); CHECK(it != var_map_.end()) << "cannot find variable " << v->name_hint; @@ -572,10 +584,21 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { } else if (op->is_intrinsic(intrinsic::tvm_address_of)) { const Load *l = op->args[0].as(); CHECK(op->args.size() == 1 && l); - llvm::Value* ptr = CreateBufferPtr( - l->type, MakeValue(l->buffer_var), MakeValue(l->index)); - unsigned addrspace = llvm::dyn_cast( - ptr->getType())->getAddressSpace(); + const Ramp *r = l->index.as(); + llvm::Value* ptr; + unsigned addrspace; + if (!r) { + ptr = CreateBufferPtr( + l->type, MakeValue(l->buffer_var), MakeValue(l->index)); + addrspace = llvm::dyn_cast( + ptr->getType())->getAddressSpace(); + } else { + Expr index = r->base / make_const(Int(32), r->lanes); + ptr = CreateBufferVecPtr( + l->type, MakeValue(l->buffer_var), MakeValue(index)); + addrspace = llvm::dyn_cast( + ptr->getType())->getAddressSpace(); + } return builder_->CreatePointerCast(ptr, t_void_->getPointerTo(addrspace)); } else if (op->is_intrinsic(Call::reinterpret) && 
is_zero(op->args[0])) { return llvm::Constant::getNullValue(t_void_p_); diff --git a/src/codegen/llvm/codegen_llvm.h b/src/codegen/llvm/codegen_llvm.h index e4a0b24d381a..fbc74f092825 100644 --- a/src/codegen/llvm/codegen_llvm.h +++ b/src/codegen/llvm/codegen_llvm.h @@ -191,6 +191,7 @@ class CodeGenLLVM : llvm::Value* CreateMul(Type t, llvm::Value* a, llvm::Value* b); llvm::Value* CreateBroadcast(llvm::Value* value, int lanes); llvm::Value* CreateBufferPtr(Type t, llvm::Value* buffer, llvm::Value* index); + llvm::Value* CreateBufferVecPtr(Type t, llvm::Value* buffer, llvm::Value* index); // Vector concatenation. llvm::Value* CreateVecSlice(llvm::Value* vec, int begin, int extent); llvm::Value* CreateVecFlip(llvm::Value* vec); diff --git a/src/lang/buffer.cc b/src/lang/buffer.cc index 34abada14118..5cf7ddef3018 100644 --- a/src/lang/buffer.cc +++ b/src/lang/buffer.cc @@ -341,14 +341,23 @@ Buffer Buffer::MakeSlice(Array begins, Array extents) const { 0); } -Expr Buffer::access_ptr(int access_mask, Type ptr_type) const { +Expr Buffer::access_ptr(int access_mask, Type ptr_type, int content_lanes) const { const BufferNode* self = operator->(); - Expr e_dtype = make_zero(self->dtype); + Expr e_dtype; Expr extent = (self->strides.size() == self->shape.size() ? arith::ComputeExpr(self->strides[0], self->shape[0]): arith::ComputeReduce(self->shape)); + Expr elem_offset = self->elem_offset; + if (content_lanes > 1) { + e_dtype = make_zero(self->dtype.with_lanes(content_lanes)); + extent = extent / make_const(self->elem_offset.type(), content_lanes); + elem_offset = self->elem_offset / make_const(self->elem_offset.type(), + content_lanes); + } else { + e_dtype = make_zero(self->dtype); + } Array acc_args{ - e_dtype, self->data, self->elem_offset, + e_dtype, self->data, elem_offset, extent, make_const(Int(32), access_mask)}; return ir::Call::make( ptr_type, ir::intrinsic::tvm_access_ptr, acc_args, ir::Call::Intrinsic); diff --git a/src/pass/ir_util.h b/src/pass/ir_util.h index ae7a026c1ecb..082d580a0e45 100644 --- a/src/pass/ir_util.h +++ b/src/pass/ir_util.h @@ -102,6 +102,7 @@ inline Expr AddressOffset(Var handle, Type dtype, int offset) { inline Expr AddressOffset(Var handle, Type dtype, Expr offset) { if (dtype.lanes() != 1) { offset = offset * make_const(offset.type(), dtype.lanes()); + offset = Ramp::make(offset, make_const(offset.type(), 1), dtype.lanes()); } return Call::make( Handle(), intrinsic::tvm_address_of, From 918e2593004fa9076665dbf8ea090625926670d9 Mon Sep 17 00:00:00 2001 From: Erwan BERNARD Date: Thu, 9 Nov 2017 00:47:19 +0100 Subject: [PATCH 007/948] WIP: Add how_to readme to install tvm with nnpack support (#610) * feat(docs) add how_to for tvm install with nnpack support * feat(docs) change python package paragraph * feat(doc) remove unsure sentence * add comments on nnpack usage vs TVM * remove mxnet nnpack tips for nthread change --- docs/how_to/nnpack.md | 144 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 docs/how_to/nnpack.md diff --git a/docs/how_to/nnpack.md b/docs/how_to/nnpack.md new file mode 100644 index 000000000000..060bf9b5399f --- /dev/null +++ b/docs/how_to/nnpack.md @@ -0,0 +1,144 @@ +### NNPACK for Multi-Core CPU Support in TVM +[NNPACK](https://github.com/Maratyszcza/NNPACK) is an acceleration package +for neural network computations, which can run on x86-64, ARMv7, or ARM64 architecture CPUs. 
+Using NNPACK, higher-level libraries like _MXNet_ can speed up +the execution on multi-core CPU computers, including laptops and mobile devices. + +***Note***: AS TVM already has natively tuned schedules, NNPACK is here mainly for reference and comparison purpose. +For regular use prefer native tuned TVM implementation. + +_TVM_ supports NNPACK for forward propagation (inference only) in convolution, max-pooling, and fully-connected layers. +In this document, we give a high level overview of how to use NNPACK with _TVM_. + +### Conditions +The underlying implementation of NNPACK utilizes several acceleration methods, +including [fft](https://arxiv.org/abs/1312.5851) and [winograd](https://arxiv.org/abs/1509.09308). +These algorithms work better on some special `batch size`, `kernel size`, and `stride` settings than on other, +so depending on the context, not all convolution, max-pooling, or fully-connected layers can be powered by NNPACK. +When favorable conditions for running NNPACKS are not met, + +NNPACK only supports Linux and OS X systems. Windows is not supported at present. +The following table explains under which conditions NNPACK will work. + +| operation | conditions | +|:--------- |:---------- | +|convolution |2d convolution `and` no-bias=False `and` dilate=(1,1) `and` num_group=1 `and` batch-size = 1 or batch-size > 1 && stride = (1,1);| +|pooling | max-pooling `and` kernel=(2,2) `and` stride=(2,2) `and` pooling_convention=full | +|fully-connected| without any restrictions | + +### Build/Install LLVM +LLVM is required for CPU codegen that needs LLVM. +Since LLVM takes long time to build from source, you can download pre-built version of LLVM from [LLVM Download Page](http://releases.llvm.org/download.html). +For llvm 4.0 you can do the following step : + +```bash +# Add llvm repository in apt source list +echo "deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-4.0 main" >> /etc/apt/sources.list + +# Update apt source list +apt-get update +# Install clang and full llvm +apt-get install -y \ + clang-4.0 \ + clang-4.0-doc \ + libclang-common-4.0-dev \ + libclang-4.0-dev \ + libclang1-4.0 \ + libclang1-4.0-dbg \ + libllvm-4.0-ocaml-dev \ + libllvm4.0 \ + libllvm4.0-dbg \ + lldb-4.0 \ + llvm-4.0 \ + llvm-4.0-dev \ + llvm-4.0-doc \ + llvm-4.0-examples \ + llvm-4.0-runtime \ + clang-format-4.0 \ + python-clang-4.0 \ + libfuzzer-4.0-dev +``` + +### Build/Install NNPACK + +If the trained model meets some conditions of using NNPACK, +you can build TVM with NNPACK support. +Follow these simple steps: +* Build NNPACK shared library with the following commands. _TVM_ will link NNPACK dynamically. + +Note: The following NNPACK installation instructions have been tested on Ubuntu 16.04. + +#### Build [Ninja](https://ninja-build.org/) + +NNPACK need a recent version of Ninja. So we need to install ninja from source. +```bash +git clone git://github.com/ninja-build/ninja.git +cd ninja +./configure.py --bootstrap +``` + +Set the environment variable PATH to tell bash where to find the ninja executable. For example, assume we cloned ninja on the home directory ~. then we can added the following line in ~/.bashrc. 
+```bash +export PATH="${PATH}:~/ninja" +``` + +#### Build [NNPACK](https://github.com/Maratyszcza/NNPACK) + +The new CMAKE version of NNPACK download [Peach](https://github.com/Maratyszcza/PeachPy) and other dependencies alone + +```bash +git clone --recursive https://github.com/Maratyszcza/NNPACK.git +cd NNPACK +# Add PIC option in CFLAG and CXXFLAG to build NNPACK shared library +sed -i "s|gnu99|gnu99 -fPIC|g" CMakeLists.txt +sed -i "s|gnu++11|gnu++11 -fPIC|g" CMakeLists.txt +mkdir build +cd build +# Generate ninja build rule and add shared library in configuration +cmake -G Ninja -D BUILD_SHARED_LIBS=ON .. +ninja +sudo ninja install + +# Add NNPACK lib folder in your ldconfig +echo "/usr/local/lib" > /etc/ld.so.conf.d/nnpack.conf +sudo ldconfig +``` + +### Build TVM with NNPACK support + +```bash +git clone --recursive https://github.com/dmlc/tvm +``` + +* Set `USE_NNPACK = 1` in config.mk. +* Set `NNPACK_PATH` to the $(YOUR_NNPACK_INSTALL_PATH) +* Set `LLVM_CONFIG = llvm-config-4.0` depending of llvm version installed + +after configuration use `make` to build TVM + +```bash +make +make install +``` + +#### Python Package Installation + +The python package for [tvm](https://github.com/dmlc/tvm) depends of [topi](https://github.com/dmlc/tvm/tree/master/topi). +The tvm python package is located at `tvm/python` and topi python package is located in `tvm/topi/python` folder. +There are several ways to install the package, in all these cases the TVM library and TOPI must be present in the python env: + +1. Set the environment variable PYTHONPATH to tell python where to find the libraries. For example, assume we cloned tvm on the home directory ~. then we can added the following line in ~/.bashrc. It is recommended for developers who may change the codes. The changes will be immediately reflected once you pulled the code and rebuild the project (no need to call setup again) + +```bash +export PYTHONPATH=/path/to/tvm/python:/path/to/tvm/topi/python:${PYTHONPATH} +``` + +2. 
Install tvm and topi python bindings by setup.py: + +```bash +# install tvm package for the current user +cd topi/python +python setup.py install --user; +cd ../../python +python setup.py install --user; +``` From 093da271ed28302504d99bb8af91f833c40a7771 Mon Sep 17 00:00:00 2001 From: eqy Date: Wed, 8 Nov 2017 17:13:02 -0800 Subject: [PATCH 008/948] inline AMD GPU functions (#625) * Support vector operations for AMD (llvm IR) * fix whitespace * update comments, docstring * inline AMD GPU functions --- src/codegen/llvm/codegen_amdgpu.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index fdd2ec2e38d2..9b8995bf5516 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -176,6 +176,9 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { } mlib->setTargetTriple(tm->getTargetTriple().str()); mlib->setDataLayout(tm->createDataLayout()); + for (llvm::Function &f : mlib->functions()) { + f.addFnAttr(llvm::Attribute::AlwaysInline); + } cg->AddLinkModule(std::move(mlib)); } From b0b44d9422b7dbfb08def6b3302dd97f7a5a0ebd Mon Sep 17 00:00:00 2001 From: Yizhi Liu Date: Fri, 10 Nov 2017 01:56:29 +0800 Subject: [PATCH 009/948] android gemm for topi/recipe (#628) --- topi/recipe/gemm/android_gemm_square.py | 116 ++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 topi/recipe/gemm/android_gemm_square.py diff --git a/topi/recipe/gemm/android_gemm_square.py b/topi/recipe/gemm/android_gemm_square.py new file mode 100644 index 000000000000..f6f3b5ab4589 --- /dev/null +++ b/topi/recipe/gemm/android_gemm_square.py @@ -0,0 +1,116 @@ +"""Example code to do square matrix multiplication on Android Phone.""" +import tvm +import os +from tvm.contrib import rpc, util, ndk +import numpy as np + +# Set to be address of tvm proxy. +proxy_host = os.environ["TVM_ANDROID_RPC_PROXY_HOST"] +proxy_port = 9090 +key = "android" + +# Change target configuration. +# Run `adb shell cat /proc/cpuinfo` to find the arch. 
+arch = "arm64" +target = "llvm -target=%s-linux-android" % arch + +def ngflops(N): + return 2.0 * float(N * N * N) / (10**9) + +dtype = 'float32' +def evaluate(func, ctx, N, times): + a_np = np.random.uniform(size=(N, N)).astype(dtype) + b_np = np.random.uniform(size=(N, N)).astype(dtype) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros((N, N), dtype=dtype), ctx) + + time_f = func.time_evaluator(func.entry_name, ctx, number=times) + cost = time_f(a, b, c).mean + gf = ngflops(N) / cost + print('%g secs/op, %g GFLOPS' % (cost, gf)) + np.testing.assert_almost_equal(c.asnumpy(), a_np.dot(b_np), decimal=2) + +def test_gemm_gpu(N, times, bn, num_block, num_thread): + assert(bn <= N) + assert(num_thread * num_thread * 16 <= N) + assert(num_block * num_block * 2 <= N) + A = tvm.placeholder((N, N), name='A') + B = tvm.placeholder((N, N), name='Btmp') + k = tvm.reduce_axis((0, N), name='k') + + packedB = tvm.compute((N, N / bn, bn), + lambda x, y, z: B[x, y * bn + z], name = 'B') + + C = tvm.compute( + (N, N), + lambda ii, jj: tvm.sum(A[ii, k] * packedB[k, jj / bn, jj % bn], axis=k), + name='C') + + s = tvm.create_schedule(C.op) + CC = s.cache_write(C, "local") + + block_x = tvm.thread_axis("blockIdx.x") + block_y = tvm.thread_axis("blockIdx.y") + thread_x = tvm.thread_axis("threadIdx.x") + thread_y = tvm.thread_axis("threadIdx.y") + + thread_xz = tvm.thread_axis((0, 2), "vthread", name="vx") + thread_yz = tvm.thread_axis((0, 2), "vthread", name="vy") + + pby, pbi = s[packedB].split(packedB.op.axis[0], nparts=num_thread) + pbx, pbj = s[packedB].split(packedB.op.axis[1], nparts=num_thread) + s[packedB].bind(pby, thread_y) + s[packedB].bind(pbx, thread_x) + pbz, pbk = s[packedB].split(packedB.op.axis[2], factor=8) + s[packedB].vectorize(pbk) + + by, yi = s[C].split(C.op.axis[0], nparts=num_block) + bx, xi = s[C].split(C.op.axis[1], nparts=num_thread) + + s[C].bind(by, block_y) + s[C].bind(bx, thread_y) + s[C].reorder(by, bx, yi, xi) + + tyz, yi = s[C].split(yi, nparts=2) + ty, yi = s[C].split(yi, nparts=num_block) + txz, xi = s[C].split(xi, nparts=2) + tx, xi = s[C].split(xi, nparts=num_thread) + + s[C].reorder(tyz, txz, ty, tx, yi, xi) + s[C].bind(tyz, thread_yz) + s[C].bind(txz, thread_xz) + + s[C].bind(ty, block_x) + s[C].bind(tx, thread_x) + + xyi, xxi = s[C].split(xi, factor=8) + s[C].reorder(tyz, txz, ty, tx, yi, xyi, xxi) + s[C].vectorize(xxi) + + s[CC].compute_at(s[C], yi) + yo, xo = CC.op.axis + s[CC].reorder(k, yo, xo) + xo, xi = s[CC].split(xo, factor=8) + s[CC].vectorize(xi) + + ko, ki = s[CC].split(k, factor=2) + s[CC].unroll(ki) + + print(tvm.lower(s, [A, B, C], simple_mode=True)) + + f = tvm.build(s, [A, B, C], "opencl", target_host=target, name="gemm_gpu") + temp = util.tempdir() + path_dso = temp.relpath("gemm_gpu.so") + f.export_library(path_dso, ndk.create_shared) + + # connect to the proxy + remote = rpc.connect(proxy_host, proxy_port, key=key) + ctx = remote.cl(0) + remote.upload(path_dso) + f = remote.load_module("gemm_gpu.so") + + evaluate(f, ctx, N, times) + +if __name__ == "__main__": + test_gemm_gpu(1024, times=5, bn=8, num_block=2, num_thread=8) From d3cbe0dcd4f8e21ebe86baf0277bb25dc9ddf4d7 Mon Sep 17 00:00:00 2001 From: ziheng Date: Fri, 10 Nov 2017 19:02:46 -0800 Subject: [PATCH 010/948] [NNPACK] Add argument nthreads (#631) --- python/tvm/contrib/nnpack.py | 16 ++++++++-------- src/contrib/nnpack/convolution.cc | 2 ++ src/contrib/nnpack/fully_connected.cc | 4 ++++ src/contrib/nnpack/nnpack_utils.cc | 25 +++++++++++++++---------- 
src/contrib/nnpack/nnpack_utils.h | 2 ++ 5 files changed, 31 insertions(+), 18 deletions(-) diff --git a/python/tvm/contrib/nnpack.py b/python/tvm/contrib/nnpack.py index 66e7a9494d89..d6587df26229 100644 --- a/python/tvm/contrib/nnpack.py +++ b/python/tvm/contrib/nnpack.py @@ -16,7 +16,7 @@ def config(nthreads): """ _Config(nthreads) -def fully_connected_inference(lhs, rhs): +def fully_connected_inference(lhs, rhs, nthreads=1): """Create an extern op that compute fully connected of 1D tensor lhs and 2D tensor rhs with nnpack. @@ -37,9 +37,9 @@ def fully_connected_inference(lhs, rhs): (m, ), [lhs, rhs], lambda ins, outs: _intrin.call_packed( "tvm.contrib.nnpack.fully_connected_inference", - ins[0], ins[1], outs[0]), name="C") + ins[0], ins[1], outs[0], nthreads), name="C") -def fully_connected_output(lhs, rhs): +def fully_connected_output(lhs, rhs, nthreads=1): """Create an extern op that compute fully connected of 2D tensor lhs and 2D tensor rhs with nnpack. @@ -61,9 +61,9 @@ def fully_connected_output(lhs, rhs): (n, m), [lhs, rhs], lambda ins, outs: _intrin.call_packed( "tvm.contrib.nnpack.fully_connected_output", - ins[0], ins[1], outs[0]), name="C") + ins[0], ins[1], outs[0], nthreads), name="C") -def convolution_inference(data, kernel, bias, padding, stride): +def convolution_inference(data, kernel, bias, padding, stride, nthreads=1): """Create an extern op to do inference convolution of 3D tensor data and 4D tensor kernel and 1D tensor bias with nnpack. @@ -104,9 +104,9 @@ def convolution_inference(data, kernel, bias, padding, stride): lambda ins, outs: _intrin.call_packed( "tvm.contrib.nnpack.convolution_inference", ins[0], ins[1], ins[2], outs[0], padding[0], padding[1], padding[2], padding[3], - stride[0], stride[1]), name="C") + stride[0], stride[1], nthreads), name="C") -def convolution_output(data, kernel, bias, padding): +def convolution_output(data, kernel, bias, padding, nthreads=1): """Create an extern op to compute convolution of 4D tensor data and 4D tensor kernel and 1D tensor bias with nnpack. 
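Editor's aside (not part of the patch above): a minimal sketch of how the new nthreads argument could be used from the Python contrib bindings once this change is applied; the tensor shapes and thread count below are hypothetical.

```python
import tvm
from tvm.contrib import nnpack

# hypothetical sizes: lhs is a length-l vector, rhs is an (m, l) weight matrix
l, m = 1024, 512
lhs = tvm.placeholder((l,), name="lhs", dtype="float32")
rhs = tvm.placeholder((m, l), name="rhs", dtype="float32")

# nthreads is forwarded to the packed function, which now calls NNPackConfig
# to (re)create the pthreadpool with the requested number of worker threads
C = nnpack.fully_connected_inference(lhs, rhs, nthreads=4)
s = tvm.create_schedule(C.op)
```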
@@ -142,6 +142,6 @@ def convolution_output(data, kernel, bias, padding): (batch, output_channels, output_height, output_width), [data, kernel, bias], lambda ins, outs: _intrin.call_packed( "tvm.contrib.nnpack.convolution_output", ins[0], ins[1], ins[2], - outs[0], padding[0], padding[1], padding[2], padding[3]), name="C") + outs[0], padding[0], padding[1], padding[2], padding[3], nthreads), name="C") _init_api("tvm.contrib.nnpack") diff --git a/src/contrib/nnpack/convolution.cc b/src/contrib/nnpack/convolution.cc index 8480a100dfd7..9ca02118aeb3 100644 --- a/src/contrib/nnpack/convolution.cc +++ b/src/contrib/nnpack/convolution.cc @@ -24,6 +24,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference") nnp_padding input_padding{pad_top, pad_right, pad_bottom, pad_left}; uint64_t stride_width = args[8], stride_height = args[9]; nnp_size stride_size{stride_width, stride_height}; + NNPackConfig(args[10]); CHECK_EQ(input->ndim, 3); CHECK_EQ(kernel->ndim, 4); @@ -80,6 +81,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_output") DLTensor* output = args[3]; uint64_t pad_top = args[4], pad_right = args[5], pad_bottom = args[6], pad_left = args[7]; nnp_padding input_padding{pad_top, pad_right, pad_bottom, pad_left}; + NNPackConfig(args[8]); CHECK_EQ(input->ndim, 4); CHECK_EQ(kernel->ndim, 4); diff --git a/src/contrib/nnpack/fully_connected.cc b/src/contrib/nnpack/fully_connected.cc index 6793ecaa36a7..df6356d933aa 100644 --- a/src/contrib/nnpack/fully_connected.cc +++ b/src/contrib/nnpack/fully_connected.cc @@ -21,6 +21,8 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.fully_connected_inference") DLTensor* A = args[0]; DLTensor* B = args[1]; DLTensor* C = args[2]; + NNPackConfig(args[3]); + CHECK_EQ(A->ndim, 1); CHECK_EQ(B->ndim, 2); CHECK_EQ(C->ndim, 1); @@ -49,6 +51,8 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.fully_connected_output") DLTensor* A = args[0]; DLTensor* B = args[1]; DLTensor* C = args[2]; + NNPackConfig(args[3]); + CHECK_EQ(A->ndim, 2); CHECK_EQ(B->ndim, 2); CHECK_EQ(C->ndim, 2); diff --git a/src/contrib/nnpack/nnpack_utils.cc b/src/contrib/nnpack/nnpack_utils.cc index e1e2773c1c8d..631f25b36647 100644 --- a/src/contrib/nnpack/nnpack_utils.cc +++ b/src/contrib/nnpack/nnpack_utils.cc @@ -14,18 +14,23 @@ NNPackThreadLocalEntry* NNPackThreadLocalEntry::ThreadLocal() { return NNPackThreadLocalStore::Get(); } +bool NNPackConfig(uint64_t nthreads) { + NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal(); + if (entry->threadpool != NULL && + pthreadpool_get_threads_count(entry->threadpool) != nthreads) { + pthreadpool_destroy(entry->threadpool); + entry->threadpool = NULL; + } + if (entry->threadpool == NULL) { + entry->threadpool = pthreadpool_create(nthreads); + } + return true; +} + + TVM_REGISTER_GLOBAL("contrib.nnpack._Config") .set_body([](TVMArgs args, TVMRetValue *ret) { - NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal(); - size_t nthreads = args[0].operator uint64_t(); - if (entry->threadpool != NULL && - pthreadpool_get_threads_count(entry->threadpool) != nthreads) { - pthreadpool_destroy(entry->threadpool); - entry->threadpool = NULL; - } - if (entry->threadpool == NULL) { - entry->threadpool = pthreadpool_create(nthreads); - } + CHECK(NNPackConfig(args[0])); }); } // namespace contrib } // namespace tvm diff --git a/src/contrib/nnpack/nnpack_utils.h b/src/contrib/nnpack/nnpack_utils.h index 7a2232add145..fe7420786bde 100644 --- a/src/contrib/nnpack/nnpack_utils.h +++ b/src/contrib/nnpack/nnpack_utils.h @@ -18,6 +18,8 @@ struct 
NNPackThreadLocalEntry { pthreadpool_t threadpool{NULL}; static NNPackThreadLocalEntry* ThreadLocal(); }; + +bool NNPackConfig(uint64_t nthreads); } // namespace contrib } // namespace tvm #endif // TVM_CONTRIB_NNPACK_NNPACK_UTILS_H_ From 6e547634b090b59a352bcd608ce7e6fd63399752 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 11 Nov 2017 12:23:01 -0800 Subject: [PATCH 011/948] [PASS] Enhance LiftAttrScope (#632) * [PASS] Enhance LiftAttrScope * update vt --- src/pass/inject_virtual_thread.cc | 3 +- src/pass/lift_attr_scope.cc | 140 ++++++++++++++++++++---------- 2 files changed, 98 insertions(+), 45 deletions(-) diff --git a/src/pass/inject_virtual_thread.cc b/src/pass/inject_virtual_thread.cc index 72b5753adbcf..07f59cb1b308 100644 --- a/src/pass/inject_virtual_thread.cc +++ b/src/pass/inject_virtual_thread.cc @@ -229,7 +229,8 @@ class VTInjector : public IRMutator { if (visit_touched_var_ && !vt_loop_injected_) { return InjectVTLoop(s, true); } else if (!allow_share_ && !vt_loop_injected_ && - op->attr_key == attr::coproc_uop_scope) { + (op->attr_key == attr::coproc_uop_scope || + op->attr_key == attr::coproc_scope)) { return InjectVTLoop(s, true); } else { Stmt body = Mutate(op->body); diff --git a/src/pass/lift_attr_scope.cc b/src/pass/lift_attr_scope.cc index fdf692782523..a3a60aaac4d1 100644 --- a/src/pass/lift_attr_scope.cc +++ b/src/pass/lift_attr_scope.cc @@ -7,6 +7,7 @@ */ #include #include +#include "./ir_util.h" namespace tvm { namespace ir { @@ -57,41 +58,16 @@ class AttrScopeLifter : public IRMutator { } Stmt Mutate_(const Block* op, const Stmt& s) final { - Stmt first = this->Mutate(op->first); - NodeRef first_node_; - Expr first_value_; - std::swap(first_node_, attr_node_); - std::swap(first_value_, attr_value_); - Stmt rest = this->Mutate(op->rest); - if (attr_node_.defined() && - attr_value_.defined() && - first_node_.defined() && - first_value_.defined() && - attr_node_.same_as(first_node_) && - attr_value_.same_as(first_value_)) { - if (first.same_as(op->first) && rest.same_as(op->rest)) { - return s; - } else { - return Block::make(first, rest); - } - } else { - if (first_node_.defined()) { - first = AttrStmt::make( - first_node_, attr_key_, first_value_, first); - } - if (attr_node_.defined()) { - rest = AttrStmt::make( - attr_node_, attr_key_, attr_value_, rest); - // undefine them - attr_node_ = NodeRef(); - attr_value_ = Expr(); - } - if (first.same_as(op->first) && rest.same_as(op->rest)) { - return s; - } else { - return Block::make(first, rest); - } + std::vector seq; + FlattenSeq(op->first, &seq); + FlattenSeq(op->rest, &seq); + seq = MutateSeq(seq); + if (seq.size() == 2 && + seq[0].same_as(op->first) && + seq[1].same_as(op->rest)) { + return s; } + return MergeSeq(seq); } Stmt Mutate_(const IfThenElse* op, const Stmt& s) final { @@ -99,17 +75,17 @@ class AttrScopeLifter : public IRMutator { return IRMutator::Mutate_(op, s); } Stmt then_case = this->Mutate(op->then_case); - NodeRef first_node_; - Expr first_value_; - std::swap(first_node_, attr_node_); - std::swap(first_value_, attr_value_); + NodeRef first_node; + Expr first_value; + std::swap(first_node, attr_node_); + std::swap(first_value, attr_value_); Stmt else_case = this->Mutate(op->else_case); if (attr_node_.defined() && attr_value_.defined() && - first_node_.defined() && - first_value_.defined() && - attr_node_.same_as(first_node_) && - attr_value_.same_as(first_value_)) { + first_node.defined() && + first_value.defined() && + attr_node_.same_as(first_node) && + ValueSame(attr_value_, 
first_value)) { if (then_case.same_as(op->then_case) && else_case.same_as(op->else_case)) { return s; @@ -117,9 +93,9 @@ class AttrScopeLifter : public IRMutator { return IfThenElse::make(op->condition, then_case, else_case); } } else { - if (first_node_.defined()) { + if (first_node.defined()) { then_case = AttrStmt::make( - first_node_, attr_key_, first_value_, then_case); + first_node, attr_key_, first_value, then_case); } if (attr_node_.defined()) { else_case = AttrStmt::make( @@ -138,6 +114,82 @@ class AttrScopeLifter : public IRMutator { } private: + void FlattenSeq(Stmt s, std::vector* res) { + if (const Block* op = s.as()) { + FlattenSeq(op->first, res); + FlattenSeq(op->rest, res); + } else if (const ProducerConsumer* op = s.as()) { + if (!op->is_producer) { + FlattenSeq(op->body, res); + } else { + res->emplace_back(s); + } + } else { + res->emplace_back(s); + } + } + + std::vector MutateSeq(const std::vector& seq) { + std::vector res_seq; + NodeRef curr_node; + Expr curr_value; + Stmt curr_stmt; + for (const Stmt & stmt : seq) { + attr_node_ = NodeRef(); + attr_value_ = Expr(); + Stmt rest = this->Mutate(stmt); + if (attr_node_.defined() && + attr_value_.defined() && + curr_node.defined() && + curr_value.defined() && + attr_node_.same_as(curr_node) && + ValueSame(attr_value_, curr_value)) { + curr_stmt = Block::make(curr_stmt, rest); + } else { + if (curr_stmt.defined()) { + if (curr_node.defined()) { + curr_stmt = AttrStmt::make( + curr_node, attr_key_, curr_value, curr_stmt); + } + res_seq.push_back(curr_stmt); + } + curr_stmt = rest; + curr_node = attr_node_; + curr_value = attr_value_; + } + } + + if (curr_stmt.defined()) { + // keep attr_node_, attr_node_ + if (res_seq.size() == 0) { + return {curr_stmt}; + } + if (curr_node.defined()) { + curr_stmt = AttrStmt::make( + curr_node, attr_key_, curr_value, curr_stmt); + } + res_seq.push_back(curr_stmt); + // reset + attr_node_ = NodeRef(); + attr_value_ = Expr(); + } + return res_seq; + } + + // value comparison that also compares content of int constant + static bool ValueSame(const Expr& a, const Expr& b) { + if (a.same_as(b)) return true; + if (a->type_key() != b->type_key()) return false; + if (a.type() != b.type()) return false; + if (const IntImm* op = a.as()) { + return op->value == b.as()->value; + } + if (const UIntImm* op = a.as()) { + return op->value == b.as()->value; + } + return false; + } + std::string attr_key_; NodeRef attr_node_; Expr attr_value_; From bb9d5f8cfea9dfbee4e65b74e2b0b86e19796e97 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 12 Nov 2017 08:53:28 +0800 Subject: [PATCH 012/948] [TUTORIAL] use OpenCL on ARM board (#633) --- .../deployment/cross_compilation_and_rpc.py | 58 ++++++++++++++++++- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/tutorials/deployment/cross_compilation_and_rpc.py b/tutorials/deployment/cross_compilation_and_rpc.py index 7848b2a23273..859b24472483 100644 --- a/tutorials/deployment/cross_compilation_and_rpc.py +++ b/tutorials/deployment/cross_compilation_and_rpc.py @@ -168,8 +168,8 @@ # `LLVM guide of cross compilation `_. 
 ######################################################################
-# Run Kernel Remotely by RPC
-# --------------------------
+# Run CPU Kernel Remotely by RPC
+# ------------------------------
 # Here we will show you how to run the kernel on the remote device:
 
 # replace host with the ip address of your device
@@ -204,6 +204,60 @@
 cost = time_f(a, b).mean
 print('%g secs/op' % cost)
 
+#########################################################################
+# Run OpenCL Kernel Remotely by RPC
+# ---------------------------------
+# As for remote OpenCL devices, the workflow is almost the same as above.
+# You can define the kernel, upload files, and run by RPC. The files
+# include host object, kernel source code and module meta file. We rely
+# on the remote compiler to re-link them.
+#
+# .. note::
+#    Raspberry Pi does not support OpenCL; the following code is tested on
+#    Firefly-RK3399. The target_host should be 'llvm -target=aarch64-linux-gnu'.
+#    But here we set 'llvm' to enable this tutorial to run locally.
+
+# build kernel (different from cpu, we need bind axis for OpenCL)
+s = tvm.create_schedule(B.op)
+xo, xi = s[B].split(B.op.axis[0], factor=32)
+s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
+s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
+f = tvm.build(s, [A, B], "opencl", target_host="llvm", name="myadd")
+
+# save files
+path_o = temp.relpath("myadd.o")
+path_cl = temp.relpath("myadd.cl")
+path_json = temp.relpath("myadd.tvm_meta.json")
+f.save(path_o)
+f.imported_modules[0].save(path_cl)
+
+# upload files
+remote.upload(path_o)
+remote.upload(path_cl)
+remote.upload(path_json)
+
+# load files on remote device
+fhost = remote.load_module("myadd.o")
+fdev = remote.load_module("myadd.cl")
+fhost.import_module(fdev)
+
+# run
+ctx = remote.cl(0)
+a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
+b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
+fhost(a, b)
+np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
+
+#####################################################################
+# Instead of uploading files separately, there is a more convenient way.
+# You can export the library as a tar ball.
+path_tar = temp.relpath("myadd.tar") +f.export_library(path_tar) +remote.upload(path_tar) +fhost = remote.load_module("myadd.tar") +fhost(a, b) +np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) + # terminate the server after experiment server.terminate() From 18706afdc6491d1d40d7cbf3306b82ba262eb2cc Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 11 Nov 2017 18:15:31 -0800 Subject: [PATCH 013/948] [PASS] Update coproc sync (#634) --- python/tvm/build_module.py | 10 +- src/pass/coproc_sync.cc | 268 +++++++++++++++++- src/pass/storage_rewrite.cc | 8 +- src/runtime/thread_storage_scope.h | 2 +- .../python/unittest/test_pass_storage_sync.py | 21 ++ 5 files changed, 303 insertions(+), 6 deletions(-) diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index ff6086778e7f..4fe502e987d5 100644 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -201,7 +201,8 @@ def lower(sch, add_lower_pass = cfg.add_lower_pass if cfg.add_lower_pass else [] lower_phase0 = [x[1] for x in add_lower_pass if x[0] == 0] lower_phase1 = [x[1] for x in add_lower_pass if x[0] == 1] - lower_phase2 = [x[1] for x in add_lower_pass if x[0] > 1] + lower_phase2 = [x[1] for x in add_lower_pass if x[0] == 2] + lower_phase3 = [x[1] for x in add_lower_pass if x[0] > 2] # normalize schedule first sch = sch.normalize() # Phase 0 @@ -213,6 +214,9 @@ def lower(sch, # Phase 1 stmt = ir_pass.StorageFlatten(stmt, binds, 64) stmt = ir_pass.CanonicalSimplify(stmt) + for f in lower_phase1: + stmt = f(stmt) + # Phase 2 if not simple_mode: stmt = ir_pass.LoopPartition(stmt) stmt = ir_pass.VectorizeLoop(stmt) @@ -224,14 +228,14 @@ def lower(sch, cfg.auto_unroll_max_step, cfg.auto_unroll_max_depth, cfg.unroll_explicit) - for f in lower_phase1: + for f in lower_phase2: stmt = f(stmt) # Phase 2 stmt = ir_pass.Simplify(stmt) stmt = ir_pass.LowerStorageAccessInfo(stmt) stmt = ir_pass.RemoveNoOp(stmt) stmt = ir_pass.RewriteUnsafeSelect(stmt) - for f in lower_phase2: + for f in lower_phase3: stmt = f(stmt) if simple_mode: return stmt diff --git a/src/pass/coproc_sync.cc b/src/pass/coproc_sync.cc index fa77942b6058..28be8aba2057 100644 --- a/src/pass/coproc_sync.cc +++ b/src/pass/coproc_sync.cc @@ -338,6 +338,256 @@ class CoProcBarrierDetector : public StorageAccessVisitor { }; +class CoProcInstDepDetector : public IRVisitor { + public: + explicit CoProcInstDepDetector( + const IterVar& coproc_axis, + const std::string& coproc_name) + : coproc_axis_(coproc_axis) { + sync_push_name_ = coproc_name + ".coproc_dep_push"; + sync_pop_name_ = coproc_name + ".coproc_dep_pop"; + } + + void Plan(Stmt stmt) { + this->Visit(stmt); + if (last_state_.node != nullptr) { + MatchFixEnterPop(first_state_); + MatchFixExitPush(last_state_); + } + } + + void Visit_(const AttrStmt* op) final { + if (op->attr_key == attr::coproc_scope && + op->node.same_as(coproc_axis_)) { + const IntImm* ctx_id = op->value.as(); + CHECK(ctx_id != nullptr); + curr_state_.clear(); + curr_state_.node = op->body.get(); + curr_state_.enter_ctx.insert(ctx_id->value); + curr_state_.exit_ctx.insert(ctx_id->value); + UpdateState(); + } else { + IRVisitor::Visit_(op); + } + } + + void Visit_(const For* op) final { + SyncState temp_first, temp_last; + std::swap(first_state_, temp_first); + std::swap(last_state_, temp_last); + this->Visit(op->body); + curr_state_.clear(); + if (last_state_.node != nullptr) { + curr_state_.node = op; + CHECK(first_state_.node != nullptr); + // loop carry dependency + InjectSync(last_state_, first_state_, + 
&(curr_state_.exit_push), + &(curr_state_.enter_pop)); + curr_state_.enter_ctx = first_state_.enter_ctx; + curr_state_.exit_ctx = last_state_.enter_ctx; + } + std::swap(first_state_, temp_first); + std::swap(last_state_, temp_last); + if (curr_state_.node != nullptr) { + UpdateState(); + } + } + + void Visit_(const IfThenElse* op) final { + SyncState temp_first, temp_last, curr_state; + std::swap(first_state_, temp_first); + std::swap(last_state_, temp_last); + { + // then stmt + this->Visit(op->then_case); + if (last_state_.node != nullptr) { + curr_state.node = op; + MatchFixEnterPop(first_state_); + MatchFixExitPush(last_state_); + curr_state.enter_ctx.insert( + first_state_.enter_ctx.begin(), + first_state_.enter_ctx.end()); + curr_state.exit_ctx.insert( + last_state_.exit_ctx.begin(), + last_state_.exit_ctx.end()); + } + first_state_.clear(); + last_state_.clear(); + } + if (op->else_case.defined()) { + this->Visit(op->else_case); + if (last_state_.node != nullptr) { + curr_state.node = op; + MatchFixEnterPop(first_state_); + MatchFixExitPush(last_state_); + curr_state.enter_ctx.insert( + first_state_.enter_ctx.begin(), + first_state_.enter_ctx.end()); + curr_state.exit_ctx.insert( + last_state_.exit_ctx.begin(), + last_state_.exit_ctx.end()); + } + } + // update in the trace. + std::swap(first_state_, temp_first); + std::swap(last_state_, temp_last); + std::swap(curr_state_, curr_state); + if (curr_state_.node != nullptr) { + UpdateState(); + } + } + + // insert before is stored in reverse order + // the first element is closest to the node. + std::unordered_map > insert_before_; + std::unordered_map > insert_after_; + + private: + // state in the sync entry + struct SyncState { + // The statement of the state. + const Node* node{nullptr}; + // Set of all possible contexts in the entering moment. + std::unordered_set enter_ctx; + // Set of all possible contexts in the exit moment. + std::unordered_set exit_ctx; + // existing pop performed at enter + std::vector > enter_pop; + // existing push peformed at exit + std::vector > exit_push; + // clear the state + void clear() { + node = nullptr; + enter_ctx.clear(); + exit_ctx.clear(); + enter_pop.clear(); + exit_push.clear(); + } + }; + // inject proper sync into the pair + // record the push/pop sequence that could be possibly un-matched. + // return the push/pop message at enter/exit of the Block + // after considering the existing unmatcheded events and added events + void InjectSync(const SyncState& prev, + const SyncState& next, + std::vector >* prev_exit_push, + std::vector >* next_enter_pop) { + prev_exit_push->clear(); + next_enter_pop->clear(); + // quick path + if (prev.exit_push.size() == 0 && next.enter_pop.size() == 0 && + prev.exit_ctx.size() == 1 && next.enter_ctx.size() == 1) { + int from = *prev.exit_ctx.begin(); + int to = *next.enter_ctx.begin(); + if (from != to) { + insert_after_[prev.node].emplace_back(MakePush(from, to)); + insert_before_[next.node].emplace_back(MakePop(from, to)); + prev_exit_push->emplace_back(std::make_pair(from, to)); + next_enter_pop->emplace_back(std::make_pair(from, to)); + } + return; + } + // complicate path. 
+ std::vector > vpush = prev.exit_push; + std::vector > vpop = next.enter_pop; + std::vector > pending; + for (int from : prev.exit_ctx) { + for (int to : next.enter_ctx) { + if (from != to) { + pending.emplace_back(std::make_pair(from, to)); + } + } + } + // policy 1 + std::vector prev_after, next_before; + for (const std::pair& p : pending) { + if (std::find(prev.exit_push.begin(), + prev.exit_push.end(), p) == + prev.exit_push.end()) { + vpush.push_back(p); + prev_after.emplace_back(MakePush(p.first, p.second)); + } + if (std::find(next.enter_pop.begin(), + next.enter_pop.end(), p) == + next.enter_pop.end()) { + vpop.push_back(p); + next_before.emplace_back(MakePop(p.first, p.second)); + } + } + // fix pending + for (const std::pair& p : vpush) { + if (std::find(vpop.begin(), vpop.end(), p) == vpop.end()) { + prev_after.emplace_back(MakePop(p.first, p.second)); + } else { + prev_exit_push->push_back(p); + } + } + for (const std::pair& p : vpop) { + if (std::find(vpush.begin(), vpush.end(), p) == vpush.end()) { + next_before.emplace_back(MakePush(p.first, p.second)); + } else { + next_enter_pop->push_back(p); + } + } + if (prev_after.size() != 0) { + auto &v1 = insert_after_[prev.node]; + v1.insert(v1.end(), prev_after.begin(), prev_after.end()); + } + if (next_before.size() != 0) { + auto &v2 = insert_before_[next.node]; + v2.insert(v2.end(), next_before.begin(), next_before.end()); + } + } + + void MatchFixEnterPop(const SyncState& state) { + if (state.enter_pop.size() == 0) return; + auto &vec = insert_before_[state.node]; + for (const std::pair& p : state.enter_pop) { + vec.push_back(MakePush(p.first, p.second)); + } + } + + void MatchFixExitPush(const SyncState& state) { + if (state.exit_push.size() == 0) return; + auto &vec = insert_after_[state.node]; + for (const std::pair& p : state.exit_push) { + vec.push_back(MakePop(p.first, p.second)); + } + } + + void UpdateState() { + if (last_state_.node != nullptr) { + std::vector > t1, t2; + InjectSync(last_state_, curr_state_, &t1, &t2); + std::swap(last_state_, curr_state_); + } else { + CHECK(first_state_.node == nullptr); + first_state_ = curr_state_; + last_state_ = curr_state_; + } + } + + Stmt MakePush(int from, int to) { + return Evaluate::make(Call::make( + Int(32), sync_push_name_, + {make_const(Int(32), from), make_const(Int(32), to)}, + Call::Intrinsic)); + } + Stmt MakePop(int from, int to) { + return Evaluate::make(Call::make( + Int(32), sync_pop_name_, + {make_const(Int(32), from), make_const(Int(32), to)}, + Call::Intrinsic)); + } + // sync states. 
+ SyncState first_state_, last_state_, curr_state_; + // Variables + IterVar coproc_axis_; + std::string sync_push_name_, sync_pop_name_; +}; + + class CoProcSyncInserter : public IRMutator { public: Stmt Insert(Stmt stmt) { @@ -372,6 +622,18 @@ class CoProcSyncInserter : public IRMutator { auto& vec = insert_after_[kv.first]; vec.insert(vec.end(), kv.second.begin(), kv.second.end()); } + // Detect barrier + CoProcInstDepDetector sync_detector( + *visitor.coproc_.begin(), coproc_name); + sync_detector.Plan(stmt); + for (const auto& kv : sync_detector.insert_before_) { + auto& vec = insert_before_[kv.first]; + vec.insert(vec.end(), kv.second.begin(), kv.second.end()); + } + for (const auto& kv : sync_detector.insert_after_) { + auto& vec = insert_after_[kv.first]; + vec.insert(vec.end(), kv.second.begin(), kv.second.end()); + } return Mutate(stmt); } @@ -379,7 +641,8 @@ class CoProcSyncInserter : public IRMutator { Stmt before, after; auto it = insert_before_.find(stmt.get()); if (it != insert_before_.end()) { - before = MergeSeq(it->second); + before = MergeSeq(std::vector( + it->second.rbegin(), it->second.rend())); } it = insert_after_.find(stmt.get()); if (it != insert_after_.end()) { @@ -396,10 +659,13 @@ class CoProcSyncInserter : public IRMutator { } private: + // insert before is stored in reverse order + // the first element is closest to the node. std::unordered_map > insert_before_; std::unordered_map > insert_after_; }; + Stmt CoProcSync(Stmt stmt) { return CoProcSyncInserter().Insert(stmt); } diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc index 933a84598b88..2f3616017215 100644 --- a/src/pass/storage_rewrite.cc +++ b/src/pass/storage_rewrite.cc @@ -189,7 +189,7 @@ class StoragePlanRewriter : public IRMutator { if (attach_map_.count(nullptr)) { std::vector nest; for (StorageEntry* e : attach_map_.at(nullptr)) { - CHECK_EQ(e->scope.rank, 0); + // CHECK_EQ(e->scope.rank, 0); if (e->new_alloc.defined()) { nest.emplace_back(AttrStmt::make( e->alloc_var, attr::storage_scope, @@ -395,6 +395,12 @@ class StoragePlanRewriter : public IRMutator { e->new_alloc = Allocate::make( e->alloc_var, alloc_type, e->allocs[0]->extents, e->allocs[0]->condition, Evaluate::make(0)); + if (e->scope.tag.length() != 0) { + MemoryInfo info = GetMemoryInfo(e->scope.to_string()); + uint64_t total_elem = e->const_nbits / e->elem_type.bits(); + CHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits) + << "Allocation exceed bound of memory tag " << e->scope.to_string(); + } } else { // Build a merged allocation Expr combo_size; diff --git a/src/runtime/thread_storage_scope.h b/src/runtime/thread_storage_scope.h index e2767647dd77..48b5e8f1ef16 100644 --- a/src/runtime/thread_storage_scope.h +++ b/src/runtime/thread_storage_scope.h @@ -71,7 +71,7 @@ struct ThreadScope { */ static ThreadScope make(const std::string& s) { ThreadScope r; - if (s == "vthread") { + if (s == "vthread" || s == "cthread") { // virtual thread at the same level as local r.rank = 1; r.dim_index = -1; diff --git a/tests/python/unittest/test_pass_storage_sync.py b/tests/python/unittest/test_pass_storage_sync.py index 8360ed2f0c3f..ce9e2f9a4af9 100644 --- a/tests/python/unittest/test_pass_storage_sync.py +++ b/tests/python/unittest/test_pass_storage_sync.py @@ -58,6 +58,27 @@ def meminfo_cache(): assert(blist[-1].value.args[3].value == 10) +def test_coproc_sync2(): + ib = tvm.ir_builder.create() + n = tvm.var("n") + cp = tvm.thread_axis((0, 1), "cop") + ty = tvm.thread_axis("cthread") + A = 
ib.allocate("float32", 128, name="A") + ib.scope_attr(ty, "virtual_thread", 2) + with ib.new_scope(): + ib.scope_attr(cp, "coproc_scope", 2) + A[ty] = 0.0 + with ib.for_range(0, n, name="i") as i: + with ib.new_scope(): + ib.scope_attr(cp, "coproc_scope", 1) + A[ty] = 1.0 + with ib.new_scope(): + ib.scope_attr(cp, "coproc_scope", 2) + A[ty] = 1.0 + stmt = ib.get() + stmt = tvm.ir_pass.CoProcSync(stmt) + if __name__ == "__main__": test_coproc_sync() test_storage_sync() + test_coproc_sync2() From f5990efce668b56515c52e7e69ce5795a00a3b87 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 12 Nov 2017 14:50:00 -0800 Subject: [PATCH 014/948] [CODEGEN] Enable closure with no argument (#635) --- src/codegen/llvm/codegen_cpu.cc | 18 ++++++++++++------ src/codegen/llvm/codegen_cpu.h | 2 +- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/codegen/llvm/codegen_cpu.cc b/src/codegen/llvm/codegen_cpu.cc index bc04a6337f28..ea5d90bccc1f 100644 --- a/src/codegen/llvm/codegen_cpu.cc +++ b/src/codegen/llvm/codegen_cpu.cc @@ -337,7 +337,11 @@ void CodeGenCPU::CreateComputeScope(const AttrStmt* op) { builder_->SetInsertPoint(compute_call_end); } -llvm::Value* CodeGenCPU::PackClosureData(const Array& vfields) { +llvm::Value* CodeGenCPU::PackClosureData(const Array& vfields, uint64_t* num_bytes) { + if (vfields.size() == 0) { + *num_bytes = 0U; + return llvm::Constant::getNullValue(t_void_p_); + } std::vector fields; for (Var v : vfields) { auto it = var_map_.find(v.get()); @@ -352,6 +356,8 @@ llvm::Value* CodeGenCPU::PackClosureData(const Array& vfields) { var_map_.at(vfields[i].get()), builder_->CreateInBoundsGEP(cdata, {zero, ConstInt32(i)})); } + *num_bytes = data_layout_->getTypeAllocSize( + llvm::cast(cdata->getType())->getElementType()); return cdata; } @@ -374,7 +380,8 @@ void CodeGenCPU::CreateParallelLaunch(const Stmt& body, int num_task) { "__tvm_parallel_lambda", module_.get()); // allocate and setup the closure, call the closure. Array vfields = ir::UndefinedVars(body, {}); - llvm::Value* cdata = PackClosureData(vfields); + uint64_t nbytes; + llvm::Value* cdata = PackClosureData(vfields, &nbytes); BasicBlock* par_launch_end = CheckCallSuccess( builder_->CreateCall( RuntimeTVMParallelLaunch(), @@ -431,14 +438,13 @@ void CodeGenCPU::CreateStaticInit(const std::string& init_fname, const Stmt& bod ftype_tvm_static_init_, llvm::Function::ExternalLinkage, init_fname, module_.get()); } // allocate and setup the closure, call the closure. + uint64_t nbytes; Array vfields = ir::UndefinedVars(body, {}); - llvm::Value* cdata = PackClosureData(vfields); - llvm::Value* nbytes = ConstInt32(data_layout_->getTypeAllocSize( - llvm::cast(cdata->getType())->getElementType())); + llvm::Value* cdata = PackClosureData(vfields, &nbytes); BasicBlock* init_end = CheckCallSuccess( builder_->CreateCall( finit, - {gv, f, builder_->CreatePointerCast(cdata, t_void_p_), nbytes})); + {gv, f, builder_->CreatePointerCast(cdata, t_void_p_), ConstInt32(nbytes)})); // Setup the closure function. 
BasicBlock *lambda_entry = BasicBlock::Create(*ctx_, "entry", f); builder_->SetInsertPoint(lambda_entry); diff --git a/src/codegen/llvm/codegen_cpu.h b/src/codegen/llvm/codegen_cpu.h index ac7f2e34dc64..702d8777d50b 100644 --- a/src/codegen/llvm/codegen_cpu.h +++ b/src/codegen/llvm/codegen_cpu.h @@ -73,7 +73,7 @@ class CodeGenCPU : public CodeGenLLVM { llvm::Value* RuntimeTVMParallelLaunch(); llvm::Value* RuntimeTVMParallelBarrier(); llvm::Value* GetPackedFuncHandle(const std::string& str); - llvm::Value* PackClosureData(const Array& fields); + llvm::Value* PackClosureData(const Array& fields, uint64_t *num_bytes); llvm::Value* CreateStructRefPtr(Type t, llvm::Value* buffer, llvm::Value* index, int kind); void UnpackClosureData(llvm::Value*cdata, const Array& fields, From 22652d626b1e9c5af588634bbf1400b5f9194686 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 12 Nov 2017 16:30:02 -0800 Subject: [PATCH 015/948] [PASS] Fix vthread when extern access touching (#636) --- src/pass/inject_virtual_thread.cc | 50 ++++++++++++++++--- .../unittest/test_pass_inject_vthread.py | 34 +++++++++++++ 2 files changed, 76 insertions(+), 8 deletions(-) diff --git a/src/pass/inject_virtual_thread.cc b/src/pass/inject_virtual_thread.cc index 07f59cb1b308..28e90ec4805b 100644 --- a/src/pass/inject_virtual_thread.cc +++ b/src/pass/inject_virtual_thread.cc @@ -15,11 +15,12 @@ namespace ir { // If expression is touched by var. class ExprTouched final : public IRVisitor { public: - explicit ExprTouched(const std::unordered_set &touched) - : touched_var_(touched) {} + explicit ExprTouched(const std::unordered_set &touched, + bool check_write) + : touched_var_(touched), check_write_(check_write) {} void Visit(const NodeRef& n) final { // early stopping - if (expr_touched_) return; + if (expr_touched_ && !check_write_) return; IRVisitor::Visit(n); } void Visit_(const Load *op) final { @@ -29,6 +30,24 @@ class ExprTouched final : public IRVisitor { void Visit_(const Variable *op) final { HandleUseVar(op); } + void Visit_(const Call *op) final { + if (op->is_intrinsic(intrinsic::tvm_access_ptr)) { + int rw_mask; + CHECK(arith::GetConstInt(op->args[4], &rw_mask)); + const Variable* buffer_var = op->args[1].as(); + CHECK(buffer_var); + // read + if (rw_mask & 1) { + HandleUseVar(buffer_var); + } + if (rw_mask & 2) { + HandleWriteVar(buffer_var); + } + this->Visit(op->args[2]); + } else { + IRVisitor::Visit_(op); + } + } void HandleUseVar(const Variable* var) { auto it = touched_var_.find(var); if (it != touched_var_.end()) { @@ -40,36 +59,49 @@ class ExprTouched final : public IRVisitor { used_vars_.push_back(var); } } + void HandleWriteVar(const Variable* var) { + write_vars_.push_back(var); + } // the fields. 
bool expr_touched_{false}; std::vector used_vars_; + std::vector write_vars_; const std::unordered_set& touched_var_; + bool check_write_; }; // Analyze if the buffers are invariant to value of var class VarTouchedAnalysis : public IRVisitor { public: void Visit_(const LetStmt *op) { - ExprTouched tc(touched_var_); + ExprTouched tc(touched_var_, false); tc.Visit(op->value); Record(op->var.get(), tc); this->Visit(op->body); } void Visit_(const Store *op) { - ExprTouched tc(touched_var_); + ExprTouched tc(touched_var_, false); tc.Visit(op->value); tc.Visit(op->index); Record(op->buffer_var.get(), tc); } void Visit_(const For *op) { - ExprTouched tc(touched_var_); + ExprTouched tc(touched_var_, false); tc.Visit(op->min); tc.Visit(op->extent); Record(op->loop_var.get(), tc); this->Visit(op->body); } + // external function call + void Visit_(const Evaluate *op) { + ExprTouched tc(touched_var_, true); + tc.Visit(op->value); + for (const Variable* var : tc.write_vars_) { + Record(var, tc); + } + } void Visit_(const Allocate *op) { - ExprTouched tc(touched_var_); + ExprTouched tc(touched_var_, false); for (size_t i = 0; i < op->extents.size(); ++i) { tc.Visit(op->extents[i]); } @@ -87,7 +119,9 @@ class VarTouchedAnalysis : public IRVisitor { touched_var_.insert(var); } else { for (const Variable* r : tc.used_vars_) { - affect_[r].push_back(var); + if (r != var) { + affect_[r].push_back(var); + } } } } diff --git a/tests/python/unittest/test_pass_inject_vthread.py b/tests/python/unittest/test_pass_inject_vthread.py index e4b3b51fbd2d..502a55574df0 100644 --- a/tests/python/unittest/test_pass_inject_vthread.py +++ b/tests/python/unittest/test_pass_inject_vthread.py @@ -28,5 +28,39 @@ def get_vthread(name): stmt = tvm.ir_pass.InjectVirtualThread(get_vthread("cthread")) assert len(stmt.body.body.extents) == 3 + +def test_vthread_extern(): + dtype = 'int64' + n = 100 + m = 4 + nthread = 2 + def get_vthread(name): + tx = tvm.thread_axis(name) + ty = tvm.thread_axis(name) + ib = tvm.ir_builder.create() + with ib.for_range(0, n) as i: + ib.scope_attr(tx, "virtual_thread", nthread) + ib.scope_attr(ty, "virtual_thread", nthread) + A = ib.allocate("float32", m, name="A", scope="shared") + B = ib.allocate("float32", m, name="B", scope="shared") + C = ib.allocate("float32", m, name="C", scope="shared") + cbuffer = tvm.decl_buffer((m,), dtype=C.dtype, data=C.asnode()) + abuffer = tvm.decl_buffer((m,), dtype=A.dtype, data=A.asnode()) + bbuffer = tvm.decl_buffer((m,), dtype=B.dtype, data=B.asnode()) + A[tx] = tx + 1.0 + B[ty] = ty + 1.0 + ib.emit(tvm.call_extern("int32", "Run", + abuffer.access_ptr("r"), + bbuffer.access_ptr("r"), + cbuffer.access_ptr("rw"))) + return ib.get() + + stmt = tvm.ir_pass.InjectVirtualThread(get_vthread("vthread")) + assert stmt.body.body.extents[0].value == 2 + assert stmt.body.body.body.body.body.body.extents[0].value == 2 + assert len(stmt.body.body.body.body.body.body.extents) == 3 + + if __name__ == "__main__": + test_vthread_extern() test_vthread() From 80c434dd076fd12c6aaf5e7cba422da53672b214 Mon Sep 17 00:00:00 2001 From: abergeron Date: Mon, 13 Nov 2017 15:56:47 -0500 Subject: [PATCH 016/948] Fix conda packages (#642) * Make the tvm conda package build with in-place source and use cmake from conda. * Add a package for topi. 
--- python/conda/build.sh | 2 +- python/conda/meta.yaml | 1 + topi/python/conda/meta.yaml | 30 ++++++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 topi/python/conda/meta.yaml diff --git a/python/conda/build.sh b/python/conda/build.sh index 4965abc56b54..60582de9523b 100644 --- a/python/conda/build.sh +++ b/python/conda/build.sh @@ -26,7 +26,7 @@ mkdir -p build cd build # Enable static-libstdc++ to make it easier to link this library with # other C++ compilers -CXXFLAGS=-static-libstdc++ cmake3 -DCMAKE_PREFIX_PATH=${PREFIX} -DCMAKE_INSTALL_PREFIX=${PREFIX} -DUSE_CUDA=1 -DUSE_LLVM=1 -DINSTALL_DEV=1 $CUDA_ARGS .. +CXXFLAGS=-static-libstdc++ cmake -DCMAKE_PREFIX_PATH=${PREFIX} -DCMAKE_INSTALL_PREFIX=${PREFIX} -DUSE_CUDA=1 -DUSE_LLVM=1 -DINSTALL_DEV=1 $CUDA_ARGS .. make -j20 VERBOSE=1 make install/fast cd .. diff --git a/python/conda/meta.yaml b/python/conda/meta.yaml index 9ebb5afac543..f4e64dfd6bef 100644 --- a/python/conda/meta.yaml +++ b/python/conda/meta.yaml @@ -16,6 +16,7 @@ build: requirements: build: - llvmdev ==4.0.0 + - cmake - python >=3 - numpy - setuptools diff --git a/topi/python/conda/meta.yaml b/topi/python/conda/meta.yaml new file mode 100644 index 000000000000..37d3a35da433 --- /dev/null +++ b/topi/python/conda/meta.yaml @@ -0,0 +1,30 @@ +{% set version = "0.1.dev" %} + +package: + name: topi + version: {{ version }} + +source: + path: .. + +build: + number: 0 + script: $PYTHON setup.py install + +requirements: + build: + - cmake + - python >=3 + - numpy + - setuptools + - nose + - decorator + run: + - python >=3 + - numpy + - decorator + +about: + home: https://github.com/dmlc/tvm + license: Apache2 + summary: "TOPI: TVM Operator Inventory" From a3d455e5ad1147b8b5b3d5ade3c1338f9fa7753c Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Mon, 13 Nov 2017 21:07:01 -0800 Subject: [PATCH 017/948] conv2d perf improved for conv2d_56_64_128, super resolution workloads added (#643) * conv2d perf improved for conv2d_56_64_128, test name added to differentiate workloads * fix lint error --- topi/python/topi/cuda/conv2d_nchw.py | 32 ++++++++++++++-------- topi/tests/python/test_topi_conv2d_nchw.py | 14 +++++++--- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py index fea0f542f448..b6bd35768112 100644 --- a/topi/python/topi/cuda/conv2d_nchw.py +++ b/topi/python/topi/cuda/conv2d_nchw.py @@ -95,20 +95,30 @@ def conv2d_56_64_128(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag): thread_yz = tvm.thread_axis((0, vthread_y), "vthread", name="vy") i, oc, h, w = s[Out].op.axis - ow, iw = s[Out].split(w, factor=num_thread_x) - oh, ih = s[Out].split(h, factor=vthread_x) + factor = util.get_const_int(Out.shape[3]) ooc, ioc = s[Out].split(oc, factor=num_thread_y*vthread_y) oioc, iioc = s[Out].split(ioc, nparts=vthread_y) - s[Out].reorder(i, ooc, oh, ow, oioc, ih, iioc, iw) - oh = s[Out].fuse(oh, ow) - s[Out].bind(iw, thread_x) s[Out].bind(iioc, thread_y) - s[Out].bind(ih, thread_xz) s[Out].bind(oioc, thread_yz) - s[Out].bind(oh, block_x) s[Out].bind(ooc, block_y) - - s[Out_L].compute_at(s[Out], iw) + if factor < num_thread_x*vthread_x: + oh, ih = s[Out].split(h, factor=num_thread_x*vthread_x//factor) + w = s[Out].fuse(ih, w) + ow, iw = s[Out].split(w, nparts=vthread_x) + s[Out].reorder(i, ooc, oh, oioc, ow, iioc, iw) + s[Out].bind(iw, thread_x) + s[Out].bind(ow, thread_xz) + s[Out].bind(oh, block_x) + s[Out_L].compute_at(s[Out], iw) + else: + ow, iw = 
s[Out].split(w, factor=num_thread_x) + oh, ih = s[Out].split(h, factor=vthread_x) + s[Out].reorder(i, ooc, oh, ow, oioc, ih, iioc, iw) + oh = s[Out].fuse(oh, ow) + s[Out].bind(iw, thread_x) + s[Out].bind(ih, thread_xz) + s[Out].bind(oh, block_x) + s[Out_L].compute_at(s[Out], iw) # schedule Out_L local write i, oc, h, w = s[Out_L].op.axis @@ -350,14 +360,14 @@ def conv2d_56_64_64(s, Filter, temp_S, Filter_S, Out, Out_L): if util.get_const_int(Filter.shape[0]) == 64: opart2 = 8 ifactor = 16 - sfactor = max(1, ofactor//(opart2*2)) + sfactor = max(1, ofactor // (opart2*2)) spart = max(1, (wfactor + vthread-1) // vthread) block_x = tvm.thread_axis("blockIdx.x") block_y = tvm.thread_axis("blockIdx.y") block_z = tvm.thread_axis("blockIdx.z") thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x") - thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y") + thread_y = tvm.thread_axis((0, wfactor // vthread), "threadIdx.y") thread_xz = tvm.thread_axis((0, vthread), "vthread", name="vx") thread_yz = tvm.thread_axis((0, vthread), "vthread", name="vy") diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py index 6e18d9110520..86cfc9493cdc 100644 --- a/topi/tests/python/test_topi_conv2d_nchw.py +++ b/topi/tests/python/test_topi_conv2d_nchw.py @@ -6,7 +6,6 @@ from tvm.contrib.pickle_memoize import memoize from topi.util import get_const_tuple - def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding): in_height = in_width = in_size @@ -42,10 +41,10 @@ def check_device(device): w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) - with tvm.build_config(auto_unroll_max_step=128, + with tvm.build_config(auto_unroll_max_step=1400, unroll_explicit=(device != "cuda")): - func1 = tvm.build(s1, [A, W, B], device) - func2 = tvm.build(s2, [A, W, C], device) + func1 = tvm.build(s1, [A, W, B], device, name="conv2d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding)) + func2 = tvm.build(s2, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding)) func1(a, w, b) func2(a, w, c) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) @@ -56,6 +55,7 @@ def check_device(device): def test_conv2d_nchw(): + # ResNet18 worklaods verify_conv2d_nchw(1, 3, 224, 64, 7, 3, 2) verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1) verify_conv2d_nchw(1, 64, 56, 64, 1, 1, 0) @@ -68,7 +68,13 @@ def test_conv2d_nchw(): verify_conv2d_nchw(1, 256, 14, 512, 3, 2, 1) verify_conv2d_nchw(1, 256, 14, 512, 1, 2, 0) verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1) + # Vgg16 workloads verify_conv2d_nchw(1, 128, 122, 128, 3, 1, 1) + # Super resolution workloads + verify_conv2d_nchw(1, 1, 224, 64, 5, 1, 2) + verify_conv2d_nchw(1, 64, 224, 64, 3, 1, 1) + verify_conv2d_nchw(1, 64, 224, 32, 3, 1, 1) + verify_conv2d_nchw(1, 32, 224, 9, 3, 1, 1) if __name__ == "__main__": test_conv2d_nchw() From afa6a3c16434ff6beb7d31a54bfd607540943045 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 13 Nov 2017 21:25:10 -0800 Subject: [PATCH 018/948] [APP] improve parameter pack (#645) --- apps/howto_deploy/tvm_runtime_pack.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/apps/howto_deploy/tvm_runtime_pack.cc b/apps/howto_deploy/tvm_runtime_pack.cc index e5c65b66b71a..9a090d863729 100644 --- a/apps/howto_deploy/tvm_runtime_pack.cc +++ 
b/apps/howto_deploy/tvm_runtime_pack.cc @@ -44,10 +44,19 @@ // #include "../../src/runtime/rpc/rpc_event_impl.cc" // #include "../../src/runtime/rpc/rpc_server_env.cc" +// These macros enables the device API when uncommented. +#define TVM_CUDA_RUNTIME 1 +#define TVM_METAL_RUNTIME 1 +#define TVM_OPENCL_RUNTIME 1 + // Uncomment the following lines to enable Metal // #include "../../src/runtime/metal/metal_device_api.mm" // #include "../../src/runtime/metal/metal_module.mm" +// Uncomment the following lines to enable CUDA +// #include "../../src/runtime/cuda/cuda_device_api.cc" +// #include "../../src/runtime/cuda/cuda_runtime.cc" + // Uncomment the following lines to enable OpenCL // #include "../../src/runtime/opencl/opencl_device_api.cc" // #include "../../src/runtime/opencl/opencl_module.cc" From f344a04ab1e2e6a3df56dcce1710ae61e2b31192 Mon Sep 17 00:00:00 2001 From: ziheng Date: Tue, 14 Nov 2017 10:11:29 -0800 Subject: [PATCH 019/948] [TOPI] Add out_dtype argument for conv2d; Add x86 schedules (#646) * [TOPI] Add out_dtype argument for conv2d; Add x86 schedules * Fix * Fix lint * Fix --- topi/python/topi/__init__.py | 1 + topi/python/topi/nn/conv2d.py | 80 ++++++++++++----------- topi/python/topi/nn/depthwise_conv2d.py | 6 +- topi/python/topi/rasp/conv2d.py | 19 +++--- topi/python/topi/rasp/depthwise_conv2d.py | 28 ++++---- topi/python/topi/x86/__init__.py | 5 ++ topi/python/topi/x86/conv2d.py | 37 +++++++++++ 7 files changed, 111 insertions(+), 65 deletions(-) create mode 100644 topi/python/topi/x86/__init__.py create mode 100644 topi/python/topi/x86/conv2d.py diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py index 1306f9d9cac8..62a9ae153052 100644 --- a/topi/python/topi/__init__.py +++ b/topi/python/topi/__init__.py @@ -14,6 +14,7 @@ from .transform import * from .broadcast import * from . import nn +from . import x86 from . import cuda from . import rasp from . 
import testing diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index db3a6079f96a..cc1ee0198c3d 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -9,7 +9,7 @@ # workload description of conv2d Workload = namedtuple('Workload', - ['height', 'width', 'in_filter', 'out_filter', + ['in_dtype', 'out_dtype', 'height', 'width', 'in_filter', 'out_filter', 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) # schedule description of spatial @@ -22,36 +22,36 @@ _WORKLOADS = [ # workloads of resnet18 on imagenet - Workload(224, 224, 3, 64, 7, 7, 3, 3, 2, 2), - Workload(56, 56, 64, 64, 3, 3, 1, 1, 1, 1), - Workload(56, 56, 64, 64, 1, 1, 0, 0, 1, 1), - Workload(56, 56, 64, 128, 3, 3, 1, 1, 2, 2), - Workload(56, 56, 64, 128, 1, 1, 0, 0, 2, 2), - Workload(28, 28, 128, 128, 3, 3, 1, 1, 1, 1), - Workload(28, 28, 128, 256, 3, 3, 1, 1, 2, 2), - Workload(28, 28, 128, 256, 1, 1, 0, 0, 2, 2), - Workload(14, 14, 256, 256, 3, 3, 1, 1, 1, 1), - Workload(14, 14, 256, 512, 3, 3, 1, 1, 2, 2), - Workload(14, 14, 256, 512, 1, 1, 0, 0, 2, 2), - Workload(7, 7, 512, 512, 3, 3, 1, 1, 1, 1), + Workload('float32', 'float32', 224, 224, 3, 64, 7, 7, 3, 3, 2, 2), + Workload('float32', 'float32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), + Workload('float32', 'float32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), + Workload('float32', 'float32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2), + Workload('float32', 'float32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2), + Workload('float32', 'float32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1), + Workload('float32', 'float32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2), + Workload('float32', 'float32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2), + Workload('float32', 'float32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1), + Workload('float32', 'float32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), + Workload('float32', 'float32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), + Workload('float32', 'float32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), # workloads of mobile net on imagenet - Workload(224, 224, 3, 32, 3, 3, 1, 1, 2, 2), - Workload(112, 112, 32, 64, 1, 1, 0, 0, 1, 1), - Workload(56, 56, 64, 128, 1, 1, 0, 0, 1, 1), - Workload(56, 56, 128, 128, 1, 1, 0, 0, 1, 1), - Workload(28, 28, 128, 256, 1, 1, 0, 0, 1, 1), - Workload(28, 28, 256, 256, 1, 1, 0, 0, 1, 1), - Workload(14, 14, 256, 512, 1, 1, 0, 0, 1, 1), - Workload(14, 14, 512, 512, 1, 1, 0, 0, 1, 1), - Workload(7, 7, 512, 1024, 1, 1, 0, 0, 1, 1), - Workload(7, 7, 1024, 1024, 1, 1, 0, 0, 1, 1), + Workload('float32', 'float32', 224, 224, 3, 32, 3, 3, 1, 1, 2, 2), + Workload('float32', 'float32', 112, 112, 32, 64, 1, 1, 0, 0, 1, 1), + Workload('float32', 'float32', 56, 56, 64, 128, 1, 1, 0, 0, 1, 1), + Workload('float32', 'float32', 56, 56, 128, 128, 1, 1, 0, 0, 1, 1), + Workload('float32', 'float32', 28, 28, 128, 256, 1, 1, 0, 0, 1, 1), + Workload('float32', 'float32', 28, 28, 256, 256, 1, 1, 0, 0, 1, 1), + Workload('float32', 'float32', 14, 14, 256, 512, 1, 1, 0, 0, 1, 1), + Workload('float32', 'float32', 14, 14, 512, 512, 1, 1, 0, 0, 1, 1), + Workload('float32', 'float32', 7, 7, 512, 1024, 1, 1, 0, 0, 1, 1), + Workload('float32', 'float32', 7, 7, 1024, 1024, 1, 1, 0, 0, 1, 1), ] # platform specific schedule _CONV_SCHEDULE = {} @tvm.target.generic_func -def conv2d(data, kernel, stride, padding, layout='NCHW'): +def conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'): """Conv2D operator. 
Parameters @@ -79,14 +79,14 @@ def conv2d(data, kernel, stride, padding, layout='NCHW'): # search platform specific declaration first # default declaration if layout == 'NCHW': - return conv2d_nchw(data, kernel, stride, padding) + return conv2d_nchw(data, kernel, stride, padding, out_dtype) elif layout == 'HWCN': - return conv2d_hwcn(data, kernel, stride, padding) + return conv2d_hwcn(data, kernel, stride, padding, out_dtype) else: raise ValueError("not support this layout {} yet".format(layout)) -def _get_workload(data, kernel, stride, padding): +def _get_workload(data, kernel, stride, padding, out_dtype): """ Get the workload structure. """ _, CI, IH, IW = [x.value for x in data.shape] CO, _, KH, KW = [x.value for x in kernel.shape] @@ -95,7 +95,8 @@ def _get_workload(data, kernel, stride, padding): HSTR, WSTR = stride else: HSTR, WSTR = stride, stride - return Workload(IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) + assert data.dtype == kernel.dtype, "Do not support inputs with different data types now." + return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) @tvm.target.generic_func @@ -108,10 +109,10 @@ def _get_schedule(wkl): # This return has no use, merely to supress pylint warning return wkl -def _spatial_pack(data, kernel, stride, padding): +def _spatial_pack(data, kernel, stride, padding, out_dtype): """ Compute convolution with pack on spatial axes. """ assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" - wkl = _get_workload(data, kernel, stride, padding) + wkl = _get_workload(data, kernel, stride, padding, out_dtype) sch = _get_schedule(wkl) H, W = wkl.height, wkl.width @@ -158,8 +159,8 @@ def _spatial_pack(data, kernel, stride, padding): dw = tvm.reduce_axis((0, KW), name='dw') conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ - tvm.sum(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw] * - kernel_vec[co, ci, dh, dw, vc], + tvm.sum(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw].astype(out_dtype) * + kernel_vec[co, ci, dh, dw, vc].astype(out_dtype), axis=[ci, dh, dw]), name='conv') output = tvm.compute(oshape, lambda n, co, h, w: @@ -169,10 +170,10 @@ def _spatial_pack(data, kernel, stride, padding): return output -def _im2col_pack(data, kernel, stride, padding): +def _im2col_pack(data, kernel, stride, padding, out_dtype): """ Compute convolution with im2col pack layout. """ assert data.shape[0].value == 1, "im2col pack convolution only support batch size=1" - wkl = _get_workload(data, kernel, stride, padding) + wkl = _get_workload(data, kernel, stride, padding, out_dtype) sch = _get_schedule(wkl) N = 1 @@ -234,7 +235,7 @@ def _im2col_pack(data, kernel, stride, padding): return output -def conv2d_nchw(Input, Filter, stride, padding): +def conv2d_nchw(Input, Filter, stride, padding, out_dtype='float32'): """Convolution operator in NCHW layout. Parameters @@ -280,11 +281,12 @@ def conv2d_nchw(Input, Filter, stride, padding): return tvm.compute( (batch, out_channel, out_height, out_width), lambda nn, ff, yy, xx: tvm.sum( - temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx] * Filter[ff, rc, ry, rx], + temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * + Filter[ff, rc, ry, rx].astype(out_dtype), axis=[rc, ry, rx]), tag="conv2d_nchw") -def conv2d_hwcn(Input, Filter, stride, padding): +def conv2d_hwcn(Input, Filter, stride, padding, out_dtype='float32'): """Convolution operator in HWCN layout. 
Parameters @@ -329,8 +331,8 @@ def conv2d_hwcn(Input, Filter, stride, padding): Output = tvm.compute( (out_height, out_width, out_channel, batch), lambda yy, xx, ff, nn: tvm.sum( - PaddedInput[yy * stride_h + ry, xx * stride_w + rx, rc, nn] * Filter[ry, rx, rc, ff], - axis=[ry, rx, rc]), + PaddedInput[yy * stride_h + ry, xx * stride_w + rx, rc, nn].astype(out_dtype) * + Filter[ry, rx, rc, ff].astype(out_dtype), axis=[ry, rx, rc]), name="Conv2dOutput", tag="conv2d_hwcn") return Output diff --git a/topi/python/topi/nn/depthwise_conv2d.py b/topi/python/topi/nn/depthwise_conv2d.py index 40aed1572db1..785bdab27738 100644 --- a/topi/python/topi/nn/depthwise_conv2d.py +++ b/topi/python/topi/nn/depthwise_conv2d.py @@ -9,7 +9,7 @@ from ..util import simplify -def depthwise_conv2d_nchw(Input, Filter, stride, padding): +def depthwise_conv2d_nchw(Input, Filter, stride, padding, out_dtype='float32'): """Depthwise convolution nchw forward operator. Parameters @@ -51,8 +51,8 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding): Output = tvm.compute( (batch, out_channel, out_height, out_width), lambda b, c, i, j: tvm.sum( - (PaddedInput[b, c/channel_multiplier, i*stride_h + di, j*stride_w + dj] * - Filter[c/channel_multiplier, c%channel_multiplier, di, dj]), + (PaddedInput[b, c/channel_multiplier, i*stride_h+di, j*stride_w+dj].astype(out_dtype) * + Filter[c/channel_multiplier, c%channel_multiplier, di, dj].astype(out_dtype)), axis=[di, dj]), name='DepthwiseConv2d', tag="depthwise_conv2d_nchw") return Output diff --git a/topi/python/topi/rasp/conv2d.py b/topi/python/topi/rasp/conv2d.py index 86cb8a9d01a0..6e5a1b335ccf 100644 --- a/topi/python/topi/rasp/conv2d.py +++ b/topi/python/topi/rasp/conv2d.py @@ -12,6 +12,7 @@ from .. import generic _SCHEDULES = [ + # float32 imagenet SpatialPack(1, 8, 4, 1, 4, True), SpatialPack(1, 7, 4, 2, 4, True), SpatialPack(1, 4, 8, 4, 1, True), @@ -25,6 +26,7 @@ Im2ColPack(7, 4, 1, 8, False), Im2ColPack(7, 4, 1, 16, False), + # float32 mobilenet SpatialPack(2, 2, 4, 28, 1, True), SpatialPack(1, 4, 8, 14, 1, False), SpatialPack(1, 2, 16, 8, 1, True), @@ -47,12 +49,12 @@ def _schedule_conv2d(wkl): @conv2d.register("rasp") -def _declaration_conv2d(data, kernel, stride, padding, layout): +def _declaration_conv2d(data, kernel, stride, padding, layout, out_dtype): assert layout == 'NCHW', "only support NCHW convolution on rasp" assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" - wkl = _get_workload(data, kernel, stride, padding) + wkl = _get_workload(data, kernel, stride, padding, out_dtype) sch = _get_schedule(wkl) - return _SCH_TO_DECL_FUNC[type(sch)](data, kernel, stride, padding) + return _SCH_TO_DECL_FUNC[type(sch)](data, kernel, stride, padding, out_dtype) def _schedule_spatial_conv2d(s, data, data_pad, data_vec, @@ -64,10 +66,8 @@ def _schedule_spatial_conv2d(s, data, data_pad, data_vec, stride = infer_stride(data, kernel, output) else: stride = infer_stride(data_pad, kernel, output) - wkl = _get_workload(data, kernel, stride, padding) - - with tvm.target.rasp(): - sch = _get_schedule(wkl) + wkl = _get_workload(data, kernel, stride, padding, output.dtype) + sch = _get_schedule(wkl) H, W = wkl.height, wkl.width CI, CO = wkl.in_filter, wkl.out_filter @@ -172,7 +172,7 @@ def _schedule_im2col_conv2d(s, data, data_pad, data_col, data_vec, stride = infer_stride(data, kernel, output) else: stride = infer_stride(data_pad, kernel, output) - wkl = _get_workload(data, kernel, stride, padding) + wkl = _get_workload(data, kernel, stride, padding, 
output.dtype) with _target.rasp(): sch = _get_schedule(wkl) @@ -280,7 +280,7 @@ def _schedule_im2col_conv2d(s, data, data_pad, data_col, data_vec, return s -@generic.schedule_conv2d_nchw.register(["cpu", "rasp"]) +@generic.schedule_conv2d_nchw.register(["rasp"]) def schedule_conv2d(outs): """Create schedule for tensors""" s = tvm.create_schedule([x.op for x in outs]) @@ -294,6 +294,7 @@ def traverse(op): for tensor in op.input_tensors: if tensor.op.input_tensors: traverse(tensor.op) + if 'spatial_conv_output' in op.tag: output = op.output(0) conv_out = op.input_tensors[0] diff --git a/topi/python/topi/rasp/depthwise_conv2d.py b/topi/python/topi/rasp/depthwise_conv2d.py index e695f0463852..a6fd691f843f 100644 --- a/topi/python/topi/rasp/depthwise_conv2d.py +++ b/topi/python/topi/rasp/depthwise_conv2d.py @@ -8,22 +8,22 @@ from .. import generic _Workload = namedtuple('Workload', - ['height', 'width', 'channel', 'multiplier', + ['in_dtype', 'out_dtype', 'height', 'width', 'channel', 'multiplier', 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) _Schedule = namedtuple('Schedule', ['vh', 'vw', 'vc', 'bc', 'unroll']) # workloads of depthwise conv mobile net on imagenet _WORKLOADS = [ - _Workload(112, 112, 32, 1, 3, 3, 1, 1, 1, 1), - _Workload(112, 112, 64, 1, 3, 3, 1, 1, 2, 2), - _Workload(56, 56, 128, 1, 3, 3, 1, 1, 1, 1), - _Workload(56, 56, 128, 1, 3, 3, 1, 1, 2, 2), - _Workload(28, 28, 256, 1, 3, 3, 1, 1, 1, 1), - _Workload(28, 28, 256, 1, 3, 3, 1, 1, 2, 2), - _Workload(14, 14, 512, 1, 3, 3, 1, 1, 1, 1), - _Workload(14, 14, 512, 1, 3, 3, 1, 1, 2, 2), - _Workload(14, 14, 1024, 1, 3, 3, 1, 1, 1, 1), + _Workload('float32', 'float32', 112, 112, 32, 1, 3, 3, 1, 1, 1, 1), + _Workload('float32', 'float32', 112, 112, 64, 1, 3, 3, 1, 1, 2, 2), + _Workload('float32', 'float32', 56, 56, 128, 1, 3, 3, 1, 1, 1, 1), + _Workload('float32', 'float32', 56, 56, 128, 1, 3, 3, 1, 1, 2, 2), + _Workload('float32', 'float32', 28, 28, 256, 1, 3, 3, 1, 1, 1, 1), + _Workload('float32', 'float32', 28, 28, 256, 1, 3, 3, 1, 1, 2, 2), + _Workload('float32', 'float32', 14, 14, 512, 1, 3, 3, 1, 1, 1, 1), + _Workload('float32', 'float32', 14, 14, 512, 1, 3, 3, 1, 1, 2, 2), + _Workload('float32', 'float32', 7, 7, 1024, 1, 3, 3, 1, 1, 1, 1), ] _SCHEDULES = [ @@ -35,10 +35,10 @@ _Schedule(1, 1, 4, 2, True), _Schedule(1, 1, 8, 8, True), _Schedule(1, 1, 4, 1, False), - _Schedule(2, 1, 4, 16, False), + _Schedule(1, 1, 4, 4, False), ] -def _get_workload(data, kernel, stride, padding): +def _get_workload(data, kernel, stride, padding, out_dtype): _, C, IH, IW = [x.value for x in data.shape] _, MT, KH, KW = [x.value for x in kernel.shape] HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) @@ -46,7 +46,7 @@ def _get_workload(data, kernel, stride, padding): HSTR, WSTR = stride else: HSTR, WSTR = stride, stride - return _Workload(IH, IW, C, MT, KH, KW, HPAD, WPAD, HSTR, WSTR) + return _Workload(data.dtype, out_dtype, IH, IW, C, MT, KH, KW, HPAD, WPAD, HSTR, WSTR) def _schedule(s, data, data_pad, kernel, output, last): @@ -55,7 +55,7 @@ def _schedule(s, data, data_pad, kernel, output, last): stride = infer_stride(data, kernel, output) else: stride = infer_stride(data_pad, kernel, output) - wkl = _get_workload(data, kernel, stride, padding) + wkl = _get_workload(data, kernel, stride, padding, output.dtype) if wkl not in _WORKLOADS: return s diff --git a/topi/python/topi/x86/__init__.py b/topi/python/topi/x86/__init__.py new file mode 100644 index 000000000000..d9912de2870d --- /dev/null +++ b/topi/python/topi/x86/__init__.py 
@@ -0,0 +1,5 @@ +# pylint: disable=redefined-builtin, wildcard-import +"""x86 specific declaration and schedules.""" +from __future__ import absolute_import as _abs + +from .conv2d import schedule_conv2d diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py new file mode 100644 index 000000000000..0c91f8c25c88 --- /dev/null +++ b/topi/python/topi/x86/conv2d.py @@ -0,0 +1,37 @@ +# pylint: disable=invalid-name,unused-variable,invalid-name +"""Conv2D schedule on x86""" +import tvm +from .. import generic +from .. import tag + +@generic.schedule_conv2d_nchw.register(["cpu"]) +def schedule_conv2d(outs): + """Create schedule for tensors""" + s = tvm.create_schedule([x.op for x in outs]) + + def traverse(op): + """Traverse operators from computation graph""" + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + if 'conv2d_nchw' in op.tag: + conv = op.output(0) + kernel = op.input_tensors[1] + data = op.input_tensors[0] + data_pad = None + if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + data_pad = data + data = data_pad.op.input_tensors[0] + + C = conv + n, c, h, w = C.op.axis + s[C].parallel(c) + s[C].pragma(n, "parallel_launch_point") + + traverse(outs[0].op) + return s From 6aae21333f761b9d3c0fe6083dfa0ff444b7fd21 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 14 Nov 2017 15:01:15 -0800 Subject: [PATCH 020/948] [UNROLL] New unroll option (#647) --- include/tvm/ir_pass.h | 8 +++++++- python/tvm/build_module.py | 4 +++- src/api/api_pass.cc | 2 +- src/pass/unroll_loop.cc | 16 +++++++++++++--- tests/python/unittest/test_pass_unroll.py | 6 +++--- 5 files changed, 27 insertions(+), 9 deletions(-) diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h index 6b95bd268652..897f96c763e7 100644 --- a/include/tvm/ir_pass.h +++ b/include/tvm/ir_pass.h @@ -205,10 +205,16 @@ Stmt NarrowChannelAccess(Stmt stmt); * \param stmt The statment to be unrolled. * \param auto_max_step The maximum step before stop attach automatic unroll * \param auto_min_depth The minimum depth before we can start automatic unroll + * \param auto_max_extent The maximum extent of the loop we can unroll, + * this is an legacy option that donot take the loop total steps into account. * \param explicit_unroll Whether explicitly unroll the loop, or leave unroll annotation to codegen. * \return Transformed stmt. */ -Stmt UnrollLoop(Stmt stmt, int auto_max_step, int auto_min_depth, bool explicit_unroll); +Stmt UnrollLoop(Stmt stmt, + int auto_max_step, + int auto_min_depth, + int auto_max_extent, + bool explicit_unroll); /*! 
* \brief vectorize the constant loops diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index 4fe502e987d5..5756fe6d14c6 100644 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -28,7 +28,8 @@ class BuildConfig(object): current = None defaults = { "auto_unroll_max_step": 0, - "auto_unroll_max_depth": 4, + "auto_unroll_max_depth": 8, + "auto_unroll_max_extent": 0, "unroll_explicit": True, "detect_global_barrier": False, "offset_factor": 0, @@ -227,6 +228,7 @@ def lower(sch, stmt, cfg.auto_unroll_max_step, cfg.auto_unroll_max_depth, + cfg.auto_unroll_max_extent, cfg.unroll_explicit) for f in lower_phase2: stmt = f(stmt) diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc index 2dacb32e54f7..a3134f5114e1 100644 --- a/src/api/api_pass.cc +++ b/src/api/api_pass.cc @@ -91,7 +91,7 @@ REGISTER_PASS4(Inline); REGISTER_PASS3(StorageFlatten); REGISTER_PASS4(IRTransform); REGISTER_PASS1(VectorizeLoop); -REGISTER_PASS4(UnrollLoop); +REGISTER_PASS5(UnrollLoop); REGISTER_PASS3(InjectCopyIntrin); REGISTER_PASS2(ThreadSync); REGISTER_PASS5(MakeAPI); diff --git a/src/pass/unroll_loop.cc b/src/pass/unroll_loop.cc index 7a0ce24963e6..01c5e6ebff00 100644 --- a/src/pass/unroll_loop.cc +++ b/src/pass/unroll_loop.cc @@ -19,9 +19,11 @@ class LoopUnroller : public IRMutator { public: explicit LoopUnroller(int auto_max_step, int auto_max_depth, + int auto_max_extent, bool explicit_unroll) : auto_max_step_(auto_max_step), auto_max_depth_(auto_max_depth), + auto_max_extent_(auto_max_extent), explicit_unroll_(explicit_unroll) { } @@ -42,10 +44,13 @@ class LoopUnroller : public IRMutator { // condition for auto unroll bool auto_unroll = ( op->for_type == ForType::Serial && - normal_loop_depth_ == 0 && value >= 0 && - unroll_depth_ <= auto_max_depth_ && - value * step_count_ <= auto_max_step_); + normal_loop_depth_ == 0 && + unroll_depth_ <= auto_max_depth_); + + auto_unroll = auto_unroll && ( + value * step_count_ <= auto_max_step_|| + value <= auto_max_extent_); if (op->for_type == ForType::Unrolled) { CHECK_GE(value, 0) @@ -127,6 +132,9 @@ class LoopUnroller : public IRMutator { // maximum number of step to perform auto unroll. 
int auto_max_step_; int auto_max_depth_; + // max extent of loop to auto unroll + // this not not count the total steps, only count the number of loops + int auto_max_extent_; bool explicit_unroll_; // Number of normal loops in scope int normal_loop_depth_{0}; @@ -140,10 +148,12 @@ class LoopUnroller : public IRMutator { Stmt UnrollLoop(Stmt stmt, int auto_max_step, int auto_max_depth, + int auto_max_extent, bool explicit_unroll) { Stmt ret = LoopUnroller( auto_max_step, auto_max_depth, + auto_max_extent, explicit_unroll).Mutate(stmt); if (!ret.same_as(stmt)) { return ConvertSSA(ret); diff --git a/tests/python/unittest/test_pass_unroll.py b/tests/python/unittest/test_pass_unroll.py index 3c155e44aa0a..9e52a455e7da 100644 --- a/tests/python/unittest/test_pass_unroll.py +++ b/tests/python/unittest/test_pass_unroll.py @@ -14,11 +14,11 @@ def test_unroll_loop(): tvm.make.Load(dtype, Ab.data, i) + 1, j + 1))) assert isinstance(stmt, tvm.stmt.For) - ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, True) + ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, 0, True) assert not isinstance(ret, tvm.stmt.For) - ret = tvm.ir_pass.UnrollLoop(stmt, 15, 8, True) + ret = tvm.ir_pass.UnrollLoop(stmt, 15, 8, 0, True) assert isinstance(ret, tvm.stmt.For) - ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, False) + ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, 0, False) assert isinstance(ret, tvm.stmt.For) assert ret.for_type == tvm.stmt.For.Unrolled From dcbfb9ffb477e7ceb55be3586562eb58cedcc261 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Wed, 15 Nov 2017 20:49:13 -0800 Subject: [PATCH 021/948] Conv2d scheduler tweaked for super resolution perf (#652) * scheduler tweaked for super resolution perf * lint error fixed * lint error fixed * conv2d_transpose schedule error fixed --- topi/python/topi/cuda/conv2d_nchw.py | 35 +++++++++---------- .../python/topi/cuda/conv2d_transpose_nchw.py | 8 ++--- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py index b6bd35768112..a59a99624233 100644 --- a/topi/python/topi/cuda/conv2d_nchw.py +++ b/topi/python/topi/cuda/conv2d_nchw.py @@ -1,15 +1,17 @@ -#pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements, too-many-arguments, too-many-branches +#pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements, too-many-arguments, too-many-branches, line-too-long """Schedule for conv2d_nchw with auto fusion""" import tvm from .. import util from .. import tag from .. 
import generic -def conv2d_224_3_64(s, temp, temp_R, temp_S, Filter_S, Out, Out_L): +def conv2d_224_3_64(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag): """Schedule conv2d for specific feature_in_out_filter pattern""" # scheduler params ofactor = 16 hfactor = 2 + if flag >= 96: + hfactor = 4 ow_size = util.get_const_int(Out.shape[3]) num_thread = ow_size * hfactor vthread = ofactor @@ -22,7 +24,8 @@ def conv2d_224_3_64(s, temp, temp_R, temp_S, Filter_S, Out, Out_L): oh, ih = s[Out].split(h, factor=hfactor) s[Out].reorder(ooc, oh, ioc, ih, w) oc = s[Out].fuse(ooc, oh) - w = s[Out].fuse(w, ih) + ow, _ = s[Out].split(w, nparts=ow_size) + w = s[Out].fuse(ow, ih) s[Out].bind(w, thread_x) s[Out].bind(ioc, thread_xz) s[Out].bind(oc, block_x) @@ -360,7 +363,11 @@ def conv2d_56_64_64(s, Filter, temp_S, Filter_S, Out, Out_L): if util.get_const_int(Filter.shape[0]) == 64: opart2 = 8 ifactor = 16 - sfactor = max(1, ofactor // (opart2*2)) + if util.get_const_int(Out.shape[2]) == 224: + num_thread = 4 + wfactor = 112 + ifactor = 4 + sfactor = max(1, ofactor // (opart2*vthread)) spart = max(1, (wfactor + vthread-1) // vthread) block_x = tvm.thread_axis("blockIdx.x") @@ -368,7 +375,7 @@ def conv2d_56_64_64(s, Filter, temp_S, Filter_S, Out, Out_L): block_z = tvm.thread_axis("blockIdx.z") thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x") thread_y = tvm.thread_axis((0, wfactor // vthread), "threadIdx.y") - thread_xz = tvm.thread_axis((0, vthread), "vthread", name="vx") + thread_xz = tvm.thread_axis((0, opart2), "vthread", name="vx") thread_yz = tvm.thread_axis((0, vthread), "vthread", name="vy") i, oc, h, w = s[Out].op.axis @@ -394,10 +401,10 @@ def conv2d_56_64_64(s, Filter, temp_S, Filter_S, Out, Out_L): ic, dh, dw = s[Out_L].op.reduce_axis oic, iic = s[Out_L].split(ic, factor=ifactor) s[Out_L].reorder(oic, dh, dw, iic, h, w) - fuse_index = s[Out_L].fuse(dw, dh) fuse_index = s[Out_L].fuse(fuse_index, oic) dw = fuse_index + s[temp_S].compute_at(s[Out_L], dw) s[Filter_S].compute_at(s[Out_L], dw) @@ -421,16 +428,6 @@ def schedule_conv2d_small_batch(outs): def schedule(temp, Filter, Output): """Schedule conv2d_nchw""" - block_h = util.get_const_int(Output.shape[3]) - block_w = util.get_const_int(temp.shape[1]) - if block_h % 48 == 0: - block_h = 48 - elif block_h % 32 == 0: - block_h = 32 - if block_w % 48 == 0: - block_w = 48 - elif block_w % 32 == 0: - block_w = 32 flag = util.get_const_int(Filter.shape[0])+util.get_const_int(Filter.shape[1]) @@ -450,7 +447,7 @@ def schedule(temp, Filter, Output): s[temp_G].reorder(i, oic, h, w, iic) temp_R = s.cache_write(temp_G, "global") temp_S = s.cache_read(temp_R, "shared", [temp_G]) - elif util.get_const_int(Filter.shape[3]) == 7: + elif util.get_const_int(Filter.shape[3]) == 7 or (util.get_const_int(Output.shape[2] == 224) and flag < 128): temp_G = s.cache_read(temp, "global", [Output]) s[temp_G].compute_inline() i, ic, h, w = s[temp_G].op.axis @@ -472,8 +469,8 @@ def schedule(temp, Filter, Output): s[Output].set_scope("local") Out_L = Output - if util.get_const_int(Filter.shape[3]) == 7: - conv2d_224_3_64(s, temp, temp_R, temp_S, Filter_S, Out, Out_L) + if util.get_const_int(Filter.shape[3]) == 7 or (util.get_const_int(Output.shape[2] == 224) and flag < 128): + conv2d_224_3_64(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag) elif 128 < flag < 512: conv2d_56_64_128(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag) elif flag >= 512: diff --git a/topi/python/topi/cuda/conv2d_transpose_nchw.py b/topi/python/topi/cuda/conv2d_transpose_nchw.py index 
82fd451f4ecb..edd255a8fac0 100644 --- a/topi/python/topi/cuda/conv2d_transpose_nchw.py +++ b/topi/python/topi/cuda/conv2d_transpose_nchw.py @@ -1,4 +1,4 @@ -#pylint: disable=invalid-name +#pylint: disable=invalid-name, line-too-long """Schedule for conv2d_transpose_nchw with auto fusion""" import tvm from .. import util @@ -42,7 +42,7 @@ def schedule(temp, Filter, Output): s[temp_G].reorder(i, oic, h, w, iic) temp_R = s.cache_write(temp_G, "global") temp_S = s.cache_read(temp_R, "shared", [temp_G]) - elif util.get_const_int(Filter.shape[3]) == 7: + elif util.get_const_int(Filter.shape[3]) == 7 or (util.get_const_int(Output.shape[2] == 224) and flag < 128): temp_G = s.cache_read(temp, "global", [Output]) s[temp_G].compute_inline() i, ic, h, w = s[temp_G].op.axis @@ -64,8 +64,8 @@ def schedule(temp, Filter, Output): s[Output].set_scope("local") Out_L = Output - if util.get_const_int(Filter.shape[3]) == 7: - conv2d_224_3_64(s, temp, temp_R, temp_S, Filter_S, Out, Out_L) + if util.get_const_int(Filter.shape[3]) == 7 or (util.get_const_int(Output.shape[2] == 224) and flag < 128): + conv2d_224_3_64(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag) elif 128 < flag < 512: conv2d_56_64_128(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag) elif flag >= 512: From b9f9b08133c6ecd96bd924feffe6a969aac504f9 Mon Sep 17 00:00:00 2001 From: haolongzhangm <1041563782@qq.com> Date: Fri, 17 Nov 2017 01:12:46 +0800 Subject: [PATCH 022/948] Compat for opencl mode between cpu mode and gpu mode (#655) some host opencl runtime may at cpu mode, but remote client opencl runtime at gpu mode, compat it --- CONTRIBUTORS.md | 1 + src/runtime/opencl/opencl_device_api.cc | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index ab9950a9f31d..944e76fd3b83 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -35,3 +35,4 @@ List of Contributors - [Qiao Zhang](https://github.com/zhangqiaorjc) - [Jian Weng](https://github.com/were) - [Masahiro Masuda](https://github.com/masahi) +- [Haolong Zhang](https://github.com/haolongzhangm) diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index e95fddaa867c..23c897e04825 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -197,8 +197,13 @@ void OpenCLWorkspace::Init() { std::vector devices_matched = cl::GetDeviceIDs(this->platform_id, "gpu"); if (devices_matched.size() == 0) { - LOG(WARNING) << "No OpenCL device any device matched given the options"; - return; + LOG(WARNING) << "No OpenCL device any device matched given the options: gpu mode"; + LOG(WARNING) << "Now try OpenCL cpu mode"; + devices_matched = cl::GetDeviceIDs(this->platform_id, "cpu"); + if (devices_matched.size() == 0) { + LOG(WARNING) << "No OpenCL device any device matched given the options: cpu mode"; + return; + } } this->devices = devices_matched; cl_int err_code; From bd3561d0de7754b414a27db3417f96a44ac36fb4 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 19 Nov 2017 07:49:09 +0800 Subject: [PATCH 023/948] [RUNTIME] support limited save without cross compile (#659) --- src/codegen/build_metal.cc | 2 +- src/codegen/build_opencl.cc | 2 +- src/codegen/codegen.cc | 2 +- src/codegen/codegen_source_base.h | 14 ++++++++ src/codegen/source_module.cc | 57 ++++++++++++++++++++++++++++-- src/runtime/opencl/opencl_module.h | 2 +- 6 files changed, 73 insertions(+), 6 deletions(-) diff --git a/src/codegen/build_metal.cc b/src/codegen/build_metal.cc index 
f2a7e14f9a9f..42aa0965ec9d 100644 --- a/src/codegen/build_metal.cc +++ b/src/codegen/build_metal.cc @@ -35,7 +35,7 @@ runtime::Module BuildMetal(Array funcs) { return MetalModuleCreate(code, fmt, ExtractFuncInfo(funcs), source); #else LOG(WARNING) << "Metal runtime not enabled, return a source module..."; - return SourceModuleCreate(code, "metal"); + return DeviceSourceModuleCreate(code, "metal", ExtractFuncInfo(funcs), "metal"); #endif // TVM_METAL_RUNTIME } diff --git a/src/codegen/build_opencl.cc b/src/codegen/build_opencl.cc index 499c88a009cd..51779d3f7a3e 100644 --- a/src/codegen/build_opencl.cc +++ b/src/codegen/build_opencl.cc @@ -27,7 +27,7 @@ runtime::Module BuildOpenCL(Array funcs) { return OpenCLModuleCreate(code, "cl", ExtractFuncInfo(funcs)); #else LOG(WARNING) << "OpenCL runtime not enabled, return a source module..."; - return SourceModuleCreate(code, "cl"); + return DeviceSourceModuleCreate(code, "cl", ExtractFuncInfo(funcs), "opencl"); #endif // TVM_OPENCL_RUNTIME } diff --git a/src/codegen/codegen.cc b/src/codegen/codegen.cc index e1f003d32b10..d289d627b310 100644 --- a/src/codegen/codegen.cc +++ b/src/codegen/codegen.cc @@ -38,7 +38,7 @@ std::string PackImportsToC(const runtime::Module& mod, bool system_lib) { stream->Write(sz); for (runtime::Module im : mod->imports()) { CHECK_EQ(im->imports().size(), 0U) - << "Only support simply one-level hierachy"; + << "Only support simply one-level hierarchy"; std::string tkey = im->type_key(); std::string bin; stream->Write(tkey); diff --git a/src/codegen/codegen_source_base.h b/src/codegen/codegen_source_base.h index 0ee5b71d017c..bc99eeeb1d33 100644 --- a/src/codegen/codegen_source_base.h +++ b/src/codegen/codegen_source_base.h @@ -11,6 +11,7 @@ #include #include #include +#include "../runtime/meta_data.h" namespace tvm { namespace codegen { @@ -108,6 +109,19 @@ class CodeGenSourceBase { * \param fmt The code. format. */ runtime::Module SourceModuleCreate(std::string code, std::string fmt); + +/*! + * \brief Create a source module for viewing and limited saving + * \param code The code to be viewed. + * \param fmt The code. format. + * \param fmap The map function information map of each function. 
+ * \param type_key The type_key of the runtime module of this source code + */ +runtime::Module DeviceSourceModuleCreate( + std::string code, + std::string fmt, + std::unordered_map fmap, + std::string type_key); } // namespace codegen } // namespace tvm #endif // TVM_CODEGEN_CODEGEN_SOURCE_BASE_H_ diff --git a/src/codegen/source_module.cc b/src/codegen/source_module.cc index 1ad2168ae06e..23c0cbd8466e 100644 --- a/src/codegen/source_module.cc +++ b/src/codegen/source_module.cc @@ -5,6 +5,8 @@ */ #include #include "./codegen_source_base.h" +#include "../runtime/file_util.h" +#include "../runtime/meta_data.h" namespace tvm { namespace codegen { @@ -12,8 +14,14 @@ namespace codegen { using runtime::TVMArgs; using runtime::TVMRetValue; using runtime::PackedFunc; + +using runtime::GetFileFormat; +using runtime::GetMetaFilePath; +using runtime::FunctionInfo; +using runtime::SaveBinaryToFile; + // Simulator function -class SourceModuleNode final : public runtime::ModuleNode { +class SourceModuleNode : public runtime::ModuleNode { public: SourceModuleNode(std::string code, std::string fmt) @@ -21,6 +29,7 @@ class SourceModuleNode final : public runtime::ModuleNode { const char* type_key() const { return "source"; } + PackedFunc GetFunction( const std::string& name, const std::shared_ptr& sptr_to_self) final { @@ -33,7 +42,7 @@ class SourceModuleNode final : public runtime::ModuleNode { return code_; } - private: + protected: std::string code_; std::string fmt_; }; @@ -44,6 +53,50 @@ runtime::Module SourceModuleCreate(std::string code, std::string fmt) { return runtime::Module(n); } +// supports limited save without cross compile +class DeviceSourceModuleNode final : public SourceModuleNode { + public: + DeviceSourceModuleNode(std::string code, + std::string fmt, + std::unordered_map fmap, + std::string type_key) + : SourceModuleNode(code, fmt), fmap_(fmap), type_key_(type_key) {} + + const char* type_key() const { + return type_key_.c_str(); + } + + void SaveToFile(const std::string& file_name, + const std::string& format) final { + std::string fmt = GetFileFormat(file_name, format); + CHECK_EQ(fmt, fmt_) + << "Can only save to format=" << fmt_; + std::string meta_file = GetMetaFilePath(file_name); + SaveMetaDataToFile(meta_file, fmap_); + SaveBinaryToFile(file_name, code_); + } + + void SaveToBinary(dmlc::Stream* stream) final { + stream->Write(fmt_); + stream->Write(fmap_); + stream->Write(code_); + } + + private: + std::unordered_map fmap_; + std::string type_key_; +}; + +runtime::Module DeviceSourceModuleCreate( + std::string code, + std::string fmt, + std::unordered_map fmap, + std::string type_key) { + std::shared_ptr n = + std::make_shared(code, fmt, fmap, type_key); + return runtime::Module(n); +} + TVM_REGISTER_GLOBAL("module.source_module_create") .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = SourceModuleCreate(args[0], args[1]); diff --git a/src/runtime/opencl/opencl_module.h b/src/runtime/opencl/opencl_module.h index 85c50e3e9755..54da2d9b0443 100644 --- a/src/runtime/opencl/opencl_module.h +++ b/src/runtime/opencl/opencl_module.h @@ -16,7 +16,7 @@ namespace tvm { namespace runtime { /*! - * \brief create a cuda module from data. + * \brief create a opencl module from data. * * \param data The module data. 
* \param fmt The format of the data, can be "clbin", "cl" From 6c7f8cdd83cf57a4d8bc73169a420b83f0a35222 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Sat, 18 Nov 2017 21:23:17 -0800 Subject: [PATCH 024/948] Fixed nnvm issue #239 (#660) * scheduler tweaked for super resolution perf * conv2d_transpose schedule error fixed * nnvm issue #239 fixed --- topi/python/topi/cuda/conv2d_nchw.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py index a59a99624233..0d3f5eedb0a2 100644 --- a/topi/python/topi/cuda/conv2d_nchw.py +++ b/topi/python/topi/cuda/conv2d_nchw.py @@ -75,6 +75,9 @@ def conv2d_56_64_128(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag): if mark % 8 == 0 and mark % 7 == 0: num_thread_x = 8 vthread_x = 7 + elif mark % 4 == 0 and mark % 7 == 0: + num_thread_x = 4 + vthread_x = 7 else: for i in range(5, mark): if mark % i == 0 and num_thread_x == 0: From abe224ce42dc361061df255dc2a687b55177fff7 Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Tue, 21 Nov 2017 09:25:35 -0800 Subject: [PATCH 025/948] [CONTRIB] MPS DNN Dense (#615) * mps * update --- Makefile | 1 + make/config.mk | 3 ++ make/contrib/mps.mk | 14 +++++ python/tvm/contrib/mps.py | 35 ++++++++++++ src/contrib/mps/gemm.mm | 93 ++++++++++++++++++++++++++++++++ src/contrib/mps/mps_utils.cc | 58 ++++++++++++++++++++ src/contrib/mps/mps_utils.h | 33 ++++++++++++ tests/python/contrib/test_mps.py | 40 ++++++++++++++ 8 files changed, 277 insertions(+) create mode 100644 make/contrib/mps.mk create mode 100644 python/tvm/contrib/mps.py create mode 100644 src/contrib/mps/gemm.mm create mode 100644 src/contrib/mps/mps_utils.cc create mode 100644 src/contrib/mps/mps_utils.h create mode 100644 tests/python/contrib/test_mps.py diff --git a/Makefile b/Makefile index 4a16d5162102..75c5a563408e 100644 --- a/Makefile +++ b/Makefile @@ -133,6 +133,7 @@ endif include make/contrib/cblas.mk include make/contrib/nnpack.mk include make/contrib/cudnn.mk +include make/contrib/mps.mk ifdef ADD_CFLAGS CFLAGS += $(ADD_CFLAGS) diff --git a/make/config.mk b/make/config.mk index 53775df1ab36..8d8082c68142 100644 --- a/make/config.mk +++ b/make/config.mk @@ -68,3 +68,6 @@ USE_NNPACK = 0 # Whether use CuDNN USE_CUDNN = 0 + +# Whether use MPS +USE_MPS = 0 diff --git a/make/contrib/mps.mk b/make/contrib/mps.mk new file mode 100644 index 000000000000..501e62b2a671 --- /dev/null +++ b/make/contrib/mps.mk @@ -0,0 +1,14 @@ +MPS_CONTRIB_SRC = $(wildcard src/contrib/mps/*.mm, src/contrib/mps/*.cc) +MPS_CONTRIB_OBJ = $(patsubst src/%.mm, build/%.o, $(MPS_CONTRIB_SRC)) + +ifeq ($(USE_MPS), 1) +FRAMEWORKS += -framework MetalPerformanceShaders +CFLAGS += +ADD_LDFLAGS += +RUNTIME_DEP += $(MPS_CONTRIB_OBJ) +endif + +build/contrib/mps/%.o: src/contrib/mps/%.mm src/contrib/mps/%.cc + @mkdir -p $(@D) + $(CXX) $(OBJCFLAGS) $(CFLAGS) -MM -MT build/contrib/mps/$*.o $< >build/contrib/mps/$*.d + $(CXX) $(OBJCFLAGS) -c $(CFLAGS) -c $< -o $@ diff --git a/python/tvm/contrib/mps.py b/python/tvm/contrib/mps.py new file mode 100644 index 000000000000..d214d4b93631 --- /dev/null +++ b/python/tvm/contrib/mps.py @@ -0,0 +1,35 @@ +"""External function interface to MPS libraroes.""" +from __future__ import absolute_import as _abs + +from .. import api as _api +from .. import intrin as _intrin + + +def matmul(lhs, rhs, transa=False, transb=False): + """Create an extern op that compute matrix mult of A and rhs with CrhsLAS + + This function serves as an example on how to calle external libraries. 
+ + Parameters + ---------- + lhs : Tensor + The left matrix operand + rhs : Tensor + The right matrix operand + transa : bool + Whether transpose lhs + transb : bool + Whether transpose rhs + + Returns + ------- + C : Tensor + The result tensor. + """ + m = lhs.shape[0] + n = rhs.shape[1] + return _api.extern( + (n, m), [lhs, rhs], + lambda ins, outs: _intrin.call_packed( + "tvm.contrib.mps.matmul", ins[0], ins[1], outs[0], transa, transb), + name="C") diff --git a/src/contrib/mps/gemm.mm b/src/contrib/mps/gemm.mm new file mode 100644 index 000000000000..f877cb8b0ea1 --- /dev/null +++ b/src/contrib/mps/gemm.mm @@ -0,0 +1,93 @@ +#include "../../runtime/metal/metal_common.h" +#include +#include +#include +#include +#include + +namespace tvm { +namespace contrib { + +using namespace runtime; + +TVM_REGISTER_GLOBAL("tvm.contrib.mps.matmul") + .set_body([](TVMArgs args, TVMRetValue *ret) { + DLTensor *A = args[0]; + DLTensor *B = args[1]; + DLTensor *C = args[2]; + bool transa = args[3]; + bool transb = args[4]; + // call gemm for simple compact code. + CHECK_EQ(A->ndim, 2); + CHECK_EQ(B->ndim, 2); + CHECK_EQ(C->ndim, 2); + CHECK(C->strides == nullptr); + CHECK(B->strides == nullptr); + CHECK(A->strides == nullptr); + CHECK(TypeMatch(A->dtype, kDLFloat, 32)); + CHECK(TypeMatch(B->dtype, kDLFloat, 32)); + CHECK(TypeMatch(C->dtype, kDLFloat, 32)); + // Get Metal device API + MetalThreadEntry* entry_ptr = MetalThreadEntry::ThreadLocal(); + CHECK_EQ(A->ctx, B->ctx); + CHECK_EQ(A->ctx, C->ctx); + id dev = entry_ptr->metal_api->GetDevice(A->ctx); + id queue = entry_ptr->metal_api->GetCommandQueue(A->ctx); + id cb = [queue commandBuffer]; + NSUInteger M = A->shape[0 + transa?1:0]; + NSUInteger N = B->shape[1 - transb?1:0]; + NSUInteger K = B->shape[0 + transb?1:0]; + CHECK_EQ(A->shape[1-transa?1:0], K); + // mps a + MPSDataType dtype = MPSType::DLTypeToMPSType(A->dtype); + MPSMatrixDescriptor *descA = [MPSMatrixDescriptor + matrixDescriptorWithDimensions:M + columns:K + rowBytes:M * sizeof(dtype) + dataType:dtype]; + id bufA = (__bridge id)(A->data); + MPSMatrix *matrixA = + [[MPSMatrix alloc] initWithBuffer:bufA descriptor:descA]; + // mps b + MPSMatrixDescriptor *descB = [MPSMatrixDescriptor + matrixDescriptorWithDimensions:K + columns:N + rowBytes:K * sizeof(dtype) + dataType:dtype]; + id bufB = (__bridge id)(B->data); + MPSMatrix *matrixB = + [[MPSMatrix alloc] initWithBuffer:bufB descriptor:descB]; + // mps c + MPSMatrixDescriptor *descC = [MPSMatrixDescriptor + matrixDescriptorWithDimensions:M + columns:N + rowBytes:M * sizeof(dtype) + dataType:dtype]; + id bufC = (__bridge id)(C->data); + MPSMatrix *matrixC = + [[MPSMatrix alloc] initWithBuffer:bufC descriptor:descC]; + // kernel + + MPSMatrixMultiplication *mul_obj = [[MPSMatrixMultiplication alloc] init]; + MPSMatrixMultiplication *sgemm = [mul_obj initWithDevice:dev + transposeLeft:transa + transposeRight:transb + resultRows:M + resultColumns:N + interiorColumns:K + alpha:1.0f + beta:0.0f]; + CHECK(sgemm != nil); + [sgemm encodeToCommandBuffer:cb + leftMatrix:matrixA + rightMatrix:matrixB + resultMatrix:matrixC]; + [cb commit]; + [mul_obj dealloc]; + [matrixA dealloc]; + [matrixB dealloc]; + [matrixC dealloc]; + }); + +} // namespace contrib +} // namespace tvm diff --git a/src/contrib/mps/mps_utils.cc b/src/contrib/mps/mps_utils.cc new file mode 100644 index 000000000000..2e3ca6218bb4 --- /dev/null +++ b/src/contrib/mps/mps_utils.cc @@ -0,0 +1,58 @@ +/*! 
+ * Copyright (c) 2017 by Contributors + * \file Use external mps utils function + */ +#include "mps_utils.h" +#include +#include +#include + + +namespace tvm { +namespace contrib { + +// MPS Data Type +MPSDataType MPSType::DLTypeToMPSType(const DLDataType &dtype) { + switch (dtype.code) { + case kDLInt: + if (dtype.bits == 8 && dtype.lanes == 1) return MPSDataTypeInt8; + else if (dtype.bits == 16 && dtype.lanes == 1) return MPSDataTypeInt16; + else + LOG(FATAL) << "Unsupported type"; + break; + case kDLUInt: + if (dtype.bits == 8 && dtype.lanes == 1) return MPSDataTypeUInt8; + else if (dtype.bits == 16 && dtype.lanes == 1) return MPSDataTypeUInt16; + else if (dtype.bits == 32 && dtype.lanes == 1) return MPSDataTypeUInt32; + LOG(FATAL) << "Unsupported type"; + break; + case kDLFloat: + if (dtype.bits == 16 && dtype.lanes == 1) return MPSDataTypeFloat16; + else if (dtype.bits == 32 && dtype.lanes == 1) return MPSDataTypeFloat32; + else + LOG(FATAL) << "Unsupported type"; + break; + default: + LOG(FATAL) << "Unsupported type"; + } +} + +// MetalThreadEntry + +MetalThreadEntry::MetalThreadEntry() { + auto func = runtime::Registry::Get("device_api.metal"); + void *ret = (*func)(); + metal_api = static_cast(ret); +} + +MetalThreadEntry::~MetalThreadEntry() { +} + +typedef dmlc::ThreadLocalStore MetalThreadStore; + +MetalThreadEntry* MetalThreadEntry::ThreadLocal() { + return MetalThreadStore::Get(); +} + +} // namespace contrib +} // namespace tvm diff --git a/src/contrib/mps/mps_utils.h b/src/contrib/mps/mps_utils.h new file mode 100644 index 000000000000..91336ce44edd --- /dev/null +++ b/src/contrib/mps/mps_utils.h @@ -0,0 +1,33 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file Use external mps utils function + */ + +#ifndef TVM_CONTRIB_MPS_MPS_UTILS_H_ +#define TVM_CONTRIB_MPS_MPS_UTILS_H_ + +#include +#include +#include "../../runtime/metal/metal_common.h" + + +namespace tvm { +namespace contrib { + +/*! breif Convert DLTensor type to MPS type */ +struct MPSType { + static MPSDataType DLTypeToMPSType(const DLDataType &dtype); +}; // struct MPSType + + +struct MetalThreadEntry { + MetalThreadEntry(); + ~MetalThreadEntry(); + runtime::MetalWorkspace *metal_api{nullptr}; + static MetalThreadEntry* ThreadLocal(); +}; // MetalThreadEntry + +} // namespace contrib +} // namespace tvm + +#endif // TVM_CONTRIB_MPS_MPS_UTILS_H_ diff --git a/tests/python/contrib/test_mps.py b/tests/python/contrib/test_mps.py new file mode 100644 index 000000000000..68dcb135e908 --- /dev/null +++ b/tests/python/contrib/test_mps.py @@ -0,0 +1,40 @@ +import tvm +import numpy as np +from tvm.contrib import mps + +def test_matmul_add(): + n = 1024 + l = 128 + m = 235 + bias = tvm.var('bias', dtype=tvm.float32) + A = tvm.placeholder((n, l), name='A') + B = tvm.placeholder((l, m), name='B') + C1 = mps.matmul(A, B) + C2 = mps.matmul(B, A, True, True) + D1 = tvm.compute(C1.shape, lambda i, j: C1[i,j] + bias, name="D1") + D2 = tvm.compute(C2.shape, lambda i, j: C2[i,j] + bias, name="D2") + s1 = tvm.create_schedule(D1.op) + s2 = tvm.create_schedule(D2.op) + + def verify(A, B, D, s, bias, target="llvm"): + if not tvm.module.enabled(target): + print("skip because %s is not enabled..." 
% target) + return + if not tvm.get_global_func("tvm.contrib.mps.matmul", True): + print("skip because extern function is not avalable") + return + ctx = tvm.cpu(0) + f = tvm.build(s, [A, B, D, bias], target) + a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx) + d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx) + bb = 10.0 + f(a, b, d, bb) + np.testing.assert_allclose( + d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + bb, rtol=1e-5) + verify(A, B, D1, s1, bias) + verify(A, B, D2, s2, bias) + + +if __name__ == "__main__": + test_matmul_add() From 171ea2aea99f638ee587857f1d511eb7d6f183f2 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 21 Nov 2017 11:11:41 -0800 Subject: [PATCH 026/948] [PASS/SETUP] Fix minior issues (#663) * [PASS/SETUP] Fix minior issues * fix lint --- include/tvm/ir_pass.h | 20 +++++++--- python/setup.py | 42 ++++++++++++++------- python/tvm/_ffi/libinfo.py | 3 +- src/api/api_pass.cc | 12 +++++- src/arithmetic/canonical.cc | 24 ++++++++++++ tests/python/unittest/test_pass_simplify.py | 8 ++++ 6 files changed, 86 insertions(+), 23 deletions(-) diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h index 897f96c763e7..b6b24822805a 100644 --- a/include/tvm/ir_pass.h +++ b/include/tvm/ir_pass.h @@ -22,13 +22,21 @@ namespace tvm { namespace ir { -inline Expr Simplify(Expr a) { - return Halide::Internal::simplify(a); -} +/*! + * \brief Simplify the expression. + * \param expr The expression to be simplifed. + * \param vrange The range information about the variable. + * \return Canonicalized statement. + */ +Expr Simplify(Expr expr, Map vrange = Map()); -inline Stmt Simplify(Stmt a) { - return Halide::Internal::simplify(a); -} +/*! + * \brief Simplify the statement. + * \param stmt The statement to be simplifed. + * \param vrange The range information about the variable. + * \return Canonicalized statement. + */ +Stmt Simplify(Stmt stmt, Map vrange = Map()); /*! * \brief Simplify by applying canonical form. 
diff --git a/python/setup.py b/python/setup.py index 168729391412..5a87325e9a1a 100644 --- a/python/setup.py +++ b/python/setup.py @@ -18,16 +18,25 @@ from setuptools import setup from setuptools.extension import Extension -# We can not import `libinfo.py` in setup.py directly since __init__.py -# Will be invoked which introduces dependences -CURRENT_DIR = os.path.dirname(__file__) -libinfo_py = os.path.join(CURRENT_DIR, './tvm/_ffi/libinfo.py') -libinfo = {'__file__': libinfo_py} -exec(compile(open(libinfo_py, "rb").read(), libinfo_py, 'exec'), libinfo, libinfo) +def get_lib_path(): + """Get library path, name and version""" + # We can not import `libinfo.py` in setup.py directly since __init__.py + # Will be invoked which introduces dependences + CURRENT_DIR = os.path.dirname(__file__) + libinfo_py = os.path.join(CURRENT_DIR, './tvm/_ffi/libinfo.py') + libinfo = {'__file__': libinfo_py} + exec(compile(open(libinfo_py, "rb").read(), libinfo_py, 'exec'), libinfo, libinfo) + lib_path = libinfo['find_lib_path']() + version = libinfo['__version__'] + libs = [lib_path[0]] + if libs[0].find("runtime") == -1: + for name in lib_path[1:]: + if name.find("runtime") != -1: + libs.append(name) + break + return libs, version -LIB_PATH = libinfo['find_lib_path']() -_, LIB_NAME = os.path.split(LIB_PATH[0]) -__version__ = libinfo['__version__'] +LIB_LIST, __version__ = get_lib_path() def config_cython(): """Try to configure cython and return cython configuration""" @@ -81,18 +90,21 @@ def is_pure(self): # For bdist_wheel only if "bdist_wheel" in sys.argv: - shutil.copy(LIB_PATH[0], os.path.join(CURRENT_DIR, 'tvm')) with open("MANIFEST.in", "w") as fo: - fo.write("include tvm/%s\n" % LIB_NAME) + for path in LIB_LIST: + shutil.copy(path, os.path.join(CURRENT_DIR, 'tvm')) + _, libname = os.path.split(path) + fo.write("include tvm/%s\n" % libname) setup_kwargs = { "include_package_data": True } else: curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) - rpath = os.path.relpath(LIB_PATH[0], curr_path) + for i, path in enumerate(LIB_LIST): + LIB_LIST[i] = os.path.relpath(path, curr_path) setup_kwargs = { "include_package_data": True, - "data_files": [('tvm', [rpath])] + "data_files": [('tvm', LIB_LIST)] } setup(name='tvm', @@ -112,4 +124,6 @@ def is_pure(self): # Wheel cleanup if "bdist_wheel" in sys.argv: os.remove("MANIFEST.in") - os.remove("tvm/%s" % LIB_NAME) + for path in LIB_LIST: + _, libname = os.path.split(path) + os.remove("tvm/%s" % LIB_NAME) diff --git a/python/tvm/_ffi/libinfo.py b/python/tvm/_ffi/libinfo.py index 273e8f8fb003..f3ed174c0d44 100644 --- a/python/tvm/_ffi/libinfo.py +++ b/python/tvm/_ffi/libinfo.py @@ -74,7 +74,8 @@ def find_lib_path(name=None, search_path=None): if not use_runtime: # try to find lib_dll_path lib_found = [p for p in lib_dll_path if os.path.exists(p) and os.path.isfile(p)] - if use_runtime or not lib_found: + lib_found += [p for p in runtime_dll_path if os.path.exists(p) and os.path.isfile(p)] + else: # try to find runtime_dll_path use_runtime = True lib_found = [p for p in runtime_dll_path if os.path.exists(p) and os.path.isfile(p)] diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc index a3134f5114e1..024af23a37e9 100644 --- a/src/api/api_pass.cc +++ b/src/api/api_pass.cc @@ -16,9 +16,17 @@ namespace ir { TVM_REGISTER_API("ir_pass.Simplify") .set_body([](TVMArgs args, TVMRetValue *ret) { if (args[0].IsNodeType()) { - *ret = Simplify(args[0].operator Stmt()); + if (args.size() > 1) { + *ret = Simplify(args[0].operator Stmt(), args[1]); + } 
else { + *ret = Simplify(args[0].operator Stmt()); + } } else { - *ret = Simplify(args[0].operator Expr()); + if (args.size() > 1) { + *ret = Simplify(args[0].operator Expr(), args[1]); + } else { + *ret = Simplify(args[0].operator Expr()); + } } }); diff --git a/src/arithmetic/canonical.cc b/src/arithmetic/canonical.cc index 933a8f78ea16..808e070ef162 100644 --- a/src/arithmetic/canonical.cc +++ b/src/arithmetic/canonical.cc @@ -7,6 +7,7 @@ #include #include "./canonical.h" #include "./compute_expr.h" +#include "arithmetic/Simplify.h" namespace tvm { namespace arith { @@ -559,5 +560,28 @@ Stmt CanonicalSimplify(Stmt stmt) { Expr CanonicalSimplify(Expr expr) { return arith::Canonical().Simplify(expr); } + +template +T Simplify_(T a, Map vrange) { + using namespace Halide::Internal; + Scope rscope; + for (auto kv : vrange) { + Range r = kv.second; + rscope.push( + kv.first.get(), + Interval(r->min, + simplify(r->min + r->extent - make_const(r->min.type(), 1)))); + } + return Halide::Internal::simplify(a, true, rscope); +} + + +Expr Simplify(Expr a, Map vrange) { + return Simplify_(a, vrange); +} + +Stmt Simplify(Stmt a, Map vrange) { + return Simplify_(a, vrange); +} } // namespace ir } // namespace tvm diff --git a/tests/python/unittest/test_pass_simplify.py b/tests/python/unittest/test_pass_simplify.py index 2cc8825e37f3..9105693b3835 100644 --- a/tests/python/unittest/test_pass_simplify.py +++ b/tests/python/unittest/test_pass_simplify.py @@ -27,6 +27,13 @@ def test_basic(): assert str(ret.value) == "(m - 1)" +def test_bound(): + m = tvm.var('m') + vrange = tvm.convert({m: tvm.Range(tvm.const(0), tvm.const(10))}) + ret = tvm.ir_pass.Simplify(m % 10, vrange) + assert ret == m + + def test_canonical(): x = tvm.var("x") z = tvm.const(3) @@ -37,6 +44,7 @@ def test_canonical(): assert(tvm.ir_pass.Equal(ret, 0)) if __name__ == "__main__": + test_bound() test_basic() test_simplify() test_canonical() From 29512f1d234de3395af9d4b473961c31b0968fa9 Mon Sep 17 00:00:00 2001 From: Siva Date: Fri, 24 Nov 2017 00:04:57 +0530 Subject: [PATCH 027/948] Documentation correction (#665) Readability. --- src/README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/README.md b/src/README.md index e4fc992b2d8f..dfa7a1d33d22 100644 --- a/src/README.md +++ b/src/README.md @@ -4,13 +4,13 @@ Header files in include are public APIs that share across modules. There can be internal header files within each module that sit in src. ## Modules -- common Internal common utilities. -- api API function registration -- lang The definition of DSL related data structure -- arithmetic Arithmetic expression and set simplification -- op The detail implementations about each operation(compute, scan, placeholder) -- schedule The operations on the schedule graph before converting to IR. -- pass The optimization pass on the IR structure -- codegen The code generator. -- runtime Minimum runtime related codes -- contrib Contrib extension libraries +- common: Internal common utilities. +- api: API function registration +- lang: The definition of DSL related data structure +- arithmetic: Arithmetic expression and set simplification +- op: The detail implementations about each operation(compute, scan, placeholder) +- schedule: The operations on the schedule graph before converting to IR. +- pass: The optimization pass on the IR structure +- codegen: The code generator. 
+- runtime: Minimum runtime related codes +- contrib: Contrib extension libraries From 3de26d2e3f093a8a0bdc8b868da540a5a3c4af33 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 25 Nov 2017 10:00:14 -0800 Subject: [PATCH 028/948] [PASS] Allow compact checking when strides is available (#669) * [PASS] Allow compact checking when strides is available * remove assert compact --- src/codegen/stack_vm/codegen_stack_vm.cc | 1 + src/pass/arg_binder.cc | 30 +++++++++++++++++------- src/pass/ir_util.cc | 5 ++++ topi/python/topi/nn/conv2d.py | 3 ++- 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/codegen/stack_vm/codegen_stack_vm.cc b/src/codegen/stack_vm/codegen_stack_vm.cc index 97a2388f16e4..5b01dae7100a 100644 --- a/src/codegen/stack_vm/codegen_stack_vm.cc +++ b/src/codegen/stack_vm/codegen_stack_vm.cc @@ -362,6 +362,7 @@ void CodeGenStackVM::VisitExpr_(const Or *op) { } void CodeGenStackVM::VisitExpr_(const Not* op) { + this->Push(op->a); this->PushOp(StackVM::NOT); } diff --git a/src/pass/arg_binder.cc b/src/pass/arg_binder.cc index f9969cc5dba2..20c8593a1494 100644 --- a/src/pass/arg_binder.cc +++ b/src/pass/arg_binder.cc @@ -136,12 +136,6 @@ inline Expr TVMArrayGet(Type t, Var arr, intrinsic::TVMStructFieldKind kind) { return TVMStructGet(t, arr, 0, kind); } -inline Stmt AssertNull(Var handle, std::string msg) { - return AssertStmt::make(Call::make( - Bool(1), intrinsic::tvm_handle_is_null, - {handle}, Call::PureIntrinsic), msg, Evaluate::make(0)); -} - void ArgBinder::BindDLTensor(const Buffer& buffer, const Expr& device_type, const Expr& device_id, @@ -201,10 +195,30 @@ void ArgBinder::BindDLTensor(const Buffer& buffer, v_strides, TVMArrayGet(Handle(), handle, intrinsic::kArrStrides), nop)); if (buffer->strides.size() == 0) { + // Assert the buffer is compact + Type stype = buffer->shape[0].type(); + Expr expect_stride = make_const(stype, 1); + Array conds; + for (size_t i = buffer->shape.size(); i != 0; --i) { + size_t k = i - 1; + Expr svalue = cast( + stype, + Load::make(tvm_shape_type, v_strides, + IntImm::make(Int(32), k), const_true(1))); + conds.push_back(expect_stride == svalue); + expect_stride = expect_stride * buffer->shape[k]; + } std::ostringstream stride_err_msg; stride_err_msg << arg_name << ".strides:" - << " expected to be nullptr for contiguous array"; - init_nest_.emplace_back(AssertNull(v_strides, stride_err_msg.str())); + << " expected to be compact array"; + Stmt check = + AssertStmt::make(arith::ComputeReduce(conds), + stride_err_msg.str(), Evaluate::make(0)); + Expr is_null = Call::make( + Bool(1), intrinsic::tvm_handle_is_null, + {v_strides}, Call::PureIntrinsic); + check = IfThenElse::make(Not::make(is_null), check, Stmt()); + init_nest_.emplace_back(Block::make(check, Evaluate::make(0))); } else { for (size_t k = 0; k < buffer->strides.size(); ++k) { std::ostringstream field_name; diff --git a/src/pass/ir_util.cc b/src/pass/ir_util.cc index 12551947ad7c..579706ca9964 100644 --- a/src/pass/ir_util.cc +++ b/src/pass/ir_util.cc @@ -33,6 +33,11 @@ Stmt MergeNest(const std::vector& nest, Stmt body) { CHECK(!n->else_case.defined()); n->then_case = body; body = Stmt(n); + } else if (s.as()) { + auto n = std::make_shared(*s.as()); + CHECK(is_no_op(n->rest)); + n->rest = body; + body = Stmt(n); } else if (s.as()) { auto n = std::make_shared(*s.as()); CHECK(is_no_op(n->body)); diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index cc1ee0198c3d..11866aedc101 100644 --- a/topi/python/topi/nn/conv2d.py +++ 
b/topi/python/topi/nn/conv2d.py @@ -225,7 +225,8 @@ def _im2col_pack(data, kernel, stride, padding, out_dtype): wk = tvm.reduce_axis((0, KW), name='wk') conv = tvm.compute(ovshape, lambda n, co, im, vim, vco: \ - tvm.sum(data_vec[n][im][ci][hk][wk][vim] * kernel_vec[co][ci][hk][wk][vco], + tvm.sum(data_vec[n][im][ci][hk][wk][vim].astype(out_dtype) * + kernel_vec[co][ci][hk][wk][vco].astype(out_dtype), axis=[ci, hk, wk]), name='conv') output = tvm.compute(oshape, lambda n, co, h, w: \ From 6aa8e57f66d0abac5d12c4d88b92e3da12564877 Mon Sep 17 00:00:00 2001 From: ziheng Date: Mon, 27 Nov 2017 14:00:42 -0800 Subject: [PATCH 029/948] [TOPI] Fix for pooling (#673) --- topi/python/topi/nn/pooling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/topi/python/topi/nn/pooling.py b/topi/python/topi/nn/pooling.py index 955f724220a5..99b15e18e4e1 100644 --- a/topi/python/topi/nn/pooling.py +++ b/topi/python/topi/nn/pooling.py @@ -81,14 +81,14 @@ def pool(data, kernel, stride, padding, pool_type, ceil_mode=False): pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (kernel_height, kernel_width)) - pad_before = [0, 0, pad_top, pad_left] - pad_after = [0, 0, pad_down, pad_right] - if ceil_mode: # Additional padding to ensure we do ceil instead of floor when divide stride. pad_down += stride_height -1 pad_right += stride_width - 1 + pad_before = [0, 0, pad_top, pad_left] + pad_after = [0, 0, pad_down, pad_right] + out_height = util.simplify((height - kernel_height + pad_top + pad_down) // stride_height + 1) out_width = util.simplify((width - kernel_width + pad_left + pad_right) // stride_width + 1) From 93460475f73981da5a9d4960e2ae51c98ffdafe7 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 28 Nov 2017 13:26:17 -0800 Subject: [PATCH 030/948] [ARITH] Upgrade CanonicalSimplify to Simplify Mod (#676) --- include/tvm/ir_pass.h | 8 +- src/api/api_pass.cc | 12 ++- src/arithmetic/canonical.cc | 83 +++++++++++++++++--- src/arithmetic/canonical.h | 2 +- tests/python/unittest/test_arith_simplify.py | 18 +++++ 5 files changed, 106 insertions(+), 17 deletions(-) diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h index b6b24822805a..d0f32478eb4c 100644 --- a/include/tvm/ir_pass.h +++ b/include/tvm/ir_pass.h @@ -41,16 +41,20 @@ Stmt Simplify(Stmt stmt, Map vrange = Map()); /*! * \brief Simplify by applying canonical form. * \param stmt The statement to be canonically simplifed. + * \param vrange The range information about the variable. * \return Canonicalized statement. */ -Stmt CanonicalSimplify(Stmt stmt); +Stmt CanonicalSimplify(Stmt stmt, + Map vrange = Map()); /*! * \brief Simplify by applying canonical form. * \param expr The statement to be canonically simplifed. + * \param vrange The range information about the variable. * \return Canonicalized expression. */ -Expr CanonicalSimplify(Expr expr); +Expr CanonicalSimplify(Expr expr, + Map vrange = Map()); /*! 
* \brief Deep compare lhs and rhs diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc index 024af23a37e9..23deb03af482 100644 --- a/src/api/api_pass.cc +++ b/src/api/api_pass.cc @@ -33,9 +33,17 @@ TVM_REGISTER_API("ir_pass.Simplify") TVM_REGISTER_API("ir_pass.CanonicalSimplify") .set_body([](TVMArgs args, TVMRetValue *ret) { if (args[0].IsNodeType()) { - *ret = CanonicalSimplify(args[0].operator Stmt()); + if (args.size() > 1) { + *ret = CanonicalSimplify(args[0].operator Stmt(), args[1]); + } else { + *ret = CanonicalSimplify(args[0].operator Stmt()); + } } else { - *ret = CanonicalSimplify(args[0].operator Expr()); + if (args.size() > 1) { + *ret = CanonicalSimplify(args[0].operator Expr(), args[1]); + } else { + *ret = CanonicalSimplify(args[0].operator Expr()); + } } }); diff --git a/src/arithmetic/canonical.cc b/src/arithmetic/canonical.cc index 808e070ef162..e7f9da1b448a 100644 --- a/src/arithmetic/canonical.cc +++ b/src/arithmetic/canonical.cc @@ -129,6 +129,11 @@ inline Expr Binary_(const T* op, // internal of canonical engine. class Canonical::Internal : public IRMutator { public: + explicit Internal(Map vrange) { + for (auto kv : vrange) { + SetRange(kv.first, kv.second, 0); + } + } // stack entry. struct StackEntry { int max_level{0}; @@ -300,9 +305,25 @@ class Canonical::Internal : public IRMutator { Expr Mutate_(const Div* op, const Expr& e) final { return Binary(op, e); } + // Mod operator Expr Mutate_(const Mod* op, const Expr& e) final { - return Binary(op, e); + if (!EnableOpt(op->type)) { + return Binary(op, e); + } + CacheEntry a = Produce(op->a); + CacheEntry b = Produce(op->b); + if (a.has_side_effect || b.has_side_effect) { + return Binary_(op, e, a.value, b.value); + } + if (is_const(a.value) && is_const(b.value)) { + return ComputeExpr(a.value, b.value); + } else if (is_const(b.value)) { + return SumModConst(a.AsSum(), b.value); + } else { + return Binary(op, e); + } } + Expr Mutate_(const And* op, const Expr& e) final { Expr expr = IRMutator::Mutate_(op, e); op = expr.as(); @@ -367,7 +388,7 @@ class Canonical::Internal : public IRMutator { private: template - Expr Binary(const T* op, const Expr& e) { + Expr Binary(const T* op, Expr e) { Expr a = this->Mutate(op->a); Expr b = this->Mutate(op->b); BinaryExpr key{static_cast(T::_type_info), a, b}; @@ -398,8 +419,8 @@ class Canonical::Internal : public IRMutator { std::vector var_rec_; // level counter int level_counter_{0}; - // subroutine to do produce - Expr SumMulConst(ComExpr a, Expr v) { + // get constant int value + int64_t GetConstIntValue(const Expr& v) { int64_t value = 0; const int64_t *v1 = as_const_int(v); const uint64_t *v2 = as_const_uint(v); @@ -411,7 +432,45 @@ class Canonical::Internal : public IRMutator { static_cast(std::numeric_limits::max())); value = static_cast(*v2); } - + return value; + } + // subroutine to do produce a % v + Expr SumModConst(ComExpr a, Expr v) { + int64_t value = GetConstIntValue(v); + std::shared_ptr n = std::make_shared(); + int mod_level = 0; + n->base = a->base % value; + if (n->base != 0) mod_level = 1; + for (auto e : a->elem) { + if (e.scale % value == 0) continue; + e.scale = e.scale % value; + if (!EvalSet(v - e.value, var_range_).can_prove_positive()) { + mod_level = 2; + } else { + ++mod_level; + } + n->elem.push_back(e); + } + // cannot remove mode because there are more than two parts + if (mod_level >= 2) { + Expr ret = Sum2Expr(ComExpr(n), v.type()) % v; + return Binary(ret.as(), ret); + } + ret_entry_.sum = ComExpr(n); + ret_entry_.max_level = 
stack_.back().max_level; + ret_entry_.has_side_effect = stack_.back().has_side_effect; + auto it = cache_sum_.find(ret_entry_.sum); + if (it != cache_sum_.end()) { + ret_entry_ = it->second; + } else { + ret_entry_.value = Sum2Expr(ret_entry_.sum, v.type()); + cache_sum_[ret_entry_.sum] = ret_entry_; + } + return ret_entry_.value; + } + // subroutine to do produce + Expr SumMulConst(ComExpr a, Expr v) { + int64_t value = GetConstIntValue(v); if (value == 0) { return make_zero(v.type()); } @@ -421,9 +480,9 @@ class Canonical::Internal : public IRMutator { for (auto& e : vsum->elem) { e.scale *= value; } + ret_entry_.sum = ComExpr(vsum); ret_entry_.max_level = stack_.back().max_level; ret_entry_.has_side_effect = stack_.back().has_side_effect; - ret_entry_.sum = ComExpr(vsum); auto it = cache_sum_.find(ret_entry_.sum); if (it != cache_sum_.end()) { ret_entry_ = it->second; @@ -536,8 +595,8 @@ class Canonical::Internal : public IRMutator { using CInternal = Canonical::Internal; -Canonical::Canonical() - : ptr_(std::make_shared()) {} +Canonical::Canonical(Map vrange) + : ptr_(std::make_shared(vrange)) {} Expr Canonical::Simplify(Expr expr) { return ptr_->Mutate(expr); @@ -553,12 +612,12 @@ void Canonical::SetRange(Var v, Range r, int level) { } // namespace arith namespace ir { -Stmt CanonicalSimplify(Stmt stmt) { - return arith::Canonical().Simplify(stmt); +Stmt CanonicalSimplify(Stmt stmt, Map vrange) { + return arith::Canonical(vrange).Simplify(stmt); } -Expr CanonicalSimplify(Expr expr) { - return arith::Canonical().Simplify(expr); +Expr CanonicalSimplify(Expr expr, Map vrange) { + return arith::Canonical(vrange).Simplify(expr); } template diff --git a/src/arithmetic/canonical.h b/src/arithmetic/canonical.h index 174acc20aebe..37f9a178f696 100644 --- a/src/arithmetic/canonical.h +++ b/src/arithmetic/canonical.h @@ -22,7 +22,7 @@ namespace arith { class Canonical { public: /*! \brief constructor */ - Canonical(); + explicit Canonical(Map var_range); /*! * \brief simplify expression e. * \param expr The expression to be simplified. 
diff --git a/tests/python/unittest/test_arith_simplify.py b/tests/python/unittest/test_arith_simplify.py index 9ff8571eac42..8ce1773ee3c4 100644 --- a/tests/python/unittest/test_arith_simplify.py +++ b/tests/python/unittest/test_arith_simplify.py @@ -20,5 +20,23 @@ def test_simplify(): zz = zz.a assert zz.a == x and zz.b.value == 4 +def test_simplify_mod(): + """Not yet working, mock design""" + ib = tvm.ir_builder.create() + n = tvm.var('n') + j = tvm.var('j') + A = ib.pointer("float32", name="A") + with ib.for_range(0, 16, name="i") as i: + A[i] = A[((n * 4 + j * 2) * 8 + i+1) % 16] + body = ib.get() + stmt = tvm.ir_pass.CanonicalSimplify(body) + diff = tvm.ir_pass.CanonicalSimplify(stmt.body.value.index - (1 + i) % 16) + assert diff.value == 0 + index = tvm.ir_pass.CanonicalSimplify( + (j + n * 32) % 16, {j: tvm.Range(0, 6)}) + assert index == j + + if __name__ == "__main__": + test_simplify_mod() test_simplify() From 0824de5489a6f7a9c94a804a7127d7536107edbf Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 28 Nov 2017 17:08:24 -0800 Subject: [PATCH 031/948] [ANDROID][RPC] Remove binary distro jar (#677) * [RPC][JVM] Remove binary dist gradle from repo * fix header --- .../gradle/wrapper/gradle-wrapper.jar | Bin 53636 -> 0 bytes src/op/op_util.cc | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 apps/android_rpc/gradle/wrapper/gradle-wrapper.jar diff --git a/apps/android_rpc/gradle/wrapper/gradle-wrapper.jar b/apps/android_rpc/gradle/wrapper/gradle-wrapper.jar deleted file mode 100644 index 13372aef5e24af05341d49695ee84e5f9b594659..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 53636 zcmafaW0a=B^559DjdyHo$F^PVt zzd|cWgMz^T0YO0lQ8%TE1O06v|NZl~LH{LLQ58WtNjWhFP#}eWVO&eiP!jmdp!%24 z{&z-MK{-h=QDqf+S+Pgi=_wg$I{F28X*%lJ>A7Yl#$}fMhymMu?R9TEB?#6@|Q^e^AHhxcRL$z1gsc`-Q`3j+eYAd<4@z^{+?JM8bmu zSVlrVZ5-)SzLn&LU9GhXYG{{I+u(+6ES+tAtQUanYC0^6kWkks8cG;C&r1KGs)Cq}WZSd3k1c?lkzwLySimkP5z)T2Ox3pNs;PdQ=8JPDkT7#0L!cV? 
z=S(r=0oT`w24=ihA=~iFV5z$ZG74?rmYn#eanx(!Hkxcr$*^KRFJKYYB&l6$WVsJ^ z-Iz#HYmE)Da@&seqG1fXsTER#adA&OrD2-T(z}Cwby|mQf{0v*v3hq~pzF`U`jenT z=XHXeB|fa?Ws$+9ADO0rco{#~+`VM?IXg7N>M0w1fyW1iiKTA@p$y zSiAJ%-Mg{m>&S4r#Tw@?@7ck}#oFo-iZJCWc`hw_J$=rw?omE{^tc59ftd`xq?jzf zo0bFUI=$>O!45{!c4?0KsJmZ#$vuYpZLo_O^oHTmmLMm0J_a{Nn`q5tG1m=0ecv$T z5H7r0DZGl6be@aJ+;26EGw9JENj0oJ5K0=^f-yBW2I0jqVIU};NBp*gF7_KlQnhB6 z##d$H({^HXj@il`*4^kC42&3)(A|tuhs;LygA-EWFSqpe+%#?6HG6}mE215Z4mjO2 zY2^?5$<8&k`O~#~sSc5Fy`5hg5#e{kG>SAbTxCh{y32fHkNryU_c0_6h&$zbWc63T z7|r?X7_H!9XK!HfZ+r?FvBQ$x{HTGS=1VN<>Ss-7M3z|vQG|N}Frv{h-q623@Jz*@ ziXlZIpAuY^RPlu&=nO)pFhML5=ut~&zWDSsn%>mv)!P1|^M!d5AwmSPIckoY|0u9I zTDAzG*U&5SPf+@c_tE_I!~Npfi$?gX(kn=zZd|tUZ_ez(xP+)xS!8=k(<{9@<+EUx zYQgZhjn(0qA#?~Q+EA9oh_Jx5PMfE3#KIh#*cFIFQGi)-40NHbJO&%ZvL|LAqU=Rw zf?Vr4qkUcKtLr^g-6*N-tfk+v8@#Lpl~SgKyH!+m9?T8B>WDWK22;!i5&_N=%f{__ z-LHb`v-LvKqTJZCx~z|Yg;U_f)VZu~q7trb%C6fOKs#eJosw&b$nmwGwP;Bz`=zK4 z>U3;}T_ptP)w=vJaL8EhW;J#SHA;fr13f=r#{o)`dRMOs-T;lp&Toi@u^oB_^pw=P zp#8Geo2?@!h2EYHY?L;ayT}-Df0?TeUCe8Cto{W0_a>!7Gxmi5G-nIIS;X{flm2De z{SjFG%knZoVa;mtHR_`*6)KEf=dvOT3OgT7C7&-4P#4X^B%VI&_57cBbli()(%zZC?Y0b;?5!f22UleQ=9h4_LkcA!Xsqx@q{ko&tvP_V@7epFs}AIpM{g??PA>U(sk$Gum>2Eu zD{Oy{$OF%~?B6>ixQeK9I}!$O0!T3#Ir8MW)j2V*qyJ z8Bg17L`rg^B_#rkny-=<3fr}Y42+x0@q6POk$H^*p3~Dc@5uYTQ$pfaRnIT}Wxb;- zl!@kkZkS=l)&=y|21veY8yz$t-&7ecA)TR|=51BKh(@n|d$EN>18)9kSQ|GqP?aeM ztXd9C&Md$PPF*FVs*GhoHM2L@D$(Qf%%x zwQBUt!jM~GgwluBcwkgwQ!249uPkNz3u@LSYZgmpHgX|P#8!iKk^vSKZ;?)KE$92d z2U>y}VWJ0&zjrIqddM3dz-nU%>bL&KU%SA|LiiUU7Ka|c=jF|vQ1V)Jz`JZe*j<5U6~RVuBEVJoY~ z&GE+F$f>4lN=X4-|9v*5O*Os>>r87u z!_1NSV?_X&HeFR1fOFb8_P)4lybJ6?1BWK`Tv2;4t|x1<#@17UO|hLGnrB%nu)fDk zfstJ4{X4^Y<8Lj<}g2^kksSefQTMuTo?tJLCh zC~>CR#a0hADw!_Vg*5fJwV{~S(j8)~sn>Oyt(ud2$1YfGck77}xN@3U_#T`q)f9!2 zf>Ia;Gwp2_C>WokU%(z2ec8z94pZyhaK+e>3a9sj^-&*V494;p9-xk+u1Jn#N_&xs z59OI2w=PuTErv|aNcK*>3l^W*p3}fjXJjJAXtBA#%B(-0--s;1U#f8gFYW!JL+iVG zV0SSx5w8eVgE?3Sg@eQv)=x<+-JgpVixZQNaZr}3b8sVyVs$@ndkF5FYKka@b+YAh z#nq_gzlIDKEs_i}H4f)(VQ!FSB}j>5znkVD&W0bOA{UZ7h!(FXrBbtdGA|PE1db>s z$!X)WY)u#7P8>^7Pjjj-kXNBuJX3(pJVetTZRNOnR5|RT5D>xmwxhAn)9KF3J05J; z-Mfb~dc?LUGqozC2p!1VjRqUwwDBnJhOua3vCCB-%ykW_ohSe?$R#dz%@Gym-8-RA zjMa_SJSzIl8{9dV+&63e9$4;{=1}w2=l+_j_Dtt@<(SYMbV-18&%F@Zl7F_5! z@xwJ0wiDdO%{}j9PW1(t+8P7Ud79yjY>x>aZYWJL_NI?bI6Y02`;@?qPz_PRqz(7v``20`- z033Dy|4;y6di|>cz|P-z|6c&3f&g^OAt8aN0Zd&0yZ>dq2aFCsE<~Ucf$v{sL=*++ zBxFSa2lfA+Y%U@B&3D=&CBO&u`#*nNc|PCY7XO<}MnG0VR764XrHtrb5zwC*2F!Lp zE<~Vj0;z!S-|3M4DFxuQ=`ShTf28<9p!81(0hFbGNqF%0gg*orez9!qt8e%o@Yfl@ zhvY}{@3&f??}7<`p>FyU;7?VkKbh8_=csozU=|fH&szgZ{=NDCylQ>EH^x5!K3~-V z)_2Y>0uJ`Z0Pb58y`RL+&n@m9tJ)O<%q#&u#DAIt+-rRt0eSe1MTtMl@W)H$b3D)@ z*A-1bUgZI)>HdcI4&W>P4W5{-j=s5p5`cbQ+{(g0+RDnz!TR^mxSLu_y#SDVKrj8i zA^hi6>jMGM;`$9Vfb-Yf!47b)Ow`2OKtNB=z|Kxa$5O}WPo;(Dc^`q(7X8kkeFyO8 z{XOq^07=u|7*P2`m;>PIFf=i80MKUxsN{d2cX0M+REsE*20+WQ79T9&cqT>=I_U% z{=8~^Isg(Nzo~`4iQfIb_#CVCD>#5h>=-Z#5dH}WxYzn%0)GAm6L2WdUdP=0_h>7f z(jh&7%1i(ZOn+}D8$iGK4Vs{pmHl_w4Qm-46H9>4^{3dz^DZDh+dw)6Xd@CpQNK$j z{CU;-cmpK=egplZ3y3%y=sEnCJ^eYVKXzV8H2_r*fJ*%*B;a1_lOpt6)IT1IAK2eB z{rie|uDJUrbgfUE>~C>@RO|m5ex55F{=~Bb4Cucp{ok7Yf9V}QuZ`#Gc|WaqsQlK- zKaV)iMRR__&Ak2Z=IM9R9g5$WM4u{a^C-7uX*!myEym z#_#p^T!P~#Dx$%^K>Y_nj_3J*E_LwJ60-5Xu=LkJAwcP@|0;a&+|+ZX`Jbj9P5;T% z|KOc}4*#4o{U?09`9Hz`Xo-I!P=9XfIrr*MQ}y=$!qgv?_J38^bNb4kM&_OVg^_=Eu-qG5U(fw0KMgH){C8pazq~51rN97hf#20-7=aK0)N|UM H-+%o-(+5aQ diff --git a/src/op/op_util.cc b/src/op/op_util.cc index 7cf6711d2270..78e092ca844e 100644 --- a/src/op/op_util.cc +++ b/src/op/op_util.cc @@ -1,5 +1,5 @@ /*! 
- * Copyright (c) 2017 by5A Contributors + * Copyright (c) 2017 by Contributors * \brief Utility to make loop nest. * \file op_util.cc */ From e9ef432f9ca3a926ef1e3461517d78412bb7d085 Mon Sep 17 00:00:00 2001 From: solin319 Date: Fri, 1 Dec 2017 01:37:11 +0800 Subject: [PATCH 032/948] fix parameter name in UnrollLoop (#679) In unroll_loop.cc the parameter name is "auto_max_depth", but in ir_pass.h the parameter name is "auto_min_depth" --- include/tvm/ir_pass.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h index d0f32478eb4c..e763a75e7ee0 100644 --- a/include/tvm/ir_pass.h +++ b/include/tvm/ir_pass.h @@ -216,7 +216,7 @@ Stmt NarrowChannelAccess(Stmt stmt); * * \param stmt The statment to be unrolled. * \param auto_max_step The maximum step before stop attach automatic unroll - * \param auto_min_depth The minimum depth before we can start automatic unroll + * \param auto_max_depth The maximum depth before stop attach automatic unroll * \param auto_max_extent The maximum extent of the loop we can unroll, * this is an legacy option that donot take the loop total steps into account. * \param explicit_unroll Whether explicitly unroll the loop, or leave unroll annotation to codegen. @@ -224,7 +224,7 @@ Stmt NarrowChannelAccess(Stmt stmt); */ Stmt UnrollLoop(Stmt stmt, int auto_max_step, - int auto_min_depth, + int auto_max_depth, int auto_max_extent, bool explicit_unroll); From 94123a362e5673a29dd5567d437096bd229839f5 Mon Sep 17 00:00:00 2001 From: solin319 Date: Fri, 1 Dec 2017 01:37:26 +0800 Subject: [PATCH 033/948] fix name bug in test_pass_inject_double_buffer (#678) Change the parameter 'C' name --- tests/python/unittest/test_pass_inject_double_buffer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/unittest/test_pass_inject_double_buffer.py b/tests/python/unittest/test_pass_inject_double_buffer.py index 3136e33197ec..0e3500edf2e3 100644 --- a/tests/python/unittest/test_pass_inject_double_buffer.py +++ b/tests/python/unittest/test_pass_inject_double_buffer.py @@ -7,7 +7,7 @@ def test_double_buffer(): tx = tvm.thread_axis("threadIdx.x") ib = tvm.ir_builder.create() A = ib.pointer("float32", name="A") - C = ib.pointer("float32", name="A") + C = ib.pointer("float32", name="C") ib.scope_attr(tx, "thread_extent", 1) with ib.for_range(0, n) as i: B = ib.allocate("float32", m, name="B", scope="shared") From dfabb3616cec17ab9b284765c772f86a3c32a346 Mon Sep 17 00:00:00 2001 From: Yizhi Liu Date: Thu, 30 Nov 2017 14:19:49 -0800 Subject: [PATCH 034/948] [APP] fix gradle build for Android build (#685) --- apps/android_rpc/README.md | 8 +- .../gradle/wrapper/gradle-wrapper.properties | 6 - apps/android_rpc/gradlew | 160 ------------------ apps/android_rpc/gradlew.bat | 90 ---------- 4 files changed, 5 insertions(+), 259 deletions(-) delete mode 100644 apps/android_rpc/gradle/wrapper/gradle-wrapper.properties delete mode 100755 apps/android_rpc/gradlew delete mode 100644 apps/android_rpc/gradlew.bat diff --git a/apps/android_rpc/README.md b/apps/android_rpc/README.md index 062227b3e424..00cb597cec28 100644 --- a/apps/android_rpc/README.md +++ b/apps/android_rpc/README.md @@ -8,6 +8,8 @@ You will need JDK, [Android NDK](https://developer.android.com/ndk) and an Andro ### Build APK +We use [Gradle](https://gradle.org) to build. Please follow [the installation instruction](https://gradle.org/install) for your operating system. 
+ Before you build the Android application, please refer to [TVM4J Installation Guide](https://github.com/dmlc/tvm/blob/master/jvm/README.md) and install tvm4j-core to your local maven repository. You can find tvm4j dependency declare in `app/build.gradle`. Modify it if it is necessary. ``` @@ -24,17 +26,17 @@ dependencies { } ``` -The Gradle build script is provided in the app root folder. It downloads the proper version of Gradle, compiles JNI, resolves Java dependencies and builds the Android application together with tvm4j. Run following script to build apk file. +Now use Gradle to compile JNI, resolve Java dependencies and build the Android application together with tvm4j. Run following script to generate the apk file. ```bash export ANDROID_HOME=[Path to your Android SDK, e.g., ~/Android/sdk] cd apps/android_rpc -./gradlew clean build +gradle clean build ``` In `app/build/outputs/apk` you'll find `app-release-unsigned.apk`, use `dev_tools/gen_keystore.sh` to generate a signature and use `dev_tools/sign_apk.sh` to get the signed apk file `app/build/outputs/apk/tvmrpc-release.apk`. -Now upload `tvmrpc-release.apk` to your Android device and install it. +Upload `tvmrpc-release.apk` to your Android device and install it. ### Build with OpenCL diff --git a/apps/android_rpc/gradle/wrapper/gradle-wrapper.properties b/apps/android_rpc/gradle/wrapper/gradle-wrapper.properties deleted file mode 100644 index 80a1f0954c16..000000000000 --- a/apps/android_rpc/gradle/wrapper/gradle-wrapper.properties +++ /dev/null @@ -1,6 +0,0 @@ -#Mon Aug 14 21:31:55 CST 2017 -distributionBase=GRADLE_USER_HOME -distributionPath=wrapper/dists -zipStoreBase=GRADLE_USER_HOME -zipStorePath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-3.3-all.zip diff --git a/apps/android_rpc/gradlew b/apps/android_rpc/gradlew deleted file mode 100755 index 9d82f7891513..000000000000 --- a/apps/android_rpc/gradlew +++ /dev/null @@ -1,160 +0,0 @@ -#!/usr/bin/env bash - -############################################################################## -## -## Gradle start up script for UN*X -## -############################################################################## - -# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -DEFAULT_JVM_OPTS="" - -APP_NAME="Gradle" -APP_BASE_NAME=`basename "$0"` - -# Use the maximum available, or set MAX_FD != -1 to use that value. -MAX_FD="maximum" - -warn ( ) { - echo "$*" -} - -die ( ) { - echo - echo "$*" - echo - exit 1 -} - -# OS specific support (must be 'true' or 'false'). -cygwin=false -msys=false -darwin=false -case "`uname`" in - CYGWIN* ) - cygwin=true - ;; - Darwin* ) - darwin=true - ;; - MINGW* ) - msys=true - ;; -esac - -# Attempt to set APP_HOME -# Resolve links: $0 may be a link -PRG="$0" -# Need this for relative symlinks. -while [ -h "$PRG" ] ; do - ls=`ls -ld "$PRG"` - link=`expr "$ls" : '.*-> \(.*\)$'` - if expr "$link" : '/.*' > /dev/null; then - PRG="$link" - else - PRG=`dirname "$PRG"`"/$link" - fi -done -SAVED="`pwd`" -cd "`dirname \"$PRG\"`/" >/dev/null -APP_HOME="`pwd -P`" -cd "$SAVED" >/dev/null - -CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar - -# Determine the Java command to use to start the JVM. -if [ -n "$JAVA_HOME" ] ; then - if [ -x "$JAVA_HOME/jre/sh/java" ] ; then - # IBM's JDK on AIX uses strange locations for the executables - JAVACMD="$JAVA_HOME/jre/sh/java" - else - JAVACMD="$JAVA_HOME/bin/java" - fi - if [ ! 
-x "$JAVACMD" ] ; then - die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME - -Please set the JAVA_HOME variable in your environment to match the -location of your Java installation." - fi -else - JAVACMD="java" - which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. - -Please set the JAVA_HOME variable in your environment to match the -location of your Java installation." -fi - -# Increase the maximum file descriptors if we can. -if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then - MAX_FD_LIMIT=`ulimit -H -n` - if [ $? -eq 0 ] ; then - if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then - MAX_FD="$MAX_FD_LIMIT" - fi - ulimit -n $MAX_FD - if [ $? -ne 0 ] ; then - warn "Could not set maximum file descriptor limit: $MAX_FD" - fi - else - warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" - fi -fi - -# For Darwin, add options to specify how the application appears in the dock -if $darwin; then - GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" -fi - -# For Cygwin, switch paths to Windows format before running java -if $cygwin ; then - APP_HOME=`cygpath --path --mixed "$APP_HOME"` - CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` - JAVACMD=`cygpath --unix "$JAVACMD"` - - # We build the pattern for arguments to be converted via cygpath - ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` - SEP="" - for dir in $ROOTDIRSRAW ; do - ROOTDIRS="$ROOTDIRS$SEP$dir" - SEP="|" - done - OURCYGPATTERN="(^($ROOTDIRS))" - # Add a user-defined pattern to the cygpath arguments - if [ "$GRADLE_CYGPATTERN" != "" ] ; then - OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" - fi - # Now convert the arguments - kludge to limit ourselves to /bin/sh - i=0 - for arg in "$@" ; do - CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` - CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option - - if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition - eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` - else - eval `echo args$i`="\"$arg\"" - fi - i=$((i+1)) - done - case $i in - (0) set -- ;; - (1) set -- "$args0" ;; - (2) set -- "$args0" "$args1" ;; - (3) set -- "$args0" "$args1" "$args2" ;; - (4) set -- "$args0" "$args1" "$args2" "$args3" ;; - (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; - (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; - (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; - (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; - (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; - esac -fi - -# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules -function splitJvmOpts() { - JVM_OPTS=("$@") -} -eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS -JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME" - -exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@" diff --git a/apps/android_rpc/gradlew.bat b/apps/android_rpc/gradlew.bat deleted file mode 100644 index aec99730b4e8..000000000000 --- a/apps/android_rpc/gradlew.bat +++ /dev/null @@ -1,90 +0,0 @@ -@if "%DEBUG%" == "" @echo off -@rem ########################################################################## -@rem -@rem Gradle startup script for Windows -@rem -@rem 
########################################################################## - -@rem Set local scope for the variables with windows NT shell -if "%OS%"=="Windows_NT" setlocal - -@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -set DEFAULT_JVM_OPTS= - -set DIRNAME=%~dp0 -if "%DIRNAME%" == "" set DIRNAME=. -set APP_BASE_NAME=%~n0 -set APP_HOME=%DIRNAME% - -@rem Find java.exe -if defined JAVA_HOME goto findJavaFromJavaHome - -set JAVA_EXE=java.exe -%JAVA_EXE% -version >NUL 2>&1 -if "%ERRORLEVEL%" == "0" goto init - -echo. -echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. - -goto fail - -:findJavaFromJavaHome -set JAVA_HOME=%JAVA_HOME:"=% -set JAVA_EXE=%JAVA_HOME%/bin/java.exe - -if exist "%JAVA_EXE%" goto init - -echo. -echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. - -goto fail - -:init -@rem Get command-line arguments, handling Windowz variants - -if not "%OS%" == "Windows_NT" goto win9xME_args -if "%@eval[2+2]" == "4" goto 4NT_args - -:win9xME_args -@rem Slurp the command line arguments. -set CMD_LINE_ARGS= -set _SKIP=2 - -:win9xME_args_slurp -if "x%~1" == "x" goto execute - -set CMD_LINE_ARGS=%* -goto execute - -:4NT_args -@rem Get arguments from the 4NT Shell from JP Software -set CMD_LINE_ARGS=%$ - -:execute -@rem Setup the command line - -set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar - -@rem Execute Gradle -"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% - -:end -@rem End local scope for the variables with windows NT shell -if "%ERRORLEVEL%"=="0" goto mainEnd - -:fail -rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of -rem the _cmd.exe /c_ return code! -if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 -exit /b 1 - -:mainEnd -if "%OS%"=="Windows_NT" endlocal - -:omega From 16e1b54d6f24426882fe6a9a22abb255fc904b49 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 30 Nov 2017 14:24:44 -0800 Subject: [PATCH 035/948] [CUDA] Enable int64 (#683) * [CUDA] Enable int64 * [PYTHON] Fix rpc tutorial with opencl * OK * update --- python/tvm/contrib/rpc.py | 46 +++++++++- src/codegen/codegen_cuda.cc | 18 ++-- tests/python/integration/test_ewise.py | 87 ++++++++++--------- tests/scripts/task_python_docs.sh | 2 +- .../deployment/cross_compilation_and_rpc.py | 2 +- 5 files changed, 101 insertions(+), 54 deletions(-) diff --git a/python/tvm/contrib/rpc.py b/python/tvm/contrib/rpc.py index 7b29b1ddac01..3d14f1eb2b6e 100644 --- a/python/tvm/contrib/rpc.py +++ b/python/tvm/contrib/rpc.py @@ -15,6 +15,8 @@ import struct import logging import multiprocessing +import subprocess +import time from . 
import util, cc, tar from ..module import load as _load_module from .._ffi.function import _init_api, register_func @@ -117,6 +119,17 @@ def _connect_proxy_loop(addr, key): process.join() +def _popen(cmd): + proc = subprocess.Popen(cmd, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + env=os.environ) + (out, _) = proc.communicate() + if proc.returncode != 0: + msg = "Server invoke error:\n" + msg += out + raise RuntimeError(msg) + + class Server(object): """Start RPC server on a seperate process. @@ -140,15 +153,36 @@ class Server(object): If this is true, the host and port actually corresponds to the address of the proxy server. + use_popen : bool, optional + Whether to use Popen to start a fresh new process instead of fork. + This is recommended to switch on if we want to do local RPC demonstration + for GPU devices to avoid fork safety issues. + key : str, optional The key used to identify the server in Proxy connection. """ - def __init__(self, host, port=9091, port_end=9199, is_proxy=False, key=""): + def __init__(self, + host, + port=9091, + port_end=9199, + is_proxy=False, + use_popen=False, + key=""): self.host = host self.port = port self.libs = [] - if not is_proxy: + if use_popen: + cmd = ["python", + "-m", "tvm.exec.rpc_server", + "--host=%s" % host, + "--port=%s" % port] + self.proc = multiprocessing.Process( + target=subprocess.check_call, args=(cmd,)) + self.proc.deamon = True + self.proc.start() + time.sleep(1) + elif not is_proxy: sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) self.port = None for my_port in range(port, port_end): @@ -168,11 +202,15 @@ def __init__(self, host, port=9091, port_end=9199, is_proxy=False, key=""): self.sock = sock self.proc = multiprocessing.Process( target=_listen_loop, args=(self.sock,)) + self.proc.deamon = True + self.proc.start() else: self.proc = multiprocessing.Process( target=_connect_proxy_loop, args=((host, port), key)) - self.proc.deamon = True - self.proc.start() + self.proc.deamon = True + self.proc.start() + + def terminate(self): """Terminate the server process""" diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc index 136f6eb4c3d5..249128d12f62 100644 --- a/src/codegen/codegen_cuda.cc +++ b/src/codegen/codegen_cuda.cc @@ -66,7 +66,11 @@ void CodeGenCUDA::PrintType(Type t, std::ostream& os) const { // NOLINT(*) } } else if (t.is_uint() || t.is_int()) { if (t.is_uint()) { - os << 'u'; + if (t.lanes() != 1) { + os << "u"; + } else { + os << "unsigned "; + } } if (t.bits() == 8 && t.lanes() == 4) { // directly 4 8 bit int in integer. 
@@ -77,16 +81,16 @@ void CodeGenCUDA::PrintType(Type t, std::ostream& os) const { // NOLINT(*) case 16: os << "short"; break; case 32: os << "int"; break; case 64: { - if (lanes != 1 && sizeof(long) == 64) { // NOLINT(*) - os << "long"; break; - } else { - os << "int64_t"; break; - } + CHECK(sizeof(long) == 8) // NOLINT(*) + << "CUDA not support int64 int in 32 bit system"; + os << "long"; break; } case 1: os << "int"; break; default: fail = true; break; } - if (!fail && lanes == 1) return; + if (!fail && lanes == 1) { + return; + } if (!fail && (lanes >= 2 && lanes <= 4)) { os << lanes; return; } diff --git a/tests/python/integration/test_ewise.py b/tests/python/integration/test_ewise.py index 8cbfef3ebbf8..24adf6ff28af 100644 --- a/tests/python/integration/test_ewise.py +++ b/tests/python/integration/test_ewise.py @@ -80,53 +80,58 @@ def test_popcount_llvm(): b.asnumpy(), list(map(lambda x: bin(x).count('1'), a.asnumpy())), rtol=1e-5) + def test_add(): - # graph - n = tvm.var('n') - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - bias = tvm.var("bias", dtype="float32") - scale = tvm.var("scale", dtype="float32") - C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i) * scale + bias, name='C') - # schedule - s = tvm.create_schedule(C.op) - # create iter var and assign them tags. - num_thread = 32 - bx, x = s[C].split(C.op.axis[0], factor=num_thread*4) - tx, x = s[C].split(x, nparts=num_thread) - _, x = s[C].split(x, factor=4) - s[C].bind(bx, tvm.thread_axis("blockIdx.x")) - s[C].bind(tx, tvm.thread_axis("threadIdx.x")) - s[C].vectorize(x) + def run(dtype): + # graph + n = tvm.var('n') + A = tvm.placeholder((n,), name='A', dtype=dtype) + B = tvm.placeholder((n,), name='B', dtype=dtype) + bias = tvm.var("bias", dtype=dtype) + scale = tvm.var("scale", dtype=dtype) + C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') + # schedule + s = tvm.create_schedule(C.op) + # create iter var and assign them tags. + num_thread = 16 + bx, x = s[C].split(C.op.axis[0], factor=num_thread*4) + tx, x = s[C].split(x, nparts=num_thread) + _, x = s[C].split(x, factor=4) + s[C].bind(bx, tvm.thread_axis("blockIdx.x")) + s[C].bind(tx, tvm.thread_axis("threadIdx.x")) + s[C].vectorize(x) - # one line to build the function. - def check_device(device): - if not tvm.module.enabled(device): - print("skip because %s is not enabled.." % device) - return - fadd = tvm.build(s, [A, B, C, bias, scale], - device, - name="myadd") - ctx = tvm.context(device, 0) - # launch the kernel. - n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) - vbias = np.random.uniform() - vscale = np.random.uniform() - ftimer = fadd.time_evaluator(fadd.entry_name, ctx, number=10) - tcost = ftimer(a, b, c, vbias, vscale).mean - np.testing.assert_allclose( - c.asnumpy(), a.asnumpy() + b.asnumpy() * vscale + vbias, rtol=1e-6) + # one line to build the function. + def check_device(device): + if not tvm.module.enabled(device): + print("skip because %s is not enabled.." % device) + return + fadd = tvm.build(s, [A, B, C], + device, + name="myadd") + print(fadd.imported_modules[0].get_source()) + ctx = tvm.context(device, 0) + # launch the kernel. 
+ n = 1024 + a = tvm.nd.array((np.random.uniform(size=n) * 256).astype(A.dtype), ctx) + b = tvm.nd.array((np.random.uniform(size=n) * 256).astype(B.dtype), ctx) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + ftimer = fadd.time_evaluator(fadd.entry_name, ctx, number=1) + tcost = ftimer(a, b, c).mean + np.testing.assert_allclose( + c.asnumpy(), a.asnumpy() + b.asnumpy(), rtol=1e-6) - check_device("opencl") - check_device("metal") - check_device("cuda") + check_device("opencl") + check_device("metal") + check_device("cuda") + run("float32") + run("int32") + run("int64") + run("uint64") if __name__ == "__main__": + test_add() test_log_pow_llvm() test_popcount_llvm() test_exp() - test_add() diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh index b55dae933c6b..77c6f22acca5 100755 --- a/tests/scripts/task_python_docs.sh +++ b/tests/scripts/task_python_docs.sh @@ -11,7 +11,7 @@ mv out docs/_build/html/jsdoc || exit -1 rm -rf python/tvm/*.pyc python/tvm/*/*.pyc cd docs -PYTHONPATH=../python make html || exit -1 +PYTHONPATH=`pwd`/../python make html || exit -1 cd _build/html tar czf docs.tgz * mv docs.tgz ../../../ diff --git a/tutorials/deployment/cross_compilation_and_rpc.py b/tutorials/deployment/cross_compilation_and_rpc.py index 859b24472483..bfd77f287336 100644 --- a/tutorials/deployment/cross_compilation_and_rpc.py +++ b/tutorials/deployment/cross_compilation_and_rpc.py @@ -101,7 +101,7 @@ # same machine, for demonstration. This line can be omitted if we # started an remote server. # -server = rpc.Server(host='0.0.0.0', port=9090) +server = rpc.Server(host='0.0.0.0', port=9090, use_popen=True) ###################################################################### # Declare and Cross Compile Kernel on Local Machine From b94d26f416e06104b5d0253736d5e1b4e3f8da24 Mon Sep 17 00:00:00 2001 From: Salem Derisavi <33945117+derisavi-huawei@users.noreply.github.com> Date: Thu, 30 Nov 2017 18:44:08 -0500 Subject: [PATCH 036/948] Consider variable range information during simplification of tensorize expressions (#674) --- src/arithmetic/canonical.cc | 2 ++ src/op/tensorize.cc | 22 +++++++++++++++------- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/arithmetic/canonical.cc b/src/arithmetic/canonical.cc index e7f9da1b448a..24369db02390 100644 --- a/src/arithmetic/canonical.cc +++ b/src/arithmetic/canonical.cc @@ -5,6 +5,7 @@ */ #include #include +#include #include "./canonical.h" #include "./compute_expr.h" #include "arithmetic/Simplify.h" @@ -612,6 +613,7 @@ void Canonical::SetRange(Var v, Range r, int level) { } // namespace arith namespace ir { + Stmt CanonicalSimplify(Stmt stmt, Map vrange) { return arith::Canonical(vrange).Simplify(stmt); } diff --git a/src/op/tensorize.cc b/src/op/tensorize.cc index 9715fcbab1d6..243b7931da67 100644 --- a/src/op/tensorize.cc +++ b/src/op/tensorize.cc @@ -187,7 +187,8 @@ class TensorIntrinMatcher final : public IRMutator { const Stage& stage, const std::unordered_map& out_dom, const std::unordered_map >& in_region, - const TensorIntrin& intrin) { + const TensorIntrin& intrin, + Map* compute_intrin_iter_space) { CHECK(self == stage->op.get()); // input remap. 
Array inputs = self->InputTensors(); @@ -232,6 +233,7 @@ class TensorIntrinMatcher final : public IRMutator { Range r = out_dom.at(iv); var_remap_[iv->var.get()] = target_iv->var + r->min; axis_remap_[iv] = target_iv; + compute_intrin_iter_space->Set(target_iv->var, target_iv->dom); } // Remap reduction axis CHECK_GE(self->reduce_axis.size(), intrin_compute->reduce_axis.size()) @@ -251,6 +253,7 @@ class TensorIntrinMatcher final : public IRMutator { Range r = out_dom.at(iv); var_remap_[iv->var.get()] = target_iv->var + r->min; axis_remap_[iv] = target_iv; + compute_intrin_iter_space->Set(target_iv->var, target_iv->dom); } } @@ -275,9 +278,10 @@ Array MatchTensorizeBody( const Stage& stage, const std::unordered_map& out_dom, const std::unordered_map >& in_region, - const TensorIntrin& intrin) { + const TensorIntrin& intrin, + Map* compute_intrin_iter_space) { TensorIntrinMatcher matcher; - matcher.Init(self, stage, out_dom, in_region, intrin); + matcher.Init(self, stage, out_dom, in_region, intrin, compute_intrin_iter_space); Array ret; for (Expr expr : self->body) { ret.push_back(matcher.Mutate(expr)); @@ -291,14 +295,16 @@ void VerifyTensorizeBody( const std::unordered_map& out_dom, const std::unordered_map >& in_region, const TensorIntrin& intrin) { - Array body = MatchTensorizeBody(self, stage, out_dom, in_region, intrin); + Map compute_intrin_iter_space; + Array body = MatchTensorizeBody(self, stage, out_dom, in_region, intrin, + &compute_intrin_iter_space); const ComputeOpNode* intrin_compute = intrin->op.as(); CHECK(intrin_compute) << "Only support compute intrinsic for now"; CHECK_EQ(body.size(), intrin_compute->body.size()) << "Tensorize failed: body size mismatch"; for (size_t i = 0; i < body.size(); ++i) { - Expr lhs = CanonicalSimplify(body[i]); - Expr rhs = CanonicalSimplify(intrin_compute->body[i]); + Expr lhs = CanonicalSimplify(body[i], compute_intrin_iter_space); + Expr rhs = CanonicalSimplify(intrin_compute->body[i], compute_intrin_iter_space); if (lhs.type() != rhs.type()) { LOG(FATAL) << "Failed to match the data type with TensorIntrin " @@ -459,11 +465,13 @@ TVM_REGISTER_API("test.op.MatchTensorizeBody") Map out_dom = args[1]; Map > in_region = args[2]; TensorIntrin intrin = args[3]; + Map vrange; CHECK(stage->op.as()); *ret = MatchTensorizeBody(stage->op.as(), stage, as_unordered_map(out_dom), as_unordered_map(in_region), - intrin); + intrin, + &vrange); }); } // namespace tvm From 71a27331c9df8ae0373af485cdd7fbeca5254974 Mon Sep 17 00:00:00 2001 From: ziheng Date: Fri, 1 Dec 2017 12:38:33 -0800 Subject: [PATCH 037/948] [RANDOM] Init contrib.random Library (#684) * [RANDOM] Init contrib.random library * [RANDOM] Add uniform * [RANDOM] Fix lint * [RANDOM] Add comments and tests * [RANDOM] Fix lint --- Makefile | 1 + make/config.mk | 3 + make/contrib/random.mk | 6 ++ python/tvm/api.py | 2 + python/tvm/contrib/random.py | 58 +++++++++++ src/contrib/random/random.cc | 149 ++++++++++++++++++++++++++++ tests/python/contrib/test_random.py | 55 ++++++++++ 7 files changed, 274 insertions(+) create mode 100644 make/contrib/random.mk create mode 100644 python/tvm/contrib/random.py create mode 100644 src/contrib/random/random.cc create mode 100644 tests/python/contrib/test_random.py diff --git a/Makefile b/Makefile index 75c5a563408e..31c8dac658fc 100644 --- a/Makefile +++ b/Makefile @@ -131,6 +131,7 @@ ifeq ($(USE_GRAPH_RUNTIME), 1) endif include make/contrib/cblas.mk +include make/contrib/random.mk include make/contrib/nnpack.mk include make/contrib/cudnn.mk include 
make/contrib/mps.mk diff --git a/make/config.mk b/make/config.mk index 8d8082c68142..94153edc38f6 100644 --- a/make/config.mk +++ b/make/config.mk @@ -62,6 +62,9 @@ USE_GRAPH_RUNTIME = 1 # Whether use BLAS, choices: openblas, atlas, blas, apple USE_BLAS = none +# Whether use contrib.random in runtime +USE_RANDOM = 0 + # Whether use NNPack USE_NNPACK = 0 # NNPACK_PATH = none diff --git a/make/contrib/random.mk b/make/contrib/random.mk new file mode 100644 index 000000000000..aea6770101d4 --- /dev/null +++ b/make/contrib/random.mk @@ -0,0 +1,6 @@ +RANDOM_CONTRIB_SRC = $(wildcard src/contrib/random/*.cc) +RANDOM_CONTRIB_OBJ = $(patsubst src/%.cc, build/%.o, $(RANDOM_CONTRIB_SRC)) + +ifeq ($(USE_RANDOM), 1) + RUNTIME_DEP += $(RANDOM_CONTRIB_OBJ) +endif diff --git a/python/tvm/api.py b/python/tvm/api.py index 903e40308ecb..dfe6e4cf7d28 100644 --- a/python/tvm/api.py +++ b/python/tvm/api.py @@ -371,6 +371,8 @@ def extern(shape, inputs, fcompute, name="extern", dtype=None, tag=""): raise ValueError("Cannot infer output type, please provide dtype argument") infered_type = types.pop() dtype = [infered_type for _ in shape] + if isinstance(dtype, str): + dtype = [dtype] for shp, dt in zip(shape, dtype): output_placeholders.append(decl_buffer(shp, dt, name)) diff --git a/python/tvm/contrib/random.py b/python/tvm/contrib/random.py new file mode 100644 index 000000000000..04b018b0646f --- /dev/null +++ b/python/tvm/contrib/random.py @@ -0,0 +1,58 @@ +"""External function interface to random library.""" +from __future__ import absolute_import as _abs + +from .. import api as _api +from .. import intrin as _intrin +from .._ffi.function import _init_api + + +def randint(low, high, size, dtype='int32'): + """Return random integers from low (inclusive) to high (exclusive). + Return random integers from the "discrete uniform" distribution of the + specified dtype in the "half-open" interval [low, high). + + Parameters + ---------- + low : int + Lowest (signed) integer to be drawn from the distribution + high : int + One above the largest (signed) integer to be drawn from the distribution + + Returns + ------- + out : Tensor + A tensor with specified size and dtype + """ + assert 'int' in dtype, "the type of randint output must be int or uint" + return _api.extern(size, [], lambda ins, outs: _intrin.call_packed( + "tvm.contrib.random.randint", int(low), int(high), outs[0]), dtype=dtype) + + +def uniform(low, high, size): + """Draw samples from a uniform distribution. + + Samples are uniformly distributed over the half-open interval [low, high) + (includes low, but excludes high). In other words, any value within the + given interval is equally likely to be drawn by uniform. + + Parameters + ---------- + low : float + Lower boundary of the output interval. All values generated will be + greater than or equal to low. + high : float + Upper boundary of the output interval. All values generated will be + less than high. + size : tuple of ints + Output shape. If the given shape is, e.g., (m, n, k), then m * n * k + samples are drawn. + + Returns + ------- + out : Tensor + A tensor with specified size and dtype. + """ + return _api.extern(size, [], lambda ins, outs: _intrin.call_packed( + "tvm.contrib.random.uniform", float(low), float(high), outs[0]), dtype='float32') + +_init_api("tvm.contrib.random") diff --git a/src/contrib/random/random.cc b/src/contrib/random/random.cc new file mode 100644 index 000000000000..d0bcb18cb76b --- /dev/null +++ b/src/contrib/random/random.cc @@ -0,0 +1,149 @@ +/*! 
+ * Copyright (c) 2017 by Contributors + * \file External random functions for tensor. + */ +#include +#include +#include +#include +#include +#include +#include + +#define DLPACK_INTEGER_TYPE_SWITCH(type, DType, ...) \ + if (type.code == kDLInt && type.bits == 32) { \ + typedef int32_t DType; \ + {__VA_ARGS__} \ + } else if (type.code == kDLInt && type.bits == 16) { \ + typedef int16_t DType; \ + {__VA_ARGS__} \ + } else if (type.code == kDLInt && type.bits == 8) { \ + typedef int8_t DType; \ + {__VA_ARGS__} \ + } else if (type.code == kDLUInt && type.bits == 32) { \ + typedef uint32_t DType; \ + {__VA_ARGS__} \ + } else if (type.code == kDLUInt && type.bits == 16) { \ + typedef uint16_t DType; \ + {__VA_ARGS__} \ + } else if (type.code == kDLUInt && type.bits == 8) { \ + typedef uint8_t DType; \ + {__VA_ARGS__} \ + } else { \ + LOG(FATAL) << "unknown data type"; \ + } + +namespace tvm { +namespace contrib { + +using namespace runtime; + +class RandomEngine { + public: + RandomEngine() { + this->Seed(time(0)); + } + explicit RandomEngine(int seed) { + this->Seed(seed); + } + + ~RandomEngine() {} + + inline void Seed(int seed) { + rnd_engine_.seed(seed); + this->rseed_ = static_cast(seed); + } + + inline unsigned GetSeed() const { + return rseed_; + } + + inline unsigned GetRandInt() { + return rnd_engine_(); + } + + void SampleUniform(DLTensor* data, float low, float high) { + CHECK_GT(high, low) << "high must be bigger than low"; + CHECK(data->strides == nullptr); + + DLDataType dtype = data->dtype; + int64_t size = 1; + for (int i = 0; i < data->ndim; ++i) { + size *= data->shape[i]; + } + + CHECK(dtype.code == kDLFloat && dtype.bits == 32 && dtype.lanes == 1); + + if (data->ctx.device_type == kDLCPU) { + std::uniform_real_distribution uniform_dist(low, high); + std::generate_n(static_cast(data->data), size, [&] () { + return uniform_dist(rnd_engine_); + }); + } else { + LOG(FATAL) << "Do not support random.randint on this device yet"; + } + } + + private: + std::mt19937 rnd_engine_; + unsigned rseed_; +}; + +struct RandomThreadLocalEntry { + RandomEngine random_engine; + static RandomThreadLocalEntry* ThreadLocal(); +}; + +typedef dmlc::ThreadLocalStore RandomThreadLocalStore; + +RandomThreadLocalEntry* RandomThreadLocalEntry::ThreadLocal() { + return RandomThreadLocalStore::Get(); +} + + +TVM_REGISTER_GLOBAL("tvm.contrib.random.randint") +.set_body([](TVMArgs args, TVMRetValue *ret) { + RandomThreadLocalEntry *entry = RandomThreadLocalEntry::ThreadLocal(); + int64_t low = args[0]; + int64_t high = args[1]; + DLTensor* out = args[2]; + CHECK_GT(high, low) << "high must be bigger than low"; + CHECK(out->strides == nullptr); + + DLDataType dtype = out->dtype; + int64_t size = 1; + for (int i = 0; i < out->ndim; ++i) { + size *= out->shape[i]; + } + + DLPACK_INTEGER_TYPE_SWITCH(dtype, DType, { + int64_t numeric_low = std::numeric_limits::min(); + int64_t numeric_high = std::numeric_limits::max(); + numeric_high += 1; // exclusive upper bound + low = std::max(low, numeric_low); + high = std::min(high, numeric_high); + + if (out->ctx.device_type == kDLCPU) { + // file the data with random byte + std::generate_n(static_cast(out->data), size, [&] () { + unsigned rint = entry->random_engine.GetRandInt(); + return low + rint % (high - low); + }); + } else { + LOG(FATAL) << "Do not support random.randint on this device yet"; + } + }) + }); + +TVM_REGISTER_GLOBAL("tvm.contrib.random.uniform") +.set_body([](TVMArgs args, TVMRetValue *ret) { + RandomThreadLocalEntry *entry = 
RandomThreadLocalEntry::ThreadLocal(); + double low = args[0]; + double high = args[1]; + DLTensor* out = args[2]; + entry->random_engine.SampleUniform(out, low, high); + }); + + +} // namespace contrib +} // namespace tvm diff --git a/tests/python/contrib/test_random.py b/tests/python/contrib/test_random.py new file mode 100644 index 000000000000..46f2bbbfb805 --- /dev/null +++ b/tests/python/contrib/test_random.py @@ -0,0 +1,55 @@ +import tvm +import numpy as np +from tvm.contrib import random + +def test_randint(): + m = 1024 + n = 1024 + A = random.randint(-127, 128, size=(m, n), dtype='int32') + s = tvm.create_schedule(A.op) + + def verify(target="llvm"): + if not tvm.module.enabled(target): + print("skip because %s is not enabled..." % target) + return + if not tvm.get_global_func("tvm.contrib.random.randint", True): + print("skip because extern function is not avalable") + return + ctx = tvm.cpu(0) + f = tvm.build(s, [A], target) + a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), ctx) + f(a) + na = a.asnumpy() + assert abs(np.mean(na)) < 0.2 + assert np.min(na) == -127 + assert np.max(na) == 127 + verify() + + +def test_uniform(): + m = 1024 + n = 1024 + A = random.uniform(0, 1, size=(m, n)) + s = tvm.create_schedule(A.op) + + def verify(target="llvm"): + if not tvm.module.enabled(target): + print("skip because %s is not enabled..." % target) + return + if not tvm.get_global_func("tvm.contrib.random.uniform", True): + print("skip because extern function is not avalable") + return + ctx = tvm.cpu(0) + f = tvm.build(s, [A], target) + a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), ctx) + f(a) + na = a.asnumpy() + assert abs(np.mean(na) - 0.5) < 1e-2 + assert abs(np.min(na) - 0.0) < 1e-3 + assert abs(np.max(na) - 1.0) < 1e-3 + verify() + + +if __name__ == "__main__": + test_randint() + test_uniform() From 8a3dbd7971a3d7df7c3a728d18c7c6124bf68989 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 3 Dec 2017 22:38:28 -0800 Subject: [PATCH 038/948] Support rank-0 tensor (#687) * Support rank-0 tensor * fix lint --- include/tvm/buffer.h | 5 +++ include/tvm/packed_func_ext.h | 4 ++ include/tvm/tensor.h | 2 +- python/tvm/_ffi/ndarray.py | 7 ++-- python/tvm/api.py | 1 - python/tvm/tensor.py | 8 +++- src/arithmetic/compute_expr.h | 12 ++++-- src/lang/buffer.cc | 37 ++++++++++--------- src/pass/arg_binder.cc | 20 +++++----- src/pass/inject_double_buffer.cc | 3 +- src/pass/inject_virtual_thread.cc | 3 +- src/pass/storage_flatten.cc | 15 ++++++-- src/pass/storage_rewrite.cc | 2 +- src/runtime/c_runtime_api.cc | 10 +++-- src/runtime/graph/graph_runtime.cc | 6 ++- src/schedule/schedule_dataflow_rewrite.cc | 6 +-- tests/python/unittest/test_codegen_device.py | 9 +++-- tests/python/unittest/test_codegen_llvm.py | 25 +++++++++++++ tests/python/unittest/test_lang_tensor.py | 13 +++++++ .../unittest/test_runtime_packed_func.py | 8 ++++ topi/python/topi/nn/dense.py | 4 +- 21 files changed, 143 insertions(+), 57 deletions(-) diff --git a/include/tvm/buffer.h b/include/tvm/buffer.h index ad4872b8e4e0..f2790f6df7d1 100644 --- a/include/tvm/buffer.h +++ b/include/tvm/buffer.h @@ -124,6 +124,11 @@ class BufferNode : public Node { v->Visit("offset_factor", &offset_factor); } + /*! \return preferred index type for this buffer node */ + Type DefaultIndexType() const { + return shape.size() != 0 ? shape[0].type() : Int(32); + } + // User can specify data_alignment and offset_factor to be 0 // A default value will be picked. 
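
The new DefaultIndexType() helper is what makes zero-dimensional buffers legal: when the shape is empty there is no shape[0] to borrow an index type from, so int32 is used instead. A minimal Python-side sketch of the effect, assuming the stock tvm.decl_buffer front end (illustrative only, not part of this patch):

    import tvm

    # A rank-0 (scalar) buffer: the shape tuple is empty, so index arithmetic
    # falls back to the default int32 index type instead of shape[0].dtype.
    buf = tvm.decl_buffer((), dtype="float32", name="scalar_buf")
    assert tuple(buf.shape) == ()
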
TVM_DLL static Buffer make(Var ptr, diff --git a/include/tvm/packed_func_ext.h b/include/tvm/packed_func_ext.h index 5242a057659b..542de6a368b5 100644 --- a/include/tvm/packed_func_ext.h +++ b/include/tvm/packed_func_ext.h @@ -14,6 +14,7 @@ #include "./base.h" #include "./expr.h" +#include "./tensor.h" #include "./runtime/packed_func.h" namespace tvm { @@ -116,6 +117,9 @@ inline TVMArgValue::operator Halide::Expr() const { if (sptr->is_type()) { return IterVar(sptr)->var; } + if (sptr->is_type()) { + return Tensor(sptr)(); + } CHECK(NodeTypeChecker::Check(sptr.get())) << "Expected type " << NodeTypeName() << " but get " << sptr->type_key(); diff --git a/include/tvm/tensor.h b/include/tvm/tensor.h index a6613a4dc424..4f46d86e93a5 100644 --- a/include/tvm/tensor.h +++ b/include/tvm/tensor.h @@ -188,7 +188,7 @@ inline bool Tensor::operator==(const Tensor& other) const { #define DEFINE_OVERLOAD_SLICE_UNARY_OP(Op) \ inline Expr operator Op (const Tensor::Slice& a) { \ return Op a.operator Expr() ; \ - } + } \ #define DEFINE_OVERLOAD_SLICE_BINARY_OP(Op) \ template \ diff --git a/python/tvm/_ffi/ndarray.py b/python/tvm/_ffi/ndarray.py index b0dfd0f73fd9..135701a803c0 100644 --- a/python/tvm/_ffi/ndarray.py +++ b/python/tvm/_ffi/ndarray.py @@ -177,13 +177,14 @@ def copyfrom(self, source_array): shape = shape + (t.lanes,) t.lanes = 1 dtype = str(t) - source_array = np.ascontiguousarray(source_array, dtype=dtype) + if source_array.shape != shape: raise ValueError("array shape do not match the shape of NDArray {0} vs {1}".format( source_array.shape, shape)) + source_array = np.ascontiguousarray(source_array, dtype=dtype) assert source_array.flags['C_CONTIGUOUS'] data = source_array.ctypes.data_as(ctypes.c_void_p) - nbytes = ctypes.c_size_t(np.prod(source_array.shape) * source_array.dtype.itemsize) + nbytes = ctypes.c_size_t(source_array.size * source_array.dtype.itemsize) check_call(_LIB.TVMArrayCopyFromBytes(self.handle, data, nbytes)) return self @@ -212,7 +213,7 @@ def asnumpy(self): np_arr = np.empty(shape, dtype=dtype) assert np_arr.flags['C_CONTIGUOUS'] data = np_arr.ctypes.data_as(ctypes.c_void_p) - nbytes = ctypes.c_size_t(np.prod(np_arr.shape) * np_arr.dtype.itemsize) + nbytes = ctypes.c_size_t(np_arr.size * np_arr.dtype.itemsize) check_call(_LIB.TVMArrayCopyToBytes(self.handle, data, nbytes)) return np_arr diff --git a/python/tvm/api.py b/python/tvm/api.py index dfe6e4cf7d28..08b3d95dca27 100644 --- a/python/tvm/api.py +++ b/python/tvm/api.py @@ -462,7 +462,6 @@ def decl_buffer(shape, elem_offset = var('%s_elem_offset' % name, shape[0].dtype) if data is None: data = var(name, "handle") - return _api_internal._Buffer( data, dtype, shape, strides, elem_offset, name, scope, data_alignment, offset_factor) diff --git a/python/tvm/tensor.py b/python/tvm/tensor.py index 98a142e8c128..f169ff1b64ac 100644 --- a/python/tvm/tensor.py +++ b/python/tvm/tensor.py @@ -32,7 +32,7 @@ def dtype(self): itervar_cls = None @register_node -class Tensor(NodeBase): +class Tensor(NodeBase, _expr.ExprOp): """Tensor object, to construct, see function.Tensor""" def __call__(self, *indices): ndim = self.ndim @@ -60,7 +60,13 @@ def __hash__(self): def __eq__(self, other): if not isinstance(other, Tensor): + if isinstance(other, _expr.ExprOp): + return _expr.EqualOp(self, other) return False + if self.ndim == 0 and other.ndim == 0: + raise ValueError("Equal == comparison among rank-0 tensor is ambiguous, " + "use Tensor.equal for content expression equvalence, " + "use Tensor.same_as for exact reference comparison") 
return _api_internal._TensorEqual(self, other) @property diff --git a/src/arithmetic/compute_expr.h b/src/arithmetic/compute_expr.h index 18ae8530fbe8..994bcb13eadc 100644 --- a/src/arithmetic/compute_expr.h +++ b/src/arithmetic/compute_expr.h @@ -33,11 +33,14 @@ inline Expr ComputeExpr(Expr lhs, Expr rhs) { /*! * \brief Compute an reduction with Op * \param values The input values. + * \param empty_value The value when return if it is empty, can be Expr() + * which will cause an error to be rasied. * \tparam Op The computation operator * \return The result. */ template -inline Expr ComputeReduce(const Array& values); +inline Expr ComputeReduce( + const Array& values, Expr empty_value); template inline bool GetConst(Expr e, T* out); @@ -139,8 +142,11 @@ inline Expr ComputeExpr(Expr a, Expr b) { } template -inline Expr ComputeReduce(const Array& values) { - CHECK_NE(values.size(), 0U); +inline Expr ComputeReduce(const Array& values, Expr empty_value) { + if (values.size() == 0U) { + CHECK(empty_value.defined()); + return empty_value; + } Expr res = values[0]; for (size_t i = 1; i < values.size(); ++i) { res = ComputeExpr(res, values[i]); diff --git a/src/lang/buffer.cc b/src/lang/buffer.cc index 5cf7ddef3018..af76dcc94f71 100644 --- a/src/lang/buffer.cc +++ b/src/lang/buffer.cc @@ -11,15 +11,6 @@ namespace tvm { -Array GetStrides(Array shape) { - CHECK_NE(shape.size(), 0U); - std::vector vec{make_const(shape[0].type(), 1)}; - for (size_t i = shape.size() - 1; i != 0; --i) { - vec.push_back(shape[i - 1] * vec.back()); - } - return Array(vec.rbegin(), vec.rend()); -} - Array SimplifyArray(Array array) { for (size_t i = 0; i < array.size(); ++i) { array.Set(i, ir::Simplify(array[i])); @@ -235,10 +226,12 @@ inline Expr ElemOffset(const BufferNode* n, Array index) { Expr base = n->elem_offset; if (n->strides.size() == 0) { CHECK_EQ(n->shape.size(), index.size()); - if (is_zero(base)) { - base = index[0]; - } else { - base = base + index[0]; + if (n->shape.size() != 0) { + if (is_zero(base)) { + base = index[0]; + } else { + base = base + index[0]; + } } base = MergeMulMod(base); for (size_t i = 1; i < index.size(); ++i) { @@ -294,9 +287,10 @@ Stmt Buffer::vstore(Array begin, Expr value) const { Buffer Buffer::MakeStrideView() const { if ((*this)->strides.size() != 0) return *this; + if ((*this)->shape.size() == 0) return *this; std::vector temp; auto n = std::make_shared(*operator->()); - Expr acc = make_const(n->shape[0].type(), 1); + Expr acc = make_const(n->DefaultIndexType(), 1); for (size_t i = n->shape.size(); i != 0 ; --i) { temp.push_back(acc); acc = acc * n->shape[i - 1]; @@ -344,9 +338,16 @@ Buffer Buffer::MakeSlice(Array begins, Array extents) const { Expr Buffer::access_ptr(int access_mask, Type ptr_type, int content_lanes) const { const BufferNode* self = operator->(); Expr e_dtype; - Expr extent = (self->strides.size() == self->shape.size() ? 
- arith::ComputeExpr(self->strides[0], self->shape[0]): - arith::ComputeReduce(self->shape)); + Expr extent; + if (self->shape.size() == 0) { + extent = make_const(self->DefaultIndexType(), 1); + } else if (self->strides.size() == self->shape.size()) { + int highest_dim = 0; + extent = arith::ComputeExpr( + self->strides[highest_dim], self->shape[highest_dim]); + } else { + extent = arith::ComputeReduce(self->shape, Expr()); + } Expr elem_offset = self->elem_offset; if (content_lanes > 1) { e_dtype = make_zero(self->dtype.with_lanes(content_lanes)); @@ -383,7 +384,7 @@ Buffer BufferNode::make(Var data, } n->scope = std::move(scope); if (!elem_offset.defined()) { - elem_offset = make_const(n->shape[0].type(), 0); + elem_offset = make_const(n->DefaultIndexType(), 0); } if (data_alignment <= 0) { data_alignment = runtime::kAllocAlignment; diff --git a/src/pass/arg_binder.cc b/src/pass/arg_binder.cc index 20c8593a1494..cdd344670725 100644 --- a/src/pass/arg_binder.cc +++ b/src/pass/arg_binder.cc @@ -196,7 +196,7 @@ void ArgBinder::BindDLTensor(const Buffer& buffer, nop)); if (buffer->strides.size() == 0) { // Assert the buffer is compact - Type stype = buffer->shape[0].type(); + Type stype = buffer->DefaultIndexType(); Expr expect_stride = make_const(stype, 1); Array conds; for (size_t i = buffer->shape.size(); i != 0; --i) { @@ -211,14 +211,16 @@ void ArgBinder::BindDLTensor(const Buffer& buffer, std::ostringstream stride_err_msg; stride_err_msg << arg_name << ".strides:" << " expected to be compact array"; - Stmt check = - AssertStmt::make(arith::ComputeReduce(conds), - stride_err_msg.str(), Evaluate::make(0)); - Expr is_null = Call::make( - Bool(1), intrinsic::tvm_handle_is_null, - {v_strides}, Call::PureIntrinsic); - check = IfThenElse::make(Not::make(is_null), check, Stmt()); - init_nest_.emplace_back(Block::make(check, Evaluate::make(0))); + if (conds.size() != 0) { + Stmt check = + AssertStmt::make(arith::ComputeReduce(conds, Expr()), + stride_err_msg.str(), Evaluate::make(0)); + Expr is_null = Call::make( + Bool(1), intrinsic::tvm_handle_is_null, + {v_strides}, Call::PureIntrinsic); + check = IfThenElse::make(Not::make(is_null), check, Stmt()); + init_nest_.emplace_back(Block::make(check, Evaluate::make(0))); + } } else { for (size_t k = 0; k < buffer->strides.size(); ++k) { std::ostringstream field_name; diff --git a/src/pass/inject_double_buffer.cc b/src/pass/inject_double_buffer.cc index e9bd8594ab4d..03ffdb01e107 100644 --- a/src/pass/inject_double_buffer.cc +++ b/src/pass/inject_double_buffer.cc @@ -81,7 +81,8 @@ class DoubleBufferInjector : public IRMutator { Stmt Mutate_(const Allocate* op, const Stmt& s) final { auto it = dbuffer_info_.find(op->buffer_var.get()); if (it != dbuffer_info_.end()) { - it->second.stride = arith::ComputeReduce(op->extents) * op->type.lanes(); + it->second.stride = arith::ComputeReduce + (op->extents, Expr()) * op->type.lanes(); Stmt stmt = IRMutator::Mutate_(op, s); op = stmt.as(); Array new_extents{make_const(op->extents[0].type(), 2)}; diff --git a/src/pass/inject_virtual_thread.cc b/src/pass/inject_virtual_thread.cc index 28e90ec4805b..bcf0e3d9fbaf 100644 --- a/src/pass/inject_virtual_thread.cc +++ b/src/pass/inject_virtual_thread.cc @@ -376,7 +376,8 @@ class VTInjector : public IRMutator { // always rewrite if not allow sharing. if (touched_var_.count(op->buffer_var.get()) || !allow_share_) { // place v on highest dimension. 
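
The new trailing argument to ComputeReduce makes the behaviour on an empty operand list explicit: callers either pass the identity element of the reduction or an undefined Expr() to say "this list must not be empty". The same convention in plain Python, for illustration only:

    from functools import reduce
    import operator

    extents = []                       # e.g. a rank-0 allocation has no extents
    # reduce(operator.mul, extents)    # would raise: no identity supplied
    size = reduce(operator.mul, extents, 1)   # explicit identity, mirroring
    assert size == 1                          # ComputeReduce passing a const 1
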
- Expr stride = arith::ComputeReduce(op->extents) * op->type.lanes(); + Expr stride = arith::ComputeReduce( + op->extents, Expr()) * op->type.lanes(); Array other; other.push_back(make_const(op->extents[0].type(), num_threads_)); for (Expr e : extents) { diff --git a/src/pass/storage_flatten.cc b/src/pass/storage_flatten.cc index f1aee504fb3e..46bed1fc9382 100644 --- a/src/pass/storage_flatten.cc +++ b/src/pass/storage_flatten.cc @@ -147,10 +147,11 @@ class StorageFlattener : public IRMutator { } } Array strides; - if (dim_align_.count(key) != 0) { + if (dim_align_.count(key) != 0 && shape.size() != 0) { std::vector rstrides; const std::vector& avec = dim_align_[key]; - Expr stride = make_const(shape[0].type(), 1); + int first_dim = 0; + Expr stride = make_const(shape[first_dim].type(), 1); for (size_t i = shape.size(); i != 0; --i) { size_t dim = i - 1; if (dim < avec.size() && avec[dim].align_factor != 0) { @@ -164,6 +165,7 @@ class StorageFlattener : public IRMutator { } strides = Array(rstrides.rbegin(), rstrides.rend()); } + e.buffer = BufferNode::make( Var(key.GetName(), Handle()), op->type, shape, strides, Expr(), @@ -176,13 +178,18 @@ class StorageFlattener : public IRMutator { Stmt ret; if (strides.size() != 0) { + int first_dim = 0; ret = Allocate::make( e.buffer->data, e.buffer->dtype, - {arith::ComputeExpr(e.buffer->strides[0], e.buffer->shape[0])}, + {arith::ComputeExpr(e.buffer->strides[first_dim], e.buffer->shape[first_dim])}, make_const(Bool(e.buffer->dtype.lanes()), true), body); } else { + shape = e.buffer->shape; + if (shape.size() == 0) { + shape.push_back(make_const(Int(32), 1)); + } ret = Allocate::make( - e.buffer->data, e.buffer->dtype, e.buffer->shape, + e.buffer->data, e.buffer->dtype, shape, make_const(Bool(e.buffer->dtype.lanes()), true), body); } ret = AttrStmt::make( diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc index 2f3616017215..9d47a64f8837 100644 --- a/src/pass/storage_rewrite.cc +++ b/src/pass/storage_rewrite.cc @@ -405,7 +405,7 @@ class StoragePlanRewriter : public IRMutator { // Build a merged allocation Expr combo_size; for (const Allocate* op : e->allocs) { - Expr sz = arith::ComputeReduce(op->extents); + Expr sz = arith::ComputeReduce(op->extents, make_const(Int(32), 1)); if (alloc_type.lanes() != op->type.lanes()) { sz = (sz * make_const(sz.type(), op->type.lanes()) + make_const(sz.type(), alloc_type.lanes() - 1)) / diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index f036dccc381a..dd8f80bcd72f 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -352,9 +352,13 @@ int TVMArrayAlloc(const tvm_index_t* shape, arr->dtype.code = static_cast(dtype_code); arr->dtype.bits = static_cast(dtype_bits); arr->dtype.lanes = static_cast(dtype_lanes); - tvm_index_t* shape_copy = new tvm_index_t[ndim]; - std::copy(shape, shape + ndim, shape_copy); - arr->shape = shape_copy; + if (ndim != 0) { + tvm_index_t* shape_copy = new tvm_index_t[ndim]; + std::copy(shape, shape + ndim, shape_copy); + arr->shape = shape_copy; + } else { + arr->shape = nullptr; + } // ctx arr->ctx.device_type = static_cast(device_type); arr->ctx.device_id = device_id; diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index d3f849d743dc..ed833d40848c 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -370,8 +370,10 @@ void GraphRuntime::LoadDLTensor(dmlc::Stream* strm, DLTensor* dst) { CHECK(strm->Read(&tensor.dtype, sizeof(tensor.dtype))) << 
"Invalid DLTensor file format"; std::vector shape(tensor.ndim); - CHECK(strm->Read(&shape[0], sizeof(int64_t) * tensor.ndim)) - << "Invalid DLTensor file format"; + if (tensor.ndim != 0) { + CHECK(strm->Read(&shape[0], sizeof(int64_t) * tensor.ndim)) + << "Invalid DLTensor file format"; + } CHECK_EQ(tensor.ndim, dst->ndim) << "param dimension mismatch"; CHECK(tensor.dtype.bits == dst->dtype.bits && tensor.dtype.code == dst->dtype.code && diff --git a/src/schedule/schedule_dataflow_rewrite.cc b/src/schedule/schedule_dataflow_rewrite.cc index a8dc4edf57f1..d1a69ecf0203 100644 --- a/src/schedule/schedule_dataflow_rewrite.cc +++ b/src/schedule/schedule_dataflow_rewrite.cc @@ -47,10 +47,10 @@ Expr InjectPredicate(const Array& predicates, const Reduce* reduce = body.as(); if (reduce) { std::shared_ptr n = std::make_shared(*reduce); - n->condition = n->condition && arith::ComputeReduce(predicates); + n->condition = n->condition && arith::ComputeReduce(predicates, Expr()); return Expr(n); } - return Select::make(arith::ComputeReduce(predicates), + return Select::make(arith::ComputeReduce(predicates, Expr()), body, make_zero(body.type())); } @@ -467,7 +467,7 @@ Array Schedule::rfactor(const Tensor& tensor, const Reduce* reduce = compute_op->body[idx].as(); CHECK(reduce) << "Can only rfactor non-inline reductions"; predicates.push_back(reduce->condition); - Expr predicate = arith::ComputeReduce(predicates); + Expr predicate = arith::ComputeReduce(predicates, Expr()); std::unordered_map vsub; diff --git a/tests/python/unittest/test_codegen_device.py b/tests/python/unittest/test_codegen_device.py index bbdd65e4be1c..56e3fc81910f 100644 --- a/tests/python/unittest/test_codegen_device.py +++ b/tests/python/unittest/test_codegen_device.py @@ -5,8 +5,8 @@ def test_add_pipeline(): n = tvm.var('n') A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') + B = tvm.placeholder((), name='B') + C = tvm.compute(A.shape, lambda *i: A(*i) + B(), name='C') D = tvm.compute(A.shape, lambda *i: C(*i) + 1, name='D') s = tvm.create_schedule(D.op) @@ -48,7 +48,7 @@ def check_target(device, host="stackvm"): # launch the kernel. n = 1027 a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(Bb.dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=()).astype(Bb.dtype), ctx) d = tvm.nd.array(np.zeros(n, dtype=Db.dtype), ctx) f(a, b, d) np.testing.assert_allclose( @@ -72,7 +72,7 @@ def check_module_save(device, host="stackvm"): # launch the kernel. 
n = 1027 a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(Bb.dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=()).astype(Bb.dtype), ctx) d = tvm.nd.array(np.zeros(n, dtype=Db.dtype), ctx) f(a, b, d) np.testing.assert_allclose( @@ -84,5 +84,6 @@ def check_module_save(device, host="stackvm"): check_target("nvptx", host="llvm") check_target("rocm", host="llvm") + if __name__ == "__main__": test_add_pipeline() diff --git a/tests/python/unittest/test_codegen_llvm.py b/tests/python/unittest/test_codegen_llvm.py index 0db06b934f1b..24996c842249 100644 --- a/tests/python/unittest/test_codegen_llvm.py +++ b/tests/python/unittest/test_codegen_llvm.py @@ -273,7 +273,32 @@ def check_llvm(n): check_llvm(64) +def test_rank_zero(): + def check_llvm(n): + if not tvm.module.enabled("llvm"): + return + A = tvm.placeholder((n, ), name='A') + scale = tvm.placeholder((), name='scale') + k = tvm.reduce_axis((0, n), name="k") + C = tvm.compute((), lambda : tvm.sum(A[k] * scale, axis=k), name="C") + D = tvm.compute((), lambda : C + 1) + s = tvm.create_schedule(D.op) + # build and invoke the kernel. + f = tvm.build(s, [A, scale, D], "llvm") + ctx = tvm.cpu(0) + # launch the kernel. + a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), ctx) + sc = tvm.nd.array( + np.random.randint(0, 2, size=()).astype(scale.dtype), ctx) + d = tvm.nd.empty((), D.dtype, ctx) + f(a, sc, d) + d_np = np.sum(a.asnumpy()) * sc.asnumpy() + 1 + np.testing.assert_allclose(d.asnumpy(), d_np) + check_llvm(64) + + if __name__ == "__main__": + test_rank_zero() test_llvm_bool() test_llvm_persist_parallel() test_llvm_select() diff --git a/tests/python/unittest/test_lang_tensor.py b/tests/python/unittest/test_lang_tensor.py index 6f151749c849..1d8603dfc98b 100644 --- a/tests/python/unittest/test_lang_tensor.py +++ b/tests/python/unittest/test_lang_tensor.py @@ -19,6 +19,17 @@ def test_tensor(): assert(T[0][0][0].astype('float16').dtype == 'float16') +def test_rank_zero(): + m = tvm.var('m') + A = tvm.placeholder((m,), name='A') + scale = tvm.placeholder((), name='s') + k = tvm.reduce_axis((0, m), name="k") + T = tvm.compute((), lambda : tvm.sum(A[k] * scale(), axis=k)) + print(T) + print(T.op.body) + assert(tuple(T.shape) == ()) + + def test_conv1d(): n = tvm.var('n') A = tvm.placeholder((n+2), name='A') @@ -173,7 +184,9 @@ def test_tensor_inputs(): y = tvm.compute(x.shape, lambda i: x[i] + x[i]) assert tuple(y.op.input_tensors) == (x,) + if __name__ == "__main__": + test_rank_zero() test_tensor_inputs() test_tensor_reduce_multi_axis() test_conv1d() diff --git a/tests/python/unittest/test_runtime_packed_func.py b/tests/python/unittest/test_runtime_packed_func.py index 44b450b23fe2..279172555d2a 100644 --- a/tests/python/unittest/test_runtime_packed_func.py +++ b/tests/python/unittest/test_runtime_packed_func.py @@ -63,7 +63,15 @@ def myfunc(ss): f(a) +def test_empty_array(): + def myfunc(ss): + assert tuple(ss) == () + x = tvm.convert(()) + tvm.convert(myfunc)(x) + + if __name__ == "__main__": + test_empty_array() test_get_global() test_get_callback_with_node() test_convert() diff --git a/topi/python/topi/nn/dense.py b/topi/python/topi/nn/dense.py index caa736a41416..333692614bd1 100644 --- a/topi/python/topi/nn/dense.py +++ b/topi/python/topi/nn/dense.py @@ -25,7 +25,7 @@ def dense(data, weight, bias=None): """ assert len(data.shape) == 2 and len(weight.shape) == 2, \ "only support 2-dim dense" - if bias: + if bias is not None: assert len(bias.shape) == 1 
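
The `bias is not None` checks in dense() follow from the same change set: a Tensor now participates in expression operator overloading (Tensor.__eq__ above can build a symbolic comparison), so relying on a tensor argument's truth value is best avoided and an explicit None comparison is the unambiguous test. A short usage sketch, assuming topi is importable (illustrative only):

    import tvm
    from topi.nn import dense

    data = tvm.placeholder((4, 16), name="data")
    weight = tvm.placeholder((8, 16), name="weight")
    bias = tvm.placeholder((8,), name="bias")

    out_plain = dense(data, weight)         # bias stays None
    out_biased = dense(data, weight, bias)  # takes the `is not None` branch
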
batch, in_dim = data.shape out_dim, _ = weight.shape @@ -33,7 +33,7 @@ def dense(data, weight, bias=None): matmul = tvm.compute((batch, out_dim), \ lambda i, j: tvm.sum(data[i, k] * weight[j, k], axis=k), \ tag='dense') - if bias: + if bias is not None: matmul = tvm.compute((batch, out_dim), \ lambda i, j: matmul[i, j] + bias[j], \ tag=tag.BROADCAST) From 49ac8d8b1f5bf969d411d176c4f7033666bb446f Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 4 Dec 2017 13:33:18 -0800 Subject: [PATCH 039/948] [CI] Enable llvm in CPU test (#688) * [CI] Enable llvm in CPU test * fix llvm --- Jenkinsfile | 1 + tests/ci_build/Dockerfile.cpu | 3 +++ 2 files changed, 4 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index ef9666351ba5..793bf11f0d4a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -120,6 +120,7 @@ stage('Build') { echo USE_CUDA=0 >> config.mk echo USE_OPENCL=0 >> config.mk echo USE_RPC=0 >> config.mk + echo LLVM_CONFIG=llvm-config-4.0 >> config.mk """ make('cpu', '-j2') pack_lib('cpu', tvm_lib) diff --git a/tests/ci_build/Dockerfile.cpu b/tests/ci_build/Dockerfile.cpu index b113fc548fcb..2c7510365f9d 100644 --- a/tests/ci_build/Dockerfile.cpu +++ b/tests/ci_build/Dockerfile.cpu @@ -17,3 +17,6 @@ RUN bash /install/ubuntu_install_python_package.sh COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh RUN bash /install/ubuntu_install_java.sh + +COPY install/ubuntu_install_llvm.sh /install/ubuntu_install_llvm.sh +RUN bash /install/ubuntu_install_llvm.sh From 0ef29283e3f7f940dc2b1a5d9fe9411b4696c3cf Mon Sep 17 00:00:00 2001 From: alex-weaver Date: Tue, 5 Dec 2017 20:06:46 +0000 Subject: [PATCH 040/948] Port build_module.py to C++ (#667) * Port build_module.py to C++ * Fix lint errors * Fix more lint errors * Fix more lint errors * Fix more lint errors * Fix build error * Implemented style fixes * Fix lint errors * Added function to construct target from string lower now returns array * Fix lint error * Implemented review changes - style & Target options -> std::vector * Fixed lint, argument alignment and added unit test * Changed test to target LLVM, fixed sign compare warnings * Reverted unit test to CUDA, changed Jenkinsfile to enable GPU for C++ tests * Slight change to Jenkinsfile * Changed build_module test from CUDA to LLVM * Added function var() to construct a Var instance. Changed implementation of LLVMEnabled() * Reverted Jenkinsfile --- include/tvm/build_module.h | 153 ++++++++++++++++ include/tvm/expr.h | 7 + include/tvm/schedule.h | 10 +- src/codegen/build_module.cc | 314 +++++++++++++++++++++++++++++++++ src/lang/expr.cc | 4 + tests/cpp/build_module_test.cc | 42 +++++ 6 files changed, 525 insertions(+), 5 deletions(-) create mode 100644 include/tvm/build_module.h create mode 100644 src/codegen/build_module.cc create mode 100644 tests/cpp/build_module_test.cc diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h new file mode 100644 index 000000000000..a1563e8e7447 --- /dev/null +++ b/include/tvm/build_module.h @@ -0,0 +1,153 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file build_module.h +* \brief Functions for compiling ops. +*/ +#ifndef TVM_BUILD_MODULE_H_ +#define TVM_BUILD_MODULE_H_ + +#include +#include +#include "./tvm/runtime/packed_func.h" +#include "./tvm/schedule_pass.h" +#include "./tvm/lowered_func.h" + +namespace tvm { + +/*! +* \brief Container for target device information. +* Use target::llvm, target::cuda etc functions instead of constructing directly. +*/ +struct Target { + /*! 
\brief The name of the target device */ + std::string target_name; + /*! \brief The type of the target device */ + DLDeviceType device_type; + /*! \brief The maximum threads that a schedule should use for this device */ + int max_num_threads = 1; + /*! \brief The warp size that should be used by the LowerThreadAllreduce pass */ + int thread_warp_size = 1; + /*! \brief Keys for this target */ + std::unordered_set keys; + /*! \brief Options for this target */ + std::vector options; + + Target(const std::string& target_name, + DLDeviceType device_type, + int max_num_threads, + int thread_warp_size, + const std::unordered_set& keys, + const std::vector& options) : + target_name(target_name), + device_type(device_type), + max_num_threads(max_num_threads), + thread_warp_size(thread_warp_size), + keys(keys), + options(options) { + } + + /*! \return the full device string to pass to codegen::Build */ + EXPORT std::string str() const; + + /*! + * \brief Create a Target given a string + * \param target_str the string to parse + */ + EXPORT static Target create(const std::string& target_str); +}; + +/*! \brief This namespace provides functions to construct Target instances */ +namespace target { +/*! \return A target for LLVM */ +EXPORT Target llvm(); + +/*! \return A target for CUDA */ +EXPORT Target cuda(); + +/*! \return A target for ROCm */ +EXPORT Target rocm(); + +/*! \return A target for Metal */ +EXPORT Target metal(); + +/*! \return A target for rasp */ +EXPORT Target rasp(); + +/*! \return A target for stackvm */ +EXPORT Target stackvm(); + +} // namespace target + +/*! +* \brief Container for build configuration options +*/ +struct BuildConfig { + /*! + * \brief The data alignment to use when constructing buffers. If this is set to + * -1, then TVM's internal default will be used + */ + int data_alignment = -1; + /*! + * \brief The offset factor to use when constructing buffers. If this is set to + * 0, then the offset field is not used. + */ + int offset_factor = 0; + + /*! + * \brief Splitting factor for loop splitting. If this is set to zero, no splitting will be + * done. Otherwise, a split will be done with this factor and the inner loop will be unrolled. + */ + int double_buffer_split_loop = 1; + /*! \brief Threshold of number of steps in the loop to be automatically unrolled */ + int auto_unroll_max_step = 0; + /*! \brief The maximum nested level of loops that can be automatically unrolled */ + int auto_unroll_max_depth = 8; + /*! \brief The maximum extent of loop that will be unrolled */ + int auto_unroll_max_extent = 0; + /*! + * \brief Whether to explicitly unroll the loop. If set to false, the unroll hint will + * be passed to the CodeGen phase. Set to true if CodeGen supports unroll pragma. + */ + bool unroll_explicit = true; + + /*! \brief Set to true if buffer arguments do not overlap. This enables more optimization. */ + bool restricted_func = true; + + /*! \brief Whether to detect global barrier */ + bool detect_global_barrier = false; + + BuildConfig() { + } +}; + +/*! +* \brief Build a LoweredFunc given a schedule, args and binds +* \param sch The schedule to lower. +* \param args The arguments to the function. +* \param name The name of the lowered function. +* \param binds Buffer assignments. +* \param config The build configuration. +* \return The lowered function. +*/ +EXPORT Array lower(Schedule sch, + const Array& args, + const std::string& name, + const std::unordered_map& binds, + const BuildConfig& config); + +/*! 
+* \brief Build a device and host module for a specific target from an array of lowered functions. +* \param funcs The functions to be built. +* \param target The target device to build for. +* \param target_host The target for building host code. If null, a suitable default will be used. +* \param config The build configuration. +* \return The built module. +*/ +EXPORT runtime::Module build(const Array& funcs, + const Target& target, + Target* target_host, + const BuildConfig& config); + +} // namespace tvm + +#endif // TVM_BUILD_MODULE_H_ diff --git a/include/tvm/expr.h b/include/tvm/expr.h index 4e4e25c0ce7d..c0f4fea24bf8 100644 --- a/include/tvm/expr.h +++ b/include/tvm/expr.h @@ -291,6 +291,13 @@ inline const char* IterVarType2String(IterVarType t) { return "Unknown"; } +/*! + * \brief Construct a new Var expression + * \param name_hint The name hint for the expression + * \param t The type of the expression + */ +TVM_DLL Var var(const std::string& name_hint, Type t = Int(32)); + /* * \brief Template function to convert Map to unordered_map * Sometimes useful for API gluing when internal uses unordered_map diff --git a/include/tvm/schedule.h b/include/tvm/schedule.h index a0e4a2c9e829..3efc31774d40 100644 --- a/include/tvm/schedule.h +++ b/include/tvm/schedule.h @@ -81,7 +81,7 @@ class Stage : public NodeRef { * \param thread_ivar The thread axis to be binded. * \return reference to self. */ - Stage& bind(IterVar ivar, IterVar thread_ivar); + EXPORT Stage& bind(IterVar ivar, IterVar thread_ivar); /*! * \brief Set predicate under which store to the array can be performed. * Use this when there are duplicated threads doing the same store and we only @@ -110,7 +110,7 @@ class Stage : public NodeRef { * \param p_inner The result inner domain. * \return reference to self. */ - Stage& split(IterVar parent, Expr factor, IterVar* p_outer, IterVar* p_inner); // NOLINT(*) + EXPORT Stage& split(IterVar parent, Expr factor, IterVar* p_outer, IterVar* p_inner); // NOLINT(*) /*! * \brief Split the iteration with given number of parts. * @@ -248,13 +248,13 @@ class Schedule : public NodeRef { * \brief Get the stage corresponds to the op * \param op The operation. */ - Stage operator[](const Operation& op); + EXPORT Stage operator[](const Operation& op); /*! * \brief Short hand for getting the stage of tensor's operation. * \param tensor The tensor * \return The stage corresponding to the tensor's op */ - Stage operator[](const Tensor& tensor) { + EXPORT Stage operator[](const Tensor& tensor) { return this->operator[](tensor->op); } /*! @@ -493,7 +493,7 @@ class ScheduleNode : public Node { * \param ops The ops to be scheduled. * \return sch The created Schedule. */ - static Schedule make(Array ops); + EXPORT static Schedule make(Array ops); static constexpr const char* _type_key = "Schedule"; TVM_DECLARE_NODE_TYPE_INFO(ScheduleNode, Node); diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc new file mode 100644 index 000000000000..d936b873bcd2 --- /dev/null +++ b/src/codegen/build_module.cc @@ -0,0 +1,314 @@ +/*! + * Copyright (c) 2017 by Contributors + * Compile executable modules. 
+ * \file build_module.cc + */ +#include +#include +#include +#include + + +namespace tvm { + +std::string Target::str() const { + std::ostringstream result; + result << target_name; + for (const auto &x : options) { + result << " " << x; + } + return result.str(); +} + +Target TargetFromName(const std::string& name) { + if (name == "llvm") { + return target::llvm(); + } else if (name == "cuda" || name == "nvptx") { + return target::cuda(); + } else if (name == "rocm" || name == "opencl") { + /* For now, assume rocm schedule for opencl */ + return target::rocm(); + } else if (name == "metal") { + return target::metal(); + } else if (name == "stackvm" || name == "ext_dev") { + return target::stackvm(); + } else { + LOG(ERROR) << "Unknown target name " << name; + return target::stackvm(); + } +} + +bool StartsWith(const std::string& str, const std::string& pattern) { + return str.compare(0, pattern.length(), pattern) == 0; +} + +std::string GetDeviceName(const std::string& target_str) { + std::istringstream ss(target_str); + std::string target_name; + ss >> target_name; + + std::string item; + while (ss >> item) { + if (StartsWith(item, "-device=")) { + return item.substr(std::string("-device=").length()); + } + } + + return ""; +} + +Target Target::create(const std::string& target_str) { + if (target_str.length() == 0) { + LOG(ERROR) << "target_str must not be empty"; + } + + std::istringstream ss(target_str); + std::string target_name; + + ss >> target_name; + auto device_name = GetDeviceName(target_str); + + auto result = device_name == "rasp" ? + target::rasp() : + TargetFromName(target_name); + + std::string item; + while (ss >> item) { + result.options.push_back(item); + } + + return result; +} + +namespace target { +Target llvm() { + std::unordered_set keys({ "llvm", "cpu" }); + std::vector options; + return Target("llvm", kDLCPU, 512, 1, keys, options); +} + +Target cuda() { + std::unordered_set keys({ "cuda", "gpu" }); + std::vector options; + return Target("cuda", kDLGPU, 512, 32, keys, options); +} + +Target rocm() { + std::unordered_set keys({ "rocm", "gpu" }); + std::vector options; + return Target("rocm", kDLROCM, 256, 1, keys, options); +} + +Target metal() { + std::unordered_set keys({ "gpu" }); + std::vector options; + return Target("metal", kDLMetal, 256, 1, keys, options); +} + +Target rasp() { + std::unordered_set keys({ "llvm", "cpu" }); + std::vector options({ + "-device=rasp", + "-mtriple=armv7l-none-linux-gnueabihf", + "-mcpu=cortex-a53", + "-mattr=+neon" + }); + return Target("llvm", kDLCPU, 512, 1, keys, options); +} + +Target stackvm() { + std::unordered_set keys({ "stackvm", "cpu" }); + std::vector options; + return Target("stackvm", kDLCPU, 512, 1, keys, options); +} +} // namespace target + +bool LLVMEnabled() { + const runtime::PackedFunc* pf = runtime::Registry::Get("codegen.build_llvm"); + return pf != nullptr; +} + +/*! 
\return The default host target for a given device target */ +Target DefaultTargetHost(Target target) { + if (target.device_type == kDLCPU) { + return target; + } else { + if (LLVMEnabled()) { + return target::llvm(); + } else { + return target::stackvm(); + } + } +} + +Buffer BufferWithOffsetAlignment(Array shape, + Type dtype, + std::string name, + int data_alignment, + int offset_factor) { + auto data = Var(name, Handle()); + + Expr elem_offset; + if (offset_factor != 0) { + elem_offset = Var(name + "_elem_offset", shape[0].type()); + } else { + elem_offset = Expr(); + } + + return BufferNode::make(data, dtype, shape, Array(), elem_offset, name, "", + data_alignment, offset_factor); +} + +void GetBinds(const Array& args, + const std::unordered_map& binds, + Map* out_binds, + Array* out_arg_list, + const BuildConfig& config) { + *out_binds = binds; + + for (const auto &x : args) { + if (out_binds->find(x) == out_binds->end()) { + auto buf = BufferWithOffsetAlignment(x->shape, x->dtype, x->op->name, + config.data_alignment, config.offset_factor); + out_binds->Set(x, buf); + out_arg_list->push_back(buf); + } else { + out_arg_list->push_back((*out_binds)[x]); + } + } +} + +/*! +* \brief Build a Stmt given a schedule, args and binds. This function runs the IR passes. +* \param sch The schedule to build. +* \param args The arguments for the schedule. +* \param binds Buffer assignments. +* \param loop_partition True if the LoopPartition pass should be included. +* \param out_arg_list Returns the arguments for the Stmt. +* \param config The build configuration. +* \return The built Stmt. +*/ +Stmt BuildStmt(Schedule sch, + const Array& args, + const std::unordered_map& binds, + bool loop_partition, + Array *out_arg_list, + const BuildConfig& config) { + Map out_binds; + GetBinds(args, binds, &out_binds, out_arg_list, config); + + sch = sch.normalize(); + + // Phase 0 + auto bounds = schedule::InferBound(sch); + auto stmt = schedule::ScheduleOps(sch, bounds); + stmt = ir::InjectPrefetch(stmt); + + // Phase 1 + stmt = ir::StorageFlatten(stmt, out_binds, 64); + stmt = ir::CanonicalSimplify(stmt); + if (loop_partition) { + stmt = ir::LoopPartition(stmt); + } + stmt = ir::VectorizeLoop(stmt); + stmt = ir::InjectVirtualThread(stmt); + stmt = ir::InjectDoubleBuffer(stmt, config.double_buffer_split_loop); + stmt = ir::StorageRewrite(stmt); + stmt = ir::UnrollLoop(stmt, config.auto_unroll_max_step, config.auto_unroll_max_depth, + config.auto_unroll_max_extent, config.unroll_explicit); + + // Phase 2 + stmt = ir::Simplify(stmt); + stmt = ir::LowerStorageAccessInfo(stmt); + stmt = ir::RemoveNoOp(stmt); + stmt = ir::RewriteUnsafeSelect(stmt); + + return stmt; +} + +Array lower(Schedule sch, + const Array& args, + const std::string& name, + const std::unordered_map& binds, + const BuildConfig& config) { + Array out_arg_list; + auto stmt = BuildStmt(sch, args, binds, true, &out_arg_list, config); + return Array({ ir::MakeAPI(stmt, name, out_arg_list, 0, config.restricted_func) }); +} + +runtime::Module build(const Array& funcs, + const Target& target, + Target* target_host, + const BuildConfig& config) { + std::unordered_set all_names; + for (const auto &x : funcs) { + CHECK(all_names.count(x->name) == 0) << "Duplicate function name " << x->name; + all_names.insert(x->name); + } + + Target target_host_val = target_host == nullptr ? 
+ DefaultTargetHost(target) : + *target_host; + + Array fhost; + Array fdevice; + + for (const auto &x : funcs) { + if (x->func_type == kMixedFunc) { + auto func = x; + if (config.detect_global_barrier) { + func = ir::ThreadSync(func, "global"); + } + + func = ir::ThreadSync(func, "shared"); + func = ir::LowerThreadAllreduce(func, target.thread_warp_size); + auto fsplits = ir::SplitHostDevice(func); + fhost.push_back(fsplits[0]); + for (auto f = fsplits.begin() + 1; f != fsplits.end(); ++f) { + fdevice.push_back(*f); + } + } else if (x->func_type == kHostFunc) { + fhost.push_back(x); + } else if (x->func_type == kDeviceFunc) { + fdevice.push_back(x); + } else { + LOG(FATAL) << "unknown function type " << x->func_type; + } + } + + if (target.keys.count("gpu") > 0 && fdevice.size() == 0) { + LOG(WARNING) << "Specified target " + target.str() + + " but cannot find device code. Did you forget to bind?"; + } + + for (size_t i = 0; i < fhost.size(); ++i) { + auto func = fhost[i]; + func = ir::BindDeviceType(func, target.device_type); + func = ir::LowerTVMBuiltin(func); + fhost.Set(i, func); + } + + + for (size_t i = 0; i < fdevice.size(); ++i) { + auto func = fdevice[i]; + func = ir::LowerIntrin(func, target.target_name); + fdevice.Set(i, func); + } + + for (size_t i = 0; i < fhost.size(); ++i) { + auto func = fhost[i]; + func = ir::LowerIntrin(func, target_host_val.target_name); + func = ir::CombineContextCall(func); + fhost.Set(i, func); + } + + auto mhost = codegen::Build(fhost, target_host_val.str()); + + if (fdevice.size() > 0) { + auto mdev = codegen::Build(fdevice, target.str()); + mhost.Import(mdev); + } + + return mhost; +} +} // namespace tvm diff --git a/src/lang/expr.cc b/src/lang/expr.cc index 348733bad626..be83b521ed8e 100644 --- a/src/lang/expr.cc +++ b/src/lang/expr.cc @@ -47,6 +47,10 @@ std::ostream& operator<<(std::ostream& os, const NodeRef& n) { // NOLINT(*) return os; } +Var var(const std::string& name_hint, Type t) { + return Var(name_hint, t); +} + TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) .set_dispatch([](const IterVarNode *op, IRPrinter *p) { p->stream << "iter_var("; diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc new file mode 100644 index 000000000000..fc3f6ac9324d --- /dev/null +++ b/tests/cpp/build_module_test.cc @@ -0,0 +1,42 @@ +#include +#include +#include +#include +#include + +TEST(BuildModule, Basic) { + using namespace tvm; + auto n = var("n"); + Array shape; + shape.push_back(n); + + auto A = placeholder(shape, Float(32), "A"); + auto B = placeholder(shape, Float(32), "B"); + + auto C = compute(A->shape, [&A, &B](Expr i) { + return A[i] + B[i]; + }, "C"); + + auto s = create_schedule({ C->op }); + + auto cAxis = C->op.as()->axis; + + IterVar bx, tx; + s[C].split(cAxis[0], 64, &bx, &tx); + + auto args = Array({ A, B, C }); + std::unordered_map binds; + + BuildConfig config; + auto target = target::llvm(); + + auto lowered = lower(s, args, "func", binds, config); + auto module = build(lowered, target, nullptr, config); +} + + +int main(int argc, char ** argv) { + testing::InitGoogleTest(&argc, argv); + testing::FLAGS_gtest_death_test_style = "threadsafe"; + return RUN_ALL_TESTS(); +} From 555e084d23bc0bc032f1f8d902c637b1dcf77cde Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 7 Dec 2017 08:20:45 +0800 Subject: [PATCH 041/948] [CODEGEN] add callback post proc for opencl (#692) --- src/codegen/build_opencl.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/codegen/build_opencl.cc b/src/codegen/build_opencl.cc index 
51779d3f7a3e..c4a53744d477 100644 --- a/src/codegen/build_opencl.cc +++ b/src/codegen/build_opencl.cc @@ -16,6 +16,7 @@ namespace tvm { namespace codegen { runtime::Module BuildOpenCL(Array funcs) { + using tvm::runtime::Registry; bool output_ssa = false; CodeGenOpenCL cg; cg.Init(output_ssa); @@ -23,6 +24,10 @@ runtime::Module BuildOpenCL(Array funcs) { cg.AddFunction(f); } std::string code = cg.Finish(); + + if (const auto* f = Registry::Get("tvm_callback_opencl_postproc")) { + code = (*f)(code).operator std::string(); + } #if TVM_OPENCL_RUNTIME return OpenCLModuleCreate(code, "cl", ExtractFuncInfo(funcs)); #else From 99831770a7ccbd808a67635fea5d1b1fa1e91fb1 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 11 Dec 2017 19:50:04 +0800 Subject: [PATCH 042/948] [CODEGEN] add fp16 and fp64 enable pragma for opencl (#697) * [CODEGEN] add fp16 and fp64 enable pragma for opencl * fix style --- src/codegen/codegen_c.cc | 4 +++- src/codegen/codegen_c.h | 2 +- src/codegen/codegen_cuda.cc | 2 +- src/codegen/codegen_cuda.h | 2 +- src/codegen/codegen_metal.cc | 2 +- src/codegen/codegen_metal.h | 2 +- src/codegen/codegen_opencl.cc | 41 ++++++++++++++++++++++++++++++++--- src/codegen/codegen_opencl.h | 9 +++++++- 8 files changed, 54 insertions(+), 10 deletions(-) diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc index 2bbf8324eb51..b315707b52c5 100644 --- a/src/codegen/codegen_c.cc +++ b/src/codegen/codegen_c.cc @@ -272,7 +272,7 @@ void CodeGenC::PrintStorageScope(const std::string& scope, std::ostream& os) { / CHECK_EQ(scope, "global"); } -void CodeGenC::PrintType(Type t, std::ostream& os) const { // NOLINT(*) +void CodeGenC::PrintType(Type t, std::ostream& os) { // NOLINT(*) CHECK_EQ(t.lanes(), 1) << "do not yet support vector types"; if (t.is_handle()) { @@ -402,7 +402,9 @@ inline void PrintBinaryIntrinsitc(const Call* op, } } void CodeGenC::VisitExpr_(const Cast *op, std::ostream& os) { // NOLINT(*) + os << "("; this->PrintType(op->type, os); + os << ")"; os << '('; this->PrintExpr(op->value, os); os << ')'; diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h index 8141c57c84a5..1c68dd18bd68 100644 --- a/src/codegen/codegen_c.h +++ b/src/codegen/codegen_c.h @@ -118,7 +118,7 @@ class CodeGenC : * \param t The type representation. * \param os The stream to print the ctype into */ - virtual void PrintType(Type t, std::ostream& os) const; // NOLINT(*) + virtual void PrintType(Type t, std::ostream& os); // NOLINT(*) /*! 
* \brief Print expr representing the thread tag * \param IterVar iv The thread index to be binded; diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc index 249128d12f62..90b01e91bd51 100644 --- a/src/codegen/codegen_cuda.cc +++ b/src/codegen/codegen_cuda.cc @@ -45,7 +45,7 @@ void CodeGenCUDA::BindThreadIndex(const IterVar& iv) { CastFromTo(iv->thread_tag, UInt(32), iv->var.type()); } -void CodeGenCUDA::PrintType(Type t, std::ostream& os) const { // NOLINT(*) +void CodeGenCUDA::PrintType(Type t, std::ostream& os) { // NOLINT(*) int lanes = t.lanes(); if (t.is_handle()) { CHECK_EQ(lanes, 1) diff --git a/src/codegen/codegen_cuda.h b/src/codegen/codegen_cuda.h index c0703523dfeb..9aa72d14cb06 100644 --- a/src/codegen/codegen_cuda.h +++ b/src/codegen/codegen_cuda.h @@ -26,7 +26,7 @@ class CodeGenCUDA final : public CodeGenC { void PrintVecBinaryOp( const std::string&op, Type t, Expr lhs, Expr rhs, std::ostream& os) final; // NOLINT(*) - void PrintType(Type t, std::ostream& os) const final; // NOLINT(*) + void PrintType(Type t, std::ostream& os) final; // NOLINT(*) void PrintVecElemLoad( const std::string& vec, Type t, int i, std::ostream& os) final; // NOLINT(*) void PrintVecElemStore( diff --git a/src/codegen/codegen_metal.cc b/src/codegen/codegen_metal.cc index 7eb1a03c2019..0df63aad49a2 100644 --- a/src/codegen/codegen_metal.cc +++ b/src/codegen/codegen_metal.cc @@ -132,7 +132,7 @@ void CodeGenMetal::BindThreadIndex(const IterVar& iv) { CastFromTo(iv->thread_tag, UInt(thread_index_bits_), iv->var.type()); } -void CodeGenMetal::PrintType(Type t, std::ostream& os) const { // NOLINT(*) +void CodeGenMetal::PrintType(Type t, std::ostream& os) { // NOLINT(*) int lanes = t.lanes(); if (t.is_handle()) { CHECK_EQ(lanes, 1) diff --git a/src/codegen/codegen_metal.h b/src/codegen/codegen_metal.h index e1bc09f0a939..80efef67332e 100644 --- a/src/codegen/codegen_metal.h +++ b/src/codegen/codegen_metal.h @@ -23,7 +23,7 @@ class CodeGenMetal final : public CodeGenC { void InitFuncState(LoweredFunc f) final; void PrintStorageScope(const std::string& scope, std::ostream& os) final; // NOLINT(*) void PrintStorageSync(const Call* op) final; // NOLINT(*) - void PrintType(Type t, std::ostream& os) const final; // NOLINT(*) + void PrintType(Type t, std::ostream& os) final; // NOLINT(*) void BindThreadIndex(const IterVar& iv) final; // NOLINT(*) // overload visitor void VisitExpr_(const Broadcast* op, std::ostream& os) final; // NOLINT(*) diff --git a/src/codegen/codegen_opencl.cc b/src/codegen/codegen_opencl.cc index 9d0e16f2862e..ccd164bcd9e0 100644 --- a/src/codegen/codegen_opencl.cc +++ b/src/codegen/codegen_opencl.cc @@ -30,6 +30,35 @@ void CodeGenOpenCL::AddFunction(LoweredFunc f) { CodeGenC::AddFunction(f); } +std::string CodeGenOpenCL::Finish() { + // inject extension enable pragma for fp16 and fp64 + if (enable_fp16_) { + decl_stream + << "#ifdef cl_khr_fp16\n" + "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" + "#elif defined(cl_amd_fp16)\n" + "#pragma OPENCL EXTENSION cl_amd_fp16 : enable\n" + "#else\n" + "#error \"Half precision floating point not supported" + "by OpenCL implementation on your device.\" \n" + "#endif\n\n"; + } + + if (enable_fp64_) { + decl_stream + << "#ifdef cl_khr_fp64\n" + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" + "#elif defined(cl_amd_fp64)\n" + "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n" + "#else\n" + "#error \"Double precision floating point not supported" + "by OpenCL implementation on your device.\" \n" + "#endif\n\n"; + } + + return 
CodeGenC::Finish(); +} + void CodeGenOpenCL::BindThreadIndex(const IterVar& iv) { CHECK(!var_idmap_.count(iv->var.get())); runtime::ThreadScope ts = runtime::ThreadScope::make(iv->thread_tag); @@ -43,7 +72,7 @@ void CodeGenOpenCL::BindThreadIndex(const IterVar& iv) { CastFromTo(os.str(), UInt(64), iv->var.type()); } -void CodeGenOpenCL::PrintType(Type t, std::ostream& os) const { // NOLINT(*) +void CodeGenOpenCL::PrintType(Type t, std::ostream& os) { // NOLINT(*) int lanes = t.lanes(); if (t.is_handle()) { CHECK_EQ(lanes, 1) @@ -53,9 +82,15 @@ void CodeGenOpenCL::PrintType(Type t, std::ostream& os) const { // NOLINT(*) bool fail = false; if (t.is_float()) { switch (t.bits()) { - case 16: os << "half"; break; + case 16: + os << "half"; + enable_fp16_ = true; + break; case 32: os << "float"; break; - case 64: os << "double"; break; + case 64: + os << "double"; + enable_fp64_ = true; + break; default: fail = true; break; } if (!fail && lanes == 1) return; diff --git a/src/codegen/codegen_opencl.h b/src/codegen/codegen_opencl.h index a39d4e104c47..a10c165ee3a1 100644 --- a/src/codegen/codegen_opencl.h +++ b/src/codegen/codegen_opencl.h @@ -18,12 +18,14 @@ class CodeGenOpenCL final : public CodeGenC { public: CodeGenOpenCL(); void AddFunction(LoweredFunc f); + std::string Finish(); + // override print thread tag. void InitFuncState(LoweredFunc f) final; void BindThreadIndex(const IterVar& iv) final; // NOLINT(*) void PrintStorageScope(const std::string& scope, std::ostream& os) final; // NOLINT(*) void PrintStorageSync(const Call* op) final; // NOLINT(*) - void PrintType(Type t, std::ostream& os) const final; // NOLINT(*) + void PrintType(Type t, std::ostream& os) final; // NOLINT(*) std::string GetVecLoad(Type t, const Variable* buffer, Expr base) final; void PrintVecStore(const Variable* buffer, @@ -34,6 +36,11 @@ class CodeGenOpenCL final : public CodeGenC { Expr base, std::ostream& os); // NOLINT(*) // overload visitor void VisitExpr_(const Broadcast* op, std::ostream& os) final; // NOLINT(*) + + private: + // whether enable fp16 and fp64 extension + bool enable_fp16_{false}; + bool enable_fp64_{false}; }; } // namespace codegen From 367ca3bfcee76637a06aa9c17101ffb852540bff Mon Sep 17 00:00:00 2001 From: abergeron Date: Mon, 11 Dec 2017 18:43:11 -0500 Subject: [PATCH 043/948] Fix long for windows in cuda (#700) * Use long long for platforms where long is 32 bits (like windows). * Make sure scalar chars are signed. * Re-add NOLINT marker. 
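
Background for the fix below: C's `long` is 8 bytes on LP64 platforms (Linux/macOS x86-64) but only 4 bytes on LLP64 Windows, so a 64-bit TVM integer has to be printed as `long long` whenever sizeof(long) != 8. The difference is easy to confirm from Python (illustrative only):

    import ctypes
    import platform

    # 8 on LP64 Linux/macOS, 4 on LLP64 Windows; c_longlong is 8 everywhere.
    print(platform.system(), ctypes.sizeof(ctypes.c_long))
    assert ctypes.sizeof(ctypes.c_longlong) == 8
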
--- src/codegen/codegen_cuda.cc | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc index 90b01e91bd51..cace8d8573bb 100644 --- a/src/codegen/codegen_cuda.cc +++ b/src/codegen/codegen_cuda.cc @@ -77,13 +77,28 @@ void CodeGenCUDA::PrintType(Type t, std::ostream& os) { // NOLINT(*) os << "int"; return; } switch (t.bits()) { - case 8: os << "char"; break; + case 8: { + if (!t.is_uint() && t.lanes() == 1) { + os << "signed char"; break; + } else { + os << "char"; break; + } + } case 16: os << "short"; break; case 32: os << "int"; break; case 64: { - CHECK(sizeof(long) == 8) // NOLINT(*) - << "CUDA not support int64 int in 32 bit system"; - os << "long"; break; + if (sizeof(long) != 8) { // NOLINT(*) + if (t.lanes() == 1) { + os << "long long"; break; + } else if (t.lanes() == 2) { + os << "longlong"; break; + } else { + // No longlong3, longlong4 + LOG(FATAL) << "Cannot convert type " << t << " to CUDA type on a L32 platform"; + } + } else { + os << "long"; break; + } } case 1: os << "int"; break; default: fail = true; break; From ce1c9d62075bdeb3c3f605fb0d4ba926f6127f0e Mon Sep 17 00:00:00 2001 From: Salem Derisavi <33945117+derisavi-huawei@users.noreply.github.com> Date: Tue, 12 Dec 2017 20:10:35 -0500 Subject: [PATCH 044/948] 1) Make unroll code reusable 2) reduce non-determinisim in CanonicalSimplify (#701) * 1) Refactored some parts of the unrolling code into their own methods so we can reuse unrolling functionality in other parts of the code. E.g., to explicitly unroll loops with count of 1 when they are programmatically created. 2) Reorder based on top operator before resorting to pointers, which causes non-determinism. * Fixed lint errors --- src/arithmetic/canonical.cc | 2 + src/pass/unroll_loop.cc | 79 +++++++++++++++++++++++-------------- 2 files changed, 52 insertions(+), 29 deletions(-) diff --git a/src/arithmetic/canonical.cc b/src/arithmetic/canonical.cc index 24369db02390..473e330de735 100644 --- a/src/arithmetic/canonical.cc +++ b/src/arithmetic/canonical.cc @@ -29,6 +29,8 @@ struct ComExprEntry { inline bool operator<(const ComExprEntry& other) const { if (level < other.level) return true; if (level > other.level) return false; + if (value.type_index() < other.value.type_index()) return true; + if (value.type_index() > other.value.type_index()) return false; return value.get() < other.value.get(); } }; diff --git a/src/pass/unroll_loop.cc b/src/pass/unroll_loop.cc index 01c5e6ebff00..22fd389454af 100644 --- a/src/pass/unroll_loop.cc +++ b/src/pass/unroll_loop.cc @@ -30,17 +30,7 @@ class LoopUnroller : public IRMutator { Stmt Mutate_(const For* op, const Stmt& s) { Stmt stmt = IRMutator::Mutate_(op, s); op = stmt.as(); - // constant folding. 
- Expr extent = ir::Simplify(op->extent); - const IntImm* v1 = extent.as(); - const UIntImm* v2 = extent.as(); - int value = -1; - if (v1 != nullptr) { - value = static_cast(v1->value); - } - if (v2 != nullptr) { - value = static_cast(v2->value); - } + int value = GetExtent(op); // condition for auto unroll bool auto_unroll = ( op->for_type == ForType::Serial && @@ -66,24 +56,7 @@ class LoopUnroller : public IRMutator { } if (auto_unroll && explicit_unroll_) { - using arith::ComputeExpr; - if (value == 0) return Evaluate::make(0); - Stmt body = op->body; - Map vmap; - Stmt unrolled; - for (int i = 0; i < value; ++i) { - Var lv(op->loop_var.node_); - vmap.Set(lv, - ComputeExpr( - op->min, make_const(op->loop_var.type(), i))); - Stmt step = Substitute(body, vmap); - if (unrolled.defined()) { - unrolled = Block::make(unrolled, step); - } else { - unrolled = step; - } - } - return unrolled; + return Unroll(op); } else { if (auto_unroll) { if (op->for_type != ForType::Unrolled) { @@ -128,7 +101,47 @@ class LoopUnroller : public IRMutator { } } + Stmt Unroll(const For* op) { + using arith::ComputeExpr; + int value = GetExtent(op); + // For loop must have a constant integer extent + CHECK_NE(value, -1) << "loop doesn't have a constant integer extent"; + if (value == 0) return Evaluate::make(0); + Stmt body = op->body; + Map vmap; + Stmt unrolled; + for (int i = 0; i < value; ++i) { + Var lv(op->loop_var.node_); + vmap.Set(lv, + ComputeExpr( + op->min, make_const(op->loop_var.type(), i))); + Stmt step = Substitute(body, vmap); + if (unrolled.defined()) { + unrolled = Block::make(unrolled, step); + } else { + unrolled = step; + } + } + return unrolled; + } + private: + // returns the extent of the loop if it's a constant integer, otherwise return -1 + int GetExtent(const For* op) { + // constant folding. + Expr extent = ir::Simplify(op->extent); + const IntImm *v1 = extent.as(); + const UIntImm *v2 = extent.as(); + int value = -1; + if (v1 != nullptr) { + value = static_cast(v1->value); + } + if (v2 != nullptr) { + value = static_cast(v2->value); + } + return value; + } + // maximum number of step to perform auto unroll. 
int auto_max_step_; int auto_max_depth_; @@ -162,5 +175,13 @@ Stmt UnrollLoop(Stmt stmt, } } +Stmt UnrollLoopExplicitly(Stmt stmt) { + const For* op = stmt.as(); + if (!op) { + LOG(FATAL) << "attempted to unroll a non-loop statement"; + } + return LoopUnroller(0, 0, 0, false).Unroll(op); +} + } // namespace ir } // namespace tvm From 793f4912ddab5cb228c815ac9e69f384cf3d2fc9 Mon Sep 17 00:00:00 2001 From: Salem Derisavi <33945117+derisavi-huawei@users.noreply.github.com> Date: Wed, 13 Dec 2017 18:28:52 -0500 Subject: [PATCH 045/948] Simplify expressions early on (#702) * Simplify expressions early on * fixed lint errors --- src/arithmetic/int_set.cc | 2 +- src/schedule/bound.cc | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/arithmetic/int_set.cc b/src/arithmetic/int_set.cc index 8a88ed23e262..b19aa4691e8b 100644 --- a/src/arithmetic/int_set.cc +++ b/src/arithmetic/int_set.cc @@ -27,7 +27,7 @@ inline IntSet IntSet::cover_interval() const { for (size_t i = 0; i < s->extents.size(); ++i) { max = max + s->extents[i] * s->strides[i] - s->strides[i]; } - return IntervalSet::make(s->base.min, max); + return IntervalSet::make(s->base.min, Simplify(max)); } LOG(FATAL) << "cannot convert set " << (*this)->type_key() << " to interval"; return IntSet::everything(); diff --git a/src/schedule/bound.cc b/src/schedule/bound.cc index 203ce28708a3..1a06970e52e4 100644 --- a/src/schedule/bound.cc +++ b/src/schedule/bound.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include "./graph.h" @@ -209,6 +210,10 @@ Map InferBound(const Schedule& sch) { ret[iv] = iv->dom; } } + for (auto& p : ret) { + ret[p.first] = Range::make_by_min_extent(ir::Simplify(p.second->min), + ir::Simplify(p.second->extent)); + } return Map(ret.begin(), ret.end()); } From fb14d9d747e3af3075117d1e1103d04713248d18 Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Thu, 14 Dec 2017 16:35:50 -0800 Subject: [PATCH 046/948] Make duplicated function name checker working (#705) --- CONTRIBUTORS.md | 1 + python/tvm/build_module.py | 1 + 2 files changed, 2 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 944e76fd3b83..a62c11e5fa4d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -36,3 +36,4 @@ List of Contributors - [Jian Weng](https://github.com/were) - [Masahiro Masuda](https://github.com/masahi) - [Haolong Zhang](https://github.com/haolongzhangm) +- [Cody Hao Yu](https://github.com/comaniac) diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index 5756fe6d14c6..083074a6676e 100644 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -310,6 +310,7 @@ def build(sch, raise ValueError("sch have to be Schedule, LoweredFunc or list of LoweredFunc") if x.name in fname_set: raise ValueError("Duplicate function name %s" % x.name) + fname_set.add(x.name) target = _target.current_target() if target is None else target target = _target.create(target) if target else _target.create("llvm") From 9bb6d77ed295899d6c25f491fc3aef762410566c Mon Sep 17 00:00:00 2001 From: masahi Date: Sat, 16 Dec 2017 18:06:49 -0500 Subject: [PATCH 047/948] fix cudnn output shape (#708) --- src/contrib/cudnn/conv_forward.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/contrib/cudnn/conv_forward.cc b/src/contrib/cudnn/conv_forward.cc index fadcfa03b021..480a789303c6 100644 --- a/src/contrib/cudnn/conv_forward.cc +++ b/src/contrib/cudnn/conv_forward.cc @@ -114,7 +114,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.output_shape") int x_dim3 = 
args[10]; int w_dim0 = args[11]; int w_dim1 = args[12]; - int w_dim2 = args[12]; + int w_dim2 = args[13]; int w_dim3 = args[14]; void *out_shape = args[15]; // Set Format From 52871592db15b0354e706724ee110338fc1bb7bc Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Sun, 17 Dec 2017 15:52:48 -0800 Subject: [PATCH 048/948] Halide -> HalideIR (#698) --- HalideIR | 2 +- include/tvm/expr.h | 56 +++++++++--------- include/tvm/ir.h | 96 +++++++++++++++---------------- include/tvm/ir_operator.h | 14 ++--- include/tvm/packed_func_ext.h | 10 ++-- include/tvm/runtime/packed_func.h | 12 ++-- include/tvm/tensor.h | 2 +- src/api/api_ir.cc | 2 +- src/arithmetic/bound_deducer.cc | 2 +- src/arithmetic/canonical.cc | 4 +- src/arithmetic/compute_expr.h | 10 ++-- src/arithmetic/int_set.cc | 2 +- src/arithmetic/int_set_internal.h | 2 +- src/lang/expr.cc | 6 +- src/lang/ir.cc | 4 +- src/lang/reflection.cc | 2 +- src/lang/tensor.cc | 2 +- src/op/compute_op.cc | 2 +- src/op/extern_op.cc | 2 +- src/op/scan_op.cc | 2 +- src/pass/inject_prefetch.cc | 2 +- src/pass/ir_deep_compare.cc | 4 +- src/pass/ir_mutator.cc | 4 +- src/pass/storage_flatten.cc | 2 +- src/schedule/message_passing.cc | 2 +- tests/cpp/ir_mutator_test.cc | 6 +- tests/cpp/ir_simplify_test.cc | 2 +- tests/cpp/ir_ssa_test.cc | 4 +- tests/cpp/ir_visitor_test.cc | 2 +- 29 files changed, 131 insertions(+), 131 deletions(-) diff --git a/HalideIR b/HalideIR index d91cf97d5d6c..aadbf02d6bd7 160000 --- a/HalideIR +++ b/HalideIR @@ -1 +1 @@ -Subproject commit d91cf97d5d6cd2b47ec408bb08e978b88cbf6ab7 +Subproject commit aadbf02d6bd7a545edbf6652494a7b07a97a06c1 diff --git a/include/tvm/expr.h b/include/tvm/expr.h index c0f4fea24bf8..5d35728c24b4 100644 --- a/include/tvm/expr.h +++ b/include/tvm/expr.h @@ -16,31 +16,31 @@ namespace tvm { -using Halide::Type; -using Halide::Float; -using Halide::Bool; -using Halide::Int; -using Halide::UInt; -using Halide::Handle; -using Halide::ExprHash; -using Halide::ExprEqual; +using HalideIR::Type; +using HalideIR::Float; +using HalideIR::Bool; +using HalideIR::Int; +using HalideIR::UInt; +using HalideIR::Handle; +using HalideIR::ExprHash; +using HalideIR::ExprEqual; -using Halide::Expr; -using Halide::VarExpr; -using Halide::IR::RangeNode; -using Halide::IR::FunctionRef; -using Halide::IR::FunctionBaseNode; -using Halide::Internal::Stmt; -using Halide::Internal::IRPrinter; -using Halide::Internal::Variable; +using HalideIR::Expr; +using HalideIR::VarExpr; +using HalideIR::IR::RangeNode; +using HalideIR::IR::FunctionRef; +using HalideIR::IR::FunctionBaseNode; +using HalideIR::Internal::Stmt; +using HalideIR::Internal::IRPrinter; +using HalideIR::Internal::Variable; -using Halide::Internal::make_const; -using Halide::Internal::make_zero; -using Halide::Internal::as_const_int; -using Halide::Internal::as_const_uint; -using Halide::Internal::const_true; -using Halide::Internal::const_false; -using Halide::Internal::is_no_op; +using HalideIR::Internal::make_const; +using HalideIR::Internal::make_zero; +using HalideIR::Internal::as_const_int; +using HalideIR::Internal::as_const_uint; +using HalideIR::Internal::const_true; +using HalideIR::Internal::const_false; +using HalideIR::Internal::is_no_op; inline Type TVMShapeIndexType() { if (std::is_signed::value) { @@ -51,7 +51,7 @@ inline Type TVMShapeIndexType() { } inline Type TVMType2Type(TVMType t) { - return Type(static_cast(t.code), t.bits, t.lanes); + return Type(static_cast(t.code), t.bits, t.lanes); } inline TVMType Type2TVMType(Type t) { @@ -71,7 +71,7 @@ inline int 
GetVectorBytes(Type dtype) { } /*! \brief a named variable in TVM */ -class Var : public Halide::VarExpr { +class Var : public HalideIR::VarExpr { public: explicit Var(const std::string& name_hint = "v", Type t = Int(32)) : VarExpr(name_hint, t) {} @@ -94,7 +94,7 @@ class Var : public Halide::VarExpr { class IterVarNode; /*! - * \brief same as Halide::IR::Range + * \brief same as HalideIR::IR::Range * except it provide an constructor with (begin, end) * * \note Traditional Halide's Range have a constructor with @@ -102,11 +102,11 @@ class IterVarNode; * We decided to correct it by removing the constructor in HalideIR, * and add it back in TVM's range. */ -class Range : public Halide::IR::Range { +class Range : public HalideIR::IR::Range { public: /*! \brief constructor */ Range() {} - explicit Range(std::shared_ptr n) : Halide::IR::Range(n) {} + explicit Range(std::shared_ptr n) : HalideIR::IR::Range(n) {} /*! * \brief constructor by begin and end * \param begin The begin of the range. diff --git a/include/tvm/ir.h b/include/tvm/ir.h index ae53d38b82b2..5b8b56be7db6 100644 --- a/include/tvm/ir.h +++ b/include/tvm/ir.h @@ -16,11 +16,11 @@ namespace tvm { namespace ir { -using Halide::Internal::ExprNode; -using Halide::Internal::StmtNode; -using Halide::Internal::IRNodeType; -using Halide::Internal::ForType; -using Halide::DeviceAPI; +using HalideIR::Internal::ExprNode; +using HalideIR::Internal::StmtNode; +using HalideIR::Internal::IRNodeType; +using HalideIR::Internal::ForType; +using HalideIR::DeviceAPI; // Node container for CommReducer struct CommReducerNode; @@ -433,50 +433,50 @@ enum TVMStructFieldKind : int { } // namespace intrinsic // Reuse IR node defintiion from HalideIR -using Halide::Internal::IntImm; -using Halide::Internal::UIntImm; -using Halide::Internal::FloatImm; -using Halide::Internal::StringImm; -using Halide::Internal::Cast; -using Halide::Internal::Add; -using Halide::Internal::Sub; -using Halide::Internal::Mul; -using Halide::Internal::Div; -using Halide::Internal::Mod; -using Halide::Internal::Min; -using Halide::Internal::Max; -using Halide::Internal::EQ; -using Halide::Internal::NE; -using Halide::Internal::LT; -using Halide::Internal::LE; -using Halide::Internal::GT; -using Halide::Internal::GE; -using Halide::Internal::And; -using Halide::Internal::Or; -using Halide::Internal::Not; -using Halide::Internal::Select; -using Halide::Internal::Load; -using Halide::Internal::Ramp; -using Halide::Internal::Broadcast; -using Halide::Internal::Call; -using Halide::Internal::Let; -using Halide::Internal::LetStmt; -using Halide::Internal::AttrStmt; -using Halide::Internal::AssertStmt; -using Halide::Internal::ProducerConsumer; -using Halide::Internal::For; -using Halide::Internal::Store; -using Halide::Internal::Provide; -using Halide::Internal::Allocate; -using Halide::Internal::Free; -using Halide::Internal::Realize; -using Halide::Internal::Prefetch; -using Halide::Internal::Block; -using Halide::Internal::IfThenElse; -using Halide::Internal::Evaluate; -using Halide::Internal::Shuffle; +using HalideIR::Internal::IntImm; +using HalideIR::Internal::UIntImm; +using HalideIR::Internal::FloatImm; +using HalideIR::Internal::StringImm; +using HalideIR::Internal::Cast; +using HalideIR::Internal::Add; +using HalideIR::Internal::Sub; +using HalideIR::Internal::Mul; +using HalideIR::Internal::Div; +using HalideIR::Internal::Mod; +using HalideIR::Internal::Min; +using HalideIR::Internal::Max; +using HalideIR::Internal::EQ; +using HalideIR::Internal::NE; +using 
HalideIR::Internal::LT; +using HalideIR::Internal::LE; +using HalideIR::Internal::GT; +using HalideIR::Internal::GE; +using HalideIR::Internal::And; +using HalideIR::Internal::Or; +using HalideIR::Internal::Not; +using HalideIR::Internal::Select; +using HalideIR::Internal::Load; +using HalideIR::Internal::Ramp; +using HalideIR::Internal::Broadcast; +using HalideIR::Internal::Call; +using HalideIR::Internal::Let; +using HalideIR::Internal::LetStmt; +using HalideIR::Internal::AttrStmt; +using HalideIR::Internal::AssertStmt; +using HalideIR::Internal::ProducerConsumer; +using HalideIR::Internal::For; +using HalideIR::Internal::Store; +using HalideIR::Internal::Provide; +using HalideIR::Internal::Allocate; +using HalideIR::Internal::Free; +using HalideIR::Internal::Realize; +using HalideIR::Internal::Prefetch; +using HalideIR::Internal::Block; +using HalideIR::Internal::IfThenElse; +using HalideIR::Internal::Evaluate; +using HalideIR::Internal::Shuffle; // ir functions -using Halide::Internal::is_const_power_of_two_integer; +using HalideIR::Internal::is_const_power_of_two_integer; } // namespace ir } // namespace tvm diff --git a/include/tvm/ir_operator.h b/include/tvm/ir_operator.h index a0726f0030ab..8b27389db0d8 100644 --- a/include/tvm/ir_operator.h +++ b/include/tvm/ir_operator.h @@ -12,14 +12,14 @@ namespace tvm { -using Halide::likely; -using Halide::likely_if_innermost; +using HalideIR::likely; +using HalideIR::likely_if_innermost; // functions -using Halide::cast; -using Halide::min; -using Halide::max; -using Halide::abs; -using Halide::select; +using HalideIR::cast; +using HalideIR::min; +using HalideIR::max; +using HalideIR::abs; +using HalideIR::select; /*! * \brief sum of of source expression over axis diff --git a/include/tvm/packed_func_ext.h b/include/tvm/packed_func_ext.h index 542de6a368b5..a598286001a3 100644 --- a/include/tvm/packed_func_ext.h +++ b/include/tvm/packed_func_ext.h @@ -104,7 +104,7 @@ inline TNodeRef TVMArgValue::AsNodeRef() const { return TNodeRef(sptr); } -inline TVMArgValue::operator Halide::Expr() const { +inline TVMArgValue::operator HalideIR::Expr() const { if (type_code_ == kNull) return Expr(); if (type_code_ == kDLInt) { return Expr(static_cast(value_.v_int64)); @@ -184,20 +184,20 @@ inline void TVMArgsSetter::operator()(size_t i, const NodeRef& other) const { / } // type related stuffs -inline TVMRetValue& TVMRetValue::operator=(const Halide::Type& t) { +inline TVMRetValue& TVMRetValue::operator=(const HalideIR::Type& t) { return this->operator=(Type2TVMType(t)); } -inline TVMRetValue::operator Halide::Type() const { +inline TVMRetValue::operator HalideIR::Type() const { return TVMType2Type(operator TVMType()); } -inline TVMArgValue::operator Halide::Type() const { +inline TVMArgValue::operator HalideIR::Type() const { return TVMType2Type(operator TVMType()); } inline void TVMArgsSetter::operator()( - size_t i, const Halide::Type& t) const { + size_t i, const HalideIR::Type& t) const { this->operator()(i, Type2TVMType(t)); } } // namespace runtime diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index 4c08ffa6077c..b01e662b97ab 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -17,7 +17,7 @@ #include "./c_runtime_api.h" #include "./module.h" -namespace Halide { +namespace HalideIR { // Forward declare type for extensions // The header works fine without depending on this. 
struct Type; @@ -351,8 +351,8 @@ class TVMArgValue : public TVMPODValue_ { typename = typename std::enable_if< std::is_class::value>::type> inline bool IsNodeType() const; - inline operator Halide::Type() const; - inline operator Halide::Expr() const; + inline operator HalideIR::Type() const; + inline operator HalideIR::Expr() const; // get internal node ptr, if it is node inline std::shared_ptr& node_sptr(); }; @@ -531,8 +531,8 @@ class TVMRetValue : public TVMPODValue_ { inline TVMRetValue& operator=(const NodeRef& other); inline TVMRetValue& operator=(const std::shared_ptr& other); // type related - inline operator Halide::Type() const; - inline TVMRetValue& operator=(const Halide::Type& other); + inline operator HalideIR::Type() const; + inline TVMRetValue& operator=(const HalideIR::Type& other); private: template @@ -800,7 +800,7 @@ class TVMArgsSetter { inline void operator()(size_t i, const T& value) const; // NodeRef related extenstions: in tvm/packed_func_ext.h inline void operator()(size_t i, const NodeRef& other) const; // NOLINT(*) - inline void operator()(size_t i, const Halide::Type& t) const; + inline void operator()(size_t i, const HalideIR::Type& t) const; private: /*! \brief The values fields */ diff --git a/include/tvm/tensor.h b/include/tvm/tensor.h index 4f46d86e93a5..0a74f97c0640 100644 --- a/include/tvm/tensor.h +++ b/include/tvm/tensor.h @@ -23,7 +23,7 @@ class TensorNode; // internal node container for Operation class OperationNode; -using Halide::IR::FunctionRef; +using HalideIR::IR::FunctionRef; /*! * \brief Tensor structure representing a possible input, diff --git a/src/api/api_ir.cc b/src/api/api_ir.cc index 0a26af0ca43c..d0847aceceb3 100644 --- a/src/api/api_ir.cc +++ b/src/api/api_ir.cc @@ -27,7 +27,7 @@ TVM_REGISTER_API("make.For") args[1], args[2], static_cast(args[3].operator int()), - static_cast(args[4].operator int()), + static_cast(args[4].operator int()), args[5]); }); diff --git a/src/arithmetic/bound_deducer.cc b/src/arithmetic/bound_deducer.cc index e09834923c37..7a8c841025fa 100644 --- a/src/arithmetic/bound_deducer.cc +++ b/src/arithmetic/bound_deducer.cc @@ -16,7 +16,7 @@ namespace tvm { namespace arith { using namespace ir; -using Halide::Internal::Interval; +using HalideIR::Internal::Interval; // a visitor to find the path to the target variable // from a expression. diff --git a/src/arithmetic/canonical.cc b/src/arithmetic/canonical.cc index 473e330de735..27ccfb09cdeb 100644 --- a/src/arithmetic/canonical.cc +++ b/src/arithmetic/canonical.cc @@ -626,7 +626,7 @@ Expr CanonicalSimplify(Expr expr, Map vrange) { template T Simplify_(T a, Map vrange) { - using namespace Halide::Internal; + using namespace HalideIR::Internal; Scope rscope; for (auto kv : vrange) { Range r = kv.second; @@ -635,7 +635,7 @@ T Simplify_(T a, Map vrange) { Interval(r->min, simplify(r->min + r->extent - make_const(r->min.type(), 1)))); } - return Halide::Internal::simplify(a, true, rscope); + return HalideIR::Internal::simplify(a, true, rscope); } diff --git a/src/arithmetic/compute_expr.h b/src/arithmetic/compute_expr.h index 994bcb13eadc..5f44347f3539 100644 --- a/src/arithmetic/compute_expr.h +++ b/src/arithmetic/compute_expr.h @@ -14,9 +14,9 @@ namespace tvm { namespace arith { -using Halide::Internal::add_would_overflow; -using Halide::Internal::sub_would_overflow; -using Halide::Internal::mul_would_overflow; +using HalideIR::Internal::add_would_overflow; +using HalideIR::Internal::sub_would_overflow; +using HalideIR::Internal::mul_would_overflow; /*! 
* \brief Compute the expression with the given binary op. @@ -133,12 +133,12 @@ inline Expr ComputeExpr(Expr a, Expr b) { template<> inline Expr ComputeExpr(Expr a, Expr b) { - return Halide::Internal::Interval::make_max(a, b); + return HalideIR::Internal::Interval::make_max(a, b); } template<> inline Expr ComputeExpr(Expr a, Expr b) { - return Halide::Internal::Interval::make_min(a, b); + return HalideIR::Internal::Interval::make_min(a, b); } template diff --git a/src/arithmetic/int_set.cc b/src/arithmetic/int_set.cc index b19aa4691e8b..c004b9666a58 100644 --- a/src/arithmetic/int_set.cc +++ b/src/arithmetic/int_set.cc @@ -15,7 +15,7 @@ namespace tvm { namespace arith { -using Halide::Internal::Interval; +using HalideIR::Internal::Interval; using namespace ir; inline IntSet IntSet::cover_interval() const { diff --git a/src/arithmetic/int_set_internal.h b/src/arithmetic/int_set_internal.h index fca4b819905f..9284e6e016e0 100644 --- a/src/arithmetic/int_set_internal.h +++ b/src/arithmetic/int_set_internal.h @@ -13,7 +13,7 @@ namespace tvm { namespace arith { -using Halide::Internal::Interval; +using HalideIR::Internal::Interval; /*! \brief Set of continuous interval */ struct IntervalSet : public IntSetNode { diff --git a/src/lang/expr.cc b/src/lang/expr.cc index be83b521ed8e..0fb783d70cb8 100644 --- a/src/lang/expr.cc +++ b/src/lang/expr.cc @@ -10,7 +10,7 @@ namespace tvm { -using Halide::IR::RangeNode; +using HalideIR::IR::RangeNode; Range::Range(Expr begin, Expr end) : Range(std::make_shared( @@ -19,7 +19,7 @@ Range::Range(Expr begin, Expr end) } Range Range::make_by_min_extent(Expr min, Expr extent) { - return Range(std::make_shared(min, extent)); + return Range(std::make_shared(min, extent)); } IterVar IterVarNode::make(Range dom, Var var, @@ -67,7 +67,7 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) }); TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) -.set_dispatch([](const Halide::IR::RangeNode *op, IRPrinter *p) { +.set_dispatch([](const HalideIR::IR::RangeNode *op, IRPrinter *p) { p->stream << "range(min=" << op->min << ", ext=" << op->extent << ')'; }); diff --git a/src/lang/ir.cc b/src/lang/ir.cc index 776f1f2368f5..1e0a6e5065f4 100644 --- a/src/lang/ir.cc +++ b/src/lang/ir.cc @@ -11,7 +11,7 @@ #include #include "../pass/ir_util.h" -namespace Halide { +namespace HalideIR { namespace Internal { using tvm::ir::CommReducerNode; @@ -43,7 +43,7 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) << ")"; }); } // namespace Internal -} // namespace Halide +} // namespace HalideIR namespace tvm { namespace ir { diff --git a/src/lang/reflection.cc b/src/lang/reflection.cc index f40e399ef911..a8152d214d1f 100644 --- a/src/lang/reflection.cc +++ b/src/lang/reflection.cc @@ -25,7 +25,7 @@ inline std::string Type2String(const Type& t) { inline Type String2Type(std::string s) { std::istringstream is(s); - halide_type_code_t code = Type::Int; + halideir_type_code_t code = Type::Int; if (s.substr(0, 3) == "int") { code = Type::Int; s = s.substr(3); } else if (s.substr(0, 4) == "uint") { diff --git a/src/lang/tensor.cc b/src/lang/tensor.cc index 5ed36287c54c..5db4f45e799f 100644 --- a/src/lang/tensor.cc +++ b/src/lang/tensor.cc @@ -16,7 +16,7 @@ Expr Tensor::operator()(Array indices) const { } Expr Tensor::operator()(Array indices) const { - using Halide::Internal::Call; + using HalideIR::Internal::Call; CHECK_EQ(ndim(), indices.size()) << "Tensor dimension mismatch in read" << "ndim = " << ndim() << ", indices.size=" << indices.size(); diff --git a/src/op/compute_op.cc b/src/op/compute_op.cc index 
11731361148d..9e7db1deea45 100644 --- a/src/op/compute_op.cc +++ b/src/op/compute_op.cc @@ -228,7 +228,7 @@ Stmt ComputeOpNode::BuildRealize( const std::unordered_map& realize_map, const Stmt& realize_body) const { CHECK_EQ(stage->op.get(), this); - Halide::Internal::Region bounds; + HalideIR::Internal::Region bounds; for (IterVar iv : this->axis) { bounds.push_back(realize_map.at(iv)); } diff --git a/src/op/extern_op.cc b/src/op/extern_op.cc index 68a51df32616..9b302f6e2504 100644 --- a/src/op/extern_op.cc +++ b/src/op/extern_op.cc @@ -113,7 +113,7 @@ Stmt ExternOpNode::BuildRealize( Stmt realize_body = body; for (int k = 0; k < num_outputs(); ++k) { Tensor t = stage->op.output(k); - Halide::Internal::Region bounds; + HalideIR::Internal::Region bounds; for (size_t i = 0; i < t->shape.size(); ++i) { bounds.push_back( Range::make_by_min_extent( diff --git a/src/op/scan_op.cc b/src/op/scan_op.cc index 48565b6eb6df..94e3a4aa6586 100644 --- a/src/op/scan_op.cc +++ b/src/op/scan_op.cc @@ -238,7 +238,7 @@ Stmt ScanOpNode::BuildRealize( for (size_t i = 0; i < update.size(); ++i) { Tensor t = stage->op.output(i); CHECK_EQ(static_cast(t->value_index), i); - Halide::Internal::Region bounds; + HalideIR::Internal::Region bounds; bounds.push_back(tdom); for (size_t k = 1; k < this->update[i]->shape.size(); ++k, ++sp_idx) { IterVar sp_ax = this->spatial_axis_[sp_idx]; diff --git a/src/pass/inject_prefetch.cc b/src/pass/inject_prefetch.cc index 6d903292a004..2749cb6a2db3 100644 --- a/src/pass/inject_prefetch.cc +++ b/src/pass/inject_prefetch.cc @@ -15,7 +15,7 @@ namespace ir { using arith::IntSet; using arith::DomainTouched; -using Halide::Internal::Region; +using HalideIR::Internal::Region; class PrefetchInjector : public IRMutator { public: diff --git a/src/pass/ir_deep_compare.cc b/src/pass/ir_deep_compare.cc index 9bb764129c36..8a1b09e49339 100644 --- a/src/pass/ir_deep_compare.cc +++ b/src/pass/ir_deep_compare.cc @@ -328,8 +328,8 @@ class IRDeepCompare : return order_; } - int CompareRegion(const Halide::Internal::Region& lhs, - const Halide::Internal::Region& rhs) { + int CompareRegion(const HalideIR::Internal::Region& lhs, + const HalideIR::Internal::Region& rhs) { if (order_ != 0) return order_; if (CompareValue(lhs.size(), rhs.size()) != 0) return order_; for (size_t i = 0; i < lhs.size(); ++i) { diff --git a/src/pass/ir_mutator.cc b/src/pass/ir_mutator.cc index 993b68f835d7..9ca9ccd190ff 100644 --- a/src/pass/ir_mutator.cc +++ b/src/pass/ir_mutator.cc @@ -206,7 +206,7 @@ Stmt IRMutator::Mutate_(const Provide* op, const Stmt& s) { Stmt IRMutator::Mutate_(const Realize* op, const Stmt& s) { IRMutator* m = this; - Halide::Internal::Region new_bounds; + HalideIR::Internal::Region new_bounds; bool bounds_changed = false; // Mutate the bounds @@ -236,7 +236,7 @@ Stmt IRMutator::Mutate_(const Realize* op, const Stmt& s) { Stmt IRMutator::Mutate_(const Prefetch* op, const Stmt& s) { IRMutator* m = this; - Halide::Internal::Region new_bounds; + HalideIR::Internal::Region new_bounds; bool bounds_changed = false; // Mutate the bounds diff --git a/src/pass/storage_flatten.cc b/src/pass/storage_flatten.cc index 46bed1fc9382..45ad081f86cb 100644 --- a/src/pass/storage_flatten.cc +++ b/src/pass/storage_flatten.cc @@ -22,7 +22,7 @@ namespace tvm { namespace ir { -using Halide::Internal::Region; +using HalideIR::Internal::Region; using runtime::StorageScope; using runtime::ThreadScope; using intrinsic::tvm_address_of; diff --git a/src/schedule/message_passing.cc b/src/schedule/message_passing.cc index 
969a18ee9469..a144e7fc40d1 100644 --- a/src/schedule/message_passing.cc +++ b/src/schedule/message_passing.cc @@ -407,7 +407,7 @@ void PassUpBoundCheck(const Stage& s, const Map& dom_map, std::unordered_map* p_state) { auto& state = *p_state; - using Halide::Internal::can_prove; + using HalideIR::Internal::can_prove; for (size_t i = s->relations.size(); i != 0; --i) { IterVarRelation rel = s->relations[i - 1]; if (rel.as()) { diff --git a/tests/cpp/ir_mutator_test.cc b/tests/cpp/ir_mutator_test.cc index 7a0739950bb5..fd5a60756f1c 100644 --- a/tests/cpp/ir_mutator_test.cc +++ b/tests/cpp/ir_mutator_test.cc @@ -4,8 +4,8 @@ namespace { using namespace tvm::ir; -using namespace Halide::Internal; -using namespace Halide; +using namespace HalideIR::Internal; +using namespace HalideIR; // replace variable to constant class IRVar2Const : public IRMutator { @@ -38,7 +38,7 @@ TVM_STATIC_IR_FUNCTOR(IRVar2Const, vtable_expr) } // namespace TEST(IRMutator, Basic) { - using namespace Halide::Internal; + using namespace HalideIR::Internal; using namespace tvm; Var x("x"), y; auto z = x + y; diff --git a/tests/cpp/ir_simplify_test.cc b/tests/cpp/ir_simplify_test.cc index e963bc9e991b..0667dc27367c 100644 --- a/tests/cpp/ir_simplify_test.cc +++ b/tests/cpp/ir_simplify_test.cc @@ -4,7 +4,7 @@ #include TEST(IRSIMPLIFY, Basic) { - using namespace Halide::Internal; + using namespace HalideIR::Internal; simplify_test(); } diff --git a/tests/cpp/ir_ssa_test.cc b/tests/cpp/ir_ssa_test.cc index 2de7dba080a3..97251eb5eeeb 100644 --- a/tests/cpp/ir_ssa_test.cc +++ b/tests/cpp/ir_ssa_test.cc @@ -5,7 +5,7 @@ TEST(IRSSA, Convert) { - using namespace Halide::Internal; + using namespace HalideIR::Internal; using namespace tvm; Var x("x"), y; Expr let = Let::make(x, 1, x + 1); @@ -17,7 +17,7 @@ TEST(IRSSA, Convert) { } TEST(IRSSA, Basic) { - using namespace Halide::Internal; + using namespace HalideIR::Internal; using namespace tvm; Var x("x"), y; auto z = Evaluate::make(x + y); diff --git a/tests/cpp/ir_visitor_test.cc b/tests/cpp/ir_visitor_test.cc index 0a649a09304c..930b0a273143 100644 --- a/tests/cpp/ir_visitor_test.cc +++ b/tests/cpp/ir_visitor_test.cc @@ -5,7 +5,7 @@ #include TEST(IRVisitor, CountVar) { - using namespace Halide::Internal; + using namespace HalideIR::Internal; using namespace tvm; int n_var = 0; Var x("x"), y; From 60bf936ef04652762af69e65b4eb8b1de15e99a9 Mon Sep 17 00:00:00 2001 From: Salem Derisavi <33945117+derisavi-huawei@users.noreply.github.com> Date: Mon, 18 Dec 2017 19:21:38 -0500 Subject: [PATCH 049/948] removed non-determinism from CanonicalSimplify (#704) * 1) removed non-determinism from CanonicalSimplify 2) added couple of testcases for CanonicalSimplify * Use IRDeepCompare instead of comparison of string representation * Give a warning (instead of fatal error) when two "ComExprEntry"s are equal --- src/arithmetic/canonical.cc | 10 +++++++++- tests/python/unittest/test_pass_simplify.py | 10 ++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/arithmetic/canonical.cc b/src/arithmetic/canonical.cc index 27ccfb09cdeb..e219b5541bdc 100644 --- a/src/arithmetic/canonical.cc +++ b/src/arithmetic/canonical.cc @@ -29,9 +29,17 @@ struct ComExprEntry { inline bool operator<(const ComExprEntry& other) const { if (level < other.level) return true; if (level > other.level) return false; + // compare top operator of entries and sort on that if possible (fast check) if (value.type_index() < other.value.type_index()) return true; if (value.type_index() > other.value.type_index()) 
return false; - return value.get() < other.value.get(); + // if none of the above distinguishes the terms, compare the expression tree of the entries. + // This is a slower check. + int compare_result = Compare(value, other.value); + if (compare_result < 0) return true; + if (compare_result > 0) return false; + // it's a problem if we see identical entries at this point. They should've been merged earlier. + LOG(WARNING) << "we should not have identical entries at this point"; + return false; } }; diff --git a/tests/python/unittest/test_pass_simplify.py b/tests/python/unittest/test_pass_simplify.py index 9105693b3835..29b5b3a8450d 100644 --- a/tests/python/unittest/test_pass_simplify.py +++ b/tests/python/unittest/test_pass_simplify.py @@ -43,6 +43,16 @@ def test_canonical(): ret = tvm.ir_pass.CanonicalSimplify(x / (z+z) - x / (z+z)) assert(tvm.ir_pass.Equal(ret, 0)) + #make sure terms are ordered based on their top operators (e.g., / always precedes %) + ret1 = tvm.ir_pass.CanonicalSimplify(x % 3 + x / 3) + ret2 = tvm.ir_pass.CanonicalSimplify(x / 3 + x % 3) + assert(tvm.ir_pass.Equal(ret1, ret2)) + + #when top operators match, compare string representation of terms + ret1 = tvm.ir_pass.CanonicalSimplify(x % 4 + x % 3) + ret2 = tvm.ir_pass.CanonicalSimplify(x % 3 + x % 4) + assert (tvm.ir_pass.Equal(ret1, ret2)) + if __name__ == "__main__": test_bound() test_basic() From befb252980fe41c0909e7339a45eeec4f8b376ed Mon Sep 17 00:00:00 2001 From: Salem Derisavi <33945117+derisavi-huawei@users.noreply.github.com> Date: Fri, 22 Dec 2017 04:31:53 -0500 Subject: [PATCH 050/948] During tensorize, call Simplify on algorithm and intrinsic definitions before CanonicalSimplify. This will prevent a number of false tensorize mismatches. (#718) thanks, this we can use this solution for now --- src/arithmetic/canonical.cc | 18 ++++++++++++++++++ src/op/tensorize.cc | 6 ++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/arithmetic/canonical.cc b/src/arithmetic/canonical.cc index e219b5541bdc..736b8dad78c7 100644 --- a/src/arithmetic/canonical.cc +++ b/src/arithmetic/canonical.cc @@ -648,6 +648,24 @@ T Simplify_(T a, Map vrange) { Expr Simplify(Expr a, Map vrange) { + // We should not pass an expression having a non-HalideIR op to + // Halide::Internal::simplify. Reduce op is the only such op at this time + // and it only appears as the top op in an expression. So we strip it + // first and send the sub-expressions to the simplifier. 
+ if (const Reduce* r = a.as()) { + Array new_source; + for (auto& e : r->source) { + new_source.push_back(Simplify_(e, vrange)); + } + Expr new_condition = Simplify_(r->condition, vrange); + if (r->source.same_as(new_source) && + r->condition.same_as(new_condition)) { + return a; + } else { + return Reduce::make( + r->combiner, new_source, r->axis, new_condition, r->value_index); + } + } return Simplify_(a, vrange); } diff --git a/src/op/tensorize.cc b/src/op/tensorize.cc index 243b7931da67..b4527f76e808 100644 --- a/src/op/tensorize.cc +++ b/src/op/tensorize.cc @@ -303,8 +303,10 @@ void VerifyTensorizeBody( CHECK_EQ(body.size(), intrin_compute->body.size()) << "Tensorize failed: body size mismatch"; for (size_t i = 0; i < body.size(); ++i) { - Expr lhs = CanonicalSimplify(body[i], compute_intrin_iter_space); - Expr rhs = CanonicalSimplify(intrin_compute->body[i], compute_intrin_iter_space); + Expr lhs = Simplify(body[i], compute_intrin_iter_space); + lhs = CanonicalSimplify(lhs, compute_intrin_iter_space); + Expr rhs = Simplify(intrin_compute->body[i], compute_intrin_iter_space); + rhs = CanonicalSimplify(rhs, compute_intrin_iter_space); if (lhs.type() != rhs.type()) { LOG(FATAL) << "Failed to match the data type with TensorIntrin " From b3de356f1bf0f268f870703cfc5ef02150d58180 Mon Sep 17 00:00:00 2001 From: Salem Derisavi <33945117+derisavi-huawei@users.noreply.github.com> Date: Fri, 22 Dec 2017 21:40:51 -0500 Subject: [PATCH 051/948] Added a regression test for #696 (#720) --- .../unittest/test_schedule_tensorize.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/python/unittest/test_schedule_tensorize.py b/tests/python/unittest/test_schedule_tensorize.py index d8553c025e2d..71ae493e51ae 100644 --- a/tests/python/unittest/test_schedule_tensorize.py +++ b/tests/python/unittest/test_schedule_tensorize.py @@ -126,7 +126,34 @@ def check_rfactor(factor, rfactor): check(16) check_rfactor(16, 16) +# This tests whether algorithm and intrinsics expressions are simplified +# as much as possible first and then checked for equality. See Issue #696 +def test_tensorize_op(): + def op_intrin(): + bh = 9 + bw = 9 + x = tvm.placeholder((5, 5), name='A') + y = tvm.compute((bh, bw), lambda i,j: x[j/3 + i%3, j%3+ i/3]) + + def intrin_func(ins, outs): + xx, = ins + zz = outs[0] + return tvm.call_packed("op", xx, zz) + + with tvm.build_config(offset_factor=2): + return tvm.decl_tensor_intrin(y.op, intrin_func) + + A = tvm.placeholder((5, 5), name='A') + B = tvm.compute((9,9), lambda i, j: A[j/3 + i%3, j%3 + i/3]) + bt = op_intrin() + s = tvm.create_schedule(B.op) + + x,y = B.op.axis + s[B].tensorize(x, bt) + s = s.normalize() + tvm.lower(s, [A, B]) if __name__ == "__main__": test_tensorize_vadd() test_tensorize_matmul() + test_tensorize_op() From 632d7b1ffced065cb5f15f6eb0582670cad624eb Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 23 Dec 2017 11:27:02 +0800 Subject: [PATCH 052/948] Update metal_module.mm --- src/runtime/metal/metal_module.mm | 1 - 1 file changed, 1 deletion(-) diff --git a/src/runtime/metal/metal_module.mm b/src/runtime/metal/metal_module.mm index 76926f75bd63..bf7f606c952d 100644 --- a/src/runtime/metal/metal_module.mm +++ b/src/runtime/metal/metal_module.mm @@ -22,7 +22,6 @@ namespace runtime { // Module to support thread-safe multi-GPU execution. 
-// cuModule is a per-GPU module // The runtime will contain a per-device module table // The modules will be lazily loaded class MetalModuleNode final :public runtime::ModuleNode { From e11b0e8a7268c76534305f5083b7b12902736124 Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Fri, 22 Dec 2017 23:20:16 -0800 Subject: [PATCH 053/948] Fix dependency problem of reducer condition (#712) (#721) * Make duplicated function name checker working * Fix dependency checking problem for reducer condition (#712); add test * Fix dependency checking problem for reducer condition (#712); add test * Specify R to be computed inlined --- .gitignore | 1 + src/pass/ir_visitor.cc | 1 + tests/python/integration/test_reduce.py | 4 +++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index f59a58552f8d..b645f6693b09 100644 --- a/.gitignore +++ b/.gitignore @@ -138,6 +138,7 @@ xcuserdata/ *.xcscmblueprint .DS_Store tags +cscope* # vim temporary files *.swp diff --git a/src/pass/ir_visitor.cc b/src/pass/ir_visitor.cc index 27add40f2bd9..12bad2bd5e86 100644 --- a/src/pass/ir_visitor.cc +++ b/src/pass/ir_visitor.cc @@ -134,6 +134,7 @@ DEFINE_BINOP_VISIT_(Or) void IRVisitor::Visit_(const Reduce* op) { VisitRDom(op->axis, this); VisitArray(op->source, this); + this->Visit(op->condition); } void IRVisitor::Visit_(const Cast* op) { diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py index 8f2f053c9138..95c8bc6f24e8 100644 --- a/tests/python/integration/test_reduce.py +++ b/tests/python/integration/test_reduce.py @@ -7,8 +7,9 @@ def test_prim(reducer, np_reducer): n = tvm.var('n') m = tvm.var('m') A = tvm.placeholder((n, m), name='A') + R = tvm.compute((n, ), lambda i: tvm.select((i > 1), 1, 0), name='R') k = tvm.reduce_axis((0, m)) - B = tvm.compute((n,), lambda i: reducer(A[i, k], axis=k, where=(i>1)), name='B') + B = tvm.compute((n,), lambda i: reducer(A[i, k], axis=k, where=(R[i]==1)), name='B') # schedule s = tvm.create_schedule(B.op) # create iter var and assign them tags. @@ -16,6 +17,7 @@ def test_prim(reducer, np_reducer): xo, xi = s[B].split(B.op.axis[0], factor=num_thread) s[B].bind(xo, tvm.thread_axis("blockIdx.x")) s[B].bind(xi, tvm.thread_axis("threadIdx.x")) + s[R].compute_inline() # one line to build the function. def check_device(device, host="stackvm"): From 3a000c6c3a3b67e54660bd986e22df27858dd047 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 24 Dec 2017 18:06:01 +0800 Subject: [PATCH 054/948] [CODEGEN] enable static handle cache (#723) --- include/tvm/ir.h | 5 ++++ python/tvm/intrin.py | 27 +++++++++++++++++++ src/codegen/llvm/codegen_cpu.cc | 19 ++++++++----- src/codegen/llvm/codegen_cpu.h | 1 + .../unittest/test_codegen_static_init.py | 26 +++++++++++++++++- 5 files changed, 71 insertions(+), 7 deletions(-) diff --git a/include/tvm/ir.h b/include/tvm/ir.h index 5b8b56be7db6..95e01382dd98 100644 --- a/include/tvm/ir.h +++ b/include/tvm/ir.h @@ -256,6 +256,11 @@ constexpr const char* tvm_if_then_else = "tvm_if_then_else"; * } */ constexpr const char* tvm_access_ptr = "tvm_access_ptr"; +/*! + * \brief Create a function local static handle that iniitalizes to nullptr. + * can be used to cache function local static resources. + */ +constexpr const char* tvm_static_handle = "tvm_static_handle"; /*! * \brief Return a unique context id, used for hint of workspace separation. * Different context id ganrantees not having overlapping workspace. 
diff --git a/python/tvm/intrin.py b/python/tvm/intrin.py index e13a1162bb0b..f8f65e25aa68 100644 --- a/python/tvm/intrin.py +++ b/python/tvm/intrin.py @@ -80,6 +80,33 @@ def call_pure_intrin(dtype, func_name, *args): dtype, func_name, convert(args), _Call.PureIntrinsic, None, 0) +def call_intrin(dtype, func_name, *args): + """Build expression by calling an intrinsic function. + + Intrinsics can be overloaded with multiple data types via + the intrinsic translation rule. + + Parameters + ---------- + dtype : str + The data type of the result. + + func_name: str + The intrinsic function name. + + args : list + Positional arguments. + + Returns + ------- + call : Expr + The call expression. + """ + args = convert(args) + return _make.Call( + dtype, func_name, convert(args), _Call.Intrinsic, None, 0) + + def call_pure_extern(dtype, func_name, *args): """Build expression by calling a pure extern function. diff --git a/src/codegen/llvm/codegen_cpu.cc b/src/codegen/llvm/codegen_cpu.cc index ea5d90bccc1f..876d1e12d8a2 100644 --- a/src/codegen/llvm/codegen_cpu.cc +++ b/src/codegen/llvm/codegen_cpu.cc @@ -419,6 +419,16 @@ void CodeGenCPU::CreateParallelLaunch(const Stmt& body, int num_task) { builder_->SetInsertPoint(par_launch_end); } +llvm::Value* CodeGenCPU::CreateStaticHandle() { + llvm::GlobalVariable* gv = new llvm::GlobalVariable( + *module_, t_void_p_, false, + llvm::GlobalValue::PrivateLinkage, 0, + "__tvm_static_handle"); + gv->setAlignment(data_layout_->getTypeAllocSize(t_void_p_)); + gv->setInitializer(llvm::Constant::getNullValue(t_void_p_)); + return gv; +} + void CodeGenCPU::CreateStaticInit(const std::string& init_fname, const Stmt& body) { using llvm::BasicBlock; // closure data @@ -426,12 +436,7 @@ void CodeGenCPU::CreateStaticInit(const std::string& init_fname, const Stmt& bod ftype_tvm_static_init_callback_, llvm::Function::PrivateLinkage, "__tvm_static_init_lambda", module_.get()); - llvm::GlobalVariable* gv = new llvm::GlobalVariable( - *module_, t_void_p_, false, - llvm::GlobalValue::PrivateLinkage, 0, - "__tvm_static_handle"); - gv->setAlignment(data_layout_->getTypeAllocSize(t_void_p_)); - gv->setInitializer(llvm::Constant::getNullValue(t_void_p_)); + llvm::Value* gv = CreateStaticHandle(); llvm::Function* finit = module_->getFunction(init_fname); if (finit == nullptr) { finit = llvm::Function::Create( @@ -599,6 +604,8 @@ void CodeGenCPU::AddStartupFunction() { llvm::Value* CodeGenCPU::CreateIntrinsic(const Call* op) { if (op->is_intrinsic(intrinsic::tvm_call_packed_lowered)) { return CreateCallPacked(op); + } else if (op->is_intrinsic(intrinsic::tvm_static_handle)) { + return CreateStaticHandle(); } else if (op->is_intrinsic(intrinsic::tvm_throw_last_error)) { builder_->CreateRet(ConstInt32(-1)); return ConstInt32(-1); diff --git a/src/codegen/llvm/codegen_cpu.h b/src/codegen/llvm/codegen_cpu.h index 702d8777d50b..5027dab911bd 100644 --- a/src/codegen/llvm/codegen_cpu.h +++ b/src/codegen/llvm/codegen_cpu.h @@ -72,6 +72,7 @@ class CodeGenCPU : public CodeGenLLVM { llvm::Value* RuntimeTVMAPISetLastError(); llvm::Value* RuntimeTVMParallelLaunch(); llvm::Value* RuntimeTVMParallelBarrier(); + llvm::Value* CreateStaticHandle(); llvm::Value* GetPackedFuncHandle(const std::string& str); llvm::Value* PackClosureData(const Array& fields, uint64_t *num_bytes); llvm::Value* CreateStructRefPtr(Type t, llvm::Value* buffer, llvm::Value* index, int kind); diff --git a/tests/python/unittest/test_codegen_static_init.py b/tests/python/unittest/test_codegen_static_init.py index 
ecb7d82df6cd..1a03de9ee000 100644 --- a/tests/python/unittest/test_codegen_static_init.py +++ b/tests/python/unittest/test_codegen_static_init.py @@ -1,7 +1,8 @@ import tvm +import ctypes import numpy as np -def test_static_init(): +def test_static_callback(): dtype = 'int64' n = tvm.var('n') Ab = tvm.decl_buffer((n, ), dtype) @@ -22,6 +23,29 @@ def test_static_init(): f(a) np.testing.assert_equal(a.asnumpy(), np.ones(a.shape[0])) +def test_static_init(): + dtype = 'int64' + n = tvm.var('n') + Ab = tvm.decl_buffer((n, ), dtype) + i = tvm.var('i') + ib = tvm.ir_builder.create() + handle = tvm.call_intrin("handle", "tvm_static_handle") + ib.emit( + tvm.call_packed("test_static_callback", handle, Ab)) + + @tvm.register_func("test_static_callback") + def test_cb(sh, A): + assert isinstance(sh, ctypes.c_void_p) + return sh + + stmt = ib.get() + fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True) + fapi = tvm.ir_pass.LowerTVMBuiltin(fapi) + f = tvm.codegen.build_module(fapi, "llvm") + a = tvm.nd.array(np.zeros(10, dtype=dtype)) + f(a) + if __name__ == "__main__": + test_static_callback() test_static_init() From d218c5fc0072495b41017af5a83c91e79f97ce22 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 24 Dec 2017 19:09:57 +0800 Subject: [PATCH 055/948] [CODEGEN] update codegen for vector operation (#711) * [CODEGEN] update codegen for vector operation * update comment, fix for metal --- src/codegen/codegen_c.cc | 18 +++++++++++++++++- src/codegen/codegen_cuda.cc | 15 +++++++++++++++ src/codegen/codegen_cuda.h | 1 + 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc index b315707b52c5..1f9890d9a271 100644 --- a/src/codegen/codegen_c.cc +++ b/src/codegen/codegen_c.cc @@ -567,6 +567,10 @@ void CodeGenC::VisitExpr_(const Load* op, std::ostream& os) { // NOLINT(*) std::string ref = GetVecLoad(op->type, op->buffer_var.get(), base); os << ref; } else { + // The assignment below introduces side-effect, and the resulting value cannot + // be reused across multiple expression, thus a new scope is needed + int vec_scope = BeginScope(); + // load seperately. 
std::string svalue = GetUniqueName("_"); this->PrintIndent(); @@ -590,6 +594,7 @@ void CodeGenC::VisitExpr_(const Load* op, std::ostream& os) { // NOLINT(*) PrintVecElemStore(svalue, op->type, i, value_temp.str()); } os << svalue; + EndScope(vec_scope); } } } @@ -609,6 +614,10 @@ void CodeGenC::VisitStmt_(const Store* op) { std::string value = this->PrintExpr(op->value); this->PrintVecStore(op->buffer_var.get(), t, base, value); } else { + // The assignment below introduces side-effect, and the resulting value cannot + // be reused across multiple expression, thus a new scope is needed + int vec_scope = BeginScope(); + // store elements seperately std::string index = SSAGetID(PrintExpr(op->index), op->index.type()); std::string value = SSAGetID(PrintExpr(op->value), op->value.type()); @@ -629,6 +638,7 @@ void CodeGenC::VisitStmt_(const Store* op) { PrintVecElemLoad(value, op->value.type(), i, stream); stream << ";\n"; } + EndScope(vec_scope); } } } @@ -642,7 +652,13 @@ void CodeGenC::VisitExpr_(const Let* op, std::ostream& os) { // NOLINT(*) } void CodeGenC::VisitExpr_(const Ramp* op, std::ostream& os) { // NOLINT(*) - LOG(FATAL) << "Ramp: not supported "; + os << "((int" << op->lanes << ")("; + for (int i = 0; i < op->lanes; i++) { + os << "(" << PrintExpr(op->base) << ")" << "+(" << PrintExpr(op->stride) << "*" << i <<")"; + if (i != op->lanes - 1) + os << ", "; + } + os << "))"; } void CodeGenC::VisitExpr_(const Broadcast* op, std::ostream& os) { // NOLINT(*) diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc index cace8d8573bb..854bdd993127 100644 --- a/src/codegen/codegen_cuda.cc +++ b/src/codegen/codegen_cuda.cc @@ -120,6 +120,10 @@ void CodeGenCUDA::PrintVecBinaryOp( int lanes = t.lanes(); { + // The assignment below introduces side-effect, and the resulting value cannot + // be reused across multiple expression, thus a new scope is needed + int vec_scope = BeginScope(); + // default: unpack into individual ops. 
std::string vlhs = SSAGetID(PrintExpr(lhs), lhs.type()); std::string vrhs = SSAGetID(PrintExpr(rhs), rhs.type()); @@ -148,6 +152,7 @@ void CodeGenCUDA::PrintVecBinaryOp( PrintVecElemStore(sret, t, i, value_temp.str()); } os << sret; + EndScope(vec_scope); } } @@ -232,6 +237,16 @@ void CodeGenCUDA::VisitStmt_(const Evaluate *op) { } } +void CodeGenCUDA::VisitExpr_(const Ramp* op, std::ostream& os) { + os << "((make_int" << op->lanes << ")("; + for (int i = 0; i < op->lanes; i++) { + os << "(" << PrintExpr(op->base) << ")" << "+(" << PrintExpr(op->stride) << "*" << i <<")"; + if (i != op->lanes - 1) + os << ", "; + } + os << "))"; +} + void CodeGenCUDA::VisitExpr_(const Broadcast* op, std::ostream& os) { // NOLINT(*) std::string v = PrintExpr(op->value); os << "make_"; diff --git a/src/codegen/codegen_cuda.h b/src/codegen/codegen_cuda.h index 9aa72d14cb06..b84b245860e0 100644 --- a/src/codegen/codegen_cuda.h +++ b/src/codegen/codegen_cuda.h @@ -33,6 +33,7 @@ class CodeGenCUDA final : public CodeGenC { const std::string& vec, Type t, int i, const std::string& value) final; void BindThreadIndex(const IterVar& iv) final; // NOLINT(*) // overload visitor + void VisitExpr_(const Ramp* op, std::ostream& os) final; // NOLINT(*) void VisitExpr_(const Broadcast* op, std::ostream& os) final; // NOLINT(*) void VisitStmt_(const Evaluate *op) final; From da39a1c1255008348471c678b2399f2bb9b5e26c Mon Sep 17 00:00:00 2001 From: masahi Date: Mon, 25 Dec 2017 00:23:55 +0900 Subject: [PATCH 056/948] [ROCM] MIOpen contrib for convolution kernels (#722) * fist working miopen support * do FindFwdAlgo during build time * fix lint * update doc string * import topi after checking if rocm is enabled * add miopen namespace * fixed descriptor overwrite bug * add use_miopen option * fix lint * better miopen option handling * fix typo * fix options handling --- Makefile | 1 + make/config.mk | 3 + make/contrib/miopen.mk | 8 + python/tvm/contrib/miopen.py | 102 ++++++++++++ python/tvm/target.py | 11 +- src/contrib/miopen/conv_forward.cc | 221 ++++++++++++++++++++++++++ src/contrib/miopen/miopen_utils.cc | 78 +++++++++ src/contrib/miopen/miopen_utils.h | 59 +++++++ tests/python/contrib/test_miopen.py | 64 ++++++++ tests/scripts/task_python_unittest.sh | 2 +- 10 files changed, 545 insertions(+), 4 deletions(-) create mode 100644 make/contrib/miopen.mk create mode 100644 python/tvm/contrib/miopen.py create mode 100644 src/contrib/miopen/conv_forward.cc create mode 100644 src/contrib/miopen/miopen_utils.cc create mode 100644 src/contrib/miopen/miopen_utils.h create mode 100644 tests/python/contrib/test_miopen.py diff --git a/Makefile b/Makefile index 31c8dac658fc..4a6ac7f3c8f3 100644 --- a/Makefile +++ b/Makefile @@ -134,6 +134,7 @@ include make/contrib/cblas.mk include make/contrib/random.mk include make/contrib/nnpack.mk include make/contrib/cudnn.mk +include make/contrib/miopen.mk include make/contrib/mps.mk ifdef ADD_CFLAGS diff --git a/make/config.mk b/make/config.mk index 94153edc38f6..837db40ebff3 100644 --- a/make/config.mk +++ b/make/config.mk @@ -72,5 +72,8 @@ USE_NNPACK = 0 # Whether use CuDNN USE_CUDNN = 0 +# Whether use MIOpen +USE_MIOPEN = 0 + # Whether use MPS USE_MPS = 0 diff --git a/make/contrib/miopen.mk b/make/contrib/miopen.mk new file mode 100644 index 000000000000..10ed6cba0801 --- /dev/null +++ b/make/contrib/miopen.mk @@ -0,0 +1,8 @@ +MIOPEN_CONTRIB_SRC = $(wildcard src/contrib/miopen/*.cc) +MIOPEN_CONTRIB_OBJ = $(patsubst src/%.cc, build/%.o, $(MIOPEN_CONTRIB_SRC)) + +ifeq ($(USE_MIOPEN), 1) +CFLAGS += 
-DTVM_USE_MIOPEN=1 +ADD_LDFLAGS += -lMIOpen +RUNTIME_DEP += $(MIOPEN_CONTRIB_OBJ) +endif diff --git a/python/tvm/contrib/miopen.py b/python/tvm/contrib/miopen.py new file mode 100644 index 000000000000..b76e70688c4e --- /dev/null +++ b/python/tvm/contrib/miopen.py @@ -0,0 +1,102 @@ +"""External function interface to MIOpen library.""" +# pylint: disable-msg=C0103 +import ctypes +import numpy as np +from .. import api as _api +from .. import intrin as _intrin +from .. import get_global_func as _get_global_func + + +def _get_np_int32_array_handle(arr): + """Return a void_p handle for a numpy array + + Parameters + ---------- + arr: numpy.NDArray + source numpy array + + Returns + ------- + ptr: ctypes.c_void_p + pointer to the data + """ + assert arr.dtype == np.int32 + ptr = arr.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)) + return ctypes.cast(ptr, ctypes.c_void_p) + + +def conv2d_forward(x, + w, + stride_h=1, + stride_w=1, + pad_h=0, + pad_w=0, + dilation_h=1, + dilation_w=1, + conv_mode=0): + """Create an extern op that compute 2D convolution with MIOpen + + Parameters + ---------- + x: Tensor + input feature map + w: Tensor + convolution weight + stride_h: int + height stride + stride_w: int + width stride + pad_h: int + height pad + pad_w: int + weight pad + dilation_h: int + height dilation + dilation_w: int + width dilation + conv_mode: int + 0: miopenConvolution + 1: miopenTranspose + + Returns + ------- + y: Tensor + The result tensor + """ + assert conv_mode == 0, "Transpose convolutions not supported yet." + oshape = np.zeros((len(x.shape)), dtype=np.int32) + xshape = x.shape + wshape = w.shape + setup_func = _get_global_func("tvm.contrib.miopen.conv2d.setup") + algo = setup_func(conv_mode, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + xshape[0].value, + xshape[1].value, + xshape[2].value, + xshape[3].value, + wshape[0].value, + wshape[1].value, + wshape[2].value, + wshape[3].value, + _get_np_int32_array_handle(oshape)) + + return _api.extern( + list(oshape), [x, w], + lambda ins, outs: _intrin.call_packed( + "tvm.contrib.miopen.conv2d.forward", + conv_mode, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + algo, + ins[0], + ins[1], + outs[0]), name="y") diff --git a/python/tvm/target.py b/python/tvm/target.py index 1bcd1de7d3d9..096459c1ac83 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -88,12 +88,17 @@ def __init__(self, target_name, options=None): self.target_name = target_name - self.options = _merge_opts([], options) + self.options = [] self.device_name = "" + self.libs = [] # Parse device option - for item in self.options: - if item.startswith("-device="): + for item in _merge_opts([], options): + if item.startswith("-libs="): + self.libs.append(item.split("=")[1]) + continue + elif item.startswith("-device="): self.device_name = item.split("=")[1] + self.options.append(item) # Target query searchs device name first if self.device_name: self.keys = (self.device_name,) diff --git a/src/contrib/miopen/conv_forward.cc b/src/contrib/miopen/conv_forward.cc new file mode 100644 index 000000000000..d85c08dee36c --- /dev/null +++ b/src/contrib/miopen/conv_forward.cc @@ -0,0 +1,221 @@ +/*! 
+ * Copyright (c) 2017 by Contributors + * \file Use external miopen utils function + */ +#include +#include +#include +#include "miopen_utils.h" + +namespace tvm { +namespace contrib { +namespace miopen { + +using namespace runtime; + +TVM_REGISTER_GLOBAL("tvm.contrib.miopen.conv2d.setup") +.set_body([](TVMArgs args, TVMRetValue *ret) { + const int mode = args[0]; + const int pad_h = args[1]; + const int pad_w = args[2]; + const int stride_h = args[3]; + const int stride_w = args[4]; + const int dilation_h = args[5]; + const int dilation_w = args[6]; + const int x_dim0 = args[7]; + const int x_dim1 = args[8]; + const int x_dim2 = args[9]; + const int x_dim3 = args[10]; + const int w_dim0 = args[11]; + const int w_dim1 = args[12]; + const int w_dim2 = args[13]; + const int w_dim3 = args[14]; + void *out_shape = args[15]; + + MIOpenThreadEntry* entry_ptr = MIOpenThreadEntry::ThreadLocal(); + // Set Mode + entry_ptr->conv_entry.mode = static_cast(mode); + // Set Ctx + entry_ptr->conv_entry.ctx = TVMContext{kDLROCM, 0}; + // Set Data Type + entry_ptr->conv_entry.data_type = miopenFloat; // MIOpen only suppports fp32 + // Set Desc + MIOPEN_CALL(miopenInitConvolutionDescriptor(entry_ptr->conv_entry.conv_desc, + entry_ptr->conv_entry.mode, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w)); + // Set Filter + MIOPEN_CALL(miopenSet4dTensorDescriptor(entry_ptr->conv_entry.filter_desc, + entry_ptr->conv_entry.data_type, + w_dim0, + w_dim1, + w_dim2, + w_dim3)); + // Set Input + MIOPEN_CALL(miopenSet4dTensorDescriptor(entry_ptr->conv_entry.input_desc, + entry_ptr->conv_entry.data_type, + x_dim0, + x_dim1, + x_dim2, + x_dim3)); + + // Set Output shape + MIOPEN_CALL(miopenGetConvolutionForwardOutputDim(entry_ptr->conv_entry.conv_desc, + entry_ptr->conv_entry.input_desc, + entry_ptr->conv_entry.filter_desc, + static_cast(out_shape), + static_cast(out_shape) + 1, + static_cast(out_shape) + 2, + static_cast(out_shape) + 3)); + + const int *oshape = static_cast(out_shape); + // Set Output + MIOPEN_CALL(miopenSet4dTensorDescriptor(entry_ptr->conv_entry.output_desc, + entry_ptr->conv_entry.data_type, + oshape[0], + oshape[1], + oshape[2], + oshape[3])); + + // Set workspace + size_t workspace_size = 0; + MIOPEN_CALL(miopenConvolutionForwardGetWorkSpaceSize(entry_ptr->handle, + entry_ptr->conv_entry.filter_desc, + entry_ptr->conv_entry.input_desc, + entry_ptr->conv_entry.conv_desc, + entry_ptr->conv_entry.output_desc, + &workspace_size)); + entry_ptr->conv_entry.UpdateWorkspace(workspace_size); + + const size_t input_size = x_dim0 * x_dim1 * x_dim2 * x_dim3; + const size_t filter_size = w_dim0 * w_dim1 * w_dim2 * w_dim3; + const size_t output_size = oshape[0] * oshape[1] * oshape[2] * oshape[3]; + + runtime::DeviceAPI* rocm_api = entry_ptr->conv_entry.rocm_api; + float* input_buf = static_cast(rocm_api->AllocWorkspace(entry_ptr->conv_entry.ctx, + input_size * sizeof(float))); + float* filter_buf = static_cast(rocm_api->AllocWorkspace(entry_ptr->conv_entry.ctx, + filter_size * sizeof(float))); + float* output_buf = static_cast(rocm_api->AllocWorkspace(entry_ptr->conv_entry.ctx, + output_size * sizeof(float))); + + const int request_algo_count = 4; + const bool exhaustive_search = false; + int returned_algo_count = 0; + miopenConvAlgoPerf_t perfs[4]; + + MIOPEN_CALL(miopenFindConvolutionForwardAlgorithm(entry_ptr->handle, + entry_ptr->conv_entry.input_desc, + input_buf, + entry_ptr->conv_entry.filter_desc, + filter_buf, + entry_ptr->conv_entry.conv_desc, + entry_ptr->conv_entry.output_desc, 
+ output_buf, + request_algo_count, + &returned_algo_count, + perfs, + entry_ptr->conv_entry.workspace, + workspace_size, + exhaustive_search)); + + rocm_api->FreeWorkspace(entry_ptr->conv_entry.ctx, input_buf); + rocm_api->FreeWorkspace(entry_ptr->conv_entry.ctx, filter_buf); + rocm_api->FreeWorkspace(entry_ptr->conv_entry.ctx, output_buf); + + const std::vector fwd_algo_names{ + "miopenConvolutionFwdAlgoGEMM", + "miopenConvolutionFwdAlgoDirect", + "miopenConvolutionFwdAlgoFFT", + "miopenConvolutionFwdAlgoWinograd", + }; + const auto best_algo = perfs[0].fwd_algo; + LOG(INFO) << "\tMIOpen Found " << returned_algo_count + << " fwd algorithms, choosing " << fwd_algo_names[best_algo]; + for (int i = 0; i < returned_algo_count; ++i) { + LOG(INFO) << "\t\t" << i << ") " << fwd_algo_names[perfs[i].fwd_algo] + << " - time: " << perfs[i].time << " ms" + << ", Memory: " << perfs[i].memory; + } + // Set Algo + ret[0] = static_cast(best_algo); +}); + + +TVM_REGISTER_GLOBAL("tvm.contrib.miopen.conv2d.forward") +.set_body([](TVMArgs args, TVMRetValue *ret) { + const int mode = args[0]; + const int pad_h = args[1]; + const int pad_w = args[2]; + const int stride_h = args[3]; + const int stride_w = args[4]; + const int dilation_h = args[5]; + const int dilation_w = args[6]; + const int algo = args[7]; + const DLTensor *x = args[8]; + const DLTensor *w = args[9]; + const DLTensor *y = args[10]; + + MIOpenThreadEntry* entry_ptr = MIOpenThreadEntry::ThreadLocal(); + entry_ptr->conv_entry.fwd_algo = static_cast(algo); + // Set Mode + entry_ptr->conv_entry.mode = static_cast(mode); + // Set Ctx + entry_ptr->conv_entry.ctx = x->ctx; + // Set Data Type + entry_ptr->conv_entry.data_type = miopenFloat; // MIOpen only suppports fp32 + // Set Desc + MIOPEN_CALL(miopenInitConvolutionDescriptor(entry_ptr->conv_entry.conv_desc, + entry_ptr->conv_entry.mode, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w)); + // Set Filter + MIOPEN_CALL(miopenSet4dTensorDescriptor(entry_ptr->conv_entry.filter_desc, + entry_ptr->conv_entry.data_type, + w->shape[0], + w->shape[1], + w->shape[2], + w->shape[3])); + // Set Input + MIOPEN_CALL(miopenSet4dTensorDescriptor(entry_ptr->conv_entry.input_desc, + entry_ptr->conv_entry.data_type, + x->shape[0], + x->shape[1], + x->shape[2], + x->shape[3])); + // Set Output + MIOPEN_CALL(miopenSet4dTensorDescriptor(entry_ptr->conv_entry.output_desc, + entry_ptr->conv_entry.data_type, + y->shape[0], + y->shape[1], + y->shape[2], + y->shape[3])); + + const float alpha = 1.f; + const float beta = 0.f; + MIOPEN_CALL(miopenConvolutionForward(entry_ptr->handle, + &alpha, + entry_ptr->conv_entry.input_desc, + x->data, + entry_ptr->conv_entry.filter_desc, + w->data, + entry_ptr->conv_entry.conv_desc, + entry_ptr->conv_entry.fwd_algo, + &beta, + entry_ptr->conv_entry.output_desc, + y->data, + entry_ptr->conv_entry.workspace, + entry_ptr->conv_entry.workspace_size)); +}); + +} // namespace miopen +} // namespace contrib +} // namespace tvm diff --git a/src/contrib/miopen/miopen_utils.cc b/src/contrib/miopen/miopen_utils.cc new file mode 100644 index 000000000000..3019b1b83e81 --- /dev/null +++ b/src/contrib/miopen/miopen_utils.cc @@ -0,0 +1,78 @@ +/*! 
+ * Copyright (c) 2017 by Contributors + * \file Use external miopen utils function + */ +#include "miopen_utils.h" +#include +#include +#include +#include + +namespace tvm { +namespace contrib { +namespace miopen { + +std::string miopenGetErrorString(int error_code) { + const std::vector mio_err{ + "StatusSuccess ", "StatusNotInitialized ", "StatusInvalidValue ", + "StatusBadParm ", "StatusAllocFailed ", "StatusInternalError ", + "StatusNotImplemented ", "StatusUnknownError "}; + return mio_err[error_code]; +} + +// MiopenThreadEntry +MIOpenThreadEntry::MIOpenThreadEntry() { + auto stream = runtime::ROCMThreadEntry::ThreadLocal()->stream; + auto func = runtime::Registry::Get("device_api.rocm"); + void *ret = (*func)(); + rocm_api = static_cast(ret); + MIOPEN_CALL(miopenCreate(&handle)); + MIOPEN_CALL(miopenSetStream(handle, stream)); + conv_entry.rocm_api = rocm_api; +} + +MIOpenThreadEntry::~MIOpenThreadEntry() { + MIOPEN_CALL(miopenDestroy(handle)); +} + +typedef dmlc::ThreadLocalStore MIOpenThreadStore; + +MIOpenThreadEntry* MIOpenThreadEntry::ThreadLocal() { + return MIOpenThreadStore::Get(); +} + +// ConvEntry + +ConvEntry::ConvEntry() { + MIOPEN_CALL(miopenCreateConvolutionDescriptor(&conv_desc)); + MIOPEN_CALL(miopenCreateTensorDescriptor(&filter_desc)); + MIOPEN_CALL(miopenCreateTensorDescriptor(&input_desc)); + MIOPEN_CALL(miopenCreateTensorDescriptor(&output_desc)); +} + +ConvEntry::~ConvEntry() { + MIOPEN_CALL(miopenDestroyConvolutionDescriptor(conv_desc)); + MIOPEN_CALL(miopenDestroyTensorDescriptor(filter_desc)); + MIOPEN_CALL(miopenDestroyTensorDescriptor(input_desc)); + MIOPEN_CALL(miopenDestroyTensorDescriptor(output_desc)); + CleanWorkspace(); +} + +void ConvEntry::UpdateWorkspace(const size_t wsize) { + if (workspace_size < wsize) { + if (workspace != nullptr) { + CleanWorkspace(); + } + workspace_size = wsize; + workspace = rocm_api->AllocWorkspace(ctx, workspace_size); + } +} + +void ConvEntry::CleanWorkspace() { + if (workspace) rocm_api->FreeWorkspace(ctx, workspace); + workspace_size = 0; +} + +} // namespace miopen +} // namespace contrib +} // namespace tvm diff --git a/src/contrib/miopen/miopen_utils.h b/src/contrib/miopen/miopen_utils.h new file mode 100644 index 000000000000..b01fc017ed95 --- /dev/null +++ b/src/contrib/miopen/miopen_utils.h @@ -0,0 +1,59 @@ +/*! 
+ * Copyright (c) 2017 by Contributors + * \file Use external miopen utils function + */ + +#ifndef TVM_CONTRIB_MIOPEN_MIOPEN_UTILS_H_ +#define TVM_CONTRIB_MIOPEN_MIOPEN_UTILS_H_ + +#include +#include +#include +#include +#include "../../runtime/rocm/rocm_common.h" + +namespace tvm { +namespace contrib { +namespace miopen { + +std::string miopenGetErrorString(int error_code); + +#define MIOPEN_CALL(func) \ + { \ + miopenStatus_t e = (func); \ + CHECK_EQ(e, miopenStatusSuccess) \ + << "miopen error: " << miopenGetErrorString(e); \ + } + +struct ConvEntry { + miopenConvolutionDescriptor_t conv_desc; + miopenConvolutionMode_t mode{miopenConvolution}; + miopenTensorDescriptor_t filter_desc; + miopenDataType_t data_type{miopenFloat}; + miopenTensorDescriptor_t input_desc; + miopenTensorDescriptor_t output_desc; + miopenConvFwdAlgorithm_t fwd_algo; + TVMContext ctx; + runtime::DeviceAPI *rocm_api; + void *workspace{nullptr}; + size_t workspace_size{0}; + ConvEntry(); + ~ConvEntry(); + void UpdateWorkspace(const size_t wsize); + void CleanWorkspace(); +}; // ConvThreadEntry + +struct MIOpenThreadEntry { + MIOpenThreadEntry(); + ~MIOpenThreadEntry(); + miopenHandle_t handle{nullptr}; + ConvEntry conv_entry; + runtime::DeviceAPI *rocm_api{nullptr}; + static MIOpenThreadEntry *ThreadLocal(); +}; // MIOpenThreadEntry + +} // namespace miopen +} // namespace contrib +} // namespace tvm + +#endif // TVM_CONTRIB_MIOPEN_MIOPEN_UTILS_H_ diff --git a/tests/python/contrib/test_miopen.py b/tests/python/contrib/test_miopen.py new file mode 100644 index 000000000000..51e50f62db94 --- /dev/null +++ b/tests/python/contrib/test_miopen.py @@ -0,0 +1,64 @@ +import tvm +from tvm.contrib import miopen +import numpy as np + + +def test_conv2d(): + in_channel = 64 + out_channel = 128 + filter_h = 3 + filter_w = 3 + pad_h = 1 + pad_w = 1 + stride_h = 1 + stride_w = 1 + dilation_h = 1 + dilation_w = 1 + + xshape = [1, in_channel, 64, 64] + if not tvm.module.enabled("rocm"): + print("skip because rocm is not enabled...") + return + if not tvm.get_global_func("tvm.contrib.miopen.conv2d.setup", True): + print("skip because miopen is not enabled...") + return + wshape = (out_channel, in_channel, filter_h, filter_w) + + X = tvm.placeholder(xshape, name='X') + W = tvm.placeholder(wshape, name='W') + Y = miopen.conv2d_forward(X, + W, + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w, + conv_mode=0) + + yshape = [x.value for x in Y.shape] + s = tvm.create_schedule(Y.op) + + def verify(): + ctx = tvm.rocm(0) + f = tvm.build(s, [X, W, Y], "rocm", target_host="llvm", name="conv2d") + x = tvm.nd.array(np.random.uniform(-1, 1, xshape).astype(np.float32), ctx) + w = tvm.nd.array(np.random.uniform(-1, 1, wshape).astype(np.float32), ctx) + y = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), ctx) + f(x, w, y) + + import topi + Y_ref = topi.nn.conv2d_nchw(X, W, (stride_h, stride_w), (pad_h, pad_w)) + with tvm.target.rocm(): + s_ref = topi.generic.schedule_conv2d_nchw([Y_ref]) + f_ref = tvm.build(s_ref, [X, W, Y_ref], "rocm") + y_ref = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), ctx) + f_ref(x, w, y_ref) + print("Max abs diff:", np.max(np.abs(y.asnumpy() - y_ref.asnumpy()))) + np.testing.assert_allclose(y.asnumpy(), y_ref.asnumpy(), atol=1e-3) + + verify() + + +if __name__ == "__main__": + test_conv2d() diff --git a/tests/scripts/task_python_unittest.sh b/tests/scripts/task_python_unittest.sh index a77dc989ebe5..f0331609479c 100755 --- a/tests/scripts/task_python_unittest.sh +++ 
b/tests/scripts/task_python_unittest.sh @@ -1,6 +1,6 @@ #!/bin/bash -export PYTHONPATH=python +export PYTHONPATH=python:topi/python rm -rf python/tvm/*.pyc python/tvm/*/*.pyc From 377242a97fcf624d2b66112005018f61e36d6fcf Mon Sep 17 00:00:00 2001 From: Yuwei Hu Date: Mon, 25 Dec 2017 09:18:14 +0800 Subject: [PATCH 057/948] [TOPI] 1bit dense operator on x86_64 (#629) * add x86_64 target * add binary dense operator * rebase * improve schedule * remove x86 target * improve schedule --- topi/python/topi/generic/nn.py | 36 ++++++++++++ topi/python/topi/nn/__init__.py | 1 + topi/python/topi/nn/bnn.py | 79 +++++++++++++++++++++++++++ topi/python/topi/x86/__init__.py | 2 + topi/python/topi/x86/binarize_pack.py | 38 +++++++++++++ topi/python/topi/x86/binary_dense.py | 55 +++++++++++++++++++ topi/tests/python/test_topi_bnn.py | 55 +++++++++++++++++++ 7 files changed, 266 insertions(+) create mode 100644 topi/python/topi/nn/bnn.py create mode 100644 topi/python/topi/x86/binarize_pack.py create mode 100644 topi/python/topi/x86/binary_dense.py create mode 100644 topi/tests/python/test_topi_bnn.py diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 2cb64407c88e..d606213a5270 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -176,3 +176,39 @@ def schedule_global_pool(outs): The computation schedule for the op. """ return _default_schedule(outs, False) + + +@tvm.target.generic_func +def schedule_binarize_pack(outs): + """Schedule for binarize_pack + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of binarize_pack + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + + +@tvm.target.generic_func +def schedule_binary_dense(outs): + """Schedule for binary_dense + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of binary_dense + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) diff --git a/topi/python/topi/nn/__init__.py b/topi/python/topi/nn/__init__.py index b6606108268c..3cdf3122e78e 100644 --- a/topi/python/topi/nn/__init__.py +++ b/topi/python/topi/nn/__init__.py @@ -13,3 +13,4 @@ from .pooling import * from .softmax import * from .conv2d_transpose import * +from .bnn import * diff --git a/topi/python/topi/nn/bnn.py b/topi/python/topi/nn/bnn.py new file mode 100644 index 000000000000..39b9d2a15a1b --- /dev/null +++ b/topi/python/topi/nn/bnn.py @@ -0,0 +1,79 @@ +"""Binary Neural Network (BNN) Operators""" +from __future__ import absolute_import as _abs +import tvm +from .. import tag +from ..util import simplify, get_const_int + + +def binarize_pack(data, axis=None, name="PackedInput"): + """Binarization and bit-packing along a certain axis. + + Parameters + ---------- + data : tvm.Tensor + n-D input, can be any layout. + + axis : None or int + The axis along which to do binarization and bit-packing, + default is the last axis. + + name : str, optional + The name prefix operators generate. + + Returns + ------- + output : tvm.Tensor + n-D, the same layout as input, dtype is uint32. 
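+
+    Examples
+    --------
+    A minimal usage sketch (shapes are illustrative; the packed axis must be
+    a multiple of 32, mirroring topi/tests/python/test_topi_bnn.py)::
+
+        import tvm
+        import topi
+
+        A = tvm.placeholder((1024, 4096), name='A')
+        packed = topi.nn.binarize_pack(A)  # shape (1024, 128), dtype uint32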
+ """ + ishape = data.shape + if axis is None: + axis = len(ishape) - 1 + assert get_const_int(ishape[axis]) % 32 == 0 + n = len(ishape) + oshape = tuple(simplify(ishape[i] // 32) if i == axis \ + else ishape[i] for i in range(n)) + + def _binarize_pack(*indices): + start_idx = [indices[i] * 32 if i == axis else indices[i] for i in range(n)] + packed = tvm.const(0, 'uint32') + for j in range(32): + idx = [start_idx[i] + j if i == axis else start_idx[i] for i in range(n)] + sign = (data(*idx) >= 0).astype("uint32") + packed = (packed | sign) + if j == 31: + return packed + packed = packed << 1 + + return tvm.compute(oshape, _binarize_pack, name=name, tag='binarize_pack') + + +def binary_dense(data, weight): + """Binary matrix multiplication using xor and bit-count. + + Parameters + ---------- + data : tvm.Tensor + 2-D with shape [batch, in_dim], dtype is uint32. + + weight : tvm.Tensor + 2-D with shape [out_dim, in_dim], dtype is uint32. + + Returns + ------- + output : tvm.Tensor + 2-D with shape [batch, out_dim], dtype is float32. + """ + assert data.dtype == 'uint32' and weight.dtype == 'uint32', \ + "dtype of data and weight should be uint32" + assert len(data.shape) == 2 and len(weight.shape) == 2, \ + "only support 2-dim binary dense" + batch, in_dim = data.shape + out_dim, _ = weight.shape + k = tvm.reduce_axis((0, in_dim), name='k') + matmul = tvm.compute((batch, out_dim), lambda i, j: \ + tvm.sum(tvm.popcount(data[i, k] ^ weight[j, k]), axis=k), \ + tag='binary_dense') + + return tvm.compute((batch, out_dim), lambda i, j: \ + 32 * in_dim - 2. * matmul(i, j), \ + tag=tag.ELEMWISE) diff --git a/topi/python/topi/x86/__init__.py b/topi/python/topi/x86/__init__.py index d9912de2870d..6ab37b8c03ac 100644 --- a/topi/python/topi/x86/__init__.py +++ b/topi/python/topi/x86/__init__.py @@ -3,3 +3,5 @@ from __future__ import absolute_import as _abs from .conv2d import schedule_conv2d +from .binarize_pack import schedule_binarize_pack +from .binary_dense import schedule_binary_dense diff --git a/topi/python/topi/x86/binarize_pack.py b/topi/python/topi/x86/binarize_pack.py new file mode 100644 index 000000000000..adf0a714b3fb --- /dev/null +++ b/topi/python/topi/x86/binarize_pack.py @@ -0,0 +1,38 @@ +# pylint: disable=invalid-name +"""Schedule for binarization and bit-packing.""" +from __future__ import absolute_import as _abs +import tvm +from .. import generic + + +@generic.schedule_binarize_pack.register(["cpu"]) +def schedule_binarize_pack(outs): + """Schedule for binarize_pack. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of binarize_pack + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for binarize_pack. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _schedule(Out): + s[Out].parallel(Out.op.axis[0]) + + def traverse(OP): + # schedule binarize_pack + if OP.tag == 'binarize_pack': + Out = OP.output(0) + _schedule(Out) + else: + raise RuntimeError("Unsupported operator: %s" % OP.tag) + + traverse(outs[0].op) + return s diff --git a/topi/python/topi/x86/binary_dense.py b/topi/python/topi/x86/binary_dense.py new file mode 100644 index 000000000000..11fccba7b644 --- /dev/null +++ b/topi/python/topi/x86/binary_dense.py @@ -0,0 +1,55 @@ +# pylint: disable=invalid-name, unused-variable, unused-argument +"""Schedule for binary dense operator.""" +from __future__ import absolute_import as _abs +import tvm +from .. 
import tag +from .. import generic + + +@generic.schedule_binary_dense.register(["cpu"]) +def schedule_binary_dense(outs): + """Schedule for binary_dense. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of binary_dense + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for binary_dense. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _schedule(A, B, C): + s[C].split(s[C].op.reduce_axis[0], factor=8) + s[C].parallel(s[C].op.axis[0]) + if C.op in s.outputs: + Out = C + else: + Out = outs[0].op.output(0) + xo, xi = s[Out].split(Out.op.axis[1], factor=8) + s[Out].vectorize(xi) + + def traverse(OP): + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(OP.tag): + if OP not in s.outputs: + s[OP].compute_inline() + for tensor in OP.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + # schedule binary_dense + elif OP.tag == 'binary_dense': + output = OP.output(0) + data = OP.input_tensors[0] + weight = OP.input_tensors[1] + _schedule(data, weight, output) + else: + raise RuntimeError("Unsupported operator: %s" % OP.tag) + + traverse(outs[0].op) + return s diff --git a/topi/tests/python/test_topi_bnn.py b/topi/tests/python/test_topi_bnn.py new file mode 100644 index 000000000000..5e6a11afc602 --- /dev/null +++ b/topi/tests/python/test_topi_bnn.py @@ -0,0 +1,55 @@ +"""Test code for binary neural network operators.""" +import numpy as np +import tvm +import topi +from topi.util import get_const_tuple +from tvm.contrib.pickle_memoize import memoize + + +def verify_binary_dense(batch, in_dim, out_dim): + A = tvm.placeholder((batch, in_dim), name='A') + B = tvm.placeholder((out_dim, in_dim), name='B') + bnn_A = topi.nn.binarize_pack(A) + bnn_B = topi.nn.binarize_pack(B) + # binary dense + bnn_A1 = tvm.placeholder(bnn_A.shape, dtype=bnn_A.dtype) + bnn_B1 = tvm.placeholder(bnn_B.shape, dtype=bnn_B.dtype) + bnn_C = topi.nn.binary_dense(bnn_A1, bnn_B1) + # schedule + with tvm.target.create('llvm'): + s1 = topi.generic.schedule_binarize_pack(bnn_A) + s2 = topi.generic.schedule_binarize_pack(bnn_B) + s3 = topi.generic.schedule_binary_dense(bnn_C) + + dtype = A.dtype + @memoize("topi.tests.test_topi_binary_dense") + def get_ref_data(): + # generate random matrix of +1 or -1 value + a_np = (np.random.randint(2, size=(batch, in_dim)) * 2 - 1).astype(dtype) + b_np = (np.random.randint(2, size=(out_dim, in_dim)) * 2 - 1).astype(dtype) + c_np = np.dot(a_np, b_np.T) + return (a_np, b_np, c_np) + + a_np, b_np, c_np = get_ref_data() + + ctx = tvm.cpu(0) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + bnn_a = tvm.nd.array(np.zeros(get_const_tuple(bnn_A.shape), dtype=bnn_A.dtype), ctx) + bnn_b = tvm.nd.array(np.zeros(get_const_tuple(bnn_B.shape), dtype=bnn_B.dtype), ctx) + bnn_c = tvm.nd.array(np.zeros(get_const_tuple(bnn_C.shape), dtype=bnn_C.dtype), ctx) + f1 = tvm.build(s1, [A, bnn_A], 'llvm -mcpu=core-avx2') + f2 = tvm.build(s2, [B, bnn_B], 'llvm -mcpu=core-avx2') + f3 = tvm.build(s3, [bnn_A1, bnn_B1, bnn_C], 'llvm -mcpu=core-avx2') + f1(a, bnn_a) + f2(b, bnn_b) + f3(bnn_a, bnn_b, bnn_c) + np.testing.assert_allclose(bnn_c.asnumpy(), c_np, rtol=1e-5) + +def test_binary_dense(): + verify_binary_dense(1, 4096, 1024) + verify_binary_dense(1, 1024, 1000) + + +if __name__ == "__main__": + test_binary_dense() From 02668ad892d7ed4e3ed6e4466607275211f723b9 Mon Sep 17 00:00:00 2001 
From: Tianqi Chen Date: Tue, 26 Dec 2017 11:47:44 +0800 Subject: [PATCH 058/948] update dmlc-core (#728) --- dmlc-core | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dmlc-core b/dmlc-core index 04f91953ace7..674a662c22b9 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 04f91953ace74aced3bb317990515304c5425849 +Subproject commit 674a662c22b900b76e8a3c9b77987a2c5563ba71 From 5f60df34925f1ac402d0f0c9d4df8e054a6426f4 Mon Sep 17 00:00:00 2001 From: masahi Date: Tue, 26 Dec 2017 12:50:32 +0900 Subject: [PATCH 059/948] [TOPI] add extern schedule for cudnn and miopen (#724) * add extern schedule for miopen * fix comment * optionally dispatch to miopen from topi * fix lint * check if current target is None * use generic dispatch for rocm conv2d * fix lint * fix workspace bug * remove blank line * remove blank line * remove blank line --- src/contrib/miopen/conv_forward.cc | 4 +- tests/python/contrib/test_miopen.py | 11 ++-- topi/python/topi/__init__.py | 1 + topi/python/topi/cuda/__init__.py | 1 + topi/python/topi/cuda/extern.py | 40 +++++++++++++++ topi/python/topi/generic/__init__.py | 1 + topi/python/topi/generic/extern.py | 26 ++++++++++ topi/python/topi/rocm/__init__.py | 5 ++ topi/python/topi/rocm/conv2d.py | 77 ++++++++++++++++++++++++++++ 9 files changed, 160 insertions(+), 6 deletions(-) create mode 100644 topi/python/topi/cuda/extern.py create mode 100644 topi/python/topi/generic/extern.py create mode 100644 topi/python/topi/rocm/__init__.py create mode 100644 topi/python/topi/rocm/conv2d.py diff --git a/src/contrib/miopen/conv_forward.cc b/src/contrib/miopen/conv_forward.cc index d85c08dee36c..7090560e3889 100644 --- a/src/contrib/miopen/conv_forward.cc +++ b/src/contrib/miopen/conv_forward.cc @@ -105,6 +105,8 @@ TVM_REGISTER_GLOBAL("tvm.contrib.miopen.conv2d.setup") const int request_algo_count = 4; const bool exhaustive_search = false; + void* workspace = entry_ptr->conv_entry.workspace; + if (workspace_size == 0) workspace = nullptr; int returned_algo_count = 0; miopenConvAlgoPerf_t perfs[4]; @@ -119,7 +121,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.miopen.conv2d.setup") request_algo_count, &returned_algo_count, perfs, - entry_ptr->conv_entry.workspace, + workspace, workspace_size, exhaustive_search)); diff --git a/tests/python/contrib/test_miopen.py b/tests/python/contrib/test_miopen.py index 51e50f62db94..4e13b052e616 100644 --- a/tests/python/contrib/test_miopen.py +++ b/tests/python/contrib/test_miopen.py @@ -4,8 +4,8 @@ def test_conv2d(): - in_channel = 64 - out_channel = 128 + in_channel = 3 + out_channel = 64 filter_h = 3 filter_w = 3 pad_h = 1 @@ -15,7 +15,7 @@ def test_conv2d(): dilation_h = 1 dilation_w = 1 - xshape = [1, in_channel, 64, 64] + xshape = [1, in_channel, 128, 128] if not tvm.module.enabled("rocm"): print("skip because rocm is not enabled...") return @@ -37,7 +37,9 @@ def test_conv2d(): conv_mode=0) yshape = [x.value for x in Y.shape] - s = tvm.create_schedule(Y.op) + import topi + with tvm.target.create("rocm -libs=miopen"): + s = topi.generic.schedule_extern(Y) def verify(): ctx = tvm.rocm(0) @@ -47,7 +49,6 @@ def verify(): y = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), ctx) f(x, w, y) - import topi Y_ref = topi.nn.conv2d_nchw(X, W, (stride_h, stride_w), (pad_h, pad_w)) with tvm.target.rocm(): s_ref = topi.generic.schedule_conv2d_nchw([Y_ref]) diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py index 62a9ae153052..c28dfb34e8b6 100644 --- a/topi/python/topi/__init__.py +++ 
b/topi/python/topi/__init__.py @@ -19,3 +19,4 @@ from . import rasp from . import testing from . import util +from . import rocm diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index b898dde6a8cf..03cab1441b0c 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -13,3 +13,4 @@ from .dense import schedule_dense from .pooling import schedule_pool, schedule_global_pool from .conv2d_transpose_nchw import schedule_conv2d_transpose_nchw +from .extern import schedule_extern diff --git a/topi/python/topi/cuda/extern.py b/topi/python/topi/cuda/extern.py new file mode 100644 index 000000000000..34dae092cf69 --- /dev/null +++ b/topi/python/topi/cuda/extern.py @@ -0,0 +1,40 @@ +# pylint: disable=invalid-name, unused-variable, +"""Schedule for cudnn and miopen extern op""" +import tvm +from .. import generic + +def _schedule_output(op, sch): + x = op.output(0) + fused = sch[x].fuse(*sch[x].op.axis) + num_thread = tvm.target.current_target(allow_none=False).max_num_threads + bx, tx = sch[x].split(fused, factor=num_thread) + sch[x].bind(bx, tvm.thread_axis("blockIdx.x")) + sch[x].bind(tx, tvm.thread_axis("threadIdx.x")) + return sch + + +@generic.schedule_extern.register(["cuda", "gpu"]) +def schedule_extern(outs): + """Schedule for an extern op followed by injective operations. + For example, cudnn kernel + bias add + relu. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of extern plus injective ops in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + tvm.schedule.AutoInlineInjective(s) + for out in outs: + if isinstance(out.op, tvm.tensor.ExternOp): + continue + _schedule_output(out.op, s) + return s diff --git a/topi/python/topi/generic/__init__.py b/topi/python/topi/generic/__init__.py index d10d5c5ebecc..8fc9143c3f86 100644 --- a/topi/python/topi/generic/__init__.py +++ b/topi/python/topi/generic/__init__.py @@ -17,3 +17,4 @@ from .nn import * from .injective import * +from .extern import * diff --git a/topi/python/topi/generic/extern.py b/topi/python/topi/generic/extern.py new file mode 100644 index 000000000000..082c1bca83ea --- /dev/null +++ b/topi/python/topi/generic/extern.py @@ -0,0 +1,26 @@ +# pylint: disable=invalid-name +"""generic declaration and schedules.""" +from __future__ import absolute_import as _abs + +import tvm + +@tvm.target.generic_func +def schedule_extern(outs): + """Schedule for an extern op followed by injective operations. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of extern plus injective ops in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. 
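+
+    Examples
+    --------
+    A minimal dispatch sketch mirroring tests/python/contrib/test_miopen.py,
+    where ``Y`` is assumed to be the output of an extern op such as
+    ``miopen.conv2d_forward``::
+
+        with tvm.target.create("rocm -libs=miopen"):
+            s = topi.generic.schedule_extern(Y)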
+ """ + target = tvm.target.current_target(allow_none=False) + if target.target_name != "llvm": + raise RuntimeError("schedule_injective not registered for '%s'" % target) + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + return tvm.create_schedule([x.op for x in outs]) diff --git a/topi/python/topi/rocm/__init__.py b/topi/python/topi/rocm/__init__.py new file mode 100644 index 000000000000..d2d7aaf0fd3d --- /dev/null +++ b/topi/python/topi/rocm/__init__.py @@ -0,0 +1,5 @@ +# pylint: disable=redefined-builtin, wildcard-import +"""rocm specific declaration and schedules.""" +from __future__ import absolute_import as _abs + +from .conv2d import * diff --git a/topi/python/topi/rocm/conv2d.py b/topi/python/topi/rocm/conv2d.py new file mode 100644 index 000000000000..c6e4eb817b11 --- /dev/null +++ b/topi/python/topi/rocm/conv2d.py @@ -0,0 +1,77 @@ +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements, too-many-arguments, too-many-branches, line-too-long +"""Schedule for rocm conv2d_nchw with auto fusion""" +import tvm +from tvm.contrib import miopen +import topi +from .. import generic +from ..nn.conv2d import conv2d + +@conv2d.register("rocm") +def conv2d_rocm(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'): + """Conv2D operator for rocm backend. + + Parameters + ---------- + input : tvm.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] + + filter : tvm.Tensor + 4-D with shape [num_filter, in_channel, filter_height, filter_width] + + stride : int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + + padding : int or a list/tuple of two ints + padding size, or [pad_height, pad_width] + + layout : str + layout of data + + Returns + ------- + output : tvm.Tensor + 4-D with shape [batch, out_channel, out_height, out_width] + """ + assert layout == 'NCHW', "Only NCHW layout is supported." + assert isinstance(stride, int) or len(stride) == 2 + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + if isinstance(padding, int): + pad_h = pad_w = padding + else: + pad_h, pad_w = padding + target = tvm.target.current_target() + if "miopen" in target.libs: + return miopen.conv2d_forward(data, + kernel, + stride_h, + stride_w, + pad_h, + pad_w, + 1, # dilation_h + 1, # dilation_w + conv_mode=0) + return topi.nn.conv2d_nchw(data, kernel, stride, padding, out_dtype) + + +@generic.schedule_conv2d_nchw.register(["rocm"]) +def schedule_conv2d_nchw(outs): + """Schedule for conv2d_nchw with rocm backend. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of conv2d_nchw + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for conv2d_nchw. 
+ """ + target = tvm.target.current_target() + if target and "miopen" in target.libs: + return topi.generic.schedule_extern(outs) + return topi.cuda.schedule_conv2d_nchw(outs) From 0163f6b1a1c9c714f6cf4ca46dd6efaf5b8a877c Mon Sep 17 00:00:00 2001 From: masahi Date: Wed, 27 Dec 2017 13:12:21 +0900 Subject: [PATCH 060/948] [TOPI] CUDNN integration (#730) * add target.libs to target str representation * integrate cudnn into topi cuda * append target.libs to target.options --- python/tvm/target.py | 6 +-- src/codegen/llvm/llvm_common.cc | 2 +- tests/python/contrib/test_cudnn.py | 3 +- topi/python/topi/cuda/__init__.py | 1 + topi/python/topi/cuda/conv2d.py | 65 ++++++++++++++++++++++++++++ topi/python/topi/cuda/conv2d_nchw.py | 5 +++ topi/python/topi/rocm/conv2d.py | 2 +- 7 files changed, 77 insertions(+), 7 deletions(-) create mode 100644 topi/python/topi/cuda/conv2d.py diff --git a/python/tvm/target.py b/python/tvm/target.py index 096459c1ac83..092dbc0d1165 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -88,17 +88,15 @@ def __init__(self, target_name, options=None): self.target_name = target_name - self.options = [] + self.options = _merge_opts([], options) self.device_name = "" self.libs = [] # Parse device option - for item in _merge_opts([], options): + for item in self.options: if item.startswith("-libs="): self.libs.append(item.split("=")[1]) - continue elif item.startswith("-device="): self.device_name = item.split("=")[1] - self.options.append(item) # Target query searchs device name first if self.device_name: self.keys = (self.device_name,) diff --git a/src/codegen/llvm/llvm_common.cc b/src/codegen/llvm/llvm_common.cc index dfe51fb373f7..b34bf0d5ec91 100644 --- a/src/codegen/llvm/llvm_common.cc +++ b/src/codegen/llvm/llvm_common.cc @@ -82,7 +82,7 @@ GetLLVMTargetMachine(const std::string& target_str, } else { LOG(FATAL) << "invalid -mfloat-abi option " << value; } - } else if (key == "-device") { + } else if (key == "-device" || key == "-libs") { // pass } else { LOG(FATAL) << "unknown option " << key; diff --git a/tests/python/contrib/test_cudnn.py b/tests/python/contrib/test_cudnn.py index 93e17ea54ad8..47561c57bcbd 100644 --- a/tests/python/contrib/test_cudnn.py +++ b/tests/python/contrib/test_cudnn.py @@ -41,7 +41,8 @@ def test_conv2d(): tensor_format=0, algo=1) yshape = [x.value for x in Y.shape] - s = tvm.create_schedule(Y.op) + with tvm.target.create("cuda -libs=cudnn"): + s = tvm.create_schedule(Y.op) def verify(): ctx = tvm.gpu(0) diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index 03cab1441b0c..314931b4cb42 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -2,6 +2,7 @@ """CUDA specific declaration and schedules.""" from __future__ import absolute_import as _abs +from .conv2d import conv2d_cuda from .conv2d_nchw import schedule_conv2d_nchw from .conv2d_hwcn import schedule_conv2d_hwcn from .depthwise_conv2d import schedule_depthwise_conv2d_nchw, schedule_depthwise_conv2d_nhwc diff --git a/topi/python/topi/cuda/conv2d.py b/topi/python/topi/cuda/conv2d.py new file mode 100644 index 000000000000..62b5642abbab --- /dev/null +++ b/topi/python/topi/cuda/conv2d.py @@ -0,0 +1,65 @@ +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements, too-many-arguments, too-many-branches, line-too-long +"""Compute definition for conv2d with cuda backend""" +import tvm +from tvm.contrib import cudnn +import topi +from ..nn.conv2d import conv2d + +@conv2d.register("cuda") +def 
conv2d_cuda(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'): + """Conv2D operator for cuda backend. + + Parameters + ---------- + input : tvm.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] + + filter : tvm.Tensor + 4-D with shape [num_filter, in_channel, filter_height, filter_width] + + stride : int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + + padding : int or a list/tuple of two ints + padding size, or [pad_height, pad_width] + + layout : str + layout of data + + Returns + ------- + output : tvm.Tensor + 4-D with shape [batch, out_channel, out_height, out_width] + """ + assert isinstance(stride, int) or len(stride) == 2 + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + if isinstance(padding, int): + pad_h = pad_w = padding + else: + pad_h, pad_w = padding + target = tvm.target.current_target() + if "cudnn" in target.libs: + assert layout != 'HWCN', "HWCN layout not supported with CUDNN." + tensor_format = 0 # CUDNN_TENSOR_NCHW + if layout == 'NHWC': + tensor_format = 1 # CUDNN_TENSOR_NHWC + return cudnn.conv2d_forward(data, + kernel, + stride_h, + stride_w, + pad_h, + pad_w, + 1, # dilation_h + 1, # dilation_w + conv_mode=1, + tensor_format=tensor_format, + algo=0) + elif layout == 'NCHW': + return topi.nn.conv2d_nchw(data, kernel, stride, padding, out_dtype) + elif layout == 'HWCN': + return topi.nn.conv2d_hwcn(data, kernel, stride, padding, out_dtype) + else: + raise ValueError("not support this layout {} yet".format(layout)) diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py index 0d3f5eedb0a2..e313029e74b0 100644 --- a/topi/python/topi/cuda/conv2d_nchw.py +++ b/topi/python/topi/cuda/conv2d_nchw.py @@ -1,6 +1,7 @@ #pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements, too-many-arguments, too-many-branches, line-too-long """Schedule for conv2d_nchw with auto fusion""" import tvm +import topi from .. import util from .. import tag from .. import generic @@ -516,6 +517,10 @@ def schedule_conv2d_nchw(outs): s: Schedule The computation schedule for conv2d_nchw. 
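+
+    Examples
+    --------
+    A usage sketch with illustrative shapes (assumes cuDNN is enabled; when the
+    current target carries ``-libs=cudnn`` the extern schedule is used instead
+    of the hand-written CUDA schedule)::
+
+        X = tvm.placeholder((1, 3, 32, 32), name='X')
+        W = tvm.placeholder((32, 3, 3, 3), name='W')
+        with tvm.target.create("cuda -libs=cudnn"):
+            Y = topi.nn.conv2d(X, W, stride=1, padding=1, layout='NCHW')
+            s = topi.cuda.schedule_conv2d_nchw([Y])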
""" + target = tvm.target.current_target() + if target.target_name == "cuda" and "cudnn" in target.libs: + return topi.generic.schedule_extern(outs) + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs batch_size = util.get_const_int(outs[0].op.output(0).shape[0]) if batch_size > 1: diff --git a/topi/python/topi/rocm/conv2d.py b/topi/python/topi/rocm/conv2d.py index c6e4eb817b11..4dd5e5fd0735 100644 --- a/topi/python/topi/rocm/conv2d.py +++ b/topi/python/topi/rocm/conv2d.py @@ -1,5 +1,5 @@ # pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements, too-many-arguments, too-many-branches, line-too-long -"""Schedule for rocm conv2d_nchw with auto fusion""" +"""Compute and schedule for rocm conv2d_nchw with auto fusion""" import tvm from tvm.contrib import miopen import topi From b4131574f2fbebb03c00294984664258368e7b3c Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 27 Dec 2017 01:52:37 -0800 Subject: [PATCH 061/948] [TOPI]Support dim-0 tensor in topi broadcast/reduce (#731) * support dim-0 tensor in topi ops revert transform * revert --- topi/python/topi/reduction.py | 4 +--- topi/tests/python/test_topi_broadcast.py | 2 ++ topi/tests/python/test_topi_reduce.py | 5 ++++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/topi/python/topi/reduction.py b/topi/python/topi/reduction.py index 3c6bf1ca0e2c..997ec8e9ba95 100644 --- a/topi/python/topi/reduction.py +++ b/topi/python/topi/reduction.py @@ -107,10 +107,8 @@ def comm_reduce(data, axis=None, keepdims=False, func=tvm.sum, is_idx_reduce=Fal ret : tvm.Tensor """ ndim = len(data.shape) + assert ndim != 0, "Reduce a dim-0 input is not supported!" real_axis = _get_real_axis(ndim, axis) - if real_axis == list(range(ndim)) and keepdims is False: - raise ValueError("Currently we do not support all reduce + keepdims = False!" 
- " axis={}, keepdims={}".format(axis, keepdims)) reduce_axes = [tvm.reduce_axis((0, data.shape[i]), "k%d" %i) for i in real_axis] if keepdims: target_shape = [1 if i in real_axis else data.shape[i] for i in range(ndim)] diff --git a/topi/tests/python/test_topi_broadcast.py b/topi/tests/python/test_topi_broadcast.py index e5f88e9d4df6..28a9e721a4ea 100644 --- a/topi/tests/python/test_topi_broadcast.py +++ b/topi/tests/python/test_topi_broadcast.py @@ -89,12 +89,14 @@ def check_device(device): def test_broadcast_to(): verify_broadcast_to_ele((1,), (10,)) + verify_broadcast_to_ele((), (10,)) verify_broadcast_to_ele((1, 1, 5, 4), (3, 4, 4, 4, 5, 4)) verify_broadcast_to_ele((1, 128, 1, 32), (64, 128, 64, 32)) def test_broadcast_binary(): verify_broadcast_binary_ele((5, 2, 3), (2, 1), typ="add") + verify_broadcast_binary_ele((5, 2, 3), (), typ="add") verify_broadcast_binary_ele((5, 64, 128), (2, 5, 64, 1), typ="mul") verify_broadcast_binary_ele((2, 3, 1, 32), (64, 32), typ="div") verify_broadcast_binary_ele((1, 32), (64, 32), typ="sub") diff --git a/topi/tests/python/test_topi_reduce.py b/topi/tests/python/test_topi_reduce.py index 13cd8fcdcd21..08e66e1404b6 100644 --- a/topi/tests/python/test_topi_reduce.py +++ b/topi/tests/python/test_topi_reduce.py @@ -108,7 +108,10 @@ def test_reduce_map(): axis=None, keepdims=True, type="argmax") - + verify_reduce_map_ele(in_shape=(31, 21, 15), + axis=None, + keepdims=False, + type="sum") if __name__ == "__main__": test_reduce_map() From bb6ddd2508bb6889cfe2f89d17d9238db853f2c2 Mon Sep 17 00:00:00 2001 From: kun-zh <32951065+kun-zh@users.noreply.github.com> Date: Wed, 27 Dec 2017 17:52:48 +0800 Subject: [PATCH 062/948] [SCHEDULE] New Reduction Mode for Tensorize (#727) * when there is no intrin func, using body for initialization. For issue 714. * Refine code per review comments, and add a test case. * Fix lint issues. --- src/op/tensorize.cc | 94 +++++++++++++++---- .../test_schedule_tensorize_init_none.py | 90 ++++++++++++++++++ 2 files changed, 167 insertions(+), 17 deletions(-) create mode 100644 tests/python/unittest/test_schedule_tensorize_init_none.py diff --git a/src/op/tensorize.cc b/src/op/tensorize.cc index b4527f76e808..6fa5459829fc 100644 --- a/src/op/tensorize.cc +++ b/src/op/tensorize.cc @@ -10,6 +10,7 @@ #include "./op_util.h" #include "./compute_op.h" #include "../schedule/message_passing.h" +#include "../arithmetic/compute_expr.h" namespace tvm { @@ -322,6 +323,50 @@ void VerifyTensorizeBody( } } +/*! + * \brief Transform the update part when there is no init func in tensorizing + * \param stage The stage for tensorizing. + * \param dom_map The range of each iter var. + * \param n The loop nest structured used in compute. + * \param body The body func in tensorize intrin + * \param update The update func in tensorize intrin + * \return Transformed result. 
+ */ +Stmt TransformUpdate(const Stage& stage, + const std::unordered_map& dom_map, + const ComputeLoopNest& n, + Stmt body, + Stmt update) { + Array conds; + std::unordered_set banned; + for (size_t i = 0; i < stage->leaf_iter_vars.size(); ++i) { + IterVar iv = stage->leaf_iter_vars[i]; + auto iit = stage->iter_var_attrs.find(iv); + if (iit != stage->iter_var_attrs.end()) { + const IterVarAttr& attr = (*iit).second; + if (attr->iter_type == kTensorized) { + break; + } + } + if (iv->iter_type == kCommReduce) { + auto vit = dom_map.find(iv); + CHECK(vit != dom_map.end()); + const Range& vrange = vit->second; + conds.push_back(likely(iv->var > vrange->min)); + banned.insert(iv->var.get()); + } + } + for (const Expr& pred : n.main_predicates) { + if (ir::ExprUseVar(pred, banned)) { + LOG(FATAL) << "Tensorize update transform failed, the condition " + << pred << " has a conflict with the reset condition"; + } + } + + return IfThenElse::make(arith::ComputeReduce(conds, const_true(1)), + update, body); +} + Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage, const std::unordered_map& dom_map) { @@ -416,32 +461,47 @@ Stmt MakeTensorize(const ComputeOpNode* self, return MergeNest(nest, body); } else { // Need to split reduction - CHECK(intrin->reduce_init.defined()) - << "Reduction init op for intrin " << intrin << " is not defined"; CHECK(intrin->reduce_update.defined()) << "Reduction update op for intrin " << intrin << " is not defined"; // Need init and update steps CHECK_NE(self->reduce_axis.size(), 0U); std::vector > common( n.main_nest.begin(), n.main_nest.begin() + n.num_common_loop + 1); - // init nest - std::vector > init_nest( - n.init_nest.begin(), n.init_nest.begin() + tloc + 1); - init_nest.emplace_back(op::MakeIfNest(n.init_predicates)); - Stmt init = MergeNest(output_bind_nest, intrin->reduce_init); - init = Substitute(init, n.init_vmap); - init = MergeNest(init_nest, init); - // The update std::vector > update_nest( n.main_nest.begin() + n.num_common_loop + 1, n.main_nest.begin() + tloc + 1); update_nest.emplace_back(op::MakeIfNest(n.main_predicates)); - Stmt update = MergeNest(output_bind_nest, intrin->reduce_update); - update = MergeNest(input_bind_nest, update); - update = Substitute(update, vmap); - update = MergeNest(binder.asserts(), update); - update = Substitute(update, n.main_vmap); - update = MergeNest(update_nest, update); - return MergeNest(common, Block::make(init, update)); + + if (intrin->reduce_init.defined()) { + // init nest + std::vector > init_nest( + n.init_nest.begin(), n.init_nest.begin() + tloc + 1); + init_nest.emplace_back(op::MakeIfNest(n.init_predicates)); + Stmt init = MergeNest(output_bind_nest, intrin->reduce_init); + init = Substitute(init, n.init_vmap); + init = MergeNest(init_nest, init); + // The update + Stmt update = MergeNest(output_bind_nest, intrin->reduce_update); + update = MergeNest(input_bind_nest, update); + update = Substitute(update, vmap); + update = MergeNest(binder.asserts(), update); + update = Substitute(update, n.main_vmap); + update = MergeNest(update_nest, update); + return MergeNest(common, Block::make(init, update)); + } else { + // When init op is not available, use body op for reset in the first iter. 
+ CHECK(intrin->body.defined()) + << "Normal body op for intrin " << intrin << " is not defined"; + Stmt update = TransformUpdate(stage, dom_map, n, + intrin->body, + intrin->reduce_update); + update = MergeNest(output_bind_nest, update); + update = MergeNest(input_bind_nest, update); + update = Substitute(update, vmap); + update = MergeNest(binder.asserts(), update); + update = Substitute(update, n.main_vmap); + update = MergeNest(update_nest, update); + return MergeNest(common, update); + } } } diff --git a/tests/python/unittest/test_schedule_tensorize_init_none.py b/tests/python/unittest/test_schedule_tensorize_init_none.py new file mode 100644 index 000000000000..ce1d5633173a --- /dev/null +++ b/tests/python/unittest/test_schedule_tensorize_init_none.py @@ -0,0 +1,90 @@ +import tvm + +def intrin_gemv(m, n): + w = tvm.placeholder((m, n), name='w') + x = tvm.placeholder((n,), name='x') + k = tvm.reduce_axis((0, n), name='k') + z = tvm.compute((m,), lambda i: + tvm.sum(w[i, k] * x[k], axis=k), name='z') + Wb = tvm.decl_buffer(w.shape, w.dtype, + name="W", + offset_factor=16, + strides=[tvm.var('ldw'), 1]) + def intrin_func(ins, outs): + ww, xx = ins + zz = outs[0] + ww_ptr = ww.access_ptr("r") + xx_ptr = xx.access_ptr("r") + zz_ptr = zz.access_ptr("w") + body = tvm.call_packed( + "gemv", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0]) + update = tvm.call_packed( + "gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0]) + return body, None, update + + with tvm.build_config(data_alignment=16, + offset_factor=16): + return tvm.decl_tensor_intrin(z.op, intrin_func, + binds={w: Wb}) + + +def test_tensorize_matmul(): + n = 1024 + m = n + l = n + A = tvm.placeholder((n, l), name='A') + B = tvm.placeholder((m, l), name='B') + k = tvm.reduce_axis((0, l), name='k') + C = tvm.compute((n, m), lambda i, j: + tvm.sum(B[j, k] * A[i, k], axis=k), name='C') + + def check(factor): + s = tvm.create_schedule(C.op) + x, y = C.op.axis + yo, yi = s[C].split(y, factor=factor) + gemv = intrin_gemv(factor, l) + s[C].tensorize(yi, gemv) + s = s.normalize() + dom_map = tvm.schedule.InferBound(s) + finfer = tvm.get_global_func("test.op.InferTensorizeRegion") + out_dom, in_dom = finfer(s[C], dom_map) + assert tvm.ir_pass.Equal(out_dom[x].extent, 1) + assert tvm.ir_pass.Equal(out_dom[y].extent, factor) + assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor) + fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") + body = fmatch(s[C], out_dom, in_dom, gemv) + assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]), + tvm.ir_pass.CanonicalSimplify(gemv.op.body[0])) + stmt = tvm.schedule.ScheduleOps(s, dom_map) + tvm.lower(s, [A, B, C]) + + + def check_rfactor(factor, rfactor): + s = tvm.create_schedule(C.op) + x, y = C.op.axis + rk = C.op.reduce_axis[0] + yo, yi = s[C].split(y, factor=factor) + ro, ri = s[C].split(rk, factor=rfactor) + s[C].reorder(yo, ro, yi, ri) + gemv = intrin_gemv(factor, rfactor) + s[C].tensorize(yi, gemv) + s = s.normalize() + dom_map = tvm.schedule.InferBound(s) + finfer = tvm.get_global_func("test.op.InferTensorizeRegion") + out_dom, in_dom = finfer(s[C], dom_map) + assert tvm.ir_pass.Equal(out_dom[x].extent, 1) + assert tvm.ir_pass.Equal(out_dom[y].extent, factor) + assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor) + fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") + body = fmatch(s[C], out_dom, in_dom, gemv) + assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]), + tvm.ir_pass.CanonicalSimplify(gemv.op.body[0])) + stmt = tvm.schedule.ScheduleOps(s, dom_map) + 
tvm.lower(s, [A, B, C]) + + check(16) + check_rfactor(16, 16) + + +if __name__ == "__main__": + test_tensorize_matmul() From 3fd55cc8b95da6d832a694da337a6c18eb1b47f0 Mon Sep 17 00:00:00 2001 From: kun-zh <32951065+kun-zh@users.noreply.github.com> Date: Fri, 29 Dec 2017 08:31:51 +0800 Subject: [PATCH 063/948] Re-organize the test cases for tensorize. (#736) * when there is no intrin func, using body for initialization. For issue 714. * Refine code per review comments, and add a test case. * Fix lint issues. * Re-organize the tensorize test cases, and add a new case for none-reset mode. * Fix a typo. * Delete the unit case because merged it into test_schedule_tensorize.py already. --- .../unittest/test_schedule_tensorize.py | 76 ++++++++++++++++ .../test_schedule_tensorize_init_none.py | 90 ------------------- 2 files changed, 76 insertions(+), 90 deletions(-) delete mode 100644 tests/python/unittest/test_schedule_tensorize_init_none.py diff --git a/tests/python/unittest/test_schedule_tensorize.py b/tests/python/unittest/test_schedule_tensorize.py index 71ae493e51ae..ca5836143ef3 100644 --- a/tests/python/unittest/test_schedule_tensorize.py +++ b/tests/python/unittest/test_schedule_tensorize.py @@ -40,6 +40,33 @@ def intrin_func(ins, outs): return tvm.decl_tensor_intrin(z.op, intrin_func, binds={w: Wb}) +def intrin_gemv_no_reset(m, n): + w = tvm.placeholder((m, n), name='w') + x = tvm.placeholder((n,), name='x') + k = tvm.reduce_axis((0, n), name='k') + z = tvm.compute((m,), lambda i: + tvm.sum(w[i, k] * x[k], axis=k), name='z') + Wb = tvm.decl_buffer(w.shape, w.dtype, + name="W", + offset_factor=16, + strides=[tvm.var('ldw'), 1]) + def intrin_func(ins, outs): + ww, xx = ins + zz = outs[0] + ww_ptr = ww.access_ptr("r") + xx_ptr = xx.access_ptr("r") + zz_ptr = zz.access_ptr("w") + body = tvm.call_packed( + "gemv", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0]) + update = tvm.call_packed( + "gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0]) + return body, None, update + + with tvm.build_config(data_alignment=16, + offset_factor=16): + return tvm.decl_tensor_intrin(z.op, intrin_func, + binds={w: Wb}) + def test_tensorize_vadd(): m = 128 @@ -123,8 +150,57 @@ def check_rfactor(factor, rfactor): stmt = tvm.schedule.ScheduleOps(s, dom_map) tvm.lower(s, [A, B, C]) + def check_rfactor_no_reset(factor, rfactor): + s = tvm.create_schedule(C.op) + x, y = C.op.axis + rk = C.op.reduce_axis[0] + yo, yi = s[C].split(y, factor=factor) + ro, ri = s[C].split(rk, factor=rfactor) + s[C].reorder(yo, ro, yi, ri) + gemv = intrin_gemv_no_reset(factor, rfactor) + s[C].tensorize(yi, gemv) + s = s.normalize() + dom_map = tvm.schedule.InferBound(s) + finfer = tvm.get_global_func("test.op.InferTensorizeRegion") + out_dom, in_dom = finfer(s[C], dom_map) + assert tvm.ir_pass.Equal(out_dom[x].extent, 1) + assert tvm.ir_pass.Equal(out_dom[y].extent, factor) + assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor) + fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") + body = fmatch(s[C], out_dom, in_dom, gemv) + assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]), + tvm.ir_pass.CanonicalSimplify(gemv.op.body[0])) + stmt = tvm.schedule.ScheduleOps(s, dom_map) + tvm.lower(s, [A, B, C]) + + def check_rfactor_no_reset_multi_reduction(factor, rfactor): + s = tvm.create_schedule(C.op) + x, y = C.op.axis + rk = C.op.reduce_axis[0] + yo, yi = s[C].split(y, factor=factor) + ro, ri = s[C].split(rk, factor=rfactor) + roo, roi = s[C].split(ro, factor=2) + s[C].reorder(yo, roo, roi, yi, ri) + gemv = 
intrin_gemv_no_reset(factor, rfactor) + s[C].tensorize(yi, gemv) + s = s.normalize() + dom_map = tvm.schedule.InferBound(s) + finfer = tvm.get_global_func("test.op.InferTensorizeRegion") + out_dom, in_dom = finfer(s[C], dom_map) + assert tvm.ir_pass.Equal(out_dom[x].extent, 1) + assert tvm.ir_pass.Equal(out_dom[y].extent, factor) + assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor) + fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") + body = fmatch(s[C], out_dom, in_dom, gemv) + assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]), + tvm.ir_pass.CanonicalSimplify(gemv.op.body[0])) + stmt = tvm.schedule.ScheduleOps(s, dom_map) + tvm.lower(s, [A, B, C]) + check(16) check_rfactor(16, 16) + check_rfactor_no_reset(16, 16) + check_rfactor_no_reset_multi_reduction(16, 16) # This tests whether algorithm and intrinsics expressions are simplified # as much as possible first and then checked for equality. See Issue #696 diff --git a/tests/python/unittest/test_schedule_tensorize_init_none.py b/tests/python/unittest/test_schedule_tensorize_init_none.py deleted file mode 100644 index ce1d5633173a..000000000000 --- a/tests/python/unittest/test_schedule_tensorize_init_none.py +++ /dev/null @@ -1,90 +0,0 @@ -import tvm - -def intrin_gemv(m, n): - w = tvm.placeholder((m, n), name='w') - x = tvm.placeholder((n,), name='x') - k = tvm.reduce_axis((0, n), name='k') - z = tvm.compute((m,), lambda i: - tvm.sum(w[i, k] * x[k], axis=k), name='z') - Wb = tvm.decl_buffer(w.shape, w.dtype, - name="W", - offset_factor=16, - strides=[tvm.var('ldw'), 1]) - def intrin_func(ins, outs): - ww, xx = ins - zz = outs[0] - ww_ptr = ww.access_ptr("r") - xx_ptr = xx.access_ptr("r") - zz_ptr = zz.access_ptr("w") - body = tvm.call_packed( - "gemv", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0]) - update = tvm.call_packed( - "gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0]) - return body, None, update - - with tvm.build_config(data_alignment=16, - offset_factor=16): - return tvm.decl_tensor_intrin(z.op, intrin_func, - binds={w: Wb}) - - -def test_tensorize_matmul(): - n = 1024 - m = n - l = n - A = tvm.placeholder((n, l), name='A') - B = tvm.placeholder((m, l), name='B') - k = tvm.reduce_axis((0, l), name='k') - C = tvm.compute((n, m), lambda i, j: - tvm.sum(B[j, k] * A[i, k], axis=k), name='C') - - def check(factor): - s = tvm.create_schedule(C.op) - x, y = C.op.axis - yo, yi = s[C].split(y, factor=factor) - gemv = intrin_gemv(factor, l) - s[C].tensorize(yi, gemv) - s = s.normalize() - dom_map = tvm.schedule.InferBound(s) - finfer = tvm.get_global_func("test.op.InferTensorizeRegion") - out_dom, in_dom = finfer(s[C], dom_map) - assert tvm.ir_pass.Equal(out_dom[x].extent, 1) - assert tvm.ir_pass.Equal(out_dom[y].extent, factor) - assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor) - fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") - body = fmatch(s[C], out_dom, in_dom, gemv) - assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]), - tvm.ir_pass.CanonicalSimplify(gemv.op.body[0])) - stmt = tvm.schedule.ScheduleOps(s, dom_map) - tvm.lower(s, [A, B, C]) - - - def check_rfactor(factor, rfactor): - s = tvm.create_schedule(C.op) - x, y = C.op.axis - rk = C.op.reduce_axis[0] - yo, yi = s[C].split(y, factor=factor) - ro, ri = s[C].split(rk, factor=rfactor) - s[C].reorder(yo, ro, yi, ri) - gemv = intrin_gemv(factor, rfactor) - s[C].tensorize(yi, gemv) - s = s.normalize() - dom_map = tvm.schedule.InferBound(s) - finfer = tvm.get_global_func("test.op.InferTensorizeRegion") - out_dom, in_dom = 
finfer(s[C], dom_map) - assert tvm.ir_pass.Equal(out_dom[x].extent, 1) - assert tvm.ir_pass.Equal(out_dom[y].extent, factor) - assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor) - fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") - body = fmatch(s[C], out_dom, in_dom, gemv) - assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]), - tvm.ir_pass.CanonicalSimplify(gemv.op.body[0])) - stmt = tvm.schedule.ScheduleOps(s, dom_map) - tvm.lower(s, [A, B, C]) - - check(16) - check_rfactor(16, 16) - - -if __name__ == "__main__": - test_tensorize_matmul() From dccc1e3fc4409349b5394960dee17ca2f376dc4d Mon Sep 17 00:00:00 2001 From: masahi Date: Fri, 29 Dec 2017 10:53:05 +0900 Subject: [PATCH 064/948] Let CUDNN choose the best algo (#734) * use cudnn findalgo to choose the best algo * fix lint --- python/tvm/contrib/cudnn.py | 79 ++++++++++++++++++++++++- src/contrib/cudnn/conv_forward.cc | 98 ++++++++++++++++++++++++++++++- topi/python/topi/cuda/conv2d.py | 2 +- 3 files changed, 176 insertions(+), 3 deletions(-) diff --git a/python/tvm/contrib/cudnn.py b/python/tvm/contrib/cudnn.py index e728e42f614e..5200f3193079 100644 --- a/python/tvm/contrib/cudnn.py +++ b/python/tvm/contrib/cudnn.py @@ -220,6 +220,70 @@ def conv2d_output_shape(tensor_format, return list(oshape) +def conv2d_find_algo(tensor_format, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + x_shape, + w_shape, + y_shape): + """Choose the best algo for the given input. + + Paramters + --------- + tensor_format: int + 0: CUDNN_TENSOR_NCHW + 1: CUDNN_TENSOR_NHWC + 2: CUDNN_TENSOR_NCHW_VECT_C + pad_h: int + height pad + pad_w: int + weight pad + stride_h: int + height stride + stride_w: int + width stride + dilation_h: int + height dilation + dilation_w: int + width dilation + x_shape: list + input shape + w_shape: list + weight shape + y_shape: list + output shape + + Returns + ------- + algo: int + algo chosen by CUDNN + """ + func = _get_global_func("tvm.contrib.cudnn.conv2d.find_algo") + return func(tensor_format, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + x_shape[0].value, + x_shape[1].value, + x_shape[2].value, + x_shape[3].value, + w_shape[0].value, + w_shape[1].value, + w_shape[2].value, + w_shape[3].value, + y_shape[0], + y_shape[1], + y_shape[2], + y_shape[3]) + + def conv2d_forward(x, w, stride_h=1, @@ -230,7 +294,7 @@ def conv2d_forward(x, dilation_w=1, conv_mode=1, tensor_format=0, - algo=0): + algo=-1): """Create an extern op that compute 2D convolution with CuDNN Parameters @@ -260,6 +324,7 @@ def conv2d_forward(x, 2: CUDNN_TENSOR_NCHW_VECT_C algo: int Forward algorithm, get index from ```algo_to_index``` function + if algo == -1, the best algo will be chosen by CUDNN Returns ------- @@ -275,6 +340,18 @@ def conv2d_forward(x, dilation_w, list(x.shape), list(w.shape)) + if algo == -1: + algo = conv2d_find_algo(tensor_format, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + list(x.shape), + list(w.shape), + oshape) + return _api.extern( oshape, [x, w], lambda ins, outs: _intrin.call_packed( diff --git a/src/contrib/cudnn/conv_forward.cc b/src/contrib/cudnn/conv_forward.cc index 480a789303c6..4cd25f0c2fe4 100644 --- a/src/contrib/cudnn/conv_forward.cc +++ b/src/contrib/cudnn/conv_forward.cc @@ -153,7 +153,103 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.output_shape") static_cast(out_shape) + 1, static_cast(out_shape) + 2, static_cast(out_shape) + 3)); - }); +}); + + +TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.find_algo") 
+.set_body([](TVMArgs args, TVMRetValue *ret) { + CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal(); + int format = args[0]; + int pad_h = args[1]; + int pad_w = args[2]; + int stride_h = args[3]; + int stride_w = args[4]; + int dilation_h = args[5]; + int dilation_w = args[6]; + int x_dim0 = args[7]; + int x_dim1 = args[8]; + int x_dim2 = args[9]; + int x_dim3 = args[10]; + int w_dim0 = args[11]; + int w_dim1 = args[12]; + int w_dim2 = args[13]; + int w_dim3 = args[14]; + int y_dim0 = args[15]; + int y_dim1 = args[16]; + int y_dim2 = args[17]; + int y_dim3 = args[18]; + + // Set Format + entry_ptr->conv_entry.tensor_format = static_cast(format); + // conv desc + CUDNN_CALL(cudnnSetConvolution2dDescriptor(entry_ptr->conv_entry.conv_desc, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + CUDNN_CROSS_CORRELATION, + entry_ptr->conv_entry.data_type)); + // input desc + CUDNN_CALL(cudnnSetTensor4dDescriptor(entry_ptr->conv_entry.input_desc, + entry_ptr->conv_entry.tensor_format, + CUDNN_DATA_FLOAT, + x_dim0, + x_dim1, + x_dim2, + x_dim3)); + // filter desc + CUDNN_CALL(cudnnSetFilter4dDescriptor(entry_ptr->conv_entry.filter_desc, + CUDNN_DATA_FLOAT, + CUDNN_TENSOR_NCHW, + w_dim0, + w_dim1, + w_dim2, + w_dim3)); + + // output desc + CUDNN_CALL(cudnnSetTensor4dDescriptor(entry_ptr->conv_entry.output_desc, + entry_ptr->conv_entry.tensor_format, + entry_ptr->conv_entry.data_type, + y_dim0, + y_dim1, + y_dim2, + y_dim3)); + + int returned_algo_count = 0; + cudnnConvolutionFwdAlgoPerf_t perf_results[CUDNN_CONVOLUTION_FWD_ALGO_COUNT]; + CUDNN_CALL(cudnnFindConvolutionForwardAlgorithm(entry_ptr->handle, + entry_ptr->conv_entry.input_desc, + entry_ptr->conv_entry.filter_desc, + entry_ptr->conv_entry.conv_desc, + entry_ptr->conv_entry.output_desc, + CUDNN_CONVOLUTION_FWD_ALGO_COUNT, + &returned_algo_count, + perf_results)); + + const std::vector fwd_algo_names{ + "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM", + "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM", + "CUDNN_CONVOLUTION_FWD_ALGO_GEMM", + "CUDNN_CONVOLUTION_FWD_ALGO_DIRECT", + "CUDNN_CONVOLUTION_FWD_ALGO_FFT", + "CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING", + "CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD", + "CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED" + }; + + auto best_algo = perf_results[0].algo; + LOG(INFO) << "\tCUDNN Found " << returned_algo_count + << " fwd algorithms, choosing " << fwd_algo_names[best_algo]; + for (int i = 0; i < returned_algo_count; ++i) { + LOG(INFO) << "\t\t" << i << ") " << fwd_algo_names[perf_results[i].algo] + << " - time: " << perf_results[i].time << " ms" + << ", Memory: " << perf_results[i].memory; + } + + ret[0] = best_algo; +}); } // namespace contrib } // namespace tvm diff --git a/topi/python/topi/cuda/conv2d.py b/topi/python/topi/cuda/conv2d.py index 62b5642abbab..2641bfe490c4 100644 --- a/topi/python/topi/cuda/conv2d.py +++ b/topi/python/topi/cuda/conv2d.py @@ -56,7 +56,7 @@ def conv2d_cuda(data, kernel, stride, padding, layout='NCHW', out_dtype='float32 1, # dilation_w conv_mode=1, tensor_format=tensor_format, - algo=0) + algo=-1) # let CUDNN choose the best algo elif layout == 'NCHW': return topi.nn.conv2d_nchw(data, kernel, stride, padding, out_dtype) elif layout == 'HWCN': From 8ecef3d5ac180c9b78497db058e134d2b2ca654b Mon Sep 17 00:00:00 2001 From: xqdan Date: Fri, 29 Dec 2017 16:56:27 +0800 Subject: [PATCH 065/948] enable partition const loop with build flag (#732) * [SCHEDULE]enable partition const loop with build flag (#719) * enable partition loop with build flag * add a 
testcase, and modify LoopPartition related cases * * add document for split_const_loop --- include/tvm/build_module.h | 3 ++ include/tvm/ir_pass.h | 3 +- python/tvm/build_module.py | 6 +++- src/api/api_pass.cc | 2 +- src/codegen/build_module.cc | 2 +- src/pass/loop_partition.cc | 13 ++++---- tests/python/unittest/test_codegen_device.py | 2 +- .../unittest/test_pass_loop_partition.py | 30 ++++++++++++++----- 8 files changed, 44 insertions(+), 17 deletions(-) diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h index a1563e8e7447..75062e819748 100644 --- a/include/tvm/build_module.h +++ b/include/tvm/build_module.h @@ -116,6 +116,9 @@ struct BuildConfig { /*! \brief Whether to detect global barrier */ bool detect_global_barrier = false; + /*! \brief Whether to partition const loop */ + bool partition_const_loop = false; + BuildConfig() { } }; diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h index e763a75e7ee0..525b36d1b6b1 100644 --- a/include/tvm/ir_pass.h +++ b/include/tvm/ir_pass.h @@ -289,9 +289,10 @@ Stmt StorageRewrite(Stmt stmt); /*! * \brief partition loops in the stmt * \param stmt The stmt to do loop partition + * \param split_const_loop flag to enable partition for const loop * \return Transformed stmt. */ -Stmt LoopPartition(Stmt stmt); +Stmt LoopPartition(Stmt stmt, bool split_const_loop); /*! * \brief Detect and insert sync points to co-processor. diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index 083074a6676e..fe6b01bb4d8c 100644 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -32,6 +32,7 @@ class BuildConfig(object): "auto_unroll_max_extent": 0, "unroll_explicit": True, "detect_global_barrier": False, + "partition_const_loop": False, "offset_factor": 0, "data_alignment": -1, "restricted_func": True, @@ -88,6 +89,9 @@ def build_config(**kwargs): detect_global_barrier: bool, default=True Whether detect global barrier. + partition_const_loop: bool, default=False + Whether partition const loop + data_alignment: int, optional The alignment of data pointer in bytes. If -1 is passed, the alignment will be set to TVM's internal default. 
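For context on the partition_const_loop option documented above: it is read from the active BuildConfig during lowering, so enabling it from user code is a one-line change. A minimal sketch for illustration only, assuming the with-scope form of tvm.build_config and the 0.x tvm.lower API; the tensors and names here are hypothetical and not part of this patch:

    import tvm

    n = 21
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute((n,), lambda i: A[i] + 1.0, name='B')
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=4)

    # With the flag on, LoopPartition also considers this constant-extent loop,
    # so the boundary iterations can be peeled off rather than guarded by a
    # likely() condition (compare test_const_loop later in this patch).
    with tvm.build_config(partition_const_loop=True):
        f = tvm.lower(s, [A, B], name='vecadd')
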
@@ -219,7 +223,7 @@ def lower(sch, stmt = f(stmt) # Phase 2 if not simple_mode: - stmt = ir_pass.LoopPartition(stmt) + stmt = ir_pass.LoopPartition(stmt, cfg.partition_const_loop) stmt = ir_pass.VectorizeLoop(stmt) stmt = ir_pass.InjectVirtualThread(stmt) stmt = ir_pass.InjectDoubleBuffer(stmt, cfg.double_buffer_split_loop) diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc index 23deb03af482..06c6b621abde 100644 --- a/src/api/api_pass.cc +++ b/src/api/api_pass.cc @@ -119,7 +119,7 @@ REGISTER_PASS1(LowerStorageAccessInfo); REGISTER_PASS1(InjectVirtualThread); REGISTER_PASS1(InjectPrefetch); REGISTER_PASS2(InjectDoubleBuffer); -REGISTER_PASS1(LoopPartition); +REGISTER_PASS2(LoopPartition); REGISTER_PASS1(RemoveNoOp); REGISTER_PASS2(SplitPipeline); REGISTER_PASS2(LiftAttrScope); diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index d936b873bcd2..2c419d43da08 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -208,7 +208,7 @@ Stmt BuildStmt(Schedule sch, stmt = ir::StorageFlatten(stmt, out_binds, 64); stmt = ir::CanonicalSimplify(stmt); if (loop_partition) { - stmt = ir::LoopPartition(stmt); + stmt = ir::LoopPartition(stmt, config.partition_const_loop); } stmt = ir::VectorizeLoop(stmt); stmt = ir::InjectVirtualThread(stmt); diff --git a/src/pass/loop_partition.cc b/src/pass/loop_partition.cc index 0834fe6ab3df..ff6103cd6e02 100644 --- a/src/pass/loop_partition.cc +++ b/src/pass/loop_partition.cc @@ -45,10 +45,12 @@ bool ExprUseVars(Expr expr, const std::unordered_set& vars) { class CandidateSelector final : public IRVisitor { public: using VarIsUsed = bool; - CandidateSelector() {} + explicit CandidateSelector(bool split_const_loop) + : split_const_loop_(split_const_loop) {} void Visit_(const For* op) { - if (!is_const(op->min) || !is_const(op->extent)) { + // partition const loop when sets split_const_loop_ + if (!is_const(op->min) || !is_const(op->extent) || split_const_loop_) { const Variable* var = op->loop_var.get(); record_.insert({var, false}); IRVisitor::Visit_(op); @@ -67,7 +69,7 @@ class CandidateSelector final : public IRVisitor { CHECK(iv); Var var = iv->var; runtime::ThreadScope scope = runtime::ThreadScope::make(iv->thread_tag); - if ((scope.rank == 0) && !is_const(op->value)) { + if ((scope.rank == 0) && (!is_const(op->value) || split_const_loop_)) { record_.insert({var.get(), false}); IRVisitor::Visit_(op); if (record_.at(var.get()) && !no_split_) { @@ -115,6 +117,7 @@ class CandidateSelector final : public IRVisitor { private: bool in_likely_{false}; bool no_split_{false}; + bool split_const_loop_{false}; std::unordered_map record_; }; @@ -392,8 +395,8 @@ class RemoveLikelyTags : public IRMutator { } }; -Stmt LoopPartition(Stmt stmt) { - CandidateSelector selector; +Stmt LoopPartition(Stmt stmt, bool split_const_loop) { + CandidateSelector selector(split_const_loop); selector.Visit(stmt); stmt = LoopPartitioner(selector.candidates).Mutate(stmt); stmt = RemoveLikelyTags().Mutate(stmt); diff --git a/tests/python/unittest/test_codegen_device.py b/tests/python/unittest/test_codegen_device.py index 56e3fc81910f..773c2c890ef3 100644 --- a/tests/python/unittest/test_codegen_device.py +++ b/tests/python/unittest/test_codegen_device.py @@ -27,7 +27,7 @@ def test_add_pipeline(): Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') Db = tvm.decl_buffer(D.shape, D.dtype, name='D') - stmt = tvm.ir_pass.LoopPartition(stmt) + stmt = tvm.ir_pass.LoopPartition(stmt, False) stmt = 
tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B:Bb, D:Db}, 64) stmt = tvm.ir_pass.Simplify(stmt) fapi = tvm.ir_pass.MakeAPI(stmt, "myadd", [Ab, Bb, Db], 0, True) diff --git a/tests/python/unittest/test_pass_loop_partition.py b/tests/python/unittest/test_pass_loop_partition.py index a1f337b40214..a1025e1f662c 100644 --- a/tests/python/unittest/test_pass_loop_partition.py +++ b/tests/python/unittest/test_pass_loop_partition.py @@ -19,7 +19,7 @@ def lower(sch, args): sch = sch.normalize() bounds = tvm.schedule.InferBound(sch) stmt = tvm.schedule.ScheduleOps(sch, bounds) - stmt = tvm.ir_pass.LoopPartition(stmt) + stmt = tvm.ir_pass.LoopPartition(stmt, False) stmt = tvm.ir_pass.StorageFlatten(stmt, binds, 64) stmt = tvm.ir_pass.CanonicalSimplify(stmt) stmt = tvm.ir_pass.VectorizeLoop(stmt) @@ -37,7 +37,22 @@ def test_basic(): bounds = tvm.schedule.InferBound(s) stmt = tvm.schedule.ScheduleOps(s, bounds) - stmt = tvm.ir_pass.LoopPartition(stmt) + stmt = tvm.ir_pass.LoopPartition(stmt, False) + stmt = tvm.ir_pass.Simplify(stmt) + assert('if' not in str(stmt.body.body.body.first)) + +def test_const_loop(): + n = 21 + A = tvm.placeholder((n, ), name='A') + B = tvm.placeholder((n, ), name='B') + + T = tvm.compute((n, ), lambda i: A[i]+B[i]) + s = tvm.create_schedule(T.op) + xo, xi = s[T].split(T.op.axis[0], factor=4) + + bounds = tvm.schedule.InferBound(s) + stmt = tvm.schedule.ScheduleOps(s, bounds) + stmt = tvm.ir_pass.LoopPartition(stmt, True) stmt = tvm.ir_pass.Simplify(stmt) assert('if' not in str(stmt.body.body.body.first)) @@ -53,7 +68,7 @@ def test_multi_loop(): with ib.else_scope(): ib.emit(tvm.make.Evaluate(n)) stmt = ib.get() - stmt = tvm.ir_pass.LoopPartition(stmt) + stmt = tvm.ir_pass.LoopPartition(stmt, False) stmt = tvm.ir_pass.Simplify(stmt) assert(not any(collect_visit(stmt.body.first, lambda x: isinstance(x, tvm.stmt.IfThenElse)))) @@ -73,7 +88,7 @@ def test_multi_if(): with ib.else_scope(): ib.emit(tvm.make.Evaluate(n)) stmt = ib.get() - stmt = tvm.ir_pass.LoopPartition(stmt) + stmt = tvm.ir_pass.LoopPartition(stmt, False) stmt = tvm.ir_pass.Simplify(stmt) assert('if' not in str(stmt.body.first)) @@ -92,7 +107,7 @@ def test_thread_axis(): bounds = tvm.schedule.InferBound(s) stmt = tvm.schedule.ScheduleOps(s, bounds) - stmt = tvm.ir_pass.LoopPartition(stmt) + stmt = tvm.ir_pass.LoopPartition(stmt, False) stmt = tvm.ir_pass.Simplify(stmt) assert('if' not in str(stmt.body.body.body.first)) @@ -127,7 +142,7 @@ def test_select(): ib.emit(tvm.make.Evaluate( tvm.make.Select(ib.likely(i*4+j Date: Sun, 31 Dec 2017 21:57:23 +0800 Subject: [PATCH 066/948] [WEB] update web runtime to latest emcc (#742) --- Makefile | 6 ++++-- python/tvm/contrib/emscripten.py | 16 +++++++++------- web/tvm_runtime.js | 12 +++++++++--- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index 4a6ac7f3c8f3..7b9f99d8ebdb 100644 --- a/Makefile +++ b/Makefile @@ -29,8 +29,10 @@ INCLUDE_FLAGS = -Iinclude -I$(DLPACK_PATH)/include -I$(DMLC_CORE_PATH)/include - CFLAGS = -std=c++11 -Wall -O2 $(INCLUDE_FLAGS) -fPIC FRAMEWORKS = OBJCFLAGS = -fno-objc-arc -EMCC_FLAGS= -s RESERVED_FUNCTION_POINTERS=2 -s NO_EXIT_RUNTIME=1 -s MAIN_MODULE=1 -DDMLC_LOG_STACK_TRACE=0\ - -std=c++11 -Oz $(INCLUDE_FLAGS) +EMCC_FLAGS= -std=c++11 -DDMLC_LOG_STACK_TRACE=0\ + -Oz -s RESERVED_FUNCTION_POINTERS=2 -s MAIN_MODULE=1 -s NO_EXIT_RUNTIME=1\ + -s EXTRA_EXPORTED_RUNTIME_METHODS="['cwrap','getValue','setValue','addFunction']"\ + $(INCLUDE_FLAGS) # llvm configuration ifdef LLVM_CONFIG diff --git 
a/python/tvm/contrib/emscripten.py b/python/tvm/contrib/emscripten.py index d747b5f039f8..d770ce1161f1 100644 --- a/python/tvm/contrib/emscripten.py +++ b/python/tvm/contrib/emscripten.py @@ -26,13 +26,16 @@ def create_js(output, The compile string. """ cmd = [cc] - cmd += ["-s", "RESERVED_FUNCTION_POINTERS=2"] - cmd += ["-s", "NO_EXIT_RUNTIME=1"] cmd += ["-Oz"] - cmd += ["-o", output] - if side_module: + if not side_module: + cmd += ["-s", "RESERVED_FUNCTION_POINTERS=2"] + cmd += ["-s", "NO_EXIT_RUNTIME=1"] + extra_methods = ['cwrap', 'getValue', 'setValue', 'addFunction'] + cfg = "[" + (','.join("\'%s\'" % x for x in extra_methods)) + "]" + cmd += ["-s", "EXTRA_EXPORTED_RUNTIME_METHODS=" + cfg] + else: cmd += ["-s", "SIDE_MODULE=1"] - + cmd += ["-o", output] objects = [objects] if isinstance(objects, str) else objects with_runtime = False for obj in objects: @@ -47,9 +50,8 @@ def create_js(output, if options: cmd += options - args = ' '.join(cmd) proc = subprocess.Popen( - args, shell=True, + cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) (out, _) = proc.communicate() diff --git a/web/tvm_runtime.js b/web/tvm_runtime.js index 288f6e16014b..f69b1d57a11d 100644 --- a/web/tvm_runtime.js +++ b/web/tvm_runtime.js @@ -493,7 +493,8 @@ var tvm_runtime = tvm_runtime || {}; } var fptrInvokeCallback = null; var fptrFreeCallback = null; - if (typeof Runtime !== "undefined") { + if (typeof Runtime !== "undefined" && + typeof Runtime.addFunction !== "undefined") { fptrInvokeCallback = Runtime.addFunction(invokeCallback); fptrFreeCallback = Runtime.addFunction(freeCallback); } @@ -513,7 +514,8 @@ var tvm_runtime = tvm_runtime || {}; */ this.convertFunc = function(f) { if (isPackedFunc(f)) return f; - CHECK(fptrInvokeCallback !== null, "Emscripten Runtime is not available"); + CHECK(fptrInvokeCallback !== null, + "Emscripten Runtime addFunction is not available"); var fid; if (freeFuncId.length != 0) { fid = freeFuncId.pop(); @@ -1086,7 +1088,11 @@ var tvm_runtime = tvm_runtime || {}; this.create = function(Module) { var tvm = {}; tvm.Module = Module; - tvm.Runtime = Module.Runtime; + if (typeof Module.addFunction !== "undefined") { + tvm.Runtime = Module; + } else { + tvm.Runtime = Module.Runtime; + } TVMRuntime.apply(tvm); return tvm; }; From 86d5de6a166922d1422d24d252d9190f94a32176 Mon Sep 17 00:00:00 2001 From: xqdan Date: Sun, 31 Dec 2017 21:58:20 +0800 Subject: [PATCH 067/948] Support automatically Name Loop Variable in IRBuilder (#716) (#741) * [SCHEDULE]enable partition const loop with build flag (#719) * enable partition loop with build flag * add a testcase, and modify LoopPartition related cases * * add document for split_const_loop * [IRbuild]Support automatically Name Loop Variable in IRBuilder (#719) * add idx_num in class * using typical index [i, j, k] first, then i_suffix * keep inputs names * fix lint * improve comment of name * fix lint --- python/tvm/ir_builder.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/tvm/ir_builder.py b/python/tvm/ir_builder.py index 1888cd9f1d18..dcf5b9d0976b 100644 --- a/python/tvm/ir_builder.py +++ b/python/tvm/ir_builder.py @@ -97,6 +97,7 @@ class IRBuilder(object): """ def __init__(self): self._seq_stack = [[]] + self.nidx = 0 def _pop_seq(self): """Pop sequence from stack""" @@ -167,7 +168,8 @@ def for_range(self, begin, end, name="i", dtype="int32", for_type="serial"): The end iteration scope name : str, optional - The name of iteration variable + The name of iteration variable, if no input names, + using 
typical index names i, j, k, then i_nidx dtype : str, optional The data type of iteration variable. @@ -189,6 +191,9 @@ def for_range(self, begin, end, name="i", dtype="int32", for_type="serial"): with ib.for_range(1, 10, name="i") as i: x[i] = x[i - 1] + 1 """ + if name == 'i': + name = chr(ord(name) + self.nidx) if self.nidx < 3 else name + "_" + str(self.nidx - 3) + self.nidx += 1 self._seq_stack.append([]) loop_var = _api.var(name, dtype=dtype) extent = end if begin == 0 else _pass.Simplify(end - begin) From 94128c7c766a1462c5dff7e645017e18fce6ecf5 Mon Sep 17 00:00:00 2001 From: masahi Date: Tue, 2 Jan 2018 22:52:53 +0900 Subject: [PATCH 068/948] [CONTRIB] cuBLAS integration (#744) * add cublas support * integrate cublas to topi dense * add cublas error check * minor fix * fix lint * remove topi import from contrib unittest --- Makefile | 1 + make/config.mk | 3 ++ make/contrib/cublas.mk | 8 +++ python/tvm/contrib/cblas.py | 2 +- python/tvm/contrib/cublas.py | 32 ++++++++++++ python/tvm/target.py | 3 +- src/contrib/cblas/cblas.cc | 2 +- src/contrib/cublas/cublas.cc | 81 +++++++++++++++++++++++++++++ tests/python/contrib/test_cublas.py | 33 ++++++++++++ tests/python/contrib/test_cudnn.py | 3 +- topi/python/topi/cuda/__init__.py | 2 +- topi/python/topi/cuda/dense.py | 43 +++++++++++++++ topi/python/topi/nn/dense.py | 28 ++++++++-- 13 files changed, 232 insertions(+), 9 deletions(-) create mode 100644 make/contrib/cublas.mk create mode 100644 python/tvm/contrib/cublas.py create mode 100644 src/contrib/cublas/cublas.cc create mode 100644 tests/python/contrib/test_cublas.py diff --git a/Makefile b/Makefile index 7b9f99d8ebdb..875c99b8657d 100644 --- a/Makefile +++ b/Makefile @@ -138,6 +138,7 @@ include make/contrib/nnpack.mk include make/contrib/cudnn.mk include make/contrib/miopen.mk include make/contrib/mps.mk +include make/contrib/cublas.mk ifdef ADD_CFLAGS CFLAGS += $(ADD_CFLAGS) diff --git a/make/config.mk b/make/config.mk index 837db40ebff3..778d52025f92 100644 --- a/make/config.mk +++ b/make/config.mk @@ -77,3 +77,6 @@ USE_MIOPEN = 0 # Whether use MPS USE_MPS = 0 + +# Whether use cuBLAS +USE_CUBLAS = 0 diff --git a/make/contrib/cublas.mk b/make/contrib/cublas.mk new file mode 100644 index 000000000000..8274dc0bb378 --- /dev/null +++ b/make/contrib/cublas.mk @@ -0,0 +1,8 @@ +CUBLAS_CONTRIB_SRC = $(wildcard src/contrib/cublas/*.cc) +CUBLAS_CONTRIB_OBJ = $(patsubst src/%.cc, build/%.o, $(CUBLAS_CONTRIB_SRC)) + +ifeq ($(USE_CUBLAS), 1) +CFLAGS += -DTVM_USE_CUBLAS=1 +ADD_LDFLAGS += -lcublas +RUNTIME_DEP += $(CUBLAS_CONTRIB_OBJ) +endif diff --git a/python/tvm/contrib/cblas.py b/python/tvm/contrib/cblas.py index ae7b48d82f37..17af941449ea 100644 --- a/python/tvm/contrib/cblas.py +++ b/python/tvm/contrib/cblas.py @@ -1,4 +1,4 @@ -"""External function interface to BLAS libraroes.""" +"""External function interface to BLAS libraries.""" from __future__ import absolute_import as _abs from .. import api as _api diff --git a/python/tvm/contrib/cublas.py b/python/tvm/contrib/cublas.py new file mode 100644 index 000000000000..eda09fead359 --- /dev/null +++ b/python/tvm/contrib/cublas.py @@ -0,0 +1,32 @@ +"""External function interface to cuBLAS libraries.""" +from __future__ import absolute_import as _abs + +from .. import api as _api +from .. 
import intrin as _intrin + +def matmul(lhs, rhs, transa=False, transb=False): + """Create an extern op that compute matrix mult of A and rhs with cuBLAS + + Parameters + ---------- + lhs : Tensor + The left matrix operand + rhs : Tensor + The right matrix operand + transa : bool + Whether transpose lhs + transb : bool + Whether transpose rhs + + Returns + ------- + C : Tensor + The result tensor. + """ + n = lhs.shape[1] if transa else lhs.shape[0] + m = rhs.shape[0] if transb else rhs.shape[1] + return _api.extern( + (n, m), [lhs, rhs], + lambda ins, outs: _intrin.call_packed( + "tvm.contrib.cublas.matmul", + ins[0], ins[1], outs[0], transa, transb), name="C") diff --git a/python/tvm/target.py b/python/tvm/target.py index 092dbc0d1165..8f5f4bc87852 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -94,7 +94,8 @@ def __init__(self, # Parse device option for item in self.options: if item.startswith("-libs="): - self.libs.append(item.split("=")[1]) + libs = item.split("=")[1] + self.libs += libs.split(",") elif item.startswith("-device="): self.device_name = item.split("=")[1] # Target query searchs device name first diff --git a/src/contrib/cblas/cblas.cc b/src/contrib/cblas/cblas.cc index 9ce85ae4fd95..c2e981ba5d89 100644 --- a/src/contrib/cblas/cblas.cc +++ b/src/contrib/cblas/cblas.cc @@ -38,7 +38,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cblas.matmul") transa ? CblasTrans : CblasNoTrans, transb ? B->shape[0] : B->shape[1], transa ? A->shape[1] : A->shape[0], - transa ? B->shape[1] : B->shape[0], + transb ? B->shape[1] : B->shape[0], 1.0f, reinterpret_cast(static_cast(B->data) + B->byte_offset), B->shape[1], diff --git a/src/contrib/cublas/cublas.cc b/src/contrib/cublas/cublas.cc new file mode 100644 index 000000000000..4171aadf6381 --- /dev/null +++ b/src/contrib/cublas/cublas.cc @@ -0,0 +1,81 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file Use external cblas library call. + */ +#include +#include +#include + +extern "C" { +#include +} + +namespace tvm { +namespace contrib { + +using namespace runtime; + +#ifndef CHECK_CUBLAS_ERROR +#define CHECK_CUBLAS_ERROR(error) \ +if (error != CUBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "cuBLAS error: "); \ + if (error == CUBLAS_STATUS_NOT_INITIALIZED) fprintf(stderr, "CUBLAS_STATUS_NOT_INITIALIZED"); \ + if (error == CUBLAS_STATUS_ALLOC_FAILED) fprintf(stderr, "CUBLAS_STATUS_ALLOC_FAILED"); \ + if (error == CUBLAS_STATUS_INVALID_VALUE) fprintf(stderr, "CUBLAS_STATUS_INVALID_VALUE"); \ + if (error == CUBLAS_STATUS_ARCH_MISMATCH) fprintf(stderr, "CUBLAS_STATUS_ARCH_MISMATCH"); \ + if (error == CUBLAS_STATUS_MAPPING_ERROR) fprintf(stderr, "CUBLAS_STATUS_MAPPING_ERROR"); \ + if (error == CUBLAS_STATUS_EXECUTION_FAILED) fprintf(stderr, "CUBLAS_STATUS_EXECUTION_FAILED"); \ + if (error == CUBLAS_STATUS_INTERNAL_ERROR) fprintf(stderr, "CUBLAS_STATUS_INTERNAL_ERROR"); \ + if (error == CUBLAS_STATUS_NOT_SUPPORTED) fprintf(stderr, "CUBLAS_STATUS_NOT_SUPPORTED"); \ + if (error == CUBLAS_STATUS_LICENSE_ERROR) fprintf(stderr, "CUBLAS_STATUS_LICENSE_ERROR"); \ + fprintf(stderr, "\n"); \ + exit(EXIT_FAILURE); \ +} +#endif + +// matrix multiplication for row major +TVM_REGISTER_GLOBAL("tvm.contrib.cublas.matmul") +.set_body([](TVMArgs args, TVMRetValue *ret) { + DLTensor* A = args[0]; + DLTensor* B = args[1]; + DLTensor* C = args[2]; + bool transa = args[3]; + bool transb = args[4]; + // call gemm for simple compact code. 
+ CHECK_EQ(A->ndim, 2); + CHECK_EQ(B->ndim, 2); + CHECK_EQ(C->ndim, 2); + CHECK(C->strides == nullptr); + CHECK(B->strides == nullptr); + CHECK(A->strides == nullptr); + CHECK(TypeMatch(A->dtype, kDLFloat, 32)); + CHECK(TypeMatch(B->dtype, kDLFloat, 32)); + CHECK(TypeMatch(C->dtype, kDLFloat, 32)); + + cublasHandle_t handle; + CHECK_CUBLAS_ERROR(cublasCreate(&handle)); + float alpha = 1.0; + float beta = 0.0; + float *A_ptr = reinterpret_cast(static_cast(B->data) + B->byte_offset); + float *B_ptr = reinterpret_cast(static_cast(A->data) + A->byte_offset); + float *C_ptr = reinterpret_cast(static_cast(C->data) + C->byte_offset); + + CHECK_CUBLAS_ERROR(cublasSgemm(handle, + transb ? CUBLAS_OP_T : CUBLAS_OP_N, + transa ? CUBLAS_OP_T : CUBLAS_OP_N, + transb ? B->shape[0] : B->shape[1], + transa ? A->shape[1] : A->shape[0], + transb ? B->shape[1] : B->shape[0], + &alpha, + A_ptr, + B->shape[1], + B_ptr, + A->shape[1], + &beta, + C_ptr, + C->shape[1])); + + CHECK_CUBLAS_ERROR(cublasDestroy(handle)); +}); +} // namespace contrib +} // namespace tvm diff --git a/tests/python/contrib/test_cublas.py b/tests/python/contrib/test_cublas.py new file mode 100644 index 000000000000..c488c8c680e1 --- /dev/null +++ b/tests/python/contrib/test_cublas.py @@ -0,0 +1,33 @@ +import tvm +import numpy as np +from tvm.contrib import cublas + +def test_matmul_add(): + n = 1024 + l = 128 + m = 235 + A = tvm.placeholder((n, l), name='A') + B = tvm.placeholder((l, m), name='B') + C = cublas.matmul(A, B) + s = tvm.create_schedule(C.op) + + def verify(target="cuda"): + if not tvm.module.enabled(target): + print("skip because %s is not enabled..." % target) + return + if not tvm.get_global_func("tvm.contrib.cublas.matmul", True): + print("skip because extern function is not avalable") + return + ctx = tvm.gpu(0) + f = tvm.build(s, [A, B, C], target) + a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx) + c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) + f(a, b, c) + np.testing.assert_allclose( + c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5) + verify() + + +if __name__ == "__main__": + test_matmul_add() diff --git a/tests/python/contrib/test_cudnn.py b/tests/python/contrib/test_cudnn.py index 47561c57bcbd..c197f5c5b0c6 100644 --- a/tests/python/contrib/test_cudnn.py +++ b/tests/python/contrib/test_cudnn.py @@ -41,8 +41,7 @@ def test_conv2d(): tensor_format=0, algo=1) yshape = [x.value for x in Y.shape] - with tvm.target.create("cuda -libs=cudnn"): - s = tvm.create_schedule(Y.op) + s = tvm.create_schedule(Y.op) def verify(): ctx = tvm.gpu(0) diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index 314931b4cb42..f829a9895fd2 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -11,7 +11,7 @@ from .reduction import schedule_reduce from .softmax import schedule_softmax from .injective import schedule_injective, schedule_elemwise, schedule_broadcast -from .dense import schedule_dense +from .dense import dense_cuda, schedule_dense from .pooling import schedule_pool, schedule_global_pool from .conv2d_transpose_nchw import schedule_conv2d_transpose_nchw from .extern import schedule_extern diff --git a/topi/python/topi/cuda/dense.py b/topi/python/topi/cuda/dense.py index e7b142758295..6207c14220d0 100644 --- a/topi/python/topi/cuda/dense.py +++ b/topi/python/topi/cuda/dense.py @@ -2,9 +2,48 @@ """Schedule for dense operator""" from __future__ import 
absolute_import as _abs import tvm +from tvm.contrib import cublas +from ..nn.dense import dense, dense_default from .. import tag from .. import generic +@dense.register("cuda") +def dense_cuda(data, weight, bias=None): + """Dense operator for cuda backend. + + Parameters + ---------- + data : tvm.Tensor + 2-D with shape [batch, in_dim] + + weight : tvm.Tensor + 2-D with shape [out_dim, in_dim] + + bias : tvm.Tensor, optional + 1-D with shape [out_dim] + + Returns + ------- + output : tvm.Tensor + 2-D with shape [batch, out_dim] + """ + assert len(data.shape) == 2 and len(weight.shape) == 2, \ + "only support 2-dim dense" + if bias is not None: + assert len(bias.shape) == 1 + batch, in_dim = data.shape + out_dim, _ = weight.shape + target = tvm.target.current_target() + if "cublas" in target.libs: + matmul = cublas.matmul(data, weight, False, True) + if bias is not None: + matmul = tvm.compute((batch, out_dim), \ + lambda i, j: matmul[i, j] + bias[j], \ + tag=tag.BROADCAST) + return matmul + return dense_default(data, weight, bias) + + @generic.schedule_dense.register(["cuda", "gpu"]) def schedule_dense(outs): """Schedule for dense operator. @@ -20,6 +59,10 @@ def schedule_dense(outs): s: Schedule The computation schedule for dense. """ + target = tvm.target.current_target() + if target.target_name == "cuda" and "cublas" in target.libs: + return generic.schedule_extern(outs) + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) def _schedule(Dense): diff --git a/topi/python/topi/nn/dense.py b/topi/python/topi/nn/dense.py index 333692614bd1..11cc6097c250 100644 --- a/topi/python/topi/nn/dense.py +++ b/topi/python/topi/nn/dense.py @@ -3,9 +3,8 @@ import tvm from .. import tag - -def dense(data, weight, bias=None): - """Applies a linear transformation: :math:`Y = XW^T + b`. +def dense_default(data, weight, bias=None): + """The default implementation of dense in topi. Parameters ---------- @@ -38,3 +37,26 @@ def dense(data, weight, bias=None): lambda i, j: matmul[i, j] + bias[j], \ tag=tag.BROADCAST) return matmul + + +@tvm.target.generic_func +def dense(data, weight, bias=None): + """Applies a linear transformation: :math:`Y = XW^T + b`. 
+ + Parameters + ---------- + data : tvm.Tensor + 2-D with shape [batch, in_dim] + + weight : tvm.Tensor + 2-D with shape [out_dim, in_dim] + + bias : tvm.Tensor, optional + 1-D with shape [out_dim] + + Returns + ------- + output : tvm.Tensor + 2-D with shape [batch, out_dim] + """ + return dense_default(data, weight, bias) From 026f996de809715b874a65196f7c6eb34b8605c8 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 3 Jan 2018 11:56:08 +0800 Subject: [PATCH 069/948] [CODEGEN] fix & improments in codegen (#745) * [CODEGEN] update codegen for vector operation * update comment, fix for metal * fix some bugs in codegen * use 'restrict' in every argument * fix * fix --- src/codegen/codegen_c.cc | 18 ++++++++++++++++-- src/codegen/codegen_opencl.cc | 4 ++-- src/pass/ir_util.h | 4 ++-- src/pass/split_host_device.cc | 3 +++ 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc index 1f9890d9a271..05ba1f2f357b 100644 --- a/src/codegen/codegen_c.cc +++ b/src/codegen/codegen_c.cc @@ -228,7 +228,7 @@ void CodeGenC::RegisterHandleType(const Variable* buf_var, Type t) { void CodeGenC::PrintVecElemLoad(const std::string& vec, Type t, int i, std::ostream& os) { // NOLINT(*) - os << vec << ".s" << std::hex << i; + os << vec << ".s" << std::hex << i << std::dec; } void CodeGenC::PrintVecElemStore(const std::string& vec, @@ -236,7 +236,7 @@ void CodeGenC::PrintVecElemStore(const std::string& vec, const std::string& value) { this->PrintIndent(); stream << vec << ".s" << std::hex << i - << " = " << value << ";\n"; + << " = " << value << ";\n" << std::dec; } std::string CodeGenC::GetVecLoad( @@ -583,6 +583,13 @@ void CodeGenC::VisitExpr_(const Load* op, std::ostream& os) { // NOLINT(*) std::ostringstream value_temp; if (!HandleTypeMatch(op->buffer_var.get(), elem_type)) { value_temp << "(("; + if (op->buffer_var.get()->type.is_handle()) { + auto it = alloc_storage_scope_.find(op->buffer_var.get()); + if (it != alloc_storage_scope_.end()) { + PrintStorageScope(it->second, value_temp); + value_temp << ' '; + } + } PrintType(elem_type, value_temp); value_temp << "*)" << vid << ')'; } else { @@ -627,6 +634,13 @@ void CodeGenC::VisitStmt_(const Store* op) { Type elem_type = t.element_of(); if (!HandleTypeMatch(op->buffer_var.get(), elem_type)) { stream << "(("; + if (op->buffer_var.get()->type.is_handle()) { + auto it = alloc_storage_scope_.find(op->buffer_var.get()); + if (it != alloc_storage_scope_.end()) { + PrintStorageScope(it->second, stream); + stream << ' '; + } + } PrintType(elem_type, stream); stream << "*)" << vid << ')'; } else { diff --git a/src/codegen/codegen_opencl.cc b/src/codegen/codegen_opencl.cc index ccd164bcd9e0..d2133f85a08c 100644 --- a/src/codegen/codegen_opencl.cc +++ b/src/codegen/codegen_opencl.cc @@ -177,14 +177,14 @@ void CodeGenOpenCL::PrintStorageScope( void CodeGenOpenCL::VisitExpr_(const Broadcast* op, std::ostream& os) { // NOLINT(*) std::string v = PrintExpr(op->value); - os << '('; + os << "(("; PrintType(op->type, os); os << ")("; for (int i = 0; i < op->lanes; ++i) { if (i != 0) os << ", "; os << v; } - os << ')'; + os << "))"; } } // namespace codegen } // namespace tvm diff --git a/src/pass/ir_util.h b/src/pass/ir_util.h index 082d580a0e45..96a41b120e46 100644 --- a/src/pass/ir_util.h +++ b/src/pass/ir_util.h @@ -154,8 +154,8 @@ inline Type APIType(Type t) { inline int GetTempAllocaAlignment(Type type, int32_t const_size) { int align = runtime::kTempAllocaAlignment; if (const_size > 0) { - const_size = 
const_size * type.bits() * type.lanes() / 8; - while (align > const_size) { + int64_t const_s = static_cast(const_size) * type.bits() * type.lanes() / 8; + while (align > const_s) { align = align / 2; } } diff --git a/src/pass/split_host_device.cc b/src/pass/split_host_device.cc index 942e70339488..44e9753081db 100644 --- a/src/pass/split_host_device.cc +++ b/src/pass/split_host_device.cc @@ -191,6 +191,9 @@ class HostDeviceSplitter : public IRMutator { auto it = handle_data_type_.find(v.get()); if (it != handle_data_type_.end()) { n->handle_data_type.Set(v, it->second); + } else { + // int32 as a placeholder + n->handle_data_type.Set(v, make_const(UInt(32), 0)); } } } From 2335fdab80931ac4b187463c4920e1351d0f9f1e Mon Sep 17 00:00:00 2001 From: libing4752 Date: Thu, 4 Jan 2018 05:37:56 +0800 Subject: [PATCH 070/948] modified schedule_dataflow_rewrite.cc to fix Stale Tensor during Dataflow Rewrite #738 (#747) * modified schedule_dataflow_rewrite.cc to fix losing tensor problem * modified schedule_dataflow_rewrite.cc for lint scan * modified schedule_dataflow_rewrite.cc for lint scan * using tensor's value_index to index output of stage op --- src/schedule/schedule_dataflow_rewrite.cc | 4 +++- .../unittest/test_schedule_schedule_ops.py | 20 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/schedule/schedule_dataflow_rewrite.cc b/src/schedule/schedule_dataflow_rewrite.cc index d1a69ecf0203..b58df9d0481f 100644 --- a/src/schedule/schedule_dataflow_rewrite.cc +++ b/src/schedule/schedule_dataflow_rewrite.cc @@ -86,7 +86,9 @@ Tensor Schedule::cache_read(const Tensor& tensor, return tensor(Array(i.begin(), i.end())); }, os.str()); std::unordered_map vsub; - vsub[tensor] = cache; + Stage s = operator[](tensor->op); + Tensor sugar_tensor = s->op.output(tensor->value_index); + vsub[sugar_tensor] = cache; std::unordered_map vmap; for (Operation op : readers) { diff --git a/tests/python/unittest/test_schedule_schedule_ops.py b/tests/python/unittest/test_schedule_schedule_ops.py index a85db2a23e86..03b8dbf48c8c 100644 --- a/tests/python/unittest/test_schedule_schedule_ops.py +++ b/tests/python/unittest/test_schedule_schedule_ops.py @@ -182,6 +182,25 @@ def test_schedule_cache(): bounds = tvm.schedule.InferBound(s) stmt = tvm.schedule.ScheduleOps(s, bounds) +def test_schedule_middle_cache(): + m = tvm.var('m') + n = tvm.var('n') + A = tvm.placeholder((m, n), name='A') + B = tvm.placeholder((m, n), name='B') + + C = tvm.compute((m, n), lambda i, j: A(i, j) * B(i, j), name='C') + D = tvm.compute((m, n), lambda i, j: C(i , j) , name='D') + + s = tvm.create_schedule(D.op) + AA = s.cache_read(A, "local", readers=[C]) + BB = s.cache_read(B, "local", readers=[C]) + CC = s.cache_read(C, "local", readers=[D]) + DD = s.cache_write(D, "local") + #s[AA].compute_at(s[CC], CC.op.axis[0]) + bounds = tvm.schedule.InferBound(s) + stmt = tvm.schedule.ScheduleOps(s, bounds) + + def test_schedule_cache_relayout1(): m = tvm.var('m') @@ -231,6 +250,7 @@ def test_schedule_cache_relayout3(): if __name__ == "__main__": + test_schedule_middle_cache() test_inline_multi_reduce() test_schedule_cache_relayout3() test_schedule_cache_relayout2() From 1cd67cd9928875e1a572b8a7147b8ee335676f31 Mon Sep 17 00:00:00 2001 From: masahi Date: Thu, 4 Jan 2018 06:38:24 +0900 Subject: [PATCH 071/948] [CONTRIB] rocBLAS integration (#751) * rocblas integration * fix include * fix lint --- Makefile | 1 + make/config.mk | 3 ++ make/contrib/rocblas.mk | 8 +++ python/tvm/contrib/cblas.py | 2 +- 
python/tvm/contrib/rocblas.py | 32 ++++++++++++ src/contrib/rocblas/rocblas.cc | 76 ++++++++++++++++++++++++++++ tests/python/contrib/test_rocblas.py | 33 ++++++++++++ topi/python/topi/rocm/__init__.py | 1 + topi/python/topi/rocm/dense.py | 66 ++++++++++++++++++++++++ 9 files changed, 221 insertions(+), 1 deletion(-) create mode 100644 make/contrib/rocblas.mk create mode 100644 python/tvm/contrib/rocblas.py create mode 100644 src/contrib/rocblas/rocblas.cc create mode 100644 tests/python/contrib/test_rocblas.py create mode 100644 topi/python/topi/rocm/dense.py diff --git a/Makefile b/Makefile index 875c99b8657d..7f612f450cb0 100644 --- a/Makefile +++ b/Makefile @@ -139,6 +139,7 @@ include make/contrib/cudnn.mk include make/contrib/miopen.mk include make/contrib/mps.mk include make/contrib/cublas.mk +include make/contrib/rocblas.mk ifdef ADD_CFLAGS CFLAGS += $(ADD_CFLAGS) diff --git a/make/config.mk b/make/config.mk index 778d52025f92..256771ac3220 100644 --- a/make/config.mk +++ b/make/config.mk @@ -80,3 +80,6 @@ USE_MPS = 0 # Whether use cuBLAS USE_CUBLAS = 0 + +# Whether use rocBlas +USE_ROCBLAS = 0 diff --git a/make/contrib/rocblas.mk b/make/contrib/rocblas.mk new file mode 100644 index 000000000000..ae5663099d53 --- /dev/null +++ b/make/contrib/rocblas.mk @@ -0,0 +1,8 @@ +ROCBLAS_CONTRIB_SRC = $(wildcard src/contrib/rocblas/*.cc) +ROCBLAS_CONTRIB_OBJ = $(patsubst src/%.cc, build/%.o, $(ROCBLAS_CONTRIB_SRC)) + +ifeq ($(USE_ROCBLAS), 1) +CFLAGS += -DTVM_USE_ROCBLAS=1 +ADD_LDFLAGS += -lrocblas +RUNTIME_DEP += $(ROCBLAS_CONTRIB_OBJ) +endif diff --git a/python/tvm/contrib/cblas.py b/python/tvm/contrib/cblas.py index 17af941449ea..eb32cc490347 100644 --- a/python/tvm/contrib/cblas.py +++ b/python/tvm/contrib/cblas.py @@ -7,7 +7,7 @@ def matmul(lhs, rhs, transa=False, transb=False): """Create an extern op that compute matrix mult of A and rhs with CrhsLAS - This function serves as an example on how to calle external libraries. + This function serves as an example on how to call external libraries. Parameters ---------- diff --git a/python/tvm/contrib/rocblas.py b/python/tvm/contrib/rocblas.py new file mode 100644 index 000000000000..470cff662c4c --- /dev/null +++ b/python/tvm/contrib/rocblas.py @@ -0,0 +1,32 @@ +"""External function interface to rocBLAS libraries.""" +from __future__ import absolute_import as _abs + +from .. import api as _api +from .. import intrin as _intrin + +def matmul(lhs, rhs, transa=False, transb=False): + """Create an extern op that compute matrix mult of A and rhs with rocBLAS + + Parameters + ---------- + lhs : Tensor + The left matrix operand + rhs : Tensor + The right matrix operand + transa : bool + Whether transpose lhs + transb : bool + Whether transpose rhs + + Returns + ------- + C : Tensor + The result tensor. + """ + n = lhs.shape[1] if transa else lhs.shape[0] + m = rhs.shape[0] if transb else rhs.shape[1] + return _api.extern( + (n, m), [lhs, rhs], + lambda ins, outs: _intrin.call_packed( + "tvm.contrib.rocblas.matmul", + ins[0], ins[1], outs[0], transa, transb), name="C") diff --git a/src/contrib/rocblas/rocblas.cc b/src/contrib/rocblas/rocblas.cc new file mode 100644 index 000000000000..1dbf429461eb --- /dev/null +++ b/src/contrib/rocblas/rocblas.cc @@ -0,0 +1,76 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file Use external rocblas library call. 
+ */ +#include +#include +#include +#include "rocblas.h" + +namespace tvm { +namespace contrib { + +using namespace runtime; + +#ifndef CHECK_ROCBLAS_ERROR +#define CHECK_ROCBLAS_ERROR(error) \ +if (error != rocblas_status_success) { \ + fprintf(stderr, "rocBLAS error: "); \ + if (error == rocblas_status_invalid_handle) fprintf(stderr, "rocblas_status_invalid_handle"); \ + if (error == rocblas_status_not_implemented) fprintf(stderr, " rocblas_status_not_implemented"); \ + if (error == rocblas_status_invalid_pointer) fprintf(stderr, "rocblas_status_invalid_pointer"); \ + if (error == rocblas_status_invalid_size) fprintf(stderr, "rocblas_status_invalid_size"); \ + if (error == rocblas_status_memory_error) fprintf(stderr, "rocblas_status_memory_error"); \ + if (error == rocblas_status_internal_error) fprintf(stderr, "rocblas_status_internal_error"); \ + fprintf(stderr, "\n"); \ + exit(EXIT_FAILURE); \ +} +#endif + + +// matrix multiplication for row major +TVM_REGISTER_GLOBAL("tvm.contrib.rocblas.matmul") +.set_body([](TVMArgs args, TVMRetValue *ret) { + DLTensor* A = args[0]; + DLTensor* B = args[1]; + DLTensor* C = args[2]; + bool transa = args[3]; + bool transb = args[4]; + // call gemm for simple compact code. + CHECK_EQ(A->ndim, 2); + CHECK_EQ(B->ndim, 2); + CHECK_EQ(C->ndim, 2); + CHECK(C->strides == nullptr); + CHECK(B->strides == nullptr); + CHECK(A->strides == nullptr); + CHECK(TypeMatch(A->dtype, kDLFloat, 32)); + CHECK(TypeMatch(B->dtype, kDLFloat, 32)); + CHECK(TypeMatch(C->dtype, kDLFloat, 32)); + + rocblas_handle handle; + CHECK_ROCBLAS_ERROR(rocblas_create_handle(&handle)); + float alpha = 1.0; + float beta = 0.0; + float *A_ptr = reinterpret_cast(static_cast(B->data) + B->byte_offset); + float *B_ptr = reinterpret_cast(static_cast(A->data) + A->byte_offset); + float *C_ptr = reinterpret_cast(static_cast(C->data) + C->byte_offset); + + CHECK_ROCBLAS_ERROR(rocblas_sgemm(handle, + transb ? rocblas_operation_transpose : rocblas_operation_none, + transa ? rocblas_operation_transpose : rocblas_operation_none, + transb ? B->shape[0] : B->shape[1], + transa ? A->shape[1] : A->shape[0], + transb ? B->shape[1] : B->shape[0], + &alpha, + A_ptr, + B->shape[1], + B_ptr, + A->shape[1], + &beta, + C_ptr, + C->shape[1])); + + CHECK_ROCBLAS_ERROR(rocblas_destroy_handle(handle)); +}); +} // namespace contrib +} // namespace tvm diff --git a/tests/python/contrib/test_rocblas.py b/tests/python/contrib/test_rocblas.py new file mode 100644 index 000000000000..46350f4d6625 --- /dev/null +++ b/tests/python/contrib/test_rocblas.py @@ -0,0 +1,33 @@ +import tvm +import numpy as np +from tvm.contrib import rocblas + +def test_matmul_add(): + n = 1024 + l = 128 + m = 235 + A = tvm.placeholder((n, l), name='A') + B = tvm.placeholder((l, m), name='B') + C = rocblas.matmul(A, B) + s = tvm.create_schedule(C.op) + + def verify(target="rocm"): + if not tvm.module.enabled(target): + print("skip because %s is not enabled..." 
% target) + return + if not tvm.get_global_func("tvm.contrib.rocblas.matmul", True): + print("skip because extern function is not avalable") + return + ctx = tvm.rocm(0) + f = tvm.build(s, [A, B, C], target) + a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx) + c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) + f(a, b, c) + np.testing.assert_allclose( + c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5) + verify() + + +if __name__ == "__main__": + test_matmul_add() diff --git a/topi/python/topi/rocm/__init__.py b/topi/python/topi/rocm/__init__.py index d2d7aaf0fd3d..3fddd53a3b36 100644 --- a/topi/python/topi/rocm/__init__.py +++ b/topi/python/topi/rocm/__init__.py @@ -3,3 +3,4 @@ from __future__ import absolute_import as _abs from .conv2d import * +from .dense import * diff --git a/topi/python/topi/rocm/dense.py b/topi/python/topi/rocm/dense.py new file mode 100644 index 000000000000..cfeed247a4a1 --- /dev/null +++ b/topi/python/topi/rocm/dense.py @@ -0,0 +1,66 @@ +# pylint: disable=invalid-name, unused-variable +"""Schedule for dense operator""" +from __future__ import absolute_import as _abs +import tvm +from tvm.contrib import rocblas +import topi +from ..nn.dense import dense, dense_default +from .. import tag +from .. import generic + +@dense.register("rocm") +def dense_rocm(data, weight, bias=None): + """Dense operator for rocm backend. + + Parameters + ---------- + data : tvm.Tensor + 2-D with shape [batch, in_dim] + + weight : tvm.Tensor + 2-D with shape [out_dim, in_dim] + + bias : tvm.Tensor, optional + 1-D with shape [out_dim] + + Returns + ------- + output : tvm.Tensor + 2-D with shape [batch, out_dim] + """ + assert len(data.shape) == 2 and len(weight.shape) == 2, \ + "only support 2-dim dense" + if bias is not None: + assert len(bias.shape) == 1 + batch, in_dim = data.shape + out_dim, _ = weight.shape + target = tvm.target.current_target() + if "rocblas" in target.libs: + matmul = rocblas.matmul(data, weight, False, True) + if bias is not None: + matmul = tvm.compute((batch, out_dim), \ + lambda i, j: matmul[i, j] + bias[j], \ + tag=tag.BROADCAST) + return matmul + return dense_default(data, weight, bias) + + +@generic.schedule_dense.register(["rocm"]) +def schedule_dense(outs): + """Schedule for dense operator. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of dense + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for dense. 
+ """ + target = tvm.target.current_target() + if target.target_name == "rocm" and "rocblas" in target.libs: + return generic.schedule_extern(outs) + return topi.cuda.schedule_dense(outs) From 4cd0ce6ea7b500c62425ebf883ff15bfaacd2d2c Mon Sep 17 00:00:00 2001 From: Yizhi Liu Date: Wed, 3 Jan 2018 19:12:15 -0800 Subject: [PATCH 072/948] correct conv2d workload for resnet18 (#750) --- topi/tests/python/test_topi_conv2d_nchw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py index 86cfc9493cdc..99f8ac93fb83 100644 --- a/topi/tests/python/test_topi_conv2d_nchw.py +++ b/topi/tests/python/test_topi_conv2d_nchw.py @@ -56,7 +56,7 @@ def check_device(device): def test_conv2d_nchw(): # ResNet18 worklaods - verify_conv2d_nchw(1, 3, 224, 64, 7, 3, 2) + verify_conv2d_nchw(1, 3, 224, 64, 7, 2, 3) verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1) verify_conv2d_nchw(1, 64, 56, 64, 1, 1, 0) verify_conv2d_nchw(1, 64, 56, 128, 3, 2, 1) From f63eaac96459fd0a201f3d21c902b28881e32df9 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 4 Jan 2018 10:50:09 -0800 Subject: [PATCH 073/948] [CODEGEN] use charp for voidp (#753) * [CODEGEN] use charp for voidp * fx --- src/codegen/llvm/codegen_llvm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index d274af73ed82..ea1e24c277cd 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -599,7 +599,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { addrspace = llvm::dyn_cast( ptr->getType())->getAddressSpace(); } - return builder_->CreatePointerCast(ptr, t_void_->getPointerTo(addrspace)); + return builder_->CreatePointerCast(ptr, t_char_->getPointerTo(addrspace)); } else if (op->is_intrinsic(Call::reinterpret) && is_zero(op->args[0])) { return llvm::Constant::getNullValue(t_void_p_); } else if (op->is_intrinsic(intrinsic::tvm_handle_is_null)) { From d899f864c47308f02a502ac0fa5bbdfc05d1ae35 Mon Sep 17 00:00:00 2001 From: xqdan Date: Mon, 8 Jan 2018 03:47:46 +0800 Subject: [PATCH 074/948] [SCHEDULE]Improve bound deduce for loop partition (#743) (#755) * [SCHEDULE]enable partition const loop with build flag (#719) * enable partition loop with build flag * add a testcase, and modify LoopPartition related cases * * add document for split_const_loop * [IRbuild]Support automatically Name Loop Variable in IRBuilder (#719) * add idx_num in class * using typical index [i, j, k] first, then i_suffix * keep inputs names * fix lint * improve comment of name * fix lint * [SCHEDULE]Improve bound deduce for loop partition (#743) * add divided checking when deducing * related testcase * fix * * transform LE and GE first * remove is_equal * modify testcase for edge cases checking * * fix comment * * fix lint * * apply transformation form LT -> LE, GT -> GE * * fix lint * simplify code and testcase * add negative co-efficient case * More complicated cases * add testcase * simplify testcase * comment case for now * fix testcase --- src/arithmetic/bound_deducer.cc | 30 ++++++--- tests/python/unittest/test_arith_intset.py | 76 ++++++++++++++++++++-- 2 files changed, 92 insertions(+), 14 deletions(-) diff --git a/src/arithmetic/bound_deducer.cc b/src/arithmetic/bound_deducer.cc index 7a8c841025fa..c9779bbbe24d 100644 --- a/src/arithmetic/bound_deducer.cc +++ b/src/arithmetic/bound_deducer.cc @@ -128,13 +128,25 @@ class BoundDeducer: public IRVisitor { } // always 
use relax bound - result = result / operand + (is_greater ? 1 : -1); + bool divided = can_prove(result % operand == 0); + result = result / operand; + // since system will round down when not divided + // eg. 2/4 -> 0; -2/4 -> -1 + // no need fix for !is_greater: + // eg. a <= 2/4 -> a <= 0 + // eg. a <= 0/4 -> a <= 0 + // so just fix for not divided and is_greater + // eg. a >= 2/4 -> a >= 0 + 1 + // eg. a >= 0/4 -> a >= 0 + if (is_greater && !divided) { + result += 1; + } + Visit(left ? op->a : op->b); } Expr result; bool is_greater{true}; - bool is_equal{true}; bool success{true}; private: @@ -178,22 +190,20 @@ void BoundDeducer::Init() { void BoundDeducer::Transform() { if (const LT* op = expr_.as()) { is_greater = false; - is_equal = false; expr_ = op->a; - result = op->b; + // a < b -> a <= b - 1 + result = op->b - 1; } else if (const LE* op = expr_.as()) { is_greater = false; - is_equal = true; expr_ = op->a; result = op->b; } else if (const GT* op = expr_.as()) { is_greater = true; - is_equal = false; expr_ = op->a; - result = op->b; + // a > b -> a >= b + 1 + result = op->b + 1; } else if (const GE* op = expr_.as()) { is_greater = true; - is_equal = true; expr_ = op->a; result = op->b; } else { @@ -237,9 +247,9 @@ IntSet DeduceBound(Expr v, Expr e, if (!d.success) return IntSet::nothing(); Expr min = Interval::neg_inf, max = Interval::pos_inf; if (d.is_greater) { - min = d.is_equal ? d.result : d.result + 1; + min = d.result; } else { - max = d.is_equal ? d.result : d.result - 1; + max = d.result; } return IntSet::interval(min, max); } diff --git a/tests/python/unittest/test_arith_intset.py b/tests/python/unittest/test_arith_intset.py index c1b4daaada5b..78589cf3af0e 100644 --- a/tests/python/unittest/test_arith_intset.py +++ b/tests/python/unittest/test_arith_intset.py @@ -25,12 +25,17 @@ def test_deduce(): e0 = (-b)*a+c-d res0 = tvm.arith.DeduceBound(a, e0>=0, {b: b_s, c: c_s, d: d_s}, {}) - ans0 = (d-c)/(-b)+(-1) + ans0 = ((d - c) /(b*-1)) + assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0) + + e0 = d*a+c-d + res0 = tvm.arith.DeduceBound(a, e0>=0, {b: b_s, c: c_s, d: d_s}, {}) + ans0 = ((0-c)/d + 1) assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0) e1 = (a*4+b < c) res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {}) - ans1 = (c-b)/4+(-2) + ans1 = (((c - b) + -1)/4) assert str(tvm.ir_pass.Simplify(res1.max())) == str(ans1) e2 = (tvm.max(5, a * 4) < 0) @@ -59,14 +64,77 @@ def test_check(): # multiple compare operators res2 = tvm.arith.DeduceBound(a, (a+b>3)>c , {b: b_s, c: c_s}, {}) - assert res1.is_nothing() + assert res2.is_nothing() # multiple target variable res2 = tvm.arith.DeduceBound(a, a*2-a>b, {b: b_s}, {}) - assert res1.is_nothing() + assert res2.is_nothing() + +def test_deduce_basic(): + def test_basic(a1, a2, coff): + a = tvm.var('a') + b = tvm.var('b') + b_s = tvm.arith.intset_interval(a1, a2) + e0 = b + a*coff + 3 + + res1 = tvm.arith.DeduceBound(a, e0<17, {b: b_s}, {b: b_s}) + [x, y] = [res1.max(), b_s.max()] if coff > 0 else [res1.min(), b_s.min()] + assert (tvm.ir_pass.Simplify((x * coff + 3 + y) < 17)).value == 1 + + res1 = tvm.arith.DeduceBound(a, e0>17, {b: b_s}, {b: b_s}) + [x, y] = [res1.max(), b_s.max()] if coff < 0 else [res1.min(), b_s.min()] + assert (tvm.ir_pass.Simplify((x * coff + 3 + y) > 17)).value == 1 + + res1 = tvm.arith.DeduceBound(a, e0<=17, {b: b_s}, {b: b_s}) + [x, y] = [res1.max(), b_s.max()] if coff > 0 else [res1.min(), b_s.min()] + assert (tvm.ir_pass.Simplify((x * coff + 3 + y) <= 17)).value == 1 + + res1 = 
tvm.arith.DeduceBound(a, e0>=17, {b: b_s}, {b: b_s}) + [x, y] = [res1.max(), b_s.max()] if coff < 0 else [res1.min(), b_s.min()] + assert (tvm.ir_pass.Simplify((x * coff + 3 + y) >= 17)).value == 1 + + test_basic(0, 4, 4) + test_basic(1, 5, 4) + test_basic(2, 6, 4) + test_basic(0, 4, -4) + test_basic(1, 5, -4) + test_basic(2, 6, -4) + +def test_deduce_complex(): + def test_complex(a1, a2, coff): + a = tvm.var('a') + b = tvm.var('b') + b_s = tvm.arith.intset_interval(a1, a2) + e0 = (b*3 + a* coff) * 4 + + res1 = tvm.arith.DeduceBound(a, e0<63, {b: b_s}, {b: b_s}) + [t, x] = [res1.max(), b_s.max()] if coff > 0 else [res1.min(), b_s.min()] + assert (tvm.ir_pass.Simplify(((x*3 + t* coff) * 4) < 63)).value == 1 + + res1 = tvm.arith.DeduceBound(a, e0<=63, {b: b_s}, {b: b_s}) + [t, x] = [res1.max(), b_s.max()] if coff > 0 else [res1.min(), b_s.min()] + assert (tvm.ir_pass.Simplify(((x*3 + t* coff) * 4) <= 63)).value == 1 + + res1 = tvm.arith.DeduceBound(a, e0>63, {b: b_s}, {b: b_s}) + [t, x] = [res1.max(), b_s.max()] if coff < 0 else [res1.min(), b_s.min()] + assert (tvm.ir_pass.Simplify(((x*3 + t* coff) * 4) > 63)).value == 1 + + res1 = tvm.arith.DeduceBound(a, e0>=63, {b: b_s}, {b: b_s}) + [t, x] = [res1.max(), b_s.max()] if coff < 0 else [res1.min(), b_s.min()] + assert (tvm.ir_pass.Simplify(((x*3 + t* coff) * 4) >= 63)).value == 1 + + test_complex(0, 4, 4) + test_complex(0, 4, -4) + test_complex(2, 6, 4) + test_complex(0, 4, -4) + test_complex(1, 5, -4) + test_complex(2, 6, -4) if __name__ == "__main__": test_basic() test_vector() test_deduce() test_check() + test_deduce_basic() + test_deduce_complex() + From 67b14a22a0352688bd9da8f9c1a66d6d0ff8d19a Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 7 Jan 2018 16:46:01 -0800 Subject: [PATCH 075/948] [PASS] StorageRewrite Fold Inplace op storage when possible (#759) * [PASS] StorageRewrite Fold Inplace op storage when possible * update comment to fix typos --- include/tvm/ir.h | 6 + src/op/extern_op.cc | 2 +- src/pass/storage_rewrite.cc | 352 ++++++++++++++---- tests/python/unittest/test_codegen_extern.py | 1 + .../unittest/test_pass_storage_rewrite.py | 39 +- 5 files changed, 325 insertions(+), 75 deletions(-) diff --git a/include/tvm/ir.h b/include/tvm/ir.h index 95e01382dd98..989802326ae4 100644 --- a/include/tvm/ir.h +++ b/include/tvm/ir.h @@ -152,6 +152,12 @@ constexpr const char* coproc_scope = "coproc_scope"; constexpr const char* coproc_uop_scope = "coproc_uop_scope"; /*! \brief Mark the scope as volatile access for certain handle. */ constexpr const char* volatile_scope = "volatile_scope"; +/*! + * \brief Mark the scope as generated by extern primitive. + * such scope can contain arbitrary ir program and we need to be careful + * when make certain assumptions about the structure of the program. + */ +constexpr const char* extern_scope = "extern_scope"; /*! * \brief Mark the scope as when computation start to happen * This can hint some code generator to create a new function for compute. 
diff --git a/src/op/extern_op.cc b/src/op/extern_op.cc index 9b302f6e2504..e83f97b14652 100644 --- a/src/op/extern_op.cc +++ b/src/op/extern_op.cc @@ -130,7 +130,7 @@ Stmt ExternOpNode::BuildProvide( const Stage& stage, const std::unordered_map& dom_map) const { CHECK_EQ(stage->op.operator->(), this); - Stmt ret = this->body; + Stmt ret = AttrStmt::make(make_zero(Int(32)), attr::extern_scope, 0, this->body); auto f_push_bind = [&ret](Buffer buffer, Tensor tensor) { Array bind_spec; Array tuple; diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc index 9d47a64f8837..5e7abdda2112 100644 --- a/src/pass/storage_rewrite.cc +++ b/src/pass/storage_rewrite.cc @@ -41,27 +41,32 @@ class LinearAccessPatternFinder final : public IRVisitor { struct StmtEntry { // The statment const Node* stmt; - // Scope used for allocation. - StorageScope alloc_scope; + // The index in the linear_seq_ to point to end of the nested scope. + // This is only set to non-zero if stmt is a nested scope. + // if offset > 0, means this is the begin, the end entry is current_index + offset + // if offset < 0, means this is the end, the begin entry is current_index + offset + int64_t scope_pair_offset{0}; // The buffer variables this statment touched. std::vector touched; }; + // The scope of each allocation + struct AllocEntry { + // Scope used for allocation. + StorageScope storage_scope; + // scope level + size_t level{0}; + // allocation stmt + const Allocate* alloc{nullptr}; + }; - // Get linear access pattern. - std::vector GetLinearSeq(const Stmt& s) { - this->Visit(s); - return std::move(linear_seq_); - } void Visit_(const Allocate* op) final { size_t level = scope_.size(); const Variable* buf = op->buffer_var.get(); - CHECK(!alloc_scope_level_.count(buf)); - alloc_scope_level_[buf] = level; - StmtEntry e; - e.stmt = op; - e.alloc_scope = GetScope(buf); - e.touched.push_back(buf); - linear_seq_.emplace_back(std::move(e)); + auto it = alloc_info_.find(buf); + CHECK(it != alloc_info_.end()); + CHECK(it->second.alloc == nullptr); + it->second.alloc = op; + it->second.level = level; IRVisitor::Visit_(op); } void Visit_(const Store* op) final { @@ -70,9 +75,10 @@ class LinearAccessPatternFinder final : public IRVisitor { IRVisitor::Visit_(op); // Add write access. const Variable* buf = op->buffer_var.get(); - auto it = alloc_scope_level_.find(buf); - if (it != alloc_scope_level_.end()) { - scope_[it->second].touched.push_back(buf); + auto it = alloc_info_.find(buf); + if (it != alloc_info_.end() && it->second.alloc) { + CHECK_LT(it->second.level, scope_.size()); + scope_[it->second.level].touched.push_back(buf); } StmtEntry e = scope_.back(); scope_.pop_back(); @@ -96,11 +102,11 @@ class LinearAccessPatternFinder final : public IRVisitor { // Add write access. IRVisitor::Visit_(op); const Variable* buf = op->buffer_var.get(); - auto it = alloc_scope_level_.find(buf); - if (it != alloc_scope_level_.end()) { - CHECK_LT(it->second, scope_.size()) + auto it = alloc_info_.find(buf); + if (it != alloc_info_.end() && it->second.alloc) { + CHECK_LT(it->second.level, scope_.size()) << "Load memory in places other than store."; - scope_[it->second].touched.push_back(buf); + scope_[it->second.level].touched.push_back(buf); } } void Visit_(const Call* op) final { @@ -113,10 +119,11 @@ class LinearAccessPatternFinder final : public IRVisitor { } void Visit_(const Variable* buf) final { // Directly reference to the variable count as a read. 
- auto it = alloc_scope_level_.find(buf); - if (it != alloc_scope_level_.end()) { - CHECK_LT(it->second, scope_.size()) << " buf=" << buf->name_hint; - scope_[it->second].touched.push_back(buf); + auto it = alloc_info_.find(buf); + if (it != alloc_info_.end() && it->second.alloc) { + CHECK_LT(it->second.level, scope_.size()) + << " buf=" << buf->name_hint; + scope_[it->second.level].touched.push_back(buf); } } template @@ -124,13 +131,20 @@ class LinearAccessPatternFinder final : public IRVisitor { scope_.push_back(StmtEntry()); StmtEntry e; e.stmt = op; + int64_t begin_index = static_cast(linear_seq_.size()); // before scope. linear_seq_.push_back(e); IRVisitor::Visit_(op); // after scope. e.touched = std::move(scope_.back().touched); scope_.pop_back(); + int64_t end_index = static_cast(linear_seq_.size()); + CHECK_GT(end_index, begin_index); + e.scope_pair_offset = begin_index - end_index; linear_seq_.push_back(e); + // record the pointer to end index. + CHECK_NE(end_index, 0U); + linear_seq_[begin_index].scope_pair_offset = end_index - begin_index; } void Visit_(const AttrStmt* op) final { // Only record the outer most thread extent. @@ -138,9 +152,11 @@ class LinearAccessPatternFinder final : public IRVisitor { in_thread_env_ = true; VisitNewScope(op); in_thread_env_ = false; + } else if (op->attr_key == attr::extern_scope) { + VisitNewScope(op); } else if (op->attr_key == attr::storage_scope) { const Variable* buf = op->node.as(); - storage_scope_[buf] = + alloc_info_[buf].storage_scope = StorageScope::make(op->value.as()->value); IRVisitor::Visit_(op); } else { @@ -155,36 +171,156 @@ class LinearAccessPatternFinder final : public IRVisitor { VisitNewScope(op); } + // linearized access sequence. + std::vector linear_seq_; + // The storage scope of each buffer + std::unordered_map alloc_info_; + private: - // Get storage scope of buffer. - StorageScope GetScope(const Variable* buf) const { - auto it = storage_scope_.find(buf); - CHECK(it != storage_scope_.end()); - return it->second; - } // Whether already in thread env. bool in_thread_env_{false}; - // linearized access sequence. - std::vector linear_seq_; // The scope stack. std::vector scope_; - // The storage scope of each buffer - std::unordered_map storage_scope_; - // buffer -> allocated scope level in the IR. - std::unordered_map alloc_scope_level_; +}; + +// Verify if the statement can be run safely via inplace fashion +// +// Detect pattern: dst[index] = f(src[index]) +// +// WARNING: the current detection algorithm cannot handle the case +// when a location in an array is written multiple times +// +// For example, the following program will pass the check, +// but we cannot make A and B to be the same array. +// +// A[0] = B[0] + 1 +// A[0] = B[0] + 1 +// +// The high level code generator needs to ensure that the generated +// code only write each location of the target array once. +// +// This is the case with IR generated by the current compute schedule. +// We explicitly return false if we find there is an extern block +// which can be arbitrary IR. +// +// Neve-the-less, inplace detector should be used with care in mind. +// We may also consider introduce a condition checker that checks +// if every index only visited once for an absolute sufficient condition. +// +// The code after inplace transformation is no longer idempotent. 
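To make the accepted pattern concrete: a minimal sketch, using the tvm.compute API of the unit tests added later in this patch (buffer names are illustrative), of a compute chain where inplace folding is legal because each stage writes every location once and only reads its producer at the same index.

import tvm

m = 10
A = tvm.placeholder((m,), name='A')
# Each stage is dst[i] = f(src[i]): every location is written once, and the
# only read of the producer uses the same index as the write.
A0 = tvm.compute((m,), lambda i: A[i] * 2.0, name='A0')
A1 = tvm.compute((m,), lambda i: A0[i] + 1.0, name='A1')
B = tvm.compute((m,), lambda i: A1[i] * 3.0, name='B')
# After StorageRewrite, A0 and A1 can share one allocation. If A1 also read
# A0 at a different index (say A0[0]) or read its own output, the verifier
# above would reject the fold and separate buffers would be kept.

The test_inplace_rule and test_storage_share unit tests added further down in this patch exercise exactly this distinction.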
+// +class InplaceOpVerifier : public IRVisitor { + public: + bool Check(const Node* stmt, + const Variable* dst, + const Variable* src) { + dst_ = dst; + src_ = src; + result_ = true; + if (stmt->is_type()) { + Visit_(static_cast(stmt)); + } else if (stmt->is_type()) { + Visit_(static_cast(stmt)); + } else if (stmt->is_type()) { + Visit_(static_cast(stmt)); + } else if (stmt->is_type()) { + Visit_(static_cast(stmt)); + } else { + return false; + } + return result_; + } + + using IRVisitor::Visit_; + + void Visit(const NodeRef& e) final { + if (!result_) return; + IRVisitor::Visit(e); + } + + void Visit_(const Variable* op) final { + // assume all opaque access is unsafe + if (op == dst_ || op == src_) { + result_ = false; return; + } + } + + void Visit_(const Store* op) final { + ++mem_nest_; + this->Visit(op->index); + --mem_nest_; + if (op->buffer_var.get() == dst_) { + store_ = op; + this->Visit(op->value); + this->Visit(op->predicate); + store_ = nullptr; + } else { + this->Visit(op->value); + this->Visit(op->predicate); + } + } + + void Visit_(const AttrStmt* op) final { + // always reject extern code + if (op->attr_key == attr::extern_scope || + op->attr_key == attr::volatile_scope) { + result_ = false; return; + } + IRVisitor::Visit_(op); + } + + void Visit_(const Load* op) final { + const Variable* buf = op->buffer_var.get(); + // cannot read from dst_ (no reduction) + if (buf == dst_) { + result_ = false; return; + } + // do not allow indirect memory load + if (mem_nest_ != 0) { + result_ = false; return; + } + if (src_ == buf) { + if (store_ == nullptr || + store_->value.type() != op->type || + !ir::Equal(store_->index, op->index)) { + result_ = false; return; + } + } + ++mem_nest_; + IRVisitor::Visit_(op); + --mem_nest_; + } + + + private: + // result of the check + bool result_{true}; + // destination memory + const Variable* dst_; + // source variable + const Variable* src_; + // counter of load, + // it is not safe to inplace when there is nested load like A[B[i]] + int mem_nest_{0}; + // The current store to be inspected + const Store* store_{nullptr}; }; // Planner to plan and rewrite memory allocation. class StoragePlanRewriter : public IRMutator { public: using StmtEntry = LinearAccessPatternFinder::StmtEntry; + using AllocEntry = LinearAccessPatternFinder::AllocEntry; - Stmt Rewrite(Stmt stmt) { - std::vector seq = - LinearAccessPatternFinder().GetLinearSeq(stmt); - this->FindFreeLocation(seq); - this->PlanMemory(seq); + Stmt Rewrite(Stmt stmt, bool detect_inplace) { + detect_inplace_ = detect_inplace; + // plan the rewrite + LinearAccessPatternFinder finder; + finder.Visit(stmt); + this->LivenessAnalysis(finder.linear_seq_); + this->PlanMemory(finder.linear_seq_, finder.alloc_info_); this->PrepareNewAlloc(); + // start rewrite stmt = this->Mutate(stmt); if (attach_map_.count(nullptr)) { std::vector nest; @@ -308,7 +444,6 @@ class StoragePlanRewriter : public IRMutator { } private: - // Alllocate entry of node. struct StorageEntry { // The scope that this alloc attaches after // For shared/local memory it is beginning of the thread extent. @@ -332,6 +467,16 @@ class StoragePlanRewriter : public IRMutator { // the address becomes alloc_var + sizeof(elem_type) * elem_offset; uint64_t elem_offset{0}; }; + + // Alllocate entry of node. 
+ // Event entry in liveness analysis + struct EventEntry { + // variables we generate + std::vector gen; + // variables we kill + std::vector kill; + }; + Stmt MakeAttach(const std::vector& svec, Stmt body) { std::vector nest; @@ -461,16 +606,29 @@ class StoragePlanRewriter : public IRMutator { << "Allocation exceed bound of memory tag " << e->scope.to_string(); } } - // Find the free location of each varaible. - // Just do a reverse linear scan. - void FindFreeLocation(const std::vector& seq) { + // Liveness analysis to find gen and kill point of each variable. + void LivenessAnalysis(const std::vector& seq) { + // find kill point, do a reverse linear scan. std::unordered_set touched; for (size_t i = seq.size(); i != 0; --i) { const StmtEntry& s = seq[i - 1]; for (const Variable* buffer : s.touched) { if (!touched.count(buffer)) { touched.insert(buffer); - free_loc_[i - 1].push_back(buffer); + event_map_[s.stmt].kill.push_back(buffer); + } + } + } + // find gen point, do forward scan + touched.clear(); + for (size_t i = 0; i < seq.size(); ++i) { + int64_t offset = seq[i].scope_pair_offset; + if (offset < 0) continue; + const StmtEntry& s = seq[i + offset]; + for (const Variable* buffer : s.touched) { + if (!touched.count(buffer)) { + touched.insert(buffer); + event_map_[s.stmt].gen.push_back(buffer); } } } @@ -500,14 +658,66 @@ class StoragePlanRewriter : public IRMutator { } // Memory plan algorithm - void PlanMemory(const std::vector& seq) { + void PlanMemory(const std::vector& seq, + const std::unordered_map& alloc_info) { + std::unordered_set inplace_flag; + for (size_t i = 0; i < seq.size(); ++i) { const StmtEntry& s = seq[i]; + auto it = event_map_.find(seq[i].stmt); + + // scope_pair_offset >= 0 means it is either + // - leaf stmt(offset = 0) + // - beginning of scope(offset < 0) + // In both cases, we need to handle the gen event correctly + if (it != event_map_.end() && seq[i].scope_pair_offset >= 0) { + // Inplace operation detection + // specially handle this + bool detect_inplace = detect_inplace_ && (it->second.gen.size() <= 2); + + for (const Variable* var : it->second.gen) { + CHECK(alloc_info.count(var)); + const AllocEntry& ae = alloc_info.at(var); + StorageEntry* dst_entry = nullptr; + // inplace detection + if (detect_inplace) { + for (const Variable* src : it->second.kill) { + if (!inplace_flag.count(src) && alloc_map_.count(src)) { + InplaceOpVerifier visitor; + StorageEntry* src_entry = alloc_map_.at(src); + if (src_entry->scope == ae.storage_scope && + src_entry->attach_scope_ == thread_scope_ && + src_entry->elem_type == ae.alloc->type.element_of() && + visitor.Check(s.stmt, var, src)) { + uint64_t const_nbits = static_cast( + ae.alloc->constant_allocation_size() * + ae.alloc->type.bits() * + ae.alloc->type.lanes()); + if (src_entry->const_nbits == const_nbits) { + // successfully inplace + dst_entry = src_entry; + inplace_flag.insert(src); + } + } + } + } + } + if (dst_entry == nullptr) { + dst_entry = FindAlloc(ae.alloc, thread_scope_, ae.storage_scope); + } + dst_entry->allocs.emplace_back(ae.alloc); + alloc_map_[var] = dst_entry; + } + } + // enter/exit new scope if (s.stmt->is_type()) { const auto* op = static_cast(s.stmt); - CHECK(op->attr_key == attr::thread_extent || - op->attr_key == attr::pragma_scope); - PlanNewScope(op); + if (op->attr_key == attr::thread_extent || + op->attr_key == attr::pragma_scope) { + PlanNewScope(op); + } else { + CHECK(op->attr_key == attr::extern_scope); + } } else if (s.stmt->is_type()) { const auto* op = static_cast(s.stmt); 
if (op->for_type == ForType::Parallel) { @@ -515,16 +725,17 @@ class StoragePlanRewriter : public IRMutator { PlanNewScope(op); } } - } else if (s.stmt->is_type()) { - const auto* op = static_cast(s.stmt); - StorageEntry* e = this->FindAlloc(op, thread_scope_, s.alloc_scope); - e->allocs.emplace_back(op); - alloc_map_[op->buffer_var.get()] = e; } - // free list - if (free_loc_.count(i)) { - for (const Variable* var : free_loc_.at(i)) { - this->Free(var); + // scope_pair_offset <= 0 means it is either + // - leaf stmt(offset = 0) + // - end of scope(offset < 0) + // In both cases, we need to handle the kill event correctly + if (it != event_map_.end() && seq[i].scope_pair_offset <= 0) { + for (const Variable* var : it->second.kill) { + // skip space which are already replaced by inplace + if (!inplace_flag.count(var)) { + this->Free(var); + } } } } @@ -534,6 +745,7 @@ class StoragePlanRewriter : public IRMutator { const Node* attach_scope, const StorageScope& scope, size_t const_nbits) { + CHECK(op != nullptr); // Re-use not successful, allocate a new buffer. std::unique_ptr entry(new StorageEntry()); entry->attach_scope_ = attach_scope; @@ -544,9 +756,11 @@ class StoragePlanRewriter : public IRMutator { alloc_vec_.emplace_back(std::move(entry)); return e; } + StorageEntry* FindAlloc(const Allocate* op, const Node* attach_scope, const StorageScope& scope) { + CHECK(op != nullptr); // skip plan for local variable, // compiler can do a better job with register allocation. const uint64_t match_range = 16; @@ -603,6 +817,7 @@ class StoragePlanRewriter : public IRMutator { auto it = alloc_map_.find(var); CHECK(it != alloc_map_.end()); StorageEntry* e = it->second; + CHECK_NE(e->allocs.size(), 0U); // Disable sharing of local memory. if (e->scope.rank > 1 || e->allocs[0]->type.is_handle()) return; // disable reuse of small arrays @@ -616,17 +831,18 @@ class StoragePlanRewriter : public IRMutator { } // thread scope. const Node* thread_scope_{nullptr}; + // whether enable inplace detection. + bool detect_inplace_{false}; // Locations of free ops. - std::unordered_map > free_loc_; - // The allocation attach map - std::unordered_map > attach_map_; - // The allocation assign map - std::unordered_map alloc_map_; + std::unordered_map event_map_; // constant size free map. std::multimap const_free_map_; // symbolic free list, for non constant items. 
std::list sym_free_list_; + // The allocation attach map + std::unordered_map > attach_map_; + // The allocation assign map + std::unordered_map alloc_map_; // The allocations std::vector > alloc_vec_; }; @@ -693,7 +909,7 @@ class VectorAllocRewriter : public IRMutator { Stmt StorageRewrite(Stmt stmt) { - stmt = StoragePlanRewriter().Rewrite(stmt); + stmt = StoragePlanRewriter().Rewrite(stmt, true); return VectorAllocRewriter().Mutate(stmt); } } // namespace ir diff --git a/tests/python/unittest/test_codegen_extern.py b/tests/python/unittest/test_codegen_extern.py index 43736bc46768..1295ed26cce7 100644 --- a/tests/python/unittest/test_codegen_extern.py +++ b/tests/python/unittest/test_codegen_extern.py @@ -15,6 +15,7 @@ def extern_generator(ins, outs): C = tvm.extern(A.shape, [A], extern_generator, name='C') s = tvm.create_schedule(C.op) + print(tvm.lower(s, [A, C], simple_mode=True)) def check_llvm(): if not tvm.module.enabled("llvm"): diff --git a/tests/python/unittest/test_pass_storage_rewrite.py b/tests/python/unittest/test_pass_storage_rewrite.py index 4d2110319d2c..d3f6307f821f 100644 --- a/tests/python/unittest/test_pass_storage_rewrite.py +++ b/tests/python/unittest/test_pass_storage_rewrite.py @@ -19,14 +19,39 @@ def test_storage_share(): stmt = tvm.ir_pass.CanonicalSimplify(stmt) stmt = tvm.ir_pass.Simplify(stmt) stmt = tvm.ir_pass.StorageRewrite(stmt) - # verify only have two allocations. - # verify that the data is folded. + # verify only have one allocations. + # verify inplace folding works + num_alloc = [0] + def verify(n): + if isinstance(n, tvm.stmt.Allocate): + num_alloc[0] += 1 + tvm.ir_pass.PostOrderVisit(stmt, verify) + assert num_alloc[0] == 1 + + +def test_inplace_rule(): + m = 10 + A = tvm.placeholder((m,), name='A') + A0 = tvm.compute((m,), lambda i: A[i], name='A0') + A1 = tvm.compute((m,), lambda i: A[i] + 1, name='A1') + AA = tvm.compute((m,), lambda i: A0[i] + A1[i] + A1[0], name='AA') + B = tvm.compute((m,), lambda i: AA[i] + 1, name='B') + s = tvm.create_schedule(B.op) + bounds = tvm.schedule.InferBound(s) + assert isinstance(bounds, tvm.container.Map) + stmt = tvm.schedule.ScheduleOps(s, bounds) + Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') + Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') + stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64) + stmt = tvm.ir_pass.CanonicalSimplify(stmt) + stmt = tvm.ir_pass.Simplify(stmt) + stmt = tvm.ir_pass.StorageRewrite(stmt) + # verify only have one allocations. 
+ # verify inplace folding works num_alloc = [0] def verify(n): if isinstance(n, tvm.stmt.Allocate): num_alloc[0] += 1 - elif isinstance(n, tvm.stmt.Store): - assert n.buffer_var != n.value.a.buffer_var tvm.ir_pass.PostOrderVisit(stmt, verify) assert num_alloc[0] == 2 @@ -38,7 +63,7 @@ def test_storage_combine(): B = A stages = [] for t in range(num_stage): - B = tvm.compute((n, ), lambda i: B[i] + (t+1), name='A%d' % t) + B = tvm.compute((n, ), lambda i: B[i] + B[0] + (t+1), name='A%d' % t) stages.append(B) s = tvm.create_schedule(B.op) @@ -121,12 +146,14 @@ def test_parallel_alloc(): A[j] = A[j] + 2 body = ib.get() body = tvm.ir_pass.StorageRewrite(body) + assert(isinstance(body.body.body.body.body, tvm.stmt.Allocate)) if __name__ == "__main__": + test_inplace_rule() + test_storage_share() test_parallel_alloc() test_storage_combine() test_storage_share_gpu() - test_storage_share() From afbb70ba44203061df171dd1126f5b8ed4704a29 Mon Sep 17 00:00:00 2001 From: yuruofeifei Date: Mon, 8 Jan 2018 15:48:26 -0800 Subject: [PATCH 076/948] [TUTORIAL] Improve opt_gemm tutorial (#757) * Improve opt_gemm tutorial * Addressed comments --- tutorials/optimize/opt_gemm.py | 194 ++++++++++++++++++++++++--------- 1 file changed, 145 insertions(+), 49 deletions(-) diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py index cc9c234a7f87..3f24767ab5ed 100644 --- a/tutorials/optimize/opt_gemm.py +++ b/tutorials/optimize/opt_gemm.py @@ -9,8 +9,8 @@ trying various seemingly promising schedules is time-consuming. With the help of TVM, we can try these schedules efficiently to enhance the performance. -In this tutorial, we will demonstrate how square matrix multiplication is optimized step by step by -writing TVM. +In this tutorial, we will demonstrate how to use TVM to optimize square matrix multiplication +and achieve 100 times faster than baseline by simply adding 6 extra lines of code. There are two important optmizations on intense computation applications executed on CPU: 1. Increase the cache hit rate of memory access. Both complex numerical computation and hot-spot @@ -25,8 +25,8 @@ `repo `_. Some of them have been applied by TVM abstraction automatically, but some of them cannot be simply applied due to TVM constraints. -All the experiment results mentioned below, are executed on 2013's 15' MacBook equiped with -Intel i7-2760QM CPU. The cache line size should be 64 bytes for all the x86 CPU. +All the experiment results mentioned below, are executed on 2015's 15' MacBook equiped with +Intel i7-4770QH CPU. The cache line size should be 64 bytes for all the x86 CPU. 
""" ############################################################################### @@ -40,7 +40,7 @@ import tvm import numpy -import time +import timeit # The size of the square matrix N = 1024 @@ -49,7 +49,17 @@ # Random generated tensor for testing a = tvm.nd.array(numpy.random.rand(N, N).astype(dtype), tvm.cpu(0)) b = tvm.nd.array(numpy.random.rand(N, N).astype(dtype), tvm.cpu(0)) -# The expected answer + +np_repeat = 100 +np_runing_time = timeit.timeit(setup='import numpy\n' + 'N = 1024\n' + 'dtype = "float32"\n' + 'a = numpy.random.rand(N, N).astype(dtype)\n' + 'b = numpy.random.rand(N, N).astype(dtype)\n', + stmt='answer = numpy.dot(a, b)', + number=np_repeat) +print("Numpy running time: %f" % (np_runing_time / np_repeat)) + answer = numpy.dot(a.asnumpy(), b.asnumpy()) # Algorithm @@ -65,49 +75,50 @@ s = tvm.create_schedule(C.op) func = tvm.build(s, [A, B, C], name = 'mmult') assert func -evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number = 1) + c = tvm.nd.array(numpy.zeros((N, N), dtype = dtype), tvm.cpu(0)) +func(a, b, c) +numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) + +evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number=1) print('Baseline: %f' % evaluator(a, b, c).mean) +################################################################################################ +# In TVM, we can always inspect lower level IR to debug or optimize our schedule. +# Here is the generated IR using our baseline schedule. + +print(tvm.lower(s, [A, B, C], simple_mode=True)) + ################################################################################################ # Blocking # -------- # A important trick to enhance the cache hit rate is blocking --- data chunck will be computed # block by block. The memory access inside the block is a small neighbourhood which is with high -# meomry locality. In this tutorial, I pick up 8, a relatively small value (8 ints < 64 bytes), -# as the blocking size. -# +# memory locality. In this tutorial, I picked up 32 as the blocking factor. So the block will +# fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB (L1 data cache) -bn = 8 +bn = 32 +s = tvm.create_schedule(C.op) # Blocking by loop tiling -yo, xo, yi, xi = s[C].tile(C.op.axis[1], C.op.axis[0], bn, bn) +xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) # Hoist reduction domain outside the blocking loop -s[C].reorder(yo, xo, k, yi, xi) +s[C].reorder(xo, yo, k, xi, yi) func = tvm.build(s, [A, B, C], name = 'mmult') assert func -# By simply tiling the loop 8x8, and hoisting k outside the blocking loops, we can get nearly 4x -# speedup compared with the baseline. -evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number = 5) + c = tvm.nd.array(numpy.zeros((N, N), dtype = dtype), tvm.cpu(0)) +func(a, b, c) +numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) + +# By simply tiling the loop 32x32, and hoisting k outside the blocking loops, we can see big +# speedup compared with the baseline. +evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number=5) print('Opt1: %f' % evaluator(a, b, c).mean) -################################################################################################### -# Vectorization -# ------------- -# Another important trick is vectorization. When the memory access pattern is uniform, the compiler -# can detect this pattern and pass the continuous memory to vector processor. 
In TVM, we can use -# `vectorize` interface to hint the compiler this pattern, so that we can accelerate it vastly. -# +################################################################################################ +# Here is the generated IR after blocking. -# After trying different schedule, we finally found that we can benefit from vectorizing -# the row loop most, i.e. yi. -s[C].vectorize(yi) -func = tvm.build(s, [A, B, C], name = 'mmult') -assert func -# We can get almost another 4x speedup compared with the previous schedule. -evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number = 5) -c = tvm.nd.array(numpy.zeros((N, N), dtype = dtype), tvm.cpu(0)) -print('Opt2: %f' % evaluator(a, b, c).mean) +print(tvm.lower(s, [A, B, C], simple_mode=True)) ################################################################################################### # Array Packing @@ -125,8 +136,8 @@ ################################################################################################### # Just as it is shown in the figure above, after blocking the computations, we can observe the array # access pattern of B (after flattening), which is regular but discontinuous. We expect that after -# some transformation we can get continuous access pattern. We can reorder a [16][16] array to -# a [16/4][16][4] array, so that the access pattern of B will be sequential when grabing +# some transformation we can get continuous access pattern. We can reorder a [16][16] array to +# a [16/4][16][4] array, so that the access pattern of B will be sequential when grabing # the corresponding value from the packed array. # @@ -136,30 +147,115 @@ lambda x, y: tvm.sum(A[x, k] * packedB[y / bn, k, y % bn], axis = k), name = 'C') -# Same schedule s = tvm.create_schedule(C.op) -yo, xo, yi, xi = s[C].tile(C.op.axis[1], C.op.axis[0], bn, bn) -s[C].reorder(yo, xo, k, yi, xi) -s[C].vectorize(yi) +xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) +s[C].reorder(xo, yo, k, xi, yi) func = tvm.build(s, [A, B, C], name = 'mmult') assert func -# We can accelerate it almost 3x compared with the previous schedule. -evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number = 5) + c = tvm.nd.array(numpy.zeros((N, N), dtype = dtype), tvm.cpu(0)) +func(a, b, c) +numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) + +evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number=5) +print('Opt2: %f' % evaluator(a, b, c).mean) + +################################################################################################ +# Here is the generated IR after array packing. + +print(tvm.lower(s, [A, B, C], simple_mode=True)) + +################################################################################################### +# Vectorization +# ------------- +# Another important trick is vectorization. When the memory access pattern is uniform, +# the compiler can detect this pattern and pass the continuous memory to vector processor. In TVM, +# we can use `vectorize` interface to hint the compiler this pattern, so that we can accelerate it vastly. +# +# In this tutorial, we chose to vectorize the inner loop row data since it is cache friendly. 
+ +s = tvm.create_schedule(C.op) +xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) +s[C].reorder(xo, yo, k, xi, yi) + +# Vectorization +s[C].vectorize(yi) +func = tvm.build(s, [A, B, C], name = 'mmult') +assert func + +c = tvm.nd.array(numpy.zeros((N, N), dtype = dtype), tvm.cpu(0)) +func(a, b, c) +numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) + +evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number=5) print('Opt3: %f' % evaluator(a, b, c).mean) +################################################################################################ +# Here is the generated IR after vectorization. + +print(tvm.lower(s, [A, B, C], simple_mode=True)) + +################################################################################################### +# Loop Permutation +# ------------- +# If we look at the above IR, we can see the inner loop row data is vectorized and +# B is transformed into PackedB. The traversal of PackedB is sequential now. +# So we will look at the access pattern of A. In current schedule, A is accessed column by column +# which is not cache friendly. If we change the nested loop order of k and inner row index xi, +# the access pattern for A matrix is more cache friendly. + +s = tvm.create_schedule(C.op) +xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) +s[C].reorder(xo, yo, xi, k, yi) + +# Vectorization +s[C].vectorize(yi) + +func = tvm.build(s, [A, B, C], name = 'mmult') +assert func + +c = tvm.nd.array(numpy.zeros((N, N), dtype = dtype), tvm.cpu(0)) +func(a, b, c) +numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) + +evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number=5) +print('Opt4: %f' % evaluator(a, b, c).mean) + +################################################################################################ +# Here is the generated IR after loop permutation. + +print(tvm.lower(s, [A, B, C], simple_mode=True)) + +################################################################################################### +# Parallel +# ------------- +# Futhermore, we can also utilize multi-core processors to parallelize computation. + +s = tvm.create_schedule(C.op) +xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) +s[C].reorder(xo, yo, xi, k, yi) +s[C].vectorize(yi) + +# parallel +s[C].parallel(xo) + +func = tvm.build(s, [A, B, C], name = 'mmult') +assert func + +c = tvm.nd.array(numpy.zeros((N, N), dtype = dtype), tvm.cpu(0)) +func(a, b, c) +numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) + +evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number=50) +opt5_time = evaluator(a, b, c).mean +print('Opt5: %f' % opt5_time) + ################################################################################################## # Summary # ------- -# After applying three main tricks, we can achieve almost 90% performance of numpy. -# Further observation is required to catch up with the performance of numpy. +# After applying the above simple optimizations with only 6 lines of code, +# our generated code can achieve 30% of numpy performance with Apple implemented BLAS. # - -# TODO(Jian Weng): Catch up with the performance of numpy. -_a = a.asnumpy() -_b = b.asnumpy() -now = time.clock() -answer = numpy.dot(_a, _b) -print("Numpy: %f" % (time.clock() - now)) +# We can see TVM is very powerful tool to optimize low level computation. 
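For reference, a recap sketch of the final schedule this tutorial assembles, using A, B, C (the packedB-based definition from the Array Packing section), packedB, bn, k, N, dtype, a, b and answer as defined in the sections above; timings remain machine dependent.

s = tvm.create_schedule(C.op)
xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
s[C].reorder(xo, yo, xi, k, yi)   # loop permutation: k inside xi, cache friendly for A
s[C].vectorize(yi)                # vectorize the innermost row of the block
s[C].parallel(xo)                 # thread-level parallelism over the outer blocks

func = tvm.build(s, [A, B, C], name='mmult')
c = tvm.nd.array(numpy.zeros((N, N), dtype=dtype), tvm.cpu(0))
func(a, b, c)
numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)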
From 43b68ddf95d5b1807e972ef990ab1e99a3c3fdbe Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 8 Jan 2018 19:26:08 -0800 Subject: [PATCH 077/948] [PASS] Improve loop partition to remove un-necessary warning. (#766) * [PASS] Improve loop partition to remove un-necessary warning. * fix comment --- src/pass/loop_partition.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/pass/loop_partition.cc b/src/pass/loop_partition.cc index ff6103cd6e02..0de8a88edb00 100644 --- a/src/pass/loop_partition.cc +++ b/src/pass/loop_partition.cc @@ -300,8 +300,13 @@ class LoopPartitioner : public IRMutator { std::unordered_map relax_map_; }; -Stmt LoopPartitioner::TryPartition(const Node* node, const Stmt& stmt, - VarExpr var, Expr min, Expr max, Stmt body, bool partition_thread_scope) { +Stmt LoopPartitioner::TryPartition(const Node* node, + const Stmt& stmt, + VarExpr var, + Expr min, + Expr max, + Stmt body, + bool partition_thread_scope) { PartitionFinder finder(var, hint_map_, relax_map_); finder.Visit(body); const auto& partitions = finder.partitions; @@ -340,7 +345,8 @@ Stmt LoopPartitioner::TryPartition(const Node* node, const Stmt& stmt, if (true_itrv.as()->i.has_upper_bound()) { post_doubt_begin = true_itrv.max() + 1; if (!can_prove(true_itrv.max() == max)) { - Expr cond = (max - post_doubt_begin >= 0); + // require the extent to be non-negative + Expr cond = (max - post_doubt_begin + 1 >= 0); if (!can_prove(cond)) { LOG(WARNING) << "Cannot prove: " << cond << ", when generating the post doubt loop"; From a9c48f60a55eaea6edbea406b3ddfa2ecd49602b Mon Sep 17 00:00:00 2001 From: Yida Wang Date: Tue, 9 Jan 2018 15:39:35 -0800 Subject: [PATCH 078/948] small fixes on docs (#769) * small fixs on docs * add IR output after parallelization --- docs/how_to/contribute.md | 1 - tutorials/optimize/opt_gemm.py | 19 +++++++++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/docs/how_to/contribute.md b/docs/how_to/contribute.md index a0ba99bdf718..bd31a62e52a9 100644 --- a/docs/how_to/contribute.md +++ b/docs/how_to/contribute.md @@ -14,7 +14,6 @@ Everyone is more than welcome to contribute. It is a way to make the project bet - [What is the consequence of force push](#what-is-the-consequence-of-force-push) * [Document](#document) * [Testcases](#testcases) -* [Examples](#examples) * [Core Library](#core-library) * [Python Package](#python-package) diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py index 3f24767ab5ed..9a4264c9d05a 100644 --- a/tutorials/optimize/opt_gemm.py +++ b/tutorials/optimize/opt_gemm.py @@ -26,7 +26,7 @@ abstraction automatically, but some of them cannot be simply applied due to TVM constraints. All the experiment results mentioned below, are executed on 2015's 15' MacBook equiped with -Intel i7-4770QH CPU. The cache line size should be 64 bytes for all the x86 CPU. +Intel i7-4770QH CPU. The cache line size should be 64 bytes for all the x86 CPUs. """ ############################################################################### @@ -230,7 +230,7 @@ ################################################################################################### # Parallel # ------------- -# Futhermore, we can also utilize multi-core processors to parallelize computation. +# Futhermore, we can also utilize multi-core processors to do the thread-level parallelization. 
s = tvm.create_schedule(C.op) xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) @@ -251,11 +251,18 @@ opt5_time = evaluator(a, b, c).mean print('Opt5: %f' % opt5_time) +################################################################################################ +# Here is the generated IR after parallelization. + +print(tvm.lower(s, [A, B, C], simple_mode=True)) + +################################################################################################### + ################################################################################################## # Summary # ------- # After applying the above simple optimizations with only 6 lines of code, -# our generated code can achieve 30% of numpy performance with Apple implemented BLAS. -# -# We can see TVM is very powerful tool to optimize low level computation. - +# our generated code can achieve 30% of the `numpy` performance with Apple implemented BLAS. +# Note that the outputs on the webpage reflect the running times on a non-exclusive +# Docker container, thereby they are *unreliable*. It is highly encouraged to run the +# tutorial by yourself to observe the performance gain acheived by TVM. From bfd277188b8e2cb6cda57135e095b6d1d5410101 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 9 Jan 2018 17:39:16 -0800 Subject: [PATCH 079/948] [PASS] Fix storage rewrite merge rule for special tag memory (#770) --- src/pass/storage_rewrite.cc | 28 +++++++++++-------- .../unittest/test_pass_storage_rewrite.py | 23 +++++++++++++++ 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc index 5e7abdda2112..7215c3f97a43 100644 --- a/src/pass/storage_rewrite.cc +++ b/src/pass/storage_rewrite.cc @@ -766,14 +766,15 @@ class StoragePlanRewriter : public IRMutator { const uint64_t match_range = 16; uint64_t const_nbits = static_cast( op->constant_allocation_size() * op->type.bits() * op->type.lanes()); - if (scope.rank > 1 || op->type.is_handle()) { - return NewAlloc(op, attach_scope, scope, const_nbits); - } // disable reuse of small arrays, they will be lowered to registers in LLVM - if (const_nbits > 0 && - const_nbits <= 32 && - scope.tag.length() == 0) { - return NewAlloc(op, attach_scope, scope, const_nbits); + // This rules only apply if we are using non special memory + if (scope.tag.length() == 0) { + if (scope.rank > 1 || op->type.is_handle()) { + return NewAlloc(op, attach_scope, scope, const_nbits); + } + if (const_nbits > 0 && const_nbits <= 32) { + return NewAlloc(op, attach_scope, scope, const_nbits); + } } if (const_nbits != 0) { // constant allocation. @@ -818,10 +819,15 @@ class StoragePlanRewriter : public IRMutator { CHECK(it != alloc_map_.end()); StorageEntry* e = it->second; CHECK_NE(e->allocs.size(), 0U); - // Disable sharing of local memory. - if (e->scope.rank > 1 || e->allocs[0]->type.is_handle()) return; - // disable reuse of small arrays - if (e->const_nbits > 0 && e->const_nbits <= 32) return; + + // disable reuse of small arrays, they will be lowered to registers in LLVM + // This rules only apply if we are using non special memory + if (e->scope.tag.length() == 0) { + // Disable sharing of local memory. + if (e->scope.rank > 1 || e->allocs[0]->type.is_handle()) return; + // disable reuse of small arrays + if (e->const_nbits > 0 && e->const_nbits <= 32) return; + } // normal free. 
if (e->const_nbits != 0) { const_free_map_.insert({e->const_nbits, e}); diff --git a/tests/python/unittest/test_pass_storage_rewrite.py b/tests/python/unittest/test_pass_storage_rewrite.py index d3f6307f821f..1e4dda684eb3 100644 --- a/tests/python/unittest/test_pass_storage_rewrite.py +++ b/tests/python/unittest/test_pass_storage_rewrite.py @@ -28,6 +28,28 @@ def verify(n): tvm.ir_pass.PostOrderVisit(stmt, verify) assert num_alloc[0] == 1 +def test_alloc_seq(): + ib = tvm.ir_builder.create() + n = tvm.var("n") + with ib.for_range(0, n, name="i") as i: + with ib.for_range(0, 10, name="j") as j: + A = ib.allocate("float32", 200, name="A", scope="local.L0A") + A[j] = 1.2 + with ib.for_range(0, 10, name="j") as j: + A = ib.allocate("float32", 200, name="B", scope="local.L0A") + A[j] = 1.3 + + body = ib.get() + body = tvm.ir_pass.StorageRewrite(body) + num_alloc = [0] + def verify(n): + if isinstance(n, tvm.stmt.Allocate): + num_alloc[0] += 1 + assert n.extents[0].value == 200 + tvm.ir_pass.PostOrderVisit(body, verify) + assert num_alloc[0] == 1 + + def test_inplace_rule(): m = 10 @@ -152,6 +174,7 @@ def test_parallel_alloc(): if __name__ == "__main__": + test_alloc_seq() test_inplace_rule() test_storage_share() test_parallel_alloc() From 0a1156f121dbe93e3e6aa9f5db06e59d73df542c Mon Sep 17 00:00:00 2001 From: Yuwei Hu Date: Fri, 12 Jan 2018 01:05:19 +0800 Subject: [PATCH 080/948] [INTRIN] enable popcount on cuda, opencl, metal (#774) --- src/codegen/intrin_rule.h | 12 ++---- src/codegen/intrin_rule_cuda.cc | 16 ++++++++ src/codegen/intrin_rule_metal.cc | 13 ++++--- src/codegen/intrin_rule_opencl.cc | 13 ++++--- tests/python/integration/test_ewise.py | 51 +++++++++++++++++--------- 5 files changed, 69 insertions(+), 36 deletions(-) diff --git a/src/codegen/intrin_rule.h b/src/codegen/intrin_rule.h index e66b55dcc861..c900c9088880 100644 --- a/src/codegen/intrin_rule.h +++ b/src/codegen/intrin_rule.h @@ -30,18 +30,14 @@ struct FloatSuffix { } }; -// Add float suffix to the intrinsics -struct FloatDirect { +// Return the intrinsic name +struct Direct { std::string operator()(Type t, std::string name) const { - if (t.is_float()) { - return name; - } else { - return ""; - } + return name; } }; -// Directly call pure extern function for floats. +// Call pure extern function. 
template inline void DispatchExtern(const TVMArgs& args, TVMRetValue* rv) { Expr e = args[0]; diff --git a/src/codegen/intrin_rule_cuda.cc b/src/codegen/intrin_rule_cuda.cc index a2441d597d86..9abb99d7c7c5 100644 --- a/src/codegen/intrin_rule_cuda.cc +++ b/src/codegen/intrin_rule_cuda.cc @@ -36,6 +36,19 @@ struct CUDAFastMath : public CUDAMath { } }; +struct CUDAPopcount { + std::string operator()(Type t, std::string name) const { + if (t.lanes() == 1 && t.is_uint()) { + switch (t.bits()) { + case 32: return "__popc"; + case 64: return "__popcll"; + default: return ""; + } + } + return ""; + } +}; + TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp") .set_body(DispatchExtern); @@ -51,6 +64,9 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.sqrt") TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.pow") .set_body(DispatchExtern); +TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.popcount") +.set_body(DispatchExtern); + } // namespace intrin } // namespace codegen } // namespace tvm diff --git a/src/codegen/intrin_rule_metal.cc b/src/codegen/intrin_rule_metal.cc index fbadf3a19bdf..b0e41770ebff 100644 --- a/src/codegen/intrin_rule_metal.cc +++ b/src/codegen/intrin_rule_metal.cc @@ -10,19 +10,22 @@ namespace codegen { namespace intrin { TVM_REGISTER_GLOBAL("tvm.intrin.rule.metal.exp") -.set_body(DispatchExtern); +.set_body(DispatchExtern); TVM_REGISTER_GLOBAL("tvm.intrin.rule.metal.log") -.set_body(DispatchExtern); +.set_body(DispatchExtern); TVM_REGISTER_GLOBAL("tvm.intrin.rule.metal.tanh") -.set_body(DispatchExtern); +.set_body(DispatchExtern); TVM_REGISTER_GLOBAL("tvm.intrin.rule.metal.sqrt") -.set_body(DispatchExtern); +.set_body(DispatchExtern); TVM_REGISTER_GLOBAL("tvm.intrin.rule.metal.pow") -.set_body(DispatchExtern); +.set_body(DispatchExtern); + +TVM_REGISTER_GLOBAL("tvm.intrin.rule.metal.popcount") +.set_body(DispatchExtern); } // namespace intrin } // namespace codegen diff --git a/src/codegen/intrin_rule_opencl.cc b/src/codegen/intrin_rule_opencl.cc index a947715acdac..924abcade63f 100644 --- a/src/codegen/intrin_rule_opencl.cc +++ b/src/codegen/intrin_rule_opencl.cc @@ -10,19 +10,22 @@ namespace codegen { namespace intrin { TVM_REGISTER_GLOBAL("tvm.intrin.rule.opencl.exp") -.set_body(DispatchExtern); +.set_body(DispatchExtern); TVM_REGISTER_GLOBAL("tvm.intrin.rule.opencl.log") -.set_body(DispatchExtern); +.set_body(DispatchExtern); TVM_REGISTER_GLOBAL("tvm.intrin.rule.opencl.tanh") -.set_body(DispatchExtern); +.set_body(DispatchExtern); TVM_REGISTER_GLOBAL("tvm.intrin.rule.opencl.sqrt") -.set_body(DispatchExtern); +.set_body(DispatchExtern); TVM_REGISTER_GLOBAL("tvm.intrin.rule.opencl.pow") -.set_body(DispatchExtern); +.set_body(DispatchExtern); + +TVM_REGISTER_GLOBAL("tvm.intrin.rule.opencl.popcount") +.set_body(DispatchExtern); } // namespace intrin } // namespace codegen diff --git a/tests/python/integration/test_ewise.py b/tests/python/integration/test_ewise.py index 24adf6ff28af..f8dc43da8d31 100644 --- a/tests/python/integration/test_ewise.py +++ b/tests/python/integration/test_ewise.py @@ -60,25 +60,40 @@ def test_log_pow_llvm(): b.asnumpy(), np.power(np.log(a.asnumpy()), 2.0), rtol=1e-5) -def test_popcount_llvm(): - # graph - n = tvm.var('n') - A = tvm.placeholder((n,), name='A', dtype="uint32") - B = tvm.compute(A.shape, lambda *i: tvm.popcount(A(*i)), name='B') - s = tvm.create_schedule(B.op) +def test_popcount(): + def run(dtype): + # graph + n = tvm.convert(1024) + A = tvm.placeholder((n,), name='A', dtype=dtype) + B = tvm.compute(A.shape, lambda *i: tvm.popcount(A(*i)), name='B') + s = 
tvm.create_schedule(B.op) + # simple schedule + num_thread = 8 + bx, tx = s[B].split(B.op.axis[0], factor=num_thread) - if not tvm.module.enabled("llvm"): - return - f = tvm.build(s, [A, B], "llvm") - ctx = tvm.cpu(0) - # launch the kernel. - n = 1024 - a = tvm.nd.array(np.random.randint(low=0, high=1000, size=n, dtype=A.dtype), ctx) - b = tvm.nd.array(np.zeros(shape=n, dtype=B.dtype), ctx) - f(a, b) - np.testing.assert_allclose( - b.asnumpy(), list(map(lambda x: bin(x).count('1'), a.asnumpy())), rtol=1e-5) + def check_device(device): + if not tvm.module.enabled(device): + print("skip because %s is not enabled.." % device) + return + ctx = tvm.context(device, 0) + if str(ctx).startswith('gpu'): + s[B].bind(bx, tvm.thread_axis("blockIdx.x")) + s[B].bind(tx, tvm.thread_axis("threadIdx.x")) + func = tvm.build(s, [A, B], device) + # launch the kernel. + n = 1024 + a = tvm.nd.array(np.random.randint(low=0, high=1000, size=n, dtype=A.dtype), ctx) + b = tvm.nd.array(np.zeros(shape=n, dtype=B.dtype), ctx) + func(a, b) + np.testing.assert_allclose( + b.asnumpy(), list(map(lambda x: bin(x).count('1'), a.asnumpy())), rtol=1e-5) + check_device("llvm") + check_device("cuda") + check_device("opencl") + check_device("metal") + run('uint32') + run('uint64') def test_add(): @@ -133,5 +148,5 @@ def check_device(device): if __name__ == "__main__": test_add() test_log_pow_llvm() - test_popcount_llvm() + test_popcount() test_exp() From 9800fe205340435b8fd72c12f686df1139809c72 Mon Sep 17 00:00:00 2001 From: masahi Date: Fri, 12 Jan 2018 02:06:36 +0900 Subject: [PATCH 081/948] [TOPI] Upsampling op support (#772) * add upsampling cpu op * add upsampling gpu schedule * add doc for upsampling op add more doc * cleanup upsampling test * add doc * fix lint * fix lint * fix lint * remove unused import * remove skimage dependency * remove skimage import * remove schedule_upsampling --- topi/python/topi/generic/nn.py | 1 - topi/python/topi/nn/__init__.py | 1 + topi/python/topi/nn/upsampling.py | 28 +++++++++++++ topi/python/topi/testing/__init__.py | 1 + topi/python/topi/testing/upsampling_python.py | 15 +++++++ topi/tests/python/test_topi_upsampling.py | 39 +++++++++++++++++++ 6 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 topi/python/topi/nn/upsampling.py create mode 100644 topi/python/topi/testing/upsampling_python.py create mode 100644 topi/tests/python/test_topi_upsampling.py diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index d606213a5270..6f641e99f7dd 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -177,7 +177,6 @@ def schedule_global_pool(outs): """ return _default_schedule(outs, False) - @tvm.target.generic_func def schedule_binarize_pack(outs): """Schedule for binarize_pack diff --git a/topi/python/topi/nn/__init__.py b/topi/python/topi/nn/__init__.py index 3cdf3122e78e..918f399f503c 100644 --- a/topi/python/topi/nn/__init__.py +++ b/topi/python/topi/nn/__init__.py @@ -14,3 +14,4 @@ from .softmax import * from .conv2d_transpose import * from .bnn import * +from .upsampling import * diff --git a/topi/python/topi/nn/upsampling.py b/topi/python/topi/nn/upsampling.py new file mode 100644 index 000000000000..e1234741e286 --- /dev/null +++ b/topi/python/topi/nn/upsampling.py @@ -0,0 +1,28 @@ +"""TVM operator upsampling compute.""" +from __future__ import absolute_import +import tvm + + +def upsampling(data, scale): + """Perform nearest neighbor upsampling on the data. + Bilinear upsampling is not supported. 
+ + Parameters + ---------- + data : tvm.Tensor + 4-D with shape [batch, channel, in_height, in_width] + + scale: int + upsampling scaling factor + + Returns + ------- + output : tvm.Tensor + 4-D with shape [batch, channel, in_height*scale, in_width*scale] + """ + batch, channel, height, width = data.shape + out_height = height * scale + out_width = width * scale + + return tvm.compute((batch, channel, out_height, out_width), \ + lambda n, c, h, w: data[n, c, h/scale, w/scale]) diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py index 3a43a04437a1..6a1b361e3097 100644 --- a/topi/python/topi/testing/__init__.py +++ b/topi/python/topi/testing/__init__.py @@ -10,3 +10,4 @@ from .depthwise_conv2d_python import depthwise_conv2d_python_nchw, depthwise_conv2d_python_nhwc from .dilate_python import dilate_python from .softmax_python import softmax_python, log_softmax_python +from .upsampling_python import upsampling_python diff --git a/topi/python/topi/testing/upsampling_python.py b/topi/python/topi/testing/upsampling_python.py new file mode 100644 index 000000000000..328c7a5a0bc1 --- /dev/null +++ b/topi/python/topi/testing/upsampling_python.py @@ -0,0 +1,15 @@ +# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals +"""Upsampling in python""" +import numpy as np + +def upsample_nearest(arr, scale): + return arr.repeat(scale, axis=0).repeat(scale, axis=1) + +def upsampling_python(data, scale): + ishape = data.shape + oshape = (ishape[0], ishape[1], ishape[2]*scale, ishape[3]*scale) + output_np = np.zeros(oshape, dtype=data.dtype) + for b in range(oshape[0]): + for c in range(oshape[1]): + output_np[b, c, :, :] = upsample_nearest(data[b, c, :, :], scale) + return output_np diff --git a/topi/tests/python/test_topi_upsampling.py b/topi/tests/python/test_topi_upsampling.py new file mode 100644 index 000000000000..08b8f987694d --- /dev/null +++ b/topi/tests/python/test_topi_upsampling.py @@ -0,0 +1,39 @@ +"""Test code for upsampling""" +import numpy as np +import tvm +import topi +import math + +def verify_upsampling(batch, in_channel, in_height, in_width, scale): + A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') + B = topi.nn.upsampling(A, scale) + out_shape = (batch, in_channel, in_height*scale, in_width*scale) + dtype = A.dtype + + a_np = np.random.uniform(size=(batch, in_channel, in_height, in_width)).astype(dtype) + b_np = topi.testing.upsampling_python(a_np, scale) + + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_injective(B) + ctx = tvm.context(device, 0) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) + f = tvm.build(s, [A, B], device) + f(a, b) + + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + + for device in ['llvm', 'cuda']: + check_device(device) + +def test_upsampling(): + verify_upsampling(8, 16, 32, 32, 2) + verify_upsampling(12, 32, 64, 64, 3) + +if __name__ == "__main__": + test_upsampling() From 33085208adc290ef9511feaa0804b42f8d448a58 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 11 Jan 2018 20:14:17 -0800 Subject: [PATCH 082/948] [LLVM] Enable same target option in JITModule (#778) * [LLVM] Enable same target option in JITModule * not set mcpu explicitly --- src/codegen/llvm/llvm_common.cc | 69 +++++++++++++++++++++--------- src/codegen/llvm/llvm_common.h | 
14 ++++++ src/codegen/llvm/llvm_module.cc | 31 ++++++++++++-- topi/tests/python/test_topi_bnn.py | 6 +-- 4 files changed, 92 insertions(+), 28 deletions(-) diff --git a/src/codegen/llvm/llvm_common.cc b/src/codegen/llvm/llvm_common.cc index b34bf0d5ec91..f8b80118cb5d 100644 --- a/src/codegen/llvm/llvm_common.cc +++ b/src/codegen/llvm/llvm_common.cc @@ -36,9 +36,11 @@ void InitializeLLVM() { } } -llvm::TargetMachine* -GetLLVMTargetMachine(const std::string& target_str, - bool allow_null) { +void ParseLLVMTargetOptions(const std::string& target_str, + std::string* triple, + std::string* mcpu, + std::string* mattr, + llvm::TargetOptions* options) { // setup target triple size_t start = 0; if (target_str.length() >= 4 && @@ -46,9 +48,10 @@ GetLLVMTargetMachine(const std::string& target_str, start = 4; } // simple parser - std::string target_triple = ""; - std::string cpu = "generic"; - std::string attr = ""; + triple->resize(0); + mcpu->resize(0); + mattr->resize(0); + bool soft_float_abi = false; std::string key, value; std::istringstream is(target_str.substr(start, target_str.length() - start)); @@ -69,11 +72,11 @@ GetLLVMTargetMachine(const std::string& target_str, } if (key == "-target" || key == "-mtriple") { - target_triple = value; + *triple = value; } else if (key == "-mcpu") { - cpu = value; + *mcpu = value; } else if (key == "-mattr") { - attr = value; + *mattr = value; } else if (key == "-mfloat-abi") { if (value == "hard") { soft_float_abi = false; @@ -89,19 +92,13 @@ GetLLVMTargetMachine(const std::string& target_str, } } - if (target_triple.length() == 0 || - target_triple == "default") { - target_triple = llvm::sys::getDefaultTargetTriple(); - } - std::string err; - const llvm::Target* target = - llvm::TargetRegistry::lookupTarget(target_triple, err); - if (target == nullptr) { - CHECK(allow_null) << err << " target_triple=" << target_triple; - return nullptr; + if (triple->length() == 0 || + *triple == "default") { + *triple = llvm::sys::getDefaultTargetTriple(); } // set target option - llvm::TargetOptions opt; + llvm::TargetOptions& opt = *options; + opt = llvm::TargetOptions(); #if TVM_LLVM_VERSION < 50 opt.LessPreciseFPMADOption = true; #endif @@ -114,8 +111,38 @@ GetLLVMTargetMachine(const std::string& target_str, } else { opt.FloatABIType = llvm::FloatABI::Hard; } +} + + +llvm::TargetMachine* +GetLLVMTargetMachine(const std::string& target_str, + bool allow_null) { + std::string target_triple, mcpu, mattr; + llvm::TargetOptions opt; + + ParseLLVMTargetOptions(target_str, + &target_triple, + &mcpu, + &mattr, + &opt); + + if (target_triple.length() == 0 || + target_triple == "default") { + target_triple = llvm::sys::getDefaultTargetTriple(); + } + if (mcpu.length() == 0) { + mcpu = "generic"; + } + + std::string err; + const llvm::Target* target = + llvm::TargetRegistry::lookupTarget(target_triple, err); + if (target == nullptr) { + CHECK(allow_null) << err << " target_triple=" << target_triple; + return nullptr; + } llvm::TargetMachine* tm = target->createTargetMachine( - target_triple, cpu, attr, opt, llvm::Reloc::PIC_); + target_triple, mcpu, mattr, opt, llvm::Reloc::PIC_); return tm; } diff --git a/src/codegen/llvm/llvm_common.h b/src/codegen/llvm/llvm_common.h index 11ff66d8ca38..35403952f21f 100644 --- a/src/codegen/llvm/llvm_common.h +++ b/src/codegen/llvm/llvm_common.h @@ -57,6 +57,20 @@ namespace codegen { */ void InitializeLLVM(); +/*! 
+ * \brief Parse target options + * \param target_str Target string, in format "llvm -target=xxx -mcpu=xxx" + * \param triple Target triple + * \param mcpu cpu info + * \param options the options + * \param mattr The attributes + */ +void ParseLLVMTargetOptions(const std::string& target_str, + std::string* triple, + std::string* mcpu, + std::string* mattr, + llvm::TargetOptions* options); + /*! * \brief Get target machine from target_str string. * \param target_str Target string, in format "llvm -target=xxx -mcpu=xxx" diff --git a/src/codegen/llvm/llvm_module.cc b/src/codegen/llvm/llvm_module.cc index 52c4f8897888..439c132d09a8 100644 --- a/src/codegen/llvm/llvm_module.cc +++ b/src/codegen/llvm/llvm_module.cc @@ -120,6 +120,10 @@ class LLVMModuleNode final : public runtime::ModuleNode { } cg->AddMainFunction(funcs[0]->name); module_ = cg->Finish(); + module_->addModuleFlag( + llvm::Module::Warning, "tvm_target", + llvm::MDString::get(*ctx_, target)); + target_ = target; mptr_ = module_.get(); } @@ -133,11 +137,19 @@ class LLVMModuleNode final : public runtime::ModuleNode { LOG(FATAL) << "Fail to load ir file " << file_name << "\n" << "line " << err.getLineNo() << ":" << msg; } - std::string target = module_->getTargetTriple(); + std::string target_; + llvm::Metadata* mtarget = module_->getModuleFlag("tvm_target"); + if (mtarget != nullptr) { + llvm::MDString* pstr = llvm::dyn_cast(mtarget); + CHECK(pstr != nullptr); + target_ = pstr->getString(); + } else { + std::ostringstream os; + os << "llvm -target " << module_->getTargetTriple(); + target_ = os.str(); + } mptr_ = module_.get(); - std::ostringstream os; - os << "llvm -target " << target; - tm_ = GetLLVMTargetMachine(os.str()); + tm_ = GetLLVMTargetMachine(target_); } private: @@ -145,8 +157,19 @@ class LLVMModuleNode final : public runtime::ModuleNode { CHECK(ee_ == nullptr); std::lock_guard lock(mutex_); llvm::EngineBuilder builder(std::move(module_)); + std::string triple, mcpu, mattr; + llvm::TargetOptions opt; + ParseLLVMTargetOptions(target_, &triple, &mcpu, &mattr, &opt); builder.setEngineKind(llvm::EngineKind::JIT); builder.setOptLevel(llvm::CodeGenOpt::Aggressive); + if (mcpu.length() != 0) { + builder.setMCPU(mcpu); + } + if (mattr.length() != 0) { + std::vector mattrs{mattr}; + builder.setMAttrs(mattrs); + } + builder.setTargetOptions(opt); llvm::TargetMachine *tm = builder.selectTarget(); llvm::TargetMachine *tm_sys = GetLLVMTargetMachine("llvm"); if (tm_sys->getTargetTriple().getArch() != tm->getTargetTriple().getArch()) { diff --git a/topi/tests/python/test_topi_bnn.py b/topi/tests/python/test_topi_bnn.py index 5e6a11afc602..90abc68e6b68 100644 --- a/topi/tests/python/test_topi_bnn.py +++ b/topi/tests/python/test_topi_bnn.py @@ -38,9 +38,9 @@ def get_ref_data(): bnn_a = tvm.nd.array(np.zeros(get_const_tuple(bnn_A.shape), dtype=bnn_A.dtype), ctx) bnn_b = tvm.nd.array(np.zeros(get_const_tuple(bnn_B.shape), dtype=bnn_B.dtype), ctx) bnn_c = tvm.nd.array(np.zeros(get_const_tuple(bnn_C.shape), dtype=bnn_C.dtype), ctx) - f1 = tvm.build(s1, [A, bnn_A], 'llvm -mcpu=core-avx2') - f2 = tvm.build(s2, [B, bnn_B], 'llvm -mcpu=core-avx2') - f3 = tvm.build(s3, [bnn_A1, bnn_B1, bnn_C], 'llvm -mcpu=core-avx2') + f1 = tvm.build(s1, [A, bnn_A], 'llvm') + f2 = tvm.build(s2, [B, bnn_B], 'llvm') + f3 = tvm.build(s3, [bnn_A1, bnn_B1, bnn_C], 'llvm') f1(a, bnn_a) f2(b, bnn_b) f3(bnn_a, bnn_b, bnn_c) From 9060417ab778b9c8bd3a4a62d01157464f9dad47 Mon Sep 17 00:00:00 2001 From: Aman Date: Mon, 15 Jan 2018 09:23:20 -0800 Subject: [PATCH 083/948] Fix 
lib64 not found error when building with cuda for OSX (#782) --- Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7f612f450cb0..fef8b2a08051 100644 --- a/Makefile +++ b/Makefile @@ -79,7 +79,11 @@ RUNTIME_DEP = $(RUNTIME_OBJ) ifdef CUDA_PATH NVCC=$(CUDA_PATH)/bin/nvcc CFLAGS += -I$(CUDA_PATH)/include - LDFLAGS += -L$(CUDA_PATH)/lib64 + ifeq ($(UNAME_S),Darwin) + LDFLAGS += -L$(CUDA_PATH)/lib + else + LDFLAGS += -L$(CUDA_PATH)/lib64 + endif endif ifeq ($(USE_CUDA), 1) From f01dbf08d04f5e6851ed2f8de37f762541f050c7 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 15 Jan 2018 12:39:30 -0800 Subject: [PATCH 084/948] try to fix test (#784) try to fix fix --- topi/tests/python/test_topi_reduce.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/topi/tests/python/test_topi_reduce.py b/topi/tests/python/test_topi_reduce.py index 08e66e1404b6..c8d95df255c6 100644 --- a/topi/tests/python/test_topi_reduce.py +++ b/topi/tests/python/test_topi_reduce.py @@ -54,7 +54,7 @@ def check_device(device): with tvm.target.create(device): s = topi.generic.schedule_reduce(B) ctx = tvm.context(device, 0) - foo = tvm.build(s, [A, B], device, name="sum") + foo = tvm.build(s, [A, B], device, name=type) # Test in_npy = np.random.uniform(size=in_shape).astype(np.float32) in_npy_map = np.sqrt(np.exp(in_npy)).astype(np.float32) @@ -74,6 +74,21 @@ def check_device(device): out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype) for _ in range(1): foo(data_tvm, out_tvm) + if type == "argmax" or type == "argmin": + out_tvm_indices = out_tvm.asnumpy() + if keepdims: + out_tvm_indices = np.take(out_tvm_indices, indices=0, axis=axis) + if axis is None: + out_tvm_val = in_npy_map.ravel()[out_tvm_indices] + else: + other_indices = tuple(np.indices(in_shape[0:axis] + in_shape[(axis+1):])) + sel_indices = other_indices[0:axis] + (out_tvm_indices,) + other_indices[axis:] + out_tvm_val = in_npy_map[sel_indices] + if type == "argmax": + np.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1E-3, 1E-3) + elif type == "argmin": + np.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3) + np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3) for device in ["cuda", "opencl", "metal", "llvm", "rocm"]: check_device(device) From 7d5ed8af5d8cae97734b4b49dc76fda92d8b3ee8 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 17 Jan 2018 00:20:27 +0800 Subject: [PATCH 085/948] [CODEGEN] fix vector conversion for opencl (#783) * support more argument type in depthwise_conv2d * mark all pointer as 'restrict' & fix vector conversion for opencl --- src/codegen/codegen_c.cc | 22 +++++++++---------- src/codegen/codegen_c.h | 4 ++-- src/codegen/codegen_opencl.cc | 16 ++++++++++++++ src/codegen/codegen_opencl.h | 2 ++ src/pass/split_host_device.cc | 3 --- topi/python/topi/nn/depthwise_conv2d.py | 22 +++++++++++++++---- .../topi/testing/depthwise_conv2d_python.py | 12 ++++++++-- .../python/test_topi_depthwise_conv2d.py | 7 +++--- 8 files changed, 62 insertions(+), 26 deletions(-) diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc index 05ba1f2f357b..403a5525b170 100644 --- a/src/codegen/codegen_c.cc +++ b/src/codegen/codegen_c.cc @@ -38,14 +38,17 @@ void CodeGenC::AddFunction(LoweredFunc f) { if (i != 0) stream << ", "; if (v.type().is_handle()) { auto it = alloc_storage_scope_.find(v.get()); - if (it != alloc_storage_scope_.end()) { + if (it != alloc_storage_scope_.end()) 
PrintStorageScope(it->second, stream); - stream << ' '; + stream << ' '; + + if (handle_data_type_.count(v.get())) { + PrintType(handle_data_type_.at(v.get()), stream); + } else { + stream << "void"; } - } - if (handle_data_type_.count(v.get())) { - PrintType(handle_data_type_.at(v.get()), stream); stream << "*"; + if (f->is_restricted && restrict_keyword_.length() != 0) { stream << ' ' << restrict_keyword_; } @@ -402,12 +405,9 @@ inline void PrintBinaryIntrinsitc(const Call* op, } } void CodeGenC::VisitExpr_(const Cast *op, std::ostream& os) { // NOLINT(*) - os << "("; - this->PrintType(op->type, os); - os << ")"; - os << '('; - this->PrintExpr(op->value, os); - os << ')'; + std::stringstream value; + this->PrintExpr(op->value, value); + os << CastFromTo(value.str(), op->value.type(), op->type); } void CodeGenC::VisitExpr_(const Variable *op, std::ostream& os) { // NOLINT(*) os << GetVarID(op); diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h index 1c68dd18bd68..895e94b8198e 100644 --- a/src/codegen/codegen_c.h +++ b/src/codegen/codegen_c.h @@ -142,6 +142,8 @@ class CodeGenC : // print store of single element. virtual void PrintVecElemStore( const std::string& vec, Type t, int i, const std::string& value); + // Get a cast type from to + virtual std::string CastFromTo(std::string value, Type from, Type target); protected: // Print reference to struct location @@ -150,8 +152,6 @@ class CodeGenC : // print reference to a buffer as type t in index. std::string GetBufferRef( Type t, const Variable* buffer, Expr index); - // Get a cast type from to - std::string CastFromTo(std::string value, Type from, Type target); /*! * \brief If buffer is allocated as type t. * \param buf_var The buffer variable. diff --git a/src/codegen/codegen_opencl.cc b/src/codegen/codegen_opencl.cc index d2133f85a08c..03f4acdd057c 100644 --- a/src/codegen/codegen_opencl.cc +++ b/src/codegen/codegen_opencl.cc @@ -175,6 +175,22 @@ void CodeGenOpenCL::PrintStorageScope( } } +std::string CodeGenOpenCL::CastFromTo(std::string value, Type from, Type target) { + if (from == target) return value; + std::ostringstream os; + if (target.lanes() == 1) { + os << "(("; + this->PrintType(target, os); + os << ")" << value << ")"; + } else { // convert vector type + os << "("; + os << "convert_"; + this->PrintType(target, os); + os << "(" << value << "))"; + } + return os.str(); +} + void CodeGenOpenCL::VisitExpr_(const Broadcast* op, std::ostream& os) { // NOLINT(*) std::string v = PrintExpr(op->value); os << "(("; diff --git a/src/codegen/codegen_opencl.h b/src/codegen/codegen_opencl.h index a10c165ee3a1..424bfa5ae2b3 100644 --- a/src/codegen/codegen_opencl.h +++ b/src/codegen/codegen_opencl.h @@ -34,6 +34,8 @@ class CodeGenOpenCL final : public CodeGenC { // the address of load/store void PrintVecAddr(const Variable* buffer, Type t, Expr base, std::ostream& os); // NOLINT(*) + std::string CastFromTo(std::string value, Type from, Type target); // NOLINT(*) + // overload visitor void VisitExpr_(const Broadcast* op, std::ostream& os) final; // NOLINT(*) diff --git a/src/pass/split_host_device.cc b/src/pass/split_host_device.cc index 44e9753081db..942e70339488 100644 --- a/src/pass/split_host_device.cc +++ b/src/pass/split_host_device.cc @@ -191,9 +191,6 @@ class HostDeviceSplitter : public IRMutator { auto it = handle_data_type_.find(v.get()); if (it != handle_data_type_.end()) { n->handle_data_type.Set(v, it->second); - } else { - // int32 as a placeholder - n->handle_data_type.Set(v, make_const(UInt(32), 0)); } } } diff 
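The CastFromTo override above keeps C-style casts for scalars but switches to OpenCL's convert_<type>() builtins for vector values, which plain C casts cannot express. The emission rule, restated as a small Python sketch (illustrative only; the authoritative logic is the C++ just shown):

    # Scalar casts stay C-style, vector casts use OpenCL convert_* builtins.
    def opencl_cast(value, target_type, lanes):
        if lanes == 1:
            return "((%s)%s)" % (target_type, value)
        return "(convert_%s(%s))" % (target_type, value)

    print(opencl_cast("x", "float", 1))   # ((float)x)
    print(opencl_cast("v", "float4", 4))  # (convert_float4(v))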
--git a/topi/python/topi/nn/depthwise_conv2d.py b/topi/python/topi/nn/depthwise_conv2d.py index 785bdab27738..2f0aa7ea95ce 100644 --- a/topi/python/topi/nn/depthwise_conv2d.py +++ b/topi/python/topi/nn/depthwise_conv2d.py @@ -31,9 +31,14 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding, out_dtype='float32'): Output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ + out_dtype = Input.dtype + batch, in_channel, in_height, in_width = Input.shape filter_channel, channel_multiplier, filter_height, filter_width = Filter.shape - stride_h, stride_w = stride + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (filter_height, filter_width)) @@ -82,7 +87,10 @@ def depthwise_conv2d_nhwc(Input, Filter, stride, padding): """ batch, in_height, in_width, in_channel = Input.shape filter_height, filter_width, filter_channel, channel_multiplier = Filter.shape - stride_h, stride_w = stride + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (filter_height, filter_width)) @@ -131,7 +139,10 @@ def depthwise_conv2d_backward_input_nhwc(Filter, Out_grad, oshape, ishape, strid batch, in_h, in_w, in_c = ishape _, out_h, out_w, out_c = oshape filter_h, filter_w, _, channel_multiplier = Filter.shape - stride_h, stride_w = stride + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride dilated_out_grad = dilate(Out_grad, [1, stride_h, stride_w, 1], name='dilated_out_grad') @@ -186,7 +197,10 @@ def depthwise_conv2d_backward_weight_nhwc(Input, Out_grad, oshape, fshape, strid batch, out_h, out_w, out_c = oshape filter_h, filter_w, _, channel_multiplier = fshape in_c = Input.shape[3].value - stride_h, stride_w = stride + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (filter_h, filter_w)) diff --git a/topi/python/topi/testing/depthwise_conv2d_python.py b/topi/python/topi/testing/depthwise_conv2d_python.py index 84784f97c2b8..d7baf4a1beaf 100644 --- a/topi/python/topi/testing/depthwise_conv2d_python.py +++ b/topi/python/topi/testing/depthwise_conv2d_python.py @@ -27,7 +27,11 @@ def depthwise_conv2d_python_nchw(input_np, filter_np, stride, padding): """ batch, in_channel, in_height, in_width = input_np.shape _, channel_multiplier, filter_height, filter_width = filter_np.shape - stride_h, stride_w = stride + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + # calculate output shape if padding == 'VALID': out_channel = in_channel * channel_multiplier @@ -84,7 +88,11 @@ def depthwise_conv2d_python_nhwc(input_np, filter_np, stride, padding): """ batch, in_height, in_width, in_channel = input_np.shape filter_height, filter_width, _, channel_multiplier = filter_np.shape - stride_h, stride_w = stride + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + # calculate output shape if padding == 'VALID': out_channel = in_channel * channel_multiplier diff --git a/topi/tests/python/test_topi_depthwise_conv2d.py b/topi/tests/python/test_topi_depthwise_conv2d.py index df9a755c078e..62230857ba02 100644 --- a/topi/tests/python/test_topi_depthwise_conv2d.py +++ b/topi/tests/python/test_topi_depthwise_conv2d.py @@ -7,18 +7,17 @@ from 
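Every function touched above now accepts the stride either as a single int or as a (stride_h, stride_w) pair, via the same isinstance check. That check can be read as one small normalization helper (sketch only; no such helper is added by the patch):

    def normalize_stride(stride):
        # int -> same stride in both dimensions; pair -> (stride_h, stride_w).
        if isinstance(stride, int):
            return stride, stride
        stride_h, stride_w = stride
        return stride_h, stride_w

    assert normalize_stride(2) == (2, 2)
    assert normalize_stride((2, 1)) == (2, 1)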
topi.cuda.depthwise_conv2d import schedule_depthwise_conv2d_nhwc -def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_multiplier, filter_height, stride_h, padding): +def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding): in_width = in_height filter_channel = in_channel filter_width = filter_height - stride_w = stride_h # placeholder Input = tvm.placeholder((batch, in_channel, in_height, in_width), name='Input') Filter = tvm.placeholder((filter_channel, channel_multiplier, filter_height, filter_width), name='Filter') Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale') Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift') # declare - DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, Filter, stride=[stride_h, stride_w], padding=padding) + DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, Filter, stride=stride, padding=padding) ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift) Relu = topi.nn.relu(ScaleShift) @@ -56,7 +55,7 @@ def get_ref_data(): shift_np = np.random.uniform(size=shift_shape).astype(dtype) # correctness with scipy depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw( - input_np, filter_np, stride=[stride_h, stride_w], padding=padding) + input_np, filter_np, stride=stride, padding=padding) scale_shift_scipy = np.zeros(shape=scale_shift_shape) for c in range(in_channel * channel_multiplier): scale_shift_scipy[:,c,:,:] = depthwise_conv2d_scipy[:,c,:,:] * scale_np[c] + shift_np[c] From 717aa0cfea3cbf85a6246dc20740b05255f4bb0b Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 17 Jan 2018 01:14:01 +0800 Subject: [PATCH 086/948] [TOPI] add schedule for ARM Mali GPU (#786) * add schedule for ARM Mali GPU * fix lint * fix lint --- dmlc-core | 2 +- python/tvm/target.py | 13 + topi/python/topi/__init__.py | 1 + topi/python/topi/mali/__init__.py | 7 + topi/python/topi/mali/conv2d.py | 479 ++++++++++++++++++++++ topi/python/topi/mali/dense.py | 100 +++++ topi/python/topi/mali/depthwise_conv2d.py | 106 +++++ 7 files changed, 707 insertions(+), 1 deletion(-) create mode 100644 topi/python/topi/mali/__init__.py create mode 100644 topi/python/topi/mali/conv2d.py create mode 100644 topi/python/topi/mali/dense.py create mode 100644 topi/python/topi/mali/depthwise_conv2d.py diff --git a/dmlc-core b/dmlc-core index 674a662c22b9..c0871823b518 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 674a662c22b900b76e8a3c9b77987a2c5563ba71 +Subproject commit c0871823b518093a0d04d6cba0a3291bc7b31401 diff --git a/python/tvm/target.py b/python/tvm/target.py index 8f5f4bc87852..3437e70e0a6d 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -264,6 +264,19 @@ def rasp(options=None): return Target("llvm", opts) +def mali(options=None): + """Returns a ARM Mali GPU target. + + Parameters + ---------- + options : list of str + Additional options + """ + opts = ["-device=mali"] + opts = _merge_opts(opts, options) + return Target("opencl", opts) + + def create(target_str): """Get a target given target string. diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py index c28dfb34e8b6..9760722798af 100644 --- a/topi/python/topi/__init__.py +++ b/topi/python/topi/__init__.py @@ -17,6 +17,7 @@ from . import x86 from . import cuda from . import rasp +from . import mali from . import testing from . import util from . 
import rocm diff --git a/topi/python/topi/mali/__init__.py b/topi/python/topi/mali/__init__.py new file mode 100644 index 000000000000..bdd718e043a0 --- /dev/null +++ b/topi/python/topi/mali/__init__.py @@ -0,0 +1,7 @@ +# pylint: disable=redefined-builtin, wildcard-import +"""ARM Mali GPU specific declaration and schedules.""" +from __future__ import absolute_import as _abs + +from .conv2d import * +from .depthwise_conv2d import * +from .dense import * diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py new file mode 100644 index 000000000000..940d64668ac4 --- /dev/null +++ b/topi/python/topi/mali/conv2d.py @@ -0,0 +1,479 @@ +# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return +"""conv2d schedule on ARM Mali GPU""" + +from __future__ import absolute_import as _abs +import tvm + +from .. import generic +from .. import util +from .. import tag +from ..nn import pad +from ..nn.conv2d import conv2d +from ..nn.util import get_pad_tuple + +##### SCHEDULE UTILITIES ##### +def fuse_and_bind(s, tensor, axis=None, num_thread=None): + """ fuse all the axis and bind to GPU threads """ + axis = axis or s[tensor].op.axis + fused = s[tensor].fuse(*axis) + max_threads = tvm.target.current_target(allow_none=False).max_num_threads + bx, tx = s[tensor].split(fused, num_thread or max_threads) + s[tensor].bind(bx, tvm.thread_axis("blockIdx.x")) + s[tensor].bind(tx, tvm.thread_axis("threadIdx.x")) + return bx, tx + +def tile_and_bind(s, tensor, y, x, y_factor, x_factor=None): + """ tile and bind to GPU threads """ + x_factor = x_factor or y_factor + yo, xo, yi, xi = s[tensor].tile(y, x, y_factor, x_factor) + s[tensor].bind(xo, tvm.thread_axis("blockIdx.x")) + s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) + s[tensor].bind(yo, tvm.thread_axis("blockIdx.y")) + s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) + return yo, xo, yi, xi + +def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None): + """ tile and bind 3d """ + y_factor = y_factor or z_factor + x_factor = x_factor or y_factor + zo, zi = s[tensor].split(z, z_factor) + yo, yi = s[tensor].split(y, y_factor) + xo, xi = s[tensor].split(x, x_factor) + s[tensor].bind(zo, tvm.thread_axis("blockIdx.z")) + s[tensor].bind(zi, tvm.thread_axis("threadIdx.z")) + s[tensor].bind(yo, tvm.thread_axis("blockIdx.y")) + s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) + s[tensor].bind(xo, tvm.thread_axis("blockIdx.x")) + s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) + +def pack_tensor(s, tensor, factor, readers): + """ do transform X[n, m] -> X[n / factor, m, factor] """ + tmp = s.cache_read(tensor, 'global', readers) + y, x = s[tmp].op.axis + yo, yi = s[tmp].split(y, factor) + s[tmp].reorder(yo, x, yi) + s[tmp].compute_inline() + return s.cache_write(tmp, 'global') + +def transpose(s, tensor, readers): + """ do transform X[n, m] -> X[m, n] """ + tmp = s.cache_read(tensor, 'global', readers) + y, x = s[tmp].op.axis + s[tmp].reorder(x, y) + s[tmp].compute_inline() + return s.cache_write(tmp, "global"), tmp + +@conv2d.register("mali") +def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'): + """Conv2D operator for ARM Mali GPU backend. 
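The schedule utilities defined above (fuse_and_bind, tile_and_bind, tile_and_bind3d) are the building blocks for all of the Mali schedules that follow. A minimal usage sketch on a plain elementwise op, assuming the same tvm API used throughout this patch and that tile_and_bind is in scope:

    import tvm

    n, m = 1024, 1024
    A = tvm.placeholder((n, m), name="A")
    B = tvm.compute((n, m), lambda i, j: A[i, j] * 2.0, name="B")
    s = tvm.create_schedule(B.op)
    y, x = s[B].op.axis
    # Tile 8x8 and bind to blockIdx.x/y and threadIdx.x/y, as the helper does.
    tile_and_bind(s, B, y, x, y_factor=8, x_factor=8)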
+ + Parameters + ---------- + data : tvm.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] + + kernel : tvm.Tensor + 4-D with shape [num_filter, in_channel, filter_height, filter_width] + + stride : int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + + padding : int or a list/tuple of two ints + padding size, or [pad_height, pad_width] + + layout : str + layout of data + + Returns + ------- + output : tvm.Tensor + 4-D with shape [batch, out_channel, out_height, out_width] + """ + assert layout == 'NCHW', "only support NCHW convolution on mali" + assert data.shape[0].value == 1, "only support batch size=1 convolution on mali" + assert data.dtype == kernel.dtype, "Do not support inputs with different data types now." + + out_dtype = data.dtype + if util.get_const_int(kernel.shape[2]) == 1: + return _decl_im2col(data, kernel, stride, padding, layout, out_dtype) + else: + return _decl_direct(data, kernel, stride, padding, layout, out_dtype) + +@generic.schedule_conv2d_nchw.register(["mali"]) +def schedule_conv2d_nchw(outs): + """Schedule for conv2d_nchw for ARM Mali GPU + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of conv2d_nchw + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for conv2d_nchw. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def traverse(op): + """inline all one-to-one-mapping operators except the last stage (output)""" + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + if 'im2col_conv_output' in op.tag: + _schedule_im2col_conv2d(s, op) + + if 'direct_conv_output' in op.tag: + _schedule_direct_conv2d(s, op) + + traverse(outs[0].op) + return s + +def _decl_direct(data, kernel, stride, padding, layout, out_dtype): + """declare the direct method (spatial packing) for conv2d""" + _, CI, IH, IW = [util.get_const_int(x) for x in data.shape] + CO, _, KH, KW = [util.get_const_int(x) for x in kernel.shape] + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + HCAT, WCAT = KH - 1, KW - 1 + + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + + N = 1 + TH = IH + 2*HPAD + TW = IW + 2*WPAD + OH = (IH + 2*HPAD - KH) // HSTR + 1 + OW = (IW + 2*WPAD - KW) // WSTR + 1 + + DO_PAD = (HPAD != 0 and WPAD != 0) + if DO_PAD: + data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") + else: + data_pad = data + + # set tunable parameters (tile factor, ...) 
+ tune_config = getattr(tvm.target.current_target(), "tune_config", None) + if tune_config is None: + VH = 1 + VW, VC = 4, 4 + # correct tile factor + if OW % VW != 0: + if OW == 14: + VW = 2 + VC = 8 + elif OW == 7: + VW = 7 + else: + VH = tune_config['VH'] + VW = tune_config['VW'] + VC = tune_config['VC'] + + if data.dtype == 'float16': + VC *= 2 + + assert CO % VC == 0 + assert OH % VH == 0, "OH: %d VH : %d" % (OH, VH) + assert OW % VW == 0, "OW: %d VW : %d" % (OW, VW) + + dvshape = (N, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT) + kvshape = (CO // VC, CI, KH, KW, VC) + ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC) + oshape = (N, CO, OH, OW) + + data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw: + data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], + name='data_vec') + + kernel_vec = tvm.compute(kvshape, lambda co, ci, kh, kw, vc: + kernel[co*VC+vc][ci][kh][kw], + name='kernel_vec') + + ci = tvm.reduce_axis((0, CI), name='ci') + kh = tvm.reduce_axis((0, KH), name='kh') + kw = tvm.reduce_axis((0, KW), name='kw') + + conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc:\ + tvm.sum(data_vec[n, h, w, ci, vh*HSTR+kh, vw*WSTR+kw].astype(out_dtype) * + kernel_vec[co, ci, kh, kw, vc].astype(out_dtype), + axis=[ci, kh, kw]), name='conv') + + output = tvm.compute(oshape, lambda n, co, h, w: + conv[n][co//VC][h/VH][w//VW][h%VH][w%VW][co%VC], + name='output_unpack', tag='direct_conv_output') + + return output + +def _schedule_direct_conv2d(s, op): + """schedule the direct method (spatial packing) for conv2d""" + # get ops and tensors + output = op.output(0) + output_height = util.get_const_int(output.shape[2]) + + conv = op.input_tensors[0] + data_vec = s[conv].op.input_tensors[0] + kernel_vec = s[conv].op.input_tensors[1] + data = s[data_vec].op.input_tensors[0] + kernel = s[kernel_vec].op.input_tensors[0] + + # set tunable parameters (tile factor, ...) 
+ tune_config = getattr(tvm.target.current_target(), "tune_config", None) + if tune_config is None: + num_thread = 8 + + out_channel = util.get_const_int(kernel.shape[0]) + in_channel = util.get_const_int(kernel.shape[1]) + in_width = util.get_const_int(data.shape[2]) + + if in_width >= 224: + pass + elif in_width >= 112: + pass + elif in_width >= 56: + if out_channel != in_channel: + num_thread = 16 + elif in_width >= 28: + if out_channel >= 256: + num_thread = 16 + elif in_width >= 14: + if in_channel == out_channel: + num_thread = 8 + else: + num_thread = 4 + else: + num_thread = tune_config["num_thread"] + + last = 1 + if output_height == 28: + last = 7 + num_thread = 32 + + if data.dtype == 'float16' and (util.get_const_int(conv.shape[1]) == 4 or output_height == 28): + num_thread /= 2 + + # schedule padding + if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + data_pad = data + data = data_pad.op.input_tensors[0] + s[data_pad].compute_inline() + + # schedule data packing + _, h, w, ci, vh, vw = s[data_vec].op.axis + tile_and_bind3d(s, data_vec, h, w, ci, 1) + s[data_vec].unroll(vw) + + # schedule kernel packing + co, ci, kh, kw, vc = s[kernel_vec].op.axis + tile_and_bind(s, kernel_vec, co, ci, 1) + s[kernel_vec].unroll(kh) + s[kernel_vec].unroll(kw) + s[kernel_vec].vectorize(vc) + + # schedule convolution + _, c, h, w, vh, vw, vc = s[conv].op.axis + kc, kh, kw = s[conv].op.reduce_axis + s[conv].reorder(_, c, h, w, vh, kc, kh, kw, vw, vc) + tile_and_bind3d(s, conv, c, h, w, num_thread, 1, last) + s[conv].unroll(kh) + s[conv].unroll(kw) + s[conv].unroll(vw) + s[conv].vectorize(vc) + + # schedule output + if output.op not in s.outputs: # has bias + s[output].compute_inline() + output = s.outputs[0] + + _, co, oh, ow = s[output].op.axis + tile_and_bind3d(s, output, co, oh, ow, num_thread, 1, last) + + #print(tvm.lower(s, [data, kernel, output], simple_mode=True)) + +def _decl_im2col(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'): + """declare the Im2Col method for conv2d""" + _, CI, IH, IW = [x.value for x in data.shape] + CO, _, KH, KW = [x.value for x in kernel.shape] + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + + N = 1 + OH = (IH + 2*HPAD - KH) // HSTR + 1 + OW = (IW + 2*WPAD - KW) // WSTR + 1 + + DO_PAD = (HPAD != 0 and WPAD != 0) + if DO_PAD: + data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") + else: + data_pad = data + + ALIGN = 16 + def upround(x, align): + return (x + align - 1) / align * align + + # A [CO, CI * KH * KW] + reduce_len = upround(CI * KH * KW, ALIGN) + A = tvm.compute((upround(CO, ALIGN), reduce_len), lambda i, j: + kernel[i][j / KW / KH][j / KW % KH][j % KW], name='A') + + # B [CI * KH * KW, N * OH * OW] + B = tvm.compute((reduce_len, upround(N * OH * OW, ALIGN)), lambda i, j:\ + tvm.select(tvm.all(i < CI * KH * KW, j < N * OH * OW), + data_pad[j / (OH*OW)][i / (KH*KW)][j / OW % OH*HSTR + i / KW % KH] + [j % OW*WSTR + i % KW], + tvm.const(0, data_pad.dtype)), name='B') + + gemm_n, gemm_l, gemm_m = A.shape[0], reduce_len, B.shape[1] + + # C [CO, N * OH * OW] + k = tvm.reduce_axis((0, gemm_l), name='k') + C = tvm.compute((gemm_n, gemm_m), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') + + # output + # the last term C[gemm_n-1, gemm_m-1] is for enabling the alignment, + # otherwise the alignment above will be eliminated by bound inference + output = tvm.compute((N, CO, OH, OW), lambda n, co, h, w:\ 
+ C[co][n * OW * OW + h * OW + w] + tvm.const(0, C.dtype) * C[gemm_n-1, gemm_m-1], + name='output', tag='im2col_conv_output') + + return output + +def _schedule_im2col_conv2d(s, op): + """schedule the Im2Col method for conv2d""" + + # get ops and tensors + output = op.output(0) + C = op.input_tensors[0] + A, B = C.op.input_tensors + kernel = A.op.input_tensors[0] + data = B.op.input_tensors[0] + + # tuning parameter config + tune_config = getattr(tvm.target.current_target(), "tune_config", None) + if tune_config is None: # use rule + bn = 4 + unroll_step = 16 + + total_work = util.get_const_int(C.shape[0] * C.shape[1]) + reduce_work = util.get_const_int(A.shape[1]) + if total_work > 200000: + last_work = util.get_const_int(C.shape[1]) + if last_work > 10000: + num_thread = 16 + elif last_work > 3000: + num_thread = 8 + elif reduce_work > 100: + num_thread = 4 + else: + num_thread = 2 + + if reduce_work < 50 and last_work < 30000: + num_thread = 4 + elif total_work > 150000: + num_thread = 8 + elif total_work > 50000: + num_thread = 4 + else: + num_thread = 2 + + if num_thread == 4: + unroll_step = 2 + else: + bn = tune_config["bn"] + num_thread = tune_config["num_thread"] + unroll_step = tune_config["unroll_step"] + + bna = bnb = bn + num_thread1 = num_thread2 = num_thread + if data.dtype == 'float16': + bnb *= 2 + last_work = util.get_const_int(C.shape[1]) + if last_work % (bnb * num_thread2) != 0: + num_thread1 = num_thread * 2 + num_thread2 = num_thread / 2 + + # schedule padding + if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + data_pad = data + s[data_pad].compute_inline() + + ##### SCHEDULE A ##### + if util.get_const_int(kernel.shape[2]) == 1 and util.get_const_int(kernel.shape[3]) == 1: + s[A].compute_inline() + else: + y, x = s[A].op.axis + yo, xo, yi, xi = s[A].tile(y, x, bna, util.get_const_int(kernel.shape[3])) + s[A].vectorize(xi) + fuse_and_bind(s, A, [yo, xo]) + + # pack to vector form + packedA = pack_tensor(s, A, bna, [C]) + + # vectorize load + y, x = s[packedA].op.axis[:2] + tmp = s.cache_write(packedA, "local") + x, xt = s[packedA].split(x, bna) + _, _, _, xi = tile_and_bind(s, packedA, y, x, num_thread) + s[tmp].compute_at(s[packedA], xi) + s[tmp].vectorize(s[tmp].op.axis[1]) + s[tmp].unroll(s[tmp].op.axis[2]) + s[packedA].vectorize(s[packedA].op.axis[2]) + s[packedA].unroll(xt) + + ##### SCHEDULE B ##### + y, x = s[B].op.axis + yo, xo, yi, xi = s[B].tile(y, x, 1, 1 * bnb) + fuse_and_bind(s, B, [yo, xo]) + + # transpose and pack to vector form + B_transpose, B_tmp = transpose(s, B, [C]) + s[B_transpose].compute_inline() + packedB = pack_tensor(s, B_transpose, bnb, [B_tmp]) + + # vectorize load + s[packedB].vectorize(s[packedB].op.axis[2]) + y, x = s[packedB].op.axis[:2] + tile_and_bind(s, packedB, y, x, num_thread) + + ##### SCHEDULE C ##### + # vectorize and unroll dot + y, x = s[C].op.axis + y, x, yt, xt = s[C].tile(y, x, bna, bnb) + + k = s[C].op.reduce_axis[0] + s[C].reorder(k, yt, xt) + if unroll_step != 1: + k, k_unroll = s[C].split(k, unroll_step) + s[C].unroll(k_unroll) + s[C].unroll(yt) + s[C].vectorize(xt) + + tile_and_bind(s, C, y, x, num_thread1, num_thread2) + + ##### COPY TO OUTPUT ##### + if output.op in s.outputs: # no bias + output = output + else: # has bias + s[output].compute_inline() + output = s.outputs[0] + + n, co, h, w = s[output].op.axis + h, w, vh, vw = s[output].tile(h, w, 1, bnb) + s[output].unroll(vh) + if util.get_const_int(s[output].op.output(0).shape[3]) % bnb != 0: + pass + else: + s[output].vectorize(vw) + 
fuse_and_bind(s, output, [n, co, h, w]) + + #print(tvm.lower(s, [data, kernel], simple_mode=True)) diff --git a/topi/python/topi/mali/dense.py b/topi/python/topi/mali/dense.py new file mode 100644 index 000000000000..d3edeafed3b3 --- /dev/null +++ b/topi/python/topi/mali/dense.py @@ -0,0 +1,100 @@ +# pylint: disable=invalid-name,unused-variable +"""dense schedule on ARM Mali GPU""" + +from __future__ import absolute_import as _abs + +import tvm + +from .. import generic +from .. import util +from .. import tag + +@generic.schedule_dense.register(["mali"]) +def schedule_dense(outs): + """Schedule for dense operator. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of dense + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for dense. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + def _schedule(dense): + data = s[dense].op.input_tensors[0] + weight = s[dense].op.input_tensors[1] + + hidden = util.get_const_int(weight.shape[1]) + out = util.get_const_int(weight.shape[0]) + + # set tunable parameter + tune_config = getattr(tvm.target.current_target(), "tune_config", None) + if tune_config is None: + if hidden > 8192: + num_thread = 32 + unroll_step = 32 + else: + if out <= 1024: + num_thread = 32 + unroll_step = 16 + else: + num_thread = 256 + unroll_step = 32 + + if data.dtype == 'float16': + if hidden > 8192: + num_thread = 2 + unroll_step = 32 + else: + num_thread = 8 + unroll_step = 256 + else: + num_thread = tune_config['num_thread'] + unroll_step = tune_config['unroll_step'] + + def fuse_and_bind(s, tensor, axis=None, num_thread=None): + """ fuse all the axis and bind to GPU threads """ + axis = axis or s[tensor].op.axis + fused = s[tensor].fuse(*axis) + max_threads = tvm.target.current_target(allow_none=False).max_num_threads + bx, tx = s[tensor].split(fused, num_thread or max_threads) + s[tensor].bind(bx, tvm.thread_axis("blockIdx.x")) + s[tensor].bind(tx, tvm.thread_axis("threadIdx.x")) + return bx, tx + + output = outs[0] + bx, tx = fuse_and_bind(s, output, num_thread=num_thread) + + k = s[dense].op.reduce_axis[0] + k, k_unroll = s[dense].split(k, unroll_step) + s[dense].unroll(k_unroll) + + if dense.op not in s.outputs: + s[dense].compute_at(s[output], tx) + +# bias = s[outs[0]].op.input_tensors[1] +# print(tvm.lower(s, [data, weight, bias, outs[0]], simple_mode=True)) + + def traverse(OP): + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(OP.tag): + if OP not in s.outputs: + s[OP].compute_inline() + for tensor in OP.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + # schedule dense + elif OP.tag == 'dense': + dense = OP.output(0) + _schedule(dense) + else: + raise RuntimeError("Unsupported operator: %s" % OP.tag) + + traverse(outs[0].op) + return s diff --git a/topi/python/topi/mali/depthwise_conv2d.py b/topi/python/topi/mali/depthwise_conv2d.py new file mode 100644 index 000000000000..46ce7f747def --- /dev/null +++ b/topi/python/topi/mali/depthwise_conv2d.py @@ -0,0 +1,106 @@ +# pylint: disable=invalid-name,unused-variable,unused-argument +"""depthwise_conv2d schedule on ARM Mali GPU""" + +from __future__ import absolute_import as _abs +import tvm + +from .. import generic +from .. import util +from .. 
import tag + +@generic.schedule_depthwise_conv2d_nchw.register(["mali"]) +def schedule_depthwise_conv2d_nchw(outs): + """Schedule for depthwise_conv2d nchw forward. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of depthwise_conv2d + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for depthwise_conv2d nchw. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + def _schedule(pad_data, kernel, conv): + raw_data = s[pad_data].op.input_tensors[0] + + if conv.op not in s.outputs: # has bias or relu + output = outs[0] + else: # no bias or relu + output = conv + + def tile_and_bind3d(tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None): + """ tile and bind 3d """ + y_factor = y_factor or z_factor + x_factor = x_factor or y_factor + zo, zi = s[tensor].split(z, z_factor) + yo, yi = s[tensor].split(y, y_factor) + xo, xi = s[tensor].split(x, x_factor) + s[tensor].bind(zo, tvm.thread_axis("blockIdx.z")) + s[tensor].bind(zi, tvm.thread_axis("threadIdx.z")) + s[tensor].bind(yo, tvm.thread_axis("blockIdx.y")) + s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) + s[tensor].bind(xo, tvm.thread_axis("blockIdx.x")) + s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) + return zo, zi, yo, yi, xo, xi + + # set tunable parameters + VH = 1 + VW = 1 + num_thread = 4 + while util.get_const_int(conv.shape[3]) % (VW * 2) == 0 and VW * 2 <= 4: + VW = VW * 2 + while util.get_const_int(conv.shape[2]) % (VH * 2) == 0 and VH * 2 <= 2: + VH = VH * 2 + if raw_data.dtype == 'float16': + if util.get_const_int(conv.shape[3]) % (VW * 2) == 0: + VW *= 2 + num_thread *= 2 + else: + num_thread *= 2 + + # schedule padding + _, c, y, x = s[pad_data].op.axis + tile_and_bind3d(pad_data, c, y, x, num_thread, 1, 1) + + # schedule conv + di, dj = s[conv].op.reduce_axis + s[conv].unroll(di) + s[conv].unroll(dj) + + _, c, y, x = s[output].op.axis + y, x, yi, xi = s[output].tile(y, x, VH, VW) + s[output].unroll(yi) + s[output].vectorize(xi) + + _, _, _, _, _, ji = tile_and_bind3d(output, c, y, x, num_thread, 1, 1) + + if conv.op not in s.outputs: + _, c, y, x = s[conv].op.axis + y, x, yi, xi = s[conv].tile(y, x, VH, VW) + s[conv].unroll(yi) + s[conv].vectorize(xi) + s[conv].compute_at(s[output], ji) + + def traverse(op): + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + # schedule depthwise_conv2d + if op.tag == 'depthwise_conv2d_nchw': + pad_data = op.input_tensors[0] + kernel = op.input_tensors[1] + conv = op.output(0) + _schedule(pad_data, kernel, conv) + + traverse(outs[0].op) + return s From dd17d5d2e34a21f9601fedf53c85a013c018ae3c Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 16 Jan 2018 10:30:14 -0800 Subject: [PATCH 087/948] fix (#788) --- topi/tests/python/test_topi_reduce.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/topi/tests/python/test_topi_reduce.py b/topi/tests/python/test_topi_reduce.py index c8d95df255c6..1e40770f5b10 100644 --- a/topi/tests/python/test_topi_reduce.py +++ b/topi/tests/python/test_topi_reduce.py @@ -88,8 +88,8 @@ def check_device(device): np.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1E-3, 1E-3) elif type == "argmin": np.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 
1E-3) - - np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3) + else: + np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3) for device in ["cuda", "opencl", "metal", "llvm", "rocm"]: check_device(device) From c44526daa24f778c8110a64b7b6f1db15de237b8 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 17 Jan 2018 03:43:43 +0800 Subject: [PATCH 088/948] fix mali topi for python3 (#789) --- topi/python/topi/mali/conv2d.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py index 940d64668ac4..ff67e0503f4f 100644 --- a/topi/python/topi/mali/conv2d.py +++ b/topi/python/topi/mali/conv2d.py @@ -256,7 +256,7 @@ def _schedule_direct_conv2d(s, op): num_thread = 32 if data.dtype == 'float16' and (util.get_const_int(conv.shape[1]) == 4 or output_height == 28): - num_thread /= 2 + num_thread //= 2 # schedule padding if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: @@ -319,17 +319,17 @@ def _decl_im2col(data, kernel, stride, padding, layout='NCHW', out_dtype='float3 ALIGN = 16 def upround(x, align): - return (x + align - 1) / align * align + return (x + align - 1) // align * align # A [CO, CI * KH * KW] reduce_len = upround(CI * KH * KW, ALIGN) A = tvm.compute((upround(CO, ALIGN), reduce_len), lambda i, j: - kernel[i][j / KW / KH][j / KW % KH][j % KW], name='A') + kernel[i][j // KW // KH][j // KW % KH][j % KW], name='A') # B [CI * KH * KW, N * OH * OW] B = tvm.compute((reduce_len, upround(N * OH * OW, ALIGN)), lambda i, j:\ tvm.select(tvm.all(i < CI * KH * KW, j < N * OH * OW), - data_pad[j / (OH*OW)][i / (KH*KW)][j / OW % OH*HSTR + i / KW % KH] + data_pad[j // (OH*OW)][i // (KH*KW)][j // OW % OH*HSTR + i // KW % KH] [j % OW*WSTR + i % KW], tvm.const(0, data_pad.dtype)), name='B') @@ -400,7 +400,7 @@ def _schedule_im2col_conv2d(s, op): last_work = util.get_const_int(C.shape[1]) if last_work % (bnb * num_thread2) != 0: num_thread1 = num_thread * 2 - num_thread2 = num_thread / 2 + num_thread2 = num_thread // 2 # schedule padding if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: From 519ea5a2dc8ca0747770bbd4e0a99376db56d031 Mon Sep 17 00:00:00 2001 From: masahi Date: Wed, 17 Jan 2018 04:48:00 +0900 Subject: [PATCH 089/948] [TOPI] Basic x86 schedules (#775) * add basic x86 schedules * parallelize & vectorize batchnorm + relu * fuse conv into bn + relu * move rc loop to outer * add nhwc conv * change weight layout to hwcf * conv + bn + relu fusion for nhwc conv * fix conv_nhwc schedule when no fusion * clean up default parallel schedules * simplify elemwise parallel * fix elemwise parallel for batch == 1 * update nhwc conv test * fix and add comment * fix lint * remove redundant import * remove default multithreading for some ops * remove default multithreading for global pool --- topi/python/topi/generic/nn.py | 18 +++++ topi/python/topi/nn/conv2d.py | 51 ++++++++++++++ topi/python/topi/testing/__init__.py | 1 + .../python/topi/testing/conv2d_nhwc_python.py | 67 ++++++++++++++++++ topi/python/topi/x86/__init__.py | 4 +- topi/python/topi/x86/conv2d.py | 68 ++++++++++++++++++- topi/python/topi/x86/injective.py | 35 ++++++++++ topi/python/topi/x86/nn.py | 56 +++++++++++++++ topi/tests/python/test_topi_conv2d_nhwc.py | 59 ++++++++++++++++ 9 files changed, 356 insertions(+), 3 deletions(-) create mode 100644 topi/python/topi/testing/conv2d_nhwc_python.py create mode 100644 topi/python/topi/x86/injective.py create mode 100644 
topi/python/topi/x86/nn.py create mode 100644 topi/tests/python/test_topi_conv2d_nhwc.py diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 6f641e99f7dd..5c580aad24c4 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -35,6 +35,24 @@ def schedule_conv2d_nchw(outs): return _default_schedule(outs, False) +@tvm.target.generic_func +def schedule_conv2d_nhwc(outs): + """Schedule for conv2d_nhwc + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of conv2d_nchw + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + + @tvm.target.generic_func def schedule_conv2d_transpose_nchw(outs): """Schedule for conv2d_transpose_nchw diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index 11866aedc101..3bd910e29974 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -337,6 +337,57 @@ def conv2d_hwcn(Input, Filter, stride, padding, out_dtype='float32'): name="Conv2dOutput", tag="conv2d_hwcn") return Output + +def conv2d_nhwc(Input, Filter, stride, padding, out_dtype='float32'): + """Convolution operator in NHWC layout. + + Parameters + ---------- + Input : tvm.Tensor + 4-D with shape [batch, in_height, in_width, in_channel] + + Filter : tvm.Tensor + 4-D with shape [filter_height, filter_width, in_channel, num_filter] + + stride : int or a list/tuple of two ints + Stride size, or [stride_height, stride_width] + + padding : int or str + Padding size, or ['VALID', 'SAME'] + + Returns + ------- + output : tvm.Tensor + 4-D with shape [batch, out_height, out_width, out_channel] + """ + assert isinstance(stride, int) or len(stride) == 2 + batch, in_height, in_width, in_channel = Input.shape + kernel_h, kernel_w, channel, num_filter = Filter.shape + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + padding, (kernel_h, kernel_w)) + # compute the output shape + out_channel = num_filter + out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) + out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) + pad_before = [0, pad_top, pad_left, 0] + pad_after = [0, pad_down, pad_right, 0] + PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput") + rc = tvm.reduce_axis((0, in_channel), name='rc') + ry = tvm.reduce_axis((0, kernel_h), name='ry') + rx = tvm.reduce_axis((0, kernel_w), name='rx') + Output = tvm.compute( + (batch, out_height, out_width, out_channel), + lambda nn, yy, xx, ff: tvm.sum( + PaddedInput[nn, yy * stride_h + ry, xx * stride_w + rx, rc].astype(out_dtype) * + Filter[ry, rx, rc, ff].astype(out_dtype), axis=[ry, rx, rc]), + name="Conv2dOutput", tag="conv2d_nhwc") + return Output + # map from schedule type to declaration function _SCH_TO_DECL_FUNC = { SpatialPack: _spatial_pack, diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py index 6a1b361e3097..2a20a1c4f622 100644 --- a/topi/python/topi/testing/__init__.py +++ b/topi/python/topi/testing/__init__.py @@ -6,6 +6,7 @@ from .conv2d_hwcn_python import conv2d_hwcn_python from .conv2d_nchw_python import conv2d_nchw_python +from .conv2d_nhwc_python import conv2d_nhwc_python from .conv2d_transpose_nchw_python import conv2d_transpose_nchw_python from .depthwise_conv2d_python import 
depthwise_conv2d_python_nchw, depthwise_conv2d_python_nhwc from .dilate_python import dilate_python diff --git a/topi/python/topi/testing/conv2d_nhwc_python.py b/topi/python/topi/testing/conv2d_nhwc_python.py new file mode 100644 index 000000000000..880088a6f89f --- /dev/null +++ b/topi/python/topi/testing/conv2d_nhwc_python.py @@ -0,0 +1,67 @@ +# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals +"""Convolution in python""" +import numpy as np +import scipy.signal + + +def conv2d_nhwc_python(a_np, w_np, stride, padding): + """Convolution operator in NHWC layout. + + Parameters + ---------- + a_np : numpy.ndarray + 4-D with shape [batch, in_height, in_width, in_channel] + + w_np : numpy.ndarray + 4-D with shape [num_filter, filter_height, filter_width, in_channel] + + stride : int or a list/tuple of two ints + Stride size, or [stride_height, stride_width] + + padding : int or str + Padding size, or ['VALID', 'SAME'] + + Returns + ------- + b_np : np.ndarray + 4-D with shape [out_height, out_width, out_channel, batch] + """ + batch, in_height, in_width, in_channel = a_np.shape + kernel_h, kernel_w, _, num_filter = w_np.shape + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + if isinstance(padding, int): + pad_h = pad_w = padding * 2 + elif padding == 'VALID': + pad_h = 0 + pad_w = 0 + else: # 'SAME' + pad_h = kernel_h - 1 + pad_w = kernel_w - 1 + pad_top = int(np.ceil(float(pad_h) / 2)) + pad_bottom = pad_h - pad_top + pad_left = int(np.ceil(float(pad_w) / 2)) + pad_right = pad_w - pad_left + # compute the output shape + out_channel = num_filter + out_height = (in_height - kernel_h + pad_h) // stride_h + 1 + out_width = (in_width - kernel_w + pad_w) // stride_w + 1 + # change the layout from NHWC to NCHW + at = a_np.transpose((0, 3, 1, 2)) + wt = w_np.transpose((3, 2, 0, 1)) + bt = np.zeros((batch, out_channel, out_height, out_width)) + # computation + for n in range(batch): + for f in range(out_channel): + for c in range(in_channel): + if pad_h > 0: + apad = np.zeros((in_height + pad_h, in_width + pad_w)) + apad[pad_top:-pad_bottom, pad_left:-pad_right] = at[n, c] + else: + apad = at[n, c] + out = scipy.signal.convolve2d( + apad, np.rot90(np.rot90(wt[f, c])), mode='valid') + bt[n, f] += out[::stride, ::stride] + return bt.transpose((0, 2, 3, 1)) diff --git a/topi/python/topi/x86/__init__.py b/topi/python/topi/x86/__init__.py index 6ab37b8c03ac..ef227d035fce 100644 --- a/topi/python/topi/x86/__init__.py +++ b/topi/python/topi/x86/__init__.py @@ -2,6 +2,8 @@ """x86 specific declaration and schedules.""" from __future__ import absolute_import as _abs -from .conv2d import schedule_conv2d +from .conv2d import schedule_conv2d, schedule_conv2d_nhwc from .binarize_pack import schedule_binarize_pack from .binary_dense import schedule_binary_dense +from .nn import * +from .injective import * diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 0c91f8c25c88..cb3571d6a91b 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -15,6 +15,12 @@ def traverse(op): if tag.is_broadcast(op.tag): if op not in s.outputs: s[op].compute_inline() + else: # inject custom schedule + if len(op.axis) == 4: # schedule bias + bn + relu + n, c, h, w = op.axis + fused = s[op].fuse(n, c) + s[op].parallel(fused) + s[op].vectorize(w) for tensor in op.input_tensors: if tensor.op.input_tensors: traverse(tensor.op) @@ -28,10 +34,68 @@ def traverse(op): data_pad = data data = 
data_pad.op.input_tensors[0] + n_pad, c_pad, h_pad, w_pad = data_pad.op.axis + pad_fused = s[data_pad].fuse(n_pad, c_pad) + s[data_pad].parallel(pad_fused) C = conv n, c, h, w = C.op.axis - s[C].parallel(c) - s[C].pragma(n, "parallel_launch_point") + rc, ry, rx = C.op.reduce_axis + fused = s[C].fuse(n, c) + s[C].parallel(fused) + wo, wi = s[C].split(w, factor=16) + s[C].reorder(fused, rc, h, wo, ry, rx, wi) # move rc to outer loop + s[C].unroll(rx) + s[C].unroll(ry) + s[C].vectorize(wi) traverse(outs[0].op) return s + + +@generic.schedule_conv2d_nhwc.register(["cpu"]) +def schedule_conv2d_nhwc(outs): + """Create schedule for tensors""" + s = tvm.create_schedule([x.op for x in outs]) + output_op = outs[0].op + + def traverse(op): + """Traverse operators from computation graph""" + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + else: # inject custom schedule + if len(op.axis) == 4: # schedule bias + bn + relu + n, h, w, c = op.axis + fused = s[op].fuse(n, h, w) + s[op].parallel(fused) + s[op].vectorize(c) + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + if 'conv2d_nhwc' in op.tag: + conv = op.output(0) + kernel = op.input_tensors[1] + data = op.input_tensors[0] + data_pad = None + if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + data_pad = data + data = data_pad.op.input_tensors[0] + + n_pad, h_pad, w_pad, c_pad = data_pad.op.axis + pad_fused = s[data_pad].fuse(n_pad, h_pad) + s[data_pad].parallel(pad_fused) + C = conv + n, h, w, c = C.op.axis + ry, rx, rc = C.op.reduce_axis + n_out, h_out, w_out, c_out = output_op.axis + s[C].vectorize(c) + if op != output_op: # fuse bias + bn + relu into conv + s[C].compute_at(s[output_op], c_out) + else: + fused = s[C].fuse(n, h, w) + s[C].parallel(fused) + + traverse(output_op) + return s diff --git a/topi/python/topi/x86/injective.py b/topi/python/topi/x86/injective.py new file mode 100644 index 000000000000..0970b76142ae --- /dev/null +++ b/topi/python/topi/x86/injective.py @@ -0,0 +1,35 @@ +# pylint: disable=invalid-name +"""x86 declaration and schedules.""" +from __future__ import absolute_import as _abs +import tvm +from .. import generic + +@generic.schedule_injective.register(["cpu"]) +def schedule_injective(outs): + """X86 schedule for injective op. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of injective in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + x = outs[0] + s = tvm.create_schedule([x.op for x in outs]) + tvm.schedule.AutoInlineInjective(s) + if len(s[x].op.axis) == 4: + n, c, _, _ = s[x].op.axis + fused = s[x].fuse(n, c) # for nhwc layout, fuse n and h + s[x].parallel(fused) + else: + s[x].parallel(s[x].op.axis[0]) + return s + +schedule_elemwise = schedule_injective +schedule_broadcast = schedule_injective diff --git a/topi/python/topi/x86/nn.py b/topi/python/topi/x86/nn.py new file mode 100644 index 000000000000..49aa382589d1 --- /dev/null +++ b/topi/python/topi/x86/nn.py @@ -0,0 +1,56 @@ +"""x86 nn operators""" +from __future__ import absolute_import as _abs +import tvm +from .. 
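The x86 schedules above all follow the same recipe: fuse the outer axes, parallelize over the fused axis, and vectorize the innermost axis. The bare pattern on a standalone elementwise op (sketch, using the same era tvm API as the surrounding files):

    import tvm

    n, c, h, w = 1, 64, 56, 56
    A = tvm.placeholder((n, c, h, w), name="A")
    B = tvm.compute((n, c, h, w), lambda i, j, k, l: A[i, j, k, l] + 1.0, name="B")
    s = tvm.create_schedule(B.op)
    i, j, k, l = s[B].op.axis
    fused = s[B].fuse(i, j)   # parallelize over batch x channel
    s[B].parallel(fused)
    s[B].vectorize(l)         # vectorize the innermost (width) axis
    f = tvm.build(s, [A, B], "llvm")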
import generic + +def _default_schedule(outs, auto_inline): + """Default schedule for x86.""" + x = outs[0] + s = tvm.create_schedule([x.op for x in outs]) + if auto_inline: + tvm.schedule.AutoInlineInjective(s) + s[x].fuse(s[x].op.axis) + return s + if len(s[x].op.axis) == 4: + n, c, _, _ = s[x].op.axis + fused = s[x].fuse(n, c) # for nhwc layout, fuse n and h + s[x].parallel(fused) + else: + s[x].parallel(s[x].op.axis[0]) + return s + + +@generic.schedule_softmax.register(["cpu"]) +def schedule_softmax(outs): + """Schedule for softmax + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of softmax + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + + +@generic.schedule_pool.register(["cpu"]) +def schedule_pool(outs): + """Schedule for pool + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of pool + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) diff --git a/topi/tests/python/test_topi_conv2d_nhwc.py b/topi/tests/python/test_topi_conv2d_nhwc.py new file mode 100644 index 000000000000..7fc5b841908f --- /dev/null +++ b/topi/tests/python/test_topi_conv2d_nhwc.py @@ -0,0 +1,59 @@ +"""Example code to do convolution.""" +import os +import numpy as np +import tvm +import topi +from tvm.contrib.pickle_memoize import memoize +from topi.util import get_const_tuple + + +def verify_conv2d_nhwc(batch, in_channel, in_size, num_filter, kernel, stride, padding): + in_height = in_width = in_size + + A = tvm.placeholder((batch, in_height, in_width, in_channel), name='A') + W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W') + B = topi.nn.conv2d_nhwc(A, W, stride, padding) + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + dtype = A.dtype + + @memoize("topi.tests.test_topi_conv2d_nhwc.verify_nhwc") + def get_ref_data(): + a_np = np.random.uniform(size=a_shape).astype(dtype) + w_np = np.random.uniform(size=w_shape).astype(dtype) + b_np = topi.testing.conv2d_nhwc_python(a_np, w_np, stride, padding) + return a_np, w_np, b_np + a_np, w_np, b_np = get_ref_data() + + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_conv2d_nhwc([B]) + ctx = tvm.context(device, 0) + a = tvm.nd.array(a_np, ctx) + w = tvm.nd.array(w_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + func = tvm.build(s, [A, W, B], device) + func(a, w, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + + for device in ['llvm']: + check_device(device) + + +def test_conv2d_nhwc(): + verify_conv2d_nhwc(1, 256, 32, 256, 3, 1, "SAME") + verify_conv2d_nhwc(4, 128, 16, 128, 5, 2, "SAME") + verify_conv2d_nhwc(4, 128, 16, 256, 5, 2, "SAME") + verify_conv2d_nhwc(1, 256, 32, 256, 3, 1, "VALID") + verify_conv2d_nhwc(1, 256, 32, 256, 3, 1, "VALID") + verify_conv2d_nhwc(4, 128, 16, 128, 5, 2, "VALID") + verify_conv2d_nhwc(4, 128, 16, 256, 5, 2, "VALID") + + +if __name__ == "__main__": + test_conv2d_nhwc() From 553a99f5621b6e667b51ef5f83969d159b62be15 Mon Sep 17 00:00:00 2001 From: solin319 Date: Fri, 19 Jan 2018 10:47:18 +0800 Subject: [PATCH 090/948] fix the description of create_shared (#793) The type 
of parameter options should be a str list. --- python/tvm/contrib/cc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/contrib/cc.py b/python/tvm/contrib/cc.py index b941140bfbf0..5764121c59ed 100644 --- a/python/tvm/contrib/cc.py +++ b/python/tvm/contrib/cc.py @@ -22,8 +22,8 @@ def create_shared(output, objects : list List of object files. - options : str - The additional options. + options : list + The list of additional options string. cc : str, optional The compile string. From fe33059e41ec7b721ffd3d8c3d00e4da3e87ad45 Mon Sep 17 00:00:00 2001 From: Jammy Zhou Date: Fri, 19 Jan 2018 17:40:05 +0000 Subject: [PATCH 091/948] Additional mali target support (#794) * Add Mali target support to tvm.target.create * Add Mali target support in codegen --- include/tvm/build_module.h | 3 +++ python/tvm/target.py | 3 +++ src/codegen/build_module.cc | 12 +++++++++++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h index 75062e819748..ae7ca3ca681d 100644 --- a/include/tvm/build_module.h +++ b/include/tvm/build_module.h @@ -73,6 +73,9 @@ EXPORT Target metal(); /*! \return A target for rasp */ EXPORT Target rasp(); +/*! \return A target for Mali */ +EXPORT Target mali(); + /*! \return A target for stackvm */ EXPORT Target stackvm(); diff --git a/python/tvm/target.py b/python/tvm/target.py index 3437e70e0a6d..4584c1aa4d23 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -81,6 +81,7 @@ class Target(object): - :any:`tvm.target.rasp` create raspberry pi target - :any:`tvm.target.cuda` create CUDA target - :any:`tvm.target.rocm` create ROCM target + - :any:`tvm.target.mali` create Mali target """ current = None @@ -306,6 +307,8 @@ def create(target_str): device_name = item.split("=")[1] if device_name == "rasp": return rasp(arr[1:]) + if device_name == "mali": + return mali(arr[1:]) return Target(arr[0], arr[1:]) diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index 2c419d43da08..df71a4a41bec 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -70,7 +70,8 @@ Target Target::create(const std::string& target_str) { auto result = device_name == "rasp" ? target::rasp() : - TargetFromName(target_name); + (device_name == "mali" ? target::mali() : + TargetFromName(target_name)); std::string item; while (ss >> item) { @@ -116,6 +117,15 @@ Target rasp() { return Target("llvm", kDLCPU, 512, 1, keys, options); } +Target mali() { + std::unordered_set keys({ "rocm", "gpu" }); + std::vector options({ + "-device=mali" + }); + return Target("opencl", kDLOpenCL, 256, 1, keys, options); +} + + Target stackvm() { std::unordered_set keys({ "stackvm", "cpu" }); std::vector options; From 867495b4e11be4a88ba797f30584dab495603168 Mon Sep 17 00:00:00 2001 From: masahi Date: Sat, 20 Jan 2018 03:31:51 +0900 Subject: [PATCH 092/948] simplify expr in get_const_tuple (#795) * fix upsampling output shape * simplify expr in get_const_tuple --- topi/python/topi/nn/upsampling.py | 5 +++-- topi/python/topi/util.py | 5 ++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/topi/python/topi/nn/upsampling.py b/topi/python/topi/nn/upsampling.py index e1234741e286..df77bbdb23c0 100644 --- a/topi/python/topi/nn/upsampling.py +++ b/topi/python/topi/nn/upsampling.py @@ -1,6 +1,7 @@ """TVM operator upsampling compute.""" from __future__ import absolute_import import tvm +from .. 
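With the changes above, a Mali target can be obtained either from the helper or from a plain target string; both resolve to an OpenCL target carrying -device=mali:

    import tvm

    target_a = tvm.target.mali()
    target_b = tvm.target.create("opencl -device=mali")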
import util def upsampling(data, scale): @@ -21,8 +22,8 @@ def upsampling(data, scale): 4-D with shape [batch, channel, in_height*scale, in_width*scale] """ batch, channel, height, width = data.shape - out_height = height * scale - out_width = width * scale + out_height = util.simplify(height * scale) + out_width = util.simplify(width * scale) return tvm.compute((batch, channel, out_height, out_width), \ lambda n, c, h, w: data[n, c, h/scale, w/scale]) diff --git a/topi/python/topi/util.py b/topi/python/topi/util.py index 246d283cd2ed..00c8b9d42e2a 100644 --- a/topi/python/topi/util.py +++ b/topi/python/topi/util.py @@ -59,9 +59,8 @@ def get_const_tuple(in_tuple): """ out_tuple = () for elem in in_tuple: - if not isinstance(elem, (tvm.expr.IntImm, tvm.expr.UIntImm)): - raise ValueError("Element of input tuple should be const int") - out_tuple = out_tuple + (elem.value, ) + value = get_const_int(elem) + out_tuple = out_tuple + (value, ) return out_tuple From bc5f1922ab96c215263dbd7ebd3fb00deadb6738 Mon Sep 17 00:00:00 2001 From: xqdan Date: Sat, 20 Jan 2018 11:57:39 +0800 Subject: [PATCH 093/948] Support dump ir for each pass (#693) (#791) * Support dump ir for each pass(#693) * expose DumpIR * fix comments * fix comments --- python/tvm/build_module.py | 89 ++++++++++++++++++++++- tests/python/unittest/test_pass_unroll.py | 19 ++++- 2 files changed, 105 insertions(+), 3 deletions(-) mode change 100644 => 100755 python/tvm/build_module.py diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py old mode 100644 new mode 100755 index fe6b01bb4d8c..8b52b11d86e7 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -5,18 +5,96 @@ """ from __future__ import absolute_import as _abs import warnings +import types from . import api from . import tensor from . import schedule from . import expr from . import ir_pass +from . import stmt as _stmt from . import container from . import module from . import codegen from . import ndarray from . import target as _target +class DumpIR(object): + """Dump IR for each pass. + With it, you can dump ir just like gcc/llvm. + + How to use: + ----------- + .. 
code-block:: python + + with tvm.build_config(dump_pass_ir=True) + run() + + """ + scope_level = 0 + def __init__(self): + self._pass_id = 0 + self._recover_list = [] + + def decorate(self, func): + ''' decorate the pass function''' + def dump(*args, **kwargs): + '''dump function''' + retv = func(*args, **kwargs) + if not isinstance(retv, (_stmt.Stmt, container.LoweredFunc, container.Array)): + return retv + pname = str(self._pass_id) + "_" + func.func_name + "_ir.cc" + with open(pname, "a") as f: + out = retv.body if isinstance(retv, container.LoweredFunc) else retv + f.write(str(out)) + if isinstance(retv, container.Array): + for x in retv: + out = x.body if isinstance(x, container.LoweredFunc) else x + f.write("---------%s\n%s\n-----------\n"%(x.name, str(out))) + self._pass_id += 1 + return retv + return dump + + def decorate_irpass(self): + '''decorate ir_pass and ScheduleOps''' + self._old_sgpass = schedule.ScheduleOps + schedule.ScheduleOps = self.decorate(schedule.ScheduleOps) + vset = vars(ir_pass) + k = v = 0 + def recover(): + vset[k] = v + for k, v in vset.items(): + self._recover_list.append(recover) + vset[k] = self.decorate(v) if isinstance(v, types.FunctionType) else v + + def decorate_custompass(self): + ''' decorate add_lower_pass pass in BuildConfig''' + cfg = BuildConfig.current + self._old_custom_pass = cfg.add_lower_pass + custom_pass = cfg.add_lower_pass if cfg.add_lower_pass else [] + pass_list = [(x[0], self.decorate(x[1])) for x in custom_pass] + BuildConfig.current.add_lower_pass = pass_list + + def enter(self): + '''only decorate outermost nest''' + if DumpIR.scope_level > 0: + return + self.decorate_irpass() + self.decorate_custompass() + self._pass_id = 0 + DumpIR.scope_level += 1 + + def exit(self): + '''recover outermost nest''' + if DumpIR.scope_level > 1: + return + # recover decorated functions + for f in self._recover_list: + f() + schedule.ScheduleOps = self._old_sgpass + BuildConfig.current.add_lower_pass = self._old_custom_pass + DumpIR.scope_level -= 1 + class BuildConfig(object): """Configuration scope to set a build config option. @@ -37,10 +115,12 @@ class BuildConfig(object): "data_alignment": -1, "restricted_func": True, "double_buffer_split_loop": 1, - "add_lower_pass": None + "add_lower_pass": None, + "dump_pass_ir": False } def __init__(self, **kwargs): self._old_scope = None + self._dump_ir = DumpIR() for k, _ in kwargs.items(): if k not in BuildConfig.defaults: raise ValueError( @@ -59,10 +139,14 @@ def __enter__(self): attr.update(self._attr) self._attr = attr BuildConfig.current = self + if self.dump_pass_ir is True: + self._dump_ir.enter() return self def __exit__(self, ptype, value, trace): assert self._old_scope + if self.dump_pass_ir is True: + self._dump_ir.exit() BuildConfig.current = self._old_scope @@ -115,6 +199,8 @@ def build_config(**kwargs): phase contains an integer on which optimization pass we apply the pass. Additional lowering passes to be applied before make_api. 
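The DumpIR hook above is driven by the dump_pass_ir flag of build_config; every decorated pass appends its output to an <idx>_<passname>_ir.cc file in the working directory. A minimal usage sketch, following the same pattern the updated unroll test uses:

    import tvm

    n = tvm.var("n")
    A = tvm.placeholder((n,), name="A")
    B = tvm.compute((n,), lambda i: A[i] + 1.0, name="B")
    s = tvm.create_schedule(B.op)

    # Writes one idx_passname_ir.cc file per lowering pass.
    with tvm.build_config(dump_pass_ir=True):
        tvm.lower(s, [A, B], name="add_one")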
+ dump_pass_ir: dump ir of each pass into file idx_passname_ir.cc, default=False + Returns ------- config: BuildConfig @@ -247,7 +333,6 @@ def lower(sch, return stmt return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func) - def build(sch, args=None, target=None, diff --git a/tests/python/unittest/test_pass_unroll.py b/tests/python/unittest/test_pass_unroll.py index 9e52a455e7da..b158113e6216 100644 --- a/tests/python/unittest/test_pass_unroll.py +++ b/tests/python/unittest/test_pass_unroll.py @@ -1,4 +1,5 @@ import tvm +import os def test_unroll_loop(): dtype = 'int64' @@ -24,4 +25,20 @@ def test_unroll_loop(): if __name__ == "__main__": - test_unroll_loop() + with tvm.build_config(dump_pass_ir=True): + test_unroll_loop() + + def end_with(*suffix): + ends = suffix + def run(s): + f = map(s.endswith, ends) + if True in f: return s + return run + + file_list = os.listdir('./') + cc_file = end_with('.cc') + cc_file = filter(cc_file, file_list) + assert len(cc_file) == 3 + for i in cc_file: + os.remove(i) + From 700949676ef3cdcc7b830da8b27274e90a4564d5 Mon Sep 17 00:00:00 2001 From: Zhixun Tan Date: Sat, 20 Jan 2018 16:51:34 -0500 Subject: [PATCH 094/948] [WIP] WebGL Backend (#672) Basic WebGL Backend --- CMakeLists.txt | 18 +- Jenkinsfile | 2 + Makefile | 17 +- include/tvm/runtime/c_runtime_api.h | 4 +- include/tvm/runtime/device_api.h | 11 +- include/tvm/schedule.h | 5 + python/tvm/__init__.py | 2 +- python/tvm/_ffi/runtime_ctypes.py | 2 + python/tvm/contrib/rpc.py | 4 + python/tvm/ndarray.py | 15 + python/tvm/schedule.py | 7 + python/tvm/target.py | 4 +- src/api/api_lang.cc | 5 + src/codegen/build_opengl.cc | 35 ++ src/codegen/codegen_c.h | 2 +- src/codegen/codegen_opengl.cc | 264 +++++++++ src/codegen/codegen_opengl.h | 48 ++ src/codegen/verilog/vpi_device_api.cc | 5 +- src/runtime/c_runtime_api.cc | 6 +- src/runtime/cpu_device_api.cc | 9 +- src/runtime/cuda/cuda_device_api.cc | 7 +- src/runtime/metal/metal_common.h | 5 +- src/runtime/metal/metal_device_api.mm | 4 +- src/runtime/module.cc | 2 + src/runtime/opencl/opencl_common.h | 5 +- src/runtime/opencl/opencl_device_api.cc | 2 +- src/runtime/opengl/opengl_common.h | 495 ++++++++++++++++ src/runtime/opengl/opengl_device_api.cc | 556 ++++++++++++++++++ src/runtime/opengl/opengl_module.cc | 284 +++++++++ src/runtime/opengl/opengl_module.h | 148 +++++ src/runtime/rocm/rocm_device_api.cc | 7 +- src/runtime/rpc/rpc_device_api.cc | 7 +- src/runtime/rpc/rpc_session.cc | 6 +- src/runtime/workspace_pool.cc | 25 +- src/schedule/schedule_lang.cc | 39 ++ tests/ci_build/Dockerfile.cpu | 3 + tests/ci_build/Dockerfile.gpu | 3 + .../ci_build/install/ubuntu_install_opengl.sh | 4 + tests/python/unittest/test_runtime_ndarray.py | 6 +- tests/scripts/task_python_integration.sh | 2 + tests/webgl/README.md | 7 + tests/webgl/test_local_gemm.py | 41 ++ tests/webgl/test_local_save_load.py | 35 ++ tests/webgl/test_remote_save_load.py | 78 +++ web/example_rpc.html | 4 + web/tvm_runtime.js | 6 +- web/web_runtime.cc | 2 + 47 files changed, 2203 insertions(+), 45 deletions(-) create mode 100644 src/codegen/build_opengl.cc create mode 100644 src/codegen/codegen_opengl.cc create mode 100644 src/codegen/codegen_opengl.h create mode 100644 src/runtime/opengl/opengl_common.h create mode 100644 src/runtime/opengl/opengl_device_api.cc create mode 100644 src/runtime/opengl/opengl_module.cc create mode 100644 src/runtime/opengl/opengl_module.h create mode 100644 tests/ci_build/install/ubuntu_install_opengl.sh create mode 100644 tests/webgl/README.md create mode 
100644 tests/webgl/test_local_gemm.py create mode 100644 tests/webgl/test_local_save_load.py create mode 100644 tests/webgl/test_remote_save_load.py diff --git a/CMakeLists.txt b/CMakeLists.txt index f42705ae7fda..fd381b9a12e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,7 @@ endif() tvm_option(USE_CUDA "Build with CUDA" OFF) tvm_option(USE_OPENCL "Build with OpenCL" OFF) +tvm_option(USE_OPENGL "Build with OpenGL" OFF) tvm_option(USE_METAL "Build with Metal" OFF) tvm_option(USE_RPC "Build with RPC" ON) tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON) @@ -61,8 +62,8 @@ if(MSVC) else(MSVC) include(CheckCXXCompilerFlag) check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11) - set(CMAKE_C_FLAGS "-O3 -Wall -std=c++11 -fPIC") - set(CMAKE_CXX_FLAGS ${CMAKE_C_FLAGS}) + set(CMAKE_C_FLAGS "-O3 -Wall -fPIC") + set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -std=c++11") endif(MSVC) # add source group @@ -87,6 +88,7 @@ file(GLOB RUNTIME_SRCS src/runtime/*.cc) file(GLOB COMPILER_LLVM_SRCS src/codegen/llvm/*.cc) file(GLOB RUNTIME_CUDA_SRCS src/runtime/cuda/*.cc) file(GLOB RUNTIME_OPENCL_SRCS src/runtime/opencl/*.cc) +file(GLOB RUNTIME_OPENGL_SRCS src/runtime/opengl/*.cc) file(GLOB RUNTIME_METAL_SRCS src/runtime/metal/*.mm) file(GLOB RUNTIME_RPC_SRCS src/runtime/rpc/*.cc) file(GLOB RUNTIME_GRAPH_SRCS src/runtime/graph/*.cc) @@ -135,6 +137,18 @@ else(USE_OPENCL) add_definitions(-DTVM_OPENCL_RUNTIME=0) endif(USE_OPENCL) +if(USE_OPENGL) + find_package(OpenGL QUIET REQUIRED) + find_package(glfw3 QUIET REQUIRED) + message(STATUS "Build with OpenGL support") + include_directories(${OPENGL_INCLUDE_DIRS}) + list(APPEND TVM_RUNTIME_LINKER_LIBS ${OpenGL_LIBRARIES} glfw) + list(APPEND RUNTIME_SRCS ${RUNTIME_OPENGL_SRCS}) + add_definitions(-DTVM_OPENGL_RUNTIME=1) +else(USE_OPENGL) + add_definitions(-DTVM_OPENGL_RUNTIME=0) +endif(USE_OPENGL) + if(USE_METAL) find_package(OpenCL QUIET REQUIRED) message(STATUS "Build with Metal support") diff --git a/Jenkinsfile b/Jenkinsfile index 793bf11f0d4a..4fc2285f507c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -88,6 +88,7 @@ stage('Build') { echo USE_CUDNN=1 >> config.mk echo USE_CUDA=1 >> config.mk echo USE_OPENCL=1 >> config.mk + echo USE_OPENGL=1 >> config.mk echo LLVM_CONFIG=llvm-config-4.0 >> config.mk echo USE_RPC=1 >> config.mk echo USE_GRAPH_RUNTIME=1 >> config.mk @@ -120,6 +121,7 @@ stage('Build') { echo USE_CUDA=0 >> config.mk echo USE_OPENCL=0 >> config.mk echo USE_RPC=0 >> config.mk + echo USE_OPENGL=1 >> config.mk echo LLVM_CONFIG=llvm-config-4.0 >> config.mk """ make('cpu', '-j2') diff --git a/Makefile b/Makefile index fef8b2a08051..453415de6634 100644 --- a/Makefile +++ b/Makefile @@ -32,8 +32,8 @@ OBJCFLAGS = -fno-objc-arc EMCC_FLAGS= -std=c++11 -DDMLC_LOG_STACK_TRACE=0\ -Oz -s RESERVED_FUNCTION_POINTERS=2 -s MAIN_MODULE=1 -s NO_EXIT_RUNTIME=1\ -s EXTRA_EXPORTED_RUNTIME_METHODS="['cwrap','getValue','setValue','addFunction']"\ + -s USE_GLFW=3 -s USE_WEBGL2=1 -lglfw\ $(INCLUDE_FLAGS) - # llvm configuration ifdef LLVM_CONFIG LLVM_VERSION=$(shell $(LLVM_CONFIG) --version| cut -b 1,3) @@ -54,6 +54,7 @@ METAL_SRC = $(wildcard src/runtime/metal/*.mm) CUDA_SRC = $(wildcard src/runtime/cuda/*.cc) ROCM_SRC = $(wildcard src/runtime/rocm/*.cc) OPENCL_SRC = $(wildcard src/runtime/opencl/*.cc) +OPENGL_SRC = $(wildcard src/runtime/opengl/*.cc) RPC_SRC = $(wildcard src/runtime/rpc/*.cc) GRAPH_SRC = $(wildcard src/runtime/graph/*.cc) RUNTIME_SRC = $(wildcard src/runtime/*.cc) @@ -65,6 +66,7 @@ METAL_OBJ = $(patsubst src/%.mm, build/%.o, $(METAL_SRC)) 
CUDA_OBJ = $(patsubst src/%.cc, build/%.o, $(CUDA_SRC)) ROCM_OBJ = $(patsubst src/%.cc, build/%.o, $(ROCM_SRC)) OPENCL_OBJ = $(patsubst src/%.cc, build/%.o, $(OPENCL_SRC)) +OPENGL_OBJ = $(patsubst src/%.cc, build/%.o, $(OPENGL_SRC)) RPC_OBJ = $(patsubst src/%.cc, build/%.o, $(RPC_SRC)) GRAPH_OBJ = $(patsubst src/%.cc, build/%.o, $(GRAPH_SRC)) CC_OBJ = $(patsubst src/%.cc, build/%.o, $(CC_SRC)) $(LLVM_OBJ) @@ -119,6 +121,19 @@ else CFLAGS += -DTVM_OPENCL_RUNTIME=0 endif +ifeq ($(USE_OPENGL), 1) + CFLAGS += -DTVM_OPENGL_RUNTIME=1 + EMCC_FLAGS += -DTVM_OPENGL_RUNTIME=1 + ifeq ($(UNAME_S), Darwin) + FRAMEWORKS += -framework OpenGL + else + LDFLAGS += -lGL -lglfw + endif + RUNTIME_DEP += $(OPENGL_OBJ) +else + CFLAGS += -DTVM_OPENGL_RUNTIME=0 +endif + ifeq ($(USE_METAL), 1) CFLAGS += -DTVM_METAL_RUNTIME=1 LDFLAGS += -lobjc diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index 91175f671a56..6d680330a659 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -55,9 +55,11 @@ typedef int64_t tvm_index_t; /*! \brief Extension device types in TVM */ typedef enum { + kOpenGL = 11, + // Extension DRAM type, used for quickly test extension device // The device api can differ depending on the xpu driver registered. - kExtDev = 12 + kExtDev = 12, // AddExtraTVMType which is not in DLPack here } TVMDeviceExtType; diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index 318a5363726f..9ba08fb86825 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -55,11 +55,16 @@ class DeviceAPI { /*! * \brief Allocate a data space on device. * \param ctx The device context to perform operation. - * \param size The size of the memory + * \param nbytes The number of bytes in memory. * \param alignment The alignment of the memory. - * \return The allocated device pointer + * \param type_hint The type of elements. Only needed by certain backends such + * as OpenGL, as nbytes & alignment are sufficient for most backends. + * \return The allocated device pointer. */ - virtual void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) = 0; + virtual void* AllocDataSpace(TVMContext ctx, + size_t nbytes, + size_t alignment, + TVMType type_hint) = 0; /*! * \brief Free a data space on device. * \param ctx The device context to perform operation. diff --git a/include/tvm/schedule.h b/include/tvm/schedule.h index 3efc31774d40..2f94aedccf3d 100644 --- a/include/tvm/schedule.h +++ b/include/tvm/schedule.h @@ -213,6 +213,11 @@ class Stage : public NodeRef { * \return reference to self. */ Stage& double_buffer(); // NOLINT(*) + /*! + * \brief Schedule for OpenGL fragment shader. + * \return reference to self. + */ + Stage& opengl(); // NOLINT(*) /*! * \brief whether the stage has been scheduled. * \return whether the stage has been scheduled. diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index e23eed7168dc..fe66271dce5e 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -17,7 +17,7 @@ from . import target from . 
import ndarray as nd -from .ndarray import context, cpu, gpu, opencl, cl, metal, mtl, vpi, rocm, ext_dev +from .ndarray import context, cpu, gpu, opencl, cl, metal, mtl, vpi, rocm, opengl, ext_dev from ._ffi.runtime_ctypes import TypeCode from ._ffi.function import Function diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py index cfadd18188f5..e2641fbc7e46 100644 --- a/python/tvm/_ffi/runtime_ctypes.py +++ b/python/tvm/_ffi/runtime_ctypes.py @@ -97,6 +97,7 @@ class TVMContext(ctypes.Structure): 8 : 'metal', 9 : 'vpi', 10: 'rocm', + 11: 'opengl', 12: 'ext_dev', } STR2MASK = { @@ -111,6 +112,7 @@ class TVMContext(ctypes.Structure): 'metal': 8, 'vpi': 9, 'rocm': 10, + 'opengl': 11, 'ext_dev': 12, } def __init__(self, device_type, device_id): diff --git a/python/tvm/contrib/rpc.py b/python/tvm/contrib/rpc.py index 3d14f1eb2b6e..3448c4c554d1 100644 --- a/python/tvm/contrib/rpc.py +++ b/python/tvm/contrib/rpc.py @@ -285,6 +285,10 @@ def metal(self, dev_id=0): """Construct remote Metal device.""" return self.context(8, dev_id) + def opengl(self, dev_id=0): + """Construct remote OpenGL device.""" + return self.context(11, dev_id) + def ext_dev(self, dev_id=0): """Construct remote extension device.""" return self.context(12, dev_id) diff --git a/python/tvm/ndarray.py b/python/tvm/ndarray.py index 1556c4912a35..0521a69c5f80 100644 --- a/python/tvm/ndarray.py +++ b/python/tvm/ndarray.py @@ -120,6 +120,21 @@ def vpi(dev_id=0): """ return TVMContext(9, dev_id) +def opengl(dev_id=0): + """Construct a OpenGL device + + Parameters + ---------- + dev_id : int, optional + The integer device id + + Returns + ------- + ctx : TVMContext + The created context + """ + return TVMContext(11, dev_id) + def ext_dev(dev_id=0): """Construct a extension device diff --git a/python/tvm/schedule.py b/python/tvm/schedule.py index 6abe4aae2f6f..0fc6692d950e 100644 --- a/python/tvm/schedule.py +++ b/python/tvm/schedule.py @@ -611,4 +611,11 @@ def double_buffer(self): """ _api_internal._StageDoubleBuffer(self) + def opengl(self): + """The special OpenGL schedule + + Maps each output element to a pixel. + """ + _api_internal._StageOpenGL(self) + _init_api("tvm.schedule") diff --git a/python/tvm/target.py b/python/tvm/target.py index 4584c1aa4d23..94cd5457e44f 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -67,7 +67,7 @@ class Target(object): Parameters ---------- - target_name : {"llvm", "cuda", "opencl", "metal", "rocm", "stackvm", "ext_dev"} + target_name : {"llvm", "cuda", "opencl", "metal", "rocm", "stackvm", "opengl", "ext_dev"} The major target name. 
options : list of str, optional @@ -119,6 +119,8 @@ def __init__(self, elif target_name in ("metal",): self.keys += ("gpu",) self.max_num_threads = 256 + elif target_name in ("opengl",): + self.keys += ("opengl",) elif target_name in ("stackvm", "ext_dev"): # Do not now class for stacvm or ext_dev pass diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc index 94075b6ec059..37a21cedf3db 100644 --- a/src/api/api_lang.cc +++ b/src/api/api_lang.cc @@ -399,6 +399,11 @@ TVM_REGISTER_API("_StageDoubleBuffer") args[0].operator Stage().double_buffer(); }); +TVM_REGISTER_API("_StageOpenGL") + .set_body([](TVMArgs args, TVMRetValue *ret) { + args[0].operator Stage().opengl(); + }); + TVM_REGISTER_API("_ScheduleNormalize") .set_body([](TVMArgs args, TVMRetValue* ret) { *ret = args[0].operator Schedule() diff --git a/src/codegen/build_opengl.cc b/src/codegen/build_opengl.cc new file mode 100644 index 000000000000..5e13676e8111 --- /dev/null +++ b/src/codegen/build_opengl.cc @@ -0,0 +1,35 @@ +/*! + * Copyright (c) 2017 by Contributors + * Build opengl modules from source. + * \file build_opengl.cc + */ +#include +#include "./codegen_opengl.h" +#include "./build_common.h" + +namespace tvm { +namespace codegen { + +runtime::Module BuildOpenGL(Array funcs) { + bool output_ssa = false; + CodeGenOpenGL cg; + cg.Init(output_ssa); + for (LoweredFunc f : funcs) { + cg.AddFunction(f); + } + auto shaders = cg.Finish(); +#if TVM_OPENGL_RUNTIME + return OpenGLModuleCreate(shaders, "gl", ExtractFuncInfo(funcs)); +#else + LOG(WARNING) << "OpenGL runtime not enabled, return a source module..."; + auto data = ToJSON(shaders); + return DeviceSourceModuleCreate(data, "gl", ExtractFuncInfo(funcs), "opengl"); +#endif // TVM_OPENGL_RUNTIME +} + +TVM_REGISTER_API("codegen.build_opengl") +.set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = BuildOpenGL(args[0]); +}); +} // namespace codegen +} // namespace tvm diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h index 895e94b8198e..ce882eda4a18 100644 --- a/src/codegen/codegen_c.h +++ b/src/codegen/codegen_c.h @@ -150,7 +150,7 @@ class CodeGenC : std::string GetStructRef( Type t, const Expr& buffer, const Expr& index, int kind); // print reference to a buffer as type t in index. - std::string GetBufferRef( + virtual std::string GetBufferRef( Type t, const Variable* buffer, Expr index); /*! * \brief If buffer is allocated as type t. diff --git a/src/codegen/codegen_opengl.cc b/src/codegen/codegen_opengl.cc new file mode 100644 index 000000000000..e645e7f6c701 --- /dev/null +++ b/src/codegen/codegen_opengl.cc @@ -0,0 +1,264 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file codegen_opengl.cc + * + * We are targeting OpenGL 3.3. The reason of not targeting a recent version + * of OpenGL is to have better compatibility of WebGL 2. + */ +#include +#include +#include +#include +#include "./codegen_opengl.h" +#include "../runtime/thread_storage_scope.h" + +namespace tvm { +namespace codegen { + +CodeGenOpenGL::CodeGenOpenGL() + : output_(nullptr), output_iter_var_(nullptr) {} + +void CodeGenOpenGL::InitFuncState(LoweredFunc f) { + CodeGenC::InitFuncState(f); + output_ = nullptr; + inputs_.clear(); + output_iter_var_ = nullptr; + thread_extent_var_ = ""; +} + +void CodeGenOpenGL::AddFunction(LoweredFunc f) { + // clear previous generated state. 
+ this->InitFuncState(f); + + this->decl_stream << "#version 300 es\n"; + this->decl_stream << "precision highp float;\n"; + + // skip the first underscore, so SSA variable starts from _1 + GetUniqueName("_"); + // add to alloc buffer type. + for (const auto& kv : f->handle_data_type) { + RegisterHandleType(kv.first.get(), kv.second.type()); + } + + // Allocate argument names. Store in `var_idmap_`. + for (auto arg : f->args) { + auto arg_name = GetUniqueName(arg.get()->name_hint); + var_idmap_[arg.get()] = arg_name; + } + + thread_extent_var_ = GetUniqueName("thread_extent"); + this->decl_stream << "uniform int " << thread_extent_var_ << ";\n"; + + this->stream << "void main() {\n"; + + int func_scope = this->BeginScope(); + this->PrintStmt(f->body); + this->EndScope(func_scope); + + this->PrintIndent(); + this->stream << "}\n\n"; + + // Declare arguments. + for (auto arg : f->args) { + if (this->inputs_.find(arg.get()) != this->inputs_.cend()) { + // Declare input texture. + // Format: + // - Float: "uniform sampler2D {name};" + // - Int: "uniform isampler2D {name};" + // - UInt: "uniform usampler2D {name};" + + auto arg_name = GetVarID(arg.get()); + + auto type_it = this->handle_data_type_.find(arg.get()); + CHECK(type_it != this->handle_data_type_.cend()) << "Cannot find type."; + auto type = Type2TVMType(type_it->second); + CHECK_EQ(type.lanes, 1) << "Vector type not supported."; + + switch (type.code) { + case kDLInt: + this->decl_stream << "uniform isampler2D " << arg_name << ";\n"; + break; + case kDLUInt: + this->decl_stream << "uniform usampler2D " << arg_name << ";\n"; + break; + case kDLFloat: + this->decl_stream << "uniform sampler2D " << arg_name << ";\n"; + break; + default: + LOG(FATAL) << "Unsupported type code."; + } + + } else if (this->output_ == arg.get()) { + // Declare output texture. + // Format: "out {type} {name};" + + auto arg_name = GetVarID(arg.get()); + + auto type_it = this->handle_data_type_.find(arg.get()); + CHECK(type_it != this->handle_data_type_.cend()) << "Cannot find type."; + auto type = type_it->second; + + this->decl_stream << "out "; + PrintType(type, this->decl_stream); + this->decl_stream << " " << arg_name << ";\n"; + + } else { + // Declare uniform value. 
+ // Format: "uniform {type} {name};" + + auto arg_name = GetVarID(arg.get()); + auto type = arg.get()->type; + + this->decl_stream << "uniform "; + PrintType(type, this->decl_stream); + this->decl_stream << " " << arg_name << ";\n"; + } + } + + std::vector arg_names; + std::vector arg_kinds; + for (auto arg : f->args) { + std::string name = GetVarID(arg.get()); + + runtime::OpenGLArgKind kind; + if (inputs_.find(arg.get()) != inputs_.cend()) { + kind = runtime::OpenGLArgKind::kInputTexture; + } else if (output_ == arg.get()) { + kind = runtime::OpenGLArgKind::kOutputTexture; + } else { + kind = runtime::OpenGLArgKind::kUniform; + } + + arg_names.push_back(name); + arg_kinds.push_back(kind); + } + + shaders_[f->name] = runtime::OpenGLShader( + this->decl_stream.str() + this->stream.str(), + std::move(arg_names), std::move(arg_kinds), + this->thread_extent_var_); +} + +std::unordered_map CodeGenOpenGL::Finish() { + return shaders_; +} + +void CodeGenOpenGL::BindThreadIndex(const IterVar& iv) { + CHECK_EQ(iv->thread_tag, "threadIdx.x") << "Must be threadIdx.x"; + CHECK(var_idmap_.find(iv->var.get()) == var_idmap_.end()) + << "Only support one thread iter var"; + CHECK(output_iter_var_ == nullptr) << "Only support one thread iter var"; + + var_idmap_[iv->var.get()] = iv->thread_tag; + output_iter_var_ = iv->var.get(); + + // Declare threadIdx local variable. + this->PrintIndent(); + this->stream << "ivec2 threadIdx = ivec2(gl_FragCoord.xy);\n"; + + // Return directly if threadIdx.x >= thread_extent. + this->PrintIndent(); + this->stream << "if (threadIdx.x >= " << thread_extent_var_ << ") {\n"; + this->PrintIndent(); + this->stream << " return;\n"; + this->PrintIndent(); + this->stream << "}\n"; +} + +// GLSL texture store is special. We can only store to one output texture, and +// we must store to the index that matches the current "thread index". +void CodeGenOpenGL::VisitStmt_(const Store* op) { + auto t = op->value.type(); + auto buffer = op->buffer_var.get(); + auto index = op->index; + + if (t.lanes() == 1) { + // Store to a scalar. + CHECK(inputs_.find(buffer) == inputs_.cend()) + << "Texture has been read from before. Must not store to it."; + if (output_ == nullptr) { + output_ = buffer; // Record that this texture is the output. + } else { + CHECK(output_ == buffer) << "GLSL can only write to 1 texture."; + } + + this->PrintIndent(); + this->stream << GetBufferRef(t, buffer, index) << " = " + << PrintExpr(op->value) << ";\n"; + + } else { + // Store to a vector. + LOG(FATAL) << "Vectorized store not implemented."; + } +} + +// texelFetch(tex, ivec2(idx, 0), 0).r +std::string CodeGenOpenGL::TexelFetch(const Variable* buffer, Expr index) { + std::ostringstream os; + os << "texelFetch(" << GetVarID(buffer) << ", ivec2("; + PrintExpr(index, os); + os << ", 0), 0).r"; + return os.str(); +} + +// Print a reference expression to a buffer. +// Format: texelFetch(buffer, index, 0).r +std::string CodeGenOpenGL::GetBufferRef( + Type t, const Variable* buffer, Expr index) { + CHECK_EQ(t.lanes(), 1) << "Vector type not supported."; + CHECK(HandleTypeMatch(buffer, t)) << "Type mismatch not supported."; + + if (buffer == this->output_) { + // This is the output texture. + CHECK_EQ(index.get(), output_iter_var_) + << "GLSL must access corresponding elem of output texture."; + return GetVarID(buffer); + } else { + // This is an input texture. 
+ this->inputs_.insert(buffer); + return TexelFetch(buffer, index); + } +} + +void CodeGenOpenGL::PrintType(Type t, std::ostream& os) { + switch (t.code()) { + case halideir_type_int: + CHECK_EQ(t.bits(), 32) << "Only support 32-bit int."; + os << "int"; + break; + case halideir_type_uint: + CHECK_EQ(t.bits(), 32) << "Only support 32-bit uint."; + os << "uint"; + break; + case halideir_type_float: + CHECK_EQ(t.bits(), 32) << "Only support 32-bit float."; + os << "float"; + break; + default: + LOG(FATAL) << "Unsupported type code."; + } +} + +// Codegen for immediate values + +void CodeGenOpenGL::VisitExpr_(const IntImm* op, std::ostream& os) { + CHECK_EQ(op->type, Int(32)) << "GLSL 3.0 only supports 32-bit ints."; + CodeGenC::VisitExpr_(op, os); +} + +void CodeGenOpenGL::VisitExpr_(const UIntImm* op, std::ostream& os) { + CHECK_EQ(op->type, UInt(32)) << "GLSL 3.0 only supports 32-bit uints."; + CodeGenC::VisitExpr_(op, os); +} + +void CodeGenOpenGL::VisitExpr_(const FloatImm* op, std::ostream& os) { + CHECK_EQ(op->type, Float(32)) << "GLSL 3.0 only supports 32-bit floats."; + CodeGenC::VisitExpr_(op, os); +} + +void CodeGenOpenGL::VisitExpr_(const StringImm*, std::ostream& os) { + LOG(FATAL) << "GLSL 3.0 doesn't support strings."; +} + +} // namespace codegen +} // namespace tvm diff --git a/src/codegen/codegen_opengl.h b/src/codegen/codegen_opengl.h new file mode 100644 index 000000000000..6ff1f7e9ac95 --- /dev/null +++ b/src/codegen/codegen_opengl.h @@ -0,0 +1,48 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file codegen_opengl.h + * \brief Generate OpenGL device code. + */ +#ifndef TVM_CODEGEN_CODEGEN_OPENGL_H_ +#define TVM_CODEGEN_CODEGEN_OPENGL_H_ + +#include +#include +#include +#include "./codegen_c.h" +#include "../runtime/opengl/opengl_module.h" + +namespace tvm { +namespace codegen { + +class CodeGenOpenGL final : public CodeGenC { + public: + CodeGenOpenGL(); + void AddFunction(LoweredFunc f); + std::unordered_map Finish(); + + void InitFuncState(LoweredFunc f) final; + void BindThreadIndex(const IterVar& iv) final; + void VisitStmt_(const Store* op) final; + std::string TexelFetch(const Variable* buffer, Expr index); + std::string GetBufferRef(Type t, const Variable* buffer, Expr index) final; + void PrintType(Type t, std::ostream& os) final; // NOLINT(*) + + // Codegen for immediate values + void VisitExpr_(const IntImm* op, std::ostream& os) final; // NOLINT(*) + void VisitExpr_(const UIntImm* op, std::ostream& os) final; // NOLINT(*) + void VisitExpr_(const FloatImm* op, std::ostream& os) final; // NOLINT(*) + void VisitExpr_(const StringImm* op, std::ostream& os) final; // NOLINT(*) + + private: + const Variable* output_{nullptr}; + std::unordered_set inputs_; + const Variable* output_iter_var_{nullptr}; + std::unordered_map shaders_; + std::string thread_extent_var_; +}; + +} // namespace codegen +} // namespace tvm + +#endif // TVM_CODEGEN_CODEGEN_OPENGL_H_ diff --git a/src/codegen/verilog/vpi_device_api.cc b/src/codegen/verilog/vpi_device_api.cc index 4e0e73eb427b..8efd65785547 100644 --- a/src/codegen/verilog/vpi_device_api.cc +++ b/src/codegen/verilog/vpi_device_api.cc @@ -49,7 +49,10 @@ class VPIDeviceAPI final : public runtime::DeviceAPI { *rv = 1; } } - void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final { + void* AllocDataSpace(TVMContext ctx, + size_t size, + size_t alignment, + TVMType type_hint) final { // always align to 32 bytes at least. 
CHECK_LE(alignment, runtime::kAllocAlignment); alignment = runtime::kAllocAlignment; diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index dd8f80bcd72f..0d0e36f239f2 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -31,6 +31,7 @@ inline std::string DeviceName(int type) { case kDLMetal: return "metal"; case kDLVPI: return "vpi"; case kDLROCM: return "rocm"; + case kOpenGL: return "opengl"; case kExtDev: return "ext_dev"; default: LOG(FATAL) << "unknown type =" << type; return "Unknown"; } @@ -95,7 +96,8 @@ DeviceAPI* DeviceAPI::Get(TVMContext ctx, bool allow_missing) { } void* DeviceAPI::AllocWorkspace(TVMContext ctx, size_t size) { - return AllocDataSpace(ctx, size, kTempAllocaAlignment); + TVMType type_hint{kDLUInt, 8, 1}; + return AllocDataSpace(ctx, size, kTempAllocaAlignment, type_hint); } void DeviceAPI::FreeWorkspace(TVMContext ctx, void* ptr) { @@ -365,7 +367,7 @@ int TVMArrayAlloc(const tvm_index_t* shape, size_t size = GetDataSize(arr); size_t alignment = GetDataAlignment(arr); arr->data = DeviceAPIManager::Get(arr->ctx)->AllocDataSpace( - arr->ctx, size, alignment); + arr->ctx, size, alignment, arr->dtype); *out = arr; API_END_HANDLE_ERROR(TVMArrayFree_(arr)); } diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc index 1b2009e98e7f..30c3bb7d52df 100644 --- a/src/runtime/cpu_device_api.cc +++ b/src/runtime/cpu_device_api.cc @@ -20,13 +20,16 @@ class CPUDeviceAPI final : public DeviceAPI { *rv = 1; } } - void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final { + void* AllocDataSpace(TVMContext ctx, + size_t nbytes, + size_t alignment, + TVMType type_hint) final { void* ptr; #if _MSC_VER - ptr = _aligned_malloc(size, alignment); + ptr = _aligned_malloc(nbytes, alignment); if (ptr == nullptr) throw std::bad_alloc(); #else - int ret = posix_memalign(&ptr, alignment, size); + int ret = posix_memalign(&ptr, alignment, nbytes); if (ret != 0) throw std::bad_alloc(); #endif return ptr; diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index fd2c54ffd58d..69b485a423c0 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -54,12 +54,15 @@ class CUDADeviceAPI final : public DeviceAPI { } *rv = value; } - void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final { + void* AllocDataSpace(TVMContext ctx, + size_t nbytes, + size_t alignment, + TVMType type_hint) final { CUDA_CALL(cudaSetDevice(ctx.device_id)); CHECK_EQ(256 % alignment, 0U) << "CUDA space is aligned at 256 bytes"; void *ret; - CUDA_CALL(cudaMalloc(&ret, size)); + CUDA_CALL(cudaMalloc(&ret, nbytes)); return ret; } diff --git a/src/runtime/metal/metal_common.h b/src/runtime/metal/metal_common.h index d7980e64c9a9..7c2975fe7ccc 100644 --- a/src/runtime/metal/metal_common.h +++ b/src/runtime/metal/metal_common.h @@ -63,7 +63,10 @@ class MetalWorkspace final : public DeviceAPI { // override device API void SetDevice(TVMContext ctx) final; void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; - void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final; + void* AllocDataSpace(TVMContext ctx, + size_t nbytes, + size_t alignment, + TVMType type_hint) final; void FreeDataSpace(TVMContext ctx, void* ptr) final; void CopyDataFromTo(const void* from, size_t from_size, diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm index f66d5b51e64a..82c52a23e036 100644 --- 
a/src/runtime/metal/metal_device_api.mm +++ b/src/runtime/metal/metal_device_api.mm @@ -123,12 +123,12 @@ int GetWarpSize(id dev) { } void* MetalWorkspace::AllocDataSpace( - TVMContext ctx, size_t size, size_t alignment) { + TVMContext ctx, size_t nbytes, size_t alignment, TVMType type_hint) { this->Init(); id dev = GetDevice(ctx); // allocate buffer in GPU only mode. id buf = [ - dev newBufferWithLength:size + dev newBufferWithLength:nbytes options:MTLResourceStorageModePrivate]; CHECK(buf != nil); return (__bridge void*)([buf retain]); diff --git a/src/runtime/module.cc b/src/runtime/module.cc index 43ad6e523494..3b95137f4fa4 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -115,6 +115,8 @@ bool RuntimeEnabled(const std::string& target) { f_name = "device_api.gpu"; } else if (target == "cl" || target == "opencl") { f_name = "device_api.opencl"; + } else if (target == "gl" || target == "opengl") { + f_name = "device_api.opengl"; } else if (target == "mtl" || target == "metal") { f_name = "device_api.metal"; } else if (target == "stackvm") { diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index e990aeba6a3e..29e205ced4d7 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -142,7 +142,10 @@ class OpenCLWorkspace final : public DeviceAPI { // override device API void SetDevice(TVMContext ctx) final; void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; - void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final; + void* AllocDataSpace(TVMContext ctx, + size_t size, + size_t alignment, + TVMType type_hint) final; void FreeDataSpace(TVMContext ctx, void* ptr) final; void CopyDataFromTo(const void* from, size_t from_offset, diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 23c897e04825..7518e72f9d9b 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -51,7 +51,7 @@ void OpenCLWorkspace::GetAttr( } void* OpenCLWorkspace::AllocDataSpace( - TVMContext ctx, size_t size, size_t alignment) { + TVMContext ctx, size_t size, size_t alignment, TVMType type_hint) { this->Init(); CHECK(context != nullptr) << "No OpenCL device"; cl_int err_code; diff --git a/src/runtime/opengl/opengl_common.h b/src/runtime/opengl/opengl_common.h new file mode 100644 index 000000000000..80b1d9f95c8e --- /dev/null +++ b/src/runtime/opengl/opengl_common.h @@ -0,0 +1,495 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file opengl_common.h + * \brief OpenGL common header + */ +#ifndef TVM_RUNTIME_OPENGL_OPENGL_COMMON_H_ +#define TVM_RUNTIME_OPENGL_OPENGL_COMMON_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace tvm { +namespace runtime { +namespace gl { + +// This file contains the following classes. +class GLFunctionPointers; +class OpenGLWorkspace; +class Texture; +class Program; + +inline GLFWglproc GetProcAddress(const char* procname) { + GLFWglproc proc = glfwGetProcAddress(procname); + CHECK(proc != nullptr) << "Cannot get function \"" << procname << "\""; + return proc; +} + +#define SetGLFunctionPointer(NAME) \ + NAME(decltype(NAME)(GetProcAddress("gl" #NAME))) + +/*! + * \brief The function pointers of all OpenGL APIs that are used. + * Must be constructed after creating an OpenGL context. 
+ */ +class GLFunctionPointers { + public: + GLFunctionPointers() + : SetGLFunctionPointer(ActiveTexture), + SetGLFunctionPointer(AttachShader), + SetGLFunctionPointer(BindBuffer), + SetGLFunctionPointer(BindFramebuffer), + SetGLFunctionPointer(BindTexture), + SetGLFunctionPointer(BindVertexArray), + SetGLFunctionPointer(BufferData), + SetGLFunctionPointer(CheckFramebufferStatus), + SetGLFunctionPointer(Clear), + SetGLFunctionPointer(CompileShader), + SetGLFunctionPointer(CreateProgram), + SetGLFunctionPointer(CreateShader), + SetGLFunctionPointer(DeleteFramebuffers), + SetGLFunctionPointer(DeleteProgram), + SetGLFunctionPointer(DeleteShader), + SetGLFunctionPointer(DeleteTextures), + SetGLFunctionPointer(DetachShader), + SetGLFunctionPointer(DrawArrays), + SetGLFunctionPointer(DrawBuffers), + SetGLFunctionPointer(EnableVertexAttribArray), + SetGLFunctionPointer(Finish), + SetGLFunctionPointer(FramebufferTexture2D), + SetGLFunctionPointer(GenBuffers), + SetGLFunctionPointer(GenFramebuffers), + SetGLFunctionPointer(GenTextures), + SetGLFunctionPointer(GenVertexArrays), + SetGLFunctionPointer(GetAttribLocation), + SetGLFunctionPointer(GetError), + SetGLFunctionPointer(GetIntegerv), + SetGLFunctionPointer(GetProgramInfoLog), + SetGLFunctionPointer(GetProgramiv), + SetGLFunctionPointer(GetShaderInfoLog), + SetGLFunctionPointer(GetShaderiv), + SetGLFunctionPointer(GetString), + SetGLFunctionPointer(GetUniformLocation), + SetGLFunctionPointer(LinkProgram), + SetGLFunctionPointer(ReadPixels), + SetGLFunctionPointer(ShaderSource), + SetGLFunctionPointer(TexImage2D), + SetGLFunctionPointer(TexParameteri), + SetGLFunctionPointer(TexSubImage2D), + SetGLFunctionPointer(Uniform1f), + SetGLFunctionPointer(Uniform1i), + SetGLFunctionPointer(UseProgram), + SetGLFunctionPointer(VertexAttribPointer), + SetGLFunctionPointer(Viewport) {} + + void (*ActiveTexture)(GLenum texture); + void (*AttachShader)(GLuint program, GLuint shader); + void (*BindBuffer)(GLenum target, GLuint buffer); + void (*BindFramebuffer)(GLenum target, GLuint framebuffer); + void (*BindTexture)(GLenum target, GLuint texture); + void (*BindVertexArray)(GLuint array); + void (*BufferData)(GLenum target, GLsizeiptr size, const GLvoid* data, + GLenum usage); + GLenum (*CheckFramebufferStatus)(GLenum target); + void (*Clear)(GLbitfield mask); + void (*CompileShader)(GLuint shader); + GLuint (*CreateProgram)(); + GLuint (*CreateShader)(GLenum shader_type); + void (*DeleteFramebuffers)(GLsizei n, const GLuint* framebuffers); + void (*DeleteProgram)(GLuint program); + void (*DeleteShader)(GLuint shader); + void (*DeleteTextures)(GLsizei n, const GLuint* textures); + void (*DetachShader)(GLuint program, GLuint shader); + void (*DrawArrays)(GLenum mode, GLint first, GLsizei count); + void (*DrawBuffers)(GLsizei n, const GLenum* bufs); + void (*EnableVertexAttribArray)(GLuint index); + void (*Finish)(); + void (*FramebufferTexture2D)(GLenum target, GLenum attachment, + GLenum textarget, GLuint texture, GLint level); + void (*GenBuffers)(GLsizei n, GLuint* buffers); + void (*GenFramebuffers)(GLsizei n, GLuint* ids); + void (*GenTextures)(GLsizei n, GLuint* textures); + void (*GenVertexArrays)(GLsizei n, GLuint* arrays); + GLint (*GetAttribLocation)(GLuint program, const GLchar* name); + GLenum (*GetError)(); + void (*GetIntegerv)(GLenum pname, GLint* data); + void (*GetProgramInfoLog)(GLuint program, GLsizei maxLength, GLsizei* length, + GLchar* info_log); + void (*GetProgramiv)(GLuint program, GLenum pname, GLint* params); + void 
(*GetShaderInfoLog)(GLuint shader, GLsizei max_length, GLsizei* length, + GLchar* info_log); + void (*GetShaderiv)(GLuint shader, GLenum pname, GLint* params); + const GLubyte *(*GetString)(GLenum name); + GLint (*GetUniformLocation)(GLuint program, const GLchar* name); + void (*LinkProgram)(GLuint program); + void (*ReadPixels)(GLint x, GLint y, GLsizei width, GLsizei height, + GLenum format, GLenum type, GLvoid* data); + void (*ShaderSource)(GLuint shader, GLsizei count, const GLchar** string, + const GLint* length); + void (*TexImage2D)(GLenum target, GLint level, GLint internal_format, + GLsizei width, GLsizei height, GLint border, GLenum format, + GLenum type, const GLvoid* data); + void (*TexParameteri)(GLenum target, GLenum pname, GLint param); + void (*TexSubImage2D)(GLenum target, GLint level, GLint xoffset, + GLint yoffset, GLsizei width, GLsizei height, + GLenum format, GLenum type, const GLvoid* data); + void (*Uniform1f)(GLint location, GLfloat v0); + void (*Uniform1i)(GLint location, GLint v0); + void (*UseProgram)(GLuint program); + void (*VertexAttribPointer)(GLuint index, GLint size, GLenum type, + GLboolean normalized, GLsizei stride, + const GLvoid* pointer); + void (*Viewport)(GLint x, GLint y, GLsizei width, GLsizei height); +}; + +/*! + * \brief Process global OpenGL workspace. + */ +class OpenGLWorkspace final : public DeviceAPI { + public: + ~OpenGLWorkspace() final; + + // override device API + void SetDevice(TVMContext ctx) final; + void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; + void* AllocDataSpace(TVMContext ctx, + size_t nbytes, + size_t alignment, + TVMType type_hint) final; + void FreeDataSpace(TVMContext ctx, void* ptr) final; + void CopyDataFromTo(const void* from, + size_t from_offset, + void* to, + size_t to_offset, + size_t size, + TVMContext ctx_from, + TVMContext ctx_to, + TVMStreamHandle stream) final; + void StreamSync(TVMContext ctx, TVMStreamHandle stream) final; + void* AllocWorkspace(TVMContext ctx, size_t size) final; + void FreeWorkspace(TVMContext ctx, void* data) final; + + /*! + * \brief Get the global OpenGL workspace. + * \return The global OpenGL workspace. + */ + static const std::shared_ptr& Global(); + + /*! + * \brief Create an OpenGL program that uses the given fragment shader. + * \param fragment_shader The fragment shader **source**. + * \return The OpenGL program. + */ + Program CreateProgram(const char* fragment_shader_src); + + /*! + * \brief Create an OpenGL texture that stores an array. + * \param type Element type. + * \param nbytes Number of bytes in the array. + * \return The OpenGL texture. + */ + Texture CreateTexture(TVMType type, size_t nbytes); + + /*! + * \brief Upload user data into a sub-region of an OpenGL texture. + * \param texture The texture to be written to. + * \param begin The index of the first element to be written to. + * \param nelems The number of elements to be written to. + * \param data The user data. + */ + void PutTextureData(Texture* texture, + GLint begin, + GLsizei nelems, + const GLvoid* data); + /*! + * \brief Download a sub-region of an OpenGL texture. + * \param texture The texture to download from. + * \param begin The index of first element to download from. + * \param nelems The number of elements to download from. + * \param data The user buffer. + */ + void GetTextureData(const Texture* texture, + GLint begin, + GLsizei nelems, + GLvoid* data); + + /*! + * \brief Set currently used OpenGL program. 
+ */ + void SetCurrentProgram(const Program& program); + + /*! + * \brief Set uniform values for an OpenGL program. + * Must call SetCurrentProgram before calling this. + * \param program The OpenGL program. + * \param name The uniform argument name. + * \param type The type of the uniform. + * \param value The value to pass in. + */ + void SetUniform(const Program& program, + const std::string& name, + TVMType type, + void* value); + + /*! + * \brief Set input texture for an OpenGL program. + * Must call SetCurrentProgram before calling this. + * \param program The OpenGL program. + * \param name The texture uniform argument name. + * \param unit The texture unit to use. Each input texture must occupy a + * different unit. + * \param texture The OpenGL texture to pass in. + */ + void SetInputTexture(const Program& program, + const std::string& name, + GLuint unit, + Texture* texture); + + /*! + * \brief Render to a texture. + * \param output The output texture. + */ + void Render(Texture* output); + + private: + friend class Texture; + friend class Program; + + // Global singleton. Hide constructor. + OpenGLWorkspace(); + + GLFWwindow* window_; + std::unique_ptr gl; + GLuint vertex_shader_; + static const int kWindowWidth = 640; + static const int kWindowHeight = 480; + struct Vertex { + float x, y; + }; + static constexpr size_t kNumVertices = 6; + static const Vertex vertices[kNumVertices]; + static const char* vertex_shader_text_; + + /*! + * \brief Bind a texture to a "texture unit". + * After calling this function, the "texture unit" becomes "active", and the + * texture is bound to GL_TEXTURE_2D in that "texture unit". + * \param unit The texture unit to activate. + * \param texture The texture to bind. + */ + void BindTextureUnit(GLuint unit, GLuint texture); + + /*! + * \brief Callback in Texture's destructor. + */ + void OnDeleteTexture(GLuint texture); + + /*! + * \brief Callback in Program's destructor. + */ + void OnDeleteProgram(GLuint program); + + /*! + * \brief Check if there is any outstanding OpenGL error. If there is, crash. + */ + void CheckOpenGLError(); + + /*! + * \brief Get the maximum number of texture units. + */ + GLuint NumTextureUnits(); + + /*! + * \brief Create and compile a shader from a source string. + * \param shader_kind The kind of shader. + * Could be GL_VERTEX_SHADER or GL_FRAGMENT_SHADER. + * \param shader_src The source string of the shader. + * \return The compiled shader ID. + */ + GLuint CreateShader(GLenum shader_kind, const char* shader_src); + + /*! + * \brief Create an OpenGL program that uses the given fragment shader. + * \param fragment_shader The **compiled** fragment shader. + * \return The OpenGL program. + */ + Program CreateProgram(GLuint fragment_shader); +}; + +/*! + * \brief An OpenGL program, composed of a vertex shader and a fragment shader. + * In TVM, every program has the same vertex shader. + * So a program just corresponds to a fragment shader. + * A program can only be created by the workspace. + * This class is just a wrapper over an OpenGL program ID. + */ +class Program { + public: + // Move constructor. + Program(Program&& other) noexcept + : workspace_(other.workspace_), program_(other.program_) { + other.program_ = kInvalidProgram; + } + + // Move assignment. + Program& operator=(Program&& other) noexcept { + workspace_ = other.workspace_; + program_ = other.program_; + other.program_ = kInvalidProgram; + return *this; + } + + // Disallow copy. 
+ Program(const Program& other) = delete; + Program& operator=(const Program& other) = delete; + + // Destructor. + ~Program() { + if (program_ != kInvalidProgram) { + workspace_->OnDeleteProgram(program_); + program_ = kInvalidProgram; + } + } + + private: + friend class OpenGLWorkspace; + + // Only OpenGLWorkspace can create a Program. + // We enforce this to make sure OpenGL is initialized. + explicit Program(OpenGLWorkspace* workspace, GLuint program) + : workspace_(workspace), program_(program) {} + + // The internal OpenGL program ID. + GLuint program() const { return program_; } + + static constexpr GLuint kInvalidProgram = static_cast(-1); + + OpenGLWorkspace* workspace_; + GLuint program_; +}; + +/*! + * \brief The storage format of a texture. + * The members match the API of glTexImage2D. + */ +struct TextureFormat { + TextureFormat(GLint internal_format, GLenum format, GLenum type) + : internal_format(internal_format), format(format), type(type) {} + + GLsizei elemsz() const { + switch (type) { + case GL_BYTE: case GL_UNSIGNED_BYTE: + return 1; + case GL_SHORT: case GL_UNSIGNED_SHORT: + return 2; + case GL_INT: case GL_UNSIGNED_INT: + return 4; + case GL_FLOAT: + return 4; + default: + LOG(FATAL) << "Unsupported type"; + return -1; + } + } + + bool operator==(const TextureFormat& other) const { + return std::make_tuple(internal_format, format, type) == + std::make_tuple(other.internal_format, other.format, other.type); + } + + GLint internal_format; // OpenGL says this is GLint, not GLenum. + GLenum format; + GLenum type; +}; + +/*! + * \brief An OpenGL texture represents a chunk of GPU memory. + * This is the way we represent tensors. + * We always use 2D textures. + */ +class Texture { + public: + // Move constructor. + Texture(Texture&& other) noexcept + : workspace_(other.workspace_), texture_(other.texture_), + format_(other.format_), width_(other.width_), height_(other.height_) { + other.texture_ = kInvalidTexture; + } + + // Move assignment. + Texture& operator=(Texture&& other) noexcept { + workspace_ = other.workspace_; + texture_ = other.texture_; + format_ = other.format_; + width_ = other.width_; + height_ = other.height_; + other.texture_ = kInvalidTexture; + return *this; + } + + // Disallow copy. + Texture(const Texture& other) = delete; + Texture& operator=(const Texture& other) = delete; + + // Destructor. + ~Texture() { + if (texture_ != kInvalidTexture) { + workspace_->OnDeleteTexture(texture_); + texture_ = kInvalidTexture; + } + } + + /*! + * \brief The width of the texture in number of pixels. + */ + GLsizei width() const { return width_; } + + /*! + * \brief The height of the texture in number of pixels. + */ + GLsizei height() const { return height_; } + + /*! + * \brief The number of bytes of each element in the array. + */ + GLsizei elemsz() const { return format_.elemsz(); } + + private: + friend class OpenGLWorkspace; + + // Only OpenGLWorkspace can create a Texture. + // We enforce this to make sure OpenGL is initialized. + // Always only use the first dimension of a 2D texture. + // The reason is that texelFetch only supports 2D textures. + explicit Texture(OpenGLWorkspace* workspace, GLuint texture, + TextureFormat format, + GLsizei width, GLsizei height) + : workspace_(workspace), texture_(texture), format_(format), + width_(width), height_(height) {} + + // The internal texture ID. 
+ GLuint texture() const { return texture_; } + + static constexpr GLuint kInvalidTexture = static_cast(-1); + + OpenGLWorkspace* workspace_; + GLuint texture_; + TextureFormat format_; + GLsizei width_; + GLsizei height_; +}; + +} // namespace gl +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_OPENGL_OPENGL_COMMON_H_ diff --git a/src/runtime/opengl/opengl_device_api.cc b/src/runtime/opengl/opengl_device_api.cc new file mode 100644 index 000000000000..798003af902f --- /dev/null +++ b/src/runtime/opengl/opengl_device_api.cc @@ -0,0 +1,556 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file opengl_device_api.cc + */ +#include "./opengl_common.h" + +#if TVM_OPENGL_RUNTIME + +#include +#include + +namespace tvm { +namespace runtime { +namespace gl { + +/*! + * \brief Turn OpenGL error enum to string. + */ +static const char* GLGetErrorString(GLenum error) { + switch (error) { + case GL_NO_ERROR: + return "GL_NO_ERROR"; + case GL_INVALID_ENUM: + return "GL_INVALID_ENUM"; + case GL_INVALID_VALUE: + return "GL_INVALID_VALUE"; + case GL_INVALID_OPERATION: + return "GL_INVALID_OPERATION"; + case GL_STACK_OVERFLOW: + return "GL_STACK_OVERFLOW"; + case GL_STACK_UNDERFLOW: + return "GL_STACK_UNDERFLOW"; + case GL_OUT_OF_MEMORY: + return "GL_OUT_OF_MEMORY"; + default: + return "Unknown OpenGL error code"; + } +} + +/*! + * \brief Get the latest error. + */ +void OpenGLWorkspace::CheckOpenGLError() { + GLenum err = gl->GetError(); + CHECK_EQ(err, GL_NO_ERROR) << "OpenGL error, code=" << err << ": " + << gl::GLGetErrorString(err); +} + +/*! + * \brief Protected OpenGL call. + * \param func Expression to call. + */ +#define OPENGL_CALL(func) \ + { \ + (func); \ + CheckOpenGLError(); \ + } + +/*! + * \brief The error handling callback passed to GLFW. + */ +void GlfwErrorCallback(int err, const char* str) { + LOG(FATAL) << "Error: [" << err << "] " << str; +} + +const std::shared_ptr& OpenGLWorkspace::Global() { + static std::shared_ptr inst(new OpenGLWorkspace); + return inst; +} + +void OpenGLWorkspace::SetDevice(TVMContext ctx) { + CHECK_EQ(ctx.device_type, static_cast(kOpenGL)) + << "Device type must be OpenGL."; + CHECK_EQ(ctx.device_id, 0) << "Only support 1 OpenGL \"device\"."; +} + +void OpenGLWorkspace::GetAttr( + TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) { + switch (kind) { + case kExist: { + *rv = static_cast(ctx.device_id == 0); + break; + } + case kMaxThreadsPerBlock: { + GLint max_texture_size; + OPENGL_CALL(gl->GetIntegerv(GL_MAX_TEXTURE_SIZE, &max_texture_size)); + break; + } + case kWarpSize: { + *rv = 1; + break; + } + case kComputeVersion: { + break; + } + } +} + +void* OpenGLWorkspace::AllocDataSpace( + TVMContext ctx, size_t nbytes, size_t alignment, TVMType type_hint) { + return reinterpret_cast(new Texture(CreateTexture(type_hint, nbytes))); +} + +void OpenGLWorkspace::FreeDataSpace(TVMContext ctx, void* ptr) { + delete reinterpret_cast(ptr); +} + +void OpenGLWorkspace::CopyDataFromTo(const void* from, + size_t from_offset, + void* to, + size_t to_offset, + size_t size, + TVMContext ctx_from, + TVMContext ctx_to, + TVMStreamHandle stream) { + CHECK(stream == nullptr); + + // TODO(zhixunt): This is a nasty hack to avoid comparison between + // incompatible enums. We should add kOpenGL to dlpack. 
+ constexpr int gl_devtype = kOpenGL; + std::tuple type_from_to(ctx_from.device_type, ctx_to.device_type); + + if (type_from_to == std::make_tuple(gl_devtype, gl_devtype)) { + auto from_texture = static_cast(from); + auto to_texture = static_cast(to); + auto temp_buffer = std::unique_ptr(new char[size]); + CHECK(from_texture->format_ == to_texture->format_); + auto elemsz = from_texture->elemsz(); + auto from_begin = static_cast(from_offset / elemsz); + auto to_begin = static_cast(to_offset / elemsz); + auto nelems = static_cast(size / elemsz); + GetTextureData(from_texture, from_begin, nelems, temp_buffer.get()); + PutTextureData(to_texture, to_begin, nelems, temp_buffer.get()); + + } else if (type_from_to == std::make_tuple(gl_devtype, kDLCPU)) { + auto texture = static_cast(from); + void *data = static_cast(to) + to_offset; + auto elemsz = texture->elemsz(); + auto begin = static_cast(from_offset / elemsz); + auto nelems = static_cast(size / elemsz); + GetTextureData(texture, begin, nelems, data); + + } else if (type_from_to == std::make_tuple(kDLCPU, gl_devtype)) { + auto texture = reinterpret_cast(to); + const void* data = static_cast(from) + from_offset; + auto elemsz = texture->elemsz(); + auto begin = static_cast(to_offset / elemsz); + auto nelems = static_cast(size / elemsz); + PutTextureData(texture, begin, nelems, data); + + } else { + LOG(FATAL) << "Expect copy from/to OpenGL or between OpenGL"; + } +} + +void OpenGLWorkspace::StreamSync(TVMContext ctx, TVMStreamHandle stream) {} + +void* OpenGLWorkspace::AllocWorkspace(TVMContext ctx, size_t size) { + LOG(FATAL) << "Cannot allocate OpenGL workspace."; + return nullptr; +} + +void OpenGLWorkspace::FreeWorkspace(TVMContext ctx, void* data) { + LOG(FATAL) << "Cannot free OpenGL workspace."; +} + +OpenGLWorkspace::OpenGLWorkspace() { + // Set an error handler. + // This can be called before glfwInit(). + glfwSetErrorCallback(&GlfwErrorCallback); + + // Initialize GLFW. + if (glfwInit() != GL_TRUE) { + LOG(FATAL) << "glfwInit() failed!"; + } + + // Create a window. + glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3); + glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3); + glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE); + glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); + glfwWindowHint(GLFW_VISIBLE, GL_FALSE); + window_ = glfwCreateWindow(kWindowWidth, kWindowHeight, "", nullptr, nullptr); + if (window_ == nullptr) { + LOG(FATAL) << "glfwCreateWindow() failed!"; + } + + // Before using any OpenGL API, we must specify a context. + glfwMakeContextCurrent(window_); + + // Load all OpenGL API function pointers. + gl = std::unique_ptr(new GLFunctionPointers); + + CheckOpenGLError(); + + // We always render the same vertices and triangles. + GLuint vertex_buffer; + OPENGL_CALL(gl->GenBuffers(1, &vertex_buffer)); + OPENGL_CALL(gl->BindBuffer(GL_ARRAY_BUFFER, vertex_buffer)); + OPENGL_CALL(gl->BufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, + GL_STATIC_DRAW)); + + GLuint vertex_array; + OPENGL_CALL(gl->GenVertexArrays(1, &vertex_array)); + OPENGL_CALL(gl->BindVertexArray(vertex_array)); + OPENGL_CALL(gl->BindBuffer(GL_ARRAY_BUFFER, vertex_buffer)); + + // We always use the same vertex shader. + vertex_shader_ = CreateShader(GL_VERTEX_SHADER, vertex_shader_text_); + + LOG(INFO) << "OpenGL initialized, version = " << gl->GetString(GL_VERSION); +} + +OpenGLWorkspace::~OpenGLWorkspace() { + // Paired with glfwCreateWindow(). + glfwDestroyWindow(window_); + + // Paired with glfwInit(). 
+ glfwTerminate(); +} + +void OpenGLWorkspace::BindTextureUnit(GLuint unit, GLuint texture) { + OPENGL_CALL(gl->ActiveTexture(GL_TEXTURE0 + unit)); + OPENGL_CALL(gl->BindTexture(GL_TEXTURE_2D, texture)); +} + +void OpenGLWorkspace::OnDeleteTexture(GLuint texture) { + OPENGL_CALL(gl->DeleteTextures(1, &texture)); +} + +void OpenGLWorkspace::OnDeleteProgram(GLuint program) { + OPENGL_CALL(gl->DeleteProgram(program)); +} + +GLuint OpenGLWorkspace::NumTextureUnits() { + GLint num_units; + OPENGL_CALL(gl->GetIntegerv(GL_MAX_COMBINED_TEXTURE_IMAGE_UNITS, &num_units)); + return static_cast(num_units); +} + +const OpenGLWorkspace::Vertex OpenGLWorkspace::vertices[OpenGLWorkspace::kNumVertices] = { + {-1.f, -1.f}, + {1.0f, -1.f}, + {1.0f, 1.0f}, + {-1.f, -1.f}, + {-1.f, 1.0f}, + {1.0f, 1.0f}, +}; + +// Don't need to change this. +// The vertex shader only needs to take in the triangle points. +// No need for point transformations. +const char* OpenGLWorkspace::vertex_shader_text_ = "#version 300 es\n" + "in vec2 point; // input to vertex shader\n" + "void main() {\n" + " gl_Position = vec4(point, 0.0, 1.0);\n" + "}\n"; + +Program OpenGLWorkspace::CreateProgram( + const char* fragment_shader_src) { + // Create and compile the shaders. + GLuint fragment_shader = CreateShader(GL_FRAGMENT_SHADER, + fragment_shader_src); + + // Link the shaders and create the program. + Program program = CreateProgram(fragment_shader); + + OPENGL_CALL(gl->DeleteShader(fragment_shader)); + + return program; +} + +GLuint OpenGLWorkspace::CreateShader(GLenum shader_kind, + const char* shader_src) { + // Create the shader. + GLuint shader = gl->CreateShader(shader_kind); + gl->ShaderSource(shader, 1, &shader_src, nullptr); + gl->CompileShader(shader); + + // Check compile errors. + GLint err; + gl->GetShaderiv(shader, GL_COMPILE_STATUS, &err); + + GLint info_log_len; + gl->GetShaderiv(shader, GL_INFO_LOG_LENGTH, &info_log_len); + + if (err != GL_TRUE) { + std::unique_ptr err_msg(new char[info_log_len + 1]); + gl->GetShaderInfoLog(shader, info_log_len, nullptr, err_msg.get()); + LOG(FATAL) << err_msg.get(); + assert(false); + } + + CheckOpenGLError(); + + return shader; +} + +static TextureFormat GetTextureFormat(TVMType type) { + CHECK_EQ(type.lanes, 1) << "Not supporting multi-lane types."; + + switch (type.code) { + case kDLInt: { + switch (type.bits) { + case 8: + return {GL_R8I, GL_RED_INTEGER, GL_BYTE}; + case 16: + return {GL_R16I, GL_RED_INTEGER, GL_SHORT}; + case 32: + return {GL_R32I, GL_RED_INTEGER, GL_INT}; + default: + LOG(FATAL) << "Unsupported type bits " << type.bits; + } + } + case kDLUInt: { + switch (type.bits) { + case 8: + return {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE}; + case 16: + return {GL_R16UI, GL_RED_INTEGER, GL_UNSIGNED_SHORT}; + case 32: + return {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT}; + default: + LOG(FATAL) << "Unsupported type bits " << type.bits; + } + } + case kDLFloat: { + switch (type.bits) { + case 32: + return {GL_R32F, GL_RED, GL_FLOAT}; + default: + LOG(FATAL) << "Unsupported type bits " << type.bits; + } + } + default: + LOG(FATAL) << "Unsupported type code" << type.code; + } + assert(false); +} + +Texture OpenGLWorkspace::CreateTexture(TVMType type, size_t nbytes) { + // Create a texture. + GLuint texture; + OPENGL_CALL(gl->GenTextures(1, &texture)); + + BindTextureUnit(NumTextureUnits() - 1, texture); + + // Use glTexImage2D with nullptr data to specify GPU data storage. 
+ auto texture_format = GetTextureFormat(type); + auto width = static_cast(nbytes / (type.bits / 8)); + auto height = GLsizei(1); + OPENGL_CALL(gl->TexImage2D(GL_TEXTURE_2D, /*level=*/0, + texture_format.internal_format, + width, height, /*border=*/0, + texture_format.format, texture_format.type, + /*data=*/nullptr)); + + OPENGL_CALL( + gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE)); + OPENGL_CALL( + gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE)); + OPENGL_CALL( + gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST)); + OPENGL_CALL( + gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST)); + + return Texture(this, texture, texture_format, width, height); +} + +Program OpenGLWorkspace::CreateProgram(GLuint fragment_shader) { + // Create the program and link the shaders. + GLuint program = gl->CreateProgram(); + gl->AttachShader(program, vertex_shader_); + gl->AttachShader(program, fragment_shader); + gl->LinkProgram(program); + + // Check link errors. + GLint err; + gl->GetProgramiv(program, GL_LINK_STATUS, &err); + + GLint info_log_len; + gl->GetProgramiv(program, GL_INFO_LOG_LENGTH, &info_log_len); + + if (err != GL_TRUE) { + std::unique_ptr err_msg(new char[info_log_len + 1]); + gl->GetProgramInfoLog(program, info_log_len, nullptr, err_msg.get()); + LOG(FATAL) << err_msg.get(); + assert(false); + } + + CheckOpenGLError(); + + OPENGL_CALL(gl->DetachShader(program, vertex_shader_)); + OPENGL_CALL(gl->DetachShader(program, fragment_shader)); + + auto point_attrib = GLuint(gl->GetAttribLocation(program, "point")); + OPENGL_CALL(gl->EnableVertexAttribArray(point_attrib)); + + OPENGL_CALL(gl->VertexAttribPointer(point_attrib, 2, GL_FLOAT, GL_FALSE, + sizeof(Vertex), nullptr)); + + return Program(this, program); +} + +void OpenGLWorkspace::PutTextureData(Texture *texture, + GLint begin, + GLsizei nelems, + const GLvoid* data) { + // Bind to temporary unit. + BindTextureUnit(NumTextureUnits() - 1, texture->texture()); + + // Similar to cudaMemcpy. + OPENGL_CALL(gl->TexSubImage2D(GL_TEXTURE_2D, /*level=*/0, + /*xoffset=*/begin, /*yoffset=*/0, + /*width=*/nelems, /*height=*/1, + texture->format_.format, texture->format_.type, + data)); +} + +void OpenGLWorkspace::GetTextureData(const Texture *texture, + GLint begin, + GLsizei nelems, + GLvoid* data) { + BindTextureUnit(NumTextureUnits() - 1, texture->texture()); + + // Create frame buffer. + GLuint frame_buffer; + OPENGL_CALL(gl->GenFramebuffers(1, &frame_buffer)); + OPENGL_CALL(gl->BindFramebuffer(GL_FRAMEBUFFER, frame_buffer)); + + // Bind texture to framebuffer's attachment 0. + OPENGL_CALL(gl->FramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + GL_TEXTURE_2D, texture->texture(), 0)); + + // Always check that our framebuffer is okay. + if (gl->CheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { + LOG(FATAL) << "Framebuffer not complete."; + } + +#ifdef __EMSCRIPTEN__ + // WebGL2's glReadPixels API doesn't allow GL_RED user buffer format. + // Instead, We must use GL_RGBA. This means the data we retrieve has useless + // GBA channels. Here we are applying a dirty hack. + // TODO(zhixunt): We really want to utilize all RGBA channels in textures. + // + // WebGL2's glReadPixels API also doesn't allow GL_RED_INTEGER or + // GL_RGB_INTEGER user buffer format, which means we cannot retrieve integer + // texture data? 
(need to confirm) + + CHECK_EQ(texture->format_.internal_format, GL_R32F) + << "Retrieving integer texture not supported yet."; + auto elemsz = texture->format_.elemsz(); + auto nchannels = 4; + auto padded_data_size = nchannels * nelems * elemsz; + auto padded_data = std::unique_ptr(new char[padded_data_size]); + OPENGL_CALL(gl->ReadPixels(/*x=*/begin, /*y=*/0, /*width=*/nelems, + /*height=*/1, GL_RGBA, GL_FLOAT, + padded_data.get())); + for (GLsizei i = 0; i != nelems; ++i) { + auto dst = reinterpret_cast(data) + i * elemsz; + auto src = padded_data.get() + nchannels * i * elemsz; + std::memcpy(dst, src, elemsz); + } +#else + OPENGL_CALL(gl->ReadPixels(/*x=*/begin, /*y=*/0, /*width=*/nelems, + /*height=*/1, texture->format_.format, + texture->format_.type, data)); +#endif + + OPENGL_CALL(gl->DeleteFramebuffers(1, &frame_buffer)); +} + +void OpenGLWorkspace::SetCurrentProgram(const Program& program) { + OPENGL_CALL(gl->UseProgram(program.program())); +} + +void OpenGLWorkspace::SetUniform(const Program& program, + const std::string& name, + TVMType type, + void* value) { + GLint location = gl->GetUniformLocation(program.program(), name.c_str()); + switch (type.code) { + case kDLInt: { + CHECK_EQ(type.bits, 32) << "Only support 32-bit int for uniform."; + GLint uniform_value = *reinterpret_cast(value); + OPENGL_CALL(gl->Uniform1i(location, uniform_value)); + break; + } + case kDLUInt: { + LOG(FATAL) << "Strangely, emcc WebGL does not support glUniform1ui."; + break; + } + case kDLFloat: { + CHECK_EQ(type.bits, 32) << "Only support 32-bit float for uniform."; + GLfloat uniform_value = *reinterpret_cast(value); + OPENGL_CALL(gl->Uniform1f(location, uniform_value)); + break; + } + default: { + LOG(FATAL) << "Unsupported type code for uniform."; + break; + } + } +} + +void OpenGLWorkspace::SetInputTexture(const Program& program, + const std::string& name, + GLuint unit, + Texture* texture) { + // We always use the last texture unit as temporary. + // Therefore, we can have "NumTextureUnits() - 1" input textures. + CHECK_LT(unit, NumTextureUnits() - 1) << "Too many textures."; + + BindTextureUnit(unit, texture->texture()); + GLint location = gl->GetUniformLocation(program.program_, name.c_str()); + OPENGL_CALL(gl->Uniform1i(location, unit)); +} + +void OpenGLWorkspace::Render(Texture* output) { + // Create frame buffer. + GLuint frame_buffer; + OPENGL_CALL(gl->GenFramebuffers(1, &frame_buffer)); + OPENGL_CALL(gl->BindFramebuffer(GL_FRAMEBUFFER, frame_buffer)); + + // Set "renderedTexture" as our colour attachement 0. + OPENGL_CALL(gl->FramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + GL_TEXTURE_2D, output->texture(), 0)); + + // Specify that we will render to color attachment 0. + GLenum DrawBuffers[1] = {GL_COLOR_ATTACHMENT0}; + OPENGL_CALL(gl->DrawBuffers(1, DrawBuffers)); + + // Always check that our framebuffer is okay. + if (gl->CheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { + LOG(FATAL) << "Framebuffer not complete."; + } + + // Perform rendering. 
+ OPENGL_CALL(gl->Viewport(0, 0, output->width(), output->height())); + OPENGL_CALL(gl->Clear(GL_COLOR_BUFFER_BIT)); + OPENGL_CALL(gl->DrawArrays(GL_TRIANGLES, 0, 6)); + + OPENGL_CALL(gl->DeleteFramebuffers(1, &frame_buffer)); +} + +TVM_REGISTER_GLOBAL("device_api.opengl") +.set_body([](TVMArgs args, TVMRetValue* rv) { + DeviceAPI* ptr = OpenGLWorkspace::Global().get(); + *rv = static_cast(ptr); +}); + +} // namespace gl +} // namespace runtime +} // namespace tvm + +#endif // TVM_OPENGL_RUNTIME diff --git a/src/runtime/opengl/opengl_module.cc b/src/runtime/opengl/opengl_module.cc new file mode 100644 index 000000000000..6793bd6d29b6 --- /dev/null +++ b/src/runtime/opengl/opengl_module.cc @@ -0,0 +1,284 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file opengl_module.cc + */ +#include + +#include "./opengl_common.h" +#include "./opengl_module.h" + +#if TVM_OPENGL_RUNTIME + +#include +#include "../pack_args.h" +#include "../thread_storage_scope.h" +#include "../file_util.h" + +namespace tvm { +namespace runtime { + +class OpenGLModuleNode final : public ModuleNode { + public: + OpenGLModuleNode(std::unordered_map shaders, + std::string fmt, + std::unordered_map fmap); + + ~OpenGLModuleNode() override = default; + + const char* type_key() const final { return "opengl"; } + + PackedFunc GetFunction(const std::string& name, + const std::shared_ptr& sptr_to_self) final; + + std::string GetSource(const std::string& format) final; + + void SaveToFile(const std::string& file_name, + const std::string& format) final; + + void SaveToBinary(dmlc::Stream* stream) final; + + const gl::Program& GetProgram(const std::string& func_name) const; + + const OpenGLShader& GetShader(const std::string& func_name) const; + + const FunctionInfo& GetFunctionInfo(const std::string& func_name) const; + + gl::OpenGLWorkspace& workspace() const { return *workspace_; } + + private: + std::shared_ptr workspace_; + std::unordered_map shaders_; + std::string fmt_; + std::unordered_map fmap_; + std::unordered_map programs_; + + DISALLOW_COPY_AND_ASSIGN(OpenGLModuleNode); +}; + +class OpenGLWrappedFunc { + public: + OpenGLWrappedFunc(OpenGLModuleNode* m, + std::shared_ptr sptr, + std::string func_name, + std::vector arg_size, + const std::vector& thread_axis_tags); + + void operator()(TVMArgs args, TVMRetValue* rv, void** void_args) const; + + private: + // The module + OpenGLModuleNode* m_; + // resource handle + std::shared_ptr sptr_; + // The name of the function. 
+ std::string func_name_; + // convert code for void argument + std::vector arg_size_; + // thread axis config + ThreadAxisConfig thread_axis_cfg_; +}; + +OpenGLModuleNode::OpenGLModuleNode( + std::unordered_map shaders, + std::string fmt, + std::unordered_map fmap) + : workspace_(gl::OpenGLWorkspace::Global()), shaders_(std::move(shaders)), + fmt_(std::move(fmt)), fmap_(std::move(fmap)), programs_() { + CHECK_EQ(fmt_, "gl") << "Unknown OpenGL format " << fmt_; + for (auto &pair : shaders_) { + auto &func_name = pair.first; + auto &shader = pair.second; + programs_.emplace(func_name, + workspace_->CreateProgram(shader.source.c_str())); + } +} + +PackedFunc OpenGLModuleNode::GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) { + CHECK_EQ(sptr_to_self.get(), this); + CHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main"; + + auto func_info_it = fmap_.find(name); + if (func_info_it == fmap_.end()) { return PackedFunc(); } + auto &func_info = func_info_it->second; + + std::vector arg_size(func_info.arg_types.size()); + for (size_t i = 0; i < func_info.arg_types.size(); ++i) { + TVMType t = func_info.arg_types[i]; + CHECK_EQ(t.lanes, 1U); + uint32_t bits = t.bits; + CHECK_EQ(bits % 8, 0U); + arg_size[i] = bits / 8; + } + + // Initialize the wrapped func. + OpenGLWrappedFunc f(this, sptr_to_self, name, arg_size, + func_info.thread_axis_tags); + return PackFuncVoidAddr(f, func_info.arg_types); +} + +std::string OpenGLModuleNode::GetSource(const std::string& format) { + if (format != fmt_ && fmt_ != "gl") { return ""; } + + std::ostringstream os; + for (auto &pair : shaders_) { + auto &name = pair.first; + auto &shader = pair.second; + os << "[" << name << "]" << "\n"; + os << shader.source <<"\n"; + } + return os.str(); +} + +void OpenGLModuleNode::SaveToFile(const std::string& file_name, + const std::string& format) { + std::string fmt = GetFileFormat(file_name, format); + CHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_; + std::string meta_file = GetMetaFilePath(file_name); + SaveMetaDataToFile(meta_file, fmap_); + SaveBinaryToFile(file_name, ToJSON(shaders_)); +} + +void OpenGLModuleNode::SaveToBinary(dmlc::Stream* stream) { + stream->Write(fmt_); + stream->Write(fmap_); + stream->Write(ToJSON(shaders_)); +} + +const gl::Program& OpenGLModuleNode::GetProgram( + const std::string& func_name) const { + auto it = programs_.find(func_name); + if (it == programs_.end()) { + LOG(FATAL) << "Cannot find program"; + } + return it->second; +} + +const OpenGLShader& OpenGLModuleNode::GetShader( + const std::string& func_name) const { + auto it = shaders_.find(func_name); + if (it == shaders_.end()) { + LOG(FATAL) << "Cannot find shader"; + } + return it->second; +} + +const FunctionInfo& OpenGLModuleNode::GetFunctionInfo( + const std::string& func_name) const { + auto it = fmap_.find(func_name); + if (it == fmap_.end()) { + LOG(FATAL) << "Cannot find shader"; + } + return it->second; +} + +OpenGLWrappedFunc::OpenGLWrappedFunc( + OpenGLModuleNode* m, + std::shared_ptr sptr, + std::string func_name, + std::vector arg_size, + const std::vector& thread_axis_tags) + : m_(m), sptr_(std::move(sptr)), func_name_(std::move(func_name)), + arg_size_(std::move(arg_size)) { + thread_axis_cfg_.Init(arg_size_.size(), thread_axis_tags); +} + +void OpenGLWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, + void** void_args) const { + auto &shader = m_->GetShader(func_name_); + auto &program = m_->GetProgram(func_name_); + auto &func_info = 
m_->GetFunctionInfo(func_name_); + size_t nargs = shader.arg_kinds.size(); + + // Must call this function before setting uniforms & input textures. + m_->workspace().SetCurrentProgram(program); + + // Set all arguments. + GLuint texture_unit = 0; + gl::Texture* output = nullptr; + for (size_t i = 0; i != nargs; ++i) { + auto &name = shader.arg_names.at(i); + auto kind = shader.arg_kinds.at(i); + auto type = func_info.arg_types.at(i); + switch (kind) { + case OpenGLArgKind::kUniform: { + m_->workspace().SetUniform(program, name, type, void_args[i]); + break; + } + case OpenGLArgKind::kInputTexture: { + CHECK_EQ(type.code, kHandle) << "Type is not handle?"; + auto texture = *static_cast(void_args[i]); + m_->workspace().SetInputTexture(program, name, texture_unit, texture); + ++texture_unit; + break; + } + case OpenGLArgKind::kOutputTexture: { + CHECK_EQ(type.code, kHandle) << "Type is not handle?"; + CHECK(output == nullptr) << "Can only have one output texture."; + output = *static_cast(void_args[i]); + break; + } + } + } + + // Set "thread_extent" uniform. + ThreadWorkLoad wl = thread_axis_cfg_.Extract(args); + std::unique_ptr thread_extent(new GLint(wl.block_dim(0))); + m_->workspace().SetUniform(program, shader.thread_extent_var, + TVMType{kDLInt, 32, 1}, + static_cast(thread_extent.get())); + + m_->workspace().Render(output); +} + +Module OpenGLModuleCreate(std::unordered_map shaders, + std::string fmt, + std::unordered_map fmap) { + auto n = std::make_shared(std::move(shaders), + std::move(fmt), + std::move(fmap)); + return Module(n); +} + +Module OpenGLModuleLoadFile(const std::string& file_name, + const std::string& format) { + std::string data; + std::unordered_map fmap; + std::string fmt = GetFileFormat(file_name, format); + std::string meta_file = GetMetaFilePath(file_name); + LoadBinaryFromFile(file_name, &data); + LoadMetaDataFromFile(meta_file, &fmap); + return OpenGLModuleCreate(FromJSON(data), fmt, fmap); +} + +Module OpenGLModuleLoadBinary(void* strm) { + auto stream = static_cast(strm); + std::string data; + std::unordered_map fmap; + std::string fmt; + stream->Read(&fmt); + stream->Read(&fmap); + stream->Read(&data); + return OpenGLModuleCreate(FromJSON(data), fmt, fmap); +} + +TVM_REGISTER_GLOBAL("module.loadfile_gl") + .set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = OpenGLModuleLoadFile(args[0], args[1]); + }); + +TVM_REGISTER_GLOBAL("module.loadfile_glbin") + .set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = OpenGLModuleLoadFile(args[0], args[1]); + }); + +TVM_REGISTER_GLOBAL("module.loadbinary_opengl") + .set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = OpenGLModuleLoadBinary(args[0]); + }); + +} // namespace runtime +} // namespace tvm + +#endif // TVM_OPENGL_RUNTIME diff --git a/src/runtime/opengl/opengl_module.h b/src/runtime/opengl/opengl_module.h new file mode 100644 index 000000000000..1913878c3d68 --- /dev/null +++ b/src/runtime/opengl/opengl_module.h @@ -0,0 +1,148 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file opengl_module.h + * \brief Execution handling of OpenGL kernels + */ +#ifndef TVM_RUNTIME_OPENGL_OPENGL_MODULE_H_ +#define TVM_RUNTIME_OPENGL_OPENGL_MODULE_H_ + +#include +#include +#include +#include +#include +#include +#include "../meta_data.h" + +namespace tvm { +namespace runtime { + +/*! + * \brief Determines how we supply arguments. + */ +enum class OpenGLArgKind { + kInputTexture = 0, // Bind to "gsampler2D" in GLSL. + kOutputTexture = 1, // Bind to "out" in GLSL. + kUniform = 2, // Bind to "uniform" in GLSL. 
+}; + +std::string OpenGLArgKind2String(OpenGLArgKind kind); +OpenGLArgKind String2OpenGLArgKind(const std::string& str); + +/*! + * \brief The output of OpenGL codegen. + * Contains necessary information to build a fragment shader and bind arguments. + */ +struct OpenGLShader { + OpenGLShader() = default; + OpenGLShader(std::string source, + std::vector arg_names, + std::vector arg_kinds, + std::string thread_extent_var) + : source(std::move(source)), arg_names(std::move(arg_names)), + arg_kinds(std::move(arg_kinds)), + thread_extent_var(std::move(thread_extent_var)) { + CHECK_EQ(this->arg_names.size(), this->arg_kinds.size()) << "Invalid input"; + } + + std::string source; + std::vector arg_names; // Matches FunctionInfo. + std::vector arg_kinds; // Matches FunctionInfo. + std::string thread_extent_var; // Stores the output length. + + void Save(dmlc::JSONWriter* writer) const; + void Load(dmlc::JSONReader* reader); +}; + +std::string ToJSON(const std::unordered_map& shaders); +std::unordered_map FromJSON(const std::string& str); + +/*! + * \brief Create an OpenGL module from data. + * + * \param data The module data. + * \param fmt The format of the data, + * \param fmap The map function information map of each function. + */ +Module OpenGLModuleCreate(std::unordered_map shaders, + std::string fmt, + std::unordered_map fmap); + +inline std::string OpenGLArgKind2String(OpenGLArgKind kind) { + switch (kind) { + case OpenGLArgKind::kOutputTexture: + return "output_texture"; + case OpenGLArgKind::kInputTexture: + return "input_texture"; + case OpenGLArgKind::kUniform: + return "uniform"; + } + assert(false); +} + +inline OpenGLArgKind String2OpenGLArgKind(const std::string& str) { + if (str == "output_texture") { + return OpenGLArgKind::kOutputTexture; + } else if (str == "input_texture") { + return OpenGLArgKind::kInputTexture; + } else if (str == "uniform") { + return OpenGLArgKind::kUniform; + } else { + LOG(FATAL) << "Invalid OpenGL arg kind."; + assert(false); + } +} + +inline void OpenGLShader::Save(dmlc::JSONWriter* writer) const { + std::vector arg_kind_strs; + for (auto kind : arg_kinds) { + arg_kind_strs.push_back(OpenGLArgKind2String(kind)); + } + + writer->BeginObject(); + writer->WriteObjectKeyValue("arg_names", arg_names); + writer->WriteObjectKeyValue("arg_kinds", arg_kind_strs); + writer->WriteObjectKeyValue("source", source); + writer->WriteObjectKeyValue("thread_extent_var", thread_extent_var); + writer->EndObject(); +} + +inline void OpenGLShader::Load(dmlc::JSONReader* reader) { + std::vector arg_kind_strs; + dmlc::JSONObjectReadHelper helper; + helper.DeclareField("arg_names", &arg_names); + helper.DeclareField("arg_kinds", &arg_kind_strs); + helper.DeclareField("source", &source); + helper.DeclareField("thread_extent_var", &thread_extent_var); + helper.ReadAllFields(reader); + + arg_kinds.clear(); + for (auto& str : arg_kind_strs) { + arg_kinds.push_back(String2OpenGLArgKind(str)); + } +} + +inline std::string ToJSON( + const std::unordered_map& shaders) { + std::ostringstream os; + dmlc::JSONWriter writer(&os); + writer.BeginObject(); + writer.WriteObjectKeyValue("shaders", shaders); + writer.EndObject(); + return os.str(); +} + +inline std::unordered_map FromJSON( + const std::string& str) { + std::unordered_map shaders; + std::istringstream is(str); + dmlc::JSONReader reader(&is); + dmlc::JSONObjectReadHelper helper; + helper.DeclareField("shaders", &shaders); + helper.ReadAllFields(&reader); + return shaders; +} + +} // namespace runtime +} // namespace tvm +#endif 
// TVM_RUNTIME_OPENGL_OPENGL_MODULE_H_ diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc index 1e6154163b35..443d76b76eb6 100644 --- a/src/runtime/rocm/rocm_device_api.cc +++ b/src/runtime/rocm/rocm_device_api.cc @@ -52,12 +52,15 @@ class ROCMDeviceAPI final : public DeviceAPI { } *rv = value; } - void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final { + void* AllocDataSpace(TVMContext ctx, + size_t nbytes, + size_t alignment, + TVMType type_hint) final { ROCM_CALL(hipSetDevice(ctx.device_id)); CHECK_EQ(256 % alignment, 0U) << "ROCM space is aligned at 256 bytes"; void *ret; - ROCM_CALL(hipMalloc(&ret, size)); + ROCM_CALL(hipMalloc(&ret, nbytes)); return ret; } diff --git a/src/runtime/rpc/rpc_device_api.cc b/src/runtime/rpc/rpc_device_api.cc index 7674fa3e2334..bd222b5c272e 100644 --- a/src/runtime/rpc/rpc_device_api.cc +++ b/src/runtime/rpc/rpc_device_api.cc @@ -20,10 +20,13 @@ class RPCDeviceAPI final : public DeviceAPI { *rv = GetSess(ctx)->CallRemote( RPCCode::kDevGetAttr, ctx, static_cast(kind)); } - void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final { + void* AllocDataSpace(TVMContext ctx, + size_t nbytes, + size_t alignment, + TVMType type_hint) final { auto sess = GetSess(ctx); void *data = sess->CallRemote( - RPCCode::kDevAllocData, ctx, size, alignment); + RPCCode::kDevAllocData, ctx, nbytes, alignment, type_hint); RemoteSpace* space = new RemoteSpace(); space->data = data; space->sess = std::move(sess); diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc index 0fa021918ed2..3bff73afe9bd 100644 --- a/src/runtime/rpc/rpc_session.cc +++ b/src/runtime/rpc/rpc_session.cc @@ -887,9 +887,11 @@ void RPCDevGetAttr(TVMArgs args, TVMRetValue *rv) { void RPCDevAllocData(TVMArgs args, TVMRetValue *rv) { TVMContext ctx = args[0]; - uint64_t size = args[1]; + uint64_t nbytes = args[1]; uint64_t alignment = args[2]; - void* data = DeviceAPI::Get(ctx)->AllocDataSpace(ctx, size, alignment); + TVMType type_hint = args[3]; + void* data = DeviceAPI::Get(ctx)->AllocDataSpace( + ctx, nbytes, alignment, type_hint); *rv = data; } diff --git a/src/runtime/workspace_pool.cc b/src/runtime/workspace_pool.cc index 494927979a0f..24035faedaa7 100644 --- a/src/runtime/workspace_pool.cc +++ b/src/runtime/workspace_pool.cc @@ -23,28 +23,29 @@ class WorkspacePool::Pool { allocated_.push_back(e); } // allocate from pool - void* Alloc(TVMContext ctx, DeviceAPI* device, size_t size) { + void* Alloc(TVMContext ctx, DeviceAPI* device, size_t nbytes) { // Allocate align to page. 
- size = (size + (kWorkspacePageSize - 1)) / kWorkspacePageSize * kWorkspacePageSize; - if (size == 0) size = kWorkspacePageSize; + nbytes = (nbytes + (kWorkspacePageSize - 1)) / kWorkspacePageSize * kWorkspacePageSize; + if (nbytes == 0) nbytes = kWorkspacePageSize; Entry e; + TVMType type = {.code = kDLUInt, .bits = 8, .lanes = 1}; if (free_list_.size() == 2) { e = free_list_.back(); free_list_.pop_back(); - if (e.size < size) { + if (e.size < nbytes) { // resize the page device->FreeDataSpace(ctx, e.data); - e.data = device->AllocDataSpace(ctx, size, kTempAllocaAlignment); - e.size = size; + e.data = device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type); + e.size = nbytes; } } else if (free_list_.size() == 1) { - e.data = device->AllocDataSpace(ctx, size, kTempAllocaAlignment); - e.size = size; + e.data = device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type); + e.size = nbytes; } else { - if (free_list_.back().size >= size) { + if (free_list_.back().size >= nbytes) { // find smallest fit auto it = free_list_.end() - 2; - for (; it->size >= size; --it) {} + for (; it->size >= nbytes; --it) {} e = *(it + 1); free_list_.erase(it + 1); } else { @@ -52,8 +53,8 @@ class WorkspacePool::Pool { e = free_list_.back(); free_list_.pop_back(); device->FreeDataSpace(ctx, e.data); - e.data = device->AllocDataSpace(ctx, size, kTempAllocaAlignment); - e.size = size; + e.data = device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type); + e.size = nbytes; } } allocated_.push_back(e); diff --git a/src/schedule/schedule_lang.cc b/src/schedule/schedule_lang.cc index f8fcb8b0c744..59bc3f242b03 100644 --- a/src/schedule/schedule_lang.cc +++ b/src/schedule/schedule_lang.cc @@ -397,6 +397,45 @@ Stage& Stage::double_buffer() { return *this; } +Stage& Stage::opengl() { + CHECK(!is_scheduled()) << "Must be a fresh schedule"; + StageNode *self = operator->(); + + auto all_iter_vars = self->all_iter_vars; // curr version of all_iter_vars + CHECK(!all_iter_vars.empty()) << "At least one iter var"; + + // Fuse all data parallel dimensions to 1. + IterVar fused = all_iter_vars[0]; + for (size_t i = 1; i != all_iter_vars.size(); ++i) { + auto iter_var = all_iter_vars[i]; + switch (iter_var->iter_type) { + case IterVarType::kDataPar: { + fuse(fused, all_iter_vars[i], &fused); + break; + } + case IterVarType::kThreadIndex: { + LOG(ERROR) << "A fresh schedule shouldn't have thread index iter var"; + break; + } + case IterVarType::kCommReduce: + case IterVarType::kOrdered: + case IterVarType::kOpaque: { + break; + } + default: { + LOG(ERROR) << "Invalid iter var type " + << IterVarType2String(iter_var->iter_type); + break; + } + } + } + + // Bind the only dimension to threadIdx.x. 
+ bind(fused, thread_axis(Range(nullptr), "threadIdx.x")); + + return *this; +} + Stage CopyStage(const Stage& s) { std::shared_ptr n = std::make_shared(*s.operator->()); diff --git a/tests/ci_build/Dockerfile.cpu b/tests/ci_build/Dockerfile.cpu index 2c7510365f9d..57fb7e04552b 100644 --- a/tests/ci_build/Dockerfile.cpu +++ b/tests/ci_build/Dockerfile.cpu @@ -20,3 +20,6 @@ RUN bash /install/ubuntu_install_java.sh COPY install/ubuntu_install_llvm.sh /install/ubuntu_install_llvm.sh RUN bash /install/ubuntu_install_llvm.sh + +COPY install/ubuntu_install_opengl.sh /install/ubuntu_install_opengl.sh +RUN bash /install/ubuntu_install_opengl.sh diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu index 9dff84e84635..b71b4cb118ec 100644 --- a/tests/ci_build/Dockerfile.gpu +++ b/tests/ci_build/Dockerfile.gpu @@ -37,6 +37,9 @@ RUN bash /install/ubuntu_install_nodejs.sh COPY install/ubuntu_install_rocm.sh /install/ubuntu_install_rocm.sh RUN bash /install/ubuntu_install_rocm.sh +COPY install/ubuntu_install_opengl.sh /install/ubuntu_install_opengl.sh +RUN bash /install/ubuntu_install_opengl.sh + # Enable doxygen for c++ doc build RUN apt-get install -y doxygen graphviz diff --git a/tests/ci_build/install/ubuntu_install_opengl.sh b/tests/ci_build/install/ubuntu_install_opengl.sh new file mode 100644 index 000000000000..f8be6e351581 --- /dev/null +++ b/tests/ci_build/install/ubuntu_install_opengl.sh @@ -0,0 +1,4 @@ +apt-get update --fix-missing + +apt-get install -y --no-install-recommends --force-yes \ + libgl1-mesa-dev libglfw3-dev \ No newline at end of file diff --git a/tests/python/unittest/test_runtime_ndarray.py b/tests/python/unittest/test_runtime_ndarray.py index 698f877d2504..5edf43337de5 100644 --- a/tests/python/unittest/test_runtime_ndarray.py +++ b/tests/python/unittest/test_runtime_ndarray.py @@ -7,7 +7,8 @@ def enabled_ctx_list(): ('cl', tvm.opencl(0)), ('metal', tvm.metal(0)), ('rocm', tvm.rocm(0)), - ('vpi', tvm.vpi(0))] + ('vpi', tvm.vpi(0)), + ('opengl', tvm.opengl(0))] for k, v in ctx_list: assert tvm.context(k, 0) == v ctx_list = [x[1] for x in ctx_list if x[1].exist] @@ -19,7 +20,8 @@ def enabled_ctx_list(): def test_nd_create(): for ctx in ENABLED_CTX_LIST: - for dtype in ["float32", "int8", "uint16"]: + for dtype in ["uint8", "int8", "uint16", "int16", "uint32", "int32", + "float32"]: x = np.random.randint(0, 10, size=(3, 4)) x = np.array(x, dtype=dtype) y = tvm.nd.array(x, ctx=ctx) diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index 757f2429ad32..70c2919f6fd0 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -17,3 +17,5 @@ TVM_FFI=cython python -m nose -v tests/python/integration || exit -1 TVM_FFI=ctypes python3 -m nose -v tests/python/integration || exit -1 TVM_FFI=cython python -m nose -v tests/python/contrib || exit -1 TVM_FFI=ctypes python3 -m nose -v tests/python/contrib || exit -1 +TVM_FFI=cython python -m nose -v tests/webgl || exit -1 +TVM_FFI=ctypes python3 -m nose -v tests/webgl || exit -1 diff --git a/tests/webgl/README.md b/tests/webgl/README.md new file mode 100644 index 000000000000..c9f2ae3d2272 --- /dev/null +++ b/tests/webgl/README.md @@ -0,0 +1,7 @@ +## Test cases for the WebGL backend + +Any test case with name `test_local_...` tests the C++ OpenGL backend on the +local OS, which can be executed automatically. + +Any test case with name `test_remote_...` tests the WebGL backend within the +browser, which must be run manually. 
See instruction within the test. diff --git a/tests/webgl/test_local_gemm.py b/tests/webgl/test_local_gemm.py new file mode 100644 index 000000000000..18d2d1d8bf34 --- /dev/null +++ b/tests/webgl/test_local_gemm.py @@ -0,0 +1,41 @@ +import tvm +import numpy as np + +def test_local_gemm(): + if not tvm.module.enabled("opengl"): + return + if not tvm.module.enabled("llvm"): + return + + nn = 2 + n = tvm.var('n') + n = tvm.convert(nn) + m = n + l = n + A = tvm.placeholder((n, l), name='A', dtype='int32') + B = tvm.placeholder((m, l), name='B', dtype='int32') + k = tvm.reduce_axis((0, l), name='k') + C = tvm.compute((n, m), lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k), + name='CC') + + s = tvm.create_schedule(C.op) + s[C].opengl() + print(tvm.lower(s, [A, B, C], simple_mode=True)) + + f = tvm.build(s, [A, B, C], "opengl", name="gemm") + print("------opengl code------") + print(f.imported_modules[0].get_source(fmt="gl")) + + ctx = tvm.opengl() + n, m, l = nn, nn, nn + a_np = np.random.uniform(low=0, high=10, size=(n, l)).astype(A.dtype) + b_np = np.random.uniform(low=0, high=10, size=(m, l)).astype(B.dtype) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) + f(a, b, c) + + np.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np.T)) + +if __name__ == "__main__": + test_local_gemm() diff --git a/tests/webgl/test_local_save_load.py b/tests/webgl/test_local_save_load.py new file mode 100644 index 000000000000..9dca0d3acfca --- /dev/null +++ b/tests/webgl/test_local_save_load.py @@ -0,0 +1,35 @@ +import numpy as np +import tvm +from tvm.contrib import rpc, util, emscripten + +def test_local_save_load(): + if not tvm.module.enabled("opengl"): + return + if not tvm.module.enabled("llvm"): + return + + n = tvm.var("n") + A = tvm.placeholder((n,), name='A', dtype='int32') + B = tvm.placeholder((n,), name='B', dtype='int32') + C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C") + s = tvm.create_schedule(C.op) + s[C].opengl() + + f = tvm.build(s, [A, B, C], "opengl", target_host="llvm", name="myadd") + + ctx = tvm.opengl(0) + n = 10 + a = tvm.nd.array(np.random.uniform(high=10, size=(n)).astype(A.dtype), ctx) + b = tvm.nd.array(np.random.uniform(high=10, size=(n)).astype(B.dtype), ctx) + c = tvm.nd.array(np.zeros((n), dtype=C.dtype), ctx) + f(a, b, c) + + temp = util.tempdir() + path_so = temp.relpath("myadd.so") + f.export_library(path_so) + f1 = tvm.module.load(path_so) + f1(a, b, c) + np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) + +if __name__ == "__main__": + test_local_save_load() diff --git a/tests/webgl/test_remote_save_load.py b/tests/webgl/test_remote_save_load.py new file mode 100644 index 000000000000..507616ee9f2e --- /dev/null +++ b/tests/webgl/test_remote_save_load.py @@ -0,0 +1,78 @@ +""" +The following instruction is based on web/README.md. + +Setup an RPC server: +$ python -m tvm.exec.rpc_proxy --example-rpc=1 + +Go to http://localhost:9190 in browser. + +Click "Connect To Proxy". + +Run this test script: +$ python tests/webgl/test_remote_save_load.py +""" + +import numpy as np +import tvm +from tvm.contrib import rpc, util, emscripten + +proxy_host = "localhost" +proxy_port = 9090 + +def try_remote_save_load(): + if not tvm.module.enabled("rpc"): + return + if not tvm.module.enabled("opengl"): + return + if not tvm.module.enabled("llvm"): + return + + # Build the module. 
+ n = tvm.var("n") + A = tvm.placeholder((n,), name='A') + B = tvm.placeholder((n,), name='B') + C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C") + s = tvm.create_schedule(C.op) + s[C].opengl() + target_host = "llvm -target=asmjs-unknown-emscripten -system-lib" + f = tvm.build(s, [A, B, C], "opengl", target_host=target_host, name="myadd") + + remote = rpc.connect(proxy_host, proxy_port, key="js") + + temp = util.tempdir() + ctx = remote.opengl(0) + path_obj = temp.relpath("myadd.bc") + path_dso = temp.relpath("myadd.js") + path_gl = temp.relpath("myadd.gl") + path_json = temp.relpath("myadd.tvm_meta.json") + + f.save(path_obj) + emscripten.create_js(path_dso, path_obj, side_module=True) + f.imported_modules[0].save(path_gl) + + remote.upload(path_dso, "myadd.dso") + remote.upload(path_gl) + remote.upload(path_json) + + remote.download("myadd.dso") + remote.download("myadd.gl") + remote.download("myadd.tvm_meta.json") + + print('Loading myadd.dso') + fhost = remote.load_module("myadd.dso") + + print('Loading myadd.gl') + fdev = remote.load_module("myadd.gl") + + print('import_module') + fhost.import_module(fdev) + + print('running...') + a = tvm.nd.array(np.random.uniform(size=16).astype(A.dtype), ctx) + b = tvm.nd.array(np.zeros(16, dtype=A.dtype), ctx) + c = tvm.nd.array(np.zeros(16, dtype=C.dtype), ctx) + fhost(a, b, c) + np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) + +if __name__ == "__main__": + try_remote_save_load() diff --git a/web/example_rpc.html b/web/example_rpc.html index bcccbef7358f..b23ecda8e017 100644 --- a/web/example_rpc.html +++ b/web/example_rpc.html @@ -36,5 +36,9 @@

Options

+ + diff --git a/web/tvm_runtime.js b/web/tvm_runtime.js index f69b1d57a11d..c23c1f9da796 100644 --- a/web/tvm_runtime.js +++ b/web/tvm_runtime.js @@ -686,7 +686,8 @@ var tvm_runtime = tvm_runtime || {}; 2 : "gpu", 4 : "opencl", 8 : "metal", - 9 : "vpi" + 9 : "vpi", + 11 : "opengl", }; var CTX_STR2MASK = { "cpu": 1, @@ -695,7 +696,8 @@ var tvm_runtime = tvm_runtime || {}; "cl": 4, "opencl": 4, "metal": 8, - "vpi": 9 + "vpi": 9, + "opengl": 11, }; TVMContext.prototype = { toString : function() { diff --git a/web/web_runtime.cc b/web/web_runtime.cc index 56538733025b..b8db8cce7006 100644 --- a/web/web_runtime.cc +++ b/web/web_runtime.cc @@ -18,6 +18,8 @@ #include "../src/runtime/rpc/rpc_event_impl.cc" #include "../src/runtime/rpc/rpc_server_env.cc" #include "../src/runtime/graph/graph_runtime.cc" +#include "../src/runtime/opengl/opengl_device_api.cc" +#include "../src/runtime/opengl/opengl_module.cc" namespace tvm { namespace contrib { From 8356388c86df2b324ca234e18746a37fa7c394dd Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 21 Jan 2018 15:17:26 -0800 Subject: [PATCH 095/948] Disable OpenGL test temporary (#801) --- tests/scripts/task_python_integration.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index 70c2919f6fd0..d10c9a6b127f 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -17,5 +17,7 @@ TVM_FFI=cython python -m nose -v tests/python/integration || exit -1 TVM_FFI=ctypes python3 -m nose -v tests/python/integration || exit -1 TVM_FFI=cython python -m nose -v tests/python/contrib || exit -1 TVM_FFI=ctypes python3 -m nose -v tests/python/contrib || exit -1 -TVM_FFI=cython python -m nose -v tests/webgl || exit -1 -TVM_FFI=ctypes python3 -m nose -v tests/webgl || exit -1 + +# Do not enabke OpenGL +# TVM_FFI=cython python -m nose -v tests/webgl || exit -1 +# TVM_FFI=ctypes python3 -m nose -v tests/webgl || exit -1 From b8128aabc7f960b37096a26f1d3fdb9dd78a1b83 Mon Sep 17 00:00:00 2001 From: Zhixun Tan Date: Mon, 22 Jan 2018 12:58:48 -0500 Subject: [PATCH 096/948] temporarily disable opengl in test_runtime_ndarray.py (#804) --- tests/python/unittest/test_runtime_ndarray.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/python/unittest/test_runtime_ndarray.py b/tests/python/unittest/test_runtime_ndarray.py index 5edf43337de5..a03a33b7af08 100644 --- a/tests/python/unittest/test_runtime_ndarray.py +++ b/tests/python/unittest/test_runtime_ndarray.py @@ -7,8 +7,7 @@ def enabled_ctx_list(): ('cl', tvm.opencl(0)), ('metal', tvm.metal(0)), ('rocm', tvm.rocm(0)), - ('vpi', tvm.vpi(0)), - ('opengl', tvm.opengl(0))] + ('vpi', tvm.vpi(0))] for k, v in ctx_list: assert tvm.context(k, 0) == v ctx_list = [x[1] for x in ctx_list if x[1].exist] From 9651bc9378d3eff2694d60bb927b7e583f6fe202 Mon Sep 17 00:00:00 2001 From: Siju Samuel Date: Mon, 22 Jan 2018 23:29:15 +0530 Subject: [PATCH 097/948] Update inject_virtual_thread.cc (#806) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This compilation warning is fixed. 
src/pass/inject_virtual_thread.cc:43:19: warning: ‘rw_mask’ may be used uninitialized in this function [-Wmaybe-uninitialized] if (rw_mask & 2) { ~~~~~~~~^~~ --- src/pass/inject_virtual_thread.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pass/inject_virtual_thread.cc b/src/pass/inject_virtual_thread.cc index bcf0e3d9fbaf..c0e7b4b3ff9e 100644 --- a/src/pass/inject_virtual_thread.cc +++ b/src/pass/inject_virtual_thread.cc @@ -32,7 +32,7 @@ class ExprTouched final : public IRVisitor { } void Visit_(const Call *op) final { if (op->is_intrinsic(intrinsic::tvm_access_ptr)) { - int rw_mask; + int rw_mask = 0; CHECK(arith::GetConstInt(op->args[4], &rw_mask)); const Variable* buffer_var = op->args[1].as(); CHECK(buffer_var); From 883df333a326f2ad0b2265411268c34b6f12745e Mon Sep 17 00:00:00 2001 From: Clouds Date: Tue, 23 Jan 2018 02:00:08 +0800 Subject: [PATCH 098/948] Update setup.py (#803) fix errors when running `python3 setup.py sdist bdist_wheel` --- python/setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/setup.py b/python/setup.py index 5a87325e9a1a..188938d76c3d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -18,11 +18,12 @@ from setuptools import setup from setuptools.extension import Extension +CURRENT_DIR = os.path.dirname(__file__) + def get_lib_path(): """Get library path, name and version""" # We can not import `libinfo.py` in setup.py directly since __init__.py # Will be invoked which introduces dependences - CURRENT_DIR = os.path.dirname(__file__) libinfo_py = os.path.join(CURRENT_DIR, './tvm/_ffi/libinfo.py') libinfo = {'__file__': libinfo_py} exec(compile(open(libinfo_py, "rb").read(), libinfo_py, 'exec'), libinfo, libinfo) @@ -126,4 +127,4 @@ def is_pure(self): os.remove("MANIFEST.in") for path in LIB_LIST: _, libname = os.path.split(path) - os.remove("tvm/%s" % LIB_NAME) + os.remove("tvm/%s" % libname) From 78e00410e5bdf5f4428e39285cac75592233a813 Mon Sep 17 00:00:00 2001 From: Siva Date: Mon, 22 Jan 2018 23:30:47 +0530 Subject: [PATCH 099/948] [DOC] Generalize the get_started script for beginners with different environments. (#798) --- tutorials/get_started.py | 66 ++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 26 deletions(-) diff --git a/tutorials/get_started.py b/tutorials/get_started.py index 3dce21a64950..2c6165940d05 100644 --- a/tutorials/get_started.py +++ b/tutorials/get_started.py @@ -13,6 +13,12 @@ import tvm import numpy as np +# Global declarations of environment. + +tgt_host="llvm" +# Change it to respective GPU if gpu is enabled Ex: cuda, opencl +tgt="cuda" + ###################################################################### # Vector Add Example # ------------------ @@ -88,8 +94,9 @@ # compute grid. These are GPU specific constructs that allows us # to generate code that runs on GPU. # -s[C].bind(bx, tvm.thread_axis("blockIdx.x")) -s[C].bind(tx, tvm.thread_axis("threadIdx.x")) +if tgt == "cuda": + s[C].bind(bx, tvm.thread_axis("blockIdx.x")) + s[C].bind(tx, tvm.thread_axis("threadIdx.x")) ###################################################################### # Compilation @@ -103,12 +110,12 @@ # function(including the inputs and outputs) as well as target language # we want to compile to. # -# The result of compilation fadd is a CUDA device function that can -# as well as a host wrapper that calls into the CUDA function. 
+# The result of compilation fadd is a GPU device function(if GPU is involved) +# that can as well as a host wrapper that calls into the GPU function. # fadd is the generated host wrapper function, it contains reference # to the generated device function internally. # -fadd_cuda = tvm.build(s, [A, B, C], "cuda", target_host="llvm", name="myadd") +fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd") ###################################################################### # Run the Function @@ -124,12 +131,13 @@ # - fadd runs the actual computation. # - asnumpy() copies the gpu array back to cpu and we can use this to verify correctness # -ctx = tvm.gpu(0) +ctx = tvm.context(tgt, 0) + n = 1024 a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) -fadd_cuda(a, b, c) +fadd(a, b, c) np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) ###################################################################### @@ -137,13 +145,16 @@ # -------------------------- # You can inspect the generated code in TVM. The result of tvm.build # is a tvm Module. fadd is the host module that contains the host wrapper, -# it also contains a device module for the CUDA function. +# it also contains a device module for the CUDA (GPU) function. # # The following code fetches the device module and prints the content code. # -dev_module = fadd_cuda.imported_modules[0] -print("-----CUDA code-----") -print(dev_module.get_source()) +if tgt == "cuda": + dev_module = fadd.imported_modules[0] + print("-----GPU code-----") + print(dev_module.get_source()) +else: + print(fadd.get_source()) ###################################################################### # .. note:: Code Specialization @@ -179,8 +190,9 @@ from tvm.contrib import util temp = util.tempdir() -fadd_cuda.save(temp.relpath("myadd.o")) -fadd_cuda.imported_modules[0].save(temp.relpath("myadd.ptx")) +fadd.save(temp.relpath("myadd.o")) +if tgt == "cuda": + fadd.imported_modules[0].save(temp.relpath("myadd.ptx")) cc.create_shared(temp.relpath("myadd.so"), [temp.relpath("myadd.o")]) print(temp.listdir()) @@ -201,8 +213,9 @@ # re-link them together. We can verify that the newly loaded function works. # fadd1 = tvm.module.load(temp.relpath("myadd.so")) -fadd1_dev = tvm.module.load(temp.relpath("myadd.ptx")) -fadd1.import_module(fadd1_dev) +if tgt == "cuda": + fadd1_dev = tvm.module.load(temp.relpath("myadd.ptx")) + fadd1.import_module(fadd1_dev) fadd1(a, b, c) np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) @@ -215,7 +228,7 @@ # them together with the host code. # Currently we support packing of Metal, OpenCL and CUDA modules. # -fadd_cuda.export_library(temp.relpath("myadd_pack.so")) +fadd.export_library(temp.relpath("myadd_pack.so")) fadd2 = tvm.module.load(temp.relpath("myadd_pack.so")) fadd2(a, b, c) np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) @@ -241,16 +254,17 @@ # The following codeblocks generate opencl code, creates array on opencl # device, and verifies the correctness of the code. 
# -fadd_cl = tvm.build(s, [A, B, C], "opencl", name="myadd") -print("------opencl code------") -print(fadd_cl.imported_modules[0].get_source()) -ctx = tvm.cl(0) -n = 1024 -a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) -b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) -c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) -fadd_cl(a, b, c) -np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) +if tgt == "opencl": + fadd_cl = tvm.build(s, [A, B, C], "opencl", name="myadd") + print("------opencl code------") + print(fadd_cl.imported_modules[0].get_source()) + ctx = tvm.cl(0) + n = 1024 + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + fadd_cl(a, b, c) + np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) ###################################################################### # Summary From 4077c9ff21353d19ed563dbf42512417984df00a Mon Sep 17 00:00:00 2001 From: Siju Samuel Date: Tue, 23 Jan 2018 09:35:30 +0530 Subject: [PATCH 100/948] [Compilation Warning Fix] comparison between signed and unsigned integer expressions (#807) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The compilation warning is fixed. src/runtime/graph/graph_runtime.cc:392:24: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] CHECK(data_byte_size == size) ~~~~~~~~~~~~~~~^~~~ /mnt/D_DRIVE/work/nnvm_22_Jan/nnvm_latest/tvm/dmlc-core/include/dmlc/logging.h:109:9: note: in definition of macro ‘CHECK’ if (!(x)) \ ^ --- src/runtime/graph/graph_runtime.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index ed833d40848c..d244fe5f028e 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -386,7 +386,7 @@ void GraphRuntime::LoadDLTensor(dmlc::Stream* strm, DLTensor* dst) { for (int i = 0; i < dst->ndim; ++i) { size *= dst->shape[i]; } - int64_t data_byte_size; + uint64_t data_byte_size; CHECK(strm->Read(&data_byte_size, sizeof(data_byte_size))) << "Invalid DLTensor file format"; CHECK(data_byte_size == size) From 16cedd504386e41871036e2157aad8f56b0c2d01 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 23 Jan 2018 12:08:29 -0800 Subject: [PATCH 101/948] [OPENCL] Fix 32bit pointer size in OpenCL runtime (#809) --- src/runtime/opencl/opencl_module.cc | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index 5a585a19cccf..bde7e6b27418 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -254,9 +254,14 @@ PackedFunc OpenCLModuleNode::GetFunction( for (size_t i = 0; i < info.arg_types.size(); ++i) { TVMType t = info.arg_types[i]; CHECK_EQ(t.lanes, 1U); - uint32_t bits = t.bits; - CHECK_EQ(bits % 8, 0U); - arg_size[i] = bits / 8; + if (t.code == kHandle) { + // specially store pointer type size in OpenCL driver + arg_size[i] = sizeof(void*); + } else { + uint32_t bits = t.bits; + CHECK_EQ(bits % 8, 0U); + arg_size[i] = bits / 8; + } } // initialize the wrapped func. 
f.Init(this, sptr_to_self, kid_map_.at(name), From 430b7ede756fc25d274ec72c81eeb830255b1b17 Mon Sep 17 00:00:00 2001 From: xqdan Date: Wed, 24 Jan 2018 06:07:14 +0800 Subject: [PATCH 102/948] fix #802, create cache based on sugar tensor (#808) --- src/schedule/schedule_dataflow_rewrite.cc | 6 +-- .../unittest/test_pass_storage_rewrite.py | 43 ++++++++++++++++++- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/src/schedule/schedule_dataflow_rewrite.cc b/src/schedule/schedule_dataflow_rewrite.cc index b58df9d0481f..59d425287be0 100644 --- a/src/schedule/schedule_dataflow_rewrite.cc +++ b/src/schedule/schedule_dataflow_rewrite.cc @@ -82,12 +82,12 @@ Tensor Schedule::cache_read(const Tensor& tensor, } os << "." << scope; - Tensor cache = compute(tensor->shape, [&tensor](const Array& i) { - return tensor(Array(i.begin(), i.end())); - }, os.str()); std::unordered_map vsub; Stage s = operator[](tensor->op); Tensor sugar_tensor = s->op.output(tensor->value_index); + Tensor cache = compute(sugar_tensor->shape, [&sugar_tensor](const Array& i) { + return sugar_tensor(Array(i.begin(), i.end())); + }, os.str()); vsub[sugar_tensor] = cache; std::unordered_map vmap; diff --git a/tests/python/unittest/test_pass_storage_rewrite.py b/tests/python/unittest/test_pass_storage_rewrite.py index 1e4dda684eb3..d044db12686f 100644 --- a/tests/python/unittest/test_pass_storage_rewrite.py +++ b/tests/python/unittest/test_pass_storage_rewrite.py @@ -171,7 +171,47 @@ def test_parallel_alloc(): assert(isinstance(body.body.body.body.body, tvm.stmt.Allocate)) - +def test_inplace_rule2(): + #Test Buffer + scope_tb = "local_TB" + @tvm.register_func("tvm.info.mem.%s" % scope_tb) + def mem_info_inp_buffer(): + return tvm.make.node("MemoryInfo", + unit_bits= 16, + max_simd_bits=32, + max_num_bits=1024*1024*1024, + head_address=None) + m = 10 + A = tvm.placeholder((m,), name='A') + C = tvm.placeholder((m,), name='C') + D = tvm.placeholder((m,), name='D') + A0 = tvm.compute((m,), lambda i: A[i] + C[i], name='A0') + A1 = tvm.compute((m,), lambda i: D[i] * D[i], name='A1') + A2 = tvm.compute((m,), lambda i: A0[i] + A1[i], name='A2') + B = tvm.compute((m,), lambda i: A2[i], name='B') + s = tvm.create_schedule(B.op) + A0L = s.cache_read(A0, scope_tb, [A2]) + A1L = s.cache_read(A1, scope_tb, [A2]) + A2L = s.cache_read(A2, scope_tb, [B]) + bounds = tvm.schedule.InferBound(s) + assert isinstance(bounds, tvm.container.Map) + stmt = tvm.schedule.ScheduleOps(s, bounds) + Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') + Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') + Cc = tvm.decl_buffer(C.shape, B.dtype, name='C') + Dd = tvm.decl_buffer(D.shape, B.dtype, name='D') + stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb, C: Cc, D:Dd}, 64) + stmt = tvm.ir_pass.CanonicalSimplify(stmt) + stmt = tvm.ir_pass.Simplify(stmt) + stmt = tvm.ir_pass.StorageRewrite(stmt) + # verify only have one allocations. 
+ # verify inplace folding works + num_alloc = [0] + def verify(n): + if isinstance(n, tvm.stmt.Allocate): + num_alloc[0] += 1 + tvm.ir_pass.PostOrderVisit(stmt, verify) + assert num_alloc[0] == 2 if __name__ == "__main__": test_alloc_seq() @@ -180,3 +220,4 @@ def test_parallel_alloc(): test_parallel_alloc() test_storage_combine() test_storage_share_gpu() + test_inplace_rule2() From 9111082625456c07ad317b953c04ea989d8ad72d Mon Sep 17 00:00:00 2001 From: yuruofeifei Date: Tue, 23 Jan 2018 14:32:37 -0800 Subject: [PATCH 103/948] Improve gemm tutorial (#800) --- tutorials/optimize/opt_gemm.py | 272 ++++++++++++++++++++++----------- 1 file changed, 182 insertions(+), 90 deletions(-) diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py index 9a4264c9d05a..715408380b15 100644 --- a/tutorials/optimize/opt_gemm.py +++ b/tutorials/optimize/opt_gemm.py @@ -1,7 +1,8 @@ """ How to optimize GEMM on CPU =========================== -**Author**: `Jian Weng `_ +**Author**: `Jian Weng `_, \ + `Ruofei Yu `_ (TL;DR) TVM provides abstract interfaces which allows users to depict an algorithm and the algorithm's implementing organization (the so-called schedule) separately. Typically, writing @@ -10,7 +11,7 @@ try these schedules efficiently to enhance the performance. In this tutorial, we will demonstrate how to use TVM to optimize square matrix multiplication -and achieve 100 times faster than baseline by simply adding 6 extra lines of code. +and achieve 200 times faster than baseline by simply adding 18 extra lines of code. There are two important optmizations on intense computation applications executed on CPU: 1. Increase the cache hit rate of memory access. Both complex numerical computation and hot-spot @@ -26,36 +27,46 @@ abstraction automatically, but some of them cannot be simply applied due to TVM constraints. All the experiment results mentioned below, are executed on 2015's 15' MacBook equiped with -Intel i7-4770QH CPU. The cache line size should be 64 bytes for all the x86 CPUs. +Intel i7-4770HQ CPU. The cache line size should be 64 bytes for all the x86 CPUs. """ -############################################################################### +################################################################################################ # Preparation and Baseline # ------------------------ -# In this tutorial we assume all the matrix tensors are square and fix-bounded. -# We use 1024x1024 float32 matrix in demonstration. Before actually demonstrating, -# we first define these variables. Then we write a baseline implementation, -# the simplest way to write a matrix mulplication in TVM. -# +# In this tutorial, we will demo how to use TVM to optimize matrix multiplication. +# Before actually demonstrating, we first define these variables. +# Then we write a baseline implementation, the simplest way to write a matrix multiplication in TVM. import tvm import numpy import timeit -# The size of the square matrix +# The size of the matrix +# (M, K) x (K, N) +# You are free to try out different shapes, sometimes TVM optimization outperforms numpy with MKL. 
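# (A quick point of reference for the timing numbers reported in this tutorial:
#  one (M, K) x (K, N) matmul performs 2 * M * K * N floating point operations,
#  so with the default 1024 x 1024 x 1024 shapes defined below each call is
#  roughly 2.15 GFLOP. The one-liner below only illustrates that arithmetic.)
print("GFLOP per matmul: %.2f" % (2.0 * 1024 * 1024 * 1024 / 1e9))  # -> 2.15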
+M = 1024 +K = 1024 N = 1024 + # The default tensor type in tvm dtype = "float32" + +# using Intel AVX2(Advanced Vector Extensions) ISA for SIMD +target = 'llvm -mcpu=core-avx2' +ctx = tvm.context(target, 0) + # Random generated tensor for testing -a = tvm.nd.array(numpy.random.rand(N, N).astype(dtype), tvm.cpu(0)) -b = tvm.nd.array(numpy.random.rand(N, N).astype(dtype), tvm.cpu(0)) +a = tvm.nd.array(numpy.random.rand(M, K).astype(dtype), ctx) +b = tvm.nd.array(numpy.random.rand(K, N).astype(dtype), ctx) np_repeat = 100 np_runing_time = timeit.timeit(setup='import numpy\n' - 'N = 1024\n' + 'M = ' + str(M) + '\n' + 'K = ' + str(K) + '\n' + 'N = ' + str(N) + '\n' 'dtype = "float32"\n' - 'a = numpy.random.rand(N, N).astype(dtype)\n' - 'b = numpy.random.rand(N, N).astype(dtype)\n', + 'a = numpy.random.rand(M, K).astype(dtype)\n' + 'b = numpy.random.rand(K, N).astype(dtype)\n', stmt='answer = numpy.dot(a, b)', number=np_repeat) print("Numpy running time: %f" % (np_runing_time / np_repeat)) @@ -63,24 +74,24 @@ answer = numpy.dot(a.asnumpy(), b.asnumpy()) # Algorithm -k = tvm.reduce_axis((0, N), 'k') -A = tvm.placeholder((N, N), name = 'A') -B = tvm.placeholder((N, N), name = 'B') +k = tvm.reduce_axis((0, K), 'k') +A = tvm.placeholder((M, K), name='A') +B = tvm.placeholder((K, N), name='B') C = tvm.compute( - A.shape, - lambda x, y: tvm.sum(A[x, k] * B[k, y], axis = k), - name = 'C') + (M, N), + lambda x, y: tvm.sum(A[x, k] * B[k, y], axis=k), + name='C') # Default schedule s = tvm.create_schedule(C.op) -func = tvm.build(s, [A, B, C], name = 'mmult') +func = tvm.build(s, [A, B, C], target=target, name='mmult') assert func -c = tvm.nd.array(numpy.zeros((N, N), dtype = dtype), tvm.cpu(0)) +c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx) func(a, b, c) numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) -evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number=1) +evaluator = func.time_evaluator(func.entry_name, ctx, number=1) print('Baseline: %f' % evaluator(a, b, c).mean) ################################################################################################ @@ -92,27 +103,32 @@ ################################################################################################ # Blocking # -------- -# A important trick to enhance the cache hit rate is blocking --- data chunck will be computed +# A important trick to enhance the cache hit rate is blocking --- data chunk will be computed # block by block. The memory access inside the block is a small neighbourhood which is with high # memory locality. In this tutorial, I picked up 32 as the blocking factor. So the block will # fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB (L1 data cache) bn = 32 s = tvm.create_schedule(C.op) + # Blocking by loop tiling xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) +k, = s[C].op.reduce_axis +ko, ki = s[C].split(k, factor=4) + # Hoist reduction domain outside the blocking loop -s[C].reorder(xo, yo, k, xi, yi) -func = tvm.build(s, [A, B, C], name = 'mmult') +s[C].reorder(xo, yo, ko, ki, xi, yi) + +func = tvm.build(s, [A, B, C], target=target, name='mmult') assert func -c = tvm.nd.array(numpy.zeros((N, N), dtype = dtype), tvm.cpu(0)) +c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx) func(a, b, c) numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) -# By simply tiling the loop 32x32, and hoisting k outside the blocking loops, we can see big -# speedup compared with the baseline. 
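# (For intuition, the blocked loop nest described by the schedule above corresponds
#  roughly to the plain-numpy sketch below. The shapes are shrunk to 64 and the
#  inner xi / yi loops are written as whole-block slices, so this is only an
#  illustration of the traversal order, not the code TVM actually generates.)
import numpy

m = k = n = 64
bn_ = 32
a_ = numpy.random.rand(m, k).astype("float32")
b_ = numpy.random.rand(k, n).astype("float32")
c_ = numpy.zeros((m, n), dtype="float32")
for xo in range(m // bn_):
    for yo in range(n // bn_):
        for ko in range(k // 4):
            for ki in range(4):
                kk = ko * 4 + ki
                # the xi / yi loops become one rank-1 update of a 32 x 32 block
                c_[xo*bn_:(xo+1)*bn_, yo*bn_:(yo+1)*bn_] += numpy.outer(
                    a_[xo*bn_:(xo+1)*bn_, kk], b_[kk, yo*bn_:(yo+1)*bn_])
numpy.testing.assert_allclose(c_, a_.dot(b_), rtol=1e-4)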
-evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number=5) +# By simply tiling the loop 32x32, and hoisting ko, ki outside the blocking loops, +# we can see big speedup compared with the baseline. +evaluator = func.time_evaluator(func.entry_name, ctx, number=10) print('Opt1: %f' % evaluator(a, b, c).mean) ################################################################################################ @@ -120,6 +136,73 @@ print(tvm.lower(s, [A, B, C], simple_mode=True)) +################################################################################################### +# Vectorization +# ------------- +# Another important trick is vectorization. When the memory access pattern is uniform, +# the compiler can detect this pattern and pass the continuous memory to vector processor. In TVM, +# we can use `vectorize` interface to hint the compiler this pattern, so that we can accelerate it vastly. +# +# In this tutorial, we chose to vectorize the inner loop row data since it is cache friendly. + +s = tvm.create_schedule(C.op) +xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) +k, = s[C].op.reduce_axis +ko, ki = s[C].split(k, factor=4) + +s[C].reorder(xo, yo, ko, ki, xi, yi) + +# Vectorization +s[C].vectorize(yi) + +func = tvm.build(s, [A, B, C], target=target, name='mmult') +assert func + +c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx) +func(a, b, c) +numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) + +evaluator = func.time_evaluator(func.entry_name, ctx, number=10) +print('Opt2: %f' % evaluator(a, b, c).mean) + +################################################################################################ +# Here is the generated IR after vectorization. + +print(tvm.lower(s, [A, B, C], simple_mode=True)) + +################################################################################################### +# Loop Permutation +# ------------- +# If we look at the above IR, we can see the inner loop row data is vectorized and +# B is transformed into PackedB. The traversal of PackedB is sequential now. +# So we will look at the access pattern of A. In current schedule, A is accessed column by column +# which is not cache friendly. If we change the nested loop order of ki and inner axes xi, +# the access pattern for A matrix is more cache friendly. + +s = tvm.create_schedule(C.op) +xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) +k, = s[C].op.reduce_axis +ko, ki = s[C].split(k, factor=4) + +# re-ordering +s[C].reorder(xo, yo, ko, xi, ki, yi) +s[C].vectorize(yi) + +func = tvm.build(s, [A, B, C], target=target, name='mmult') +assert func + +c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx) +func(a, b, c) +numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) + +evaluator = func.time_evaluator(func.entry_name, ctx, number=10) +print('Opt3: %f' % evaluator(a, b, c).mean) + +################################################################################################ +# Here is the generated IR after loop permutation. + +print(tvm.lower(s, [A, B, C], simple_mode=True)) + ################################################################################################### # Array Packing # ------------- @@ -142,88 +225,82 @@ # # We have to re-write the algorithm slightly. 
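# (The packedB layout introduced just below stores B[y, x * bn + z] at
#  packedB[x, y, z]; a few lines of plain numpy can sanity-check the same
#  transform. The small shapes here are chosen only for illustration.)
import numpy

kdim, ndim, bn_ = 8, 8, 4
b_ = numpy.arange(kdim * ndim, dtype="float32").reshape(kdim, ndim)
packed_b = b_.T.reshape(ndim // bn_, bn_, kdim).transpose(0, 2, 1)
for x in range(ndim // bn_):
    for y in range(kdim):
        for z in range(bn_):
            assert packed_b[x, y, z] == b_[y, x * bn_ + z]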
-packedB = tvm.compute((N / bn, N, bn), lambda x, y, z: B[y, x * bn + z], name = 'packedB') -C = tvm.compute(A.shape, - lambda x, y: tvm.sum(A[x, k] * packedB[y / bn, k, y % bn], axis = k), +packedB = tvm.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name='packedB') +C = tvm.compute((M, N), + lambda x, y: tvm.sum(A[x, k] * packedB[y / bn, k, y % bn], axis=k), name = 'C') s = tvm.create_schedule(C.op) + xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) -s[C].reorder(xo, yo, k, xi, yi) +k, = s[C].op.reduce_axis +ko, ki = s[C].split(k, factor=4) -func = tvm.build(s, [A, B, C], name = 'mmult') +s[C].reorder(xo, yo, ko, xi, ki, yi) +s[C].vectorize(yi) + +x, y, z = s[packedB].op.axis +s[packedB].vectorize(z) +s[packedB].parallel(x) + +func = tvm.build(s, [A, B, C], target=target, name='mmult') assert func -c = tvm.nd.array(numpy.zeros((N, N), dtype = dtype), tvm.cpu(0)) +c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx) func(a, b, c) numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) -evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number=5) -print('Opt2: %f' % evaluator(a, b, c).mean) +evaluator = func.time_evaluator(func.entry_name, ctx, number=10) +print('Opt4: %f' % evaluator(a, b, c).mean) ################################################################################################ # Here is the generated IR after array packing. print(tvm.lower(s, [A, B, C], simple_mode=True)) -################################################################################################### -# Vectorization -# ------------- -# Another important trick is vectorization. When the memory access pattern is uniform, -# the compiler can detect this pattern and pass the continuous memory to vector processor. In TVM, -# we can use `vectorize` interface to hint the compiler this pattern, so that we can accelerate it vastly. +################################################################################################ +# Write cache for blocks +# -------- +# After blocking, the program will write result to C block by block, the access pattern +# is not sequential. So we can use a sequential cache array to hold the block results and +# write to C when all the block results are ready. # -# In this tutorial, we chose to vectorize the inner loop row data since it is cache friendly. s = tvm.create_schedule(C.op) -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) -s[C].reorder(xo, yo, k, xi, yi) -# Vectorization -s[C].vectorize(yi) -func = tvm.build(s, [A, B, C], name = 'mmult') -assert func - -c = tvm.nd.array(numpy.zeros((N, N), dtype = dtype), tvm.cpu(0)) -func(a, b, c) -numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) +# Allocate write cache +CC = s.cache_write(C, 'global') -evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number=5) -print('Opt3: %f' % evaluator(a, b, c).mean) +xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) -################################################################################################ -# Here is the generated IR after vectorization. +# Write cache is computed at yo +s[CC].compute_at(s[C], yo) -print(tvm.lower(s, [A, B, C], simple_mode=True)) +# New inner axes +xc, yc = s[CC].op.axis -################################################################################################### -# Loop Permutation -# ------------- -# If we look at the above IR, we can see the inner loop row data is vectorized and -# B is transformed into PackedB. 
The traversal of PackedB is sequential now. -# So we will look at the access pattern of A. In current schedule, A is accessed column by column -# which is not cache friendly. If we change the nested loop order of k and inner row index xi, -# the access pattern for A matrix is more cache friendly. +k, = s[CC].op.reduce_axis +ko, ki = s[CC].split(k, factor=4) +s[CC].reorder(ko, xc, ki, yc) +s[CC].unroll(ki) +s[CC].vectorize(yc) -s = tvm.create_schedule(C.op) -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) -s[C].reorder(xo, yo, xi, k, yi) +x, y, z = s[packedB].op.axis +s[packedB].vectorize(z) +s[packedB].parallel(x) -# Vectorization -s[C].vectorize(yi) - -func = tvm.build(s, [A, B, C], name = 'mmult') +func = tvm.build(s, [A, B, C], target=target, name='mmult') assert func -c = tvm.nd.array(numpy.zeros((N, N), dtype = dtype), tvm.cpu(0)) +c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx) func(a, b, c) numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) -evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number=5) -print('Opt4: %f' % evaluator(a, b, c).mean) +evaluator = func.time_evaluator(func.entry_name, ctx, number=10) +print('Opt5: %f' % evaluator(a, b, c).mean) ################################################################################################ -# Here is the generated IR after loop permutation. +# Here is the generated IR after blocking. print(tvm.lower(s, [A, B, C], simple_mode=True)) @@ -233,23 +310,38 @@ # Futhermore, we can also utilize multi-core processors to do the thread-level parallelization. s = tvm.create_schedule(C.op) + +CC = s.cache_write(C, 'global') + xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) -s[C].reorder(xo, yo, xi, k, yi) -s[C].vectorize(yi) + +s[CC].compute_at(s[C], yo) + +xc, yc = s[CC].op.axis + +k, = s[CC].op.reduce_axis +ko, ki = s[CC].split(k, factor=4) +s[CC].reorder(ko, xc, ki, yc) +s[CC].unroll(ki) +s[CC].vectorize(yc) # parallel s[C].parallel(xo) -func = tvm.build(s, [A, B, C], name = 'mmult') +x, y, z = s[packedB].op.axis +s[packedB].vectorize(z) +s[packedB].parallel(x) + +func = tvm.build(s, [A, B, C], target=target, name = 'mmult') assert func -c = tvm.nd.array(numpy.zeros((N, N), dtype = dtype), tvm.cpu(0)) +c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx) func(a, b, c) numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) -evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number=50) -opt5_time = evaluator(a, b, c).mean -print('Opt5: %f' % opt5_time) +evaluator = func.time_evaluator(func.entry_name, ctx, number=50) +opt6_time = evaluator(a, b, c).mean +print('Opt6: %f' % opt6_time) ################################################################################################ # Here is the generated IR after parallelization. @@ -261,8 +353,8 @@ ################################################################################################## # Summary # ------- -# After applying the above simple optimizations with only 6 lines of code, -# our generated code can achieve 30% of the `numpy` performance with Apple implemented BLAS. -# Note that the outputs on the webpage reflect the running times on a non-exclusive +# After applying the above simple optimizations with only 18 lines of code, +# our generated code can achieve 60% of the `numpy` performance with MKL. +# Note that the outputs on the web page reflect the running times on a non-exclusive # Docker container, thereby they are *unreliable*. 
It is highly encouraged to run the # tutorial by yourself to observe the performance gain acheived by TVM. From ebb67c42701b9808d1ad7ada51d3eb48dd3afcc5 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 23 Jan 2018 15:41:30 -0800 Subject: [PATCH 104/948] fix gemm tutorial for env that may not have right instruction (#810) --- tutorials/optimize/opt_gemm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py index 715408380b15..f1060bf46f45 100644 --- a/tutorials/optimize/opt_gemm.py +++ b/tutorials/optimize/opt_gemm.py @@ -52,7 +52,9 @@ dtype = "float32" # using Intel AVX2(Advanced Vector Extensions) ISA for SIMD -target = 'llvm -mcpu=core-avx2' +# To get the best performance, please change the following line +# to llvm -mcpu=core-avx2, or specific type of CPU you use +target = 'llvm' ctx = tvm.context(target, 0) # Random generated tensor for testing From 814b46dccb3c340be7611d113a303a546f7b2b2a Mon Sep 17 00:00:00 2001 From: libing4752 Date: Wed, 24 Jan 2018 10:35:59 +0800 Subject: [PATCH 105/948] [PASS] enhance storage_rewrite to support different dtypes for unified buffer (#805) * modified schedule_dataflow_rewrite.cc to fix losing tensor problem * modified schedule_dataflow_rewrite.cc for lint scan * modified schedule_dataflow_rewrite.cc for lint scan * using tensor's value_index to index output of stage op * repare address offset for different kinds of dtype * bc * aaa * aaaaa * repare address for different dtypes * remove nonsense files * add whitespace of line 581 * use base alloc elem_type * enhance the testcast of basic buffer is 64bits,32bits,16bits,8bits * use extends[0]->type() as dtype of offset * clear program writes --- src/pass/storage_rewrite.cc | 26 +++++----- .../unittest/test_pass_storage_rewrite.py | 48 ++++++++++++++++++- 2 files changed, 60 insertions(+), 14 deletions(-) diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc index 7215c3f97a43..f052a9b05b90 100644 --- a/src/pass/storage_rewrite.cc +++ b/src/pass/storage_rewrite.cc @@ -576,33 +576,33 @@ class StoragePlanRewriter : public IRMutator { // allocate with element type. 
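     // Rough worked example of the bit-based accounting used below (ignoring any
     // SIMD alignment padding): merging a 1024-element float16 buffer (16384 bits)
     // with a 1024-element int32 child (32768 bits) gives total_bits = 49152. The
     // child's elem_offset is expressed in its own elements, 16384 / 32 = 512, and
     // the final Allocate extent is rounded up to whole base-type elements,
     // (49152 + 16 - 1) / 16 = 3072 float16 elements.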
CHECK_NE(e->const_nbits, 0U); MemoryInfo info = GetMemoryInfo(e->scope.to_string()); + uint64_t total_bits = e->const_nbits; size_t align = 1; if (info.defined()) { - align = (info->max_simd_bits + e->elem_type.bits() - 1) / e->elem_type.bits(); + align = info->max_simd_bits; } - uint64_t total_elem = e->const_nbits / e->elem_type.bits(); - if (total_elem % align != 0) { - total_elem += align - (total_elem % align); + if (total_bits % align != 0) { + total_bits += align - (total_bits % align); } e->alloc_var = e->allocs[0]->buffer_var; for (StorageEntry* child : e->merged_children) { - CHECK_NE(e->const_nbits, 0U); - CHECK_NE(total_elem, 0U); - size_t num_elem = child->const_nbits / child->elem_type.bits(); - child->elem_offset = total_elem; + CHECK_NE(child->const_nbits, 0U); + CHECK_NE(total_bits, 0U); + child->elem_offset = total_bits / child->elem_type.bits(); child->alloc_var = e->alloc_var; - total_elem += num_elem; - if (total_elem % align != 0) { - total_elem += align - (total_elem % align); + total_bits += child->const_nbits; + if (total_bits % align != 0) { + total_bits += align - (total_bits % align); } } + uint64_t type_bits = e->elem_type.bits() * e->elem_type.lanes(); Expr alloc_size = make_const(e->allocs[0]->extents[0].type(), - total_elem); + (total_bits + type_bits - 1) / type_bits); e->new_alloc = Allocate::make( e->alloc_var, e->elem_type, {alloc_size}, const_true(), Evaluate::make(0)); if (info.defined()) { - CHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits) + CHECK_LE(total_bits, info->max_num_bits) << "Allocation exceed bound of memory tag " << e->scope.to_string(); } } diff --git a/tests/python/unittest/test_pass_storage_rewrite.py b/tests/python/unittest/test_pass_storage_rewrite.py index d044db12686f..6b6ff71810bb 100644 --- a/tests/python/unittest/test_pass_storage_rewrite.py +++ b/tests/python/unittest/test_pass_storage_rewrite.py @@ -49,6 +49,52 @@ def verify(n): tvm.ir_pass.PostOrderVisit(body, verify) assert num_alloc[0] == 1 +def test_alloc_different_dtypes(): + def stmt_generater(dtype_list, length): + ib = tvm.ir_builder.create() + base_dtype = dtype_list[0] + global_a = tvm.placeholder((length,), name = "global_a", dtype = base_dtype) + for index, dtype in enumerate(dtype_list): + with ib.for_range(0, length, name="j") as j: + A = ib.allocate(dtype, length, name="A_" + str(index), scope="local.L0A") + A[j] = tvm.const(1, dtype = dtype) + return ib.get() + + def dtype_bit_len(dtype): + index = 0 + for i in dtype: + if i.isdigit(): + break + index += 1 + return int(dtype[index:]) + + def offset_generater(dtype_list, length): + dtype_len_list = [dtype_bit_len(i) for i in dtype_list] + base_len = dtype_len_list[0] + return sum([i * length / base_len for i in dtype_len_list]) + + def dtype_test(dtype_list, length): + def verify(n): + if isinstance(n, tvm.stmt.Allocate): + assert n.extents[0].value == offset + + body = stmt_generater(dtype_list, length) + offset = offset_generater(dtype_list, length) + body = tvm.ir_pass.StorageRewrite(body) + tvm.ir_pass.PostOrderVisit(body, verify) + + length = 1024 + dtype_list = ["float16", "int32", "uint16", "int8"] + dtype_test(dtype_list, length) + + dtype_list = ["float32", "int32", "uint16", "int8"] + dtype_test(dtype_list, length) + + dtype_list = ["float64", "int32", "uint16", "int8"] + dtype_test(dtype_list, length) + + dtype_list = ["int8", "int32", "uint16", "uint8"] + dtype_test(dtype_list, length) def test_inplace_rule(): @@ -91,7 +137,6 @@ def test_storage_combine(): s = tvm.create_schedule(B.op) 
for S in stages[:-1]: s[S].set_scope("global:tag") - bounds = tvm.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) stmt = tvm.schedule.ScheduleOps(s, bounds) @@ -215,6 +260,7 @@ def verify(n): if __name__ == "__main__": test_alloc_seq() + test_alloc_different_dtypes() test_inplace_rule() test_storage_share() test_parallel_alloc() From 4627c17e3030fe24f70ae961c3c4d5b7fa77fd2b Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 24 Jan 2018 13:59:30 -0800 Subject: [PATCH 106/948] [CODE COMMENT] Comment BindBufferScope (#815) --- src/pass/storage_flatten.cc | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/src/pass/storage_flatten.cc b/src/pass/storage_flatten.cc index 45ad081f86cb..bc380d473791 100644 --- a/src/pass/storage_flatten.cc +++ b/src/pass/storage_flatten.cc @@ -296,7 +296,40 @@ class StorageFlattener : public IRMutator { } private: - // Start bind + // The specific tensor data layout is not determined before + // StorageFlatten pass. We use buffer_bind_scope + // to specify before hand we want to bind a subregion + // of tensor to a symbolic buffer, which get used in extern. + // + // Example: + // + // realize A in range [i*4, extent=10) { + // bind Ab to A in [i*4+1, extent=4) { + // call_func(Ab.ptr, Ab.shape[0]) + // } + // } + // + // After StorageFlatten + // + // alloc A[10] + // call(A + 1, 4) + // + // Buffer is a protocol to declare specific + // data layout and shape we expect. + // So this function need to check: + // - If the bind range is within the realize range + // - If we can match the requirement of buffer + // - Remap variables such as Ab.ptr to the actual value. + // + // Here are a few possible failure cases: + // - Buffer is declared to have constant shape, + // but we try to bind it to a different one. + // - Buffer is declared to be compact(no strides) + // but this binded region is a subregion of + // a matrix(tensor), which means it requires strides. + // + // We do support a few relaxed case, such as bindingx + // region with shape [1, 1, n, m] to buffer with shape [n, m] Stmt HandleBufferBindScope(const AttrStmt* op) { Array arr(op->node.node_); CHECK_EQ(arr.size(), 2U); From 225c8064e58e21d6ac4aa516b2e57efb0adcb324 Mon Sep 17 00:00:00 2001 From: Mu Li Date: Wed, 24 Jan 2018 14:00:08 -0800 Subject: [PATCH 107/948] Update cross_compilation_and_rpc.py (#816) --- .../deployment/cross_compilation_and_rpc.py | 40 ++++++++++++++----- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/tutorials/deployment/cross_compilation_and_rpc.py b/tutorials/deployment/cross_compilation_and_rpc.py index bfd77f287336..b31263f9ed44 100644 --- a/tutorials/deployment/cross_compilation_and_rpc.py +++ b/tutorials/deployment/cross_compilation_and_rpc.py @@ -14,11 +14,6 @@ In this tutorial, I will take Raspberry Pi as our target platform for example. """ -from __future__ import absolute_import, print_function - -import tvm -import numpy as np -from tvm.contrib import rpc, util ###################################################################### # Build TVM Runtime on Device @@ -27,6 +22,12 @@ # There're some prerequisites: similar as compiling TVM on your # local machine, we need build runtime on remote device. # +# .. note:: +# +# All instructions in both this section and next section should be +# executed on the target device, e.g. Raspberry Pi. And we assume it +# has Linux running. +# # To get started, clone tvm repo from github. 
It is important to clone # the submodules along, with --recursive option (Assuming you are in # your home directory): @@ -106,8 +107,19 @@ ###################################################################### # Declare and Cross Compile Kernel on Local Machine # ------------------------------------------------- -# Here we will declare a simple kernel with TVM on the local machine: # +# .. note:: +# +# Now we back to the local machine, which has a full TVM installed. +# +# Here we will declare a simple kernel with TVM on the local machine: + +from __future__ import absolute_import, print_function + +import tvm +import numpy as np +from tvm.contrib import rpc, util + n = tvm.convert(1024) A = tvm.placeholder((n,), name='A') B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') @@ -136,7 +148,8 @@ # Pi. Here we use :code:`'llvm'` directly to make the tutorial runable. # # Usually, you can query the target by execute :code:`gcc -v` on your -# device, although it may be still a loose configuration. +# device, and look for the line starting with :code:`Target:` +# (Though it may be still a loose configuration.) # # Besides :code:`-target`, you can also set other compilation options # like: @@ -171,9 +184,15 @@ # Run CPU Kernel Remotely by RPC # ------------------------------ # Here we will show you how to run the kernel on the remote device: +# +# .. note:: +# In order to have this tutorial runs locally to build the nice HTML, we +# start a RPC server on the local machine. You can ignore it if you already +# started the server on the target device. And then change host IP properly. -# replace host with the ip address of your device -host = '0.0.0.0' +# Can be ignored if you already started the RPC server +server = rpc.Server(host='0.0.0.0', port=9090, use_popen=True) +host = '0.0.0.0' # Change to your target device IP port = 9090 # connect the remote device remote = rpc.connect(host, port) @@ -213,9 +232,12 @@ # on remote compiler to re-link them. # # .. note:: +# # Raspberry Pi does not support OpenCL, the following code is tested on # Firefly-RK3399. The target_host should be 'llvm -target=aarch64-linux-gnu'. # But here we set 'llvm' to enable this tutorial to run locally. +# +# Also we need to build the runtime with the flag `USE_OPENCL=1`. # build kernel (different from cpu, we need bind axis for OpenCL) s = tvm.create_schedule(B.op) From c2dd9273e9dc47d92238876385e164af4b81c50d Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 24 Jan 2018 16:37:47 -0800 Subject: [PATCH 108/948] fix rpc tutorial (#818) --- .../deployment/cross_compilation_and_rpc.py | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/tutorials/deployment/cross_compilation_and_rpc.py b/tutorials/deployment/cross_compilation_and_rpc.py index b31263f9ed44..eec2f9aeb34b 100644 --- a/tutorials/deployment/cross_compilation_and_rpc.py +++ b/tutorials/deployment/cross_compilation_and_rpc.py @@ -23,11 +23,11 @@ # local machine, we need build runtime on remote device. # # .. note:: -# -# All instructions in both this section and next section should be -# executed on the target device, e.g. Raspberry Pi. And we assume it -# has Linux running. -# +# +# All instructions in both this section and next section should be +# executed on the target device, e.g. Raspberry Pi. And we assume it +# has Linux running. +# # To get started, clone tvm repo from github. 
It is important to clone # the submodules along, with --recursive option (Assuming you are in # your home directory): @@ -102,6 +102,12 @@ # same machine, for demonstration. This line can be omitted if we # started an remote server. # +from __future__ import absolute_import, print_function + +import tvm +import numpy as np +from tvm.contrib import rpc, util + server = rpc.Server(host='0.0.0.0', port=9090, use_popen=True) ###################################################################### @@ -110,16 +116,10 @@ # # .. note:: # -# Now we back to the local machine, which has a full TVM installed. +# Now we back to the local machine, which has a full TVM installed. # # Here we will declare a simple kernel with TVM on the local machine: -from __future__ import absolute_import, print_function - -import tvm -import numpy as np -from tvm.contrib import rpc, util - n = tvm.convert(1024) A = tvm.placeholder((n,), name='A') B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') @@ -148,7 +148,7 @@ # Pi. Here we use :code:`'llvm'` directly to make the tutorial runable. # # Usually, you can query the target by execute :code:`gcc -v` on your -# device, and look for the line starting with :code:`Target:` +# device, and look for the line starting with :code:`Target:` # (Though it may be still a loose configuration.) # # Besides :code:`-target`, you can also set other compilation options @@ -185,14 +185,14 @@ # ------------------------------ # Here we will show you how to run the kernel on the remote device: # -# .. note:: -# In order to have this tutorial runs locally to build the nice HTML, we -# start a RPC server on the local machine. You can ignore it if you already +# .. note:: +# In order to have this tutorial runs locally to build the nice HTML, we +# start a RPC server on the local machine. You can ignore it if you already # started the server on the target device. And then change host IP properly. -# Can be ignored if you already started the RPC server -server = rpc.Server(host='0.0.0.0', port=9090, use_popen=True) -host = '0.0.0.0' # Change to your target device IP +# Can be ignored if you already started the RPC server +server = rpc.Server(host='0.0.0.0', port=9090, use_popen=True) +host = '0.0.0.0' # Change to your target device IP port = 9090 # connect the remote device remote = rpc.connect(host, port) @@ -237,7 +237,7 @@ # Firefly-RK3399. The target_host should be 'llvm -target=aarch64-linux-gnu'. # But here we set 'llvm' to enable this tutorial to run locally. # -# Also we need to build the runtime with the flag `USE_OPENCL=1`. +# Also we need to build the runtime with the flag `USE_OPENCL=1`. # build kernel (different from cpu, we need bind axis for OpenCL) s = tvm.create_schedule(B.op) From 9fb25327dd29af681af1f63fa557feb3b8d5f852 Mon Sep 17 00:00:00 2001 From: Zhixun Tan Date: Thu, 25 Jan 2018 13:35:24 -0500 Subject: [PATCH 109/948] [OpenGL] Let OpenGL texture always be 1024 x nrows. (#817) * OpenGL texture is always 1024 x nrows. * Address review comments. 
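The addressing scheme this patch introduces can be sketched with a few lines of
Python (an illustration of the shader-side index math only, using the
kTextureRowBits constant added to opengl_module.h below):

    kTextureRowBits = 10
    kTextureRowSize = 1 << kTextureRowBits      # 1024
    kTextureRowMask = kTextureRowSize - 1

    def texel_coord(idx):
        # flat buffer index -> (x, y) texel coordinate, matching
        # texelFetch(tex, ivec2(idx & mask, idx >> bits), 0)
        return (idx & kTextureRowMask, idx >> kTextureRowBits)

    assert texel_coord(5) == (5, 0)
    assert texel_coord(1027) == (3, 1)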
--- src/codegen/codegen_opengl.cc | 11 +-- src/runtime/opengl/opengl_device_api.cc | 91 +++++++++++++++++++++---- src/runtime/opengl/opengl_module.h | 13 ++++ tests/webgl/test_local_gemm.py | 2 +- 4 files changed, 98 insertions(+), 19 deletions(-) diff --git a/src/codegen/codegen_opengl.cc b/src/codegen/codegen_opengl.cc index e645e7f6c701..496b15b34bfa 100644 --- a/src/codegen/codegen_opengl.cc +++ b/src/codegen/codegen_opengl.cc @@ -154,7 +154,8 @@ void CodeGenOpenGL::BindThreadIndex(const IterVar& iv) { // Declare threadIdx local variable. this->PrintIndent(); - this->stream << "ivec2 threadIdx = ivec2(gl_FragCoord.xy);\n"; + this->stream << "ivec2 threadIdx = ivec2(" << runtime::kTextureRowSize + << " * int(gl_FragCoord.y) + int(gl_FragCoord.x), 0);\n"; // Return directly if threadIdx.x >= thread_extent. this->PrintIndent(); @@ -192,12 +193,14 @@ void CodeGenOpenGL::VisitStmt_(const Store* op) { } } -// texelFetch(tex, ivec2(idx, 0), 0).r +// texelFetch(tex, ivec2(idx & kTextureRowMask, idx >> kTextureRowBits), 0).r std::string CodeGenOpenGL::TexelFetch(const Variable* buffer, Expr index) { std::ostringstream os; - os << "texelFetch(" << GetVarID(buffer) << ", ivec2("; + os << "texelFetch(" << GetVarID(buffer) << ", ivec2(int("; PrintExpr(index, os); - os << ", 0), 0).r"; + os << ") & " << runtime::kTextureRowMask << ", int("; + PrintExpr(index, os); + os << ") >> " << runtime::kTextureRowBits << "), 0).r"; return os.str(); } diff --git a/src/runtime/opengl/opengl_device_api.cc b/src/runtime/opengl/opengl_device_api.cc index 798003af902f..d90d12034ae6 100644 --- a/src/runtime/opengl/opengl_device_api.cc +++ b/src/runtime/opengl/opengl_device_api.cc @@ -3,6 +3,7 @@ * \file opengl_device_api.cc */ #include "./opengl_common.h" +#include "./opengl_module.h" #if TVM_OPENGL_RUNTIME @@ -347,8 +348,9 @@ Texture OpenGLWorkspace::CreateTexture(TVMType type, size_t nbytes) { // Use glTexImage2D with nullptr data to specify GPU data storage. auto texture_format = GetTextureFormat(type); - auto width = static_cast(nbytes / (type.bits / 8)); - auto height = GLsizei(1); + auto nelems = static_cast(nbytes / (type.bits / 8)); + auto height = (nelems + kTextureRowSize - 1) / kTextureRowSize; + auto width = (height == 1) ? nelems : kTextureRowSize; OPENGL_CALL(gl->TexImage2D(GL_TEXTURE_2D, /*level=*/0, texture_format.internal_format, width, height, /*border=*/0, @@ -402,6 +404,51 @@ Program OpenGLWorkspace::CreateProgram(GLuint fragment_shader) { return Program(this, program); } +/*! + * \brief Visit a 1D range of an OpenGL texture-backed TVM array. + * When getting/setting a sub image of a texture, we can only specify a 2D + * block (xbeg, ybeg, width, height). + * Since we are storing all TVM arrays using (kTextureRowSize x nrows) 2D + * textures (row-major), a range in an array does not necessarily map to a 2D + * block. + * This function split a 1D range into 3 2D blocks. + * \param beg The index of the first element in the 1D range. + * \param end The index of the last + 1 element in the 1D range. + * \param on_2d_block Callback for each 2D block. Must have interface + * void(GLint xbeg, GLint ybeg, GLsizei width, GLsizei height). + */ +template +static void Visit1DRange(GLint beg, GLint end, F&& on_2d_block) { + CHECK_LE(beg, end) << "Invalid range."; + + // xbeg kTextureRowSize + // ybeg ....************ + // **************** + // **************** + // ylast *********....... 
+ // xlast + GLint xbeg = beg % kTextureRowSize; + GLint ybeg = beg / kTextureRowSize; + GLint xlast = (end - 1) % kTextureRowSize; + GLint ylast = (end - 1) / kTextureRowSize; + + if (ybeg == ylast) { // Only one row. + on_2d_block(xbeg, ybeg, end - beg, 1); + return; + } + + // First row. + on_2d_block(xbeg, ybeg, kTextureRowSize - xbeg, 1); + + // Middle block. + if (ylast - ybeg > 1) { + on_2d_block(0, ybeg + 1, kTextureRowSize, ylast - ybeg - 1); + } + + // Last row. + on_2d_block(0, ylast, xlast + 1, 1); +} + void OpenGLWorkspace::PutTextureData(Texture *texture, GLint begin, GLsizei nelems, @@ -409,12 +456,17 @@ void OpenGLWorkspace::PutTextureData(Texture *texture, // Bind to temporary unit. BindTextureUnit(NumTextureUnits() - 1, texture->texture()); - // Similar to cudaMemcpy. - OPENGL_CALL(gl->TexSubImage2D(GL_TEXTURE_2D, /*level=*/0, - /*xoffset=*/begin, /*yoffset=*/0, - /*width=*/nelems, /*height=*/1, - texture->format_.format, texture->format_.type, - data)); + Visit1DRange(begin, begin + nelems, [&](GLint xbeg, GLint ybeg, + GLsizei width, GLsizei height) { + auto offset = (ybeg * kTextureRowSize + xbeg - begin) * texture->elemsz(); + const GLvoid* ptr = static_cast(data) + offset; + + // Similar to cudaMemcpy. + OPENGL_CALL(gl->TexSubImage2D(GL_TEXTURE_2D, /*level=*/0, + xbeg, ybeg, width, height, + texture->format_.format, + texture->format_.type, ptr)); + }); } void OpenGLWorkspace::GetTextureData(const Texture *texture, @@ -453,18 +505,29 @@ void OpenGLWorkspace::GetTextureData(const Texture *texture, auto nchannels = 4; auto padded_data_size = nchannels * nelems * elemsz; auto padded_data = std::unique_ptr(new char[padded_data_size]); - OPENGL_CALL(gl->ReadPixels(/*x=*/begin, /*y=*/0, /*width=*/nelems, - /*height=*/1, GL_RGBA, GL_FLOAT, - padded_data.get())); + Visit1DRange(begin, begin + nelems, [&](GLint xbeg, GLint ybeg, + GLsizei width, GLsizei height) { + auto data_offset = (ybeg * kTextureRowSize + xbeg - begin) * elemsz; + auto padded_data_offset = data_offset * nchannels; + OPENGL_CALL(gl->ReadPixels(xbeg, ybeg, width, height, + GL_RGBA, GL_FLOAT, + padded_data.get() + padded_data_offset)); + }); for (GLsizei i = 0; i != nelems; ++i) { auto dst = reinterpret_cast(data) + i * elemsz; auto src = padded_data.get() + nchannels * i * elemsz; std::memcpy(dst, src, elemsz); } #else - OPENGL_CALL(gl->ReadPixels(/*x=*/begin, /*y=*/0, /*width=*/nelems, - /*height=*/1, texture->format_.format, - texture->format_.type, data)); + Visit1DRange(begin, begin + nelems, [&](GLint xbeg, GLint ybeg, + GLsizei width, GLsizei height) { + auto offset = (ybeg * kTextureRowSize + xbeg - begin) * texture->elemsz(); + GLvoid* ptr = static_cast(data) + offset; + + OPENGL_CALL(gl->ReadPixels(xbeg, ybeg, width, height, + texture->format_.format, texture->format_.type, + ptr)); + }); #endif OPENGL_CALL(gl->DeleteFramebuffers(1, &frame_buffer)); diff --git a/src/runtime/opengl/opengl_module.h b/src/runtime/opengl/opengl_module.h index 1913878c3d68..a4cfa20bd734 100644 --- a/src/runtime/opengl/opengl_module.h +++ b/src/runtime/opengl/opengl_module.h @@ -17,6 +17,19 @@ namespace tvm { namespace runtime { +/*! + * \brief The fixed row size of all OpenGL textures in TVM. + * + * OpenGL has texture size limit on each dimension. Suppose we have a limit of + * 1024, then we can have a 2D texture of size (2^10 x 2^10) but not (2^20 x 1). + * This means we don't want to just use (n x 1) 2D textures for all arrays, + * because that would limit our array size to be 1024. 
Here we use (1024 x m) + * 2D textures. Then we can have arrays of size up to 2^20. + */ +static constexpr int kTextureRowBits = 10; +static constexpr int kTextureRowSize = 1 << kTextureRowBits; +static constexpr int kTextureRowMask = kTextureRowSize - 1; + /*! * \brief Determines how we supply arguments. */ diff --git a/tests/webgl/test_local_gemm.py b/tests/webgl/test_local_gemm.py index 18d2d1d8bf34..0dd1c0fc7376 100644 --- a/tests/webgl/test_local_gemm.py +++ b/tests/webgl/test_local_gemm.py @@ -7,7 +7,7 @@ def test_local_gemm(): if not tvm.module.enabled("llvm"): return - nn = 2 + nn = 1024 n = tvm.var('n') n = tvm.convert(nn) m = n From 2725b898211a8e59e1bff551862cacddc8c09041 Mon Sep 17 00:00:00 2001 From: Yida Wang Date: Fri, 26 Jan 2018 13:02:57 -0800 Subject: [PATCH 110/948] minor tweak of the runtime doc to fix some grammatical and expression issues (#828) --- docs/dev/runtime.md | 54 ++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/docs/dev/runtime.md b/docs/dev/runtime.md index b9cc81186200..d601801a435e 100644 --- a/docs/dev/runtime.md +++ b/docs/dev/runtime.md @@ -1,7 +1,7 @@ # TVM Runtime System -TVM support multiple programming languages for compiler stack development and deployment. -In this note, we explain the key element of TVM runtime. +TVM supports multiple programming languages for the compiler stack development and deployment. +In this note, we explain the key elements of the TVM runtime. ![](http://www.tvmlang.org/images/release/tvm_flexible.png) @@ -9,10 +9,10 @@ We need to satisfy quite a few interesting requirements - Deployment: invoke the compiled function from python/javascript/c++ language. - Debug: define a function in python and call that from a compiled function. -- Link: write driver code to call device specific code(CUDA) and call it from compiled host function. +- Link: write driver code to call device specific code (CUDA) and call it from compiled host function. - Prototype: define an IR pass from python and call that from C++ backend. -- Expose: compiler stack developed in c++ to front-end (i.e, python) -- Experiment: ship a compiled function to an embedded device directly run there. +- Expose: compiler stack developed in c++ to front-end (i.e, python) +- Experiment: ship a compiled function to an embedded device to directly run there. We want to be able to define a function from any language and call from another. We also want the runtime core to be minimal to deploy to embedded devices. @@ -41,11 +41,11 @@ void CallPacked() { ``` In the above codeblock, we defined a PackedFunc MyAdd. It takes two arguments : ```args``` represents input arguments and ```rv``` represents return value. -The function is type-erased, which means the function signature does not restrict which input type to pass in or type to return. +The function is type-erased, which means that the function signature does not restrict which input type to pass in or type to return. Under the hood, when we call a PackedFunc, it packs the input arguments to TVMArgs on stack, -and get the result back via TVMRetValue. +and gets the result back via TVMRetValue. -Thanks to template tricks in C++, we can call a PackedFunc just like a normal function. Because of its type-erased nature, we can call a PackedFunc from dynamic languages like python, without additional glue code for each new type function created. +Thanks to template tricks in C++, we can call a PackedFunc just like a normal function. 
Because of its type-erased nature, we can call a PackedFunc from dynamic languages like python, without additional glue code for each new type function created. The following example registers PackedFunc in C++ and calls from python. ```c++ @@ -74,7 +74,7 @@ The restriction makes the implementation simple without the need of serializatio Despite being minimum, the PackedFunc is sufficient for the use-case of deep learning deployment as most functions only take DLTensor or numbers. -Since one PackedFunc can take another PackedFunc as argument, +Since one PackedFunc can take another PackedFunc as an argument, we can pass functions from python(as PackedFunc) to C++. ```c++ TVM_REGISTER_GLOBAL("callhello") @@ -97,15 +97,15 @@ callhello(f) ``` TVM provides a [minimum C API](https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/c_runtime_api.h), -that allows us to embedded the PackedFunc into any languages. Besides python, so far we supported +which allows us to embed the PackedFunc into any languages. Besides python, so far we supported [java](https://github.com/dmlc/tvm/tree/master/jvm) and [javascript](https://github.com/dmlc/tvm/tree/master/web). -This philosophy of embedded API is very like Lua, except that we don't have a new language and uses C++. +This philosophy of embedded API is very like Lua, except that we don't have a new language but use C++. One fun fact about PackedFunc is that we use it for both compiler and deployment stack. - All TVM's compiler pass functions are exposed to frontend as PackedFunc, see [here](https://github.com/dmlc/tvm/tree/master/src/api) -- The compiled modules also returns compiled function as PackedFunc +- The compiled module also returns the compiled function as PackedFunc -To keep the runtime minimum, we isolated the IR Node support from the deployment runtime. The resulting runtime takes around 200K - 600K depending on how many runtime driver modules(e.g., CUDA) get included. +To keep the runtime minimum, we isolated the IR Node support from the deployment runtime. The resulting runtime takes around 200K - 600K depending on how many runtime driver modules (e.g., CUDA) get included. The overhead of calling into PackedFunc vs. a normal function is small, as it is only saving a few values on the stack. So it is OK as long as we don't wrap small functions. @@ -113,9 +113,9 @@ In summary, the PackedFunc is the universal glue in TVM where we use it extensiv ## Module -Since TVM support multiple types of devices, we need to support different type of drivers. -We have to use driver API to load the kernel, set up the argument in packed format and perform kernel launch. -We also need to patch up the driver API so that the exposed functions is threadsafe. +Since TVM supports multiple types of devices, we need to support different type of drivers. +We have to use the driver API to load the kernel, set up the argument in packed format and perform kernel launch. +We also need to patch up the driver API so that the exposed functions are threadsafe. So we often need to implement these driver glues in C++ and expose them to the user. We can certainly not do it for each type of functions, so again PackedFunc is our answer. @@ -130,32 +130,32 @@ of new device easy, and we do not need to redo the host code generation for each ## Remote Deployment The PackedFunc and Module system also makes it easy to ship the function into remote devices directly. 
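A condensed sketch of that flow, adapted from the cross compilation tutorial (the device
address below is a placeholder, and the device is assumed to be running the TVM RPC server):

```python
import tvm
import numpy as np
from tvm.contrib import rpc, util

# build the kernel locally for the remote target
n = tvm.convert(1024)
A = tvm.placeholder((n,), name='A')
B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
s = tvm.create_schedule(B.op)
f = tvm.build(s, [A, B], target='llvm', name='myadd')

# ship the compiled module to the device and run it there
temp = util.tempdir()
path = temp.relpath('myadd.o')
f.save(path)
remote = rpc.connect('192.168.0.10', 9090)  # placeholder device address
remote.upload(path)
fremote = remote.load_module('myadd.o')
ctx = remote.cpu(0)
a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
fremote(a, b)
np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1)
```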
-Under the hood, we have a RPCModule that serializes the arguments and do the data movement and launches the computation on the remote. +Under the hood, we have an RPCModule that serializes the arguments to do the data movement and launches the computation on the remote. ![](http://www.tvmlang.org/images/release/tvm_rpc.png) The RPC server itself is minimum and can be bundled into the runtime. We can start a minimum TVM -RPC server on iPhone/android/raspberry pi or even your browser. The cross compilation on server and shipping of the module for testing can be done in the same script. Checkout +RPC server on iPhone/android/raspberry pi or even the browser. The cross compilation on server and shipping of the module for testing can be done in the same script. Checkout [Cross compilation and RPC tutorial](http://docs.tvmlang.org/tutorials/deployment/cross_compilation_and_rpc.html#sphx-glr-tutorials-deployment-cross-compilation-and-rpc-py) for more details. -This instant feedback gives us a lot of advantages. For example, to test the correctness of generated code on iPhone, we no longer have to write test-cases in swift/objective-c from scratch -- We can use RPC to execute on iPhone copy the result back and do verification on my host via numpy. We can also do the profiling using the same script. +This instant feedback gives us a lot of advantages. For example, to test the correctness of generated code on iPhone, we no longer have to write test-cases in swift/objective-c from scratch -- We can use RPC to execute on iPhone, copy the result back and do verification on the host via numpy. We can also do the profiling using the same script. ## TVM Node and Compiler Stack As we mentioned earlier, we build compiler stack API on top of the PackedFunc runtime system. -We faced a constant changing the compiler API for the need of research. We need a new language object or IR node from now and then when we want to test out new primitives. +We faced a constant changing of the compiler API for the need of research. We need a new language object or IR node whenever we want to test out new primitives. However, we don't want to change our API from time to time. Besides that, we also want to - be able to serialize any language object and IRs - be able to explore, print, and manipulate the IR objects in front-end language to do quick prototyping. We introduced a base class, called [Node](https://github.com/dmlc/HalideIR/blob/master/src/tvm/node.h#L52) to solve this problem. -All the language object in compiler stack is a subclass of Node. Each node contains a string type_key that uniquely identifies -the type of object. We choose string instead of int as type key so new Node class can be added in decentralized fashion without +All the language object in the compiler stack is a subclass of Node. Each node contains a string type_key that uniquely identifies +the type of object. We choose string instead of int as type key so new Node class can be added in the decentralized fashion without adding the code back to the central repo. To ease the speed of dispatching, we allocate an integer type_index at runtime for each type_key. -Since usually one Node object could be referenced in multiple places in the language. We use a shared_ptr to keep -track of reference. We use NodeRef class to represents a reference to the Node. +Since usually one Node object could be referenced in multiple places in the language, we use a shared_ptr to keep +track of reference. We use NodeRef class to represent a reference to the Node. 
We can roughly view NodeRef class as shared_ptr to the Node container. We can also define subclass NodeRef to hold each subtypes of Node. Each Node class needs to define the VisitAttr function. @@ -206,7 +206,7 @@ class TensorNode : public Node { ``` In the above examples, both ```Operation``` and ```Array``` are NodeRef. The VisitAttrs gives us a reflection API to visit each member of the object. -We can use this function to visit the node any serialize any language object recursively. +We can use this function to visit the node and serialize any language object recursively. It also allows us to get members of an object easily in front-end language. For example, in the following code, we accessed the op field of the TensorNode. @@ -220,13 +220,13 @@ print(x.op.name) New Node can be added to C++ without changing the front-end runtime, making it easy to make extensions to the compiler stack. Note that this is not the fastest way to expose members to front-end language, but might be one of the simplest -approach possible. We also find it fits our purposes as we mainly use python for testing and prototyping and still use c++ +approaches possible. We also find that it fits our purposes as we mainly use python for testing and prototyping and still use c++ to do the heavy lifting job. ## Implementation Details Each argument in PackedFunc contains a union value [TVMValue](https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/c_runtime_api.h#L122) -and a type code. This design allows the dynamically typed language to convert to the corresponding type directly, and statically typed language +and a type code. This design allows the dynamically typed language to convert to the corresponding type directly, and statically typed language to do runtime type checking during conversion. The relevant files are From 7e9b9f9babbf412d6a75e0327a9c35ec134f3059 Mon Sep 17 00:00:00 2001 From: kun-zh <32951065+kun-zh@users.noreply.github.com> Date: Sun, 28 Jan 2018 01:10:42 +0800 Subject: [PATCH 111/948] support using pointer with an original offset (#826) * when there is no intrin func, using body for initialization. For issue 714. * Refine code per review comments, and add a test case. * Fix lint issues. * Re-organize the tensorize test cases, and add a new case for none-reset mode. * Fix a typo. * Delete the unit case because merged it into test_schedule_tensorize.py already. * always use new tensor in its stage when rewrite for cache read * revert previous changes to sync up with master * support using the ptr with an original offset * update test case and fix CI error --- include/tvm/buffer.h | 3 ++- python/tvm/schedule.py | 8 ++++++-- src/api/api_lang.cc | 2 +- src/lang/buffer.cc | 4 ++-- tests/python/unittest/test_lang_buffer.py | 10 ++++++++++ 5 files changed, 21 insertions(+), 6 deletions(-) diff --git a/include/tvm/buffer.h b/include/tvm/buffer.h index f2790f6df7d1..d737341e1c0e 100644 --- a/include/tvm/buffer.h +++ b/include/tvm/buffer.h @@ -52,9 +52,10 @@ class Buffer : public NodeRef { * \param access_mask The access mask * \param ptr_type The type of the pointer. * \param content_lanes The number of lanes for the (data) type. + * \param offset The offset of ptr. */ TVM_DLL Expr access_ptr(int access_mask, Type ptr_type = Handle(), - int content_lanes = 1) const; + int content_lanes = 1, int offset = 0) const; /*! * \brief Create an Expr that does a vector load at begin index. 
* \param begin The beginning index diff --git a/python/tvm/schedule.py b/python/tvm/schedule.py index 0fc6692d950e..dda5f67d1b89 100644 --- a/python/tvm/schedule.py +++ b/python/tvm/schedule.py @@ -25,7 +25,7 @@ class Buffer(NodeBase): READ = 1 WRITE = 2 - def access_ptr(self, access_mask, ptr_type="handle", content_lanes=1): + def access_ptr(self, access_mask, ptr_type="handle", content_lanes=1, offset=0): """Get an access pointer to the head of buffer. This is the recommended method to get buffer data @@ -45,6 +45,10 @@ def access_ptr(self, access_mask, ptr_type="handle", content_lanes=1): The number of lanes for the data type. This value is greater than one for vector types. + offset: int, optional + The offset of pointer. We can use it to offset by + the number of elements from the address of ptr. + Examples -------- .. code-block:: python @@ -68,7 +72,7 @@ def access_ptr(self, access_mask, ptr_type="handle", content_lanes=1): raise ValueError("Unknown access_mask %s" % access_mask) access_mask = mask return _api_internal._BufferAccessPtr(self, access_mask, ptr_type, - content_lanes) + content_lanes, offset) def vload(self, begin, dtype=None): """Generate an Expr that loads dtype from begin index. diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc index 37a21cedf3db..3b5916ea5fec 100644 --- a/src/api/api_lang.cc +++ b/src/api/api_lang.cc @@ -159,7 +159,7 @@ TVM_REGISTER_API("_Buffer") TVM_REGISTER_API("_BufferAccessPtr") .set_body([](TVMArgs args, TVMRetValue* ret) { *ret = args[0].operator Buffer() - .access_ptr(args[1], args[2], args[3]); + .access_ptr(args[1], args[2], args[3], args[4]); }); TVM_REGISTER_API("_BufferVLoad") diff --git a/src/lang/buffer.cc b/src/lang/buffer.cc index af76dcc94f71..07e455e25384 100644 --- a/src/lang/buffer.cc +++ b/src/lang/buffer.cc @@ -335,7 +335,7 @@ Buffer Buffer::MakeSlice(Array begins, Array extents) const { 0); } -Expr Buffer::access_ptr(int access_mask, Type ptr_type, int content_lanes) const { +Expr Buffer::access_ptr(int access_mask, Type ptr_type, int content_lanes, int offset) const { const BufferNode* self = operator->(); Expr e_dtype; Expr extent; @@ -348,7 +348,7 @@ Expr Buffer::access_ptr(int access_mask, Type ptr_type, int content_lanes) const } else { extent = arith::ComputeReduce(self->shape, Expr()); } - Expr elem_offset = self->elem_offset; + Expr elem_offset = self->elem_offset + offset; if (content_lanes > 1) { e_dtype = make_zero(self->dtype.with_lanes(content_lanes)); extent = extent / make_const(self->elem_offset.type(), content_lanes); diff --git a/tests/python/unittest/test_lang_buffer.py b/tests/python/unittest/test_lang_buffer.py index c3f00ac2f166..fe0f1f0b759c 100644 --- a/tests/python/unittest/test_lang_buffer.py +++ b/tests/python/unittest/test_lang_buffer.py @@ -23,6 +23,15 @@ def test_buffer_access_ptr(): aptr = Ab.access_ptr("w") assert aptr.args[4].value == Buffer.WRITE +def test_buffer_access_ptr_offset(): + m = tvm.var('m') + n = tvm.var('n') + Ab = tvm.decl_buffer((m, n), tvm.float32) + aptr = Ab.access_ptr("rw", offset=100) + offset = tvm.ir_pass.Simplify(aptr.args[2]) + assert tvm.ir_pass.Equal(offset, 100) + assert aptr.args[4].value == Buffer.READ | Buffer.WRITE + def test_buffer_index_merge_mult_mod(): m = tvm.var('m') n = tvm.var('n') @@ -57,4 +66,5 @@ def assert_simplified_equal(index_simplified, index_direct): if __name__ == "__main__": test_buffer() test_buffer_access_ptr() + test_buffer_access_ptr_offset() test_buffer_index_merge_mult_mod() From c73b5901b5714dec30ddd75c7bf89d43a1132a8d Mon Sep 17 
00:00:00 2001 From: Mu Li Date: Sat, 27 Jan 2018 09:56:02 -0800 Subject: [PATCH 112/948] add the link for how to setup rk3399 opencl driver (#827) --- tutorials/deployment/cross_compilation_and_rpc.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tutorials/deployment/cross_compilation_and_rpc.py b/tutorials/deployment/cross_compilation_and_rpc.py index eec2f9aeb34b..ccaf9e79e3de 100644 --- a/tutorials/deployment/cross_compilation_and_rpc.py +++ b/tutorials/deployment/cross_compilation_and_rpc.py @@ -234,7 +234,10 @@ # .. note:: # # Raspberry Pi does not support OpenCL, the following code is tested on -# Firefly-RK3399. The target_host should be 'llvm -target=aarch64-linux-gnu'. +# Firefly-RK3399. You may follow this `tutorial `_ +# to setup the RK3399 OS and OpenCL driver. +# +# The target_host should be 'llvm -target=aarch64-linux-gnu'. # But here we set 'llvm' to enable this tutorial to run locally. # # Also we need to build the runtime with the flag `USE_OPENCL=1`. From 737b35597299da9ac4f92f5d1a8dfc037f5d6d09 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 27 Jan 2018 10:45:28 -0800 Subject: [PATCH 113/948] [TIMER] Enhance time evaluator to create multiple results (#830) --- python/tvm/module.py | 22 ++++++++++----- src/runtime/rpc/rpc_module.cc | 9 ++++--- src/runtime/rpc/rpc_session.cc | 37 ++++++++++++++++---------- src/runtime/rpc/rpc_session.h | 11 +++++--- tests/python/integration/test_ewise.py | 9 ++++--- 5 files changed, 56 insertions(+), 32 deletions(-) diff --git a/python/tvm/module.py b/python/tvm/module.py index 055ae7308987..d8b018b824f0 100644 --- a/python/tvm/module.py +++ b/python/tvm/module.py @@ -1,13 +1,14 @@ """Container of compiled functions of TVM.""" from __future__ import absolute_import as _abs +import struct from collections import namedtuple from ._ffi.function import ModuleBase, _set_class_module from ._ffi.function import _init_api from .contrib import cc as _cc, tar as _tar, util as _util -ProfileResult = namedtuple("ProfileResult", ["mean"]) +ProfileResult = namedtuple("ProfileResult", ["mean", "results"]) class Module(ModuleBase): @@ -110,7 +111,7 @@ def export_library(self, fcompile = _cc.create_shared fcompile(file_name, files, **kwargs) - def time_evaluator(self, func_name, ctx, number): + def time_evaluator(self, func_name, ctx, number, repeat=1): """Get an evaluator that measures time cost of running function. Parameters @@ -122,11 +123,15 @@ def time_evaluator(self, func_name, ctx, number): The context we should run this function on. number: int - The number of repeative times to run evaluation. + The number of steps used in measuring each time interval + + repeat: int, optional + Number of times to run the timer measurement + If repeat equals 3, then we will get 3 numbers in the ProfileResult. Note ---- - The function will be invoked number + 1 times, + The function will be invoked repeat * number + 1 times, with the first call discarded in case there is lazy initialization. Returns @@ -137,13 +142,16 @@ def time_evaluator(self, func_name, ctx, number): """ try: feval = _RPCTimeEvaluator( - self, func_name, ctx.device_type, ctx.device_id, number) + self, func_name, ctx.device_type, ctx.device_id, number, repeat) def evaluator(*args): """Internal wrapped evaluator.""" # Wrap feval so we can add more stats in future. 
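            # The underlying time evaluator returns its timings as a byte blob of
            # `repeat` consecutive 8-byte doubles (one mean interval per repeat);
            # for repeat=3 the blob is equivalent to struct.pack("@ddd", t0, t1, t2).
            # The code below unpacks it and reports the raw results and their mean.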
- mean = feval(*args) - return ProfileResult(mean=mean) + blob = feval(*args) + fmt = "@" + ("d" * repeat) + results = struct.unpack(fmt, blob) + mean = sum(results) / float(repeat) + return ProfileResult(mean=mean, results=results) return evaluator except NameError: diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index af2e0647871d..9b4b3658c1be 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -77,10 +77,11 @@ class RPCModuleNode final : public ModuleNode { PackedFunc GetTimeEvaluator(const std::string& name, TVMContext ctx, - int nstep) { + int number, + int repeat) { RPCFuncHandle handle = GetFuncHandle(name); if (handle == nullptr) return PackedFunc(); - handle = sess_->GetTimeEvaluator(handle, ctx, nstep); + handle = sess_->GetTimeEvaluator(handle, ctx, number, repeat); return WrapRemote(handle); } @@ -148,10 +149,10 @@ TVM_REGISTER_GLOBAL("module._RPCTimeEvaluator") ctx.device_id = args[3]; if (tkey == "rpc") { *rv = static_cast(m.operator->()) - ->GetTimeEvaluator(args[1], ctx, args[4]); + ->GetTimeEvaluator(args[1], ctx, args[4], args[5]); } else { *rv = WrapTimeEvaluator( - m.GetFunction(args[1], false), ctx, args[4]); + m.GetFunction(args[1], false), ctx, args[4], args[5]); } }); diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc index 3bff73afe9bd..3d00371c034c 100644 --- a/src/runtime/rpc/rpc_session.cc +++ b/src/runtime/rpc/rpc_session.cc @@ -844,8 +844,9 @@ void RPCSession::CopyFromRemote(void* from, } RPCFuncHandle RPCSession::GetTimeEvaluator( - RPCFuncHandle fhandle, TVMContext ctx, int nstep) { - return this->CallRemote(RPCCode::kGetTimeEvaluator, fhandle, ctx, nstep); + RPCFuncHandle fhandle, TVMContext ctx, int number, int repeat) { + return this->CallRemote( + RPCCode::kGetTimeEvaluator, fhandle, ctx, number, repeat); } // Event handler functions @@ -973,7 +974,7 @@ void RPCModuleGetSource(TVMArgs args, TVMRetValue *rv) { void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) { PackedFunc *pf = static_cast(args[0].operator void*()); - void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2])); + void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3])); delete pf; *rv = fhandle; } @@ -1024,23 +1025,31 @@ void RPCSession::EventHandler::HandlePackedCall() { CHECK_EQ(state_, kRecvCode); } -PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int nstep) { - auto ftimer = [pf, ctx, nstep](TVMArgs args, TVMRetValue *rv) { +PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat) { + auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) { TVMRetValue temp; + std::ostringstream os; // skip first time call, to activate lazy compilation components. 
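    // Measurement layout: after the warm-up call below, the function runs in
    // `repeat` timed intervals of `number` back-to-back invocations each; the mean
    // time (in seconds) of every interval is appended to `os` as a raw double, so
    // the caller receives a blob of repeat * sizeof(double) bytes.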
pf.CallPacked(args, &temp); DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); - // start timing - auto tbegin = std::chrono::high_resolution_clock::now(); - for (int i = 0; i < nstep; ++i) { - pf.CallPacked(args, &temp); + for (int i = 0; i < repeat; ++i) { + // start timing + auto tbegin = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < number; ++i) { + pf.CallPacked(args, &temp); + } + DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); + auto tend = std::chrono::high_resolution_clock::now(); + double speed = std::chrono::duration_cast >( + tend - tbegin).count() / number; + os.write(reinterpret_cast(&speed), sizeof(speed)); } - DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); - auto tend = std::chrono::high_resolution_clock::now(); - double speed = std::chrono::duration_cast >( - tend - tbegin).count() / nstep; + std::string blob = os.str(); + TVMByteArray arr; + arr.size = blob.length(); + arr.data = blob.data(); // return the time. - *rv = speed; + *rv = arr; }; return PackedFunc(ftimer); } diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h index 80dde9171401..59c1c1016ae2 100644 --- a/src/runtime/rpc/rpc_session.h +++ b/src/runtime/rpc/rpc_session.h @@ -146,12 +146,14 @@ class RPCSession { * * \param fhandle The function handle. * \param ctx The ctx to run measurement on. - * \param nstep Number of steps to run. + * \param number How many steps to run in each time evaluation + * \param repeat How many times to repeat the timer * \return A remote timer function */ RPCFuncHandle GetTimeEvaluator(RPCFuncHandle fhandle, TVMContext ctx, - int nstep); + int number, + int repeat); /*! * \brief Call a remote defined system function with arguments. * \param fcode The function code. @@ -212,9 +214,10 @@ class RPCSession { * \brief Wrap a timer function for a given packed function. * \param f The function argument. * \param ctx The context. - * \param nstep Number of repeative steps. + * \param number Number of steps in the inner iteration + * \param repeat How many steps to repeat the time evaluation. */ -PackedFunc WrapTimeEvaluator(PackedFunc f, TVMContext ctx, int nstep); +PackedFunc WrapTimeEvaluator(PackedFunc f, TVMContext ctx, int number, int repeat); /*! * \brief Create a Global RPC module that refers to the session. diff --git a/tests/python/integration/test_ewise.py b/tests/python/integration/test_ewise.py index f8dc43da8d31..e56fac6734bb 100644 --- a/tests/python/integration/test_ewise.py +++ b/tests/python/integration/test_ewise.py @@ -55,7 +55,10 @@ def test_log_pow_llvm(): n = 1028 a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) - flog(a, b) + repeat = 10 + ftimer = flog.time_evaluator(flog.entry_name, ctx, number=1, repeat=repeat) + res = ftimer(a, b) + assert(len(res.results) == repeat) np.testing.assert_allclose( b.asnumpy(), np.power(np.log(a.asnumpy()), 2.0), rtol=1e-5) @@ -146,7 +149,7 @@ def check_device(device): if __name__ == "__main__": - test_add() test_log_pow_llvm() - test_popcount() test_exp() + test_add() + test_popcount() From 7b9285b8b5a8980bbc8caebf01e7468484a5ba04 Mon Sep 17 00:00:00 2001 From: Siva Date: Sun, 28 Jan 2018 11:20:04 +0530 Subject: [PATCH 114/948] [DEBUG] get_node_output : To retrieve out put of any node - for debug purpose. 
(#820) --- Makefile | 4 +++ make/config.mk | 3 ++ python/tvm/contrib/graph_runtime.py | 21 +++++++++++++ src/runtime/graph/graph_runtime.cc | 47 +++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+) diff --git a/Makefile b/Makefile index 453415de6634..44a500d26ce3 100644 --- a/Makefile +++ b/Makefile @@ -151,6 +151,10 @@ ifeq ($(USE_GRAPH_RUNTIME), 1) RUNTIME_DEP += $(GRAPH_OBJ) endif +ifeq ($(USE_GRAPH_RUNTIME_DEBUG), 1) + CFLAGS += -DTVM_GRAPH_RUNTIME_DEBUG +endif + include make/contrib/cblas.mk include make/contrib/random.mk include make/contrib/nnpack.mk diff --git a/make/config.mk b/make/config.mk index 256771ac3220..eee96ac12360 100644 --- a/make/config.mk +++ b/make/config.mk @@ -50,6 +50,9 @@ USE_RPC = 1 # Whether enable tiny embedded graph runtime. USE_GRAPH_RUNTIME = 1 +# Whether enable additional graph debug functions +USE_GRAPH_RUNTIME_DEBUG = 0 + # whether build with LLVM support # Requires LLVM version >= 4.0 # Set LLVM_CONFIG to your version, uncomment to build with llvm support diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index 7e919586b0c0..ddabac004993 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -72,6 +72,10 @@ def __init__(self, module, ctx): self._set_input = module["set_input"] self._run = module["run"] self._get_output = module["get_output"] + try: + self._debug_get_output = module["debug_get_output"] + except AttributeError: + pass self._load_params = module["load_params"] self.ctx = ctx @@ -121,6 +125,23 @@ def get_output(self, index, out): self._get_output(index, out) return out + def debug_get_output(self, node, out): + """Run graph upto node and get the output to out + + Parameters + ---------- + node : int / str + The node index or name + + out : NDArray + The output array container + """ + if hasattr(self, '_debug_get_output'): + self._debug_get_output(node, out) + else: + raise RuntimeError("Please compile runtime with USE_GRAPH_RUNTIME_DEBUG = 0") + return out + def load_params(self, params_bytes): """Load parameters from serialized byte array of parameter dict. diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index d244fe5f028e..bf07a8c38927 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -107,7 +107,44 @@ class GraphRuntime : public ModuleNode { uint32_t eid = this->entry_id(outputs_[index]); TVM_CCALL(TVMArrayCopyFromTo(&data_entry_[eid], data_out, nullptr)); } +#ifdef TVM_GRAPH_RUNTIME_DEBUG + /*! + * \brief Get the node index given the name of node. + * \param name The name of the node. + * \return The index of node. + */ + int GetNodeIndex(const std::string& name) { + for (uint32_t nid = 0; nid< nodes_.size(); ++nid) { + if (nodes_[nid].name == name) { + return static_cast(nid); + } + } + LOG(FATAL) << "cannot find " << name << " among nodex"; + return -1; + } + /*! + * \brief Copy index-th node to data_out. + * + * This method will do a partial run of the the graph + * from begining upto the index-th node and return output of index-th node. + * This is costly operation and suggest to use only for debug porpose. + * + * \param index: The index of the node. + * \param data_out the node data. 
+ */ + void DebugGetNodeOutput(int index, DLTensor* data_out) { + CHECK_LT(static_cast(index), nodes_.size()); + uint32_t eid = index; + + for (size_t i = 0; i < op_execs_.size(); ++i) { + if (static_cast(i) == index) break; + if (op_execs_[i]) op_execs_[i](); + } + + TVM_CCALL(TVMArrayCopyFromTo(&data_entry_[eid], data_out, nullptr)); + } +#endif /*! * \brief Load parameters from binary stream * \param strm The input stream. @@ -556,6 +593,16 @@ PackedFunc GraphRuntime::GetFunction( return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->GetOutput(args[0], args[1]); }); +#ifdef TVM_GRAPH_RUNTIME_DEBUG + } else if (name == "debug_get_output") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + if (args[0].type_code() == kStr) { + this->DebugGetNodeOutput(this->GetNodeIndex(args[0]), args[1]); + } else { + this->DebugGetNodeOutput(args[0], args[1]); + } + }); +#endif } else if (name == "run") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->Run(); From 7d60a4b2cd18de997d064b9f9252f3889c5812a8 Mon Sep 17 00:00:00 2001 From: Zhixun Tan Date: Sun, 28 Jan 2018 00:50:52 -0500 Subject: [PATCH 115/948] Add type code and bits to AllocWorkspace. (#831) --- include/tvm/runtime/c_backend_api.h | 10 ++++++-- include/tvm/runtime/device_api.h | 8 +++++-- src/codegen/codegen_opengl.cc | 2 ++ src/codegen/stack_vm/codegen_stack_vm.cc | 4 +++- src/codegen/stack_vm/stack_vm.cc | 15 +++++++----- src/pass/lower_tvm_builtin.cc | 20 +++++++++------- src/pass/split_host_device.cc | 5 ++++ src/runtime/c_runtime_api.cc | 21 +++++++++++++---- src/runtime/cpu_device_api.cc | 6 +++-- src/runtime/cuda/cuda_device_api.cc | 2 +- src/runtime/metal/metal_common.h | 2 +- src/runtime/metal/metal_device_api.mm | 4 +++- src/runtime/opencl/opencl_common.h | 2 +- src/runtime/opencl/opencl_device_api.cc | 4 +++- src/runtime/opengl/opengl_common.h | 2 -- src/runtime/opengl/opengl_device_api.cc | 9 ------- src/runtime/rocm/rocm_device_api.cc | 2 +- tests/webgl/test_local_multi_stage.py | 30 ++++++++++++++++++++++++ 18 files changed, 105 insertions(+), 43 deletions(-) create mode 100644 tests/webgl/test_local_multi_stage.py diff --git a/include/tvm/runtime/c_backend_api.h b/include/tvm/runtime/c_backend_api.h index e512921c969e..079ab1efb040 100644 --- a/include/tvm/runtime/c_backend_api.h +++ b/include/tvm/runtime/c_backend_api.h @@ -44,14 +44,20 @@ TVM_DLL int TVMBackendRegisterSystemLibSymbol(const char* name, void* ptr); * * \note The result allocate spaced is ensured to be aligned to kTempAllocaAlignment. * - * \param size The size of the space requested. + * \param nbytes The size of the space requested. * \param device_type The device type which the space will be allocated. * \param device_id The device id which the space will be allocated. + * \param dtype_code_hint The type code of the array elements. Only used in + * certain backends such as OpenGL. + * \param dtype_bits_hint The type bits of the array elements. Only used in + * certain backends such as OpenGL. * \return nullptr when error is thrown, a valid ptr if success */ TVM_DLL void* TVMBackendAllocWorkspace(int device_type, int device_id, - uint64_t size); + uint64_t nbytes, + int dtype_code_hint, + int dtype_bits_hint); /*! * \brief Backend function to free temporal workspace. 
diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index 9ba08fb86825..45009f1d3af3 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -114,9 +114,13 @@ class DeviceAPI { * - Workspace should not overlap between different threads(i.e. be threadlocal) * * \param ctx The context of allocation. - * \param size The size to be allocated. + * \param nbytes The size to be allocated. + * \param type_hint The type of elements. Only needed by certain backends such + * as OpenGL, as nbytes is sufficient for most backends. */ - TVM_DLL virtual void* AllocWorkspace(TVMContext ctx, size_t size); + TVM_DLL virtual void* AllocWorkspace(TVMContext ctx, + size_t nbytes, + TVMType type_hint = {}); /*! * \brief Free temporal workspace in backend execution. * diff --git a/src/codegen/codegen_opengl.cc b/src/codegen/codegen_opengl.cc index 496b15b34bfa..696082749a37 100644 --- a/src/codegen/codegen_opengl.cc +++ b/src/codegen/codegen_opengl.cc @@ -24,6 +24,8 @@ void CodeGenOpenGL::InitFuncState(LoweredFunc f) { inputs_.clear(); output_iter_var_ = nullptr; thread_extent_var_ = ""; + this->decl_stream.str(""); + this->stream.str(""); } void CodeGenOpenGL::AddFunction(LoweredFunc f) { diff --git a/src/codegen/stack_vm/codegen_stack_vm.cc b/src/codegen/stack_vm/codegen_stack_vm.cc index 5b01dae7100a..168e411fa6e2 100644 --- a/src/codegen/stack_vm/codegen_stack_vm.cc +++ b/src/codegen/stack_vm/codegen_stack_vm.cc @@ -197,10 +197,12 @@ void CodeGenStackVM::VisitExpr_(const Call* op) { vm_.stack_size += size; this->PushOp(StackVM::TVM_STACK_ALLOCA_BY_8BYTE, static_cast(size)); } else if (op->name == "TVMBackendAllocWorkspace") { - CHECK_EQ(op->args.size(), 3U); + CHECK_EQ(op->args.size(), 5U); this->Push(op->args[0]); this->Push(op->args[1]); this->Push(op->args[2]); + this->Push(op->args[3]); + this->Push(op->args[4]); this->PushOp(StackVM::TVM_DEVICE_ALLOCA); } else if (op->name == "TVMBackendFreeWorkspace") { CHECK_EQ(op->args.size(), 3U); diff --git a/src/codegen/stack_vm/stack_vm.cc b/src/codegen/stack_vm/stack_vm.cc index a133c9797b1b..95feeae3679e 100644 --- a/src/codegen/stack_vm/stack_vm.cc +++ b/src/codegen/stack_vm/stack_vm.cc @@ -455,12 +455,15 @@ void StackVM::Run(State* s) const { break; } case TVM_DEVICE_ALLOCA: { - int device_type = static_cast(stack[sp - 2].v_int64); - int device_id = static_cast(stack[sp - 1].v_int64); - size_t nbytes = static_cast(stack[sp].v_int64); - void* ptr = TVMBackendAllocWorkspace(device_type, device_id, nbytes); - stack[sp - 2].v_handle = ptr; - sp = sp - 2; + int device_type = static_cast(stack[sp - 4].v_int64); + int device_id = static_cast(stack[sp - 3].v_int64); + size_t nbytes = static_cast(stack[sp - 2].v_int64); + int dtype_code_hint = static_cast(stack[sp - 1].v_int64); + int dtype_bits_hint = static_cast(stack[sp].v_int64); + void* ptr = TVMBackendAllocWorkspace(device_type, device_id, nbytes, + dtype_code_hint, dtype_bits_hint); + stack[sp - 4].v_handle = ptr; + sp = sp - 4; pc = pc + 1; break; } diff --git a/src/pass/lower_tvm_builtin.cc b/src/pass/lower_tvm_builtin.cc index 105d58b95829..a63fef07bd12 100644 --- a/src/pass/lower_tvm_builtin.cc +++ b/src/pass/lower_tvm_builtin.cc @@ -96,14 +96,18 @@ class BuiltinLower : public IRMutator { {op->buffer_var}, Call::PureIntrinsic), throw_last_error), op->body); - Stmt alloca = LetStmt::make(op->buffer_var, - Call::make(op->buffer_var.type(), - "TVMBackendAllocWorkspace", - {cast(Int(32), device_type_), - cast(Int(32), device_id_), - cast(UInt(64), 
total_bytes)}, - Call::Extern), - body); + + Stmt alloca = LetStmt::make( + op->buffer_var, + Call::make(op->buffer_var.type(), + "TVMBackendAllocWorkspace", + {cast(Int(32), device_type_), + cast(Int(32), device_id_), + cast(UInt(64), total_bytes), + IntImm::make(Int(32), op->type.code()), + IntImm::make(Int(32), op->type.bits())}, + Call::Extern), + body); Expr free_op = Call::make(Int(32), "TVMBackendFreeWorkspace", diff --git a/src/pass/split_host_device.cc b/src/pass/split_host_device.cc index 942e70339488..dc326f3cb2f1 100644 --- a/src/pass/split_host_device.cc +++ b/src/pass/split_host_device.cc @@ -146,6 +146,11 @@ class IRUseDefAnalysis : public IRMutator { class HostDeviceSplitter : public IRMutator { public: + Stmt Mutate_(const Allocate* op, const Stmt& s) final { + handle_data_type_[op->buffer_var.get()] = make_const(op->type, 0); + return IRMutator::Mutate_(op, s); + } + Stmt Mutate_(const AttrStmt *op, const Stmt& s) final { if (op->attr_key == attr::thread_extent || op->attr_key == attr::pipeline_exec_scope) { diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index 0d0e36f239f2..2177fc344889 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -95,8 +95,9 @@ DeviceAPI* DeviceAPI::Get(TVMContext ctx, bool allow_missing) { static_cast(ctx.device_type), allow_missing); } -void* DeviceAPI::AllocWorkspace(TVMContext ctx, size_t size) { - TVMType type_hint{kDLUInt, 8, 1}; +void* DeviceAPI::AllocWorkspace(TVMContext ctx, + size_t size, + TVMType type_hint) { return AllocDataSpace(ctx, size, kTempAllocaAlignment, type_hint); } @@ -220,12 +221,22 @@ int TVMBackendGetFuncFromEnv(void* mod_node, } void* TVMBackendAllocWorkspace(int device_type, - int device_id, - uint64_t size) { + int device_id, + uint64_t size, + int dtype_code_hint, + int dtype_bits_hint) { TVMContext ctx; ctx.device_type = static_cast(device_type); ctx.device_id = device_id; - return DeviceAPIManager::Get(ctx)->AllocWorkspace(ctx, static_cast(size)); + + TVMType type_hint; + type_hint.code = static_cast(dtype_code_hint); + type_hint.bits = static_cast(dtype_bits_hint); + type_hint.lanes = 1; + + return DeviceAPIManager::Get(ctx)->AllocWorkspace(ctx, + static_cast(size), + type_hint); } int TVMBackendFreeWorkspace(int device_type, diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc index 30c3bb7d52df..7486f20a6ae1 100644 --- a/src/runtime/cpu_device_api.cc +++ b/src/runtime/cpu_device_api.cc @@ -59,7 +59,7 @@ class CPUDeviceAPI final : public DeviceAPI { void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { } - void* AllocWorkspace(TVMContext ctx, size_t size) final; + void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final; void FreeWorkspace(TVMContext ctx, void* data) final; static const std::shared_ptr& Global() { @@ -74,7 +74,9 @@ struct CPUWorkspacePool : public WorkspacePool { WorkspacePool(kDLCPU, CPUDeviceAPI::Global()) {} }; -void* CPUDeviceAPI::AllocWorkspace(TVMContext ctx, size_t size) { +void* CPUDeviceAPI::AllocWorkspace(TVMContext ctx, + size_t size, + TVMType type_hint) { return dmlc::ThreadLocalStore::Get() ->AllocWorkspace(ctx, size); } diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index 69b485a423c0..7885aa7705ed 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -112,7 +112,7 @@ class CUDADeviceAPI final : public DeviceAPI { ->stream = static_cast(stream); } - void* AllocWorkspace(TVMContext ctx, size_t size) 
final { + void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final { return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size); } diff --git a/src/runtime/metal/metal_common.h b/src/runtime/metal/metal_common.h index 7c2975fe7ccc..fa73b8250c33 100644 --- a/src/runtime/metal/metal_common.h +++ b/src/runtime/metal/metal_common.h @@ -77,7 +77,7 @@ class MetalWorkspace final : public DeviceAPI { TVMContext ctx_to, TVMStreamHandle stream) final; void StreamSync(TVMContext ctx, TVMStreamHandle stream) final; - void* AllocWorkspace(TVMContext ctx, size_t size) final; + void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final; void FreeWorkspace(TVMContext ctx, void* data) final; // get the global workspace static const std::shared_ptr& Global(); diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm index 82c52a23e036..6d376d3144ac 100644 --- a/src/runtime/metal/metal_device_api.mm +++ b/src/runtime/metal/metal_device_api.mm @@ -228,7 +228,9 @@ int GetWarpSize(id dev) { [cb waitUntilCompleted]; } -void* MetalWorkspace::AllocWorkspace(TVMContext ctx, size_t size) { +void* MetalWorkspace::AllocWorkspace(TVMContext ctx, + size_t size, + TVMType type_hint) { return MetalThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size); } diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index 29e205ced4d7..67934a078665 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -156,7 +156,7 @@ class OpenCLWorkspace final : public DeviceAPI { TVMContext ctx_to, TVMStreamHandle stream) final; void StreamSync(TVMContext ctx, TVMStreamHandle stream) final; - void* AllocWorkspace(TVMContext ctx, size_t size) final; + void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final; void FreeWorkspace(TVMContext ctx, void* data) final; // get the global workspace static const std::shared_ptr& Global(); diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 7518e72f9d9b..a07fe15f805f 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -108,7 +108,9 @@ void OpenCLWorkspace::StreamSync(TVMContext ctx, TVMStreamHandle stream) { OPENCL_CALL(clFinish(this->GetQueue(ctx))); } -void* OpenCLWorkspace::AllocWorkspace(TVMContext ctx, size_t size) { +void* OpenCLWorkspace::AllocWorkspace(TVMContext ctx, + size_t size, + TVMType type_hint) { return OpenCLThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size); } diff --git a/src/runtime/opengl/opengl_common.h b/src/runtime/opengl/opengl_common.h index 80b1d9f95c8e..661c987e4b3c 100644 --- a/src/runtime/opengl/opengl_common.h +++ b/src/runtime/opengl/opengl_common.h @@ -175,8 +175,6 @@ class OpenGLWorkspace final : public DeviceAPI { TVMContext ctx_to, TVMStreamHandle stream) final; void StreamSync(TVMContext ctx, TVMStreamHandle stream) final; - void* AllocWorkspace(TVMContext ctx, size_t size) final; - void FreeWorkspace(TVMContext ctx, void* data) final; /*! * \brief Get the global OpenGL workspace. 
diff --git a/src/runtime/opengl/opengl_device_api.cc b/src/runtime/opengl/opengl_device_api.cc index d90d12034ae6..df2947db6255 100644 --- a/src/runtime/opengl/opengl_device_api.cc +++ b/src/runtime/opengl/opengl_device_api.cc @@ -156,15 +156,6 @@ void OpenGLWorkspace::CopyDataFromTo(const void* from, void OpenGLWorkspace::StreamSync(TVMContext ctx, TVMStreamHandle stream) {} -void* OpenGLWorkspace::AllocWorkspace(TVMContext ctx, size_t size) { - LOG(FATAL) << "Cannot allocate OpenGL workspace."; - return nullptr; -} - -void OpenGLWorkspace::FreeWorkspace(TVMContext ctx, void* data) { - LOG(FATAL) << "Cannot free OpenGL workspace."; -} - OpenGLWorkspace::OpenGLWorkspace() { // Set an error handler. // This can be called before glfwInit(). diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc index 443d76b76eb6..877907c7e092 100644 --- a/src/runtime/rocm/rocm_device_api.cc +++ b/src/runtime/rocm/rocm_device_api.cc @@ -110,7 +110,7 @@ class ROCMDeviceAPI final : public DeviceAPI { ->stream = static_cast(stream); } - void* AllocWorkspace(TVMContext ctx, size_t size) final { + void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final { return ROCMThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size); } diff --git a/tests/webgl/test_local_multi_stage.py b/tests/webgl/test_local_multi_stage.py new file mode 100644 index 000000000000..47fa5c76c7aa --- /dev/null +++ b/tests/webgl/test_local_multi_stage.py @@ -0,0 +1,30 @@ +import tvm +import numpy as np + +def test_local_multi_stage(): + if not tvm.module.enabled("opengl"): + return + if not tvm.module.enabled("llvm"): + return + + n = tvm.var("n") + A = tvm.placeholder((n,), name='A', dtype="int32") + B = tvm.compute((n,), lambda i: A[i] + 1, name="B") + C = tvm.compute((n,), lambda i: B[i] * 2, name="C") + + s = tvm.create_schedule(C.op) + s[B].opengl() + s[C].opengl() + + f = tvm.build(s, [A, C], "opengl", name="multi_stage") + + ctx = tvm.opengl(0) + n = 10 + a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) + c = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx) + f(a, c) + + np.testing.assert_allclose(c.asnumpy(), (a.asnumpy() + 1) * 2) + +if __name__ == "__main__": + test_local_multi_stage() From fda2fa143c7a819cd543a0717da2e4cec957cc0f Mon Sep 17 00:00:00 2001 From: alex-weaver Date: Sun, 28 Jan 2018 05:57:13 +0000 Subject: [PATCH 116/948] Porting schedules (except convolutions) to C++ (#763) * Ported injective schedules to C++. Added some elementwise ops. * Fix lint errors * Added reduction ops and schedules * Fix lint errors * Fix lint errors * Fix lint errors * Added transform ops * Fix lint errors * Fix lint errors * Added softmax, log_softmax, leaky_relu and flatten ops. Fixed issue where TVM_DECLARE_INTRIN_UNARY used the PureExtern flag instead of PureIntrinsic. Added softmax CUDA schedule. * Fix lint * Fix lint * Added binary_dense, batch_norm_inference, dense, dilate, scale_shift_*, global_pool and pool ops. Extended pad to allow specifying pad_value. Fixed issue where pad would throw if padding was zero in all dimensions. * Fix lint * Fix lint * Added CUDA schedules for dense, pool and global_pool * Added extern schedules for generic and CUDA * Fix lint * Added x86 binary schedules * Fix lint * Added rocm dense schedule. Added rocBLAS and cuBLAS support to dense ops * Added pow ops. 
Added x86 default and injective schedules * Fix lint * Fix lint * Fix lint * Fix lint * Fix lint * Fix indent * Removed schedules directory * Changed left_shift, right_shift to operators. Changed pad_value in pad() to remove pointer usage * Fixed usage of pad in nn/pooling.h. Fixed declaration of operator>> * Fixed comments for shift operators * Added comments to utility functions * Added TOPI C++ library, exporting broadcast_add op * Fix lint * Share libinfo.py with TVM * Fix lint * Add other broadcast ops * Fix lint * Fix imports in topi * Fix lib names * Fixed build issue where windows builds don't apply correct definitions * Removed TVM_EXPORTS from topi library * Attempted CI build fix * Add topi lib to tvm_multilib * Fix Jenkinsfile * Added TOPI build target to Makefile * Fix nn op namespaces. * Fix lint * Renamed TOPI lib to libtvm_topi * Removed _ffi/base.py * Remove _ffi from topi, now shared with tvm. * Make libtvm_topi loading optional * Fix compiler warnings * Fix lint * Fix lint * Fix lint * Fix build error by making new libs argument to Target optional * Added C++ Target type interop. Added registration of remaining C++ ops and schedules. Added test of broadcast ops * Fix lint * Fix lint * Fix compile error * Fix compiler warnings * Fix compiler warnings * Fixed int vector interop. Fixed argmin incorrectly invoking argmax. Fixed corner case in default schedules of attempting to fuse 0 length axes. Added tests for reduce ops. * Refactored reduce builders * Fixed typos in topi.cc. Added basic test. * Fixed padding size error. Added dense, dilate, pooling tests * Fixed issue where clip would output a different dtype to the input. Added split_sections op to cover the other mode of the python split op. Added tests. * Changed extension type numbers to avoid clash with NNVM * Fix lint * Fix compiler warnings * Removed use of std::vector from the public TOPI API * Fix lint * Add TOPI C++ tests to CI * Fixed detail namespacing. Improved comments. 
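Before the file-by-file diff, a short illustrative sketch of how the newly exported C++ pieces compose. This is not code from the patch; it assumes topi::broadcast_add and topi::generic::schedule_injective carry the signatures suggested by the headers added below, plus the tvm::target::llvm() helper from build_module.

#include <tvm/build_module.h>
#include <topi/broadcast.h>
#include <topi/generic/injective.h>

// Illustrative only: signatures assumed from the headers introduced in this patch.
tvm::Schedule MakeAddSchedule(const tvm::Tensor& A, const tvm::Tensor& B) {
  tvm::Tensor C = topi::broadcast_add(A, B);              // numpy-style broadcast add
  tvm::Target target = tvm::target::llvm();               // generic CPU target
  return topi::generic::schedule_injective(target, {C});  // schedule the elementwise op
}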
--- CMakeLists.txt | 15 +- Jenkinsfile | 7 +- Makefile | 19 +- include/tvm/build_module.h | 8 +- include/tvm/expr.h | 2 +- include/tvm/ir_operator.h | 8 +- include/tvm/ir_pass.h | 4 +- include/tvm/operation.h | 2 +- include/tvm/schedule.h | 44 +- include/tvm/schedule_pass.h | 2 +- python/tvm/_ffi/function.py | 5 +- python/tvm/_ffi/libinfo.py | 22 +- src/codegen/build_module.cc | 12 +- src/lang/ir_operator.cc | 1 + tests/scripts/task_cpp_topi.sh | 4 + tests/travis/run_test.sh | 1 + topi/include/topi/broadcast.h | 63 ++- topi/include/topi/contrib/cublas.h | 49 ++ topi/include/topi/contrib/rocblas.h | 48 ++ topi/include/topi/cuda/dense.h | 134 ++++++ topi/include/topi/cuda/extern.h | 68 +++ topi/include/topi/cuda/injective.h | 58 +++ topi/include/topi/cuda/pooling.h | 163 +++++++ topi/include/topi/cuda/reduction.h | 181 ++++++++ topi/include/topi/cuda/softmax.h | 62 +++ topi/include/topi/detail/array_utils.h | 35 ++ topi/include/topi/detail/broadcast.h | 6 +- topi/include/topi/detail/constant_utils.h | 70 +++ topi/include/topi/detail/extern.h | 137 ++++++ topi/include/topi/detail/fuse.h | 37 ++ topi/include/topi/detail/pad_utils.h | 38 ++ topi/include/topi/detail/ravel_unravel.h | 60 +++ topi/include/topi/elemwise.h | 138 ++++++ topi/include/topi/generic/default.h | 46 ++ topi/include/topi/generic/extern.h | 37 ++ topi/include/topi/generic/injective.h | 42 ++ topi/include/topi/nn.h | 50 ++- topi/include/topi/nn/batch_norm.h | 65 +++ topi/include/topi/nn/bnn.h | 110 +++++ topi/include/topi/nn/dense.h | 61 +++ topi/include/topi/nn/dilate.h | 87 ++++ topi/include/topi/nn/flatten.h | 63 +++ topi/include/topi/nn/mapping.h | 66 +++ topi/include/topi/nn/pooling.h | 161 +++++++ topi/include/topi/nn/softmax.h | 87 ++++ topi/include/topi/reduction.h | 379 ++++++++++++++++ topi/include/topi/rocm/dense.h | 82 ++++ topi/include/topi/tags.h | 18 + topi/include/topi/transform.h | 362 +++++++++++++++ topi/include/topi/x86/bnn.h | 110 +++++ topi/include/topi/x86/default.h | 58 +++ topi/include/topi/x86/injective.h | 50 +++ topi/python/setup.py | 39 +- topi/python/topi/__init__.py | 3 + topi/python/topi/cpp.py | 86 ++++ topi/src/topi.cc | 448 +++++++++++++++++++ topi/tests/python_cpp/test_topi_basic.py | 31 ++ topi/tests/python_cpp/test_topi_bnn.py | 55 +++ topi/tests/python_cpp/test_topi_broadcast.py | 114 +++++ topi/tests/python_cpp/test_topi_clip.py | 45 ++ topi/tests/python_cpp/test_topi_dense.py | 61 +++ topi/tests/python_cpp/test_topi_dilate.py | 34 ++ topi/tests/python_cpp/test_topi_pooling.py | 120 +++++ topi/tests/python_cpp/test_topi_reduce.py | 120 +++++ topi/tests/python_cpp/test_topi_relu.py | 62 +++ topi/tests/python_cpp/test_topi_softmax.py | 81 ++++ topi/tests/python_cpp/test_topi_transform.py | 216 +++++++++ 67 files changed, 4891 insertions(+), 61 deletions(-) create mode 100644 tests/scripts/task_cpp_topi.sh create mode 100644 topi/include/topi/contrib/cublas.h create mode 100644 topi/include/topi/contrib/rocblas.h create mode 100644 topi/include/topi/cuda/dense.h create mode 100644 topi/include/topi/cuda/extern.h create mode 100644 topi/include/topi/cuda/injective.h create mode 100644 topi/include/topi/cuda/pooling.h create mode 100644 topi/include/topi/cuda/reduction.h create mode 100644 topi/include/topi/cuda/softmax.h create mode 100644 topi/include/topi/detail/array_utils.h create mode 100644 topi/include/topi/detail/constant_utils.h create mode 100644 topi/include/topi/detail/extern.h create mode 100644 topi/include/topi/detail/fuse.h create mode 100644 
topi/include/topi/detail/pad_utils.h create mode 100644 topi/include/topi/detail/ravel_unravel.h create mode 100644 topi/include/topi/generic/default.h create mode 100644 topi/include/topi/generic/extern.h create mode 100644 topi/include/topi/generic/injective.h create mode 100644 topi/include/topi/nn/batch_norm.h create mode 100644 topi/include/topi/nn/bnn.h create mode 100644 topi/include/topi/nn/dense.h create mode 100644 topi/include/topi/nn/dilate.h create mode 100644 topi/include/topi/nn/flatten.h create mode 100644 topi/include/topi/nn/mapping.h create mode 100644 topi/include/topi/nn/pooling.h create mode 100644 topi/include/topi/nn/softmax.h create mode 100644 topi/include/topi/reduction.h create mode 100644 topi/include/topi/rocm/dense.h create mode 100644 topi/include/topi/transform.h create mode 100644 topi/include/topi/x86/bnn.h create mode 100644 topi/include/topi/x86/default.h create mode 100644 topi/include/topi/x86/injective.h create mode 100644 topi/python/topi/cpp.py create mode 100644 topi/src/topi.cc create mode 100644 topi/tests/python_cpp/test_topi_basic.py create mode 100644 topi/tests/python_cpp/test_topi_bnn.py create mode 100644 topi/tests/python_cpp/test_topi_broadcast.py create mode 100644 topi/tests/python_cpp/test_topi_clip.py create mode 100644 topi/tests/python_cpp/test_topi_dense.py create mode 100644 topi/tests/python_cpp/test_topi_dilate.py create mode 100644 topi/tests/python_cpp/test_topi_pooling.py create mode 100644 topi/tests/python_cpp/test_topi_reduce.py create mode 100644 topi/tests/python_cpp/test_topi_relu.py create mode 100644 topi/tests/python_cpp/test_topi_softmax.py create mode 100644 topi/tests/python_cpp/test_topi_transform.py diff --git a/CMakeLists.txt b/CMakeLists.txt index fd381b9a12e4..62a564c2b656 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,9 +44,7 @@ if(MSVC) add_definitions(-DWIN32_LEAN_AND_MEAN) add_definitions(-D_CRT_SECURE_NO_WARNINGS) add_definitions(-D_SCL_SECURE_NO_WARNINGS) - add_definitions(-DTVM_EXPORTS) add_definitions(-DHalide_SHARED) - add_definitions(-DHalide_EXPORTS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /bigobj") @@ -82,6 +80,10 @@ file(GLOB COMPILER_SRCS src/op/*.cc src/schedule/*.cc ) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/topi/include) +file(GLOB TOPI_SRCS + topi/src/*.cc +) file(GLOB_RECURSE HALIDEIR_SRCS HalideIR/src/*.cpp) list(APPEND COMPILER_SRCS ${HALIDEIR_SRCS}) file(GLOB RUNTIME_SRCS src/runtime/*.cc) @@ -209,8 +211,10 @@ endif() list(APPEND RUNTIME_SRCS ${GROUP_Include}) add_library(tvm SHARED ${COMPILER_SRCS} ${RUNTIME_SRCS}) +add_library(tvm_topi SHARED ${TOPI_SRCS}) add_library(tvm_runtime SHARED ${RUNTIME_SRCS}) target_link_libraries(tvm ${TVM_LINKER_LIBS} ${TVM_RUNTIME_LINKER_LIBS}) +target_link_libraries(tvm_topi tvm ${TVM_LINKER_LIBS} ${TVM_RUNTIME_LINKER_LIBS}) target_link_libraries(tvm_runtime ${TVM_RUNTIME_LINKER_LIBS}) install(TARGETS tvm_runtime DESTINATION lib${LIB_SUFFIX}) if (INSTALL_DEV) @@ -242,3 +246,10 @@ else(INSTALL_DEV) PATTERN "*.h" ) endif(INSTALL_DEV) + +if(MSVC) + target_compile_definitions(tvm PRIVATE -DHalide_EXPORTS) + target_compile_definitions(tvm_runtime PRIVATE -DHalide_EXPORTS) + target_compile_definitions(tvm PRIVATE -DTVM_EXPORTS) + target_compile_definitions(tvm_runtime PRIVATE -DTVM_EXPORTS) +endif() \ No newline at end of file diff --git a/Jenkinsfile b/Jenkinsfile index 4fc2285f507c..85d38d3dfcad 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -4,10 +4,11 
@@ // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/ // tvm libraries -tvm_runtime = "lib/libtvm_runtime.so, config.mk" -tvm_lib = "lib/libtvm.so, " + tvm_runtime +topi_lib = "lib/libtopi.so" +tvm_runtime = "lib/libtvm_runtime.so, config.mk, " +tvm_lib = "lib/libtvm.so, " + tvm_runtime + topi_lib // LLVM upstream lib -tvm_multilib = "lib/libtvm_llvm40.so, lib/libtvm_llvm50.so, lib/libtvm_llvm60.so, " + tvm_runtime +tvm_multilib = "lib/libtvm_llvm40.so, lib/libtvm_llvm50.so, lib/libtvm_llvm60.so, " + tvm_runtime + topi_lib // command to start a docker container docker_run = 'tests/ci_build/ci_build.sh' diff --git a/Makefile b/Makefile index 44a500d26ce3..332d6ebe86f2 100644 --- a/Makefile +++ b/Makefile @@ -58,6 +58,7 @@ OPENGL_SRC = $(wildcard src/runtime/opengl/*.cc) RPC_SRC = $(wildcard src/runtime/rpc/*.cc) GRAPH_SRC = $(wildcard src/runtime/graph/*.cc) RUNTIME_SRC = $(wildcard src/runtime/*.cc) +TOPI_SRC = $(wildcard topi/src/*.cc) # Objectives LLVM_BUILD = build/llvm${LLVM_VERSION} @@ -71,11 +72,13 @@ RPC_OBJ = $(patsubst src/%.cc, build/%.o, $(RPC_SRC)) GRAPH_OBJ = $(patsubst src/%.cc, build/%.o, $(GRAPH_SRC)) CC_OBJ = $(patsubst src/%.cc, build/%.o, $(CC_SRC)) $(LLVM_OBJ) RUNTIME_OBJ = $(patsubst src/%.cc, build/%.o, $(RUNTIME_SRC)) +TOPI_OBJ = $(patsubst topi/%.cc, build/%.o, $(TOPI_SRC)) CONTRIB_OBJ = # Deps ALL_DEP = $(CC_OBJ) $(CONTRIB_OBJ) $(LIB_HALIDEIR) RUNTIME_DEP = $(RUNTIME_OBJ) +TOPI_DEP = $(TOPI_OBJ) # Dependency specific rules ifdef CUDA_PATH @@ -198,10 +201,11 @@ else JVM_PKG_PROFILE := $(JVM_PKG_PROFILE)-cpu endif -BUILD_TARGETS ?= lib/libtvm.$(SHARED_LIBRARY_SUFFIX) lib/libtvm_runtime.$(SHARED_LIBRARY_SUFFIX) +BUILD_TARGETS ?= lib/libtvm.$(SHARED_LIBRARY_SUFFIX) lib/libtvm_runtime.$(SHARED_LIBRARY_SUFFIX) lib/libtvm_topi.$(SHARED_LIBRARY_SUFFIX) all: ${BUILD_TARGETS} runtime: lib/libtvm_runtime.$(SHARED_LIBRARY_SUFFIX) web: lib/libtvm_web_runtime.js lib/libtvm_web_runtime.bc +topi: lib/libtvm_topi.$(SHARED_LIBRARY_SUFFIX) include tests/cpp/unittest.mk @@ -226,10 +230,19 @@ build/%.o: src/%.cc $(CXX) $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d $(CXX) -c $(CFLAGS) -c $< -o $@ +build/src/%.o: topi/src/%.cc + @mkdir -p $(@D) + $(CXX) $(CFLAGS) -MM -MT build/src/$*.o $< >build/src/$*.d + $(CXX) -c $(CFLAGS) -c $< -o $@ + lib/libtvm.dylib: $(ALL_DEP) $(RUNTIME_DEP) @mkdir -p $(@D) $(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) +lib/libtvm_topi.dylib: lib/libtvm.so $(TOPI_DEP) + @mkdir -p $(@D) + $(CXX) $(CFLAGS) $(FRAMEWORKS) -L./lib -ltvm -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) + lib/libtvm_runtime.dylib: $(RUNTIME_DEP) @mkdir -p $(@D) $(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) @@ -238,6 +251,10 @@ lib/libtvm.so: $(ALL_DEP) $(RUNTIME_DEP) @mkdir -p $(@D) $(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) +lib/libtvm_topi.so: lib/libtvm.so $(TOPI_DEP) + @mkdir -p $(@D) + $(CXX) $(CFLAGS) $(FRAMEWORKS) -L./lib -ltvm -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) + lib/libtvm_runtime.so: $(RUNTIME_DEP) @mkdir -p $(@D) $(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h index ae7ca3ca681d..391901730e57 100644 --- a/include/tvm/build_module.h +++ b/include/tvm/build_module.h @@ -31,19 +31,23 @@ struct Target { std::unordered_set keys; /*! \brief Options for this target */ std::vector options; + /*! 
\brief Set of imported libs */ + std::unordered_set libs; Target(const std::string& target_name, DLDeviceType device_type, int max_num_threads, int thread_warp_size, const std::unordered_set& keys, - const std::vector& options) : + const std::vector& options, + const std::unordered_set& libs = {}) : target_name(target_name), device_type(device_type), max_num_threads(max_num_threads), thread_warp_size(thread_warp_size), keys(keys), - options(options) { + options(options), + libs(libs) { } /*! \return the full device string to pass to codegen::Build */ diff --git a/include/tvm/expr.h b/include/tvm/expr.h index 5d35728c24b4..d2816358be0b 100644 --- a/include/tvm/expr.h +++ b/include/tvm/expr.h @@ -73,7 +73,7 @@ inline int GetVectorBytes(Type dtype) { /*! \brief a named variable in TVM */ class Var : public HalideIR::VarExpr { public: - explicit Var(const std::string& name_hint = "v", + EXPORT explicit Var(const std::string& name_hint = "v", Type t = Int(32)) : VarExpr(name_hint, t) {} explicit Var(std::shared_ptr n) : VarExpr(n) {} explicit Var(VarExpr v) : VarExpr(v) {} diff --git a/include/tvm/ir_operator.h b/include/tvm/ir_operator.h index 8b27389db0d8..a2cd01042db5 100644 --- a/include/tvm/ir_operator.h +++ b/include/tvm/ir_operator.h @@ -45,13 +45,19 @@ TVM_DLL Expr min(Expr source, Array axis); // Unary intrinsic operators #define TVM_DECLARE_INTRIN_UNARY(OpName) \ inline Expr OpName(Expr x) { \ - return ir::Call::make(x.type(), #OpName, {x}, ir::Call::PureExtern); \ + return ir::Call::make(x.type(), #OpName, {x}, ir::Call::PureIntrinsic); \ } \ TVM_DECLARE_INTRIN_UNARY(exp); TVM_DECLARE_INTRIN_UNARY(tanh); TVM_DECLARE_INTRIN_UNARY(sigmoid); TVM_DECLARE_INTRIN_UNARY(sqrt); +TVM_DECLARE_INTRIN_UNARY(log); + +inline Expr pow(Expr x, Expr y) { + return ir::Call::make(x.type(), "pow", { x, y }, ir::Call::PureIntrinsic); +} + } // namespace tvm #endif // TVM_IR_OPERATOR_H_ diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h index 525b36d1b6b1..76a060345be1 100644 --- a/include/tvm/ir_pass.h +++ b/include/tvm/ir_pass.h @@ -28,7 +28,7 @@ namespace ir { * \param vrange The range information about the variable. * \return Canonicalized statement. */ -Expr Simplify(Expr expr, Map vrange = Map()); +EXPORT Expr Simplify(Expr expr, Map vrange = Map()); /*! * \brief Simplify the statement. @@ -62,7 +62,7 @@ Expr CanonicalSimplify(Expr expr, * \param rhs The right operand * \return The comparison result. */ -bool Equal(const Expr& lhs, const Expr& rhs); +EXPORT bool Equal(const Expr& lhs, const Expr& rhs); /*! * \brief Deep compare lhs and rhs diff --git a/include/tvm/operation.h b/include/tvm/operation.h index 8242bfbeefb4..d598df8d21b1 100644 --- a/include/tvm/operation.h +++ b/include/tvm/operation.h @@ -353,7 +353,7 @@ class ExternOpNode : public OperationNode { v->Visit("inputs", &inputs); v->Visit("body", &body); } - static Operation make(std::string name, + EXPORT static Operation make(std::string name, std::string tag, Array inputs, Array input_placeholders, diff --git a/include/tvm/schedule.h b/include/tvm/schedule.h index 2f94aedccf3d..003555132789 100644 --- a/include/tvm/schedule.h +++ b/include/tvm/schedule.h @@ -56,24 +56,24 @@ class Stage : public NodeRef { * \brief set the memory scope of the stage * \param scope The memory scope. */ - Stage& set_scope(std::string scope); // NOLINT(*) + EXPORT Stage& set_scope(std::string scope); // NOLINT(*) /*! * \brief specify the schedule to be computed at the parent schedule's scope. * \param parent The parent schedule. 
* \param scope The iteration point to carry the schedule. * \return reference to self. */ - Stage& compute_at(Stage parent, IterVar scope); // NOLINT(*) + EXPORT Stage& compute_at(Stage parent, IterVar scope); // NOLINT(*) /*! * \brief Compute the function inline. * \return reference to self. */ - Stage& compute_inline(); // NOLINT(*) + EXPORT Stage& compute_inline(); // NOLINT(*) /*! * \brief Compute the function at group root. * \return reference to self. */ - Stage& compute_root(); // NOLINT(*) + EXPORT Stage& compute_root(); // NOLINT(*) /*! * \brief Bind the ivar to thread index. * @@ -92,7 +92,7 @@ class Stage : public NodeRef { * \param predicate The condition to be checked. * \return reference to self. */ - Stage& set_store_predicate(Expr predicate); + EXPORT Stage& set_store_predicate(Expr predicate); /*! * \brief Specify environment threads that launched around the group's scope. * This can only be used in group stage. @@ -101,7 +101,7 @@ class Stage : public NodeRef { * This is a beta feature. * \return reference to self. */ - Stage& env_threads(Array threads); + EXPORT Stage& env_threads(Array threads); /*! * \brief Split the parent by factor, generate * \param parent The parent iteration domain. @@ -120,7 +120,7 @@ class Stage : public NodeRef { * \param p_inner The result inner domain. * \return reference to self. */ - Stage& split_by_nparts(IterVar parent, Expr nparts, IterVar* p_outer, IterVar* p_inner); // NOLINT(*) + EXPORT Stage& split_by_nparts(IterVar parent, Expr nparts, IterVar* p_outer, IterVar* p_inner); // NOLINT(*) /*! * \brief Fuse the inner outer domain to the target * \param outer The outer domain to be fused. @@ -128,13 +128,13 @@ class Stage : public NodeRef { * \param p_target The result target domain. * \return reference to self. */ - Stage& fuse(IterVar outer, IterVar inner, IterVar* p_target); // NOLINT(*) + EXPORT Stage& fuse(IterVar outer, IterVar inner, IterVar* p_target); // NOLINT(*) /*! * \brief Reorder the iteration * \param order The order of iteration variable. * \return reference to self. */ - Stage& reorder(const Array& order); // NOLINT(*) + EXPORT Stage& reorder(const Array& order); // NOLINT(*) /*! * \brief Perform tiling on two dimensions * The final loop order from outmost to inner most are @@ -150,7 +150,7 @@ class Stage : public NodeRef { * \param p_y_inner Inner axis of y dimension * \return reference to self. */ - Stage& tile(IterVar x_parent, IterVar y_parent, // NOLINT(*) + EXPORT Stage& tile(IterVar x_parent, IterVar y_parent, // NOLINT(*) Expr x_factor, Expr y_factor, IterVar* p_x_outer, IterVar* p_y_outer, IterVar* p_x_inner, IterVar* p_y_inner); @@ -159,7 +159,7 @@ class Stage : public NodeRef { * \param var The axis to be vectorized. * \return reference to self. */ - Stage& vectorize(IterVar var); // NOLINT(*) + EXPORT Stage& vectorize(IterVar var); // NOLINT(*) /*! * \brief Replace computation of the current stage by tensor intrinsic f. * \param var The axis marks beginning of tensorization. @@ -167,19 +167,19 @@ class Stage : public NodeRef { * \param f The Tensor compute intrinsics. * \return reference to self. */ - Stage& tensorize(IterVar var, TensorIntrin f); // NOLINT(*) + EXPORT Stage& tensorize(IterVar var, TensorIntrin f); // NOLINT(*) /*! * \brief Unroll iteration. * \param var The axis to be unrolled. * \return reference to self. */ - Stage& unroll(IterVar var); // NOLINT(*) + EXPORT Stage& unroll(IterVar var); // NOLINT(*) /*! * \brief Parallelize iteration. * \param var The axis to be parallelized. 
* \return reference to self. */ - Stage& parallel(IterVar var); // NOLINT(*) + EXPORT Stage& parallel(IterVar var); // NOLINT(*) /*! * \brief Annotate the iteration with pragma * @@ -188,7 +188,7 @@ class Stage : public NodeRef { * * \return reference to self. */ - Stage& pragma(IterVar var, const std::string& pragma_type); // NOLINT(*) + EXPORT Stage& pragma(IterVar var, const std::string& pragma_type); // NOLINT(*) /*! * \brief Fetch data in advance. * \param domain the tensor to be prefetched @@ -196,7 +196,7 @@ class Stage : public NodeRef { * \param offset the number of iterations be to fetched in advance * \return reference to self */ - Stage& prefetch(const Tensor &domain, IterVar var, Expr offset); //NOLINT(*) + EXPORT Stage& prefetch(const Tensor &domain, IterVar var, Expr offset); //NOLINT(*) /*! * \brief Set alignment requirement for specific dimension. * @@ -207,12 +207,12 @@ class Stage : public NodeRef { * \param offset The required offset factor. * \return reference to self */ - Stage& storage_align(IterVar axis, int factor, int offset); //NOLINT(*) + EXPORT Stage& storage_align(IterVar axis, int factor, int offset); //NOLINT(*) /*! * \brief Compute current stage with double buffering. * \return reference to self. */ - Stage& double_buffer(); // NOLINT(*) + EXPORT Stage& double_buffer(); // NOLINT(*) /*! * \brief Schedule for OpenGL fragment shader. * \return reference to self. @@ -271,7 +271,7 @@ class Schedule : public NodeRef { * \param include_inputs Whether include inputs if they are reachable from outputs. * \return The new grouped stage. */ - Stage create_group(const Array& outputs, + EXPORT Stage create_group(const Array& outputs, const Array& inputs, bool include_inputs = false); /*! @@ -283,7 +283,7 @@ class Schedule : public NodeRef { * \param readers The readers to redirect to the tensor. * \return The created tensor. */ - Tensor cache_read(const Tensor& tensor, + EXPORT Tensor cache_read(const Tensor& tensor, const std::string& scope, const Array& readers); /*! @@ -302,7 +302,7 @@ class Schedule : public NodeRef { * \param scope The scope of the storage. * \return The created tensor. */ - Tensor cache_write(const Tensor& tensor, const std::string& scope); + EXPORT Tensor cache_write(const Tensor& tensor, const std::string& scope); /*! * \brief Factor a reduction axis in tensor's schedule to be an explicit axis. * This will create a new stage that generated the new tensor with axis @@ -315,7 +315,7 @@ class Schedule : public NodeRef { * \param axis The reduction axis in tensor's schedule to be factored. * \return The created factored tensors. */ - Array rfactor(const Tensor& tensor, + EXPORT Array rfactor(const Tensor& tensor, const IterVar& axis); /*! * \brief Normalize the schedule. diff --git a/include/tvm/schedule_pass.h b/include/tvm/schedule_pass.h index 189b999a253d..719448513fb8 100644 --- a/include/tvm/schedule_pass.h +++ b/include/tvm/schedule_pass.h @@ -48,7 +48,7 @@ void AutoInlineElemWise(Schedule sch); * * \param sch The schedule to be inlined. */ -void AutoInlineInjective(Schedule sch); +EXPORT void AutoInlineInjective(Schedule sch); } // namespace schedule } // namespace tvm diff --git a/python/tvm/_ffi/function.py b/python/tvm/_ffi/function.py index b89da713a1ad..2edb355fb721 100644 --- a/python/tvm/_ffi/function.py +++ b/python/tvm/_ffi/function.py @@ -261,9 +261,12 @@ def _init_api(namespace): mod : str The name of the module. 
""" - module = sys.modules[namespace] assert namespace.startswith("tvm.") prefix = namespace[4:] + _init_api_prefix(namespace, prefix) + +def _init_api_prefix(module_name, prefix): + module = sys.modules[module_name] for name in list_global_func_names(): if prefix == "api": diff --git a/python/tvm/_ffi/libinfo.py b/python/tvm/_ffi/libinfo.py index f3ed174c0d44..46d5f536a94e 100644 --- a/python/tvm/_ffi/libinfo.py +++ b/python/tvm/_ffi/libinfo.py @@ -2,9 +2,9 @@ from __future__ import absolute_import import sys import os +import warnings - -def find_lib_path(name=None, search_path=None): +def find_lib_path(name=None, search_path=None, optional=False): """Find dynamic library files. Parameters @@ -56,7 +56,12 @@ def find_lib_path(name=None, search_path=None): else: dll_path.append(search_path) if name is not None: - lib_dll_path = [os.path.join(p, name) for p in dll_path] + if isinstance(name, list): + lib_dll_path = [] + for n in name: + lib_dll_path += [os.path.join(p, n) for p in dll_path] + else: + lib_dll_path = [os.path.join(p, name) for p in dll_path] runtime_dll_path = [] else: if sys.platform.startswith('win32'): @@ -81,9 +86,14 @@ def find_lib_path(name=None, search_path=None): lib_found = [p for p in runtime_dll_path if os.path.exists(p) and os.path.isfile(p)] if not lib_found: - raise RuntimeError('Cannot find the files.\n' + - 'List of candidates:\n' + - str('\n'.join(lib_dll_path + runtime_dll_path))) + message = ('Cannot find the files.\n' + + 'List of candidates:\n' + + str('\n'.join(lib_dll_path + runtime_dll_path))) + if not optional: + raise RuntimeError(message) + else: + warnings.warn(message) + return None if use_runtime: sys.stderr.write("Loading runtime library %s... exec only\n" % lib_found[0]) diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index df71a4a41bec..53aef46a9751 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -85,25 +85,25 @@ namespace target { Target llvm() { std::unordered_set keys({ "llvm", "cpu" }); std::vector options; - return Target("llvm", kDLCPU, 512, 1, keys, options); + return Target("llvm", kDLCPU, 512, 1, keys, options, {}); } Target cuda() { std::unordered_set keys({ "cuda", "gpu" }); std::vector options; - return Target("cuda", kDLGPU, 512, 32, keys, options); + return Target("cuda", kDLGPU, 512, 32, keys, options, {}); } Target rocm() { std::unordered_set keys({ "rocm", "gpu" }); std::vector options; - return Target("rocm", kDLROCM, 256, 1, keys, options); + return Target("rocm", kDLROCM, 256, 1, keys, options, {}); } Target metal() { std::unordered_set keys({ "gpu" }); std::vector options; - return Target("metal", kDLMetal, 256, 1, keys, options); + return Target("metal", kDLMetal, 256, 1, keys, options, {}); } Target rasp() { @@ -114,7 +114,7 @@ Target rasp() { "-mcpu=cortex-a53", "-mattr=+neon" }); - return Target("llvm", kDLCPU, 512, 1, keys, options); + return Target("llvm", kDLCPU, 512, 1, keys, options, {}); } Target mali() { @@ -129,7 +129,7 @@ Target mali() { Target stackvm() { std::unordered_set keys({ "stackvm", "cpu" }); std::vector options; - return Target("stackvm", kDLCPU, 512, 1, keys, options); + return Target("stackvm", kDLCPU, 512, 1, keys, options, {}); } } // namespace target diff --git a/src/lang/ir_operator.cc b/src/lang/ir_operator.cc index f06ccc060165..ded27bbdce7e 100644 --- a/src/lang/ir_operator.cc +++ b/src/lang/ir_operator.cc @@ -4,6 +4,7 @@ */ #include #include +#include namespace tvm { diff --git a/tests/scripts/task_cpp_topi.sh 
b/tests/scripts/task_cpp_topi.sh new file mode 100644 index 000000000000..c005b0e56ef4 --- /dev/null +++ b/tests/scripts/task_cpp_topi.sh @@ -0,0 +1,4 @@ +export PYTHONPATH=python:topi/python + +python -m nose -v topi/tests/python_cpp || exit -1 +python3 -m nose -v topi/tests/python_cpp || exit -1 diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh index bb23e2df058a..24a2ac9265aa 100755 --- a/tests/travis/run_test.sh +++ b/tests/travis/run_test.sh @@ -32,6 +32,7 @@ fi if [ ${TASK} == "cpp_test" ] || [ ${TASK} == "all_test" ]; then make -f dmlc-core/scripts/packages.mk gtest ./tests/scripts/task_cpp_unittest.sh || exit -1 + ./tests/scripts/task_cpp_topi.sh || exit -1 fi if [ ${TASK} == "python_test" ] || [ ${TASK} == "all_test" ]; then diff --git a/topi/include/topi/broadcast.h b/topi/include/topi/broadcast.h index ff0c5ce4ded1..14f09d893f67 100644 --- a/topi/include/topi/broadcast.h +++ b/topi/include/topi/broadcast.h @@ -33,7 +33,7 @@ inline tvm::Tensor broadcast_to(const tvm::Tensor& t, << output_shape << "\nvs\ninput: " << t; auto bh = detail::BroadcastShape(output_shape, t->shape); CHECK_EQ(output_shape.size(), bh.common_shape.size()); - for (int i = 0; i < output_shape.size(); ++i) { + for (size_t i = 0; i < output_shape.size(); ++i) { CHECK(tvm::ir::Equal(output_shape[i], bh.common_shape[i])); } auto l = [&](tvm::Array ovars) { @@ -147,6 +147,67 @@ inline tvm::Tensor broadcast_mod(const tvm::Tensor& A, return detail::WithBroadcast(l, A, B, name, tag); } +/*! +* \brief Creates an operation that performs pointwise maximum of 2 tensors +* and broadcasts them into a common compatible shape where necessary, +* according to numpy's rules +* +* \param A The first tensor +* \param B The second tensor +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is a pointwise maximum with broadcast +*/ +inline tvm::Tensor broadcast_maximum(const tvm::Tensor& A, + const tvm::Tensor& B, + std::string name = "tensor", + std::string tag = kBroadcast) { + auto l = [&](tvm::Expr a, tvm::Expr b) { return tvm::max(a, b); }; // NOLINT(*) + return detail::WithBroadcast(l, A, B, name, tag); +} + +/*! +* \brief Creates an operation that performs pointwise minimum of 2 tensors +* and broadcasts them into a common compatible shape where necessary, +* according to numpy's rules +* +* \param A The first tensor +* \param B The second tensor +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is a pointwise minimum with broadcast +*/ +inline tvm::Tensor broadcast_minimum(const tvm::Tensor& A, + const tvm::Tensor& B, + std::string name = "tensor", + std::string tag = kBroadcast) { + auto l = [&](tvm::Expr a, tvm::Expr b) { return tvm::min(a, b); }; // NOLINT(*) + return detail::WithBroadcast(l, A, B, name, tag); +} + +/*! 
+* \brief Creates an operation that raises one tensor to the power of another +* pointwise and broadcasts them into a common compatible shape where necessary, +* according to numpy's rules +* +* \param A The first tensor +* \param B The second tensor to compute pow(A, B) +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is a pointwise pow with +* broadcast +*/ +inline tvm::Tensor broadcast_pow(const tvm::Tensor& A, + const tvm::Tensor& B, + std::string name = "tensor", + std::string tag = kBroadcast) { + auto l = [&](tvm::Expr a, tvm::Expr b) { return tvm::pow(a, b); }; + return detail::WithBroadcast(l, A, B, name, tag); +} + } // namespace topi #endif // TOPI_BROADCAST_H_ diff --git a/topi/include/topi/contrib/cublas.h b/topi/include/topi/contrib/cublas.h new file mode 100644 index 000000000000..0ce09e313d37 --- /dev/null +++ b/topi/include/topi/contrib/cublas.h @@ -0,0 +1,49 @@ +/*! + * Copyright (c) 2017 by Contributors + * \brief External function interface to cuBLAS libraries + * \file tags.h + */ +#ifndef TOPI_CONTRIB_CUBLAS_H_ +#define TOPI_CONTRIB_CUBLAS_H_ + +#include "tvm/tvm.h" +#include "topi/detail/extern.h" + +namespace topi { +namespace contrib { +using namespace tvm; +using namespace topi::detail; +/*! +* \brief Create an op that multiplies lhs and rhs with cuBLAS +* +* \param lhs The left matrix operand +* \param rhs The right matrix operand +* \param transa Whether to transpose lhs +* \param transb Whether to transpose rhs +* +* \return The output tensor +*/ +inline Tensor cublas_matmul(const Tensor& lhs, + const Tensor& rhs, + bool transa, + bool transb) { + auto n = transa ? lhs->shape[1] : lhs->shape[0]; + auto m = transb ? rhs->shape[0] : rhs->shape[1]; + + return make_extern( + { { n, m } }, { lhs->dtype }, { lhs, rhs }, + [&](Array ins, Array outs) { + return call_packed({ + Expr("tvm.contrib.cublas.matmul"), + pack_buffer(ins[0]), + pack_buffer(ins[1]), + pack_buffer(outs[0]), + transa, + transb }); + }, "C", "")[0]; +} + +} // namespace contrib +} // namespace topi + +#endif // TOPI_CONTRIB_CUBLAS_H_ diff --git a/topi/include/topi/contrib/rocblas.h b/topi/include/topi/contrib/rocblas.h new file mode 100644 index 000000000000..dbadaf10b60b --- /dev/null +++ b/topi/include/topi/contrib/rocblas.h @@ -0,0 +1,48 @@ +/*! + * Copyright (c) 2017 by Contributors + * \brief External function interface to rocBLAS libraries + * \file tags.h + */ +#ifndef TOPI_CONTRIB_ROCBLAS_H_ +#define TOPI_CONTRIB_ROCBLAS_H_ + +#include "tvm/tvm.h" +#include "topi/detail/extern.h" + +namespace topi { +namespace contrib { +using namespace tvm; +/*! +* \brief Create an op that multiplies lhs and rhs with rocBLAS +* +* \param lhs The left matrix operand +* \param rhs The right matrix operand +* \param transa Whether to transpose lhs +* \param transb Whether to transpose rhs +* +* \return The output tensor +*/ +inline Tensor rocblas_matmul(const Tensor& lhs, + const Tensor& rhs, + bool transa, + bool transb) { + auto n = transa ? lhs->shape[1] : lhs->shape[0]; + auto m = transb ? 
rhs->shape[0] : rhs->shape[1]; + + return make_extern( + { { n, m } }, { lhs->dtype }, { lhs, rhs }, + [&](Array ins, Array outs) { + return call_packed({ + Expr("tvm.contrib.rocblas.matmul"), + pack_buffer(ins[0]), + pack_buffer(ins[1]), + pack_buffer(outs[0]), + transa, + transb }); + }, "C", "")[0]; +} + +} // namespace contrib +} // namespace topi + +#endif // TOPI_CONTRIB_ROCBLAS_H_ diff --git a/topi/include/topi/cuda/dense.h b/topi/include/topi/cuda/dense.h new file mode 100644 index 000000000000..3504ef3bfd17 --- /dev/null +++ b/topi/include/topi/cuda/dense.h @@ -0,0 +1,134 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file cuda/dense.h +* \brief CUDA schedule for dense operation +*/ +#ifndef TOPI_CUDA_DENSE_H_ +#define TOPI_CUDA_DENSE_H_ + +#include "tvm/tvm.h" +#include "tvm/build_module.h" +#include "topi/tags.h" +#include "topi/detail/array_utils.h" +#include "topi/nn/dense.h" +#include "topi/contrib/cublas.h" +#include "topi/generic/extern.h" + +namespace topi { +using namespace tvm; + +namespace cuda { +/*! +* \brief Implementation of dense for CUDA backend +* +* \param target The target device +* \param data Tensor with shape [batch, in_dim] +* \param weight Tensor with shape [out_dim, in_dim] +* \param bias Tensor with shape [out_dim] (optional) +* +* \return Tensor with shape [batch, out_dim] +*/ +inline tvm::Tensor dense_cuda(const Target& target, + const tvm::Tensor& data, + const tvm::Tensor& weight, + tvm::Tensor* bias) { + CHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data"; + CHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight"; + if (bias != nullptr) { + CHECK_EQ((*bias)->shape.size(), 1) << "dense requires 1-D bias"; + } + + auto batch = data->shape[0]; + auto in_dim = data->shape[1]; + auto out_dim = weight->shape[0]; + + if (target.libs.count("cublas") > 0) { + auto mm = topi::contrib::cublas_matmul(data, weight, false, true); + if (bias != nullptr) { + auto bias_val = *bias; + mm = tvm::compute({ batch, out_dim }, + [&](Var i, Var j) { + return mm(i, j) + bias_val(j); + }, "tensor", kBroadcast); + } + + return mm; + } else { + return topi::nn::dense(data, weight, bias); + } +} + +/*! +* \brief Create a CUDA schedule for dense +* +* \param target The target to generate a schedule for. +* \param outs The output tensors. +* +* \return A schedule for the given ops. 
+*/ +Schedule schedule_dense(const Target &target, const Array& outs) { + if (target.target_name == "cuda" && + target.libs.count("cublas") > 0) { + return topi::generic::schedule_extern(target, outs); + } + + Array out_ops; + for (auto t : outs) { + out_ops.push_back(t->op); + } + auto s = create_schedule(out_ops); + + auto _schedule = [&](const Tensor& dense) { + auto num_thread = 64; + auto k = dense->op.as()->reduce_axis[0]; + IterVar ko, kf; + s[dense].split(k, num_thread, &ko, &kf); + auto dense_f = s.rfactor(dense, kf)[0]; + + Tensor out; + if (contains(s->outputs, dense->op)) { + out = dense; + } else { + out = outs[0]->op.output(0); + s[dense].compute_at(s[out], s[out]->op.as()->axis[1]); + } + s[out].bind(s[out]->op.as()->axis[0], tvm::thread_axis(Range(), "blockIdx.y")); + s[out].bind(s[out]->op.as()->axis[1], tvm::thread_axis(Range(), "blockIdx.x")); + + auto tx = s[dense]->op.as()->reduce_axis[0]; + auto thread_x = tvm::thread_axis(Range(), "threadIdx.x"); + s[dense].bind(tx, thread_x); + s[dense_f].compute_at(s[dense], tx); + s[dense].set_store_predicate(static_cast(thread_x) == 0); + s[out].set_store_predicate(static_cast(thread_x) == 0); + }; + + std::function traverse; + traverse = [&](const Operation& op) { + // Inline all one-to-one-mapping operators except the last stage (output) + if (is_broadcast(op->tag)) { + if (!contains(s->outputs, op)) { + s[op].compute_inline(); + } + for (auto tensor : op->InputTensors()) { + if (tensor->op->InputTensors().size() > 0) { + traverse(tensor->op); + } + } + } else if (op->tag == "dense") { + // If tag starts with global_pool + auto dense = op.output(0); + _schedule(dense); + } else { + LOG(ERROR) << "Unsupported operator " << op->tag; + } + }; + + traverse(outs[0]->op); + return s; +} + +} // namespace cuda +} // namespace topi +#endif // TOPI_CUDA_DENSE_H_ + diff --git a/topi/include/topi/cuda/extern.h b/topi/include/topi/cuda/extern.h new file mode 100644 index 000000000000..27380b3aa77d --- /dev/null +++ b/topi/include/topi/cuda/extern.h @@ -0,0 +1,68 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file cuda/extern.h +* \brief CUDA schedule for extern followed by injective operations +*/ +#ifndef TOPI_CUDA_EXTERN_H_ +#define TOPI_CUDA_EXTERN_H_ + +#include "topi/tags.h" +#include "topi/detail/fuse.h" +#include "tvm/tvm.h" +#include "tvm/build_module.h" + +namespace topi { +using namespace tvm; + +namespace cuda { +/*! + * \brief Schedule a given operation representing one of the outputs of an + * external function which is followed by injective operations. + * + * \param target The target to generate a schedule for. + * \param op The operation representing the output followed by injective operations. + * \param sch The schedule to apply this scheduling to + * + * \return The schedule given by sch + */ +Schedule ScheduleOutputForExtern(Target target, Operation op, Schedule sch) { + auto x = op.output(0); + auto fused = Fuse(sch[x], sch[x]->op.as()->axis); + auto num_thread = target.max_num_threads; + IterVar bx, tx; + sch[x].split(fused, num_thread, &bx, &tx); + sch[x].bind(bx, tvm::thread_axis(Range(), "blockIdx.x")); + sch[x].bind(tx, tvm::thread_axis(Range(), "threadIdx.x")); + return sch; +} + +/*! +* \brief Schedule an extern op followed by injective operations. +* For example, cudnn kernel + bias add + relu +* +* \param target The target to generate a schedule for. +* \param outs The output tensors. +* +* \return A schedule for the op. 
+*/ +Schedule schedule_extern(const Target& target, Array outs) { + Array out_ops; + for (auto t : outs) { + out_ops.push_back(t->op); + } + auto s = create_schedule(out_ops); + + tvm::schedule::AutoInlineInjective(s); + for (auto out : outs) { + if (out->op->derived_from()) { + continue; + } + ScheduleOutputForExtern(target, out->op, s); + } + + return s; +} + +} // namespace cuda +} // namespace topi +#endif // TOPI_CUDA_EXTERN_H_ diff --git a/topi/include/topi/cuda/injective.h b/topi/include/topi/cuda/injective.h new file mode 100644 index 000000000000..31ff5e5383d1 --- /dev/null +++ b/topi/include/topi/cuda/injective.h @@ -0,0 +1,58 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file cuda/injective.h +* \brief CUDA schedule for injective operations +*/ +#ifndef TOPI_CUDA_INJECTIVE_H_ +#define TOPI_CUDA_INJECTIVE_H_ + +#include "topi/tags.h" +#include "topi/detail/fuse.h" +#include "tvm/tvm.h" +#include "tvm/build_module.h" + +namespace topi { +using namespace tvm; + +namespace cuda { +/*! +* \brief Schedule a given injective operation. +* +* \param target The target to generate a schedule for. +* \param op The operation representing the injective operation. +* \param s The schedule to apply this scheduling to +*/ +void ScheduleInjectiveOp(const Target &target, Operation op, Schedule s) { + auto x = op.output(0); + auto fused = Fuse(s[x], s[x]->op.as()->axis); + auto num_thread = target.max_num_threads; + IterVar bx, tx; + s[x].split(fused, num_thread, &bx, &tx); + s[x].bind(bx, thread_axis(Range(), "blockIdx.x")); + s[x].bind(tx, thread_axis(Range(), "threadIdx.x")); +} + +/*! + * \brief Create a CUDA schedule for the given output tensors. + * + * \param target The target to generate a schedule for. + * \param outs The output tensors. + * + * \return A schedule for the given ops. + */ +Schedule schedule_injective(const Target &target, const Array& outs) { + Array out_ops; + for (auto t : outs) { + out_ops.push_back(t->op); + } + auto s = create_schedule(out_ops); + tvm::schedule::AutoInlineInjective(s); + for (auto out : outs) { + ScheduleInjectiveOp(target, out->op, s); + } + return s; +} + +} // namespace cuda +} // namespace topi +#endif // TOPI_CUDA_INJECTIVE_H_ diff --git a/topi/include/topi/cuda/pooling.h b/topi/include/topi/cuda/pooling.h new file mode 100644 index 000000000000..934e845e335d --- /dev/null +++ b/topi/include/topi/cuda/pooling.h @@ -0,0 +1,163 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file cuda/pooling.h +* \brief CUDA schedule for pooling operations +*/ +#ifndef TOPI_CUDA_POOLING_H_ +#define TOPI_CUDA_POOLING_H_ + +#include "topi/tags.h" +#include "topi/detail/fuse.h" +#include "topi/detail/array_utils.h" +#include "tvm/tvm.h" +#include "tvm/build_module.h" + +namespace topi { +using namespace tvm; + +namespace cuda { + +/*! +* \brief Create a CUDA schedule for pool +* +* \param target The target to generate a schedule for. +* \param outs The output tensors. +* +* \return A schedule for the given ops. 
+*/ +Schedule schedule_pool(const Target &target, const Array& outs) { + Array out_ops; + for (auto t : outs) { + out_ops.push_back(t->op); + } + auto s = create_schedule(out_ops); + + auto _schedule = [&](const Tensor& padded_input, const Tensor& pool) { + s[padded_input].compute_inline(); + auto num_thread = target.max_num_threads; + Tensor out; + Tensor OL; + if (contains(s->outputs, pool->op)) { + out = pool; + OL = s.cache_write(pool, "local"); + } else { + out = outs[0]->op.output(0); + s[pool].set_scope("local"); + } + auto fused = Fuse(s[out], s[out]->op.as()->axis); + IterVar bx, tx; + s[out].split(fused, num_thread, &bx, &tx); + s[out].bind(bx, tvm::thread_axis(Range(), "blockIdx.x")); + s[out].bind(tx, tvm::thread_axis(Range(), "threadIdx.x")); + if (contains(s->outputs, pool->op)) { + s[OL].compute_at(s[out], tx); + } else { + s[pool].compute_at(s[out], tx); + } + }; + + std::function traverse; + traverse = [&](const Operation& op) { + // Inline all one-to-one-mapping operators except the last stage (output) + if (is_broadcast(op->tag)) { + if (!contains(s->outputs, op)) { + s[op].compute_inline(); + } + for (auto tensor : op->InputTensors()) { + if (tensor->op->InputTensors().size() > 0) { + traverse(tensor->op); + } + } + } else if (op->tag.rfind("pool", 0) == 0) { + // If tag starts with pool + auto padded_input = op->InputTensors()[0]; + auto pool = op.output(0); + _schedule(padded_input, pool); + } else { + LOG(ERROR) << "Unsupported operator " << op->tag; + } + }; + + traverse(outs[0]->op); + return s; +} + +/*! +* \brief Create a CUDA schedule for global_pool +* +* \param target The target to generate a schedule for. +* \param outs The output tensors. +* +* \return A schedule for the given ops. +*/ +Schedule schedule_global_pool(const Target &target, const Array& outs) { + Array out_ops; + for (auto t : outs) { + out_ops.push_back(t->op); + } + auto s = create_schedule(out_ops); + + auto _schedule = [&](const Tensor& pool) { + auto num_thread = 8; + auto block_x = tvm::thread_axis(Range(), "blockIdx.x"); + auto block_y = tvm::thread_axis(Range(), "blockIdx.y"); + auto thread_x = tvm::thread_axis(Range(0, num_thread), "threadIdx.x"); + auto thread_y = tvm::thread_axis(Range(0, num_thread), "threadIdx.y"); + Tensor out; + Tensor OL; + if (contains(s->outputs, pool->op)) { + out = pool; + OL = s.cache_write(pool, "local"); + } else { + out = outs[0]->op.output(0); + s[pool].set_scope("local"); + } + + auto i = s[out]->op.as()->axis[0]; + auto c = s[out]->op.as()->axis[1]; + + IterVar by, ty; + s[out].split(i, num_thread, &by, &ty); + IterVar bx, tx; + s[out].split(c, num_thread, &bx, &tx); + s[out].reorder({ by, bx, ty, tx }); + s[out].bind(ty, thread_y); + s[out].bind(tx, thread_x); + s[out].bind(by, block_y); + s[out].bind(bx, block_x); + + if (contains(s->outputs, pool->op)) { + s[OL].compute_at(s[out], tx); + } else { + s[pool].compute_at(s[out], tx); + } + }; + + std::function traverse; + traverse = [&](const Operation& op) { + // Inline all one-to-one-mapping operators except the last stage (output) + if (is_broadcast(op->tag)) { + if (!contains(s->outputs, op)) { + s[op].compute_inline(); + } + for (auto tensor : op->InputTensors()) { + if (tensor->op->InputTensors().size() > 0) { + traverse(tensor->op); + } + } + } else if (op->tag.rfind("global_pool", 0) == 0) { + // If tag starts with global_pool + auto pool = op.output(0); + _schedule(pool); + } else { + LOG(ERROR) << "Unsupported operator " << op->tag; + } + }; + + traverse(outs[0]->op); + return s; +} + +} // 
namespace cuda +} // namespace topi +#endif // TOPI_CUDA_POOLING_H_ diff --git a/topi/include/topi/cuda/reduction.h b/topi/include/topi/cuda/reduction.h new file mode 100644 index 000000000000..554224a9036c --- /dev/null +++ b/topi/include/topi/cuda/reduction.h @@ -0,0 +1,181 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file cuda/reduction.h +* \brief CUDA schedule for reduction operations +*/ +#ifndef TOPI_CUDA_REDUCTION_H_ +#define TOPI_CUDA_REDUCTION_H_ + +#include "topi/tags.h" +#include "topi/detail/fuse.h" +#include "tvm/tvm.h" +#include "tvm/build_module.h" + +namespace topi { +using namespace tvm; + +namespace cuda { +/*! + * \brief Schedule a given reduce operation. + * + * \param target The target to generate a schedule for. + * \param op The operation representing the injective operation. + * \param sch The schedule to apply this scheduling to + * \param is_idx_reduce Pass true to schedule a reduce op that returns + * an index, such as argmax or argmin. + * + * \return The schedule given by sch +*/ +Schedule ScheduleReduce(const Target& target, + Operation op, + Schedule sch, + bool is_idx_reduce = false) { + Tensor data_out; + Tensor data_in; + + if (!is_idx_reduce) { + data_in = op->InputTensors()[0]; + data_out = op.output(0); + } else { + data_out = op->InputTensors()[0]; + } + + auto out_stage = sch[data_out]; + CHECK_GT(out_stage->op.as()->reduce_axis.size(), 0) << + "reduce_axis must be greater than zero"; + + bool all_reduce; + int num_thread; + IterVar block_x, thread_x, thread_y; + + if (out_stage->op.as()->axis.size() > 0) { + all_reduce = false; + num_thread = 32; + if (target.target_name == "opencl") { + // Without this, CL_INVALID_WORK_GROUP_SIZE occurs with python tests. + // Don't know why. + num_thread = 16; + } + block_x = tvm::thread_axis(Range(), "blockIdx.x"); + thread_x = tvm::thread_axis(Range(0, num_thread), "threadIdx.x"); + thread_y = tvm::thread_axis(Range(0, num_thread), "threadIdx.y"); + } else { + all_reduce = true; + num_thread = target.max_num_threads; + thread_x = tvm::thread_axis(Range(0, num_thread), "threadIdx.x"); + } + + auto fused_reduce = Fuse(out_stage, out_stage->op.as()->reduce_axis); + + IterVar ko, ki; + out_stage.split(fused_reduce, num_thread, &ko, &ki); + auto data_out_rf = sch.rfactor(data_out, ki)[0]; + auto tx = out_stage->op.as()->reduce_axis[0]; + out_stage.bind(tx, thread_x); + sch[data_out_rf].compute_at(out_stage, tx); + + Tensor real_output; + Tensor temp_idx_input, temp_val_input; + if (is_idx_reduce) { + real_output = op.output(0); + temp_idx_input = data_out->op.output(0); + temp_val_input = data_out->op.output(1); + } else { + real_output = data_out; + } + + auto stage_real = sch[real_output]; + if (!all_reduce) { + // Fuse and split the axis + auto fused_outer = Fuse(stage_real, stage_real->op.as()->axis); + IterVar bx, outer_in; + stage_real.split(fused_outer, num_thread, &bx, &outer_in); + + // Bind the axes to threads and blocks + stage_real.bind(outer_in, thread_y); + stage_real.bind(bx, block_x); + if (is_idx_reduce) { + sch[temp_idx_input].compute_at(stage_real, outer_in); + sch[temp_val_input].compute_at(stage_real, outer_in); + } + } else { + if (is_idx_reduce) { + sch[temp_idx_input].compute_at(stage_real, + stage_real->op.as()->axis[0]); + sch[temp_val_input].compute_at(stage_real, + stage_real->op.as()->axis[0]); + } + } + + stage_real.set_store_predicate(static_cast(thread_x) == 0); + return sch; +} + +/*! 
+ * \brief Recursively traverse operator inputs, setting injective inputs + * to be computed inline. + * + * \param s The schedule we are building + * \param op The current op in the traversal + */ +void TraverseBeforeReduce(Schedule s, Operation op) { + if (op->derived_from()) { + return; + } else if (is_injective(op->tag)) { + s[op].compute_inline(); + for (auto tensor : op->InputTensors()) { + TraverseBeforeReduce(s, tensor->op); + } + } else { + LOG(ERROR) << "Unsupported operator " << op->tag; + } +} + +/*! +* \brief Schedule a reduce op, then invoke TraverseBeforeReduce on each +* of the op's inputs. +* +* \param target The target to generate a schedule for. +* \param s The schedule we are building +* \param op The reduce op +*/ +void TraverseAfterReduce(const Target& target, Schedule s, Operation op) { + if (is_broadcast(op->tag)) { + LOG(ERROR) << "Elementwise op after reduce is not yet supported"; + } else if (op->tag == kCommReduce) { + ScheduleReduce(target, op, s, false); + for (auto tensor : op->InputTensors()) { + TraverseBeforeReduce(s, tensor->op); + } + } else if (op->tag == kCommReduceIdx) { + ScheduleReduce(target, op, s, true); + for (auto tensor : op->InputTensors()[0]->op->InputTensors()) { + TraverseBeforeReduce(s, tensor->op); + } + } else { + LOG(ERROR) << "Unsupported operator " << op->tag; + } +} + +/*! +* \brief Create a CUDA schedule for a reduce operation. +* +* \param target The target to generate a schedule for. +* \param outs The output tensors. +* +* \return A schedule for the given ops. +*/ +Schedule schedule_reduce(const Target& target, Array outs) { + CHECK_EQ(outs.size(), 1) << "outs must have size 1"; + Array out_ops; + for (auto t : outs) { + out_ops.push_back(t->op); + } + auto s = create_schedule(out_ops); + TraverseAfterReduce(target, s, outs[0]->op); + return s; +} + +} // namespace cuda +} // namespace topi +#endif // TOPI_CUDA_REDUCTION_H_ diff --git a/topi/include/topi/cuda/softmax.h b/topi/include/topi/cuda/softmax.h new file mode 100644 index 000000000000..77121c20be69 --- /dev/null +++ b/topi/include/topi/cuda/softmax.h @@ -0,0 +1,62 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file cuda/injective.h +* \brief CUDA schedule for injective operations +*/ +#ifndef TOPI_CUDA_SOFTMAX_H_ +#define TOPI_CUDA_SOFTMAX_H_ + +#include "topi/tags.h" +#include "topi/detail/fuse.h" +#include "tvm/tvm.h" +#include "tvm/build_module.h" + +namespace topi { +using namespace tvm; + +namespace cuda { + +/*! + * \brief Create a CUDA schedule for the given softmax output tensors. + * + * \param target The target to generate a schedule for. + * \param outs The output tensors. + * + * \return A schedule for the given ops. 
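+ *
+ * Example (illustrative sketch, not part of this patch; assumes a CUDA
+ * `target`, a 2-D tensor `x`, and topi/nn/softmax.h included):
+ *
+ * \code
+ *   auto y = topi::nn::softmax(x);
+ *   auto s = topi::cuda::schedule_softmax(target, { y });
+ * \endcode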
+ */ +Schedule schedule_softmax(const Target &target, const Array& outs) { + Array out_ops; + for (auto t : outs) { + out_ops.push_back(t->op); + } + auto s = create_schedule(out_ops); + + auto softmax = outs[0]; + auto max_elem = softmax->op->InputTensors()[1]; + auto expsum = softmax->op->InputTensors()[2]; + + int num_thread = 64; + auto block_x = tvm::thread_axis(Range(), "blockIdx.x"); + auto thread_x = tvm::thread_axis(Range(0, num_thread), "threadIdx.x"); + + s[max_elem].bind(max_elem->op.as()->axis[0], block_x); + + auto k = expsum->op.as()->reduce_axis[0]; + IterVar ko, ki; + s[expsum].split(k, num_thread, &ko, &ki); + auto EF = s.rfactor(expsum, ki)[0]; + s[expsum].bind(s[expsum]->op.as()->axis[0], block_x); + s[expsum].bind(s[expsum]->op.as()->reduce_axis[0], thread_x); + s[EF].compute_at(s[expsum], s[expsum]->op.as()->reduce_axis[0]); + s[expsum].set_store_predicate(thread_x->var == 0); + + IterVar tx, xi; + s[softmax].split_by_nparts(softmax->op.as()->axis[1], num_thread, &tx, &xi); + s[softmax].bind(tx, thread_x); + + return s; +} + +} // namespace cuda +} // namespace topi +#endif // TOPI_CUDA_SOFTMAX_H_ diff --git a/topi/include/topi/detail/array_utils.h b/topi/include/topi/detail/array_utils.h new file mode 100644 index 000000000000..ca94bd6a1202 --- /dev/null +++ b/topi/include/topi/detail/array_utils.h @@ -0,0 +1,35 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file array_utils.h +* \brief Utility functions for handling arrays +*/ +#ifndef TOPI_DETAIL_ARRAY_UTILS_H_ +#define TOPI_DETAIL_ARRAY_UTILS_H_ + +#include "tvm/tvm.h" + +namespace topi { +namespace detail { +using namespace tvm; + +/*! + * \brief Search an array for a specific item + * + * \param array The array to search + * \param item The item to search for + * + * \return True iff the given array contains the given item. + */ +template +bool contains(Array array, T item) { + for (auto& i : array) { + if (i == item) { + return true; + } + } + return false; +} + +} // namespace detail +} // namespace topi +#endif // TOPI_DETAIL_ARRAY_UTILS_H_ diff --git a/topi/include/topi/detail/broadcast.h b/topi/include/topi/detail/broadcast.h index 19f074661734..ba7193cd0647 100644 --- a/topi/include/topi/detail/broadcast.h +++ b/topi/include/topi/detail/broadcast.h @@ -69,10 +69,10 @@ inline tvm::Array InputIndexFromBroadcast( tvm::Array ivars; CHECK_EQ(ovars.size(), all_vars.size()); // N^2, could use a map but NBD.. - int expected_dims = T->shape.size(); - for (int i = 0; i < ovars.size(); ++i) { + size_t expected_dims = T->shape.size(); + for (size_t i = 0; i < ovars.size(); ++i) { bool found = false; - for (int j = 0; j < my_vars.size(); ++j) { + for (size_t j = 0; j < my_vars.size(); ++j) { if (all_vars[i].same_as(my_vars[j])) { ivars.push_back(ovars[i]); found = true; diff --git a/topi/include/topi/detail/constant_utils.h b/topi/include/topi/detail/constant_utils.h new file mode 100644 index 000000000000..eb3219e9b25a --- /dev/null +++ b/topi/include/topi/detail/constant_utils.h @@ -0,0 +1,70 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file constant_utils.h +* \brief Utility functions for handling constants in TVM expressions +*/ +#ifndef TOPI_DETAIL_CONSTANT_UTILS_H_ +#define TOPI_DETAIL_CONSTANT_UTILS_H_ + +#include +#include + +#include "tvm/tvm.h" + +namespace topi { +namespace detail { +using namespace tvm; + +/*! + * \brief Test whether the given Expr is a constant integer + * + * \param expr the Expr to query + * + * \return true if the given expr is a constant int or uint, false otherwise. 
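+ *
+ * For example (illustrative only):
+ *
+ * \code
+ *   Expr a = make_const(Int(32), 3);
+ *   Expr n = tvm::var("n");
+ *   IsConstInt(a);   // true,  and GetConstInt(a) == 3
+ *   IsConstInt(n);   // false, a free variable is not a constant
+ * \endcode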
+ */ +bool IsConstInt(Expr expr) { + return + expr->derived_from() || + expr->derived_from(); +} + +/*! + * \brief Get the value of the given constant integer expression. An error + * is logged if the given expression is not a constant integer. + * + * \param expr The expression to get the value of + * + * \return The integer value. + */ +int64_t GetConstInt(Expr expr) { + if (expr->derived_from()) { + return expr.as()->value; + } + if (expr->derived_from()) { + return expr.as()->value; + } + LOG(ERROR) << "expr must be a constant integer"; + return -1; +} + +/*! + * \brief Get the value of all the constant integer expressions in the given array + * + * \param exprs The array of expressions to get the values of + * \param var_name The name to be used when logging an error in the event that any + * of the expressions are not constant integers. + * + * \return A vector of the integer values + */ +std::vector GetConstIntValues(Array exprs, const std::string& var_name) { + std::vector result; + for (auto expr : exprs) { + CHECK(IsConstInt(expr)) << "All elements of " << var_name << " must be constant integers"; + result.push_back(GetConstInt(expr)); + } + return result; +} + +} // namespace detail +} // namespace topi +#endif // TOPI_DETAIL_CONSTANT_UTILS_H_ diff --git a/topi/include/topi/detail/extern.h b/topi/include/topi/detail/extern.h new file mode 100644 index 000000000000..12cc65d72530 --- /dev/null +++ b/topi/include/topi/detail/extern.h @@ -0,0 +1,137 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file detail/extern.h +* \brief Helpers for using external functions +*/ +#ifndef TOPI_DETAIL_EXTERN_H_ +#define TOPI_DETAIL_EXTERN_H_ + +#include +#include + +#include "tvm/tvm.h" + +namespace topi { +namespace detail { +using namespace tvm; + +/*! + * \brief Construct a buffer to pass to an external function + * + * \param shape The shape of the buffer + * \param dtype The type of the buffer elements + * \param name The name of the buffer + * + * \return The Buffer object + */ +Buffer DeclExternBuffer(Array shape, + Type dtype, + std::string name) { + auto data = var(name, Handle()); + auto elem_offset = Expr(); + return BufferNode::make(data, dtype, shape, Array(), elem_offset, name, "", + -1, 0); +} + +/*! + * \brief A function which constructs an Expr representing the invocation of an external + * function. The function expects two arguments: an array of Buffers holding the input + * tensor values, and a pre-allocated array of Buffers to be filled with the outputs. + */ +using FExtern = std::function, Array)>; + +/*! + * \brief Create tensors representing the result of invoking an external function. + * This function will create the necessary buffers to hold input and output tensor values. + * + * \param out_shapes An array where each element is the shape of the corresponding output tensor. + * \param out_types An array where each element is the dtype of the corresponding output tensor. + * \param inputs An array of input Tensors + * \param fextern A function that constructs an Expr representing the invocation of + * the external function given the input and output buffers. + * \param name The name of the operation + * \param tag The tag to mark the operation + * + * \return An array of Tensors representing the outputs of the function invocation. There will + * be one output Tensor for each element of out_shapes, with dtype equal to the corresponding + * element of out_types. 
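+ *
+ * Example (illustrative sketch, not part of this patch): wrapping a
+ * hypothetical PackedFunc registered as "tvm.contrib.my_func" that reads one
+ * input buffer and writes one output buffer of the same shape and dtype:
+ *
+ * \code
+ *   Tensor y = make_extern({ x->shape }, { x->dtype }, { x },
+ *     [&](Array<Buffer> ins, Array<Buffer> outs) {
+ *       return call_packed({ Expr("tvm.contrib.my_func"),
+ *                            pack_buffer(ins[0]),
+ *                            pack_buffer(outs[0]) });
+ *     }, "my_func", "")[0];
+ * \endcode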
+ */ +Array make_extern(const Array< Array >& out_shapes, + const std::vector& out_types, + const Array& inputs, + FExtern fextern, + std::string name, + std::string tag) { + CHECK_EQ(out_shapes.size(), out_types.size()) + << "make_extern: out_shapes and out_types must have equal size"; + + Array input_placeholders; + for (auto t : inputs) { + input_placeholders.push_back(DeclExternBuffer(t->shape, t->dtype, t->op->name)); + } + Array output_placeholders; + for (size_t i = 0; i < out_shapes.size(); ++i) { + output_placeholders.push_back(DeclExternBuffer(out_shapes[i], out_types[i], name)); + } + + auto body = fextern(input_placeholders, output_placeholders); + auto body_stmt = tvm::ir::Evaluate::make(body); + + auto op = ExternOpNode::make( + name, tag, inputs, input_placeholders, output_placeholders, body_stmt); + + Array outputs; + for (size_t i = 0; i < output_placeholders.size(); ++i) { + outputs.push_back(op.output(i)); + } + return outputs; +} + +/*! + * \brief This function is used to create a DLTensor structure on the stack to + * be able to pass a symbolic buffer as arguments to TVM PackedFunc + * + * \param buf The buffer to pack + * + * \return An expression representing the pack operation + */ +Expr pack_buffer(Buffer buf) { + CHECK_GT(buf->shape.size(), 0) << "buf shape must have at least one element"; + auto shape = tvm::ir::Call::make(Handle(), tvm::ir::intrinsic::tvm_stack_make_shape, + buf->shape, tvm::ir::Call::CallType::Intrinsic); + Expr strides; + if (buf->strides.size() > 0) { + strides = tvm::ir::Call::make(Handle(), tvm::ir::intrinsic::tvm_stack_make_shape, + buf->shape, tvm::ir::Call::CallType::Intrinsic); + } else { + strides = 0; + } + Array pack_args{ + buf->data, + shape, + strides, + make_const(Int(32), buf->shape.size()), + make_const(buf->dtype, 0), + buf->elem_offset + }; + return tvm::ir::Call::make(Handle(), tvm::ir::intrinsic::tvm_stack_make_array, + pack_args, tvm::ir::Call::CallType::Intrinsic); +} + +/*! + * \brief Construct an Expr representing the invocation of a PackedFunc + * + * \param args An array containing the registered name of the PackedFunc followed + * by the arguments to pass to the PackedFunc when called. The first element of the + * array must be a constant string expression. + * + * \return An expression representing the invocation + */ +Expr call_packed(Array args) { + return tvm::ir::Call::make(Int(32), tvm::ir::intrinsic::tvm_call_packed, + args, tvm::ir::Call::CallType::Intrinsic); +} + +} // namespace detail +} // namespace topi +#endif // TOPI_DETAIL_EXTERN_H_ diff --git a/topi/include/topi/detail/fuse.h b/topi/include/topi/detail/fuse.h new file mode 100644 index 000000000000..8ace8bf97fd3 --- /dev/null +++ b/topi/include/topi/detail/fuse.h @@ -0,0 +1,37 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file fuse.h +* \brief Fuse operation +*/ +#ifndef TOPI_DETAIL_FUSE_H_ +#define TOPI_DETAIL_FUSE_H_ + +#include "tvm/tvm.h" + +namespace topi { +namespace detail { +using namespace tvm; + +/*! 
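+ * A typical use, mirroring the schedules elsewhere in this patch
+ * (illustrative; `s` is a Schedule and `x` one of its output tensors):
+ *
+ * \code
+ *   auto fused = topi::detail::Fuse(s[x], s[x]->op.as<tvm::ComputeOpNode>()->axis);
+ * \endcode
+ *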
+ * \brief Fuse all of the given args + * + * \param stage The stage in which to apply the fuse + * \param args The iteration variables to be fused + * + * \return The fused iteration variable + */ +IterVar Fuse(Stage stage, const Array& args) { + CHECK_GE(args.size(), 1) << "Fuse requires at least 1 arg"; + + auto fused = args[0]; + for (size_t i = 1; i < args.size(); ++i) { + IterVar out; + stage.fuse(fused, args[i], &out); + fused = out; + } + return fused; +} + +} // namespace detail +} // namespace topi +#endif // TOPI_DETAIL_FUSE_H_ diff --git a/topi/include/topi/detail/pad_utils.h b/topi/include/topi/detail/pad_utils.h new file mode 100644 index 000000000000..e284859c1e50 --- /dev/null +++ b/topi/include/topi/detail/pad_utils.h @@ -0,0 +1,38 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file pad_utils.h +* \brief Padding helpers +*/ +#ifndef TOPI_DETAIL_PAD_UTILS_H_ +#define TOPI_DETAIL_PAD_UTILS_H_ + +#include + +#include "tvm/tvm.h" + +namespace topi { +namespace detail { +using namespace tvm; + +/*! + * \brief Get padding size for each side given padding height and width + * + * \param pad_h The amount to pad each of the top and bottom sides + * \param pad_w The amount to pad each of the left and right sides + * + * \return An array of 4 elements, representing padding sizes for + * each individual side. The array is in the order { top, left, bottom, right } + */ +Array GetPadTuple(Expr pad_h, Expr pad_w) { + pad_h *= 2; + pad_w *= 2; + + auto pad_top = (pad_h + 1) / 2; + auto pad_left = (pad_w + 1) / 2; + + return { pad_top, pad_left, pad_h - pad_top, pad_w - pad_left }; +} + +} // namespace detail +} // namespace topi +#endif // TOPI_DETAIL_PAD_UTILS_H_ diff --git a/topi/include/topi/detail/ravel_unravel.h b/topi/include/topi/detail/ravel_unravel.h new file mode 100644 index 000000000000..b9774153efc7 --- /dev/null +++ b/topi/include/topi/detail/ravel_unravel.h @@ -0,0 +1,60 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file ravel_unravel.h +* \brief Index ravel and unraval operations +*/ +#ifndef TOPI_DETAIL_RAVEL_UNRAVEL_H_ +#define TOPI_DETAIL_RAVEL_UNRAVEL_H_ + +#include + +#include "tvm/tvm.h" + +namespace topi { +namespace detail { +using namespace tvm; + +/*! +* \brief Flatten the indices to 1D +* +* \param indices The input coordinates +* \param shape Shape of the tensor +* +* \return The index after flattening +*/ +inline Expr RavelIndex(Array indices, Array shape) { + CHECK_EQ(indices.size(), shape.size()) << "indices and shape must have equal size"; + CHECK_GT(indices.size(), 0) << "indices must not be empty"; + Expr idx; + for (size_t i = 0; i < indices.size(); ++i) { + if (i == 0) { + idx = indices[i]; + } else { + idx = idx * shape[i] + indices[i]; + } + } + return idx; +} + +/*! 
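+* For example (illustrative), for a tensor of shape { 3, 4 }:
+*
+* \code
+*   Var i("i"), j("j");
+*   Expr flat = RavelIndex({ i, j }, { 3, 4 });   // i * 4 + j
+*   auto coords = UnavelIndex(flat, { 3, 4 });    // { flat / 4, flat % 4 }
+* \endcode
+*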
+* \brief Convert flattened index to coordinate array +* +* \param idx The 1D index +* \param shape Shape of the tensor +* +* \return The coordinate corresponding to the 1D index +*/ +inline Array UnavelIndex(Expr idx, Array shape) { + std::vector indices; + + for (int i = static_cast(shape.size()) - 1; i >= 0; --i) { + indices.push_back(idx % shape[i]); + idx = idx / shape[i]; + } + std::reverse(indices.begin(), indices.end()); + return indices; +} + +} // namespace detail +} // namespace topi +#endif // TOPI_DETAIL_RAVEL_UNRAVEL_H_ diff --git a/topi/include/topi/elemwise.h b/topi/include/topi/elemwise.h index e133833a790b..c3797197710b 100644 --- a/topi/include/topi/elemwise.h +++ b/topi/include/topi/elemwise.h @@ -28,6 +28,144 @@ TOPI_DECLARE_UNARY_OP(exp); TOPI_DECLARE_UNARY_OP(tanh); TOPI_DECLARE_UNARY_OP(sigmoid); TOPI_DECLARE_UNARY_OP(sqrt); +TOPI_DECLARE_UNARY_OP(log); + +/*! +* \brief Creates an operation that returns identity of a given tensor +* +* \param x The input tensor +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the identity operation +*/ +inline Tensor identity(const Tensor& x, + std::string name = "tensor", + std::string tag = kElementWise) { + return compute(x->shape, [&](const Array& i) { + return x(i); + }, name, tag); +} + +/*! +* \brief Creates an operation that returns the negation of a given tensor +* +* \param x The input tensor +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the negation operation +*/ +inline Tensor negative(const Tensor& x, + std::string name = "tensor", + std::string tag = kElementWise) { + return compute(x->shape, [&](const Array& i) { + return -x(i); + }, name, tag); +} + +/*! +* \brief Creates an operation that raises each element of tensor x to power y +* +* \param x The input tensor +* \param y The exponent +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the pow operation +*/ +inline Tensor pow(const Tensor& x, + const Expr& y, + std::string name = "tensor", + std::string tag = kElementWise) { + return compute(x->shape, [&](const Array& i) { + return tvm::pow(x(i), y); + }, name, tag); +} + +/*! +* \brief Creates an operation that performs pointwise left shift by n bits +* +* \param x The input tensor +* \param n The number of bits to shift by +* +* \return A Tensor whose op member is the left shift operation +*/ +inline Tensor operator<<(const Tensor& x, + const Expr& n) { + return compute(x->shape, [&](const Array& i) { + return x(i) << n; + }, "tensor", kElementWise); +} + +/*! +* \brief Creates an operation that performs pointwise right shift by n bits +* +* \param x The input tensor +* \param n The number of bits to shift by +* +* \return A Tensor whose op member is the right shift operation +*/ +inline Tensor operator>>(const Tensor& x, + const Expr& n) { + return compute(x->shape, [&](const Array& i) { + return x(i) >> n; + }, "tensor", kElementWise); +} + +/*! 
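+* For example, a ReLU6-style clamp of activations to [0, 6] (illustrative):
+*
+* \code
+*   auto y = topi::clip(x, 0, 6);
+* \endcode
+*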
+* \brief Creates an operation that clips each element of a tensor to +* the interval [a_min, a_max] +* +* \param x The input tensor +* \param a_min The inclusive lower bound of the interval +* \param a_max The inclusive upper bound of the interval +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the clip operation +*/ +inline Tensor clip(const Tensor& x, + const Expr& a_min, + const Expr& a_max, + std::string name = "tensor", + std::string tag = kElementWise) { + return compute(x->shape, [&](const Array& i) { + auto min_val = tvm::cast(x->dtype, a_min); + auto max_val = tvm::cast(x->dtype, a_max); + return tvm::max(tvm::min(x(i), max_val), min_val); // NOLINT(*) + }, name, tag); +} + +/*! + * \brief Cast each element of x to the given type. If expr is + * scalar and type is a corresponding vector type, a + * Broadcast is generated, otherwise a Cast is generated. + * + * \param x The input tensor + * \param type The type to cast to + * \param name The name of the operation + * \param tag The tag to mark the operation + * + * \return A Tensor whose op member is the cast operation + */ +inline Tensor cast(const Tensor& x, + Type type, + std::string name = "tensor", + std::string tag = kElementWise) { + return compute(x->shape, [&](const Array& i) { + auto expr = x(i); + if (expr.type().code() == type.code() && expr.type().bits() == type.bits()) { + if (expr.type().lanes() == type.lanes()) { + return expr; + } else if (expr.type().lanes() == 1 && type.lanes() > 1) { + return tvm::ir::Broadcast::make(expr, type.lanes()); + } + } + + return tvm::cast(type, x(i)); + }, name, tag); +} } // namespace topi #endif // TOPI_ELEMWISE_H_ diff --git a/topi/include/topi/generic/default.h b/topi/include/topi/generic/default.h new file mode 100644 index 000000000000..5dd93030f95b --- /dev/null +++ b/topi/include/topi/generic/default.h @@ -0,0 +1,46 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file generic/default.h +* \brief Generic default schedule +*/ +#ifndef TOPI_GENERIC_DEFAULT_H_ +#define TOPI_GENERIC_DEFAULT_H_ + +#include "topi/tags.h" +#include "topi/detail/fuse.h" +#include "tvm/tvm.h" +#include "tvm/build_module.h" + +namespace topi { +using namespace tvm; + +namespace generic { +/*! + * \brief Create a generic default schedule for the given output tensors. + * + * \param target The target to generate a schedule for. + * \param outs The output tensors. + * \param auto_inline Whether to apply the auto inline step. + * + * \return A schedule for the given ops. + */ +Schedule default_schedule(const Target& target, Array outs, bool auto_inline) { + Array out_ops; + for (auto t : outs) { + out_ops.push_back(t->op); + } + auto s = create_schedule(out_ops); + if (auto_inline) { + auto x = outs[0]; + tvm::schedule::AutoInlineInjective(s); + auto axis = s[x]->op.as()->axis; + if (axis.size() > 0) { + Fuse(s[x], axis); + } + } + return s; +} + +} // namespace generic +} // namespace topi +#endif // TOPI_GENERIC_DEFAULT_H_ diff --git a/topi/include/topi/generic/extern.h b/topi/include/topi/generic/extern.h new file mode 100644 index 000000000000..21852d108239 --- /dev/null +++ b/topi/include/topi/generic/extern.h @@ -0,0 +1,37 @@ +/*! 
+* Copyright (c) 2017 by Contributors +* \file generic/extern.h +* \brief Schedule for extern followed by injective ops +*/ +#ifndef TOPI_GENERIC_EXTERN_H_ +#define TOPI_GENERIC_EXTERN_H_ + +#include "topi/tags.h" +#include "topi/detail/fuse.h" +#include "tvm/tvm.h" +#include "tvm/build_module.h" + +namespace topi { +using namespace tvm; + +namespace generic { +/*! +* \brief Schedule an extern op followed by injective operations +* +* \param target The target to generate a schedule for. +* \param outs The output tensors. +* +* \return A schedule for the op. +*/ +Schedule schedule_extern(const Target& target, Array outs) { + Array out_ops; + for (auto t : outs) { + out_ops.push_back(t->op); + } + auto s = create_schedule(out_ops); + return s; +} + +} // namespace generic +} // namespace topi +#endif // TOPI_GENERIC_EXTERN_H_ diff --git a/topi/include/topi/generic/injective.h b/topi/include/topi/generic/injective.h new file mode 100644 index 000000000000..f05f6243485d --- /dev/null +++ b/topi/include/topi/generic/injective.h @@ -0,0 +1,42 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file generic/injective.h +* \brief Generic schedule for injective operations +*/ +#ifndef TOPI_GENERIC_INJECTIVE_H_ +#define TOPI_GENERIC_INJECTIVE_H_ + +#include "topi/tags.h" +#include "topi/detail/fuse.h" +#include "tvm/tvm.h" +#include "tvm/build_module.h" + +namespace topi { +using namespace tvm; + +namespace generic { + +/*! + * \brief Create a generic schedule for the given injective ops. + * + * \param target The target to generate a schedule for. + * \param outs The output tensors. + * + * \return A schedule for the given ops. + */ +Schedule schedule_injective(const Target &target, const Array& outs) { + Array out_ops; + for (auto t : outs) { + out_ops.push_back(t->op); + } + auto s = create_schedule(out_ops); + tvm::schedule::AutoInlineInjective(s); + auto x = outs[0]; + Fuse(s[x], s[x]->op.as()->axis); + + return s; +} + +} // namespace generic +} // namespace topi +#endif // TOPI_GENERIC_INJECTIVE_H_ diff --git a/topi/include/topi/nn.h b/topi/include/topi/nn.h index cfca85d1b704..5bb4ebf2ac8f 100644 --- a/topi/include/topi/nn.h +++ b/topi/include/topi/nn.h @@ -21,7 +21,7 @@ template tvm::Expr Map(const tvm::Array& exprs, T op) { CHECK_GE(exprs.size(), 1); tvm::Expr res = exprs[0]; - for (int i = 1; i < exprs.size(); ++i) { + for (size_t i = 1; i < exprs.size(); ++i) { res = op(res, exprs[i]); } return res; @@ -51,6 +51,34 @@ inline tvm::Tensor relu(const tvm::Tensor& t, tag); } +/*! +* \brief Creates an operation that performs a leaky rectified linear unit +* +* \param t The input tensor +* \param threshold The relu threshold (default 0) +* \param alpha The slope for the small gradient when t < threshold +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the relu operation +*/ +template +inline tvm::Tensor leaky_relu(const tvm::Tensor& t, + T threshold = static_cast(0), + T alpha = static_cast(0.1), + std::string name = "tensor", + std::string tag = kElementWise) { + return tvm::compute( + t->shape, + [&](const tvm::Array& i) { + auto value = t(i); + auto calpha = tvm::make_const(value.type(), alpha); + return tvm::select(value > 0, value, value * alpha); + }, + name, + tag); +} + /*! 
* \brief Creates an operation that performs padding * @@ -59,10 +87,11 @@ inline tvm::Tensor relu(const tvm::Tensor& t, * respective iterator * \param pad_after An Array of Expr describing the padding after the * respective iterator + * \param pad_value The value to fill padding elements with * \param name The name of the operation * \param tag The tag to mark the operation * - * \return A Tensor whose op member is the relu operation + * \return A Tensor whose op member is the padding operation * * \note * The pad_after Array must either be empty or have the same length as @@ -86,17 +115,18 @@ inline tvm::Tensor relu(const tvm::Tensor& t, inline tvm::Tensor pad(const tvm::Tensor& t, const tvm::Array& pad_before, tvm::Array pad_after = tvm::Array(), + Expr pad_value = Expr(), std::string name = "tensor", std::string tag = kElementWise) { if (pad_after.size() < pad_before.size()) { - for (int i = pad_after.size(); i < pad_before.size(); ++i) { + for (size_t i = pad_after.size(); i < pad_before.size(); ++i) { pad_after.push_back(pad_before[i]); } } CHECK_GE(pad_before.size(), 1); CHECK_EQ(pad_before.size(), pad_after.size()); tvm::Array output_shape; - for (int i = 0; i < t->shape.size(); ++i) { + for (size_t i = 0; i < t->shape.size(); ++i) { if (i >= pad_before.size()) { output_shape.push_back(t->shape[i]); } else { @@ -104,10 +134,15 @@ inline tvm::Tensor pad(const tvm::Tensor& t, tvm::ir::Simplify(t->shape[i] + pad_before[i] + pad_after[i])); } } + + if (!pad_value.defined()) { + pad_value = tvm::make_const(t->dtype, 0); + } + auto l = [&](tvm::Array ovars) { tvm::Array indices; tvm::Array sel; - for (int i = 0; i < t->shape.size(); ++i) { + for (size_t i = 0; i < t->shape.size(); ++i) { if (i >= pad_before.size()) { indices.push_back(ovars[i]); continue; @@ -122,7 +157,10 @@ inline tvm::Tensor pad(const tvm::Tensor& t, sel.push_back(tvm::ir::Simplify(ovars[i] < pad_before[i] + t->shape[i])); } } - return tvm::select(detail::Map(sel, tvm::ir::And::make), t(indices), 0); + if (sel.size() != 0) { + return tvm::select(detail::Map(sel, tvm::ir::And::make), t(indices), pad_value); + } + return t(indices); }; return tvm::compute(output_shape, l, name, tag); } diff --git a/topi/include/topi/nn/batch_norm.h b/topi/include/topi/nn/batch_norm.h new file mode 100644 index 000000000000..be3e31d216d0 --- /dev/null +++ b/topi/include/topi/nn/batch_norm.h @@ -0,0 +1,65 @@ +/*! + * Copyright (c) 2017 by Contributors + * \brief Batch normalization op constructions + * \file nn/batch_norm.h + */ +#ifndef TOPI_NN_BATCH_NORM_H_ +#define TOPI_NN_BATCH_NORM_H_ + +#include + +#include "topi/tags.h" +#include "tvm/tvm.h" + +namespace topi { +namespace nn { +using namespace tvm; + +/*! +* \brief Batch normalization inference operator with NCHW layout +* +* \param x The input tensor. 
4-D with shape [batch, channel, height, width] +* \param gamma 1-D with shape [channel] +* \param beta 1-D with shape [channel] +* \param moving_mean 1-D with shape [channel] +* \param moving_var 1-D with shape [channel] +* \param eps Epsilon to prevent div by 0 +* \param fix_gamma Fix gamma while training +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the batch normalization operation +*/ +inline Tensor batch_norm_inference(const Tensor& x, + const Tensor& gamma, + const Tensor& beta, + const Tensor& moving_mean, + const Tensor& moving_var, + float eps, + bool fix_gamma, + std::string name = "tensor", + std::string tag = kBroadcast) { + CHECK_EQ(x->shape.size(), 4) << "Batch norm requires 4-D input"; + + Tensor out; + if (fix_gamma) { + out = tvm::compute( + x->shape, + [&](const Array& indices) { + auto c = Array({ indices[1] }); + return (x(indices) - moving_mean(c)) / tvm::sqrt(moving_var(c) + eps) + beta(c); + }, name, tag); + } else { + out = tvm::compute( + x->shape, + [&](const Array& indices) { + auto c = Array({ indices[1] }); + return (x(indices) - moving_mean(c)) / tvm::sqrt(moving_var(c) + eps) * gamma(c) + beta(c); + }, name, tag); + } + return out; +} + +} // namespace nn +} // namespace topi +#endif // TOPI_NN_BATCH_NORM_H_ diff --git a/topi/include/topi/nn/bnn.h b/topi/include/topi/nn/bnn.h new file mode 100644 index 000000000000..f7b1b860d461 --- /dev/null +++ b/topi/include/topi/nn/bnn.h @@ -0,0 +1,110 @@ +/*! + * Copyright (c) 2017 by Contributors + * \brief Binary op constructions + * \file nn/bnn.h + */ +#ifndef TOPI_NN_BNN_H_ +#define TOPI_NN_BNN_H_ + +#include + +#include "tvm/tvm.h" +#include "tvm/ir_pass.h" +#include "topi/tags.h" +#include "topi/detail/constant_utils.h" + +namespace topi { +namespace nn { +using namespace tvm; + +/*! +* \brief Binarization and bit-packing along a certain axis. +* +* \param data N-D tensor, can be any layout +* \param axis The axis along which to do binarization and bit-packing. This axis +* must have a size equal to an integer multiple of 32. +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return Output tensor with dtype uint32 +*/ +inline tvm::Tensor binarize_pack(const tvm::Tensor& data, + int axis, + std::string name = "PackedInput", + std::string tag = "binarize_pack") { + auto ishape = data->shape; + CHECK_EQ(GetConstInt(ishape[axis]) % 32, 0) + << "binarize_pack: axis size must be a multiple of 32"; + + auto n = ishape.size(); + Array oshape; + for (size_t i = 0; i < n; ++i) { + oshape.push_back(i == static_cast(axis) ? + tvm::ir::Simplify(ishape[i] / 32) : + ishape[i]); + } + + return tvm::compute( + oshape, + [&](const Array& indices) { + Array start_idx; + for (size_t i = 0; i < n; ++i) { + start_idx.push_back(i == static_cast(axis) ? + indices[i] * 32 : + static_cast(indices[i])); + } + auto packed = make_const(UInt(32), 0); + for (size_t j = 0; j < 32; ++j) { + Array idx; + for (size_t i = 0; i < n; ++i) { + idx.push_back(i == static_cast(axis) ? + start_idx[i] + static_cast(j) : + start_idx[i]); + } + auto sign = tvm::cast(UInt(32), data(idx) >= 0); + packed = (packed | sign); + if (j == 31) { + return packed; + } + packed = packed << 1; + } + return packed; // never reached, but suppress compiler warning + }, name, tag); +} + +/*! 
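+* Example (illustrative sketch): a binarized dense layer built by bit-packing
+* both operands along their inner dimension (which must be a multiple of 32)
+* with binarize_pack above, then calling binary_dense:
+*
+* \code
+*   auto packed_data   = topi::nn::binarize_pack(data, 1);    // [batch, in_dim / 32] uint32
+*   auto packed_weight = topi::nn::binarize_pack(weight, 1);  // [out_dim, in_dim / 32] uint32
+*   auto out = topi::nn::binary_dense(packed_data, packed_weight);
+* \endcode
+*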
+* \brief Binary matrix multiplication using xor and bit-count +* +* \param data Tensor with shape [batch, in_dim], dtype is uint32 +* \param weight Tensor with shape [out_dim, in_dim], dtype is uint32 +* +* \return Tensor with shape [batch, out_dim], dtype is float32 +*/ +inline tvm::Tensor binary_dense(const tvm::Tensor& data, + const tvm::Tensor& weight) { + CHECK_EQ(data->shape.size(), 2) << "binary_dense requires 2-D data"; + CHECK_EQ(weight->shape.size(), 2) << "binary_dense requires 2-D weight"; + CHECK_EQ(data->dtype, UInt(32)) << "binary_dense requires uint32 data"; + CHECK_EQ(weight->dtype, UInt(32)) << "binary_dense requires uint32 weight"; + + auto batch = data->shape[0]; + auto in_dim = data->shape[1]; + auto out_dim = weight->shape[0]; + + auto k = tvm::reduce_axis(Range(0, in_dim), "k"); + auto matmul = tvm::compute( + { batch, out_dim }, + [&](Var i, Var j) { + return tvm::sum(popcount(data(i, k) ^ weight(j, k)), { k }); + }, "tensor", "binary_dense"); + + return tvm::compute( + { batch, out_dim }, + [&](Var i, Var j) { + return 32 * in_dim - 2.0f * matmul(i, j); + }, "tensor", kElementWise); +} + +} // namespace nn +} // namespace topi +#endif // TOPI_NN_BNN_H_ diff --git a/topi/include/topi/nn/dense.h b/topi/include/topi/nn/dense.h new file mode 100644 index 000000000000..cdc7fde158a6 --- /dev/null +++ b/topi/include/topi/nn/dense.h @@ -0,0 +1,61 @@ +/*! + * Copyright (c) 2017 by Contributors + * \brief Dense op constructions + * \file nn/dense.h + */ +#ifndef TOPI_NN_DENSE_H_ +#define TOPI_NN_DENSE_H_ + +#include + +#include "topi/tags.h" +#include "tvm/tvm.h" + +namespace topi { +namespace nn { +using namespace tvm; + +/*! +* \brief Creates an operation that calculates data * weight^T + bias +* +* \param data Tensor with shape [batch, in_dim] +* \param weight Tensor with shape [out_dim, in_dim] +* \param bias Tensor with shape [out_dim] (optional) +* +* \return Tensor with shape [batch, out_dim] +*/ +inline tvm::Tensor dense(const tvm::Tensor& data, + const tvm::Tensor& weight, + tvm::Tensor* bias) { + CHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data"; + CHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight"; + if (bias != nullptr) { + CHECK_EQ((*bias)->shape.size(), 1) << "dense requires 1-D bias"; + } + + auto batch = data->shape[0]; + auto in_dim = data->shape[1]; + auto out_dim = weight->shape[0]; + + auto k = tvm::reduce_axis(Range(0, in_dim), "k"); + auto matmul = tvm::compute( + { batch, out_dim }, + [&](Var i, Var j) { + return tvm::sum(data(i, k) * weight(j, k), { k }); + }, "tensor", "dense"); + + if (bias != nullptr) { + auto bias_val = *bias; + matmul = tvm::compute( + { batch, out_dim }, + [&](Var i, Var j) { + return matmul(i, j) + bias_val(j); + }, "tensor", kBroadcast); + } + + return matmul; +} + +} // namespace nn +} // namespace topi +#endif // TOPI_NN_DENSE_H_ diff --git a/topi/include/topi/nn/dilate.h b/topi/include/topi/nn/dilate.h new file mode 100644 index 000000000000..f4638f4b6b8a --- /dev/null +++ b/topi/include/topi/nn/dilate.h @@ -0,0 +1,87 @@ +/*! + * Copyright (c) 2017 by Contributors + * \brief Dilate op constructions + * \file nn/dilate.h + */ +#ifndef TOPI_NN_DILATE_H_ +#define TOPI_NN_DILATE_H_ + +#include + +#include "tvm/tvm.h" +#include "tvm/ir_pass.h" +#include "topi/tags.h" + +namespace topi { +namespace nn { +using namespace tvm; + +/*! +* \brief Create a new expression of the logical and of all +* conditions in the arguments. 
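+*
+* For example (illustrative), all({ i > 0, j > 0, k > 0 }) yields the single
+* expression (i > 0) && (j > 0) && (k > 0).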
+* +* \param args The arguments to find the logical conjunction of +* +* \return The logical conjunction expression +*/ +Expr all(Array args) { + CHECK_GT(args.size(), 0) << "all requires at least one argument"; + + Expr ret = args[0]; + for (size_t i = 1; i < args.size(); ++i) { + ret = ret && args[i]; + } + return ret; +} + +/*! +* \brief Dilate data with zeros +* +* \param x The input tensor, this can have any number of +* dimensions and any layout. +* \param strides Dilation stride for each dimension. Stride 1 +* means no dilation. +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return The output tensor. +*/ +inline Tensor dilate(const Tensor& x, + Array strides, + std::string name = "tensor", + std::string tag = kInjective) { + auto n = x->shape.size(); + CHECK_EQ(n, strides.size()) + << "strides size (" << strides.size() + << ") must match dimension of x (" << n << ")"; + + Array out_shape; + for (size_t i = 0; i < n; ++i) { + out_shape.push_back(tvm::ir::Simplify( + (x->shape[i] - 1) * strides[i] + 1)); + } + + return tvm::compute( + out_shape, + [&](const Array& indices) { + Array not_zero; + Array index_tuple; + for (size_t i = 0; i < n; ++i) { + if (IsConstInt(strides[i]) && GetConstInt(strides[i]) == 1) { + index_tuple.push_back(indices[i]); + } else { + index_tuple.push_back(indices[i] / strides[i]); + not_zero.push_back((indices[i] % strides[i]) == 0); + } + } + if (not_zero.size() > 0) { + auto all_not_zero = all(not_zero); + return tvm::select(all_not_zero, x(index_tuple), make_const(x->dtype, 0)); + } + return x(index_tuple); + }, name, tag); +} + +} // namespace nn +} // namespace topi +#endif // TOPI_NN_DILATE_H_ diff --git a/topi/include/topi/nn/flatten.h b/topi/include/topi/nn/flatten.h new file mode 100644 index 000000000000..a74b390eeb0c --- /dev/null +++ b/topi/include/topi/nn/flatten.h @@ -0,0 +1,63 @@ +/*! + * Copyright (c) 2017 by Contributors + * \brief Softmax op constructions + * \file nn/flatten.h + */ +#ifndef TOPI_NN_FLATTEN_H_ +#define TOPI_NN_FLATTEN_H_ + +#include +#include + +#include "topi/tags.h" +#include "topi/detail/constant_utils.h" +#include "tvm/tvm.h" + +namespace topi { +namespace nn { +using namespace tvm; + +/*! +* \brief Flattens the input tensor into a 2-D tensor by collapsing higher dimensions. +* This requires the input tensor to have constant sized dimensions. +* +* \param x The input tensor. +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A 2-D tensor. +*/ +inline Tensor flatten(const Tensor& x, + std::string name = "tensor", + std::string tag = kInjective) { + auto ishape = x->shape; + int dim = 1; + for (size_t i = 1; i < ishape.size(); ++i) { + dim = dim * static_cast(GetConstInt(ishape[i])); + } + + Array oshape({ ishape[0], dim }); + + std::vector extra_shape; + for (size_t i = 1; i < ishape.size(); ++i) { + extra_shape.push_back(ishape[i]); + } + std::reverse(extra_shape.begin(), extra_shape.end()); + + return tvm::compute( + oshape, [&](Var i, Var j) { + Expr idx = j; + std::vector index; + for (auto s : extra_shape) { + index.push_back(idx % s); + idx = idx / s; + } + index.push_back(i); + std::reverse(index.begin(), index.end()); + return x(index); + }); +} + +} // namespace nn +} // namespace topi +#endif // TOPI_NN_FLATTEN_H_ diff --git a/topi/include/topi/nn/mapping.h b/topi/include/topi/nn/mapping.h new file mode 100644 index 000000000000..60cd6d6310a7 --- /dev/null +++ b/topi/include/topi/nn/mapping.h @@ -0,0 +1,66 @@ +/*! 
+ * Copyright (c) 2017 by Contributors + * \brief Mapping op constructions + * \file nn/mapping.h + */ +#ifndef TOPI_NN_MAPPING_H_ +#define TOPI_NN_MAPPING_H_ + +#include + +#include "topi/tags.h" +#include "tvm/tvm.h" + +namespace topi { +namespace nn { +using namespace tvm; + +/*! +* \brief Scale and shift with NCHW order +* +* \param x The input tensor. +* \param scale Scale tensor, 1-D of size channel +* \param shift Shift tensor, 1-D of size channel +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the scale shift operation +*/ +inline Tensor scale_shift_nchw(const Tensor& x, + const Tensor& scale, + const Tensor& shift, + std::string name = "ScaleShift", + std::string tag = kBroadcast) { + return tvm::compute( + x->shape, + [&](Var b, Var c, Var h, Var w) { + return x(b, c, h, w) * scale(c) + shift(w); + }, name, tag); +} + +/*! +* \brief Scale and shift with NHWC order +* +* \param x The input tensor. +* \param scale Scale tensor, 1-D of size channel +* \param shift Shift tensor, 1-D of size channel +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the scale shift operation +*/ +inline Tensor scale_shift_nhwc(const Tensor& x, + const Tensor& scale, + const Tensor& shift, + std::string name = "ScaleShift", + std::string tag = kBroadcast) { + return tvm::compute( + x->shape, + [&](Var b, Var h, Var w, Var c) { + return x(b, h, w, c) * scale(c) + shift(w); + }, name, tag); +} + +} // namespace nn +} // namespace topi +#endif // TOPI_NN_MAPPING_H_ diff --git a/topi/include/topi/nn/pooling.h b/topi/include/topi/nn/pooling.h new file mode 100644 index 000000000000..4333f2749573 --- /dev/null +++ b/topi/include/topi/nn/pooling.h @@ -0,0 +1,161 @@ +/*! + * Copyright (c) 2017 by Contributors + * \brief Pooling op constructions + * \file nn/pooling.h + */ +#ifndef TOPI_NN_POOLING_H_ +#define TOPI_NN_POOLING_H_ + +#include + +#include "tvm/tvm.h" +#include "tvm/ir_pass.h" +#include "topi/tags.h" +#include "topi/detail/pad_utils.h" +#include "topi/nn.h" + +namespace topi { +namespace nn { +using namespace tvm; + +/*! \brief Pooling type */ +enum PoolType : int { + kAvgPool, + kMaxPool, +}; + +/*! 
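+* For example (illustrative), 2x2 max pooling with stride 2 and no padding on
+* a [1, 64, 32, 32] input yields a [1, 64, 16, 16] output:
+*
+* \code
+*   auto y = pool(x, { 2, 2 }, { 2, 2 }, { 0, 0 }, kMaxPool, false);
+* \endcode
+*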
+* \brief Perform pooling on data in NCHW order +* +* \param x The input tensor in NCHW order +* \param kernel_size Vector of two ints: {kernel_height, kernel_width} +* \param stride_size Vector of two ints: {stride_height, stride_width} +* \param padding_size Vector of two ints: {padding_height, padding_width} +* \param pool_type The type of pooling operator +* \param ceil_mode Whether to use ceil when calculating the output size +* +* \return The output tensor in NCHW order +*/ +inline Tensor pool(const Tensor& x, + const Array& kernel_size, + const Array& stride_size, + const Array& padding_size, + PoolType pool_type, + bool ceil_mode) { + CHECK_EQ(x->shape.size(), 4) << "Pooling input must be 4-D"; + CHECK_EQ(kernel_size.size(), 2) << "Pooling kernel_size must have 2 elements"; + CHECK_EQ(stride_size.size(), 2) << "Pooling stride_size must have 2 elements"; + CHECK_EQ(padding_size.size(), 2) << "Pooling padding_size must have 2 elements"; + + auto kernel_height = kernel_size[0]; + auto kernel_width = kernel_size[1]; + auto stride_height = stride_size[0]; + auto stride_width = stride_size[1]; + auto padding_height = padding_size[0]; + auto padding_width = padding_size[1]; + + auto batch = x->shape[0]; + auto channel = x->shape[1]; + auto height = x->shape[2]; + auto width = x->shape[3]; + + auto pad_tuple = detail::GetPadTuple(padding_height, padding_width); + auto pad_top = pad_tuple[0]; + auto pad_left = pad_tuple[1]; + auto pad_down = pad_tuple[2]; + auto pad_right = pad_tuple[3]; + + if (ceil_mode) { + // Additional padding to ensure we do ceil instead of floor when + // dividing by stride. + pad_down += stride_height - 1; + pad_right += stride_width - 1; + } + + Array pad_before{ 0, 0, pad_top, pad_left }; + Array pad_after{ 0, 0, pad_down, pad_right }; + + auto out_height = tvm::ir::Simplify( + (height - kernel_height + pad_top + pad_down) / stride_height + 1); + auto out_width = tvm::ir::Simplify( + (width - kernel_width + pad_left + pad_right) / stride_width + 1); + + auto dheight = tvm::reduce_axis(Range(0, kernel_height)); + auto dwidth = tvm::reduce_axis(Range(0, kernel_width)); + + if (pool_type == kMaxPool) { + auto temp = pad(x, pad_before, pad_after, x->dtype.min(), "pad_temp"); + return tvm::compute( + { batch, channel, out_height, out_width }, + [&](Var n, Var c, Var h, Var w) { + return tvm::max(temp(n, c, h * stride_height + dheight, w * stride_width + dwidth), + { dheight, dwidth }); + }, "tensor", "pool_max"); + } else if (pool_type == kAvgPool) { + auto temp = pad(x, pad_before, pad_after, 0, "pad_temp"); + + auto tsum = tvm::compute( + { batch, channel, out_height, out_width }, + [&](Var n, Var c, Var h, Var w) { + return tvm::sum(temp(n, c, h * stride_height + dheight, w * stride_width + dwidth), + { dheight, dwidth }); + }, "tensor", "pool_avg"); + + return tvm::compute( + { batch, channel, out_height, out_width }, + [&](Var n, Var c, Var h, Var w) { + return tsum(n, c, h, w) / (kernel_height * kernel_width); + }, "tensor", kElementWise); + } else { + LOG(ERROR) << "Unrecognized pool_type: " << pool_type; + return x; + } +} + +/*! 
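+* For example (illustrative), global average pooling over a [1, 64, 7, 7]
+* input yields a [1, 64, 1, 1] output:
+*
+* \code
+*   auto y = global_pool(x, kAvgPool);
+* \endcode
+*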
+* \brief Perform global pooling on data in NCHW order +* +* \param x The input tensor in NCHW order +* \param pool_type The type of pooling operator +* +* \return The output tensor with shape [batch, channel, 1, 1] +*/ +inline Tensor global_pool(const Tensor& x, + PoolType pool_type) { + CHECK_EQ(x->shape.size(), 4) << "Pooling input must be 4-D"; + + auto batch = x->shape[0]; + auto channel = x->shape[1]; + auto height = x->shape[2]; + auto width = x->shape[3]; + + auto dheight = tvm::reduce_axis(Range(0, height)); + auto dwidth = tvm::reduce_axis(Range(0, width)); + + if (pool_type == kMaxPool) { + return tvm::compute( + { batch, channel, 1, 1 }, + [&](Var n, Var c, Var h, Var w) { + return tvm::max(x(n, c, dheight, dwidth), { dheight, dwidth }); // NOLINT(*) + }, "tensor", "global_pool_max"); + } else if (pool_type == kAvgPool) { + auto tsum = tvm::compute( + { batch, channel, 1, 1 }, + [&](Var n, Var c, Var h, Var w) { + return tvm::sum(x(n, c, dheight, dwidth), { dheight, dwidth }); + }, "tensor", "global_pool_sum"); + + return tvm::compute( + { batch, channel, 1, 1 }, + [&](Var n, Var c, Var h, Var w) { + return tsum(n, c, h, w) / tvm::cast(x->dtype, height * width); + }, "tensor", kElementWise); + } else { + LOG(ERROR) << "Unrecognized pool_type: " << pool_type; + return x; + } +} + +} // namespace nn +} // namespace topi +#endif // TOPI_NN_POOLING_H_ diff --git a/topi/include/topi/nn/softmax.h b/topi/include/topi/nn/softmax.h new file mode 100644 index 000000000000..273bac4ff76d --- /dev/null +++ b/topi/include/topi/nn/softmax.h @@ -0,0 +1,87 @@ +/*! + * Copyright (c) 2017 by Contributors + * \brief Softmax op constructions + * \file nn/softmax.h + */ +#ifndef TOPI_NN_SOFTMAX_H_ +#define TOPI_NN_SOFTMAX_H_ + +#include +#include + +#include "topi/tags.h" +#include "tvm/tvm.h" + +namespace topi { +namespace nn { +using namespace tvm; + +/*! +* \brief Softmax activation +* +* \param x The input tensor. 2-D where softmax is performed along the second dimension +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the softmax operation +*/ +inline Tensor softmax(const Tensor& x, + std::string name = "tensor", + std::string tag = "softmax_output") { + CHECK_EQ(x->shape.size(), 2) << "Softmax requires 2-D input"; + + Expr m = x->shape[0]; + Expr n = x->shape[1]; + + auto k = tvm::reduce_axis(Range(0, n), "k"); + auto max_elem = tvm::compute( + { m }, [&](Var i) { + return tvm::max(x(i, k), Array{ k }); }); + k = tvm::reduce_axis(Range(0, n), "k"); + + auto expsum = tvm::compute( + { m }, [&](Var i) { + return tvm::sum(tvm::exp(x(i, k) - max_elem(i)), { k }); }); + + return tvm::compute( + x->shape, [&](Var i, Var j) { + return tvm::exp(x(i, j) - max_elem(i)) / expsum(i); + }); +} + +/*! +* \brief Log softmax activation +* +* \param x The input tensor. 
2-D where log softmax is performed along the second dimension +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the log softmax operation +*/ +inline Tensor log_softmax(const Tensor& x, + std::string name = "tensor", + std::string tag = "log_softmax_output") { + CHECK_EQ(x->shape.size(), 2) << "Log softmax requires 2-D input"; + + Expr m = x->shape[0]; + Expr n = x->shape[1]; + + auto k = tvm::reduce_axis(Range(0, n), "k"); + auto max_elem = tvm::compute( + { m }, [&](Var i) { + return tvm::max(x(i, k), Array{ k }); }); + k = tvm::reduce_axis(Range(0, n), "k"); + + auto expsum = tvm::compute( + { m }, [&](Var i) { + return tvm::sum(tvm::exp(x(i, k) - max_elem(i)), { k }); }); + + return tvm::compute( + x->shape, [&](Var i, Var j) { + return x(i, j) - max_elem(i) - tvm::log(expsum(i)); + }); +} + +} // namespace nn +} // namespace topi +#endif // TOPI_NN_SOFTMAX_H_ diff --git a/topi/include/topi/reduction.h b/topi/include/topi/reduction.h new file mode 100644 index 000000000000..3fda17130fce --- /dev/null +++ b/topi/include/topi/reduction.h @@ -0,0 +1,379 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file topi/reduction.h + * \brief Reduction op constructors + */ +#ifndef TOPI_REDUCTION_H_ +#define TOPI_REDUCTION_H_ + +#include +#include +#include +#include + +#include "topi/tags.h" +#include "topi/detail/ravel_unravel.h" +#include "topi/detail/constant_utils.h" +#include "tvm/tvm.h" + +namespace topi { +using namespace tvm; + +/*! \brief The operation to use for CommReduce */ +using FReduce = std::function& axis)>; + +/*! \brief The operation to use for CommReduceIdx */ +using FCommReduce = std::function< + Array(Array exprs, const Array& axis, Expr* condition)>; + +/*! +* \brief Convert a reduction axis which could be empty or have negative +* elements into a real axis with valid dimension indices. +* +* \return A non-empty sorted array of valid dimension indices, with no duplicates. +* If the input axis is empty, the result will be an axis including all dimensions. +* If any input element is negative, it will be treated as an offset from the +* last dimension (same as python indexing rules). +*/ +std::vector GetRealAxis(int ndim, const std::vector& axis) { + std::vector real_axis; + if (axis.size() == 0) { + for (int i = 0; i < ndim; ++i) { + real_axis.push_back(i); + } + } else { + // Use a set so duplicates are removed and the dims are sorted + std::set dims; + for (auto ele : axis) { + if (ele < 0) { + ele += ndim; + } + if (ele >= ndim) { + LOG(ERROR) << ele << " exceeds the maximum dimension " << ndim; + } + dims.emplace(ele); + } + std::copy(dims.begin(), dims.end(), std::back_inserter(real_axis)); + } + return real_axis; +} + +/*! \brief Enumerate the axes for a reduce op */ +Array MakeReduceAxes(const std::vector& real_axis, const Tensor& data) { + Array reduce_axes; + for (auto i : real_axis) { + std::string name = "k" + std::to_string(i); + reduce_axes.push_back( + tvm::reduce_axis(Range(0, data->shape[i]), name)); + } + return reduce_axes; +} + +/*! 
\brief Calculate the target shape for a reduce op */ +Array MakeReduceTargetShape(const std::vector& real_axis, + const Tensor& data, + bool keepdims) { + auto ndim = data->shape.size(); + Array target_shape; + if (keepdims) { + for (size_t i = 0; i < ndim; ++i) { + if (std::find(real_axis.begin(), real_axis.end(), i) != real_axis.end()) { + // real_axis contains i + target_shape.push_back(1); + } else { + target_shape.push_back(data->shape[i]); + } + } + } else { + for (size_t i = 0; i < ndim; ++i) { + if (std::find(real_axis.begin(), real_axis.end(), i) == real_axis.end()) { + // real_axis does not contain i + target_shape.push_back(data->shape[i]); + } + } + } + return target_shape; +} + +/*! + * \brief Create a reduction operation. + * + * \param data The input tensor. + * \param axis The axes along which the reduction is performed. + * \param func The reduction function eg. tvm::sum + * \param keepdims If this is set to true, the axes which are reduced are + * left in the result as dimensions with size one. This enables the result + * to broadcast correctly against the input array. + * + * \return The result tensor. + */ +Tensor CommReduce(const Tensor& data, + const Array& axis, + FReduce func, + bool keepdims = false) { + auto ndim = data->shape.size(); + CHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor"; + auto axis_val = detail::GetConstIntValues(axis, "axis"); + auto real_axis = GetRealAxis(static_cast(ndim), axis_val); + auto reduce_axes = MakeReduceAxes(real_axis, data); + auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims); + + auto compute = [ndim, keepdims, &real_axis, &reduce_axes, &func, &data] + (const Array& indices) { + Array eval_range; + Array eval_indices; + int arg_counter = 0; + int red_counter = 0; + + for (size_t i = 0; i < ndim; ++i) { + if (std::find(real_axis.begin(), real_axis.end(), i) != real_axis.end()) { + // real_axis contains i + eval_range.push_back(reduce_axes[red_counter]); + eval_indices.push_back(reduce_axes[red_counter]->var); + red_counter++; + } else { + if (!keepdims) { + eval_range.push_back(indices[arg_counter]); + arg_counter++; + } else { + eval_range.push_back(indices[i]); + } + } + } + + return func(data(eval_range), reduce_axes); + }; + + return tvm::compute(target_shape, compute, data->op->name + "_red", kCommReduce); +} + +/*! +* \brief Create an index reduction operation. +* +* \param data The input tensor. +* \param axis The axes along which the reduction is performed. +* \param func The reduction function +* \param keepdims If this is set to true, the axes which are reduced are +* left in the result as dimensions with size one. This enables the result +* to broadcast correctly against the input array. +* +* \return The result tensor. 
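+*
+* For example (illustrative), an argmax over axis 1 of a [3, 4] tensor, built
+* on CommReduceIdx via the argmax function below, produces a [3] tensor whose
+* values are the integer positions of the per-row maxima:
+*
+* \code
+*   auto idx = topi::argmax(data, { 1 });   // shape [3], integer indices
+* \endcode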
+*/ +Tensor CommReduceIdx(const Tensor& data, + const Array& axis, + FCommReduce func, + bool keepdims = false) { + auto ndim = data->shape.size(); + CHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor"; + auto axis_val = detail::GetConstIntValues(axis, "axis"); + auto real_axis = GetRealAxis(static_cast(ndim), axis_val); + auto reduce_axes = MakeReduceAxes(real_axis, data); + auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims); + + auto compute = [ndim, keepdims, &real_axis, &reduce_axes, &func, &data] + (const Array& indices) { + Array eval_range; + Array eval_indices; + int arg_counter = 0; + int red_counter = 0; + + for (size_t i = 0; i < ndim; ++i) { + if (std::find(real_axis.begin(), real_axis.end(), i) != real_axis.end()) { + // real_axis contains i + eval_range.push_back(reduce_axes[red_counter]); + eval_indices.push_back(reduce_axes[red_counter]->var); + red_counter++; + } else { + if (!keepdims) { + eval_range.push_back(indices[arg_counter]); + arg_counter++; + } else { + eval_range.push_back(indices[i]); + } + } + } + + Array ravel_shape; + for (auto i : real_axis) { + ravel_shape.push_back(data->shape[i]); + } + auto idx = detail::RavelIndex(eval_indices, ravel_shape); + return func({ idx, data(eval_range) }, reduce_axes, nullptr); + }; + + auto temp_idx_val = tvm::compute(target_shape, compute, + data->op->name + "_red_temp", kCommReduceIdx); + auto temp_idx = temp_idx_val[0]; + auto temp_val = temp_idx_val[1]; + return tvm::compute( + target_shape, + [&temp_idx](const Array& indices) { return temp_idx(indices); }, + data->op->name + "_red", + kCommReduceIdx); +} + +/*! \brief A combiner function for a reduction */ +using FCombine = std::function(Array lhs, Array rhs)>; + +/*! \brief An initializer function for a reduction */ +using FIdentity = std::function(std::vector types)>; + +/*! + * \brief Create a commutative reducer for a reduction + * + * \param fcombine A function to combine exprs + * \param fidentity A function to initialize elements + * \param name The name of the operation + * + * \return A reducer function which creates a reduce expression over an axis. + */ +FCommReduce MakeCommReducer(FCombine fcombine, + FIdentity fidentity, + std::string name = "reduce") { + return [fcombine, fidentity, &name] + (Array exprs, const Array& axis, Expr* condition) { + Array lhs, rhs; + std::vector dtypes; + + for (size_t i = 0; i < exprs.size(); ++i) { + auto dtype = exprs[i].type(); + dtypes.push_back(dtype); + lhs.push_back(var("lhs_" + std::to_string(i), dtype)); + rhs.push_back(var("rhs_" + std::to_string(i), dtype)); + } + + auto result = fcombine(lhs, rhs); + auto id_elem = fidentity(dtypes); + auto cond = condition != nullptr ? *condition : tvm::const_true(); + + auto combiner = tvm::ir::CommReducerNode::make(lhs, rhs, result, id_elem); + Array outputs; + for (size_t i = 0; i < exprs.size(); ++i) { + outputs.push_back(tvm::ir::Reduce::make(combiner, exprs, axis, cond, static_cast(i))); + } + return outputs; + }; +} + +/*! \brief Wrap tvm::min to ensure we get the correct overload */ +inline Expr MinOp(Expr source, Array axis) { + return tvm::min(source, axis); +} + +/*! \brief Wrap tvm::max to ensure we get the correct overload */ +inline Expr MaxOp(Expr source, Array axis) { + return tvm::max(source, axis); // NOLINT(*) +} + +/*! +* \brief Creates an operation that sums array elements over a given axis +* +* \param data The input tensor +* \param axis The axis to sum over. If axis is empty, the operation will +* sum over all elements of the array. 
+* \param keepdims If this is set to true, the axes which are reduced are +* left in the result as dimensions with size one. This enables the result +* to broadcast correctly against the input array. +* +* \return A Tensor whose op member is the sum operation +*/ +Tensor sum(const Tensor& data, Array axis, bool keepdims = false) { + return CommReduce(data, axis, tvm::sum, keepdims); +} + +/*! +* \brief Creates an operation that finds the minimum of elements over +* a given axis. +* +* \param data The input tensor +* \param axis The axis to find the minimum over. If axis is empty, the +* operation will find the minimum over all elements of the array. +* \param keepdims If this is set to true, the axes which are reduced are +* left in the result as dimensions with size one. This enables the result +* to broadcast correctly against the input array. +* +* \return A Tensor whose op member is the min operation +*/ +Tensor min(const Tensor& data, Array axis, bool keepdims = false) { + return CommReduce(data, axis, MinOp, keepdims); +} + +/*! +* \brief Creates an operation that finds the maximum of elements over +* a given axis. +* +* \param data The input tensor +* \param axis The axis to find the maximum over. If axis is empty, the +* operation will find the maximum over all elements of the array. +* \param keepdims If this is set to true, the axes which are reduced are +* left in the result as dimensions with size one. This enables the result +* to broadcast correctly against the input array. +* +* \return A Tensor whose op member is the max operation +*/ +Tensor max(const Tensor& data, Array axis, bool keepdims = false) { // NOLINT(*) + return CommReduce(data, axis, MaxOp, keepdims); +} + +/*! +* \brief Creates an operation that finds the indices of the minimum +* values over a given axis. +* +* \param data The input tensor +* \param axis The axis along which the argmin is performed. If axis is empty, +* the operation will find the minimum index over all elements of the array. +* \param keepdims If this is set to true, the axes which are reduced are +* left in the result as dimensions with size one. This enables the result +* to broadcast correctly against the input array. +* +* \return A Tensor whose op member is the argmin operation +*/ +Tensor argmin(const Tensor& data, Array axis, bool keepdims = false) { + auto fcombine = [](Array lhs, Array rhs) { + Array result; + result.push_back(tvm::select(lhs[1] <= rhs[1], lhs[0], rhs[0])); // idx + result.push_back(tvm::select(lhs[1] <= rhs[1], lhs[1], rhs[1])); // val + return result; + }; + auto fidentity = [](std::vector types) { + Array result; + result.push_back(tvm::make_const(types[0], -1)); // idx + result.push_back(types[1].max()); // val + return result; + }; + auto func = MakeCommReducer(fcombine, fidentity, "argmin"); + return CommReduceIdx(data, axis, func, keepdims); +} + +/*! +* \brief Creates an operation that finds the indices of the maximum +* values over a given axis. +* +* \param data The input tensor +* \param axis The axis along which the argmax is performed. If axis is empty, +* the operation will find the maximum index over all elements of the array. +* \param keepdims If this is set to true, the axes which are reduced are +* left in the result as dimensions with size one. This enables the result +* to broadcast correctly against the input array. 
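+*
+* \note Usage sketch, with data an assumed input tensor: taking the argmax
+* along axis 1 while keeping that axis as a size-1 dimension is
+* \code
+*   Tensor idx = topi::argmax(data, { 1 }, true);
+* \endcode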
+* +* \return A Tensor whose op member is the argmax operation +*/ +Tensor argmax(const Tensor& data, Array axis, bool keepdims = false) { + auto fcombine = [](Array lhs, Array rhs) { + Array result; + result.push_back(tvm::select(lhs[1] >= rhs[1], lhs[0], rhs[0])); // idx + result.push_back(tvm::select(lhs[1] >= rhs[1], lhs[1], rhs[1])); // val + return result; + }; + auto fidentity = [](std::vector types) { + Array result; + result.push_back(tvm::make_const(types[0], -1)); // idx + result.push_back(types[1].min()); // val + return result; + }; + auto func = MakeCommReducer(fcombine, fidentity, "argmax"); + return CommReduceIdx(data, axis, func, keepdims); +} + +} // namespace topi +#endif // TOPI_REDUCTION_H_ diff --git a/topi/include/topi/rocm/dense.h b/topi/include/topi/rocm/dense.h new file mode 100644 index 000000000000..cf3c3f60b751 --- /dev/null +++ b/topi/include/topi/rocm/dense.h @@ -0,0 +1,82 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file rocm/dense.h +* \brief rocm schedule for dense operation +*/ +#ifndef TOPI_ROCM_DENSE_H_ +#define TOPI_ROCM_DENSE_H_ + +#include "tvm/tvm.h" +#include "tvm/build_module.h" +#include "topi/tags.h" +#include "topi/detail/array_utils.h" +#include "topi/nn/dense.h" +#include "topi/contrib/rocblas.h" +#include "topi/generic/extern.h" +#include "topi/cuda/dense.h" + +namespace topi { +using namespace tvm; + +namespace rocm { +/*! +* \brief Implementation of dense for rocm backend +* +* \param target The target device +* \param data Tensor with shape [batch, in_dim] +* \param weight Tensor with shape [out_dim, in_dim] +* \param bias Tensor with shape [out_dim] (optional) +* +* \return Tensor with shape [batch, out_dim] +*/ +inline tvm::Tensor dense_rocm(const Target& target, + const tvm::Tensor& data, + const tvm::Tensor& weight, + tvm::Tensor* bias) { + CHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data"; + CHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight"; + if (bias != nullptr) { + CHECK_EQ((*bias)->shape.size(), 1) << "dense requires 1-D bias"; + } + + auto batch = data->shape[0]; + auto in_dim = data->shape[1]; + auto out_dim = weight->shape[0]; + + if (target.libs.count("rocblas") > 0) { + auto mm = topi::contrib::rocblas_matmul(data, weight, false, true); + if (bias != nullptr) { + auto bias_val = *bias; + mm = tvm::compute({ batch, out_dim }, + [&](Var i, Var j) { + return mm(i, j) + bias_val(j); + }, "tensor", kBroadcast); + } + + return mm; + } else { + return topi::nn::dense(data, weight, bias); + } +} + +/*! +* \brief Create a rocm schedule for dense +* +* \param target The target to generate a schedule for. +* \param outs The output tensors. +* +* \return A schedule for the given ops. 
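+*
+* \note A Python-side usage sketch (as in the dense test under
+* topi/tests/python_cpp), where D is the dense output tensor:
+* \code{.py}
+*   target = topi.cpp.TEST_create_target("rocm")
+*   s = topi.cpp.rocm.schedule_dense(target, [D])
+* \endcode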
+*/ +Schedule schedule_dense(const Target &target, const Array& outs) { + if (target.target_name == "rocm" && + target.libs.count("rocblas") > 0) { + return topi::generic::schedule_extern(target, outs); + } + + return topi::cuda::schedule_dense(target, outs); +} + +} // namespace rocm +} // namespace topi +#endif // TOPI_ROCM_DENSE_H_ + diff --git a/topi/include/topi/tags.h b/topi/include/topi/tags.h index 17882f877697..8ba9955be050 100644 --- a/topi/include/topi/tags.h +++ b/topi/include/topi/tags.h @@ -6,9 +6,14 @@ #ifndef TOPI_TAGS_H_ #define TOPI_TAGS_H_ +#include + namespace topi { constexpr auto kElementWise = "elemwise"; +constexpr auto kInjective = "injective"; +constexpr auto kCommReduce = "comm_reduce"; +constexpr auto kCommReduceIdx = "comm_reduce_idx"; constexpr auto kBroadcast = "broadcast"; constexpr auto kMatMult = "matmult"; constexpr auto kConv2dNCHW = "conv2d_nchw"; @@ -19,6 +24,19 @@ constexpr auto kDepthwiseConv2dBackInputNHWC = "depthwise_conv2d_back_input_nhwc constexpr auto kDepthwiseConv2dBackWeightNHWC = "depthwise_conv2d_back_weight_nhwc"; constexpr auto kGroupConv2d = "group_conv2d"; +inline bool is_broadcast(std::string tag) { + return + tag.rfind(kElementWise, 0) == 0 || + tag.rfind(kBroadcast, 0) == 0; +} + +inline bool is_injective(std::string tag) { + return + tag.rfind(kElementWise, 0) == 0 || + tag.rfind(kBroadcast, 0) == 0 || + tag.rfind(kInjective, 0) == 0; +} + } // namespace topi #endif // TOPI_TAGS_H_ diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h new file mode 100644 index 000000000000..9e4c8b06366f --- /dev/null +++ b/topi/include/topi/transform.h @@ -0,0 +1,362 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file topi/transform.h + * \brief Transform op constructors + */ +#ifndef TOPI_TRANSFORM_H_ +#define TOPI_TRANSFORM_H_ + +#include +#include +#include +#include + +#include "topi/tags.h" +#include "topi/detail/ravel_unravel.h" +#include "topi/detail/constant_utils.h" +#include "tvm/tvm.h" + +namespace topi { +using namespace tvm; +using namespace topi::detail; + +/*! +* \brief Creates an operation to insert new dimensions of length 1 +* +* \param x The input tensor +* \param axis The index of the first new dimension (allows negative +* indices as offsets from the last dimension) +* \param num_newaxis The number of new dimensions to insert +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the dim expansion operation +*/ +inline Tensor expand_dims(const Tensor& x, + int axis, + int num_newaxis = 1, + std::string name = "tensor", + std::string tag = kBroadcast) { + if (axis < 0) { + // Calculate offset from last dimension + axis = static_cast(x->shape.size()) + axis + 1; + } + + Array new_shape; + for (size_t i = 0; i < static_cast(axis); ++i) { + new_shape.push_back(x->shape[i]); + } + for (size_t i = 0; i < static_cast(num_newaxis); ++i) { + new_shape.push_back(1); + } + for (size_t i = axis; i < x->shape.size(); ++i) { + new_shape.push_back(x->shape[i]); + } + + return compute( + new_shape, [&](const Array& indices) { + Array idx; + for (size_t i = 0; i < static_cast(axis); ++i) { + idx.push_back(indices[i]); + } + for (size_t i = axis + num_newaxis; i < indices.size(); ++i) { + idx.push_back(indices[i]); + } + return x(idx); + }, name, tag); +} + +/*! +* \brief Permute the dimensions of an array +* +* \param x The input tensor +* \param axes The indices of the permutation. If this is empty, +* the dimensions will be reversed. 
+* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the transpose operation +*/ +inline Tensor transpose(const Tensor& x, + Array axes, + std::string name = "tensor", + std::string tag = kInjective) { + if (axes.size() == 0) { + axes = Array(); + for (int i = static_cast(x->shape.size()) - 1; i >= 0; --i) { + axes.push_back(i); + } + } + + auto axes_val = GetConstIntValues(axes, "axes"); + + Array new_shape; + for (size_t i = 0; i < axes_val.size(); ++i) { + new_shape.push_back(x->shape[axes_val[i]]); + } + return compute( + new_shape, [&](const Array& indices) { + std::vector idx; + for (size_t i = 0; i < axes_val.size(); ++i) { + idx.push_back(1); + } + for (size_t i = 0; i < axes_val.size(); ++i) { + idx[axes_val[i]] = indices[i]; + } + return x(idx); + }, name, tag); +} + + +/*! +* \brief Reshape a tensor +* +* \param x The input tensor +* \param newshape The new shape +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the reshape operation +*/ +inline Tensor reshape(const Tensor& x, + Array newshape, + std::string name = "tensor", + std::string tag = kInjective) { + auto x_shape = x->shape; + return compute( + newshape, [&](const Array& indices) { + return x(UnavelIndex(RavelIndex(indices, newshape), x_shape)); + }, name, tag); +} + +/*! +* \brief Remove size 1 dimensions from the shape of a tensor. +* The removed dimensions must have a constant size of 1. +* +* \param x The input tensor +* \param axis Indices of the dimensions to remove. If this is empty, +* all entries with a constant size of 1 will be removed. +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the squeeze operation +*/ +inline Tensor squeeze(const Tensor& x, + Array axis, + std::string name = "tensor", + std::string tag = kInjective) { + auto axis_val = GetConstIntValues(axis, "axis"); + auto ndim = x->shape.size(); + if (axis_val.size() == 0) { + for (size_t i = 0; i < ndim; ++i) { + if (IsConstInt(x->shape[i]) && GetConstInt(x->shape[i]) == 1) { + axis_val.push_back(static_cast(i)); + } + } + } else { + for (size_t i = 0; i < axis_val.size(); ++i) { + if (axis_val[i] < 0) { + axis_val[i] += static_cast(x->shape.size()); + } + CHECK_EQ(GetConstInt(x->shape[axis_val[i]]), 1) << + "Dimension " << axis[i] << " must have size 1"; + } + } + + std::unordered_set axis_set(axis_val.begin(), axis_val.end()); + + Array out_shape; + for (size_t i = 0; i < ndim; ++i) { + if (axis_set.count(static_cast(i)) == 0) { + out_shape.push_back(x->shape[i]); + } + } + if (out_shape.size() == 0) { + out_shape.push_back(1); + } + + return compute( + out_shape, [&](const Array& indices) { + Array real_indices; + int flag = 0; + for (size_t i = 0; i < ndim; ++i) { + if (axis_set.count(static_cast(i)) == 0) { + real_indices.push_back(indices[i - flag]); + } else { + real_indices.push_back(0); + flag += 1; + } + } + return x(real_indices); + }, name, tag); +} + +/*! 
+* \brief Join a sequence of tensors along an existing axis +* +* \param inputs The input tensors +* \param axis The axis along which the tensors will be joined +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the concatenate operation +*/ +inline Tensor concatenate(const Array& inputs, + int axis = 0, + std::string name = "tensor", + std::string tag = kInjective) { + if (axis < 0) { + axis += static_cast(inputs[0]->shape.size()); + } + CHECK_LT(axis, inputs[0]->shape.size()) << + "axis out of bounds"; + + Array axis_sizes; + for (auto t : inputs) { + axis_sizes.push_back(t->shape[axis]); + } + + Expr join_size = axis_sizes[0]; + for (size_t i = 1; i < axis_sizes.size(); ++i) { + join_size += axis_sizes[i]; + } + Array out_shape; + for (size_t i = 0; i < inputs[0]->shape.size(); ++i) { + out_shape.push_back(i == static_cast(axis) ? join_size : inputs[0]->shape[i]); + } + + return compute( + out_shape, [&](const Array& indices) { + auto ret = inputs[0](indices); + auto ind = indices[axis]; + for (size_t i = 0; i < inputs.size() - 1; ++i) { + ind -= axis_sizes[i]; + + Array idx; + for (size_t i = 0; i < static_cast(axis); ++i) { + idx.push_back(indices[i]); + } + idx.push_back(ind); + for (size_t i = axis + 1; i < indices.size(); ++i) { + idx.push_back(indices[i]); + } + + ret = tvm::select(ind >= 0, + inputs[i + 1](idx), + ret); + } + return ret; + }, name, tag); +} + +/*! +* \brief Split a tensor into multiple sub-tensors +* +* \param x The input tensor +* \param split_indices The indices to split the input at. This must be in ascending +* order. +* \param axis The axis to split along. +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the split operation +*/ +inline Array split(const Tensor& x, + Array split_indices, + int axis, + std::string name = "tensor", + std::string tag = kInjective) { + if (axis < 0) { + axis += static_cast(x->shape.size()); + } + auto src_axis_size = static_cast(GetConstInt(x->shape[axis])); + + auto split_indices_val = GetConstIntValues(split_indices, "split_indices"); + CHECK(std::is_sorted(split_indices_val.begin(), split_indices_val.end())) << + "split_indices must be sorted"; + + std::vector begin_ids; + begin_ids.push_back(0); + std::copy(split_indices_val.begin(), split_indices_val.end(), std::back_inserter(begin_ids)); + + Array< Array > out_shapes; + for (size_t i = 0; i < begin_ids.size(); ++i) { + int out_axis_size; + if (i == begin_ids.size() - 1) { + out_axis_size = src_axis_size - begin_ids[i]; + } else { + out_axis_size = begin_ids[i + 1] - begin_ids[i]; + } + + Array shape; + for (size_t i = 0; i < static_cast(axis); ++i) { + shape.push_back(x->shape[i]); + } + shape.push_back(out_axis_size); + for (size_t i = axis + 1; i < x->shape.size(); ++i) { + shape.push_back(x->shape[i]); + } + + out_shapes.push_back(shape); + } + + Array result; + for (size_t i = 0; i < begin_ids.size(); ++i) { + result.push_back( + compute( + out_shapes[i], [&](const Array& indices) { + auto begin = begin_ids[i]; + Array real_indices; + for (size_t j = 0; j < static_cast(axis); ++j) { + real_indices.push_back(indices[j]); + } + real_indices.push_back(indices[axis] + begin); + for (size_t j = axis + 1; j < indices.size(); ++j) { + real_indices.push_back(indices[j]); + } + + return x(real_indices); + }, name, tag)); + } + + return result; +} + +/*! 
+* \brief Split a tensor into a number of sub-tensors +* +* \param x The input tensor +* \param num_sections The number of sections to split the tensor into. +* this must be an integer factor of the size of the axis being split. +* \param axis The axis to split along. +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the split operation +*/ +inline Array split_sections(const Tensor& x, + int num_sections, + int axis, + std::string name = "tensor", + std::string tag = kInjective) { + auto src_axis_size = static_cast(GetConstInt(x->shape[axis])); + + CHECK_GT(num_sections, 0) << "Slice count must be > 0"; + CHECK_EQ(src_axis_size % num_sections, 0) + << "num_sections must be an integer factor of the size of axis " << axis + << " (" << src_axis_size << ")"; + + Array split_indices; + auto seg_size = src_axis_size / num_sections; + for (int i = 0; i < num_sections; ++i) { + // region at index 0 is added by split() + if (i != 0) { + split_indices.push_back(seg_size * i); + } + } + + return split(x, split_indices, axis, name, tag); +} + +} // namespace topi +#endif // TOPI_TRANSFORM_H_ diff --git a/topi/include/topi/x86/bnn.h b/topi/include/topi/x86/bnn.h new file mode 100644 index 000000000000..a5046b0e79a9 --- /dev/null +++ b/topi/include/topi/x86/bnn.h @@ -0,0 +1,110 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file x86/bnn.h +* \brief x86 schedule for binary operations +*/ +#ifndef TOPI_X86_BNN_H_ +#define TOPI_X86_BNN_H_ + +#include "topi/tags.h" +#include "topi/detail/fuse.h" +#include "tvm/tvm.h" +#include "tvm/build_module.h" + +namespace topi { +using namespace tvm; + +namespace x86 { +/*! +* \brief Create a generic schedule for binarize_pack +* +* \param target The target to generate a schedule for. +* \param outs The output tensors. +* +* \return A schedule for the given ops. +*/ +Schedule schedule_binarize_pack(const Target &target, const Array& outs) { + Array out_ops; + for (auto t : outs) { + out_ops.push_back(t->op); + } + auto s = create_schedule(out_ops); + + auto _schedule = [&](const Tensor& out) { + s[out].parallel(out->op.as()->axis[0]); + }; + + std::function traverse; + traverse = [&](const Operation& op) { + if (op->tag == "binarize_pack") { + _schedule(op.output(0)); + } else { + LOG(ERROR) << "Unsupported operator " << op->tag; + } + }; + + traverse(outs[0]->op); + return s; +} + +/*! +* \brief Create a generic schedule for binary_dense +* +* \param target The target to generate a schedule for. +* \param outs The output tensors. +* +* \return A schedule for the given ops. 
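+*
+* \note A Python-side usage sketch (matching the bnn test under
+* topi/tests/python_cpp), where bnn_C is the binary_dense output tensor:
+* \code{.py}
+*   target = topi.cpp.TEST_create_target("llvm")
+*   s = topi.cpp.x86.schedule_binary_dense(target, [bnn_C])
+* \endcode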
+*/ +Schedule schedule_binary_dense(const Target &target, const Array& outs) { + Array out_ops; + for (auto t : outs) { + out_ops.push_back(t->op); + } + auto s = create_schedule(out_ops); + + auto _schedule = [&](const Tensor& A, const Tensor& B, const Tensor& C) { + IterVar co, ci; + s[C].split(s[C]->op.as()->reduce_axis[0], 8, &co, &ci); + s[C].parallel(s[C]->op.as()->axis[0]); + + Tensor out; + if (contains(s->outputs, C->op)) { + out = C; + } else { + out = outs[0]->op.output(0); + } + + IterVar xo, xi; + s[out].split(out->op.as()->axis[1], 8, &xo, &xi); + s[out].vectorize(xi); + }; + + std::function traverse; + traverse = [&](const Operation& op) { + // Inline all one-to-one-mapping operators except the last stage (output) + if (is_broadcast(op->tag)) { + if (!contains(s->outputs, op)) { + s[op].compute_inline(); + } + for (auto tensor : op->InputTensors()) { + if (tensor->op->InputTensors().size() > 0) { + traverse(tensor->op); + } + } + } else if (op->tag == "binary_dense") { + auto output = op.output(0); + auto data = op->InputTensors()[0]; + auto weight = op->InputTensors()[1]; + _schedule(data, weight, output); + } else { + LOG(ERROR) << "Unsupported operator " << op->tag; + } + }; + + traverse(outs[0]->op); + return s; +} + +} // namespace x86 +} // namespace topi +#endif // TOPI_X86_BNN_H_ diff --git a/topi/include/topi/x86/default.h b/topi/include/topi/x86/default.h new file mode 100644 index 000000000000..76df67b79169 --- /dev/null +++ b/topi/include/topi/x86/default.h @@ -0,0 +1,58 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file x86/default.h +* \brief default x86 schedule +*/ +#ifndef TOPI_X86_DEFAULT_H_ +#define TOPI_X86_DEFAULT_H_ + +#include "topi/tags.h" +#include "topi/detail/fuse.h" +#include "tvm/tvm.h" +#include "tvm/build_module.h" + +namespace topi { +using namespace tvm; + +namespace x86 { +/*! +* \brief Create a default x86 schedule for the given ops. +* +* \param target The target to generate a schedule for. +* \param outs The output tensors. +* \param auto_inline Whether to apply the auto inline step. +* +* \return A schedule for the given ops. +*/ +Schedule default_schedule(const Target &target, const Array& outs, bool auto_inline) { + Array out_ops; + for (auto t : outs) { + out_ops.push_back(t->op); + } + auto s = create_schedule(out_ops); + auto x = outs[0]; + auto axis = s[x]->op.as()->axis; + + if (auto_inline) { + tvm::schedule::AutoInlineInjective(s); + if (axis.size() > 0) { + Fuse(s[x], axis); + } + return s; + } + + if (axis.size() == 4) { + auto n = axis[0]; + auto c = axis[1]; + auto fused = Fuse(s[x], { n, c }); // for nhwc layout, fuse n and h + s[x].parallel(fused); + } else { + s[x].parallel(axis[0]); + } + + return s; +} + +} // namespace x86 +} // namespace topi +#endif // TOPI_X86_DEFAULT_H_ diff --git a/topi/include/topi/x86/injective.h b/topi/include/topi/x86/injective.h new file mode 100644 index 000000000000..8609beb56aa0 --- /dev/null +++ b/topi/include/topi/x86/injective.h @@ -0,0 +1,50 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file x86/injective.h +* \brief x86 schedule for injective ops +*/ +#ifndef TOPI_X86_INJECTIVE_H_ +#define TOPI_X86_INJECTIVE_H_ + +#include "topi/tags.h" +#include "topi/detail/fuse.h" +#include "tvm/tvm.h" +#include "tvm/build_module.h" + +namespace topi { +using namespace tvm; + +namespace x86 { +/*! +* \brief Create an x86 schedule for the given injective ops. +* +* \param target The target to generate a schedule for. +* \param outs The output tensors. 
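+*
+* \note Usage sketch from Python, assuming B is the output of an injective op
+* (the registered entry point is topi.cpp.x86.schedule_injective):
+* \code{.py}
+*   target = topi.cpp.TEST_create_target("llvm")
+*   s = topi.cpp.x86.schedule_injective(target, [B])
+* \endcode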
+* +* \return A schedule for the given ops. +*/ +Schedule schedule_injective(const Target &target, const Array& outs) { + Array out_ops; + for (auto t : outs) { + out_ops.push_back(t->op); + } + auto s = create_schedule(out_ops); + tvm::schedule::AutoInlineInjective(s); + + auto x = outs[0]; + auto axis = s[x]->op.as()->axis; + if (axis.size() == 4) { + auto n = axis[0]; + auto c = axis[1]; + auto fused = Fuse(s[x], { n, c }); // for nhwc layout, fuse n and h + s[x].parallel(fused); + } else { + s[x].parallel(axis[0]); + } + + return s; +} + +} // namespace x86 +} // namespace topi +#endif // TOPI_X86_INJECTIVE_H_ diff --git a/topi/python/setup.py b/topi/python/setup.py index e2a8a5d77a4f..6967051acbd8 100644 --- a/topi/python/setup.py +++ b/topi/python/setup.py @@ -2,6 +2,7 @@ """Setup TOPI package.""" from __future__ import absolute_import import sys +import os from setuptools import find_packages from setuptools.dist import Distribution @@ -13,7 +14,40 @@ from setuptools import setup from setuptools.extension import Extension -__version__ = "0.1.0" +def get_lib_names(): + if sys.platform.startswith('win32'): + return ['libtvm_topi.dll', 'tvm_topi.dll'] + if sys.platform.startswith('darwin'): + return ['libtvm_topi.dylib', 'tvm_topi.dylib'] + return ['libtvm_topi.so', 'tvm_topi.so'] + +def get_lib_path(): + """Get library path, name and version""" + # We can not import `libinfo.py` in setup.py directly since __init__.py + # Will be invoked which introduces dependences + CURRENT_DIR = os.path.dirname(__file__) + libinfo_py = os.path.join(CURRENT_DIR, '../../python/tvm/_ffi/libinfo.py') + libinfo = {'__file__': libinfo_py} + exec(compile(open(libinfo_py, "rb").read(), libinfo_py, 'exec'), libinfo, libinfo) + lib_path = libinfo['find_lib_path'](get_lib_names()) + version = libinfo['__version__'] + libs = [lib_path[0]] + if libs[0].find("runtime") == -1: + for name in lib_path[1:]: + if name.find("runtime") != -1: + libs.append(name) + break + return libs, version + +LIB_LIST, __version__ = get_lib_path() + +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +for i, path in enumerate(LIB_LIST): + LIB_LIST[i] = os.path.relpath(path, curr_path) +setup_kwargs = { + "include_package_data": True, + "data_files": [('topi', LIB_LIST)] +} setup(name='topi', version=__version__, @@ -23,4 +57,5 @@ "decorator", ], packages=find_packages(), - url='https://github.com/dmlc/tvm') + url='https://github.com/dmlc/tvm', + **setup_kwargs) diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py index 9760722798af..4979bdbaac8d 100644 --- a/topi/python/topi/__init__.py +++ b/topi/python/topi/__init__.py @@ -9,6 +9,8 @@ """ from __future__ import absolute_import as _abs +from tvm._ffi.libinfo import __version__ + from .math import * from .reduction import * from .transform import * @@ -21,3 +23,4 @@ from . import testing from . import util from . import rocm +from . 
import cpp diff --git a/topi/python/topi/cpp.py b/topi/python/topi/cpp.py new file mode 100644 index 000000000000..c59d6d6bccfa --- /dev/null +++ b/topi/python/topi/cpp.py @@ -0,0 +1,86 @@ +"""FFI for C++ TOPI ops and schedules""" +import sys +import os +import ctypes +from imp import new_module as _new_module +from tvm._ffi.function import _init_api_prefix +from tvm._ffi import libinfo +import tvm as _tvm + +def _get_lib_names(): + if sys.platform.startswith('win32'): + return ['libtvm_topi.dll', 'tvm_topi.dll'] + if sys.platform.startswith('darwin'): + return ['libtvm_topi.dylib', 'tvm_topi.dylib'] + return ['libtvm_topi.so', 'tvm_topi.so'] + +def _load_lib(): + """Load libary by searching possible path.""" + curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) + lib_search = curr_path + lib_path = libinfo.find_lib_path(_get_lib_names(), lib_search, optional=True) + if lib_path is None: + return None, None + lib = ctypes.CDLL(lib_path[0], ctypes.RTLD_GLOBAL) + return lib, os.path.basename(lib_path[0]) + +_LIB, _LIB_NAME = _load_lib() + +_init_api_prefix("topi.cpp", "topi") + +def _create_module(name): + fullname = __name__ + "." + name + mod = _new_module(fullname) + sys.modules[fullname] = mod + return mod + +# pylint: disable-msg=C0103 + +nn = _create_module("nn") +_init_api_prefix("topi.cpp.nn", "topi.nn") +generic = _create_module("generic") +_init_api_prefix("topi.cpp.generic", "topi.generic") +cuda = _create_module("cuda") +_init_api_prefix("topi.cpp.cuda", "topi.cuda") +rocm = _create_module("rocm") +_init_api_prefix("topi.cpp.rocm", "topi.rocm") +x86 = _create_module("x86") +_init_api_prefix("topi.cpp.x86", "topi.x86") + +class IntVector(object): + """Handle to std::vector instance """ + _tvm_tcode = 27 + + def __init__(self, handle): + self.handle = handle + + def __del__(self): + _tvm.nd.free_extension_handle(self.handle, 27) + + @property + def _tvm_handle(self): + return self.handle.value + + def __getitem__(self, idx): + return ivec_get(self, idx) + +_tvm.register_extension(IntVector, IntVector) + +class Target(object): + """Handle to C++ Target instance """ + _tvm_tcode = 28 + + def __init__(self, handle): + self.handle = handle + + def __del__(self): + _tvm.nd.free_extension_handle(self.handle, 28) + + @property + def _tvm_handle(self): + return self.handle.value + + def __getitem__(self, idx): + return ivec_get(self, idx) + +_tvm.register_extension(Target, Target) diff --git a/topi/src/topi.cc b/topi/src/topi.cc new file mode 100644 index 000000000000..03dd004bd42c --- /dev/null +++ b/topi/src/topi.cc @@ -0,0 +1,448 @@ +/*! +* Copyright (c) 2017 by Contributors +* \brief Registration of TVM operators and schedules +* \file topi.cc +*/ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace tvm { +namespace runtime { +template<> +struct extension_class_info { + static const int code = 28; +}; +} // namespace tvm +} // namespace runtime + +namespace topi { +using namespace tvm; +using namespace tvm::runtime; + +TVM_REGISTER_EXT_TYPE(tvm::Target); + +/*! 
\brief Canonicalize an argument that may be Array or int to Array */
+Array ArrayOrInt(TVMArgValue arg) {
+  if (arg.type_code() == kDLInt || arg.type_code() == kDLUInt) {
+    Array result;
+    result.push_back(arg.operator int());
+    return result;
+  } else {
+    return arg;
+  }
+}
+
+TVM_REGISTER_GLOBAL("topi.TEST_create_target")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = tvm::Target::create(args[0]);
+  });
+
+/* Ops from broadcast.h */
+TVM_REGISTER_GLOBAL("topi.broadcast_to")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = broadcast_to(args[0], args[1]);
+  });
+
+TVM_REGISTER_GLOBAL("topi.broadcast_add")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = broadcast_add(args[0], args[1]);
+  });
+
+TVM_REGISTER_GLOBAL("topi.broadcast_sub")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = broadcast_sub(args[0], args[1]);
+  });
+
+TVM_REGISTER_GLOBAL("topi.broadcast_mul")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = broadcast_mul(args[0], args[1]);
+  });
+
+TVM_REGISTER_GLOBAL("topi.broadcast_div")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = broadcast_div(args[0], args[1]);
+  });
+
+TVM_REGISTER_GLOBAL("topi.broadcast_maximum")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = broadcast_maximum(args[0], args[1]);
+  });
+
+TVM_REGISTER_GLOBAL("topi.broadcast_minimum")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = broadcast_minimum(args[0], args[1]);
+  });
+
+TVM_REGISTER_GLOBAL("topi.broadcast_pow")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = broadcast_pow(args[0], args[1]);
+  });
+
+/* Ops from elemwise.h */
+TVM_REGISTER_GLOBAL("topi.exp")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = exp(args[0]);
+  });
+
+TVM_REGISTER_GLOBAL("topi.tanh")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = tanh(args[0]);
+  });
+
+TVM_REGISTER_GLOBAL("topi.sigmoid")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = sigmoid(args[0]);
+  });
+
+TVM_REGISTER_GLOBAL("topi.sqrt")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = sqrt(args[0]);
+  });
+
+TVM_REGISTER_GLOBAL("topi.log")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = log(args[0]);
+  });
+
+TVM_REGISTER_GLOBAL("topi.identity")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = identity(args[0]);
+  });
+
+TVM_REGISTER_GLOBAL("topi.negative")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = negative(args[0]);
+  });
+
+TVM_REGISTER_GLOBAL("topi.pow")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = pow(args[0], args[1]);
+  });
+
+TVM_REGISTER_GLOBAL("topi.left_shift")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  Tensor lhs = args[0];
+  Expr rhs = args[1];
+  *rv = lhs << rhs;
+  });
+
+TVM_REGISTER_GLOBAL("topi.right_shift")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  Tensor lhs = args[0];
+  Expr rhs = args[1];
+  *rv = lhs >> rhs;
+  });
+
+TVM_REGISTER_GLOBAL("topi.clip")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = clip(args[0], args[1], args[2]);
+  });
+
+TVM_REGISTER_GLOBAL("topi.cast")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = cast(args[0], args[1]);
+  });
+
+/* Ops from nn.h */
+TVM_REGISTER_GLOBAL("topi.nn.relu")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = relu(args[0]);
+  });
+
+TVM_REGISTER_GLOBAL("topi.nn.leaky_relu")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = leaky_relu(args[0]);
+  });
+
+TVM_REGISTER_GLOBAL("topi.nn.pad")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = pad(args[0], args[1], args[2], args[3]);
+  });
+
+/* Ops from reduction.h */
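+// Usage sketch: once registered, these reductions are callable from Python
+// through the topi.cpp FFI module (A below is an assumed placeholder tensor),
+// mirroring topi/tests/python_cpp/test_topi_reduce.py:
+//   B = topi.cpp.sum(A, (1, 2), True)     # reduce over axes 1 and 2, keepdims
+//   C = topi.cpp.argmax(A, (1,), False)   # index of the maximum along axis 1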
+TVM_REGISTER_GLOBAL("topi.sum") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::sum(args[0], ArrayOrInt(args[1]), args[2]); + }); + +TVM_REGISTER_GLOBAL("topi.min") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::min(args[0], ArrayOrInt(args[1]), args[2]); + }); + +TVM_REGISTER_GLOBAL("topi.max") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::max(args[0], ArrayOrInt(args[1]), args[2]); + }); + +TVM_REGISTER_GLOBAL("topi.argmin") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::argmin(args[0], ArrayOrInt(args[1]), args[2]); + }); + +TVM_REGISTER_GLOBAL("topi.argmax") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::argmax(args[0], ArrayOrInt(args[1]), args[2]); + }); + +/* Ops from transform.h */ +TVM_REGISTER_GLOBAL("topi.expand_dims") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = expand_dims(args[0], args[1], args[2]); + }); + +TVM_REGISTER_GLOBAL("topi.transpose") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = transpose(args[0], args[1]); + }); + +TVM_REGISTER_GLOBAL("topi.reshape") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = reshape(args[0], args[1]); + }); + +TVM_REGISTER_GLOBAL("topi.squeeze") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = squeeze(args[0], ArrayOrInt(args[1])); + }); + +TVM_REGISTER_GLOBAL("topi.concatenate") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = concatenate(args[0], args[1]); + }); + +TVM_REGISTER_GLOBAL("topi.split") +.set_body([](TVMArgs args, TVMRetValue *rv) { + if (args[1].type_code() == kDLInt || args[1].type_code() == kDLUInt) { + *rv = split_sections(args[0], args[1], args[2]); + } else { + *rv = split(args[0], args[1], args[2]); + } + }); + +/* Ops from nn/batch_norm.h */ +TVM_REGISTER_GLOBAL("topi.nn.batch_norm_inference") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = nn::batch_norm_inference(args[0], + args[1], + args[2], + args[3], + args[4], + static_cast(args[5]), + args[6]); + }); + +/* Ops from nn/bnn.h */ +TVM_REGISTER_GLOBAL("topi.nn.binarize_pack") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = nn::binarize_pack(args[0], args[1]); + }); + +TVM_REGISTER_GLOBAL("topi.nn.binary_dense") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = nn::binary_dense(args[0], args[1]); + }); + +/* Ops from nn/dense.h */ +TVM_REGISTER_GLOBAL("topi.nn.dense") +.set_body([](TVMArgs args, TVMRetValue *rv) { + Tensor bias_val; + Tensor *bias; + if (args[2].type_code() == kNull) { + bias = nullptr; + } else { + bias_val = args[2]; + bias = &bias_val; + } + *rv = nn::dense(args[0], args[1], bias); + }); + +/* Ops from nn/dilate.h */ +TVM_REGISTER_GLOBAL("topi.nn.dilate") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = nn::dilate(args[0], args[1]); + }); + +/* Ops from nn/flatten.h */ +TVM_REGISTER_GLOBAL("topi.nn.flatten") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = nn::flatten(args[0]); + }); + +/* Ops from nn/mapping.h */ +TVM_REGISTER_GLOBAL("topi.nn.scale_shift_nchw") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = nn::scale_shift_nchw(args[0], args[1], args[2]); + }); + +TVM_REGISTER_GLOBAL("topi.nn.scale_shift_nhwc") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = nn::scale_shift_nhwc(args[0], args[1], args[2]); + }); + +/* Ops from nn/pooling.h */ +TVM_REGISTER_GLOBAL("topi.nn.pool") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = nn::pool(args[0], args[1], args[2], args[3], + static_cast(static_cast(args[4])), + args[5]); + }); + +TVM_REGISTER_GLOBAL("topi.nn.global_pool") 
+.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = nn::global_pool(args[0], + static_cast(static_cast(args[1]))); + }); + +/* Ops from nn/softmax.h */ +TVM_REGISTER_GLOBAL("topi.nn.softmax") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = nn::softmax(args[0]); + }); + +TVM_REGISTER_GLOBAL("topi.nn.log_softmax") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = nn::log_softmax(args[0]); + }); + +/* Generic schedules */ +TVM_REGISTER_GLOBAL("topi.generic.default_schedule") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::generic::default_schedule(args[0], args[1], args[2]); + }); + +TVM_REGISTER_GLOBAL("topi.generic.schedule_extern") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::generic::schedule_extern(args[0], args[1]); + }); + +TVM_REGISTER_GLOBAL("topi.generic.schedule_injective") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::generic::schedule_injective(args[0], args[1]); + }); + +/* x86 schedules */ +TVM_REGISTER_GLOBAL("topi.x86.schedule_binarize_pack") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::x86::schedule_binarize_pack(args[0], args[1]); + }); + +TVM_REGISTER_GLOBAL("topi.x86.schedule_binary_dense") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::x86::schedule_binary_dense(args[0], args[1]); + }); + +TVM_REGISTER_GLOBAL("topi.x86.default_schedule") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::x86::default_schedule(args[0], args[1], args[2]); + }); + +TVM_REGISTER_GLOBAL("topi.x86.schedule_injective") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::x86::schedule_injective(args[0], args[1]); + }); + +/* ROCm schedules */ +TVM_REGISTER_GLOBAL("topi.rocm.dense_cuda") +.set_body([](TVMArgs args, TVMRetValue *rv) { + Tensor bias_val; + Tensor *bias; + if (args[3].type_code() == kNull) { + bias = nullptr; + } else { + bias_val = args[3]; + bias = &bias_val; + } + *rv = rocm::dense_rocm(args[0], args[1], args[2], bias); + }); + +TVM_REGISTER_GLOBAL("topi.rocm.schedule_dense") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::rocm::schedule_dense(args[0], args[1]); + }); + +/* CUDA schedules */ +TVM_REGISTER_GLOBAL("topi.cuda.dense_cuda") +.set_body([](TVMArgs args, TVMRetValue *rv) { + Tensor bias_val; + Tensor *bias; + if (args[3].type_code() == kNull) { + bias = nullptr; + } else { + bias_val = args[3]; + bias = &bias_val; + } + *rv = cuda::dense_cuda(args[0], args[1], args[2], bias); + }); + +TVM_REGISTER_GLOBAL("topi.cuda.schedule_dense") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::cuda::schedule_dense(args[0], args[1]); + }); + +TVM_REGISTER_GLOBAL("topi.cuda.schedule_extern") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::cuda::schedule_extern(args[0], args[1]); + }); + +TVM_REGISTER_GLOBAL("topi.cuda.schedule_injective") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::cuda::schedule_injective(args[0], args[1]); + }); + +TVM_REGISTER_GLOBAL("topi.cuda.schedule_pool") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::cuda::schedule_pool(args[0], args[1]); + }); + +TVM_REGISTER_GLOBAL("topi.cuda.schedule_global_pool") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::cuda::schedule_global_pool(args[0], args[1]); + }); + +TVM_REGISTER_GLOBAL("topi.cuda.schedule_reduce") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::cuda::schedule_reduce(args[0], args[1]); + }); + +TVM_REGISTER_GLOBAL("topi.cuda.schedule_softmax") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = 
topi::cuda::schedule_softmax(args[0], args[1]); + }); + +} // namespace topi diff --git a/topi/tests/python_cpp/test_topi_basic.py b/topi/tests/python_cpp/test_topi_basic.py new file mode 100644 index 000000000000..1faba588b877 --- /dev/null +++ b/topi/tests/python_cpp/test_topi_basic.py @@ -0,0 +1,31 @@ +import tvm +import topi +from topi import util + + +def test_util(): + x = tvm.const(100) + assert util.get_const_int(x) == 100 + assert util.get_const_tuple((x, x)) == (100, 100) + + +def test_ewise(): + m = tvm.var('m') + l = tvm.var('l') + A = tvm.placeholder((m, l), name='A') + + def test_apply(func, name): + B = func(A) + assert tuple(B.shape) == tuple(A.shape) + assert B.op.body[0].name == name + + test_apply(topi.cpp.exp, "exp") + test_apply(topi.cpp.tanh, "tanh") + test_apply(topi.cpp.sigmoid, "sigmoid") + test_apply(topi.cpp.log, "log") + test_apply(topi.cpp.sqrt, "sqrt") + + +if __name__ == "__main__": + test_util() + test_ewise() diff --git a/topi/tests/python_cpp/test_topi_bnn.py b/topi/tests/python_cpp/test_topi_bnn.py new file mode 100644 index 000000000000..3fa5cfc4a0a7 --- /dev/null +++ b/topi/tests/python_cpp/test_topi_bnn.py @@ -0,0 +1,55 @@ +"""Test code for binary neural network operators.""" +import numpy as np +import tvm +import topi +from topi.util import get_const_tuple +from tvm.contrib.pickle_memoize import memoize + + +def verify_binary_dense(batch, in_dim, out_dim): + A = tvm.placeholder((batch, in_dim), name='A') + B = tvm.placeholder((out_dim, in_dim), name='B') + bnn_A = topi.cpp.nn.binarize_pack(A, 1) + bnn_B = topi.cpp.nn.binarize_pack(B, 1) + # binary dense + bnn_A1 = tvm.placeholder(bnn_A.shape, dtype=bnn_A.dtype) + bnn_B1 = tvm.placeholder(bnn_B.shape, dtype=bnn_B.dtype) + bnn_C = topi.cpp.nn.binary_dense(bnn_A1, bnn_B1) + # schedule + target = topi.cpp.TEST_create_target("llvm") + s1 = topi.cpp.x86.schedule_binarize_pack(target, [bnn_A]) + s2 = topi.cpp.x86.schedule_binarize_pack(target, [bnn_B]) + s3 = topi.cpp.x86.schedule_binary_dense(target, [bnn_C]) + + dtype = A.dtype + @memoize("topi.tests.test_topi_binary_dense") + def get_ref_data(): + # generate random matrix of +1 or -1 value + a_np = (np.random.randint(2, size=(batch, in_dim)) * 2 - 1).astype(dtype) + b_np = (np.random.randint(2, size=(out_dim, in_dim)) * 2 - 1).astype(dtype) + c_np = np.dot(a_np, b_np.T) + return (a_np, b_np, c_np) + + a_np, b_np, c_np = get_ref_data() + + ctx = tvm.cpu(0) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + bnn_a = tvm.nd.array(np.zeros(get_const_tuple(bnn_A.shape), dtype=bnn_A.dtype), ctx) + bnn_b = tvm.nd.array(np.zeros(get_const_tuple(bnn_B.shape), dtype=bnn_B.dtype), ctx) + bnn_c = tvm.nd.array(np.zeros(get_const_tuple(bnn_C.shape), dtype=bnn_C.dtype), ctx) + f1 = tvm.build(s1, [A, bnn_A], 'llvm') + f2 = tvm.build(s2, [B, bnn_B], 'llvm') + f3 = tvm.build(s3, [bnn_A1, bnn_B1, bnn_C], 'llvm') + f1(a, bnn_a) + f2(b, bnn_b) + f3(bnn_a, bnn_b, bnn_c) + np.testing.assert_allclose(bnn_c.asnumpy(), c_np, rtol=1e-5) + +def test_binary_dense(): + verify_binary_dense(1, 4096, 1024) + verify_binary_dense(1, 1024, 1000) + + +if __name__ == "__main__": + test_binary_dense() diff --git a/topi/tests/python_cpp/test_topi_broadcast.py b/topi/tests/python_cpp/test_topi_broadcast.py new file mode 100644 index 000000000000..301751c25517 --- /dev/null +++ b/topi/tests/python_cpp/test_topi_broadcast.py @@ -0,0 +1,114 @@ +"""Test code for broadcasting operators.""" +import os +import numpy as np +import tvm +import topi + +def verify_broadcast_to_ele(in_shape, 
out_shape): + # Build the logic and compile the function + A = tvm.placeholder(shape=in_shape, name="A") + B = topi.cpp.broadcast_to(A, out_shape) + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + target = topi.cpp.TEST_create_target(device) + s = topi.cpp.cuda.schedule_injective(target, [B]) + ctx = tvm.context(device, 0) + foo = tvm.build(s, [A, B], device, name="broadcast_to") + data_npy = np.random.uniform(size=in_shape).astype(A.dtype) + out_npy = np.broadcast_to(data_npy, out_shape) + data_nd = tvm.nd.array(data_npy, ctx) + out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx) + for _ in range(1): + foo(data_nd, out_nd) + np.testing.assert_allclose(out_nd.asnumpy(), out_npy) + + check_device("opencl") + check_device("cuda") + #check_device("metal") + #check_device("rocm") + + +def verify_broadcast_binary_ele(lhs_shape, rhs_shape, typ="add"): + # Build the logic and compile the function + A = tvm.placeholder(shape=lhs_shape, name="A") + B = tvm.placeholder(shape=rhs_shape, name="B") + if typ == "add": + C = topi.cpp.broadcast_add(A, B) + elif typ == "sub": + C = topi.cpp.broadcast_sub(A, B) + elif typ == "div": + C = topi.cpp.broadcast_div(A, B) + elif typ == "mul": + C = topi.cpp.broadcast_mul(A, B) + elif typ == "maximum": + C = topi.cpp.broadcast_maximum(A, B) + elif typ == "minimum": + C = topi.cpp.broadcast_minimum(A, B) + elif typ == "pow": + C = topi.cpp.broadcast_pow(A, B) + else: + raise NotImplementedError + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + target = topi.cpp.TEST_create_target(device) + s = topi.cpp.cuda.schedule_injective(target, [C]) + ctx = tvm.context(device, 0) + foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + typ) + lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype) + rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype) + if typ == "add": + out_npy = lhs_npy + rhs_npy + elif typ == "sub": + out_npy = lhs_npy - rhs_npy + elif typ == "div": + rhs_npy = np.abs(rhs_npy) + 0.001 + out_npy = lhs_npy / rhs_npy + elif typ == "mul": + out_npy = lhs_npy * rhs_npy + elif typ == "maximum": + out_npy = np.maximum(lhs_npy, rhs_npy) + elif typ == "minimum": + out_npy = np.minimum(lhs_npy, rhs_npy) + elif typ == "pow": + out_npy = lhs_npy ** rhs_npy + else: + raise NotImplementedError + lhs_nd = tvm.nd.array(lhs_npy, ctx) + rhs_nd = tvm.nd.array(rhs_npy, ctx) + out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx) + for _ in range(1): + foo(lhs_nd, rhs_nd, out_nd) + np.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4) + + check_device("opencl") + check_device("cuda") + #check_device("metal") + #check_device("rocm") + +def test_broadcast_to(): + verify_broadcast_to_ele((1,), (10,)) + verify_broadcast_to_ele((), (10,)) + verify_broadcast_to_ele((1, 1, 5, 4), (3, 4, 4, 4, 5, 4)) + verify_broadcast_to_ele((1, 128, 1, 32), (64, 128, 64, 32)) + + +def test_broadcast_binary(): + verify_broadcast_binary_ele((5, 2, 3), (2, 1), typ="add") + verify_broadcast_binary_ele((5, 2, 3), (), typ="add") + verify_broadcast_binary_ele((5, 64, 128), (2, 5, 64, 1), typ="mul") + verify_broadcast_binary_ele((2, 3, 1, 32), (64, 32), typ="div") + verify_broadcast_binary_ele((1, 32), (64, 32), typ="sub") + verify_broadcast_binary_ele((32,), (64, 32), typ="maximum") + 
verify_broadcast_binary_ele((1, 2, 2, 1, 32), (64, 32), typ="minimum") + verify_broadcast_binary_ele((1, 32), (64, 32), typ="pow") + + +if __name__ == "__main__": + test_broadcast_to() + test_broadcast_binary() diff --git a/topi/tests/python_cpp/test_topi_clip.py b/topi/tests/python_cpp/test_topi_clip.py new file mode 100644 index 000000000000..a059464375ac --- /dev/null +++ b/topi/tests/python_cpp/test_topi_clip.py @@ -0,0 +1,45 @@ +"""Test code for clip operator""" +import numpy as np +import tvm +import topi +from topi.util import get_const_tuple +from tvm.contrib.pickle_memoize import memoize +from util import make_vector + + +def verify_clip(N, a_min, a_max, dtype): + A = tvm.placeholder((N, N), dtype=dtype, name='A') + B = topi.cpp.clip(A, a_min, a_max) + + # use memoize to pickle the test data for next time use + @memoize("topi.tests.test_topi_clip") + def get_ref_data(): + a_np = np.random.uniform(a_min*2, a_max*2, size=(N, N)).astype(dtype) + b_np = np.clip(a_np, a_min, a_max) + return a_np, b_np + a_np, b_np = get_ref_data() + + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + target = topi.cpp.TEST_create_target(device) + s = topi.cpp.generic.default_schedule(target, [B], False) + ctx = tvm.cpu(0) if device == "llvm" else tvm.gpu(0) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) + f = tvm.build(s, [A, B], device, name="clip") + f(a, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + + for device in ['llvm']: + check_device(device) + +def test_clip(): + verify_clip(1024, -127, 127, 'int8') + verify_clip(1024, -127, 127, 'int16') + verify_clip(1024, -127, 127, 'float32') + + +if __name__ == "__main__": + test_clip() diff --git a/topi/tests/python_cpp/test_topi_dense.py b/topi/tests/python_cpp/test_topi_dense.py new file mode 100644 index 000000000000..6ebd6948ee71 --- /dev/null +++ b/topi/tests/python_cpp/test_topi_dense.py @@ -0,0 +1,61 @@ +"""Test code for dense operator""" +import numpy as np +import tvm +import topi +from topi.util import get_const_tuple +from tvm.contrib.pickle_memoize import memoize + + +def verify_dense(batch, in_dim, out_dim, use_bias=True): + A = tvm.placeholder((batch, in_dim), name='A') + B = tvm.placeholder((out_dim, in_dim), name='B') + C = tvm.placeholder((out_dim,), name='C') + D = topi.cpp.nn.dense(A, B, C if use_bias else None) + D = topi.cpp.nn.relu(D) + dtype = A.dtype + + # use memoize to pickle the test data for next time use + @memoize("topi.tests.test_topi_dense") + def get_ref_data(): + a_np = np.random.uniform(size=(batch, in_dim)).astype(dtype) + b_np = np.random.uniform(size=(out_dim, in_dim)).astype(dtype) + c_np = np.random.uniform(size=(out_dim,)).astype(dtype) + if use_bias: + d_np = np.maximum(np.dot(a_np, b_np.T) + c_np, 0.0) + else: + d_np = np.maximum(np.dot(a_np, b_np.T), 0.0) + return (a_np, b_np, c_np, d_np) + # get the test data + a_np, b_np, c_np, d_np = get_ref_data() + + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + target = topi.cpp.TEST_create_target(device) + if device == "llvm": + s = topi.cpp.generic.schedule_dense(target, [D]) + elif device == "rocm": + s = topi.cpp.rocm.schedule_dense(target, [D]) + else: + s = topi.cpp.cuda.schedule_dense(target, [D]) + ctx = tvm.context(device, 0) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = 
tvm.nd.array(c_np, ctx) + d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx) + f = tvm.build(s, [A, B, C, D], device, name="dense") + f(a, b, c, d) + np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5) + + for device in ['cuda', 'opencl', 'metal', 'rocm']: + check_device(device) + +def test_dense(): + verify_dense(1, 1024, 1000, use_bias=True) + verify_dense(1, 1024, 1000, use_bias=False) + + +if __name__ == "__main__": + test_dense() diff --git a/topi/tests/python_cpp/test_topi_dilate.py b/topi/tests/python_cpp/test_topi_dilate.py new file mode 100644 index 000000000000..177ddbc3cfd7 --- /dev/null +++ b/topi/tests/python_cpp/test_topi_dilate.py @@ -0,0 +1,34 @@ +import tvm +import topi +import numpy as np + +def test_dilate(): + target = 'llvm' + ctx = tvm.cpu(0) + + def _test_dilate(input_size, strides): + Input = tvm.placeholder((input_size)) + Output = topi.cpp.nn.dilate(Input, strides) + tgt = topi.cpp.TEST_create_target(target) + schedule = topi.cpp.generic.default_schedule(tgt, [Output], True) + input_np = np.random.uniform(size=input_size).astype(Input.dtype) + output_np = topi.testing.dilate_python(input_np, strides) + input_tvm = tvm.nd.array(input_np, ctx=ctx) + output_size = topi.util.get_const_tuple(Output.shape) + output_tvm = tvm.nd.array(np.zeros(shape=output_size).astype(Output.dtype), ctx=ctx) + f = tvm.build(schedule, [Input, Output], target) + f(input_tvm, output_tvm) + np.testing.assert_allclose(output_tvm.asnumpy(), output_np, rtol=1e-5) + + _test_dilate((32,), (2,)) + _test_dilate((32,32), (2,2)) + _test_dilate((1,3,32,32), (1,1,1,1)) + _test_dilate((1,3,32,32), (2,2,2,2)) + _test_dilate((1,32,32,3,3), (1,1,1,1,1)) + _test_dilate((1,32,32,3,3), (2,2,2,2,2)) + _test_dilate((1,32,32,32,3,3), (1,1,1,2,2,2)) + _test_dilate((1,32,32,32,3,3), (2,2,2,1,1,1)) + + +if __name__ == "__main__": + test_dilate() diff --git a/topi/tests/python_cpp/test_topi_pooling.py b/topi/tests/python_cpp/test_topi_pooling.py new file mode 100644 index 000000000000..6132fcd36469 --- /dev/null +++ b/topi/tests/python_cpp/test_topi_pooling.py @@ -0,0 +1,120 @@ +"""Test code for pooling""" +import numpy as np +import tvm +import topi +import math +from topi.util import get_const_tuple + +pool_code = { + "avg": 0, + "max": 1 +} +def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode): + iw = ih + kw = kh + sw = sh + ph, pw = padding + A = tvm.placeholder((n, ic, ih, iw), name='A') + B = topi.cpp.nn.pool(A, [kh, kw], [sh, sw], padding, + pool_code[pool_type], ceil_mode) + B = topi.cpp.nn.relu(B) + dtype = A.dtype + + bshape = get_const_tuple(B.shape) + ashape = get_const_tuple(A.shape) + if ceil_mode: + assert bshape[2] == int(math.ceil(float(ashape[2] - kh + ph * 2) / sh) + 1) + assert bshape[3] == int(math.ceil(float(ashape[3] - kw + pw * 2) / sw) + 1) + else: + assert bshape[2] == int(math.floor(float(ashape[2] - kh + ph * 2) / sh) + 1) + assert bshape[3] == int(math.floor(float(ashape[3] - kw + pw * 2) / sw) + 1) + + + a_np = np.random.uniform(size=(n, ic, ih, iw)).astype(dtype) + pad_np = np.zeros(shape=(n, ic, ih+2*ph, iw+2*pw)).astype(dtype) + no_zero = (range(n), range(ic), (range(ph, ih+ph)), (range(pw, iw+pw))) + pad_np[np.ix_(*no_zero)] = a_np + _, oc, oh, ow = get_const_tuple(B.shape) + b_np = np.zeros(shape=(n, oc, oh, ow)).astype(dtype) + + if pool_type == 'avg': + for i in range(oh): + for j in range(ow): + b_np[:,:,i,j] = np.mean(pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw], axis=(2,3)) + elif pool_type =='max': + for i in range(oh): + for j in 
range(ow): + b_np[:,:,i,j] = np.max(pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw], axis=(2,3)) + b_np = np.maximum(b_np, 0.0) + + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + target = topi.cpp.TEST_create_target(device) + if device == "llvm": + s = topi.cpp.generic.default_schedule(target, [B], False) + else: + s = topi.cpp.cuda.schedule_pool(target, [B]) + ctx = tvm.context(device, 0) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) + f = tvm.build(s, [A, B], device) + f(a, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + + for device in ['cuda', 'opencl', 'metal', 'rocm']: + check_device(device) + +def test_pool(): + verify_pool(1, 256, 32, 2, 2, [0, 0], 'avg', False) + verify_pool(1, 256, 31, 3, 3, [1, 2], 'avg', False) + verify_pool(1, 256, 32, 2, 2, [0, 0], 'max', False) + verify_pool(1, 256, 31, 3, 3, [2, 1], 'max', False) + verify_pool(1, 256, 31, 3, 3, [2, 1], 'max', True) + + + +def verify_global_pool(n, c, h, w, pool_type): + A = tvm.placeholder((n, c, h, w), name='A') + B = topi.cpp.nn.global_pool(A, pool_code[pool_type]) + B = topi.cpp.nn.relu(B) + + a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) + if pool_type == 'avg': + b_np = np.mean(a_np, axis=(2,3), keepdims=True) + elif pool_type =='max': + b_np = np.max(a_np, axis=(2,3), keepdims=True) + b_np = np.maximum(b_np, 0.0) + + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + target = topi.cpp.TEST_create_target(device) + if device == "llvm": + s = topi.cpp.generic.default_schedule(target, [B], False) + else: + s = topi.cpp.cuda.schedule_global_pool(target, [B]) + ctx = tvm.context(device, 0) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + f = tvm.build(s, [A, B], device) + f(a, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + + for device in ['cuda', 'opencl', 'metal', 'rocm']: + check_device(device) + +def test_global_pool(): + verify_global_pool(1, 1024, 7, 7, 'avg') + verify_global_pool(4, 1024, 7, 7, 'avg') + verify_global_pool(1, 1024, 7, 7, 'max') + verify_global_pool(4, 1024, 7, 7, 'max') + + +if __name__ == "__main__": + test_pool() + test_global_pool() diff --git a/topi/tests/python_cpp/test_topi_reduce.py b/topi/tests/python_cpp/test_topi_reduce.py new file mode 100644 index 000000000000..adfe18ba4ef9 --- /dev/null +++ b/topi/tests/python_cpp/test_topi_reduce.py @@ -0,0 +1,120 @@ +"""Test code for reduce.""" +import os +import numpy as np +import tvm +import topi + +def _my_npy_argmax(arr, axis, keepdims): + if not keepdims: + return arr.argmax(axis=axis) + else: + if axis is not None: + out_shape = list(arr.shape) + out_shape[axis] = 1 + else: + out_shape = [1 for _ in range(len(arr.shape))] + return arr.argmax(axis=axis).reshape(out_shape) + + +def _my_npy_argmin(arr, axis, keepdims): + if not keepdims: + return arr.argmin(axis=axis) + else: + out_shape = list(arr.shape) + out_shape[axis] = 1 + return arr.argmin(axis=axis).reshape(out_shape) + +def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum"): + # Build the logic and compile the function + dat_dtype = "float32" + A = tvm.placeholder(shape=in_shape, name="A", dtype=dat_dtype) + A1 = topi.cpp.sqrt(topi.cpp.exp(A)) + out_dtype = "float32" + if type == "sum": 
+ B = topi.cpp.sum(A1, axis, keepdims) + elif type == "max": + B = topi.cpp.max(A1, axis, keepdims) + elif type == "min": + B = topi.cpp.min(A1, axis, keepdims) + elif type == "argmax": + B = topi.cpp.argmax(A1, axis, keepdims) + out_dtype = "int32" + elif type == "argmin": + B = topi.cpp.argmin(A1, axis, keepdims) + out_dtype = "int32" + else: + raise NotImplementedError + + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + target = topi.cpp.TEST_create_target(device) + if device == "llvm": + s = topi.cpp.generic.default_schedule(target, [B], True) + else: + s = topi.cpp.cuda.schedule_reduce(target, [B]) + + ctx = tvm.context(device, 0) + foo = tvm.build(s, [A, B], device, name="sum") + # Test + in_npy = np.random.uniform(size=in_shape).astype(np.float32) + in_npy_map = np.sqrt(np.exp(in_npy)).astype(np.float32) + if type == "sum": + out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims) + elif type == "max": + out_npy = in_npy_map.max(axis=axis, keepdims=keepdims) + elif type == "min": + out_npy = in_npy_map.min(axis=axis, keepdims=keepdims) + elif type == "argmax": + out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims) + elif type == "argmin": + out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims) + else: + raise NotImplementedError + data_tvm = tvm.nd.array(in_npy, ctx=ctx) + out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype) + for _ in range(1): + foo(data_tvm, out_tvm) + np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3) + for device in ["cuda", "opencl", "metal", "llvm", "rocm"]: + check_device(device) + + +def test_reduce_map(): + verify_reduce_map_ele(in_shape=(128, 24, 128, 24), + axis=(1, 2, 3), + keepdims=True, + type="sum") + verify_reduce_map_ele(in_shape=(128, 24 * 128 * 24), + axis=(1,), + keepdims=False, + type="max") + verify_reduce_map_ele(in_shape=(32, 128, 24), + axis=None, + keepdims=True, + type="sum") + verify_reduce_map_ele(in_shape=(128, 24, 128, 24), + axis=(0, 2), + keepdims=False, + type="min") + verify_reduce_map_ele(in_shape=(32, 128), + axis=1, + keepdims=True, + type="argmax") + verify_reduce_map_ele(in_shape=(32, 24, 32, 24), + axis=2, + keepdims=False, + type="argmin") + verify_reduce_map_ele(in_shape=(31, 21, 15), + axis=None, + keepdims=True, + type="argmax") + verify_reduce_map_ele(in_shape=(31, 21, 15), + axis=None, + keepdims=False, + type="sum") + +if __name__ == "__main__": + test_reduce_map() diff --git a/topi/tests/python_cpp/test_topi_relu.py b/topi/tests/python_cpp/test_topi_relu.py new file mode 100644 index 000000000000..7322f8925517 --- /dev/null +++ b/topi/tests/python_cpp/test_topi_relu.py @@ -0,0 +1,62 @@ +"""Test code for relu activation""" +import os +import numpy as np +import tvm +import topi +from topi.util import get_const_tuple + +def verify_relu(m, n): + A = tvm.placeholder((m, n), name='A') + B = topi.cpp.nn.relu(A) + + a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) + b_np = a_np * (a_np > 0) + + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + target = topi.cpp.TEST_create_target(device) + if device == "llvm": + s = topi.cpp.generic.schedule_injective(target, [B]) + else: + s = topi.cpp.cuda.schedule_injective(target, [B]) + ctx = tvm.context(device, 0) + a = tvm.nd.array(a_np, ctx) + b = 
tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + foo = tvm.build(s, [A, B], device, name="relu") + foo(a, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + + for device in ['cuda', 'opencl', 'metal', 'rocm']: + check_device(device) + + +def verify_leaky_relu(m, alpha): + A = tvm.placeholder((m,), name='A') + B = topi.cpp.nn.leaky_relu(A, alpha) + device = "llvm" + target = topi.cpp.TEST_create_target(device) + s = topi.cpp.generic.schedule_injective(target, [B]) + + a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) + b_np = a_np * (a_np > 0) + a_np * (a_np < 0) * alpha + ctx = tvm.cpu(0) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + foo = tvm.build(s, [A, B], device, name="leaky_relu") + foo(a, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + + +def test_relu(): + verify_relu(10, 128) + +def test_leaky_relu(): + verify_leaky_relu(100, 0.1) + + +if __name__ == "__main__": + test_relu() + test_leaky_relu() diff --git a/topi/tests/python_cpp/test_topi_softmax.py b/topi/tests/python_cpp/test_topi_softmax.py new file mode 100644 index 000000000000..2c2f62c18f20 --- /dev/null +++ b/topi/tests/python_cpp/test_topi_softmax.py @@ -0,0 +1,81 @@ +"""Test code for softmax""" +import os +import numpy as np +import tvm +import topi +import logging +from topi.util import get_const_tuple + +def verify_softmax(m, n): + A = tvm.placeholder((m, n), name='A') + B = topi.cpp.nn.softmax(A) + # confirm lower works + s = tvm.create_schedule([B.op]) + tvm.lower(s, [A, B], simple_mode=True) + + a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) + b_np = topi.testing.softmax_python(a_np) + + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + target = topi.cpp.TEST_create_target(device) + if device == "llvm": + s = topi.cpp.generic.default_schedule(target, [B], False) + else: + s = topi.cpp.cuda.schedule_softmax(target, [B]) + ctx = tvm.context(device, 0) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + foo = tvm.build(s, [A, B], device, name="softmax") + foo(a, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + + for device in ['cuda', 'opencl', 'metal', 'rocm']: + check_device(device) + +def test_softmax(): + verify_softmax(32, 10) + verify_softmax(3, 4) + + +def verify_log_softmax(m, n): + A = tvm.placeholder((m, n), name='A') + B = topi.cpp.nn.log_softmax(A) + # confirm lower works + s = tvm.create_schedule([B.op]) + tvm.lower(s, [A, B], simple_mode=True) + a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) + b_np = topi.testing.log_softmax_python(a_np) + + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + target = topi.cpp.TEST_create_target(device) + if device == "llvm": + s = topi.cpp.generic.default_schedule(target, [B], False) + else: + s = topi.cpp.cuda.schedule_softmax(target, [B]) + ctx = tvm.context(device, 0) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + foo = tvm.build(s, [A, B], device, name="log_softmax") + foo(a, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + + for device in ["cuda", "opencl", "metal", "rocm"]: + check_device(device) + + +def 
test_log_softmax(): + verify_log_softmax(32, 10) + verify_log_softmax(3, 4) + +if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) + test_softmax() + test_log_softmax() diff --git a/topi/tests/python_cpp/test_topi_transform.py b/topi/tests/python_cpp/test_topi_transform.py new file mode 100644 index 000000000000..68fad8dae707 --- /dev/null +++ b/topi/tests/python_cpp/test_topi_transform.py @@ -0,0 +1,216 @@ +"""Test code for broadcasting operators.""" +import numpy as np +import tvm +import topi + +def verify_expand_dims(in_shape, out_shape, axis, num_newaxis): + A = tvm.placeholder(shape=in_shape, name="A") + B = topi.cpp.expand_dims(A, axis, num_newaxis) + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + target = topi.cpp.TEST_create_target(device) + if device == "llvm": + s = topi.cpp.generic.schedule_injective(target, [B]) + else: + s = topi.cpp.cuda.schedule_injective(target, [B]) + ctx = tvm.context(device, 0) + foo = tvm.build(s, [A, B], device, name="expand_dims") + data_npy = np.random.uniform(size=in_shape).astype(A.dtype) + out_npy = data_npy.reshape(out_shape) + data_nd = tvm.nd.array(data_npy, ctx) + out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx) + foo(data_nd, out_nd) + np.testing.assert_allclose(out_nd.asnumpy(), out_npy) + + for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]: + check_device(device) + + +def verify_tranpose(in_shape, axes): + A = tvm.placeholder(shape=in_shape, name="A") + B = topi.cpp.transpose(A, axes) + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + target = topi.cpp.TEST_create_target(device) + if device == "llvm": + s = topi.cpp.generic.schedule_injective(target, [B]) + else: + s = topi.cpp.cuda.schedule_injective(target, [B]) + ctx = tvm.context(device, 0) + foo = tvm.build(s, [A, B], device, name="tranpose") + data_npy = np.arange(np.prod(in_shape)).reshape(in_shape).astype(A.dtype) + out_npy = data_npy.transpose(axes) + data_nd = tvm.nd.array(data_npy, ctx) + out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=B.dtype) + foo(data_nd, out_nd) + np.testing.assert_allclose(out_nd.asnumpy(), out_npy) + + for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]: + check_device(device) + + +def verify_reshape(src_shape, dst_shape): + A = tvm.placeholder(shape=src_shape, name="A") + B = topi.cpp.reshape(A, dst_shape) + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + target = topi.cpp.TEST_create_target(device) + if device == "llvm": + s = topi.cpp.generic.schedule_injective(target, [B]) + else: + s = topi.cpp.cuda.schedule_injective(target, [B]) + ctx = tvm.context(device, 0) + foo = tvm.build(s, [A, B], device, name="reshape") + data_npy = np.random.normal(size=src_shape).astype(A.dtype) + out_npy = np.reshape(data_npy, newshape=dst_shape) + data_nd = tvm.nd.array(data_npy, ctx) + out_nd = tvm.nd.empty(dst_shape, ctx=ctx, dtype=B.dtype) + foo(data_nd, out_nd) + np.testing.assert_allclose(out_nd.asnumpy(), out_npy) + + for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]: + check_device(device) + + +def verify_squeeze(src_shape, axis): + A = tvm.placeholder(shape=src_shape, name="A") + B = topi.cpp.squeeze(A, axis) + def 
check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + target = topi.cpp.TEST_create_target(device) + if device == "llvm": + s = topi.cpp.generic.schedule_injective(target, [B]) + else: + s = topi.cpp.cuda.schedule_injective(target, [B]) + ctx = tvm.context(device, 0) + foo = tvm.build(s, [A, B], device, name="squeeze") + data_npy = np.random.normal(size=src_shape).astype(A.dtype) + out_npy = np.squeeze(data_npy, axis=axis) + data_nd = tvm.nd.array(data_npy, ctx) + if out_npy.shape == (): + out_nd_shape = (1,) + else: + out_nd_shape = out_npy.shape + out_nd = tvm.nd.empty(out_nd_shape, ctx=ctx, dtype=B.dtype) + foo(data_nd, out_nd) + np.testing.assert_allclose(out_nd.asnumpy(), out_npy) + + for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]: + check_device(device) + +def verify_concatenate(shapes, axis): + tensor_l = [] + for i, shape in enumerate(shapes): + tensor_l.append(tvm.placeholder(shape, name="A" + str(i))) + out_tensor = topi.cpp.concatenate(tensor_l, axis) + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + target = topi.cpp.TEST_create_target(device) + if device == "llvm": + s = topi.cpp.generic.schedule_injective(target, [out_tensor]) + else: + s = topi.cpp.cuda.schedule_injective(target, [out_tensor]) + ctx = tvm.context(device, 0) + foo = tvm.build(s, tensor_l + [out_tensor], device, name="concatenate") + data_npys = [np.random.normal(size=shape).astype(tensor_l[0].dtype) for shape in shapes] + out_npy = np.concatenate(data_npys, axis=axis) + data_nds = [tvm.nd.array(data_npy, ctx) for data_npy in data_npys] + out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=out_tensor.dtype) + foo(*(data_nds + [out_nd])) + np.testing.assert_allclose(out_nd.asnumpy(), out_npy) + + for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]: + check_device(device) + + +def verify_split(src_shape, indices_or_sections, axis): + A = tvm.placeholder(shape=src_shape, name="A") + tensor_l = topi.cpp.split(A, indices_or_sections, axis) + tensor_l = list(tensor_l) + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + target = topi.cpp.TEST_create_target(device) + if device == "llvm": + s = topi.cpp.generic.schedule_injective(target, tensor_l) + else: + s = topi.cpp.cuda.schedule_injective(target, tensor_l) + ctx = tvm.context(device, 0) + foo = tvm.build(s, [A] + tensor_l, device, name="split") + data_npy = np.random.normal(size=src_shape).astype(A.dtype) + out_npys = np.split(data_npy, indices_or_sections, axis=axis) + data_nd = tvm.nd.array(data_npy, ctx) + out_nds = [tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=tensor_l[0].dtype) for out_npy in out_npys] + foo(*([data_nd] + out_nds)) + for out_nd, out_npy in zip(out_nds, out_npys): + np.testing.assert_allclose(out_nd.asnumpy(), out_npy) + + for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]: + check_device(device) + + +def test_expand_dims(): + verify_expand_dims((3, 10), (3, 10, 1, 1), 2, 2) + verify_expand_dims((3, 10), (1, 3, 10), -3, 1) + + +def test_tranpose(): + verify_tranpose((3, 10, 2), (1, 0, 2)) + verify_tranpose((3, 10, 5), (2, 0, 1)) + verify_tranpose((3, 10), None) + + +def test_reshape(): + verify_reshape((1, 2, 3, 4), (2, 3, 4)) + verify_reshape((4, 2, 3, 4), 
(2, 4, 12)) + verify_reshape((4, 2, 3, 4), (2, 48)) + verify_reshape((16, ), (2, 2, 2, 2)) + + +def test_squeeze(): + verify_squeeze((1, 2, 3, 4), 0) + verify_squeeze((1, 2, 1, 4), None) + verify_squeeze((1, 1, 1, 4), (1, 2)) + verify_squeeze((1, 1, 1, 1), None) + + +def test_concatenate(): + verify_concatenate([(2,), (2,), (2,)], 0) + verify_concatenate([(2, 3, 4), (2, 2, 4), (2, 5, 4)], 1) + verify_concatenate([(1, 2, 4), (1, 2, 3), (1, 2, 7), (1, 2, 8), (1, 2, 1)], -1) + verify_concatenate([(5, 6, 7, 3), + (16, 6, 7, 3), + (12, 6, 7, 3), + (8, 6, 7, 3), + (2, 6, 7, 3)], 0) + + +def test_split(): + verify_split((2, 12, 3), 3, 1) + verify_split((2, 12, 3), [2, 4], 1) + verify_split((10, 12, 24), [5, 7, 9], -1) + +if __name__ == "__main__": + test_concatenate() + test_tranpose() + test_expand_dims() + test_reshape() + test_squeeze() + test_split() From a5dbf4a0f22be5bc0fd4658d1a222a2fb80f6e55 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 28 Jan 2018 09:55:22 -0800 Subject: [PATCH 117/948] Fix Jenkins pipeline (#835) --- Jenkinsfile | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 85d38d3dfcad..4fc2285f507c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -4,11 +4,10 @@ // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/ // tvm libraries -topi_lib = "lib/libtopi.so" -tvm_runtime = "lib/libtvm_runtime.so, config.mk, " -tvm_lib = "lib/libtvm.so, " + tvm_runtime + topi_lib +tvm_runtime = "lib/libtvm_runtime.so, config.mk" +tvm_lib = "lib/libtvm.so, " + tvm_runtime // LLVM upstream lib -tvm_multilib = "lib/libtvm_llvm40.so, lib/libtvm_llvm50.so, lib/libtvm_llvm60.so, " + tvm_runtime + topi_lib +tvm_multilib = "lib/libtvm_llvm40.so, lib/libtvm_llvm50.so, lib/libtvm_llvm60.so, " + tvm_runtime // command to start a docker container docker_run = 'tests/ci_build/ci_build.sh' From 3a91764ec919b7a0034e104c7fac40ed5a840d12 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 28 Jan 2018 14:43:17 -0800 Subject: [PATCH 118/948] [TOPI] Fix compiler warning in topi cpp (#837) --- topi/include/topi/detail/broadcast.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topi/include/topi/detail/broadcast.h b/topi/include/topi/detail/broadcast.h index ba7193cd0647..d76aa7d0ec56 100644 --- a/topi/include/topi/detail/broadcast.h +++ b/topi/include/topi/detail/broadcast.h @@ -55,7 +55,7 @@ inline BroadcastHelper BroadcastShape(const tvm::Array& shape1, auto max_size = std::max(s1_size, s2_size); auto& shape = (s1_size > s2_size) ? shape1 : shape2; auto& vars = (s1_size > s2_size) ? 
bh.vars1 : bh.vars2;
-  for (i = i; i <= max_size; ++i) {
+  for (; i <= max_size; ++i) {
     bh.all_vars.push_front(tvm::Var());
     bh.common_shape.push_front(shape[max_size - i]);
     vars.push_front(bh.all_vars[0]);

From f04454fc8759279ffe90307e6259a721cf53dcb9 Mon Sep 17 00:00:00 2001
From: Clouds
Date: Tue, 30 Jan 2018 01:31:42 +0800
Subject: [PATCH 119/948] fix opengl runtime to use OpenGL/gl3.h for macOS (#833)

* fix opengl to OpenGL/gl3.h for APPLE
* use glfw3 to include gl.h header
---
 src/runtime/opengl/opengl_common.h      | 4 +++-
 src/runtime/opengl/opengl_device_api.cc | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/runtime/opengl/opengl_common.h b/src/runtime/opengl/opengl_common.h
index 661c987e4b3c..0d247edfade4 100644
--- a/src/runtime/opengl/opengl_common.h
+++ b/src/runtime/opengl/opengl_common.h
@@ -11,7 +11,9 @@
 #include
 #include
 #include
-#include
+#if defined(__APPLE__)
+#define GLFW_INCLUDE_GLCOREARB
+#endif
 #include
 #include
 #include
diff --git a/src/runtime/opengl/opengl_device_api.cc b/src/runtime/opengl/opengl_device_api.cc
index df2947db6255..5f629fcf1a1f 100644
--- a/src/runtime/opengl/opengl_device_api.cc
+++ b/src/runtime/opengl/opengl_device_api.cc
@@ -27,10 +27,12 @@ static const char* GLGetErrorString(GLenum error) {
       return "GL_INVALID_VALUE";
     case GL_INVALID_OPERATION:
       return "GL_INVALID_OPERATION";
+#if !defined(__APPLE__)
     case GL_STACK_OVERFLOW:
       return "GL_STACK_OVERFLOW";
     case GL_STACK_UNDERFLOW:
       return "GL_STACK_UNDERFLOW";
+#endif
     case GL_OUT_OF_MEMORY:
       return "GL_OUT_OF_MEMORY";
     default:

From 68b9aa9a05b5914a7c5cf2fce9fda13f4ea6f503 Mon Sep 17 00:00:00 2001
From: ZhiWei Zhang <971749411@qq.com>
Date: Tue, 30 Jan 2018 12:49:28 +0800
Subject: [PATCH 120/948] fixed #841 (#845)

* Update workspace_pool.cc
* Update workspace_pool.cc
---
 src/runtime/workspace_pool.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/runtime/workspace_pool.cc b/src/runtime/workspace_pool.cc
index 24035faedaa7..c903a8621206 100644
--- a/src/runtime/workspace_pool.cc
+++ b/src/runtime/workspace_pool.cc
@@ -28,7 +28,10 @@ class WorkspacePool::Pool {
     nbytes = (nbytes + (kWorkspacePageSize - 1)) / kWorkspacePageSize * kWorkspacePageSize;
     if (nbytes == 0) nbytes = kWorkspacePageSize;
     Entry e;
-    TVMType type = {.code = kDLUInt, .bits = 8, .lanes = 1};
+    TVMType type;
+    type.code = kDLUInt;
+    type.bits = 8;
+    type.lanes = 1;
     if (free_list_.size() == 2) {
       e = free_list_.back();
       free_list_.pop_back();

From 5379b1c7975cd5fa7eff50425de59718a6b9c446 Mon Sep 17 00:00:00 2001
From: xqdan
Date: Tue, 30 Jan 2018 22:19:25 -0800
Subject: [PATCH 121/948] [PASS] Improve storage rewrite(#846) (#847)

* fix #802, create cache based on sugar tensor
* [Pass] Improve storage rewrite
* fix ci
* fix comment
* fix comment
---
 src/pass/storage_rewrite.cc                   |  5 +-
 .../unittest/test_pass_storage_rewrite.py     | 87 ++++++++++++++++++-
 2 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc
index f052a9b05b90..00674153bb35 100644
--- a/src/pass/storage_rewrite.cc
+++ b/src/pass/storage_rewrite.cc
@@ -681,6 +681,8 @@ class StoragePlanRewriter : public IRMutator {
       StorageEntry* dst_entry = nullptr;
       // inplace detection
       if (detect_inplace) {
+        // only one inplace var for s.stmt
+        bool inplace_found = false;
         for (const Variable* src : it->second.kill) {
           if (!inplace_flag.count(src) && alloc_map_.count(src)) {
             InplaceOpVerifier visitor;
@@ -693,10 +695,11 @@
ae.alloc->constant_allocation_size() * ae.alloc->type.bits() * ae.alloc->type.lanes()); - if (src_entry->const_nbits == const_nbits) { + if (src_entry->const_nbits == const_nbits && !inplace_found) { // successfully inplace dst_entry = src_entry; inplace_flag.insert(src); + inplace_found = true; } } } diff --git a/tests/python/unittest/test_pass_storage_rewrite.py b/tests/python/unittest/test_pass_storage_rewrite.py index 6b6ff71810bb..c9999f312300 100644 --- a/tests/python/unittest/test_pass_storage_rewrite.py +++ b/tests/python/unittest/test_pass_storage_rewrite.py @@ -218,7 +218,7 @@ def test_parallel_alloc(): def test_inplace_rule2(): #Test Buffer - scope_tb = "local_TB" + scope_tb = "local_TB2" @tvm.register_func("tvm.info.mem.%s" % scope_tb) def mem_info_inp_buffer(): return tvm.make.node("MemoryInfo", @@ -258,6 +258,90 @@ def verify(n): tvm.ir_pass.PostOrderVisit(stmt, verify) assert num_alloc[0] == 2 +def test_inplace_rule3(): + #Test Buffer + scope_tb = "local_TB3" + @tvm.register_func("tvm.info.mem.%s" % scope_tb) + def mem_info_inp_buffer(): + return tvm.make.node("MemoryInfo", + unit_bits= 16, + max_simd_bits=32, + max_num_bits=1024*1024*1024, + head_address=None) + m = 10 + B0 = tvm.placeholder((m,), name='B0') + B1 = tvm.placeholder((m,), name='B1') + B2 = tvm.placeholder((m,), name='B2') + B3 = tvm.placeholder((m,), name='B3') + B4 = tvm.placeholder((m,), name='B4') + B5 = tvm.placeholder((m,), name='B5') + + B6 = tvm.compute((m,), lambda i: B1[i] * B5[i], name='B6') + B7 = tvm.compute((m,), lambda i: B2[i] * B4[i], name='B7') + B8 = tvm.compute((m,), lambda i: B6[i] - B7[i], name='B8') + + B9 = tvm.compute((m,), lambda i: B2[i] * B3[i], name='B9') + B10 = tvm.compute((m,), lambda i: B0[i] * B5[i], name='B10') + B11 = tvm.compute((m,), lambda i: B9[i] - B10[i], name='B11') + + B12 = tvm.compute((m,), lambda i: B0[i] * B4[i], name='B12') + B13 = tvm.compute((m,), lambda i: B1[i] * B3[i], name='B13') + B14 = tvm.compute((m,), lambda i: B12[i] - B13[i], name='B14') + + B = tvm.compute((m,), lambda i: B8[i] * B11[i] + B14[i], name='B') + s = tvm.create_schedule(B.op) + + B1L = s.cache_read(B1, scope_tb, [B6, B13]) + B5L = s.cache_read(B5, scope_tb, [B6, B10]) + B2L = s.cache_read(B2, scope_tb, [B7, B9]) + B4L = s.cache_read(B4, scope_tb, [B7, B12]) + B3L = s.cache_read(B3, scope_tb, [B9, B13]) + B0L = s.cache_read(B0, scope_tb, [B10, B12]) + + B8L = s.cache_write(B8, scope_tb) + B11L = s.cache_write(B11, scope_tb) + B14L = s.cache_write(B14, scope_tb) + B6L = s.cache_write(B6, scope_tb) + B7L = s.cache_write(B7, scope_tb) + B9L = s.cache_write(B9, scope_tb) + B10L = s.cache_write(B10, scope_tb) + B12L = s.cache_write(B12, scope_tb) + B13L = s.cache_write(B13, scope_tb) + + s[B12].compute_inline() + s[B13].compute_inline() + s[B8].compute_inline() + s[B11].compute_inline() + s[B14].compute_inline() + s[B6].compute_inline() + s[B7].compute_inline() + s[B9].compute_inline() + s[B10].compute_inline() + + s = s.normalize() + bounds = tvm.schedule.InferBound(s) + assert isinstance(bounds, tvm.container.Map) + stmt = tvm.schedule.ScheduleOps(s, bounds) + + B0a = tvm.decl_buffer(B0.shape, B0.dtype, name='B0') + B1a = tvm.decl_buffer(B1.shape, B1.dtype, name='B1') + B2a = tvm.decl_buffer(B2.shape, B2.dtype, name='B2') + B3a = tvm.decl_buffer(B3.shape, B3.dtype, name='B3') + B4a = tvm.decl_buffer(B4.shape, B4.dtype, name='B4') + B5a = tvm.decl_buffer(B5.shape, B5.dtype, name='B5') + + Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') + stmt = tvm.ir_pass.StorageFlatten(stmt, {B0: B0a, B1: 
B1a, B2: B2a, B3: B2a, B4: B4a, B5: B5a, B: Bb}, 64) + stmt = tvm.ir_pass.CanonicalSimplify(stmt) + stmt = tvm.ir_pass.Simplify(stmt) + stmt = tvm.ir_pass.StorageRewrite(stmt) + # verify only have one allocations. + # verify inplace folding works + def verify(n): + if isinstance(n, tvm.stmt.Allocate): + assert n.extents[0].value == 70 + tvm.ir_pass.PostOrderVisit(stmt, verify) + if __name__ == "__main__": test_alloc_seq() test_alloc_different_dtypes() @@ -267,3 +351,4 @@ def verify(n): test_storage_combine() test_storage_share_gpu() test_inplace_rule2() + test_inplace_rule3() From 4252d4e661eb7c349be61b31d2c28bca2896f8a1 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 31 Jan 2018 10:20:57 -0800 Subject: [PATCH 122/948] [TOPI] Fix cpp library dependency on MAC (#852) --- Makefile | 29 +++++++++++-------------- python/tvm/_ffi/libinfo.py | 6 ++--- topi/include/topi/detail/extern.h | 4 ++-- topi/tests/python_cpp/test_topi_clip.py | 1 - 4 files changed, 17 insertions(+), 23 deletions(-) diff --git a/Makefile b/Makefile index 332d6ebe86f2..2361b505fbf3 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,7 @@ LLVM_CFLAGS= -fno-rtti -DDMLC_ENABLE_RTTI=0 -DDMLC_USE_FOPEN64=0 LDFLAGS = -pthread -lm -ldl INCLUDE_FLAGS = -Iinclude -I$(DLPACK_PATH)/include -I$(DMLC_CORE_PATH)/include -IHalideIR/src -Itopi/include CFLAGS = -std=c++11 -Wall -O2 $(INCLUDE_FLAGS) -fPIC +PKG_LDFLAGS = FRAMEWORKS = OBJCFLAGS = -fno-objc-arc EMCC_FLAGS= -std=c++11 -DDMLC_LOG_STACK_TRACE=0\ @@ -80,6 +81,10 @@ ALL_DEP = $(CC_OBJ) $(CONTRIB_OBJ) $(LIB_HALIDEIR) RUNTIME_DEP = $(RUNTIME_OBJ) TOPI_DEP = $(TOPI_OBJ) +ifeq ($(UNAME_S), Darwin) + PKG_LDFLAGS += -undefined dynamic_lookup +endif + # Dependency specific rules ifdef CUDA_PATH NVCC=$(CUDA_PATH)/bin/nvcc @@ -201,7 +206,10 @@ else JVM_PKG_PROFILE := $(JVM_PKG_PROFILE)-cpu endif -BUILD_TARGETS ?= lib/libtvm.$(SHARED_LIBRARY_SUFFIX) lib/libtvm_runtime.$(SHARED_LIBRARY_SUFFIX) lib/libtvm_topi.$(SHARED_LIBRARY_SUFFIX) +BUILD_TARGETS ?= lib/libtvm.$(SHARED_LIBRARY_SUFFIX) \ + lib/libtvm_runtime.$(SHARED_LIBRARY_SUFFIX) \ + lib/libtvm_topi.$(SHARED_LIBRARY_SUFFIX) + all: ${BUILD_TARGETS} runtime: lib/libtvm_runtime.$(SHARED_LIBRARY_SUFFIX) web: lib/libtvm_web_runtime.js lib/libtvm_web_runtime.bc @@ -235,29 +243,18 @@ build/src/%.o: topi/src/%.cc $(CXX) $(CFLAGS) -MM -MT build/src/$*.o $< >build/src/$*.d $(CXX) -c $(CFLAGS) -c $< -o $@ -lib/libtvm.dylib: $(ALL_DEP) $(RUNTIME_DEP) +lib/libtvm.${SHARED_LIBRARY_SUFFIX}: $(ALL_DEP) $(RUNTIME_DEP) @mkdir -p $(@D) $(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) -lib/libtvm_topi.dylib: lib/libtvm.so $(TOPI_DEP) - @mkdir -p $(@D) - $(CXX) $(CFLAGS) $(FRAMEWORKS) -L./lib -ltvm -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) - -lib/libtvm_runtime.dylib: $(RUNTIME_DEP) +lib/libtvm_topi.${SHARED_LIBRARY_SUFFIX}: $(TOPI_DEP) @mkdir -p $(@D) - $(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) + $(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) $(PKG_LDFLAGS) -lib/libtvm.so: $(ALL_DEP) $(RUNTIME_DEP) +lib/libtvm_runtime.${SHARED_LIBRARY_SUFFIX}: $(RUNTIME_DEP) @mkdir -p $(@D) $(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) -lib/libtvm_topi.so: lib/libtvm.so $(TOPI_DEP) - @mkdir -p $(@D) - $(CXX) $(CFLAGS) $(FRAMEWORKS) -L./lib -ltvm -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) - -lib/libtvm_runtime.so: $(RUNTIME_DEP) - @mkdir -p $(@D) - $(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) 
 lib/libtvm_web_runtime.bc:
 	@mkdir -p build/web
diff --git a/python/tvm/_ffi/libinfo.py b/python/tvm/_ffi/libinfo.py
index 46d5f536a94e..169756e43cd5 100644
--- a/python/tvm/_ffi/libinfo.py
+++ b/python/tvm/_ffi/libinfo.py
@@ -2,7 +2,7 @@
 from __future__ import absolute_import
 import sys
 import os
-import warnings
+
 
 def find_lib_path(name=None, search_path=None, optional=False):
     """Find dynamic library files.
@@ -91,9 +91,7 @@ def find_lib_path(name=None, search_path=None, optional=False):
                   str('\n'.join(lib_dll_path + runtime_dll_path)))
         if not optional:
             raise RuntimeError(message)
-        else:
-            warnings.warn(message)
-            return None
+        return None
 
     if use_runtime:
         sys.stderr.write("Loading runtime library %s... exec only\n" % lib_found[0])
diff --git a/topi/include/topi/detail/extern.h b/topi/include/topi/detail/extern.h
index 12cc65d72530..3c5e09edae01 100644
--- a/topi/include/topi/detail/extern.h
+++ b/topi/include/topi/detail/extern.h
@@ -110,7 +110,7 @@ Expr pack_buffer(Buffer buf) {
     buf->data,
     shape,
     strides,
-    make_const(Int(32), buf->shape.size()),
+    make_const(Int(32), static_cast(buf->shape.size())),
     make_const(buf->dtype, 0),
     buf->elem_offset
   };
@@ -125,7 +125,7 @@ Expr pack_buffer(Buffer buf) {
 * by the arguments to pass to the PackedFunc when called. The first element of the
 * array must be a constant string expression.
 *
- * \return An expression representing the invocation
+ * \return An expression representing the invocation
 */
 Expr call_packed(Array args) {
   return tvm::ir::Call::make(Int(32), tvm::ir::intrinsic::tvm_call_packed,
diff --git a/topi/tests/python_cpp/test_topi_clip.py b/topi/tests/python_cpp/test_topi_clip.py
index a059464375ac..fe00408642f5 100644
--- a/topi/tests/python_cpp/test_topi_clip.py
+++ b/topi/tests/python_cpp/test_topi_clip.py
@@ -4,7 +4,6 @@
 import topi
 from topi.util import get_const_tuple
 from tvm.contrib.pickle_memoize import memoize
-from util import make_vector
 
 
 def verify_clip(N, a_min, a_max, dtype):

From 9e675779974dc6985ce31bd66f171a8e5db5668c Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Wed, 31 Jan 2018 11:58:43 -0800
Subject: [PATCH 123/948] [RELEASE] Release note for 0.2 (#853)

---
 NEWS.md                    | 42 ++++++++++++++++++++++++++++++++++++--
 python/tvm/_ffi/libinfo.py |  2 +-
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 6bc97b163ab1..dd165b75c0e0 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -3,11 +3,49 @@ TVM Change Log
 This file records the changes in TVM library in reverse chronological order.
 
+## On-going version
 
-## On onging verison
+Refer to the Roadmap issue for complete list on on-going version features.
+If you check in something that is not reflected in Roadmap issue, please reply
+to that issue so it can get added.
+
+## 0.2
+
+This release comes with a complete set of TOPI support for NNVM compiler, which allows compilation of end to end workloads.
+We also make major improvements in supporting new backends: ROCm for AMDGPUs and ARM GPU.
+
+- Backend support
+  - Support LLVM mainline(4.0, 5.0, 6.0)
+  - Support ROCM stack for AMD GPUs
+  - More robust OpenCL support for ARM GPUs
+- Android RPC runtime
+- Multi-threading optimization for ARM
+  - multi-threaded depthwise
+  - multi-threaded conv2d
+- New schedule primitives
+  - storage_align for shared memory alignment
+  - double_buffer
+- UnrollLoop : more robust version of unroll loop, count maximum steps that can be unrolled.
+- Full set of TOPI operators
+  - Introduce tvm.target to specify target options for compilation better.
+  - broadcast/ reduction operators
+  - pooling and global pooling
+  - Generic target support for topi
+  - schedule with external libraries
+- End to end deep learning pipelines for CPU, GPU, ARM GPU
+- Tutorials
+  - How to load compiled module in any language runtime
+  - How to use java runtime
+- Contrib library: MIOpen, CuDNN
+- Ongoing items that contains functioning pieces
+  - WebGL backend
+  - C++ compiler support
+  - MPS DNN
+  - low bit support, introduced popcount
+
+
+## 0.1
 
-## 0.1rc
 - Language runtime
     - python
     - javascript
diff --git a/python/tvm/_ffi/libinfo.py b/python/tvm/_ffi/libinfo.py
index 169756e43cd5..b17d0041f54d 100644
--- a/python/tvm/_ffi/libinfo.py
+++ b/python/tvm/_ffi/libinfo.py
@@ -100,4 +100,4 @@ def find_lib_path(name=None, search_path=None, optional=False):
 
 
 # current version
-__version__ = "0.1.0"
+__version__ = "0.2.0"

From 208946e3b080fe6c7809a4451a026bbcd0e37f58 Mon Sep 17 00:00:00 2001
From: Thierry Moreau
Date: Wed, 31 Jan 2018 15:56:14 -0800
Subject: [PATCH 124/948] copy intrinsic now can include typecast (#855)

---
 src/pass/inject_copy_intrin.cc | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/pass/inject_copy_intrin.cc b/src/pass/inject_copy_intrin.cc
index a7151ed0aeb5..cafcddcb9dde 100644
--- a/src/pass/inject_copy_intrin.cc
+++ b/src/pass/inject_copy_intrin.cc
@@ -51,12 +51,17 @@ class CopyIntrinInjector : public IRMutator {
     const Store* store = body.as();
     if (store == nullptr) return false;
     const Select* select = store->value.as();
     const Cast* cast = store->value.as();
     const Load* load = store->value.as();
-
+    if (0 == loops.size()) {
+      is_single_point_copy = true;
+      CHECK(select == nullptr);
+    }
     // for now only support true condition matching
     if (select != nullptr) {
       load = select->true_value.as();
@@ -74,13 +78,19 @@
         arith::DetectLinearEquation(load->index, loop_vars);
     if (load_strides.size() == 0 || store_strides.size() == 0) return false;
     Array dst_shape;
-    for (const For* op : loops) {
-      dst_shape.push_back(op->extent);
+    auto loop_var_size = loop_vars.size();
+    if (is_single_point_copy) {
+      loop_var_size = 1;
+      dst_shape.push_back(make_const(Int(32), 1));
+    } else {
+      for (const For* op : loops) {
+        dst_shape.push_back(op->extent);
+      }
     }
     Array src_shape = dst_shape;
     Array pad_before, pad_after;
     Expr pad_value;
-    Expr src_elem_offset = load_strides[loop_vars.size()];
+    Expr src_elem_offset = load_strides[loop_var_size];
     if (select != nullptr) {
       Array clip_bound = arith::DetectClipBound(select->condition, loop_vars);
@@ -114,15 +124,15 @@
       src_elem_offset = Simplify(src_elem_offset);
     }
     CHECK_EQ(load_strides.size(), store_strides.size());
-    CHECK_EQ(load_strides.size(), loop_vars.size() + 1);
-    Array src_strides(load_strides.begin(), load_strides.begin() + loop_vars.size());
-    Array dst_strides(store_strides.begin(), store_strides.begin() + loop_vars.size());
+    CHECK_EQ(load_strides.size(), loop_var_size + 1);
+    Array src_strides(load_strides.begin(), load_strides.begin() + loop_var_size);
+    Array dst_strides(store_strides.begin(), store_strides.begin() + loop_var_size);
     Buffer dst = BufferNode::make(
         Var(store->buffer_var.node_),
         store->value.type(),
         dst_shape,
         dst_strides,
-        store_strides[loop_vars.size()],
+        store_strides[loop_var_size],
        store->buffer_var->name_hint,
        GetStorageScope(store->buffer_var.get()),
        0, 0);
diff --git a/tests/python/unittest/test_pass_inject_copy_intrin.py
b/tests/python/unittest/test_pass_inject_copy_intrin.py index 08477895b322..c6ed19d65b69 100644 --- a/tests/python/unittest/test_pass_inject_copy_intrin.py +++ b/tests/python/unittest/test_pass_inject_copy_intrin.py @@ -44,6 +44,25 @@ def cb(src, dst, pad_before, pad_after, pad_value): return tvm.make.Evaluate(0) stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb) +def test_single_point_test(): + A = tvm.placeholder((1,), name='A') + B = tvm.compute((1,), lambda i: + A[i], name='B') + s = tvm.create_schedule(B.op) + s[B].pragma(B.op.axis[0], "memcpy") + bounds = tvm.schedule.InferBound(s) + stmt = tvm.schedule.ScheduleOps(s, bounds) + Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') + Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') + stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64) + def cb(src, dst, pad_before, pad_after, pad_value): + assert tvm.ir_pass.Simplify(src.elem_offset).value == 0 + assert tvm.ir_pass.Simplify(dst.elem_offset).value == 0 + assert tvm.ir_pass.Simplify(src.strides[0]).value == 1 + assert tvm.ir_pass.Simplify(dst.strides[0]).value == 1 + return tvm.make.Evaluate(0) + stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb) + def assert_expr_equal(a, b): assert tvm.ir_pass.Simplify(a - b).value == 0 @@ -80,3 +99,4 @@ def cb(src, dst, pad_before, pad_after, pad_value): test_copy2d() test_copy_pad() test_copy_pad_split() + test_single_point_test() From 9f0f5c497c3ffc357be403ae40883c1a4af9443d Mon Sep 17 00:00:00 2001 From: alex-weaver Date: Mon, 5 Feb 2018 01:06:30 +0000 Subject: [PATCH 135/948] Run C++ TOPI tests in Jenkins build (#870) * Added +x permission to task_cpp_topi.sh. Added C++ topi tests to Jenkinsfile * Fixed test_topi_math.py * Minor style fix --- Jenkinsfile | 3 +++ tests/scripts/task_cpp_topi.sh | 0 topi/tests/python/test_topi_math.py | 4 ++-- 3 files changed, 5 insertions(+), 2 deletions(-) mode change 100644 => 100755 tests/scripts/task_cpp_topi.sh diff --git a/Jenkinsfile b/Jenkinsfile index d74a20716829..75d55787ad9a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -224,6 +224,9 @@ stage('Unit Test') { timeout(time: max_time, unit: 'MINUTES') { sh "${docker_run} cpu ./tests/scripts/task_cpp_unittest.sh" } + timeout(time: max_time, unit: 'MINUTES') { + sh "${docker_run} cpu ./tests/scripts/task_cpp_topi.sh" + } } } }, diff --git a/tests/scripts/task_cpp_topi.sh b/tests/scripts/task_cpp_topi.sh old mode 100644 new mode 100755 diff --git a/topi/tests/python/test_topi_math.py b/topi/tests/python/test_topi_math.py index b1b989179f08..2c31d0fea62c 100644 --- a/topi/tests/python/test_topi_math.py +++ b/topi/tests/python/test_topi_math.py @@ -21,7 +21,7 @@ def test_apply(func, name, f_numpy): B = func(A) assert tuple(B.shape) == tuple(A.shape) assert B.op.body[0].name == name - a_np = np.random.uniform(size=shape).astype(A.dtype) + a_np = np.random.uniform(low=1e-5, size=shape).astype(A.dtype) a_np = np.abs(a_np) b_np = f_numpy(a_np) @@ -37,7 +37,7 @@ def check_device(device): b = tvm.nd.array(np.zeros_like(b_np), ctx) foo = tvm.build(s, [A, B], device, name=name) foo(a, b) - np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'llvm']: check_device(device) From 9ef95bbc6f67b16ba5f368d8a99b5a9623da33a3 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 4 Feb 2018 22:40:59 -0800 Subject: [PATCH 136/948] Try fix cpp topi test (#871) * Try fix cpp topi test * move cpp test to another stage * update --- 
Jenkinsfile | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 75d55787ad9a..0506caac850d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -5,9 +5,9 @@ // tvm libraries tvm_runtime = "lib/libtvm_runtime.so, config.mk" -tvm_lib = "lib/libtvm.so, " + tvm_runtime +tvm_lib = "lib/libtvm.so " + tvm_runtime // LLVM upstream lib -tvm_multilib = "lib/libtvm_llvm40.so, lib/libtvm_llvm50.so, lib/libtvm_llvm60.so, " + tvm_runtime +tvm_multilib = "lib/libtvm_llvm40.so, lib/libtvm_llvm50.so, lib/libtvm_llvm60.so, lib/libtvm_topi.so, " + tvm_runtime // command to start a docker container docker_run = 'tests/ci_build/ci_build.sh' @@ -224,9 +224,6 @@ stage('Unit Test') { timeout(time: max_time, unit: 'MINUTES') { sh "${docker_run} cpu ./tests/scripts/task_cpp_unittest.sh" } - timeout(time: max_time, unit: 'MINUTES') { - sh "${docker_run} cpu ./tests/scripts/task_cpp_topi.sh" - } } } }, @@ -254,6 +251,7 @@ stage('Integration Test') { timeout(time: max_time, unit: 'MINUTES') { sh "${docker_run} gpu ./tests/scripts/task_python_integration.sh" sh "${docker_run} gpu ./tests/scripts/task_python_topi.sh" + sh "${docker_run} gpu ./tests/scripts/task_cpp_topi.sh" } } } From 229d94b2fddd242be7fad98c079f25ddbbe4c622 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 4 Feb 2018 23:06:46 -0800 Subject: [PATCH 137/948] [CODEGEN] Fix vector element access in metal (#872) --- apps/ios_rpc/tests/ios_rpc_test.py | 1 + python/tvm/contrib/xcode.py | 8 +++++++- python/tvm/target.py | 2 +- src/codegen/codegen_metal.cc | 14 ++++++++++++++ src/codegen/codegen_metal.h | 6 ++++++ 5 files changed, 29 insertions(+), 2 deletions(-) diff --git a/apps/ios_rpc/tests/ios_rpc_test.py b/apps/ios_rpc/tests/ios_rpc_test.py index a3df1d3a9043..80317a6e350d 100644 --- a/apps/ios_rpc/tests/ios_rpc_test.py +++ b/apps/ios_rpc/tests/ios_rpc_test.py @@ -59,6 +59,7 @@ def test_rpc_module(): # Start RPC test server that contains the compiled library. 
server = xcode.popen_test_rpc(proxy_host, proxy_port, key, destination=destination, + options=['-quiet'], libs=[path_dso1, path_dso2]) # connect to the proxy diff --git a/python/tvm/contrib/xcode.py b/python/tvm/contrib/xcode.py index 7f9b53c5b532..e05df7181544 100644 --- a/python/tvm/contrib/xcode.py +++ b/python/tvm/contrib/xcode.py @@ -201,5 +201,11 @@ def popen_test_rpc(host, if options: cmd += options cmd += ["test"] - proc = subprocess.Popen(cmd) + if "-quiet" in options: + with open(os.devnull, 'w') as devnull: + proc = subprocess.Popen(cmd, + stderr=subprocess.STDOUT, + stdout=devnull) + else: + proc = subprocess.Popen(cmd) return proc diff --git a/python/tvm/target.py b/python/tvm/target.py index 5196241cd7cf..a24eefcb5ba7 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -117,7 +117,7 @@ def __init__(self, self.keys += ("rocm", "gpu") self.max_num_threads = 256 elif target_name in ("metal", "vulkan"): - self.keys += ("gpu",) + self.keys += (target_name, "gpu",) self.max_num_threads = 256 elif target_name in ("opengl",): self.keys += ("opengl",) diff --git a/src/codegen/codegen_metal.cc b/src/codegen/codegen_metal.cc index 0df63aad49a2..fe8486668da0 100644 --- a/src/codegen/codegen_metal.cc +++ b/src/codegen/codegen_metal.cc @@ -186,6 +186,20 @@ void CodeGenMetal::PrintStorageSync(const Call* op) { } } +void CodeGenMetal::PrintVecElemLoad(const std::string& vec, + Type t, int i, + std::ostream& os) { // NOLINT(*) + os << vec << "[" << i << "]"; +} + +void CodeGenMetal::PrintVecElemStore(const std::string& vec, + Type t, int i, + const std::string& value) { + this->PrintIndent(); + stream << vec << "[" << i << "]" + << " = " << value << ";\n"; +} + void CodeGenMetal::PrintStorageScope( const std::string& scope, std::ostream& os) { // NOLINT(*) if (scope == "global") { diff --git a/src/codegen/codegen_metal.h b/src/codegen/codegen_metal.h index 80efef67332e..6f8bef64bbcf 100644 --- a/src/codegen/codegen_metal.h +++ b/src/codegen/codegen_metal.h @@ -25,6 +25,12 @@ class CodeGenMetal final : public CodeGenC { void PrintStorageSync(const Call* op) final; // NOLINT(*) void PrintType(Type t, std::ostream& os) final; // NOLINT(*) void BindThreadIndex(const IterVar& iv) final; // NOLINT(*) + // print load of single element + void PrintVecElemLoad( + const std::string& vec, Type t, int i, std::ostream& os) final; // NOLINT(*) + // print store of single element. 
+ void PrintVecElemStore( + const std::string& vec, Type t, int i, const std::string& value) final; // overload visitor void VisitExpr_(const Broadcast* op, std::ostream& os) final; // NOLINT(*) From 0d2a9e8bf5a06d3f57f3b450a2c83d8f23433131 Mon Sep 17 00:00:00 2001 From: alex-weaver Date: Mon, 5 Feb 2018 19:11:54 +0000 Subject: [PATCH 138/948] Fixed namespacing issues in schedules (#873) * Fixed namespacing issues in schedules * Fixed compile error --- topi/include/topi/cuda/dense.h | 4 +-- topi/include/topi/cuda/extern.h | 2 +- topi/include/topi/cuda/injective.h | 2 +- topi/include/topi/cuda/pooling.h | 14 ++++---- topi/include/topi/cuda/reduction.h | 4 +-- topi/include/topi/generic/default.h | 47 ++++++++++++++++++--------- topi/include/topi/generic/injective.h | 2 +- topi/include/topi/x86/bnn.h | 4 +-- topi/include/topi/x86/default.h | 36 ++++++++++++++++---- topi/include/topi/x86/injective.h | 2 +- topi/src/topi.cc | 12 +++++-- 11 files changed, 88 insertions(+), 41 deletions(-) diff --git a/topi/include/topi/cuda/dense.h b/topi/include/topi/cuda/dense.h index b012f108fd93..a07aafea6e30 100644 --- a/topi/include/topi/cuda/dense.h +++ b/topi/include/topi/cuda/dense.h @@ -86,7 +86,7 @@ inline Schedule schedule_dense(const Target &target, const Array& outs) auto dense_f = s.rfactor(dense, kf)[0]; Tensor out; - if (contains(s->outputs, dense->op)) { + if (detail::contains(s->outputs, dense->op)) { out = dense; } else { out = outs[0]->op.output(0); @@ -107,7 +107,7 @@ inline Schedule schedule_dense(const Target &target, const Array& outs) traverse = [&](const Operation& op) { // Inline all one-to-one-mapping operators except the last stage (output) if (is_broadcast(op->tag)) { - if (!contains(s->outputs, op)) { + if (!detail::contains(s->outputs, op)) { s[op].compute_inline(); } for (auto tensor : op->InputTensors()) { diff --git a/topi/include/topi/cuda/extern.h b/topi/include/topi/cuda/extern.h index b97c33943010..1c2f9a79ab00 100644 --- a/topi/include/topi/cuda/extern.h +++ b/topi/include/topi/cuda/extern.h @@ -27,7 +27,7 @@ namespace cuda { */ inline Schedule ScheduleOutputForExtern(Target target, Operation op, Schedule sch) { auto x = op.output(0); - auto fused = Fuse(sch[x], sch[x]->op.as()->axis); + auto fused = detail::Fuse(sch[x], sch[x]->op.as()->axis); auto num_thread = target.max_num_threads; IterVar bx, tx; sch[x].split(fused, num_thread, &bx, &tx); diff --git a/topi/include/topi/cuda/injective.h b/topi/include/topi/cuda/injective.h index c213002cc52f..e8e60fb6809e 100644 --- a/topi/include/topi/cuda/injective.h +++ b/topi/include/topi/cuda/injective.h @@ -24,7 +24,7 @@ namespace cuda { */ inline void ScheduleInjectiveOp(const Target &target, Operation op, Schedule s) { auto x = op.output(0); - auto fused = Fuse(s[x], s[x]->op.as()->axis); + auto fused = detail::Fuse(s[x], s[x]->op.as()->axis); auto num_thread = target.max_num_threads; IterVar bx, tx; s[x].split(fused, num_thread, &bx, &tx); diff --git a/topi/include/topi/cuda/pooling.h b/topi/include/topi/cuda/pooling.h index e8fb9546a241..d7536f315dba 100644 --- a/topi/include/topi/cuda/pooling.h +++ b/topi/include/topi/cuda/pooling.h @@ -37,19 +37,19 @@ inline Schedule schedule_pool(const Target &target, const Array& outs) { auto num_thread = target.max_num_threads; Tensor out; Tensor OL; - if (contains(s->outputs, pool->op)) { + if (detail::contains(s->outputs, pool->op)) { out = pool; OL = s.cache_write(pool, "local"); } else { out = outs[0]->op.output(0); s[pool].set_scope("local"); } - auto fused = Fuse(s[out], 
s[out]->op.as()->axis); + auto fused = detail::Fuse(s[out], s[out]->op.as()->axis); IterVar bx, tx; s[out].split(fused, num_thread, &bx, &tx); s[out].bind(bx, tvm::thread_axis(Range(), "blockIdx.x")); s[out].bind(tx, tvm::thread_axis(Range(), "threadIdx.x")); - if (contains(s->outputs, pool->op)) { + if (detail::contains(s->outputs, pool->op)) { s[OL].compute_at(s[out], tx); } else { s[pool].compute_at(s[out], tx); @@ -60,7 +60,7 @@ inline Schedule schedule_pool(const Target &target, const Array& outs) { traverse = [&](const Operation& op) { // Inline all one-to-one-mapping operators except the last stage (output) if (is_broadcast(op->tag)) { - if (!contains(s->outputs, op)) { + if (!detail::contains(s->outputs, op)) { s[op].compute_inline(); } for (auto tensor : op->InputTensors()) { @@ -105,7 +105,7 @@ inline Schedule schedule_global_pool(const Target &target, const Array& auto thread_y = tvm::thread_axis(Range(0, num_thread), "threadIdx.y"); Tensor out; Tensor OL; - if (contains(s->outputs, pool->op)) { + if (detail::contains(s->outputs, pool->op)) { out = pool; OL = s.cache_write(pool, "local"); } else { @@ -126,7 +126,7 @@ inline Schedule schedule_global_pool(const Target &target, const Array& s[out].bind(by, block_y); s[out].bind(bx, block_x); - if (contains(s->outputs, pool->op)) { + if (detail::contains(s->outputs, pool->op)) { s[OL].compute_at(s[out], tx); } else { s[pool].compute_at(s[out], tx); @@ -137,7 +137,7 @@ inline Schedule schedule_global_pool(const Target &target, const Array& traverse = [&](const Operation& op) { // Inline all one-to-one-mapping operators except the last stage (output) if (is_broadcast(op->tag)) { - if (!contains(s->outputs, op)) { + if (!detail::contains(s->outputs, op)) { s[op].compute_inline(); } for (auto tensor : op->InputTensors()) { diff --git a/topi/include/topi/cuda/reduction.h b/topi/include/topi/cuda/reduction.h index 554224a9036c..e7a41d9274fa 100644 --- a/topi/include/topi/cuda/reduction.h +++ b/topi/include/topi/cuda/reduction.h @@ -65,7 +65,7 @@ Schedule ScheduleReduce(const Target& target, thread_x = tvm::thread_axis(Range(0, num_thread), "threadIdx.x"); } - auto fused_reduce = Fuse(out_stage, out_stage->op.as()->reduce_axis); + auto fused_reduce = detail::Fuse(out_stage, out_stage->op.as()->reduce_axis); IterVar ko, ki; out_stage.split(fused_reduce, num_thread, &ko, &ki); @@ -87,7 +87,7 @@ Schedule ScheduleReduce(const Target& target, auto stage_real = sch[real_output]; if (!all_reduce) { // Fuse and split the axis - auto fused_outer = Fuse(stage_real, stage_real->op.as()->axis); + auto fused_outer = detail::Fuse(stage_real, stage_real->op.as()->axis); IterVar bx, outer_in; stage_real.split(fused_outer, num_thread, &bx, &outer_in); diff --git a/topi/include/topi/generic/default.h b/topi/include/topi/generic/default.h index 0deadb07e241..5e0615742a14 100644 --- a/topi/include/topi/generic/default.h +++ b/topi/include/topi/generic/default.h @@ -16,27 +16,42 @@ using namespace tvm; namespace generic { /*! - * \brief Create a generic default schedule for the given output tensors. - * - * \param target The target to generate a schedule for. - * \param outs The output tensors. - * \param auto_inline Whether to apply the auto inline step. - * - * \return A schedule for the given ops. - */ -inline Schedule default_schedule(const Target& target, Array outs, bool auto_inline) { +* \brief Create a generic default schedule for the given output tensors. +* +* \param target The target to generate a schedule for. +* \param outs The output tensors. 
+* +* \return A schedule for the given ops. +*/ +inline Schedule default_schedule(const Target& target, Array outs) { + Array out_ops; + for (auto t : outs) { + out_ops.push_back(t->op); + } + auto s = create_schedule(out_ops); + return s; +} + +/*! +* \brief Create a generic default schedule for the given output tensors, and apply +* auto inline +* +* \param target The target to generate a schedule for. +* \param outs The output tensors. +* +* \return A schedule for the given ops. +*/ +inline Schedule default_schedule_auto_inline(const Target& target, Array outs) { Array out_ops; for (auto t : outs) { out_ops.push_back(t->op); } auto s = create_schedule(out_ops); - if (auto_inline) { - auto x = outs[0]; - tvm::schedule::AutoInlineInjective(s); - auto axis = s[x]->op.as()->axis; - if (axis.size() > 0) { - Fuse(s[x], axis); - } + auto x = outs[0]; + tvm::schedule::AutoInlineInjective(s); + auto axis = s[x]->op.as()->axis; + if (axis.size() > 0) { + detail::Fuse(s[x], axis); } return s; } diff --git a/topi/include/topi/generic/injective.h b/topi/include/topi/generic/injective.h index 8c0a12d23b60..c1d63eac6af7 100644 --- a/topi/include/topi/generic/injective.h +++ b/topi/include/topi/generic/injective.h @@ -32,7 +32,7 @@ inline Schedule schedule_injective(const Target &target, const Array& ou auto s = create_schedule(out_ops); tvm::schedule::AutoInlineInjective(s); auto x = outs[0]; - Fuse(s[x], s[x]->op.as()->axis); + detail::Fuse(s[x], s[x]->op.as()->axis); return s; } diff --git a/topi/include/topi/x86/bnn.h b/topi/include/topi/x86/bnn.h index 8e50c5bac124..f379ada8a516 100644 --- a/topi/include/topi/x86/bnn.h +++ b/topi/include/topi/x86/bnn.h @@ -68,7 +68,7 @@ inline Schedule schedule_binary_dense(const Target &target, const Array& s[C].parallel(s[C]->op.as()->axis[0]); Tensor out; - if (contains(s->outputs, C->op)) { + if (detail::contains(s->outputs, C->op)) { out = C; } else { out = outs[0]->op.output(0); @@ -83,7 +83,7 @@ inline Schedule schedule_binary_dense(const Target &target, const Array& traverse = [&](const Operation& op) { // Inline all one-to-one-mapping operators except the last stage (output) if (is_broadcast(op->tag)) { - if (!contains(s->outputs, op)) { + if (!detail::contains(s->outputs, op)) { s[op].compute_inline(); } for (auto tensor : op->InputTensors()) { diff --git a/topi/include/topi/x86/default.h b/topi/include/topi/x86/default.h index 4a3f36e4ae66..5d71855a8c94 100644 --- a/topi/include/topi/x86/default.h +++ b/topi/include/topi/x86/default.h @@ -16,7 +16,7 @@ using namespace tvm; namespace x86 { /*! -* \brief Create a default x86 schedule for the given ops. +* \brief Helper to create a default x86 schedule for the given ops. * * \param target The target to generate a schedule for. * \param outs The output tensors. @@ -24,9 +24,9 @@ namespace x86 { * * \return A schedule for the given ops. 
*/ -inline Schedule default_schedule(const Target &target, - const Array& outs, - bool auto_inline) { +inline Schedule MakeDefaultSchedule(const Target &target, + const Array& outs, + bool auto_inline) { Array out_ops; for (auto t : outs) { out_ops.push_back(t->op); @@ -38,7 +38,7 @@ inline Schedule default_schedule(const Target &target, if (auto_inline) { tvm::schedule::AutoInlineInjective(s); if (axis.size() > 0) { - Fuse(s[x], axis); + detail::Fuse(s[x], axis); } return s; } @@ -46,7 +46,7 @@ inline Schedule default_schedule(const Target &target, if (axis.size() == 4) { auto n = axis[0]; auto c = axis[1]; - auto fused = Fuse(s[x], { n, c }); // for nhwc layout, fuse n and h + auto fused = detail::Fuse(s[x], { n, c }); // for nhwc layout, fuse n and h s[x].parallel(fused); } else { s[x].parallel(axis[0]); @@ -55,6 +55,30 @@ inline Schedule default_schedule(const Target &target, return s; } +/*! +* \brief Create a default x86 schedule for the given ops. +* +* \param target The target to generate a schedule for. +* \param outs The output tensors. +* +* \return A schedule for the given ops. +*/ +inline Schedule default_schedule(const Target &target, const Array& outs) { + return MakeDefaultSchedule(target, outs, false); +} + +/*! +* \brief Create a default x86 schedule for the given ops, with auto inline +* +* \param target The target to generate a schedule for. +* \param outs The output tensors. +* +* \return A schedule for the given ops. +*/ +inline Schedule default_schedule_auto_inline(const Target &target, const Array& outs) { + return MakeDefaultSchedule(target, outs, true); +} + } // namespace x86 } // namespace topi #endif // TOPI_X86_DEFAULT_H_ diff --git a/topi/include/topi/x86/injective.h b/topi/include/topi/x86/injective.h index 40045b4920ce..9e5a603af908 100644 --- a/topi/include/topi/x86/injective.h +++ b/topi/include/topi/x86/injective.h @@ -36,7 +36,7 @@ inline Schedule schedule_injective(const Target &target, const Array& ou if (axis.size() == 4) { auto n = axis[0]; auto c = axis[1]; - auto fused = Fuse(s[x], { n, c }); // for nhwc layout, fuse n and h + auto fused = detail::Fuse(s[x], { n, c }); // for nhwc layout, fuse n and h s[x].parallel(fused); } else { s[x].parallel(axis[0]); diff --git a/topi/src/topi.cc b/topi/src/topi.cc index 03dd004bd42c..970e982276f6 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -343,7 +343,11 @@ TVM_REGISTER_GLOBAL("topi.nn.log_softmax") /* Generic schedules */ TVM_REGISTER_GLOBAL("topi.generic.default_schedule") .set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = topi::generic::default_schedule(args[0], args[1], args[2]); + if (args[2]) { + *rv = topi::generic::default_schedule_auto_inline(args[0], args[1]); + } else { + *rv = topi::generic::default_schedule(args[0], args[1]); + } }); TVM_REGISTER_GLOBAL("topi.generic.schedule_extern") @@ -369,7 +373,11 @@ TVM_REGISTER_GLOBAL("topi.x86.schedule_binary_dense") TVM_REGISTER_GLOBAL("topi.x86.default_schedule") .set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = topi::x86::default_schedule(args[0], args[1], args[2]); + if (args[2]) { + *rv = topi::x86::default_schedule_auto_inline(args[0], args[1]); + } else { + *rv = topi::x86::default_schedule(args[0], args[1]); + } }); TVM_REGISTER_GLOBAL("topi.x86.schedule_injective") From 1b0f53cc927145ca845aeb31254ee5aa6a435882 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 7 Feb 2018 01:23:19 +0800 Subject: [PATCH 139/948] support to keep trivial loops with extent of 1 (#877) --- include/tvm/operation.h | 16 +++++++++++----- 
include/tvm/schedule_pass.h | 3 ++- src/api/api_schedule.cc | 9 ++++++++- src/codegen/build_module.cc | 2 +- src/op/compute_op.cc | 21 ++++++++++++--------- src/op/compute_op.h | 12 +++++++++--- src/op/cross_thread_reduction.cc | 5 +++-- src/op/extern_op.cc | 3 ++- src/op/op_util.cc | 7 ++++--- src/op/op_util.h | 4 +++- src/op/placeholder_op.cc | 3 ++- src/op/scan_op.cc | 5 +++-- src/op/tensorize.cc | 5 +++-- src/schedule/schedule_ops.cc | 13 +++++++------ 14 files changed, 70 insertions(+), 38 deletions(-) diff --git a/include/tvm/operation.h b/include/tvm/operation.h index d598df8d21b1..9b950c3d544f 100644 --- a/include/tvm/operation.h +++ b/include/tvm/operation.h @@ -117,11 +117,13 @@ class OperationNode : public FunctionBaseNode { * \brief Build the statement that provide the output tensors. * \param stage The schedule stage of the op. * \param dom_map The domain map of all iteration domains. + * \param del_trivial_loop Whether eliminate trivial loop with extent of 1 * \return A statement that add production and wraps consumer. */ virtual Stmt BuildProvide( const Stage& stage, - const std::unordered_map& dom_map) const = 0; + const std::unordered_map& dom_map, + bool del_trivial_loop) const = 0; static constexpr const char* _type_key = "Operation"; @@ -160,7 +162,8 @@ class PlaceholderOpNode : public OperationNode { const Stmt& body) const final; Stmt BuildProvide( const Stage& stage, - const std::unordered_map& dom_map) const final; + const std::unordered_map& dom_map, + bool del_trivial_loop) const final; void VisitAttrs(AttrVisitor* v) final { v->Visit("name", &name); @@ -211,7 +214,8 @@ class ComputeOpNode : public OperationNode { const Stmt& body) const final; Stmt BuildProvide( const Stage& stage, - const std::unordered_map& dom_map) const final; + const std::unordered_map& dom_map, + bool del_trivial_loop) const final; void VisitAttrs(AttrVisitor* v) final { v->Visit("name", &name); @@ -282,7 +286,8 @@ class ScanOpNode : public OperationNode { const Stmt& body) const final; Stmt BuildProvide( const Stage& stage, - const std::unordered_map& dom_map) const final; + const std::unordered_map& dom_map, + bool del_trivial_loop) const final; void VisitAttrs(AttrVisitor* v) final { v->Visit("name", &name); @@ -345,7 +350,8 @@ class ExternOpNode : public OperationNode { const Stmt& body) const final; Stmt BuildProvide( const Stage& stage, - const std::unordered_map& dom_map) const final; + const std::unordered_map& dom_map, + bool del_trivial_loop) const final; void VisitAttrs(AttrVisitor* v) final { v->Visit("name", &name); diff --git a/include/tvm/schedule_pass.h b/include/tvm/schedule_pass.h index 719448513fb8..011c7510ced9 100644 --- a/include/tvm/schedule_pass.h +++ b/include/tvm/schedule_pass.h @@ -29,9 +29,10 @@ Map InferBound(const Schedule& sch); * * \param s The schedule to be realized * \param dom_map The domain of each iter vars. + * \param del_trivial_loop Whether delete trivial loops with extent of 1 * \return the result Stmt */ -Stmt ScheduleOps(Schedule s, Map dom_map); +Stmt ScheduleOps(Schedule s, Map dom_map, bool del_trivial_loop); /*! * \brief To automatically inline the element-wise operations. 
diff --git a/src/api/api_schedule.cc b/src/api/api_schedule.cc index 0b5ef251503c..b1a6729ec662 100644 --- a/src/api/api_schedule.cc +++ b/src/api/api_schedule.cc @@ -24,6 +24,14 @@ TVM_REGISTER_API("schedule.AutoInlineInjective") AutoInlineInjective(args[0]); }); +TVM_REGISTER_API("schedule.ScheduleOps") +.set_body([](TVMArgs args, TVMRetValue* ret) { + if (args.size() == 2) + *ret = ScheduleOps(args[0], args[1], true); + else + *ret = ScheduleOps(args[0], args[1], args[2]); +}); + #define REGISTER_SCHEDULE_PASS1(PassName) \ TVM_REGISTER_API("schedule."#PassName) \ .set_body([](TVMArgs args, TVMRetValue *ret) { \ @@ -43,7 +51,6 @@ REGISTER_SCHEDULE_PASS2(PostDFSOrder); REGISTER_SCHEDULE_PASS1(CreateAttachPath); REGISTER_SCHEDULE_PASS1(ScanGetBody); REGISTER_SCHEDULE_PASS1(ScanFixPointAnalysis); -REGISTER_SCHEDULE_PASS2(ScheduleOps); } // namespace schedule } // namespace tvm diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index 53aef46a9751..2e8e5bb278eb 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -211,7 +211,7 @@ Stmt BuildStmt(Schedule sch, // Phase 0 auto bounds = schedule::InferBound(sch); - auto stmt = schedule::ScheduleOps(sch, bounds); + auto stmt = schedule::ScheduleOps(sch, bounds, true); stmt = ir::InjectPrefetch(stmt); // Phase 1 diff --git a/src/op/compute_op.cc b/src/op/compute_op.cc index 9e7db1deea45..8b8bfbfe602e 100644 --- a/src/op/compute_op.cc +++ b/src/op/compute_op.cc @@ -305,9 +305,10 @@ Stmt MakeProvide(const ComputeOpNode* op, Stmt MakeComputeStmt(const ComputeOpNode* self, const Stage& stage, - const std::unordered_map& dom_map) { + const std::unordered_map& dom_map, + bool del_trivial_loop) { // grab the nest structure - ComputeLoopNest n = ComputeLoopNest::make(self, stage, dom_map); + ComputeLoopNest n = ComputeLoopNest::make(self, stage, dom_map, del_trivial_loop); // Normal loop structure n.init_nest.emplace_back(op::MakeIfNest(n.init_predicates)); n.main_nest.emplace_back(op::MakeIfNest(n.main_predicates)); @@ -387,28 +388,30 @@ ComputeType DetectComputeType(const ComputeOpNode* self, // implement the provide utility. Stmt ComputeOpNode::BuildProvide( const Stage& stage, - const std::unordered_map& dom_map) const { + const std::unordered_map& dom_map, + bool del_trivial_loop) const { CHECK_EQ(stage->op.operator->(), this); ComputeType ctype = DetectComputeType(this, stage); if (ctype == ComputeType::kCrossThreadReduction) { // specially handle cross thread reduction. 
- return MakeCrossThreadReduction(this, stage, dom_map); + return MakeCrossThreadReduction(this, stage, dom_map, del_trivial_loop); } else if (ctype == ComputeType::kTensorize) { - return MakeTensorize(this, stage, dom_map); + return MakeTensorize(this, stage, dom_map, del_trivial_loop); } else { - return MakeComputeStmt(this, stage, dom_map); + return MakeComputeStmt(this, stage, dom_map, del_trivial_loop); } } ComputeLoopNest ComputeLoopNest::make( const ComputeOpNode* self, const Stage& stage, - const std::unordered_map& dom_map) { + const std::unordered_map& dom_map, + bool del_trivial_loop) { CHECK_EQ(stage->op.operator->(), self); ComputeLoopNest ret; // make main loop nest ret.main_nest = op::MakeLoopNest( - stage, dom_map, 0, false, std::unordered_set(), &ret.main_vmap); + stage, dom_map, 0, false, std::unordered_set(), &ret.main_vmap, del_trivial_loop); ret.main_predicates = schedule::MakeBoundCheck( stage, dom_map, ret.main_vmap, false, std::unordered_set()); @@ -450,7 +453,7 @@ ComputeLoopNest ComputeLoopNest::make( } ret.init_nest = op::MakeLoopNest( stage, dom_map, begin_loop, true, - skip_iter, &(ret.init_vmap)); + skip_iter, &(ret.init_vmap), del_trivial_loop); ret.init_predicates = schedule::MakeBoundCheck( stage, dom_map, ret.init_vmap, true, skip_iter); for (auto& e : ret.init_predicates) { diff --git a/src/op/compute_op.h b/src/op/compute_op.h index 95dc0f44d8d4..2164feee6988 100644 --- a/src/op/compute_op.h +++ b/src/op/compute_op.h @@ -37,12 +37,14 @@ struct ComputeLoopNest { * \param self The pointer to compute op. * \param stage The scxhedule stage. * \param dom_map The domain map. + * \param del_trivial_loop Whether eliminate trivial loops with extent of 1 * \return The constructed loop nest */ static ComputeLoopNest make( const ComputeOpNode* self, const Stage& stage, - const std::unordered_map& dom_map); + const std::unordered_map& dom_map, + bool del_trivial_loop); }; /*! @@ -50,23 +52,27 @@ struct ComputeLoopNest { * \param self The pointer to ComputeOpNode * \param stage The schedule stage. * \param dom_map The domain map. + * \param del_trivial_loop Wheter eliminate trivial loops with extent of 1 * \return The created statement. */ Stmt MakeCrossThreadReduction( const ComputeOpNode* self, const Stage& stage, - const std::unordered_map& dom_map); + const std::unordered_map& dom_map, + bool del_trivial_loop); /*! * \brief Build body of compute for tensorization. * \param self The pointer to ComputeOpNode * \param stage The schedule stage. * \param dom_map The domain map. + * \param del_trivial_loop Wheter eliminate trivial loops with extent of 1 * \return The created statement. 
*/ Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage, - const std::unordered_map& dom_map); + const std::unordered_map& dom_map, + bool del_trivial_loop); } // namespace tvm #endif // TVM_OP_COMPUTE_OP_H_ diff --git a/src/op/cross_thread_reduction.cc b/src/op/cross_thread_reduction.cc index 6eec3bd69d6a..e32b3dcd4407 100644 --- a/src/op/cross_thread_reduction.cc +++ b/src/op/cross_thread_reduction.cc @@ -13,14 +13,15 @@ using namespace ir; Stmt MakeCrossThreadReduction( const ComputeOpNode* self, const Stage& stage, - const std::unordered_map& dom_map) { + const std::unordered_map& dom_map, + bool del_trivial_loop) { Array args; for (IterVar iv : self->axis) { args.push_back(iv->var); } std::unordered_map value_map; auto nest = op::MakeLoopNest( - stage, dom_map, 0, false, std::unordered_set(), &value_map); + stage, dom_map, 0, false, std::unordered_set(), &value_map, del_trivial_loop); auto conds = schedule::MakeBoundCheck( stage, dom_map, value_map, false, std::unordered_set()); diff --git a/src/op/extern_op.cc b/src/op/extern_op.cc index e83f97b14652..df3a32d50fe7 100644 --- a/src/op/extern_op.cc +++ b/src/op/extern_op.cc @@ -128,7 +128,8 @@ Stmt ExternOpNode::BuildRealize( Stmt ExternOpNode::BuildProvide( const Stage& stage, - const std::unordered_map& dom_map) const { + const std::unordered_map& dom_map, + bool del_trivial_loop) const { CHECK_EQ(stage->op.operator->(), this); Stmt ret = AttrStmt::make(make_zero(Int(32)), attr::extern_scope, 0, this->body); auto f_push_bind = [&ret](Buffer buffer, Tensor tensor) { diff --git a/src/op/op_util.cc b/src/op/op_util.cc index 78e092ca844e..ef7af85bf079 100644 --- a/src/op/op_util.cc +++ b/src/op/op_util.cc @@ -23,7 +23,8 @@ MakeLoopNest(const Stage& stage, size_t begin_iter_pos, bool new_loop_var, const std::unordered_set& skip_iter, - std::unordered_map* p_value_map) { + std::unordered_map* p_value_map, + bool del_trivial_loop) { auto leaf_iter_vars = stage->leaf_iter_vars; Stmt no_op = Evaluate::make(0); // create the loop nest @@ -75,7 +76,7 @@ MakeLoopNest(const Stage& stage, AttrStmt::make(iv, ir::attr::pragma_scope, p, no_op)); } } - if (is_one(dom->extent)) { + if (del_trivial_loop && is_one(dom->extent)) { nest[i + 1].emplace_back( LetStmt::make(var, dom->min, no_op)); value_map[iv] = dom->min; @@ -130,7 +131,7 @@ MakeLoopNest(const Stage& stage, // annotate the extent of the IterVar nest[i + 1].emplace_back( AttrStmt::make(bind_iv, ir::attr::thread_extent, dom->extent, no_op)); - if (is_one(dom->extent)) { + if (del_trivial_loop && is_one(dom->extent)) { value_map[iv] = dom->min; } else { value_map[iv] = var; diff --git a/src/op/op_util.h b/src/op/op_util.h index 783fbb989422..9b8f7dc629bd 100644 --- a/src/op/op_util.h +++ b/src/op/op_util.h @@ -29,6 +29,7 @@ using ir::MergeNest; * \param new_loop_var Whether create new loop variable. * \param skip_iter Whether skip certain iteration. * \param p_value_map The result value of each IterVar. + * \param del_trivial_loop Whether eliminate trivial loops with extent of 1 */ std::vector > MakeLoopNest(const Stage& stage, @@ -36,7 +37,8 @@ MakeLoopNest(const Stage& stage, size_t begin_iter_pos, bool new_loop_var, const std::unordered_set& skip_iter, - std::unordered_map* p_value_map); + std::unordered_map* p_value_map, + bool del_trivial_loop); /*! * \brief Create a nest of if checking the predicates. 
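With the schedule.ScheduleOps registration above accepting an optional third argument, the new flag can also be exercised directly from Python. A minimal sketch, assuming the usual tvm.schedule bindings for these passes; the two-argument form preserves the old behaviour:

import tvm

A = tvm.placeholder((1, 32), name="A")
B = tvm.compute((1, 32), lambda i, j: A[i, j] * 2.0, name="B")
s = tvm.create_schedule(B.op).normalize()
bounds = tvm.schedule.InferBound(s)
stmt_del = tvm.schedule.ScheduleOps(s, bounds)          # extent-1 loop over i is elided
stmt_keep = tvm.schedule.ScheduleOps(s, bounds, False)  # extent-1 loop over i is kept
print(stmt_keep)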
diff --git a/src/op/placeholder_op.cc b/src/op/placeholder_op.cc index 4e9d1d094d74..27c1fa9c5001 100644 --- a/src/op/placeholder_op.cc +++ b/src/op/placeholder_op.cc @@ -78,7 +78,8 @@ Stmt PlaceholderOpNode::BuildRealize( Stmt PlaceholderOpNode::BuildProvide( const Stage& stage, - const std::unordered_map& dom_map) const { + const std::unordered_map& dom_map, + bool del_trivial_loop) const { return Stmt(); } } // namespace tvm diff --git a/src/op/scan_op.cc b/src/op/scan_op.cc index 94e3a4aa6586..5c61eae0f183 100644 --- a/src/op/scan_op.cc +++ b/src/op/scan_op.cc @@ -252,7 +252,8 @@ Stmt ScanOpNode::BuildRealize( Stmt ScanOpNode::BuildProvide( const Stage& stage, - const std::unordered_map& dom_map) const { + const std::unordered_map& dom_map, + bool del_trivial_loop) const { CHECK_EQ(stage->op.operator->(), this); Stmt provide = AttrStmt::make( stage->op, attr::scan_update_scope, this->scan_axis->var, @@ -270,7 +271,7 @@ Stmt ScanOpNode::BuildProvide( std::unordered_map vmap; std::unordered_set empty; auto nest = op::MakeLoopNest( - stage, dom_map, 0, false, empty, &vmap); + stage, dom_map, 0, false, empty, &vmap, del_trivial_loop); nest[begin_scan].push_back(init); nest.push_back( op::MakeIfNest( diff --git a/src/op/tensorize.cc b/src/op/tensorize.cc index 6fa5459829fc..1f03ec9c0ebb 100644 --- a/src/op/tensorize.cc +++ b/src/op/tensorize.cc @@ -369,14 +369,15 @@ Stmt TransformUpdate(const Stage& stage, Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage, - const std::unordered_map& dom_map) { + const std::unordered_map& dom_map, + bool del_trivial_loop) { std::unordered_map out_dom; std::unordered_map > in_region; size_t tloc = InferTensorizeRegion(self, stage, dom_map, &out_dom, &in_region); TensorIntrin intrin = stage->iter_var_attrs.at( stage->leaf_iter_vars[tloc])->tensor_intrin; CHECK(intrin.defined()); - ComputeLoopNest n = ComputeLoopNest::make(self, stage, dom_map); + ComputeLoopNest n = ComputeLoopNest::make(self, stage, dom_map, del_trivial_loop); VerifyTensorizeLoopNest(self, stage, n, tloc); VerifyTensorizeBody(self, stage, out_dom, in_region, intrin); // Start bind data. 
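For context on the ScanOpNode path above: a scan stage is declared from the Python frontend roughly as in the standard cumulative-sum example below (names are illustrative), and its BuildProvide now threads del_trivial_loop through to MakeLoopNest just like the other operators.

import tvm

m = tvm.var("m")
n = tvm.var("n")
X = tvm.placeholder((m, n), name="X")
s_state = tvm.placeholder((m, n), name="s_state")
s_init = tvm.compute((1, n), lambda _, i: X[0, i], name="s_init")
s_update = tvm.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i], name="s_update")
res = tvm.scan(s_init, s_update, s_state, inputs=[X], name="cumsum")
sch = tvm.create_schedule(res.op)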
diff --git a/src/schedule/schedule_ops.cc b/src/schedule/schedule_ops.cc index 875df556466a..e0dc3321b1fc 100644 --- a/src/schedule/schedule_ops.cc +++ b/src/schedule/schedule_ops.cc @@ -22,8 +22,9 @@ using namespace ir; Stmt MakePipeline(const Stage& s, const std::unordered_map& dom_map, - Stmt consumer) { - Stmt producer = s->op->BuildProvide(s, dom_map); + Stmt consumer, + bool del_trivial_loop) { + Stmt producer = s->op->BuildProvide(s, dom_map, del_trivial_loop); if (producer.defined()) { producer = ProducerConsumer::make(s->op, true, producer); } @@ -68,7 +69,7 @@ class InjectAttach : public IRMutator { found_attach = true; stmt = AttrStmt::make( op->node, op->attr_key, op->value, - MakePipeline(stage_, dom_map_, op->body)); + MakePipeline(stage_, dom_map_, op->body, true)); } } return stmt; @@ -107,7 +108,7 @@ class InjectScanStep : public IRMutator { found_attach = true; stmt = AttrStmt::make( op->node, op->attr_key, op->value, - MakePipeline(stage_, dom_map_, op->body)); + MakePipeline(stage_, dom_map_, op->body, true)); } } return stmt; @@ -324,7 +325,7 @@ class SchedulePostProc : public IRMutator { }; Stmt ScheduleOps( - Schedule sch, Map dom_map_) { + Schedule sch, Map dom_map_, bool del_trivial_loop) { Stmt body = Stmt(); std::unordered_map dom_map = as_unordered_map(dom_map_); // scan init and scan updates @@ -374,7 +375,7 @@ Stmt ScheduleOps( // do nothing } else if (attach_spec->attach_type == kGroupRoot) { CHECK(!s->group.defined()); - body = MakePipeline(s, dom_map, body); + body = MakePipeline(s, dom_map, body, del_trivial_loop); } else { CHECK_EQ(attach_spec->attach_type, kScope); CHECK(body.defined()); From b78338290e09f86ca4ab91ec4f580f64b669c90a Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 7 Feb 2018 09:51:03 +0800 Subject: [PATCH 140/948] [CONTRIB] add peak test (#878) * add peak test * fix error for lanes=16 * update doc * fix names * fix names --- python/tvm/contrib/peak.py | 325 ++++++++++++++++++++++++++++++++ python/tvm/exec/measure_peak.py | 32 ++++ 2 files changed, 357 insertions(+) create mode 100644 python/tvm/contrib/peak.py create mode 100644 python/tvm/exec/measure_peak.py diff --git a/python/tvm/contrib/peak.py b/python/tvm/contrib/peak.py new file mode 100644 index 000000000000..36bfaaee7662 --- /dev/null +++ b/python/tvm/contrib/peak.py @@ -0,0 +1,325 @@ +# pylint: disable=invalid-name +"""measure bandwidth and compute peak""" + +import logging + +import tvm +from tvm.contrib import rpc, util + +def _convert_to_remote(func, remote): + """ convert module function to remote rpc function""" + temp = util.tempdir() + path_dso = temp.relpath("tmp_func.tar") + func.export_library(path_dso) + + remote.upload(path_dso) + func = remote.load_module("tmp_func.tar") + return func + +def measure_bandwidth_sum(total_item, item_per_thread, stride, + base_type, bits, lanes, + target, target_host, remote, ctx, n_times): + """ measure memory bandwidth of gpu by product reduction for a given type + + The IR for measurement is + + for each thread + for i in 1..num_per_thread: + y[global_id] = y[global_id] * x[base + i * stride] + + Parameters + ---------- + total_item: int + number of elements in input array + item_per_thread: int + number of elements each thread accumulates + stride: int + stride in memory access + base_type: str + can be "int", "float" + bits: int + can be 16, 32 + lanes: int + lane of the vector type, can be 1, 2, 4, 8, 16 + target: :any:`tvm.target.Target` + the target and option of the compilation. 
+ target_host : str or :any:`tvm.target.Target` + host compilation target + ctx: TVMcontext + the context of array + remote: tvm.contrib.rpc.RPCSession + remote rpc session + n_times: int + number of runs for taking mean + + Returns + ------- + GBPS: float + gigabyte per second + """ + n, m = total_item, item_per_thread + n //= lanes + + base_type = str(base_type) + str(bits) + dtype = base_type if lanes == 1 else base_type + "x" + str(lanes) + + k = tvm.reduce_axis((0, m), name="k") + + x = tvm.placeholder((n,), dtype=dtype, name="x") + op = tvm.comm_reducer(lambda x, y: x*y, lambda t: tvm.const(1, dtype=t), name="sum") + y = tvm.compute((n // m,), + lambda i: op(x[i // stride * stride * m + i % stride + k * stride], axis=k)) + s = tvm.create_schedule(y.op) + + yo, yi = s[y].split(y.op.axis[0], target.max_num_threads) + s[y].bind(yo, tvm.thread_axis("blockIdx.x")) + s[y].bind(yi, tvm.thread_axis("threadIdx.x")) + s[y].unroll(k) + + try: + func = tvm.build(s, [x, y], target, target_host=target_host) + + x = tvm.nd.empty((n,), dtype=dtype, ctx=ctx) + y = tvm.nd.empty((n // m,), dtype=dtype, ctx=ctx) + + func = _convert_to_remote(func, remote) + time_f = func.time_evaluator(func.entry_name, ctx, number=n_times) + time = time_f(x, y).mean + except tvm._ffi.base.TVMError: + # build error (occur when device does not support half) + return -1 + + return 1.0 * (total_item * bits / 8) / 1e9 / time + +def measure_bandwidth_all_types(total_item, item_per_thread, n_times, + target, target_host, remote, ctx, verbose=True): + """ measure memory bandwidth for all types + + Parameters + ---------- + total_item: int + number of elements in input array + item_per_thread: int + number of elements each thread accmulates + n_times: int + number of runs for averaging + target: :any:`tvm.target.Target` + the target and option of the compilation. + target_host : str or :any:`tvm.target.Target` + host compilation target + remote: tvm.contrib.rpc.RPCSession + remote rpc session + ctx: TVMcontext + the context of array + verbose: bool + whether outputs immediate result + + Returns + ------- + result: list + a list of (type_name, GBPS) pairs + """ + max_threads = target.max_num_threads + + result = [] + for base_type in ["float"]: + for bits in [32]: + for lanes in [1, 2, 4, 8, 16]: + max_speed = -1e9 + # try different strides + for stride in [max_threads, total_item // (lanes * item_per_thread)]: + speed = measure_bandwidth_sum(total_item, item_per_thread, stride, + base_type, bits, lanes, target, + target_host, remote, ctx, n_times) + max_speed = max(max_speed, speed) + type_name = base_type + str(bits) + result.append(["%sx%d" % (type_name, lanes), max_speed]) + if verbose: + logging.info("\t%-10s %.2f GBPS", result[-1][0], result[-1][1]) + return result + +def measure_compute_mad(total_item, item_per_thread, base_type, bits, lanes, + target, target_host, remote, ctx, n_times): + """ measure peak compute speed by computing mad for a type + + The IR for measurement is + + for each thread + for i in 1..item_per_thread + x = mad(x, x, y) + y = mad(y, y, x) + + Parameters + ---------- + total_item: int + number of elements in input array + item_per_thread: int + number of operations each thread does + base_type: str + can be "int", "float" + bits: int + can be 16, 32 + lanes: int + lane of the vector type, can be 1, 2, 4, 8, 16 + target: :any:`tvm.target.Target` + the target and option of the compilation. 
+ target_host : str or :any:`tvm.target.Target` + host compilation target + remote: tvm.contrib.rpc.RPCSession + if it is not None, use remote rpc session + ctx: TVMcontext + the context of array + n_times: int + number of runs for taking mean + + Returns + ------- + GOPS: float + giga operation per second + """ + + n = total_item + + if bits >= 64 or lanes >= 16: + n //= 2 + + max_threads = target.max_num_threads + + base_type = str(base_type) + str(bits) + dtype = base_type if lanes == 1 else base_type + "x" + str(lanes) + + def extern(ins, outs): + # pylint: disable=unused-argument + """construct measurement function by building IR directly""" + ib = tvm.ir_builder.create() + + bx = tvm.thread_axis("blockIdx.x") + tx = tvm.thread_axis("threadIdx.x") + + ib.scope_attr(bx, "thread_extent", n // max_threads) + ib.scope_attr(tx, "thread_extent", max_threads) + + idx = bx.var * max_threads + tx.var + + a = ib.allocate(dtype, (1), name='a', scope='local') + b = ib.allocate(dtype, (1), name='b', scope='local') + + a[0] = outs[0].vload(idx, dtype) + b[0] = outs[0].vload(idx, dtype) + + if base_type.find('float') != -1: + mad_func = lambda x, y: (x * x + y) + else: + mad_func = lambda x, y: y * y + x + + for _ in range(item_per_thread // 4 // lanes): + a[0] = mad_func(a[0], b[0]) + b[0] = mad_func(b[0], a[0]) + + ib.emit(outs[0].vstore(idx, b[0])) + return ib.get() + + y = tvm.extern((n,), [], extern, name="y", dtype=dtype) + s = tvm.create_schedule(y.op) + + try: + func = tvm.build(s, [y], target, target_host=target_host) + func = _convert_to_remote(func, remote) + time_f = func.time_evaluator(func.entry_name, ctx, number=n_times) + y = tvm.nd.empty((n,), dtype=dtype, ctx=ctx) + time = time_f(y).mean + except tvm._ffi.base.TVMError: + # build error (occur when device does not support half) + return -1 + + return 1.0 * (n * item_per_thread) / 1e9 / time + +def measure_compute_all_types(total_item, item_per_thread, n_times, + target, target_host, remote, ctx, verbose=True): + """ measure peak flops for all types + + Parameters + ---------- + total_item: int + number of elements in input array + item_per_thread: int + number of elements each thread accmulates + n_times: int + number of runs for averaging + target: :any:`tvm.target.Target` + the target and option of the compilation. 
+ target_host : str or :any:`tvm.target.Target` + host compilation target + remote: tvm.contrib.rpc.RPCSession + remote rpc session + ctx: TVMcontext + the context of array + verbose: bool + whether outputs immediate result + + Returns + ------- + result: list + a list of (type_name, GFLOPS/GIOPS) pairs + """ + result = [] + for base_type in ["float", "int"]: + for bits in [16, 32, 64]: + for lanes in [1, 2, 4, 8, 16]: + if base_type == 'int' and bits != 32: # only measure int32 + continue + + max_speed = -1e9 + for per_thread in [item_per_thread//2, item_per_thread, item_per_thread*2]: + speed = measure_compute_mad(total_item, per_thread, + base_type, bits, lanes, target, + target_host, remote, ctx, n_times) + max_speed = max(max_speed, speed) + type_name = base_type + str(bits) + result.append(["%sx%d" % (type_name, lanes), max_speed]) + + unit = "GFLOPS" if base_type == "float" else "GIOPS" + + if verbose: + logging.info("\t%-10s %.2f %s", result[-1][0], result[-1][1], unit) + + return result + + +def measure_peak_all(target, target_host, host, port): + """measure memory bandwidth and peak compute for gpu devices + + Parameters + ---------- + target: str or :any:`tvm.target.Target` + target_host: str + host: str + port: int + """ + + target = tvm.target.create(target) + remote = rpc.connect(host, port) + n_times = 20 + + bandwidth_total_item = 1 << 25 + bandwidth_item_per_thread = 32 + + compute_total_item = 1 << 21 + compute_item_per_thread = 4096 + + if str(target).startswith("opencl"): + ctx = remote.cl() + elif str(target).startswith("cuda"): + ctx = remote.gpu() + elif str(target).startswith("metal"): + ctx = remote.metal() + else: + raise RuntimeError("Unsupported target") + + logging.info("========== measure memory bandwidth ==========") + measure_bandwidth_all_types(bandwidth_total_item, bandwidth_item_per_thread, + n_times, target, target_host, remote, ctx) + + logging.info("========== measure peak compute ==========") + measure_compute_all_types(compute_total_item, compute_item_per_thread, + n_times, target, target_host, remote, ctx) diff --git a/python/tvm/exec/measure_peak.py b/python/tvm/exec/measure_peak.py new file mode 100644 index 000000000000..b477ac23d3af --- /dev/null +++ b/python/tvm/exec/measure_peak.py @@ -0,0 +1,32 @@ +"""measure bandwidth and compute peak + +e.g. +python3 -m tvm.exec.measure_peak --target cuda --rpc-host 0.0.0.0 --rpc-port 9090 +python3 -m tvm.exec.measure_peak --target opencl --target-host "llvm -target=aarch64-linux-gnu" \ + --rpc-host $TVM_OPENCL_DEVICE_HOST --rpc-port 9090 +""" + +import argparse +import logging + +from ..contrib.peak import measure_peak_all + +def main(): + """Main funciton""" + parser = argparse.ArgumentParser() + parser.add_argument('--target', type=str, default="llvm", + help='The build target') + parser.add_argument('--target-host', type=str, default=None, + help='The host code compilation target') + parser.add_argument('--rpc-host', type=str, default="0.0.0.0", + help='the hostname of the server') + parser.add_argument('--rpc-port', type=int, default=9090, + help='The port of the PRC') + + args = parser.parse_args() + logging.basicConfig(level=logging.INFO) + + measure_peak_all(args.target, args.target_host, args.rpc_host, args.rpc_port) + +if __name__ == "__main__": + main() From 9c27998a256e14d99b2b7a2a101a7361899fad69 Mon Sep 17 00:00:00 2001 From: Zhixun Tan Date: Wed, 7 Feb 2018 15:57:00 -0500 Subject: [PATCH 141/948] [WIP] Add OpenGL topi. (#836) [TOPI][GL] OpenGL topi. 
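The peak-measurement module added above can also be driven programmatically rather than through python -m tvm.exec.measure_peak. A minimal sketch, assuming an RPC server is already listening on localhost:9090 and serving a CUDA-capable device; results are reported through logging.info:

import logging
from tvm.contrib.peak import measure_peak_all

logging.basicConfig(level=logging.INFO)  # the measurement helpers log their results
measure_peak_all("cuda", "llvm", "localhost", 9090)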
--- include/tvm/ir.h | 7 ++ include/tvm/schedule.h | 3 + python/tvm/target.py | 11 ++ src/codegen/codegen_opengl.cc | 56 +++++----- src/codegen/codegen_opengl.h | 3 + src/codegen/intrin_rule_opengl.cc | 32 ++++++ src/pass/storage_flatten.cc | 18 +++- src/runtime/opengl/opengl_device_api.cc | 2 +- src/schedule/schedule_lang.cc | 3 + src/schedule/schedule_ops.cc | 5 + tests/webgl/test_local_topi_conv2d_nchw.py | 82 +++++++++++++++ tests/webgl/test_local_topi_dense.py | 59 +++++++++++ tests/webgl/test_local_topi_pooling.py | 115 +++++++++++++++++++++ tests/webgl/test_local_topi_softmax.py | 79 ++++++++++++++ topi/python/topi/__init__.py | 1 + topi/python/topi/opengl/__init__.py | 9 ++ topi/python/topi/opengl/conv2d_nchw.py | 50 +++++++++ topi/python/topi/opengl/dense.py | 49 +++++++++ topi/python/topi/opengl/injective.py | 36 +++++++ topi/python/topi/opengl/pooling.py | 94 +++++++++++++++++ topi/python/topi/opengl/softmax.py | 29 ++++++ 21 files changed, 714 insertions(+), 29 deletions(-) create mode 100644 src/codegen/intrin_rule_opengl.cc create mode 100644 tests/webgl/test_local_topi_conv2d_nchw.py create mode 100644 tests/webgl/test_local_topi_dense.py create mode 100644 tests/webgl/test_local_topi_pooling.py create mode 100644 tests/webgl/test_local_topi_softmax.py create mode 100644 topi/python/topi/opengl/__init__.py create mode 100644 topi/python/topi/opengl/conv2d_nchw.py create mode 100644 topi/python/topi/opengl/dense.py create mode 100644 topi/python/topi/opengl/injective.py create mode 100644 topi/python/topi/opengl/pooling.py create mode 100644 topi/python/topi/opengl/softmax.py diff --git a/include/tvm/ir.h b/include/tvm/ir.h index 989802326ae4..f36d914e621f 100644 --- a/include/tvm/ir.h +++ b/include/tvm/ir.h @@ -226,6 +226,13 @@ constexpr const char* channel_write_advance = "channel_write_advance"; constexpr const char* pipeline_stage_scope = "pipeline_stage_scope"; /*! \brief pipeline execution scope, implies the scope can be pipelined. */ constexpr const char* pipeline_exec_scope = "pipeline_exec_scope"; +/*! + * \brief Mark that this stage is an OpenGL shader. Since OpenGL shader only + * allows writing out to one element of the output texture, the Provide node + * gets translated to a special Call::glsl_texture_store statement instead of a + * Store statement. + */ +constexpr const char* opengl_stage_scope = "opengl_stage_scope"; } // namespace attr /*! \brief namespace of TVM Intrinsic functions */ diff --git a/include/tvm/schedule.h b/include/tvm/schedule.h index 003555132789..51e27a9e94bf 100644 --- a/include/tvm/schedule.h +++ b/include/tvm/schedule.h @@ -427,6 +427,8 @@ class StageNode : public Node { std::string scope; /*! \brief Whether this is an output stage */ bool is_output{false}; + /*! \brief Whether this is an OpenGL stage */ + bool is_opengl{false}; /*! \brief Whether apply double buffer optimization to this stage */ bool double_buffer{false}; /*! @@ -450,6 +452,7 @@ class StageNode : public Node { v->Visit("attach_stage", &attach_stage); v->Visit("scope", &scope); v->Visit("is_output", &is_output); + v->Visit("is_opengl", &is_opengl); v->Visit("double_buffer", &double_buffer); v->Visit("group", &group); v->Visit("num_child_stages", &num_child_stages); diff --git a/python/tvm/target.py b/python/tvm/target.py index a24eefcb5ba7..5966326c71eb 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -280,6 +280,17 @@ def mali(options=None): return Target("opencl", opts) +def opengl(options=None): + """Returns a OpenGL target. 
+ + Parameters + ---------- + options : list of str + Additional options + """ + return Target("opengl", options) + + def create(target_str): """Get a target given target string. diff --git a/src/codegen/codegen_opengl.cc b/src/codegen/codegen_opengl.cc index 696082749a37..0e9e274b6bba 100644 --- a/src/codegen/codegen_opengl.cc +++ b/src/codegen/codegen_opengl.cc @@ -168,31 +168,9 @@ void CodeGenOpenGL::BindThreadIndex(const IterVar& iv) { this->stream << "}\n"; } -// GLSL texture store is special. We can only store to one output texture, and -// we must store to the index that matches the current "thread index". void CodeGenOpenGL::VisitStmt_(const Store* op) { - auto t = op->value.type(); - auto buffer = op->buffer_var.get(); - auto index = op->index; - - if (t.lanes() == 1) { - // Store to a scalar. - CHECK(inputs_.find(buffer) == inputs_.cend()) - << "Texture has been read from before. Must not store to it."; - if (output_ == nullptr) { - output_ = buffer; // Record that this texture is the output. - } else { - CHECK(output_ == buffer) << "GLSL can only write to 1 texture."; - } - - this->PrintIndent(); - this->stream << GetBufferRef(t, buffer, index) << " = " - << PrintExpr(op->value) << ";\n"; - - } else { - // Store to a vector. - LOG(FATAL) << "Vectorized store not implemented."; - } + LOG(FATAL) << "Store statement not supported in OpenGL." + << " Texture store should be a Call statement."; } // texelFetch(tex, ivec2(idx & kTextureRowMask, idx >> kTextureRowBits), 0).r @@ -215,8 +193,6 @@ std::string CodeGenOpenGL::GetBufferRef( if (buffer == this->output_) { // This is the output texture. - CHECK_EQ(index.get(), output_iter_var_) - << "GLSL must access corresponding elem of output texture."; return GetVarID(buffer); } else { // This is an input texture. @@ -265,5 +241,33 @@ void CodeGenOpenGL::VisitExpr_(const StringImm*, std::ostream& os) { LOG(FATAL) << "GLSL 3.0 doesn't support strings."; } +void CodeGenOpenGL::VisitStmt_(const Evaluate* op) { + auto call = op->value.as(); + if (call == nullptr || call->name != Call::glsl_texture_store) { + // Fallback to normal logic. + CodeGenC::VisitStmt_(op); + } + + CHECK_EQ(call->args.size(), 2); + auto buffer = call->args[0].as(); + auto value = call->args[1]; + + // Doesn't support store to vector. + auto type = value.type(); + CHECK_EQ(type.lanes(), 1) + << "Vectorized store not implemented, type = " << type; + + CHECK(inputs_.find(buffer) == inputs_.cend()) + << "Texture has been read from before. Must not store to it."; + if (output_ == nullptr) { + output_ = buffer; // Record that this texture is the output. + } else { + CHECK(output_ == buffer) << "GLSL can only write to 1 texture."; + } + + this->PrintIndent(); + this->stream << GetVarID(buffer) << " = " << PrintExpr(value) << ";\n"; +} + } // namespace codegen } // namespace tvm diff --git a/src/codegen/codegen_opengl.h b/src/codegen/codegen_opengl.h index 6ff1f7e9ac95..3cae1e323ec4 100644 --- a/src/codegen/codegen_opengl.h +++ b/src/codegen/codegen_opengl.h @@ -34,6 +34,9 @@ class CodeGenOpenGL final : public CodeGenC { void VisitExpr_(const FloatImm* op, std::ostream& os) final; // NOLINT(*) void VisitExpr_(const StringImm* op, std::ostream& os) final; // NOLINT(*) + // Match glsl_texture_store Call. 
+ void VisitStmt_(const Evaluate* op) final; // NOLINT(*) + private: const Variable* output_{nullptr}; std::unordered_set inputs_; diff --git a/src/codegen/intrin_rule_opengl.cc b/src/codegen/intrin_rule_opengl.cc new file mode 100644 index 000000000000..6ae2ee5d2b4e --- /dev/null +++ b/src/codegen/intrin_rule_opengl.cc @@ -0,0 +1,32 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file intrin_rule_opencl.cc + * \brief OpenCL intrinsic rules. + */ +#include "./intrin_rule.h" + +namespace tvm { +namespace codegen { +namespace intrin { + +TVM_REGISTER_GLOBAL("tvm.intrin.rule.opengl.exp") +.set_body(DispatchExtern); + +TVM_REGISTER_GLOBAL("tvm.intrin.rule.opengl.log") +.set_body(DispatchExtern); + +TVM_REGISTER_GLOBAL("tvm.intrin.rule.opengl.tanh") +.set_body(DispatchExtern); + +TVM_REGISTER_GLOBAL("tvm.intrin.rule.opengl.sqrt") +.set_body(DispatchExtern); + +TVM_REGISTER_GLOBAL("tvm.intrin.rule.opengl.pow") +.set_body(DispatchExtern); + +TVM_REGISTER_GLOBAL("tvm.intrin.rule.opengl.popcount") +.set_body(DispatchExtern); + +} // namespace intrin +} // namespace codegen +} // namespace tvm diff --git a/src/pass/storage_flatten.cc b/src/pass/storage_flatten.cc index bc380d473791..94332ff5cb7e 100644 --- a/src/pass/storage_flatten.cc +++ b/src/pass/storage_flatten.cc @@ -29,7 +29,8 @@ using intrinsic::tvm_address_of; class StorageFlattener : public IRMutator { public: - explicit StorageFlattener(Map extern_buffer, int cache_line_size) { + explicit StorageFlattener(Map extern_buffer, + int cache_line_size) { for (auto kv : extern_buffer) { BufferEntry e; e.buffer = kv.second; @@ -38,6 +39,7 @@ class StorageFlattener : public IRMutator { } cache_line_size_ = cache_line_size; } + Stmt Mutate_(const Store* op, const Stmt& s) final { Stmt stmt = IRMutator::Mutate_(op, s); op = stmt.as(); @@ -90,6 +92,8 @@ class StorageFlattener : public IRMutator { vinfo[dim].align_factor = tuple->args[1].as()->value; vinfo[dim].align_offset = tuple->args[2].as()->value; return this->Mutate(op->body); + } else if (op->attr_key == attr::opengl_stage_scope) { + is_opengl_ = true; } return IRMutator::Mutate_(op, s); } @@ -104,7 +108,15 @@ class StorageFlattener : public IRMutator { const BufferEntry& e = it->second; CHECK(!e.released) << "Read a buffer that is already out of scope"; - return e.buffer.vstore(e.RelIndex(op->args), op->value); + if (is_opengl_) { + return Evaluate::make(Call::make( + Type(), + Call::glsl_texture_store, + {e.buffer->data, op->value}, + Call::Intrinsic)); + } else { + return e.buffer.vstore(e.RelIndex(op->args), op->value); + } } Stmt Mutate_(const Realize* op, const Stmt& s) final { @@ -421,6 +433,8 @@ class StorageFlattener : public IRMutator { std::vector curr_thread_scope_; // The size of cacheline int cache_line_size_; + // The current stage is an OpenGL shader. 
+ bool is_opengl_{false}; }; Stmt StorageFlatten(Stmt stmt, diff --git a/src/runtime/opengl/opengl_device_api.cc b/src/runtime/opengl/opengl_device_api.cc index 5f629fcf1a1f..c47446388836 100644 --- a/src/runtime/opengl/opengl_device_api.cc +++ b/src/runtime/opengl/opengl_device_api.cc @@ -281,7 +281,7 @@ GLuint OpenGLWorkspace::CreateShader(GLenum shader_kind, if (err != GL_TRUE) { std::unique_ptr err_msg(new char[info_log_len + 1]); gl->GetShaderInfoLog(shader, info_log_len, nullptr, err_msg.get()); - LOG(FATAL) << err_msg.get(); + LOG(FATAL) << err_msg.get() << "\n" << shader_src; assert(false); } diff --git a/src/schedule/schedule_lang.cc b/src/schedule/schedule_lang.cc index 59bc3f242b03..0bdae67f60a6 100644 --- a/src/schedule/schedule_lang.cc +++ b/src/schedule/schedule_lang.cc @@ -433,6 +433,9 @@ Stage& Stage::opengl() { // Bind the only dimension to threadIdx.x. bind(fused, thread_axis(Range(nullptr), "threadIdx.x")); + // Mark this stage as OpenGL. + (*this)->is_opengl = true; + return *this; } diff --git a/src/schedule/schedule_ops.cc b/src/schedule/schedule_ops.cc index e0dc3321b1fc..1fbffb61fc7f 100644 --- a/src/schedule/schedule_ops.cc +++ b/src/schedule/schedule_ops.cc @@ -44,6 +44,11 @@ Stmt MakePipeline(const Stage& s, s->op, ir::attr::realize_scope, StringImm::make(s->scope), pipeline); + + if (s->is_opengl) { + pipeline = AttrStmt::make( + s->op, ir::attr::opengl_stage_scope, StringImm::make(""), pipeline); + } return pipeline; } diff --git a/tests/webgl/test_local_topi_conv2d_nchw.py b/tests/webgl/test_local_topi_conv2d_nchw.py new file mode 100644 index 000000000000..106534505694 --- /dev/null +++ b/tests/webgl/test_local_topi_conv2d_nchw.py @@ -0,0 +1,82 @@ +"""Example code to do convolution. +Copied from topi/tests/python/test_topi_conv2d_nchw.py. 
+Should be removed once we fix OpenGL testing on Jenkins.""" +import os +import numpy as np +import tvm +import topi +from tvm.contrib.pickle_memoize import memoize +from topi.util import get_const_tuple + +def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding): + in_height = in_width = in_size + + A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') + W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') + B = topi.nn.conv2d_nchw(A, W, stride, padding) + C = topi.nn.relu(B) + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + dtype = A.dtype + + @memoize("topi.tests.test_topi_conv2d.verify_con2d_nchw") + def get_ref_data(): + a_np = np.random.uniform(size=a_shape).astype(dtype) + w_np = np.random.uniform(size=w_shape).astype(dtype) + b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding) + c_np = np.maximum(b_np, 0) + return a_np, w_np, b_np, c_np + + a_np, w_np, b_np, c_np = get_ref_data() + + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s1 = topi.generic.schedule_conv2d_nchw([B]) + s2 = topi.generic.schedule_conv2d_nchw([C]) + a = tvm.nd.array(a_np, ctx) + w = tvm.nd.array(w_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + with tvm.build_config(auto_unroll_max_step=1400, + unroll_explicit=(device != "cuda")): + func1 = tvm.build(s1, [A, W, B], device, name="conv2d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding)) + func2 = tvm.build(s2, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding)) + func1(a, w, b) + func2(a, w, c) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + + for device in ['opengl']: + check_device(device) + + +def test_conv2d_nchw(): + # ResNet18 worklaods + verify_conv2d_nchw(1, 3, 224, 64, 7, 2, 3) + verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1) + verify_conv2d_nchw(1, 64, 56, 64, 1, 1, 0) + verify_conv2d_nchw(1, 64, 56, 128, 3, 2, 1) + verify_conv2d_nchw(1, 64, 56, 128, 1, 2, 0) + verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1) + verify_conv2d_nchw(1, 128, 28, 256, 3, 2, 1) + verify_conv2d_nchw(1, 128, 28, 256, 1, 2, 0) + verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1) + verify_conv2d_nchw(1, 256, 14, 512, 3, 2, 1) + verify_conv2d_nchw(1, 256, 14, 512, 1, 2, 0) + verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1) + # Vgg16 workloads + verify_conv2d_nchw(1, 128, 122, 128, 3, 1, 1) + # Super resolution workloads + verify_conv2d_nchw(1, 1, 224, 64, 5, 1, 2) + verify_conv2d_nchw(1, 64, 224, 64, 3, 1, 1) + verify_conv2d_nchw(1, 64, 224, 32, 3, 1, 1) + verify_conv2d_nchw(1, 32, 224, 9, 3, 1, 1) + +if __name__ == "__main__": + test_conv2d_nchw() diff --git a/tests/webgl/test_local_topi_dense.py b/tests/webgl/test_local_topi_dense.py new file mode 100644 index 000000000000..f2e7dfc1331c --- /dev/null +++ b/tests/webgl/test_local_topi_dense.py @@ -0,0 +1,59 @@ +"""Test code for dense operator +Copied from topi/tests/python/test_topi_dense.py. +Should be removed once we fix OpenGL testing on Jenkins. 
+""" +import numpy as np +import tvm +import topi +from topi.util import get_const_tuple +from tvm.contrib.pickle_memoize import memoize + + +def verify_dense(batch, in_dim, out_dim, use_bias=True): + A = tvm.placeholder((batch, in_dim), name='A') + B = tvm.placeholder((out_dim, in_dim), name='B') + C = tvm.placeholder((out_dim,), name='C') + D = topi.nn.dense(A, B, C if use_bias else None) + D = topi.nn.relu(D) + dtype = A.dtype + + # use memoize to pickle the test data for next time use + @memoize("topi.tests.test_topi_dense") + def get_ref_data(): + a_np = np.random.uniform(size=(batch, in_dim)).astype(dtype) + b_np = np.random.uniform(size=(out_dim, in_dim)).astype(dtype) + c_np = np.random.uniform(size=(out_dim,)).astype(dtype) + if use_bias: + d_np = np.maximum(np.dot(a_np, b_np.T) + c_np, 0.0) + else: + d_np = np.maximum(np.dot(a_np, b_np.T), 0.0) + return (a_np, b_np, c_np, d_np) + # get the test data + a_np, b_np, c_np, d_np = get_ref_data() + + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_dense(D) + ctx = tvm.context(device, 0) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(c_np, ctx) + d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx) + f = tvm.build(s, [A, B, C, D], device, name="dense") + f(a, b, c, d) + np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5) + + for device in ['opengl']: + check_device(device) + +def test_dense(): + verify_dense(1, 1024, 1000, use_bias=True) + verify_dense(1, 1024, 1000, use_bias=False) + + +if __name__ == "__main__": + test_dense() diff --git a/tests/webgl/test_local_topi_pooling.py b/tests/webgl/test_local_topi_pooling.py new file mode 100644 index 000000000000..813fcd227e2f --- /dev/null +++ b/tests/webgl/test_local_topi_pooling.py @@ -0,0 +1,115 @@ +"""Test code for pooling +Copied from topi/tests/python/test_topi_pooling.py. +Should be removed once we fix OpenGL testing on Jenkins. 
+""" +import numpy as np +import tvm +import topi +import math +from topi.util import get_const_tuple + +def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode): + iw = ih + kw = kh + sw = sh + ph, pw = padding + A = tvm.placeholder((n, ic, ih, iw), name='A') + B = topi.nn.pool(A, kernel=[kh, kw], stride=[sh, sw], padding=padding, + pool_type=pool_type, ceil_mode=ceil_mode) + B = topi.nn.relu(B) + dtype = A.dtype + + bshape = get_const_tuple(B.shape) + ashape = get_const_tuple(A.shape) + if ceil_mode: + assert bshape[2] == int(math.ceil(float(ashape[2] - kh + ph * 2) / sh) + 1) + assert bshape[3] == int(math.ceil(float(ashape[3] - kw + pw * 2) / sw) + 1) + else: + assert bshape[2] == int(math.floor(float(ashape[2] - kh + ph * 2) / sh) + 1) + assert bshape[3] == int(math.floor(float(ashape[3] - kw + pw * 2) / sw) + 1) + + + a_np = np.random.uniform(size=(n, ic, ih, iw)).astype(dtype) + pad_np = np.zeros(shape=(n, ic, ih+2*ph, iw+2*pw)).astype(dtype) + no_zero = (range(n), range(ic), (range(ph, ih+ph)), (range(pw, iw+pw))) + pad_np[np.ix_(*no_zero)] = a_np + _, oc, oh, ow = get_const_tuple(B.shape) + b_np = np.zeros(shape=(n, oc, oh, ow)).astype(dtype) + + if pool_type == 'avg': + for i in range(oh): + for j in range(ow): + b_np[:,:,i,j] = np.mean(pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw], axis=(2,3)) + elif pool_type =='max': + for i in range(oh): + for j in range(ow): + b_np[:,:,i,j] = np.max(pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw], axis=(2,3)) + b_np = np.maximum(b_np, 0.0) + + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_pool(B) + ctx = tvm.context(device, 0) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) + print(tvm.lower(s, [A, B], simple_mode=True)) + + f = tvm.build(s, [A, B], device) + f(a, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + + for device in ['opengl']: + check_device(device) + +def test_pool(): + verify_pool(1, 256, 32, 2, 2, [0, 0], 'avg', False) + verify_pool(1, 256, 31, 3, 3, [1, 2], 'avg', False) + verify_pool(1, 256, 32, 2, 2, [0, 0], 'max', False) + verify_pool(1, 256, 31, 3, 3, [2, 1], 'max', False) + verify_pool(1, 256, 31, 3, 3, [2, 1], 'max', True) + + + +def verify_global_pool(n, c, h, w, pool_type): + A = tvm.placeholder((n, c, h, w), name='A') + B = topi.nn.global_pool(A, pool_type=pool_type) + B = topi.nn.relu(B) + + a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) + if pool_type == 'avg': + b_np = np.mean(a_np, axis=(2,3), keepdims=True) + elif pool_type =='max': + b_np = np.max(a_np, axis=(2,3), keepdims=True) + b_np = np.maximum(b_np, 0.0) + + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_global_pool(B) + ctx = tvm.context(device, 0) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + f = tvm.build(s, [A, B], device) + f(a, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + + for device in ['opengl']: + check_device(device) + +def test_global_pool(): + verify_global_pool(1, 1024, 7, 7, 'avg') + verify_global_pool(4, 1024, 7, 7, 'avg') + verify_global_pool(1, 1024, 7, 7, 'max') + verify_global_pool(4, 1024, 7, 7, 'max') + + +if 
__name__ == "__main__": + test_pool() + test_global_pool() diff --git a/tests/webgl/test_local_topi_softmax.py b/tests/webgl/test_local_topi_softmax.py new file mode 100644 index 000000000000..34f8bfb8d8f5 --- /dev/null +++ b/tests/webgl/test_local_topi_softmax.py @@ -0,0 +1,79 @@ +"""Test code for softmax +Copied from topi/tests/python/test_topi_softmax.py. +Should be removed once we fix OpenGL testing on Jenkins. +""" + +import os +import numpy as np +import tvm +import topi +import logging +from topi.util import get_const_tuple + +def verify_softmax(m, n): + A = tvm.placeholder((m, n), name='A') + B = topi.nn.softmax(A) + # confirm lower works + s = tvm.create_schedule([B.op]) + tvm.lower(s, [A, B], simple_mode=True) + + a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) + b_np = topi.testing.softmax_python(a_np) + + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_softmax(B) + ctx = tvm.context(device, 0) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + foo = tvm.build(s, [A, B], device, name="softmax") + foo(a, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + + for device in ["opengl"]: + check_device(device) + +def test_softmax(): + verify_softmax(32, 10) + verify_softmax(3, 4) + + +def verify_log_softmax(m, n): + A = tvm.placeholder((m, n), name='A') + B = topi.nn.log_softmax(A) + # confirm lower works + s = tvm.create_schedule([B.op]) + tvm.lower(s, [A, B], simple_mode=True) + a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) + b_np = topi.testing.log_softmax_python(a_np) + + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_softmax(B) + ctx = tvm.context(device, 0) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + foo = tvm.build(s, [A, B], device, name="log_softmax") + foo(a, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + + for device in ["opengl"]: + check_device(device) + + +def test_log_softmax(): + verify_log_softmax(32, 10) + verify_log_softmax(3, 4) + +if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) + test_softmax() + test_log_softmax() diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py index d7875cdbe521..f7a922c5c379 100644 --- a/topi/python/topi/__init__.py +++ b/topi/python/topi/__init__.py @@ -21,6 +21,7 @@ from . import cuda from . import rasp from . import mali +from . import opengl from . import testing from . import util from . 
import rocm diff --git a/topi/python/topi/opengl/__init__.py b/topi/python/topi/opengl/__init__.py new file mode 100644 index 000000000000..c8f20b9825a7 --- /dev/null +++ b/topi/python/topi/opengl/__init__.py @@ -0,0 +1,9 @@ +# pylint: disable=redefined-builtin, wildcard-import +"""CUDA specific declaration and schedules.""" +from __future__ import absolute_import as _abs + +from .conv2d_nchw import schedule_conv2d_nchw +from .injective import schedule_injective, schedule_elemwise, schedule_broadcast +from .softmax import schedule_softmax +from .dense import schedule_dense +from .pooling import schedule_pool, schedule_global_pool diff --git a/topi/python/topi/opengl/conv2d_nchw.py b/topi/python/topi/opengl/conv2d_nchw.py new file mode 100644 index 000000000000..c633d8a21e6e --- /dev/null +++ b/topi/python/topi/opengl/conv2d_nchw.py @@ -0,0 +1,50 @@ +#pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements, too-many-arguments, too-many-branches, line-too-long +"""Schedule for conv2d_nchw with auto fusion""" +import tvm +from .. import tag +from .. import generic + +@generic.schedule_conv2d_nchw.register(["opengl"]) +def schedule_conv2d_nchw(outs): + """Schedule for conv2d_nchw. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of conv2d_nchw + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for conv2d_nchw. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + def _schedule(conv2d, data): + if conv2d.op in s.outputs: + Out = conv2d + else: + Out = outs[0].op.output(0) + s[conv2d].opengl() + s[Out].opengl() + s[data].opengl() + + def traverse(OP): + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(OP.tag): + if OP not in s.outputs: + s[OP].opengl() + for tensor in OP.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + # schedule conv2d_nchw + elif OP.tag.startswith('conv2d_nchw'): + conv2d = OP.output(0) + data = OP.input_tensors[0] + _schedule(conv2d, data) + else: + raise RuntimeError("Unsupported operator: %s" % OP.tag) + + traverse(outs[0].op) + return s diff --git a/topi/python/topi/opengl/dense.py b/topi/python/topi/opengl/dense.py new file mode 100644 index 000000000000..e7cf008ae240 --- /dev/null +++ b/topi/python/topi/opengl/dense.py @@ -0,0 +1,49 @@ +# pylint: disable=invalid-name, unused-variable +"""Schedule for dense operator""" +from __future__ import absolute_import as _abs +import tvm +from .. import tag +from .. import generic + +@generic.schedule_dense.register(["opengl"]) +def schedule_dense(outs): + """Schedule for dense operator. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of dense + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for dense. 
+ """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + def _schedule(Dense): + if Dense.op in s.outputs: + Out = Dense + else: + Out = outs[0].op.output(0) + s[Dense].opengl() + s[Out].opengl() + + def traverse(OP): + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(OP.tag): + if OP not in s.outputs: + s[OP].compute_inline() + for tensor in OP.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + # schedule dense + elif OP.tag == 'dense': + Dense = OP.output(0) + _schedule(Dense) + else: + raise RuntimeError("Unsupported operator: %s" % OP.tag) + + traverse(outs[0].op) + return s diff --git a/topi/python/topi/opengl/injective.py b/topi/python/topi/opengl/injective.py new file mode 100644 index 000000000000..9665e330864f --- /dev/null +++ b/topi/python/topi/opengl/injective.py @@ -0,0 +1,36 @@ +# pylint: disable=invalid-name, unused-variable, +"""Schedule for composition of injective operator""" +import tvm +from .. import generic + +def _schedule_injective(op, sch): + x = op.output(0) + sch[x].opengl() + return sch + + +@generic.schedule_injective.register(["opengl"]) +def schedule_injective(outs): + """Schedule for injective op. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of reduce in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + tvm.schedule.AutoInlineInjective(s) + for out in outs: + _schedule_injective(out.op, s) + return s + +schedule_elemwise = schedule_injective +schedule_broadcast = schedule_injective diff --git a/topi/python/topi/opengl/pooling.py b/topi/python/topi/opengl/pooling.py new file mode 100644 index 000000000000..5c26c56bb1ac --- /dev/null +++ b/topi/python/topi/opengl/pooling.py @@ -0,0 +1,94 @@ +# pylint: disable=invalid-name, unused-variable +"""Schedule for pooling operators""" +import tvm +from .. import tag +from .. import generic + +@generic.schedule_global_pool.register(["opengl"]) +def schedule_global_pool(outs): + """Schedule for global_pool. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of global_pool + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for global_pool. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + def _schedule(Pool): + if Pool.op in s.outputs: + Out = Pool + else: + Out = outs[0].op.output(0) + s[Pool].opengl() + s[Out].opengl() + + def traverse(OP): + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(OP.tag): + if OP not in s.outputs: + s[OP].opengl() + for tensor in OP.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + # schedule global_pool + elif OP.tag.startswith('global_pool'): + Pool = OP.output(0) + _schedule(Pool) + else: + raise RuntimeError("Unsupported operator: %s" % OP.tag) + + traverse(outs[0].op) + return s + + +@generic.schedule_pool.register(["opengl"]) +def schedule_pool(outs): + """Schedule for pool. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of pool + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for pool. 
+ """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + def _schedule(PaddedInput, Pool): + s[PaddedInput].opengl() + if Pool.op in s.outputs: + Out = Pool + else: + Out = outs[0].op.output(0) + s[Pool].opengl() + s[Out].opengl() + + def traverse(OP): + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(OP.tag): + if OP not in s.outputs: + s[OP].compute_inline() + for tensor in OP.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + # schedule pool + elif OP.tag.startswith('pool'): + PaddedInput = OP.input_tensors[0] + Pool = OP.output(0) + _schedule(PaddedInput, Pool) + else: + raise RuntimeError("Unsupported operator: %s" % OP.tag) + + traverse(outs[0].op) + return s diff --git a/topi/python/topi/opengl/softmax.py b/topi/python/topi/opengl/softmax.py new file mode 100644 index 000000000000..a5bf4371eb13 --- /dev/null +++ b/topi/python/topi/opengl/softmax.py @@ -0,0 +1,29 @@ +# pylint: disable=invalid-name, unused-variable, trailing-whitespace +"""Schedule for softmax operator""" +import tvm +from .. import generic + +@generic.schedule_softmax.register(["opengl"]) +def schedule_softmax(outs): + """Schedule for softmax op. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of reduce in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + softmax = outs[0] + max_elem = softmax.op.input_tensors[1] + expsum = softmax.op.input_tensors[2] + s[max_elem].opengl() + s[expsum].opengl() + s[softmax].opengl() + return s From 732e67abf2b744082fe0a11a3dc4ea659127fd84 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 7 Feb 2018 13:40:30 -0800 Subject: [PATCH 142/948] [PASS] Enable StorageRewrite before virtual thread lowering (#880) * [PASS] Enable StorageRewrite before virtual thread lowering * update * fix testcase --- src/pass/storage_rewrite.cc | 20 ++++++++++++-------- topi/tests/python_cpp/test_topi_reduce.py | 17 ++++++++++++++++- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc index 5a0ea067d433..fe4295feb982 100644 --- a/src/pass/storage_rewrite.cc +++ b/src/pass/storage_rewrite.cc @@ -154,6 +154,8 @@ class LinearAccessPatternFinder final : public IRVisitor { in_thread_env_ = false; } else if (op->attr_key == attr::extern_scope) { VisitNewScope(op); + } else if (op->attr_key == attr::virtual_thread) { + VisitNewScope(op); } else if (op->attr_key == attr::storage_scope) { const Variable* buf = op->node.as(); alloc_info_[buf].storage_scope = @@ -395,11 +397,10 @@ class StoragePlanRewriter : public IRMutator { } Stmt Mutate_(const AttrStmt* op, const Stmt& s) final { - CHECK(op->attr_key != attr::virtual_thread) - << "InjectVirtualThread before StoragePlan"; if (op->attr_key == attr::storage_scope) { return this->Mutate(op->body); } else if (op->attr_key == attr::thread_extent || + op->attr_key == attr::virtual_thread || op->attr_key == attr::pragma_scope) { // remake all the allocation at the attach scope. 
if (attach_map_.count(op)) { @@ -481,11 +482,13 @@ class StoragePlanRewriter : public IRMutator { Stmt body) { std::vector nest; for (StorageEntry* e : svec) { - nest.emplace_back(AttrStmt::make( - e->alloc_var, attr::storage_scope, - StringImm::make(e->scope.to_string()), - Evaluate::make(0))); - nest.push_back(e->new_alloc); + if (e->new_alloc.defined()) { + nest.emplace_back(AttrStmt::make( + e->alloc_var, attr::storage_scope, + StringImm::make(e->scope.to_string()), + Evaluate::make(0))); + nest.push_back(e->new_alloc); + } } return MergeNest(nest, body); } @@ -716,7 +719,8 @@ class StoragePlanRewriter : public IRMutator { if (s.stmt->is_type()) { const auto* op = static_cast(s.stmt); if (op->attr_key == attr::thread_extent || - op->attr_key == attr::pragma_scope) { + op->attr_key == attr::pragma_scope || + op->attr_key == attr::virtual_thread) { PlanNewScope(op); } else { CHECK(op->attr_key == attr::extern_scope); diff --git a/topi/tests/python_cpp/test_topi_reduce.py b/topi/tests/python_cpp/test_topi_reduce.py index adfe18ba4ef9..b4c630395f60 100644 --- a/topi/tests/python_cpp/test_topi_reduce.py +++ b/topi/tests/python_cpp/test_topi_reduce.py @@ -77,7 +77,22 @@ def check_device(device): out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype) for _ in range(1): foo(data_tvm, out_tvm) - np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3) + if type == "argmax" or type == "argmin": + out_tvm_indices = out_tvm.asnumpy() + if keepdims: + out_tvm_indices = np.take(out_tvm_indices, indices=0, axis=axis) + if axis is None: + out_tvm_val = in_npy_map.ravel()[out_tvm_indices] + else: + other_indices = tuple(np.indices(in_shape[0:axis] + in_shape[(axis+1):])) + sel_indices = other_indices[0:axis] + (out_tvm_indices,) + other_indices[axis:] + out_tvm_val = in_npy_map[sel_indices] + if type == "argmax": + np.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1E-3, 1E-3) + elif type == "argmin": + np.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3) + else: + np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3) for device in ["cuda", "opencl", "metal", "llvm", "rocm"]: check_device(device) From 90dd507e82f38ca5c5a63e6bdc2d101cf3a5519f Mon Sep 17 00:00:00 2001 From: Yizhi Liu Date: Thu, 8 Feb 2018 12:24:24 -0800 Subject: [PATCH 143/948] [TOPI] conv2d avx (#883) * conv2d schedules for Intel CPU (AVX2 & AVX512) * fix lint * remove override register --- CODEOWNERS | 2 +- CONTRIBUTORS.md | 2 +- topi/python/topi/x86/conv2d.py | 146 ++++++++++++++++---- topi/python/topi/x86/conv2d_avx_1x1.py | 135 ++++++++++++++++++ topi/python/topi/x86/conv2d_avx_common.py | 151 +++++++++++++++++++++ topi/tests/python/test_topi_conv2d_nchw.py | 4 +- 6 files changed, 413 insertions(+), 27 deletions(-) create mode 100644 topi/python/topi/x86/conv2d_avx_1x1.py create mode 100644 topi/python/topi/x86/conv2d_avx_common.py diff --git a/CODEOWNERS b/CODEOWNERS index 0e22cff91e0b..4af7a7e70c67 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -11,7 +11,7 @@ src/llvm/* @aatluri src/runtime/rocm/* @aatluri # JVM language -jvm/* @javelinjs +jvm/* @yzhliu # TOPI topi/python/topi/* @Laurawly @Huyuwei diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index a62c11e5fa4d..13b5ee1c5dd3 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -26,7 +26,7 @@ and are qualified to lead development and review changes of the owned module. 
- [Aditya Atluri](https://github.com/adityaatluri) ROCM - [Leyuan Wang](https://github.com/Laurawly) TOPI - [Yuwei Hu](https://github.com/Huyuwei) TOPI -- [Yizhi Liu](https://github.com/javelinjs) JVM package +- [Yizhi Liu](https://github.com/yzhliu) JVM package List of Contributors -------------------- diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index cb3571d6a91b..7457deab4c17 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -1,13 +1,90 @@ # pylint: disable=invalid-name,unused-variable,invalid-name """Conv2D schedule on x86""" import tvm -from .. import generic -from .. import tag +from .. import generic, tag +from .. import nn +from ..nn.util import infer_pad, infer_stride +from ..nn.conv2d import conv2d, _get_workload, _get_schedule, _WORKLOADS + +from . import conv2d_avx_1x1, conv2d_avx_common +from .conv2d_avx_common import AVXConvCommonFwd +from .conv2d_avx_1x1 import AVXConv1x1Fwd + +_AVX_SCH_TO_DECL_FUNC = { + AVXConvCommonFwd: conv2d_avx_common._declaration_conv, + AVXConv1x1Fwd: conv2d_avx_1x1._declaration_conv +} + +_AVX_SCH_TO_SCH_FUNC = { + AVXConvCommonFwd: conv2d_avx_common._schedule_conv, + AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv +} + +@_get_schedule.register("cpu") +def _get_schedule_conv(wkl): + if wkl not in _WORKLOADS: + raise ValueError("no schedule for such workload: {}".format(wkl)) + idx = _WORKLOADS.index(wkl) + + fp32_vec_len = 8 + target = tvm.target.current_target(allow_none=False) + for opt in target.options: + if opt == '-mcpu=skylake-avx512': + fp32_vec_len = 16 + + _SCHEDULES_AVX_NCHW = [ + # float32 resnet-18 + AVXConvCommonFwd(3, fp32_vec_len, 28, False), + AVXConvCommonFwd(16, fp32_vec_len, 28, False), + AVXConv1x1Fwd(16, fp32_vec_len, 1, 28), + AVXConvCommonFwd(16, fp32_vec_len, 28, False), + AVXConv1x1Fwd(16, fp32_vec_len, 1, 28), + AVXConvCommonFwd(16, fp32_vec_len, 28, False), + AVXConvCommonFwd(16, fp32_vec_len, 14, False), + AVXConv1x1Fwd(16, fp32_vec_len, 2, 14), + AVXConvCommonFwd(16, fp32_vec_len, 14, True), + AVXConvCommonFwd(16, 32, 7, True), + AVXConv1x1Fwd(16, fp32_vec_len, 1, 7), + AVXConvCommonFwd(16, fp32_vec_len, 7, True), + # float32 mobilenet + AVXConvCommonFwd(3, fp32_vec_len, 28, False), + AVXConv1x1Fwd(16, fp32_vec_len, 1, 28), + AVXConv1x1Fwd(16, fp32_vec_len, 1, 28), + AVXConv1x1Fwd(16, fp32_vec_len, 1, 28), + AVXConv1x1Fwd(16, fp32_vec_len, 1, 28), + AVXConv1x1Fwd(16, fp32_vec_len, 1, 28), + AVXConv1x1Fwd(16, fp32_vec_len, 2, 14), + AVXConv1x1Fwd(16, fp32_vec_len, 2, 14), + AVXConv1x1Fwd(16, fp32_vec_len, 1, 7), + AVXConv1x1Fwd(16, fp32_vec_len, 1, 7), + ] + + sch = _SCHEDULES_AVX_NCHW[idx] + return sch + + +@conv2d.register("cpu") +def _declaration_conv(data, kernel, stride, padding, layout, out_dtype): + target = tvm.target.current_target(allow_none=False) + if 'avx' in str(target) and layout == 'NCHW': + wkl = _get_workload(data, kernel, stride, padding, out_dtype) + sch = _get_schedule(wkl) + return _AVX_SCH_TO_DECL_FUNC[type(sch)](data, kernel, stride, padding, layout, out_dtype) + elif layout == 'NCHW': + return nn.conv2d_nchw(data, kernel, stride, padding, out_dtype) + elif layout == 'HWCN': + return nn.conv2d_hwcn(data, kernel, stride, padding, out_dtype) + elif layout == 'NHWC': + return nn.conv2d_nhwc(data, kernel, stride, padding, out_dtype) + else: + raise ValueError("not support this layout {} yet".format(layout)) + @generic.schedule_conv2d_nchw.register(["cpu"]) def schedule_conv2d(outs): """Create schedule for tensors""" s = 
tvm.create_schedule([x.op for x in outs]) + target = tvm.target.current_target(allow_none=False) def traverse(op): """Traverse operators from computation graph""" @@ -16,7 +93,7 @@ def traverse(op): if op not in s.outputs: s[op].compute_inline() else: # inject custom schedule - if len(op.axis) == 4: # schedule bias + bn + relu + if len(op.axis) == 4 and 'avx' not in str(target): # schedule bias + bn + relu n, c, h, w = op.axis fused = s[op].fuse(n, c) s[op].parallel(fused) @@ -26,27 +103,50 @@ def traverse(op): traverse(tensor.op) if 'conv2d_nchw' in op.tag: - conv = op.output(0) - kernel = op.input_tensors[1] - data = op.input_tensors[0] - data_pad = None - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: - data_pad = data - data = data_pad.op.input_tensors[0] + if 'avx' in str(target): + output = op.output(0) + conv_out = op.input_tensors[0] + kernel_vec = conv_out.op.input_tensors[1] + kernel = kernel_vec.op.input_tensors[0] + data_vec = conv_out.op.input_tensors[0] + data = data_vec.op.input_tensors[0] + data_pad = None + if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + data_pad = data + data = data_pad.op.input_tensors[0] - n_pad, c_pad, h_pad, w_pad = data_pad.op.axis - pad_fused = s[data_pad].fuse(n_pad, c_pad) - s[data_pad].parallel(pad_fused) - C = conv - n, c, h, w = C.op.axis - rc, ry, rx = C.op.reduce_axis - fused = s[C].fuse(n, c) - s[C].parallel(fused) - wo, wi = s[C].split(w, factor=16) - s[C].reorder(fused, rc, h, wo, ry, rx, wi) # move rc to outer loop - s[C].unroll(rx) - s[C].unroll(ry) - s[C].vectorize(wi) + padding = infer_pad(data, data_pad) + if data_pad is None: + stride = infer_stride(data, kernel, output) + else: + stride = infer_stride(data_pad, kernel, output) + + wkl = _get_workload(data, kernel, stride, padding, output.dtype) + sch = _get_schedule(wkl) + _AVX_SCH_TO_SCH_FUNC[type(sch)](s, data, data_pad, data_vec, + kernel, kernel_vec, conv_out, output, outs[0]) + else: + conv = op.output(0) + kernel = op.input_tensors[1] + data = op.input_tensors[0] + data_pad = None + if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + data_pad = data + data = data_pad.op.input_tensors[0] + + n_pad, c_pad, h_pad, w_pad = data_pad.op.axis + pad_fused = s[data_pad].fuse(n_pad, c_pad) + s[data_pad].parallel(pad_fused) + C = conv + n, c, h, w = C.op.axis + rc, ry, rx = C.op.reduce_axis + fused = s[C].fuse(n, c) + s[C].parallel(fused) + wo, wi = s[C].split(w, factor=16) + s[C].reorder(fused, rc, h, wo, ry, rx, wi) # move rc to outer loop + s[C].unroll(rx) + s[C].unroll(ry) + s[C].vectorize(wi) traverse(outs[0].op) return s diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py new file mode 100644 index 000000000000..cc264d04ac24 --- /dev/null +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -0,0 +1,135 @@ +# pylint: disable=invalid-name,unused-variable,invalid-name +"""1x1 Conv2D schedule on for Intel CPU""" +from __future__ import absolute_import as _abs +from collections import namedtuple +import tvm + +from ..util import get_const_tuple +from ..nn.conv2d import _get_schedule, _get_workload +from ..nn.util import infer_pad, infer_stride +from ..nn.pad import pad + +AVXConv1x1Fwd = namedtuple('AVXConv1x1Fwd', ['ic_bn', 'oc_bn', 'oh_factor', 'ow_factor']) + +def _declaration_conv(data, kernel, stride, padding, layout, out_dtype): + assert layout == 'NCHW', "only support NCHW convolution for AVX" + wkl = _get_workload(data, kernel, stride, padding, out_dtype) + sch = 
_get_schedule(wkl) + + HPAD, WPAD = wkl.hpad, wkl.wpad + HSTR, WSTR = wkl.hstride, wkl.wstride + + batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) + num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape) + + pad_height = in_height + 2 * HPAD + pad_width = in_width + 2 * WPAD + + out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1 + out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1 + + DOPAD = (HPAD != 0 and WPAD != 0) + if DOPAD: + data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") + else: + data_pad = data + shape = (batch_size, in_channel // sch.ic_bn, pad_height, pad_width, sch.ic_bn) + data_vec = tvm.compute(shape, lambda n, C, h, w, c: data_pad[n, C * sch.ic_bn + c, h, w]) + + shape = (num_filter // sch.oc_bn, in_channel // sch.ic_bn, sch.ic_bn, sch.oc_bn, 1, 1) + kernel_vec = tvm.compute(shape, lambda CO, CI, ci, co, h, w: + kernel[CO * sch.oc_bn + co, CI * sch.ic_bn + ci, h, w], + name='kernel_vec') + + oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width, sch.oc_bn) + ic = tvm.reduce_axis((0, in_channel), name='ic') + conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: + tvm.sum(data_vec[n, ic//sch.ic_bn, oh*HSTR, ow*WSTR, ic%sch.ic_bn] * + kernel_vec[oc_chunk, ic//sch.ic_bn, ic%sch.ic_bn, oc_block, 0, 0], + axis=[ic]), name='conv') + + oshape = (batch_size, num_filter, out_height, out_width) + unpack = tvm.compute(oshape, lambda n, oc, oh, ow: + conv[n, oc // sch.oc_bn, oh, ow, oc % sch.oc_bn], + tag='conv2d_nchw') + return unpack + + +def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, output, last): + # no stride and padding info here + padding = infer_pad(data, data_pad) + if data_pad is None: + stride = infer_stride(data, kernel, output) + else: + stride = infer_stride(data_pad, kernel, output) + + wkl = _get_workload(data, kernel, stride, padding, output.dtype) + sch = _get_schedule(wkl) + + HPAD, WPAD = wkl.hpad, wkl.wpad + DOPAD = (HPAD != 0 and WPAD != 0) + + A, W = data, kernel_vec + A0, A1 = data_pad, data_vec + # schedule data + if DOPAD: + s[A0].compute_inline() + batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis + parallel_axis = s[A1].fuse(ic_chunk, ih) + s[A1].parallel(parallel_axis) + s[A1].pragma(batch, "parallel_launch_point") + s[A1].pragma(parallel_axis, "parallel_stride_pattern") + s[A1].pragma(batch, "parallel_barrier_when_finish") + + # schedule kernel pack + oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis + s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) + if sch.oc_bn > 1: + s[W].vectorize(oc_block) + parallel_axis = s[W].fuse(oc_chunk, oh) + s[W].parallel(parallel_axis) + s[W].pragma(parallel_axis, "parallel_launch_point") + s[W].pragma(parallel_axis, "parallel_stride_pattern") + s[W].pragma(parallel_axis, "parallel_barrier_when_finish") + + C, O0, O = conv_out, output, last + CC = s.cache_write(C, 'global') + + batch, oc_chunk, oh, ow, oc_block = s[C].op.axis + oh_outer, oh_inner = s[C].split(oh, factor=sch.oh_factor) + s[C].vectorize(oc_block) + + s[CC].compute_at(s[C], oh_outer) + _, oc_chunk, oh, ow, oc_block = s[CC].op.axis + ic, = s[CC].op.reduce_axis + + ic_chunk, ic_block = s[CC].split(ic, factor=sch.ic_bn) + + oh_outer, oh_inner = s[CC].split(oh, factor=sch.oh_factor) + ow_outer, ow_inner = s[CC].split(ow, factor=sch.ow_factor) + + s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_chunk, ic_block, oh_inner, ow_inner, oc_block) + s[CC].vectorize(oc_block) + + s[CC].unroll(ow_inner) + 
s[CC].unroll(oh_inner) + + if O0 != O: + s[O0].compute_inline() + batch, oc, oh, ow = s[O].op.axis + + oc_chunk, oc_block = s[O].split(oc, factor=sch.oc_bn) + oh_outer, oh_inner = s[O].split(oh, factor=sch.oh_factor) + ow_outer, ow_inner = s[O].split(ow, factor=sch.ow_factor) + s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block) + + parallel_axis = s[O].fuse(oc_chunk, oh_outer) + s[C].compute_at(s[O], parallel_axis) + s[O].vectorize(oc_block) + + s[O].parallel(parallel_axis) + s[O].pragma(batch, "parallel_launch_point") + s[O].pragma(parallel_axis, "parallel_stride_pattern") + s[O].pragma(batch, "parallel_barrier_when_finish") + + return s diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py new file mode 100644 index 000000000000..4f5be019f45a --- /dev/null +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -0,0 +1,151 @@ +# pylint: disable=invalid-name,unused-variable,invalid-name +"""Conv2D schedule on for Intel CPU""" +from __future__ import absolute_import as _abs +from collections import namedtuple +import tvm + +from ..util import get_const_tuple +from ..nn.conv2d import _get_schedule, _get_workload +from ..nn.util import infer_pad, infer_stride +from ..nn.pad import pad + +AVXConvCommonFwd = namedtuple('AVXConvCommonFwd', ['ic_bn', 'oc_bn', 'reg_n', 'unroll_kw']) + +def _declaration_conv(data, kernel, stride, padding, layout, out_dtype): + assert layout == 'NCHW', "only support NCHW convolution for AVX" + wkl = _get_workload(data, kernel, stride, padding, out_dtype) + sch = _get_schedule(wkl) + + HPAD, WPAD = wkl.hpad, wkl.wpad + HSTR, WSTR = wkl.hstride, wkl.wstride + + batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) + num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape) + + pad_height = in_height + 2 * HPAD + pad_width = in_width + 2 * WPAD + + out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1 + out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1 + + # pack data + DOPAD = (HPAD != 0 and WPAD != 0) + if DOPAD: + data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") + else: + data_pad = data + + shape = (batch_size, in_channel // sch.ic_bn, pad_height, sch.ic_bn, pad_width) + data_vec = tvm.compute(shape, + lambda n, C, h, c, w: data_pad[n, C * sch.ic_bn + c, h, w], + name='data_vec') + + # pack kernel + shape = (num_filter//sch.oc_bn, in_channel//sch.ic_bn, + kernel_height, kernel_width, sch.ic_bn, sch.oc_bn) + kernel_vec = tvm.compute(shape, lambda CO, CI, h, w, ci, co: + kernel[CO * sch.oc_bn + co, CI * sch.ic_bn + ci, h, w], + name='kernel_vec') + + # convolution + oshape = (batch_size, num_filter//sch.oc_bn, out_height, out_width, sch.oc_bn) + unpack_shape = (batch_size, num_filter, out_height, out_width) + + ic = tvm.reduce_axis((0, in_channel), name='ic') + kh = tvm.reduce_axis((0, kernel_height), name='kh') + kw = tvm.reduce_axis((0, kernel_width), name='kw') + + conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: + tvm.sum(data_vec[n, ic//sch.ic_bn, oh*HSTR+kh, ic%sch.ic_bn, ow*WSTR+kw] * + kernel_vec[oc_chunk, ic//sch.ic_bn, kh, kw, ic%sch.ic_bn, oc_block], + axis=[ic, kh, kw]), + name='conv') + + unpack = tvm.compute(unpack_shape, + lambda n, c, h, w: conv[n, c // sch.oc_bn, h, w, c % sch.oc_bn], + name='output_unpack', + tag='conv2d_nchw') + return unpack + + +def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, output, last): + # no stride and padding info here + padding = infer_pad(data, 
data_pad) + if data_pad is None: + stride = infer_stride(data, kernel, output) + else: + stride = infer_stride(data_pad, kernel, output) + wkl = _get_workload(data, kernel, stride, padding, output.dtype) + sch = _get_schedule(wkl) + + HPAD, WPAD = wkl.hpad, wkl.wpad + DOPAD = (HPAD != 0 and WPAD != 0) + + A, W = data, kernel_vec + A0, A1 = data_pad, data_vec + + # schedule data + if DOPAD: + s[A0].compute_inline() + batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis + parallel_axis = s[A1].fuse(ic_chunk, ih) + s[A1].parallel(parallel_axis) + s[A1].pragma(batch, "parallel_launch_point") + s[A1].pragma(parallel_axis, "parallel_stride_pattern") + s[A1].pragma(batch, "parallel_barrier_when_finish") + + # schedule kernel pack + oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis + s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) + if sch.oc_bn > 1: + s[W].vectorize(oc_block) + parallel_axis = s[W].fuse(oc_chunk, oh) + s[W].parallel(parallel_axis) + s[W].pragma(parallel_axis, "parallel_launch_point") + s[W].pragma(parallel_axis, "parallel_stride_pattern") + s[W].pragma(parallel_axis, "parallel_barrier_when_finish") + + # schedule conv + C, O0, O = conv_out, output, last + CC = s.cache_write(C, 'global') + + _, oc_chunk, oh, ow, oc_block = s[C].op.axis + ow_chunk, ow_block = s[C].split(ow, factor=sch.reg_n) + s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + s[C].fuse(oc_chunk, oh) + s[C].vectorize(oc_block) + + s[CC].compute_at(s[C], ow_chunk) + _, oc_chunk, oh, ow, oc_block = s[CC].op.axis + ic, kh, kw = s[CC].op.reduce_axis + + ow_chunk, ow_block = s[CC].split(ow, factor=sch.reg_n) + ic_chunk, ic_block = s[CC].split(ic, factor=sch.ic_bn) + + if sch.unroll_kw: + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block) + s[CC].unroll(kw) + else: + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block) + + s[CC].fuse(oc_chunk, oh) + s[CC].vectorize(oc_block) + s[CC].unroll(ow_block) + + if O0 != O: + s[O0].compute_inline() + + batch, oc, oh, ow = s[O].op.axis + ow_chunk, ow_block = s[O].split(ow, factor=sch.reg_n) + oc_chunk, oc_block = s[O].split(oc, factor=sch.oc_bn) + s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[O].fuse(oc_chunk, oh) + s[C].compute_at(s[O], parallel_axis) + s[O].vectorize(oc_block) + + s[O].parallel(parallel_axis) + s[O].pragma(batch, "parallel_launch_point") + s[O].pragma(parallel_axis, "parallel_stride_pattern") + s[O].pragma(batch, "parallel_barrier_when_finish") + + return s diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py index 8b88bc3ba387..e5a674709009 100644 --- a/topi/tests/python/test_topi_conv2d_nchw.py +++ b/topi/tests/python/test_topi_conv2d_nchw.py @@ -11,8 +11,6 @@ def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, p A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') - B = topi.nn.conv2d_nchw(A, W, stride, padding) - C = topi.nn.relu(B) a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) @@ -35,6 +33,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): + B = topi.nn.conv2d(A, W, stride, padding, layout='NCHW') + C = topi.nn.relu(B) s1 = topi.generic.schedule_conv2d_nchw([B]) s2 = topi.generic.schedule_conv2d_nchw([C]) a = tvm.nd.array(a_np, ctx) From bb298af593aab90ade73834e982b394890edb68d Mon Sep 
17 00:00:00 2001 From: Tianqi Chen Date: Thu, 8 Feb 2018 17:26:32 -0800 Subject: [PATCH 144/948] [PASS] Prepare storage rewrite for unified buffer (#885) * [PASS] Prepare storage rewrite for unified buffer * more comments --- src/pass/storage_rewrite.cc | 39 ++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc index fe4295feb982..452b1a7b6abf 100644 --- a/src/pass/storage_rewrite.cc +++ b/src/pass/storage_rewrite.cc @@ -363,7 +363,7 @@ class StoragePlanRewriter : public IRMutator { Expr Mutate_(const Variable* op, const Expr& e) final { auto it = alloc_map_.find(op); if (it != alloc_map_.end()) { - if (it->second->elem_offset != 0) { + if (it->second->bits_offset != 0) { LOG(WARNING) << "Use a merged buffer variable address, could cause error"; } return it->second->alloc_var; @@ -381,11 +381,10 @@ class StoragePlanRewriter : public IRMutator { const StorageEntry* se = it->second; Expr offset = Mutate(op->args[2]); Expr extent = Mutate(op->args[3]); - CHECK_EQ(se->elem_type, dtype.element_of()) - << " buffer=" << buffer->name_hint; - CHECK_EQ(se->elem_offset % dtype.lanes(), 0); - if (se->elem_offset != 0) { - offset = make_const(offset.type(), se->elem_offset / dtype.lanes()) + offset; + uint64_t elem_bits = dtype.bits() * dtype.lanes(); + CHECK_EQ(se->bits_offset % elem_bits, 0U); + if (se->bits_offset != 0) { + offset = make_const(offset.type(), se->bits_offset / elem_bits) + offset; } return Call::make( op->type, op->name, @@ -465,8 +464,17 @@ class StoragePlanRewriter : public IRMutator { // The allocation element type. Type elem_type; // This is non-zero if this allocate is folded into another one - // the address becomes alloc_var + sizeof(elem_type) * elem_offset; - uint64_t elem_offset{0}; + // the address(in bits) becomes alloc_var + bits_offset; + // can be effectively converted to the element type. + // We need to convert bit_offset to offset of specific element type later. + // + // We use bits(instead of bytes) to support non-conventional indexing in hardware. + // When we are merging buffer together, the bits_offset are set to be aligned + // to certain value given by the max_simd_bits property of the special memory. + // + // This allows effective sharing among different types as long as their alignment + // requirement fits into the max_simd_bits. + uint64_t bits_offset{0}; }; // Alllocate entry of node. 
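  // Illustrative example (a sketch, not part of this patch): a bit offset is converted
  // back to an element offset by dividing by the element width in bits, as RemapIndex()
  // does in the next hunk. For instance, a child buffer merged at bits_offset = 1024 into
  // a float32x4 allocation has elem_bits = 32 * 4 = 128, so its accesses are remapped to
  // index + 1024 / 128, i.e. index + 8; the CHECK_EQ above guarantees the division is exact.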
@@ -495,8 +503,10 @@ class StoragePlanRewriter : public IRMutator { // Remap the index Expr RemapIndex(Type dtype, Expr index, StorageEntry* e) { CHECK_EQ(dtype.element_of(), e->elem_type); - if (e->elem_offset == 0) return index; - return make_const(index.type(), e->elem_offset) + index; + if (e->bits_offset == 0) return index; + uint64_t elem_bits = dtype.bits() * dtype.lanes(); + CHECK_EQ(e->bits_offset % elem_bits, 0U); + return make_const(index.type(), e->bits_offset / elem_bits) + index; } // Prepare the new allocations void PrepareNewAlloc() { @@ -526,7 +536,7 @@ class StoragePlanRewriter : public IRMutator { for (size_t i = 0; i < vec.size(); ++i) { StorageEntry* e = vec[i]; // already merged - if (e->elem_offset != 0) continue; + if (e->bits_offset != 0) continue; if (e->merged_children.size() != 0) { NewAllocTagMerged(e); continue; } @@ -580,10 +590,13 @@ class StoragePlanRewriter : public IRMutator { CHECK_NE(e->const_nbits, 0U); MemoryInfo info = GetMemoryInfo(e->scope.to_string()); uint64_t total_bits = e->const_nbits; - size_t align = 1; + // By default, align to 32 bits. + size_t align = 32; if (info.defined()) { align = info->max_simd_bits; } + // Always align to max_simd_bits + // so we can remap types by keeping this property if (total_bits % align != 0) { total_bits += align - (total_bits % align); } @@ -591,7 +604,7 @@ class StoragePlanRewriter : public IRMutator { for (StorageEntry* child : e->merged_children) { CHECK_NE(child->const_nbits, 0U); CHECK_NE(total_bits, 0U); - child->elem_offset = total_bits / child->elem_type.bits(); + child->bits_offset = total_bits; child->alloc_var = e->alloc_var; total_bits += child->const_nbits; if (total_bits % align != 0) { From 8c804c072246fcf18ac7c670831bc4b8ff12fcd3 Mon Sep 17 00:00:00 2001 From: masahi Date: Fri, 9 Feb 2018 11:03:23 +0900 Subject: [PATCH 145/948] allow fallback path to non imagenet workloads (#886) --- topi/python/topi/x86/conv2d.py | 94 ++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 44 deletions(-) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 7457deab4c17..1cbf2f2cadfd 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -66,8 +66,8 @@ def _get_schedule_conv(wkl): @conv2d.register("cpu") def _declaration_conv(data, kernel, stride, padding, layout, out_dtype): target = tvm.target.current_target(allow_none=False) - if 'avx' in str(target) and layout == 'NCHW': - wkl = _get_workload(data, kernel, stride, padding, out_dtype) + wkl = _get_workload(data, kernel, stride, padding, out_dtype) + if wkl in _WORKLOADS and 'avx' in str(target) and layout == 'NCHW': sch = _get_schedule(wkl) return _AVX_SCH_TO_DECL_FUNC[type(sch)](data, kernel, stride, padding, layout, out_dtype) elif layout == 'NCHW': @@ -86,6 +86,30 @@ def schedule_conv2d(outs): s = tvm.create_schedule([x.op for x in outs]) target = tvm.target.current_target(allow_none=False) + def default_schedule(op): + """NCHW conv2d schedule for non imagenet workloads""" + conv = op.output(0) + kernel = op.input_tensors[1] + data = op.input_tensors[0] + data_pad = None + if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + data_pad = data + data = data_pad.op.input_tensors[0] + + n_pad, c_pad, h_pad, w_pad = data_pad.op.axis + pad_fused = s[data_pad].fuse(n_pad, c_pad) + s[data_pad].parallel(pad_fused) + C = conv + n, c, h, w = C.op.axis + rc, ry, rx = C.op.reduce_axis + fused = s[C].fuse(n, c) + s[C].parallel(fused) + wo, wi = s[C].split(w, factor=16) + 
s[C].reorder(fused, rc, h, wo, ry, rx, wi) # move rc to outer loop + s[C].unroll(rx) + s[C].unroll(ry) + s[C].vectorize(wi) + def traverse(op): """Traverse operators from computation graph""" # inline all one-to-one-mapping operators except the last stage (output) @@ -104,49 +128,31 @@ def traverse(op): if 'conv2d_nchw' in op.tag: if 'avx' in str(target): - output = op.output(0) - conv_out = op.input_tensors[0] - kernel_vec = conv_out.op.input_tensors[1] - kernel = kernel_vec.op.input_tensors[0] - data_vec = conv_out.op.input_tensors[0] - data = data_vec.op.input_tensors[0] - data_pad = None - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: - data_pad = data - data = data_pad.op.input_tensors[0] - - padding = infer_pad(data, data_pad) - if data_pad is None: - stride = infer_stride(data, kernel, output) - else: - stride = infer_stride(data_pad, kernel, output) - - wkl = _get_workload(data, kernel, stride, padding, output.dtype) - sch = _get_schedule(wkl) - _AVX_SCH_TO_SCH_FUNC[type(sch)](s, data, data_pad, data_vec, - kernel, kernel_vec, conv_out, output, outs[0]) + try: + output = op.output(0) + conv_out = op.input_tensors[0] + kernel_vec = conv_out.op.input_tensors[1] + kernel = kernel_vec.op.input_tensors[0] + data_vec = conv_out.op.input_tensors[0] + data = data_vec.op.input_tensors[0] + data_pad = None + if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + data_pad = data + data = data_pad.op.input_tensors[0] + padding = infer_pad(data, data_pad) + if data_pad is None: + stride = infer_stride(data, kernel, output) + else: + stride = infer_stride(data_pad, kernel, output) + + wkl = _get_workload(data, kernel, stride, padding, output.dtype) + sch = _get_schedule(wkl) + _AVX_SCH_TO_SCH_FUNC[type(sch)](s, data, data_pad, data_vec, + kernel, kernel_vec, conv_out, output, outs[0]) + except IndexError: + default_schedule(op) else: - conv = op.output(0) - kernel = op.input_tensors[1] - data = op.input_tensors[0] - data_pad = None - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: - data_pad = data - data = data_pad.op.input_tensors[0] - - n_pad, c_pad, h_pad, w_pad = data_pad.op.axis - pad_fused = s[data_pad].fuse(n_pad, c_pad) - s[data_pad].parallel(pad_fused) - C = conv - n, c, h, w = C.op.axis - rc, ry, rx = C.op.reduce_axis - fused = s[C].fuse(n, c) - s[C].parallel(fused) - wo, wi = s[C].split(w, factor=16) - s[C].reorder(fused, rc, h, wo, ry, rx, wi) # move rc to outer loop - s[C].unroll(rx) - s[C].unroll(ry) - s[C].vectorize(wi) + default_schedule(op) traverse(outs[0].op) return s From 1b0d9f58019de07d0060cb80e33a74e7f8500c08 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 9 Feb 2018 14:33:25 -0800 Subject: [PATCH 146/948] Update jenkins (#890) --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 0506caac850d..b667359f0f2b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -5,7 +5,7 @@ // tvm libraries tvm_runtime = "lib/libtvm_runtime.so, config.mk" -tvm_lib = "lib/libtvm.so " + tvm_runtime +tvm_lib = "lib/libtvm.so, " + tvm_runtime // LLVM upstream lib tvm_multilib = "lib/libtvm_llvm40.so, lib/libtvm_llvm50.so, lib/libtvm_llvm60.so, lib/libtvm_topi.so, " + tvm_runtime @@ -217,7 +217,7 @@ stage('Unit Test') { } }, 'cpp': { - node('linux') { + node('CPU' && 'linux') { ws('workspace/tvm/ut-cpp') { init_git() unpack_lib('cpu', tvm_lib) From 469fba311bf33d9585c7015dc49458c865643446 Mon Sep 17 00:00:00 2001 From: eqy Date: Fri, 9 Feb 2018 14:47:51 -0800 Subject: 
[PATCH 147/948] ignore model option in target (#889) --- src/codegen/llvm/llvm_common.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/codegen/llvm/llvm_common.cc b/src/codegen/llvm/llvm_common.cc index f8b80118cb5d..03ac6f0ff227 100644 --- a/src/codegen/llvm/llvm_common.cc +++ b/src/codegen/llvm/llvm_common.cc @@ -85,7 +85,7 @@ void ParseLLVMTargetOptions(const std::string& target_str, } else { LOG(FATAL) << "invalid -mfloat-abi option " << value; } - } else if (key == "-device" || key == "-libs") { + } else if (key == "-device" || key == "-libs" || key == "-model") { // pass } else { LOG(FATAL) << "unknown option " << key; From 726c0ebb234206a6051c448abd82441dd4edb0ce Mon Sep 17 00:00:00 2001 From: masahi Date: Sat, 10 Feb 2018 11:14:34 +0900 Subject: [PATCH 148/948] [TOPI] Initial NHWC layout support (#882) * add 4 dim softmax * update for NHWC layout * remove layout param from softmax * fix typo * minor fix to pool support axis=1 ndims=5 softmax. add softmax axis * few fix for softmax * fix typo * add more doc * minor doc fix * fix upsampling output shape * fix lint * cleanup softmax * minor fix * raise exception instead of assert, handles negative axis * check axis after axis transformation --- topi/python/topi/nn/conv2d.py | 2 + topi/python/topi/nn/pooling.py | 118 +++++++++++++++++++++++++++++- topi/python/topi/nn/softmax.py | 47 +++++++++--- topi/python/topi/nn/upsampling.py | 53 +++++++++++++- 4 files changed, 206 insertions(+), 14 deletions(-) diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index 3bd910e29974..d6488b164d04 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -82,6 +82,8 @@ def conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'): return conv2d_nchw(data, kernel, stride, padding, out_dtype) elif layout == 'HWCN': return conv2d_hwcn(data, kernel, stride, padding, out_dtype) + elif layout == 'NHWC': + return conv2d_nhwc(data, kernel, stride, padding, out_dtype) else: raise ValueError("not support this layout {} yet".format(layout)) diff --git a/topi/python/topi/nn/pooling.py b/topi/python/topi/nn/pooling.py index 99b15e18e4e1..0519471eaa7e 100644 --- a/topi/python/topi/nn/pooling.py +++ b/topi/python/topi/nn/pooling.py @@ -44,9 +44,50 @@ def global_pool(data, pool_type): raise ValueError("Pool type should be 'avg' or 'max'.") -def pool(data, kernel, stride, padding, pool_type, ceil_mode=False): +def pool(data, kernel, stride, padding, pool_type, ceil_mode=False, layout="NCHW"): """Perform pooling on the data + Parameters + ---------- + data : tvm.Tensor + 4-D with shape [batch, channel, in_height, in_width] + or [batch, in_height, in_width, channel] + + kernel : list/tuple of two ints + Kernel size, [kernel_height, kernel_width] + + stride : list/tuple of two ints + Stride size, [stride_height, stride_width] + + paddding : list/tuple of two ints + Pad size, [pad_height, pad_width] + + pool_type : str + Pool type, 'max' or 'avg' + + ceil_mode : bool + Whether to use ceil when caculate output size. 
+
+    layout: string
+        either "NCHW" or "NHWC"
+
+    Returns
+    -------
+    output : tvm.Tensor
+        4-D with shape [batch, channel, out_height, out_width]
+        or [batch, out_height, out_width, channel]
+    """
+    if layout == "NCHW":
+        return pool_nchw(data, kernel, stride, padding, pool_type, ceil_mode=ceil_mode)
+    elif layout == "NHWC":
+        return pool_nhwc(data, kernel, stride, padding, pool_type, ceil_mode=ceil_mode)
+    else:
+        raise ValueError("not support this layout {} yet".format(layout))
+
+
+def pool_nchw(data, kernel, stride, padding, pool_type, ceil_mode=False):
+    """Perform pooling on the data in NCHW layout
+
     Parameters
     ----------
     data : tvm.Tensor
@@ -117,3 +158,78 @@ def pool(data, kernel, stride, padding, pool_type, ceil_mode=False):
                            tag=tag.ELEMWISE)
     else:
         raise ValueError("Pool type should be 'avg' or 'max'.")
+
+
+def pool_nhwc(data, kernel, stride, padding, pool_type, ceil_mode=False):
+    """Perform pooling on the data in NHWC layout
+
+    Parameters
+    ----------
+    data : tvm.Tensor
+        4-D with shape [batch, in_height, in_width, channel]
+
+    kernel : list/tuple of two ints
+        Kernel size, [kernel_height, kernel_width]
+
+    stride : list/tuple of two ints
+        Stride size, [stride_height, stride_width]
+
+    padding : list/tuple of two ints
+        Pad size, [pad_height, pad_width]
+
+    pool_type : str
+        Pool type, 'max' or 'avg'
+
+    ceil_mode : bool
+        Whether to use ceil when calculating the output size.
+
+    Returns
+    -------
+    output : tvm.Tensor
+        4-D with shape [batch, out_height, out_width, channel]
+    """
+    assert len(data.shape) == 4, "only support 4-dim pooling"
+    assert len(stride) == 2, "only support 2-dim stride"
+    kernel_height, kernel_width = kernel
+    stride_height, stride_width = stride
+    batch, height, width, channel = data.shape
+
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        padding, (kernel_height, kernel_width))
+
+    if ceil_mode:
+        # Additional padding to ensure we do ceil instead of floor when dividing by the stride.
+        pad_down += stride_height - 1
+        pad_right += stride_width - 1
+
+    pad_before = [0, pad_top, pad_left, 0]
+    pad_after = [0, pad_down, pad_right, 0]
+
+    out_height = util.simplify((height - kernel_height + pad_top + pad_down) // stride_height + 1)
+    out_width = util.simplify((width - kernel_width + pad_left + pad_right) // stride_width + 1)
+
+    dheight = tvm.reduce_axis((0, kernel_height))
+    dwidth = tvm.reduce_axis((0, kernel_width))
+
+    if pool_type == 'max':
+        temp = pad(data, pad_before, pad_after, name="pad_temp", \
+            pad_value=tvm.min_value(data.dtype))
+        return tvm.compute((batch, out_height, out_width, channel), \
+                           lambda n, h, w, c: \
+                           tvm.max(temp[n, h*stride_height+dheight, w*stride_width+dwidth, c], \
+                                   axis=[dheight, dwidth]), \
+                           tag="pool_max")
+    elif pool_type == 'avg':
+        temp = pad(data, pad_before, pad_after, name="pad_temp", \
+            pad_value=tvm.const(0.).astype(data.dtype))
+        tsum = tvm.compute((batch, out_height, out_width, channel, ), \
+                           lambda n, h, w, c: \
+                           tvm.sum(temp[n, h*stride_height+dheight, w*stride_width+dwidth, c], \
+                                   axis=[dheight, dwidth]), \
+                           tag="pool_avg")
+        return tvm.compute((batch, out_height, out_width, channel), \
+                           lambda n, h, w, c: \
+                           tsum[n, h, w, c] / (kernel_height*kernel_width), \
+                           tag=tag.ELEMWISE)
+    else:
+        raise ValueError("Pool type should be 'avg' or 'max'.")
diff --git a/topi/python/topi/nn/softmax.py b/topi/python/topi/nn/softmax.py
index 9060a31f532b..5e73f7633f4d 100644
--- a/topi/python/topi/nn/softmax.py
+++ b/topi/python/topi/nn/softmax.py
@@ -4,28 +4,51 @@
 import tvm
 
 @tvm.tag_scope(tag='softmax_output')
-def softmax(x):
+def softmax(x, axis=-1):
     """Perform softmax activation on the data
 
     Parameters
     ----------
     data : tvm.Tensor
-        2-D input data
+        can be any dimension
+
+    axis : int
+        channel axis
 
     Returns
     -------
     output : tvm.Tensor
-        2-D output with same shape
+        output shape is the same as input
     """
-    assert len(x.shape) == 2, "only support 2-dim softmax"
-    m, n = x.shape
-    k = tvm.reduce_axis((0, n), name='k')
-    max_elem = tvm.compute((m, ), lambda i: tvm.max(x[i, k], axis=k))
-    k = tvm.reduce_axis((0, n), name='k')
-    expsum = tvm.compute(
-        (m, ), lambda i: tvm.sum(tvm.exp(x[i, k] - max_elem[i]), axis=k))
-    return tvm.compute(
-        x.shape, lambda i, j: tvm.exp(x[i, j] - max_elem[i]) / expsum[i])
+    shape = x.shape
+    if axis < 0:
+        axis = len(shape) + axis
+    if axis >= len(shape):
+        raise ValueError("axis parameter should be less than input dim")
+
+    k1 = tvm.reduce_axis((0, shape[axis]), name='k')
+    k2 = tvm.reduce_axis((0, shape[axis]), name='k')
+
+    def insert_reduce_index(indices, reduce_index):
+        return indices[:axis] + (reduce_index,) + indices[axis:]
+
+    def _compute_max(*indices):
+        eval_range = insert_reduce_index(indices, k1)
+        return tvm.max(x[eval_range], axis=k1)
+
+    def _compute_expsum(max_elem, *indices):
+        eval_range = insert_reduce_index(indices, k2)
+        return tvm.sum(tvm.exp(x[eval_range] - max_elem[indices]), axis=k2)
+
+    def _normalize(max_elem, expsum, *indices):
+        non_reduce_indices = tuple([var for (i, var) in enumerate(indices) if i != axis])
+        return tvm.exp(x[indices] - max_elem[non_reduce_indices]) / expsum[non_reduce_indices]
+
+    reduced_shape = tuple([dim for (i, dim) in enumerate(shape) if i != axis])
+    max_elem = tvm.compute(reduced_shape, _compute_max)
+    expsum = tvm.compute(reduced_shape, lambda *indices: _compute_expsum(max_elem, *indices))
+    return tvm.compute(shape, lambda *indices: _normalize(max_elem, expsum, *indices))
+
 @tvm.tag_scope(tag='log_softmax_output')
 def log_softmax(x):
diff --git
a/topi/python/topi/nn/upsampling.py b/topi/python/topi/nn/upsampling.py index df77bbdb23c0..9297eb4ad06b 100644 --- a/topi/python/topi/nn/upsampling.py +++ b/topi/python/topi/nn/upsampling.py @@ -4,10 +4,40 @@ from .. import util -def upsampling(data, scale): +def upsampling(data, scale, layout="NCHW"): """Perform nearest neighbor upsampling on the data. Bilinear upsampling is not supported. + Parameters + ---------- + data : tvm.Tensor + 4-D with shape [batch, channel, in_height, in_width] + or [batch, in_height, in_width, channel] + + scale: int + upsampling scaling factor + + layout: string + either "NCHW" or "NHWC" + + Returns + ------- + output : tvm.Tensor + 4-D with shape [batch, channel, in_height*scale, in_width*scale] + or [batch, in_height*scale, in_width*scale, channel] + """ + + if layout == "NCHW": + return upsampling_nchw(data, scale) + elif layout == "NHWC": + return upsampling_nhwc(data, scale) + else: + raise ValueError("not support this layout {} yet".format(layout)) + + +def upsampling_nchw(data, scale): + """Perform nearest neighor upsampling on NCHW layout input. + Parameters ---------- data : tvm.Tensor @@ -27,3 +57,24 @@ def upsampling(data, scale): return tvm.compute((batch, channel, out_height, out_width), \ lambda n, c, h, w: data[n, c, h/scale, w/scale]) + + +def upsampling_nhwc(data, scale): + """Perform nearest neighor upsampling on NHWC layout input. + + Parameters + ---------- + data : tvm.Tensor + 4-D with shape [batch, in_height, in_width, channel] + + scale: int + upsampling scaling factor + + """ + + batch, height, width, channel = data.shape + out_height = util.simplify(height * scale) + out_width = util.simplify(width * scale) + + return tvm.compute((batch, out_height, out_width, channel), \ + lambda n, h, w, c: data[n, h/scale, w/scale, c]) From 902e58532233f36e9ca3ee3a7e96486e557ff341 Mon Sep 17 00:00:00 2001 From: alex-weaver Date: Sat, 10 Feb 2018 20:39:34 +0000 Subject: [PATCH 149/948] Convert BuildModule to use TVM node system (#879) * Make python BuildConfig serializable/deserializable to/from string * Make C++ BuildConfig serializable/deserializable to/from string * Revert "Make python BuildConfig serializable/deserializable to/from string" This reverts commit a5e1fb3ff63a161cc0d63475d2a32816cc4c3666. * Revert "Make C++ BuildConfig serializable/deserializable to/from string" This reverts commit ec0c2c54543050fe6f264d06eebff33dee70370b. * Converted BuildConfig to use TVM node system * Fix lint * Fix lint * Added code to set node attributes through the C API * Fixed bug in build_config() * Fix lint * Fix lint * Fix test errors * Reduced scope of node __setattr__ to apply only to BuildConfig * Fix lint * Fix lint * Changed python BuildConfig to be immutable, with values set once on construction. * Fix lint * Fix C++ test * Fixed BuildConfig setting python-side args * Fix lint * Removed dependency on reflection.cc to construct BuildConfig (allow use in runtime library) * Fix lint * Revert "Fix lint" This reverts commit 16ed6d7a1ca5e551b035bad46e8361ea487cd45b. * Revert "Removed dependency on reflection.cc to construct BuildConfig (allow use in runtime library)" This reverts commit 43817c97a2ee045791e0c031d962fa97636ce8f6. 
* Avoid accessing BuildConfig when using runtime lib * Fix missing import * Fix error running under cython (root cause: node handle is not valid until after __init__ has returned, so cannot call __dir__ during __init__ * Fix error where BuildConfig._node_defaults was not copied in build_config() * Fix lint * Fix lint * Fix lint * Fix lint * Add comments to python BuildConfig --- include/tvm/build_module.h | 28 ++++++++++++- python/tvm/build_module.py | 75 ++++++++++++++++++++++------------ src/api/api_lang.cc | 1 + src/codegen/build_module.cc | 37 +++++++++++++---- tests/cpp/build_module_test.cc | 2 +- 5 files changed, 106 insertions(+), 37 deletions(-) diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h index 391901730e57..d4186e8f8167 100644 --- a/include/tvm/build_module.h +++ b/include/tvm/build_module.h @@ -85,10 +85,13 @@ EXPORT Target stackvm(); } // namespace target +class BuildConfig; + /*! * \brief Container for build configuration options */ -struct BuildConfig { +class BuildConfigNode : public Node { + public: /*! * \brief The data alignment to use when constructing buffers. If this is set to * -1, then TVM's internal default will be used @@ -126,10 +129,31 @@ struct BuildConfig { /*! \brief Whether to partition const loop */ bool partition_const_loop = false; - BuildConfig() { + void VisitAttrs(AttrVisitor* v) final { + v->Visit("data_alignment", &data_alignment); + v->Visit("offset_factor", &offset_factor); + v->Visit("double_buffer_split_loop", &double_buffer_split_loop); + v->Visit("auto_unroll_max_step", &auto_unroll_max_step); + v->Visit("auto_unroll_max_depth", &auto_unroll_max_depth); + v->Visit("auto_unroll_max_extent", &auto_unroll_max_extent); + v->Visit("unroll_explicit", &unroll_explicit); + v->Visit("restricted_func", &restricted_func); + v->Visit("detect_global_barrier", &detect_global_barrier); + v->Visit("partition_const_loop", &partition_const_loop); } + + static constexpr const char* _type_key = "BuildConfig"; + TVM_DECLARE_NODE_TYPE_INFO(BuildConfigNode, Node); }; +TVM_DEFINE_NODE_REF(BuildConfig, BuildConfigNode); + +/*! +* \brief Construct a BuildConfig containing a new BuildConfigNode +* \return The new BuildConfig +*/ +EXPORT BuildConfig build_config(); + /*! * \brief Build a LoweredFunc given a schedule, args and binds * \param sch The schedule to lower. diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index 8b52b11d86e7..9c442a07425d 100755 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -6,7 +6,9 @@ from __future__ import absolute_import as _abs import warnings import types +import os +from ._ffi.node import NodeBase, register_node from . import api from . import tensor from . import schedule @@ -18,6 +20,7 @@ from . import codegen from . import ndarray from . import target as _target +from . import make class DumpIR(object): """Dump IR for each pass. @@ -95,16 +98,23 @@ def exit(self): BuildConfig.current.add_lower_pass = self._old_custom_pass DumpIR.scope_level -= 1 -class BuildConfig(object): +@register_node +class BuildConfig(NodeBase): """Configuration scope to set a build config option. - Parameters - ---------- - kwargs - Keyword arguments of configurations to set. + Note + ---- + This object is backed by node system in C++, with arguments that can be + exchanged between python and C++. + + Do not construct directly, use build_config instead. + + The fields that are backed by the C++ node are immutable once an instance + is constructed. See _node_defaults for the fields. 
""" + current = None - defaults = { + _node_defaults = { "auto_unroll_max_step": 0, "auto_unroll_max_depth": 8, "auto_unroll_max_extent": 0, @@ -114,30 +124,28 @@ class BuildConfig(object): "offset_factor": 0, "data_alignment": -1, "restricted_func": True, - "double_buffer_split_loop": 1, - "add_lower_pass": None, - "dump_pass_ir": False + "double_buffer_split_loop": 1 } - def __init__(self, **kwargs): + + # pylint: disable=no-member + def __init__(self, handle): + """Initialize the function with handle + + Parameters + ---------- + handle : SymbolHandle + the handle to the underlying C++ Symbol + """ + super(BuildConfig, self).__init__(handle) + self.handle = handle self._old_scope = None self._dump_ir = DumpIR() - for k, _ in kwargs.items(): - if k not in BuildConfig.defaults: - raise ValueError( - "invalid argument %s, candidates are %s" % (k, BuildConfig.defaults.keys())) - self._attr = kwargs - - def __getattr__(self, name): - if name not in self._attr: - return BuildConfig.defaults[name] - return self._attr[name] + self.dump_pass_ir = False + self.add_lower_pass = None def __enter__(self): # pylint: disable=protected-access self._old_scope = BuildConfig.current - attr = BuildConfig.current._attr.copy() - attr.update(self._attr) - self._attr = attr BuildConfig.current = self if self.dump_pass_ir is True: self._dump_ir.enter() @@ -149,8 +157,11 @@ def __exit__(self, ptype, value, trace): self._dump_ir.exit() BuildConfig.current = self._old_scope - -BuildConfig.current = BuildConfig() + def __setattr__(self, name, value): + if name in BuildConfig._node_defaults: + raise AttributeError( + "'%s' object cannot set attribute '%s'" % (str(type(self)), name)) + return super(BuildConfig, self).__setattr__(name, value) def build_config(**kwargs): """Configure the build behavior by setting config variables. @@ -206,8 +217,18 @@ def build_config(**kwargs): config: BuildConfig The build configuration """ - return BuildConfig(**kwargs) - + node_args = {k: v if k not in kwargs else kwargs[k] + for k, v in BuildConfig._node_defaults.items()} + config = make.node("BuildConfig", **node_args) + + for k in kwargs: + if not k in node_args: + setattr(config, k, kwargs[k]) + return config + +if not os.environ.get("TVM_USE_RUNTIME_LIB", False): + # BuildConfig is not available in tvm_runtime + BuildConfig.current = build_config() def get_binds(args, binds=None): """Internal function to get binds and arg_list given arguments. 
diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc index 3b5916ea5fec..de388cf0b51f 100644 --- a/src/api/api_lang.cc +++ b/src/api/api_lang.cc @@ -10,6 +10,7 @@ #include #include #include +#include namespace tvm { diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index 2e8e5bb278eb..cca09a966e21 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -179,7 +179,7 @@ void GetBinds(const Array& args, for (const auto &x : args) { if (out_binds->find(x) == out_binds->end()) { auto buf = BufferWithOffsetAlignment(x->shape, x->dtype, x->op->name, - config.data_alignment, config.offset_factor); + config->data_alignment, config->offset_factor); out_binds->Set(x, buf); out_arg_list->push_back(buf); } else { @@ -218,14 +218,14 @@ Stmt BuildStmt(Schedule sch, stmt = ir::StorageFlatten(stmt, out_binds, 64); stmt = ir::CanonicalSimplify(stmt); if (loop_partition) { - stmt = ir::LoopPartition(stmt, config.partition_const_loop); + stmt = ir::LoopPartition(stmt, config->partition_const_loop); } stmt = ir::VectorizeLoop(stmt); stmt = ir::InjectVirtualThread(stmt); - stmt = ir::InjectDoubleBuffer(stmt, config.double_buffer_split_loop); + stmt = ir::InjectDoubleBuffer(stmt, config->double_buffer_split_loop); stmt = ir::StorageRewrite(stmt); - stmt = ir::UnrollLoop(stmt, config.auto_unroll_max_step, config.auto_unroll_max_depth, - config.auto_unroll_max_extent, config.unroll_explicit); + stmt = ir::UnrollLoop(stmt, config->auto_unroll_max_step, config->auto_unroll_max_depth, + config->auto_unroll_max_extent, config->unroll_explicit); // Phase 2 stmt = ir::Simplify(stmt); @@ -243,7 +243,7 @@ Array lower(Schedule sch, const BuildConfig& config) { Array out_arg_list; auto stmt = BuildStmt(sch, args, binds, true, &out_arg_list, config); - return Array({ ir::MakeAPI(stmt, name, out_arg_list, 0, config.restricted_func) }); + return Array({ ir::MakeAPI(stmt, name, out_arg_list, 0, config->restricted_func) }); } runtime::Module build(const Array& funcs, @@ -266,7 +266,7 @@ runtime::Module build(const Array& funcs, for (const auto &x : funcs) { if (x->func_type == kMixedFunc) { auto func = x; - if (config.detect_global_barrier) { + if (config->detect_global_barrier) { func = ir::ThreadSync(func, "global"); } @@ -321,4 +321,27 @@ runtime::Module build(const Array& funcs, return mhost; } + +BuildConfig build_config() { + return BuildConfig(std::make_shared()); +} + +TVM_REGISTER_NODE_TYPE(BuildConfigNode); + +TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) +.set_dispatch([](const BuildConfigNode *op, IRPrinter *p) { + p->stream << "build_config("; + p->stream << "data_alignment=" << op->data_alignment << ", "; + p->stream << "offset_factor=" << op->offset_factor << ", "; + p->stream << "double_buffer_split_loop=" << op->double_buffer_split_loop << ", "; + p->stream << "auto_unroll_max_step=" << op->auto_unroll_max_step << ", "; + p->stream << "auto_unroll_max_depth=" << op->auto_unroll_max_depth << ", "; + p->stream << "auto_unroll_max_extent=" << op->auto_unroll_max_extent << ", "; + p->stream << "unroll_explicit=" << op->unroll_explicit << ", "; + p->stream << "restricted_func=" << op->restricted_func << ", "; + p->stream << "detect_global_barrier=" << op->detect_global_barrier << ", "; + p->stream << "partition_const_loop=" << op->partition_const_loop; + p->stream << ")"; +}); + } // namespace tvm diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc index fc3f6ac9324d..fe0a9151cc2c 100644 --- a/tests/cpp/build_module_test.cc +++ 
b/tests/cpp/build_module_test.cc @@ -27,7 +27,7 @@ TEST(BuildModule, Basic) { auto args = Array({ A, B, C }); std::unordered_map binds; - BuildConfig config; + auto config = build_config(); auto target = target::llvm(); auto lowered = lower(s, args, "func", binds, config); From 71f1b2d76b3d30924adeb67a0221d72977946e08 Mon Sep 17 00:00:00 2001 From: Siju Samuel Date: Mon, 12 Feb 2018 11:26:08 +0530 Subject: [PATCH 150/948] [Documentation Changes] Parameter description change for reduction apis (#881) * [Documentation Changes] Parameter description change for reduction api The parameter description is updated for max, min, argmax and argmin * Lint changes on patch-3 of reduction.py --- topi/python/topi/reduction.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/topi/python/topi/reduction.py b/topi/python/topi/reduction.py index 997ec8e9ba95..9f88953bb770 100644 --- a/topi/python/topi/reduction.py +++ b/topi/python/topi/reduction.py @@ -186,9 +186,9 @@ def max(data, axis=None, keepdims=False): The input tvm tensor axis : None or int or tuple of int - Axis or axes along which a sum is performed. - The default, axis=None, will sum all of the elements of the input array. - If axis is negative it counts from the last to the first axis. + Axis or axes along which the max operation is performed. + The default, axis=None, will find the max element from all of the elements of the input + array. If axis is negative it counts from the last to the first axis. keepdims : bool If this is set to True, the axes which are reduced are left in the result as dimensions @@ -212,9 +212,9 @@ def min(data, axis=None, keepdims=False): The input tvm tensor axis : None or int or tuple of int - Axis or axes along which a sum is performed. - The default, axis=None, will sum all of the elements of the input array. - If axis is negative it counts from the last to the first axis. + Axis or axes along which a minimum operation is performed. + The default, axis=None, will find the minimum element from all of the elements of the + input array. If axis is negative it counts from the last to the first axis. keepdims : bool If this is set to True, the axes which are reduced are left in the result as dimensions @@ -238,9 +238,9 @@ def argmax(data, axis=None, keepdims=False): The input tvm tensor axis : None or int or tuple of int - Axis or axes along which a sum is performed. - The default, axis=None, will sum all of the elements of the input array. - If axis is negative it counts from the last to the first axis. + Axis or axes along which a argmax operation is performed. + The default, axis=None, will find the indices of the maximum element of the elements of + the input array. If axis is negative it counts from the last to the first axis. keepdims : bool If this is set to True, the axes which are reduced are left in the result as dimensions @@ -265,9 +265,9 @@ def argmin(data, axis=None, keepdims=False): The input tvm tensor axis : None or int or tuple of int - Axis or axes along which a sum is performed. - The default, axis=None, will sum all of the elements of the input array. - If axis is negative it counts from the last to the first axis. + Axis or axes along which a argmin operation is performed. + The default, axis=None, will find the indices of minimum element all of the elements of + the input array. If axis is negative it counts from the last to the first axis. 
keepdims : bool If this is set to True, the axes which are reduced are left in the result as dimensions From 6224779dc03ee7b3e36274a1265daff8f5690787 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 14 Feb 2018 00:55:23 +0800 Subject: [PATCH 151/948] [TOPI] Add winograd for mali (#898) * add winograd for mali * fix lint * add padding * fix comment --- topi/python/topi/mali/conv2d.py | 225 ++++++++++++++++++++++++++++++-- 1 file changed, 212 insertions(+), 13 deletions(-) diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py index ff67e0503f4f..5b4cf5bae6ff 100644 --- a/topi/python/topi/mali/conv2d.py +++ b/topi/python/topi/mali/conv2d.py @@ -2,6 +2,8 @@ """conv2d schedule on ARM Mali GPU""" from __future__ import absolute_import as _abs + +import numpy as np import tvm from .. import generic @@ -63,7 +65,23 @@ def transpose(s, tensor, readers): s[tmp].compute_inline() return s.cache_write(tmp, "global"), tmp -@conv2d.register("mali") +def const_array(data, name): + """ convert an const array to tvm tensor""" + row, col = data.shape + dtype = str(data.dtype) + + def select_array(i, j): + now = tvm.const(0.0, dtype) + for ii in range(row): + for jj in range(col): + now = tvm.select(tvm.all(i % row == ii, j % col == jj), + tvm.const(data[ii][jj], dtype), + now) + return now + return tvm.compute(data.shape, select_array, name=name) + + +@conv2d.register(["mali"]) def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'): """Conv2D operator for ARM Mali GPU backend. @@ -94,10 +112,20 @@ def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32 assert data.dtype == kernel.dtype, "Do not support inputs with different data types now." out_dtype = data.dtype - if util.get_const_int(kernel.shape[2]) == 1: + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + kernel_shape = util.get_const_tuple(kernel.shape) + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + + if (kernel_shape[2:4] == (3, 3) and (HPAD, WPAD) == (1, 1) and kernel_shape[0] >= 64 and + (HSTR, WSTR) == (1, 1)): + return _decl_winograd(data, kernel, stride, padding, layout, out_dtype) + elif kernel_shape[2:4] == (1, 1): return _decl_im2col(data, kernel, stride, padding, layout, out_dtype) else: - return _decl_direct(data, kernel, stride, padding, layout, out_dtype) + return _decl_spatialpack(data, kernel, stride, padding, layout, out_dtype) @generic.schedule_conv2d_nchw.register(["mali"]) def schedule_conv2d_nchw(outs): @@ -129,14 +157,17 @@ def traverse(op): if 'im2col_conv_output' in op.tag: _schedule_im2col_conv2d(s, op) - if 'direct_conv_output' in op.tag: - _schedule_direct_conv2d(s, op) + if 'spatialpack_conv_output' in op.tag: + _schedule_spatialpack_conv2d(s, op) + + if 'winograd_conv_output' in op.tag: + _schedule_winograd(s, op) traverse(outs[0].op) return s -def _decl_direct(data, kernel, stride, padding, layout, out_dtype): - """declare the direct method (spatial packing) for conv2d""" +def _decl_spatialpack(data, kernel, stride, padding, layout, out_dtype): + """declare the spatialpack method (spatial packing) for conv2d""" _, CI, IH, IW = [util.get_const_int(x) for x in data.shape] CO, _, KH, KW = [util.get_const_int(x) for x in kernel.shape] HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) @@ -207,12 +238,12 @@ def _decl_direct(data, kernel, stride, padding, layout, out_dtype): output = tvm.compute(oshape, lambda n, co, h, w: conv[n][co//VC][h/VH][w//VW][h%VH][w%VW][co%VC], - 
name='output_unpack', tag='direct_conv_output') + name='output_unpack', tag='spatialpack_conv_output') return output -def _schedule_direct_conv2d(s, op): - """schedule the direct method (spatial packing) for conv2d""" +def _schedule_spatialpack_conv2d(s, op): + """schedule the spatialpack method (spatial packing) for conv2d""" # get ops and tensors output = op.output(0) output_height = util.get_const_int(output.shape[2]) @@ -294,8 +325,6 @@ def _schedule_direct_conv2d(s, op): _, co, oh, ow = s[output].op.axis tile_and_bind3d(s, output, co, oh, ow, num_thread, 1, last) - #print(tvm.lower(s, [data, kernel, output], simple_mode=True)) - def _decl_im2col(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'): """declare the Im2Col method for conv2d""" _, CI, IH, IW = [x.value for x in data.shape] @@ -476,4 +505,174 @@ def _schedule_im2col_conv2d(s, op): s[output].vectorize(vw) fuse_and_bind(s, output, [n, co, h, w]) - #print(tvm.lower(s, [data, kernel], simple_mode=True)) +def _decl_winograd(data, kernel, stride, padding, layout, out_dtype): + """declare winograd fast convolution F(2x2, 3x3) for conv2d""" + N, CI, H, W = [util.get_const_int(x) for x in data.shape] + CO, CI, KH, KW = [util.get_const_int(x) for x in kernel.shape] + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + + assert HSTR == 1 and WSTR == 1 and HPAD == 1 and WPAD == 1 and KH == 3 and KW == 3 + data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") + + B_data = np.array([ + [1, 0, 0, 0], + [0, 1, -1, 1], + [-1, 1, 1, 0], + [0, 0, 0, -1] + ], out_dtype) + + G_data = np.array([ + [1, 0, 0], + [1.0/2, 1.0/2, 1.0/2], + [1.0/2, -1.0/2, 1.0/2], + [0, 0, 1], + ], out_dtype) + + A_data = np.array([ + [1, 0], + [1, 1], + [1, -1], + [0, -1], + ], out_dtype) + + m = 2 + r = 3 + alpha = m + r - 1 + K = CO + C = CI + + nH, nW = (H + m-1) // m, (W + m-1) // m + P = N * nH * nW + + bna, bnb = 4, 4 + if data.dtype == 'float16': + bnb *= 2 + P_round = (P + bnb - 1) // bnb * bnb + assert K % bna == 0 and P_round % bnb == 0 + + # pack input tile + input_tile = tvm.compute((C, P_round // bnb, alpha, alpha, bnb), + lambda c, b, eps, nu, bb: + tvm.select(b * bnb + bb < P,\ + data_pad[(b*bnb+bb) // (nH*nW)][c][(b*bnb+bb) // nW % nH * m + eps]\ + [(b*bnb+bb) % nW * m + nu], tvm.const(0, data_pad.dtype)), + name='d') + + # transform kernel + G = const_array(G_data, 'G') + r_kh = tvm.reduce_axis((0, KH), 'r_kh') + r_kw = tvm.reduce_axis((0, KW), 'r_kw') + U = tvm.compute((alpha, alpha, K // bna, C, bna), lambda eps, nu, k, c, kk: + tvm.sum(kernel[k * bna + kk][c][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], + axis=[r_kh, r_kw]), name='U') + + # transform image + B = const_array(B_data, 'B') + r_eps = tvm.reduce_axis((0, alpha), 'r_eps') + r_nu = tvm.reduce_axis((0, alpha), 'r_nu') + V = tvm.compute((alpha, alpha, P_round // bnb, C, bnb), lambda eps, nu, b, c, bb: + tvm.sum(input_tile[c][b][r_eps][r_nu][bb] * B[r_eps][eps] * B[r_nu][nu], + axis=[r_eps, r_nu]), name='V') + + # batch gemm + c = tvm.reduce_axis((0, C), name='c') + M = tvm.compute((alpha, alpha, K, P_round), lambda eps, nu, k, b: + tvm.sum(U[eps][nu][k // bna][c][k % bna] * + V[eps][nu][b // bnb][c][b % bnb], axis=c), name='M') + + # inverse transform + A = const_array(A_data, 'A') + r_eps = tvm.reduce_axis((0, alpha), 'r_eps') + r_nu = tvm.reduce_axis((0, alpha), 'r_nu') + Y = tvm.compute((K, P, m, m), lambda k, b, vh, vw: + tvm.sum(M[r_eps][r_nu][k][b] * A[r_eps][vh] * 
A[r_nu][vw], + axis=[r_eps, r_nu]), name='Y') + + # unpack output + output = tvm.compute((N, K, H, W), lambda n, k, h, w: + Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m] + # thw following term is used to make the padding effective, + # otherwise the padding will be eliminated by bound inference + + tvm.const(0, out_dtype) * M[alpha-1][alpha-1][K-1][P_round-1], + name='output', tag='winograd_conv_output') + + return output + +def _schedule_winograd(s, op): + """schedule winograd fast convolution F(2x2, 3x3) for conv2d""" + + # get ops and tensors + output = op.output(0) + + Y = op.input_tensors[0] + M, A = s[Y].op.input_tensors + U, V = s[M].op.input_tensors + kernel, G = s[U].op.input_tensors + d, B = s[V].op.input_tensors + data_pad = s[d].op.input_tensors[0] + data = s[data_pad].op.input_tensors[0] + + # padding + s[data_pad].compute_inline() + + # pack input tiles + c, b, eps, nu, bb = s[d].op.axis + s[d].reorder(eps, nu, bb) + aha = s[d].fuse(eps, nu) + s[d].unroll(bb) + tile_and_bind3d(s, d, c, b, aha, 4, 1, 1) + + # transform kernel + s[G].compute_inline() + eps, nu, k, c, kk, = s[U].op.axis + r_kh, r_kw = s[U].op.reduce_axis + s[U].reorder(k, c, kk, eps, nu, r_kh, r_kw) + _ = [s[U].unroll(x) for x in [eps, nu, r_kh, r_kw]] + s[U].vectorize(kk) + tile_and_bind(s, U, k, c, 1, 256) + + # transform image + s[B].compute_inline() + eps, nu, b, c, bb = s[V].op.axis + r_eps, r_nu = s[V].op.reduce_axis + s[V].reorder(b, c, bb, eps, nu, r_nu, r_eps) + _ = [s[V].unroll(x) for x in [eps, nu, r_eps, r_nu]] + s[V].vectorize(bb) + tile_and_bind(s, V, b, c, 2, 1) + + # batch gemm + bna, bnb = 4, 4 + if data.dtype == 'float16': + bnb *= 2 + + eps, nu, k, b = s[M].op.axis + c = s[M].op.reduce_axis[0] + yo, xo, yi, xi = s[M].tile(k, b, bna, bnb) + s[M].reorder(c, yi, xi) + c, c_unroll = s[M].split(c, 2) + s[M].unroll(c_unroll) + s[M].unroll(yi) + s[M].vectorize(xi) + z = s[M].fuse(eps, nu) + tile_and_bind3d(s, M, z, yo, xo, 1, 8, 1) + + # inverse transform + s[A].compute_inline() + k, b, vh, vw = s[Y].op.axis + r_eps, r_nu = s[Y].op.reduce_axis + _ = [s[Y].unroll(x) for x in [vh, vw, r_eps, r_nu]] + tile_and_bind(s, Y, k, b, 4, 1) + + # schedule output + if output.op in s.outputs: # no bias + output = output + else: # has bias + s[output].compute_inline() + output = s.outputs[0] + + _, k, h, w = s[output].op.axis + tile_and_bind3d(s, output, k, h, w, 1, 2, 2) From 1ae02f1e03510b742383fa70eeb72fdca8eaf397 Mon Sep 17 00:00:00 2001 From: Siva Date: Tue, 13 Feb 2018 22:32:21 +0530 Subject: [PATCH 152/948] [Graph Debug] bug fix (#897) Need to break after executing intended operation (not before). --- src/runtime/graph/graph_runtime.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index bf07a8c38927..0c642577da59 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -138,8 +138,8 @@ class GraphRuntime : public ModuleNode { uint32_t eid = index; for (size_t i = 0; i < op_execs_.size(); ++i) { - if (static_cast(i) == index) break; if (op_execs_[i]) op_execs_[i](); + if (static_cast(i) == index) break; } TVM_CCALL(TVMArrayCopyFromTo(&data_entry_[eid], data_out, nullptr)); From 4818cb2b1a829673e89bf4937215aa3f06018e54 Mon Sep 17 00:00:00 2001 From: Zhixun Tan Date: Thu, 15 Feb 2018 12:44:52 -0500 Subject: [PATCH 153/948] Change CodeGenCPU::GetPackedFuncHandle to generate global variable with InternalLinkage. (#901) Emscripten seems to not have done initialization properly. 
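
For readers unfamiliar with this code path, here is a rough C++-level sketch of the handle caching that CodeGenCPU::GetPackedFuncHandle emits. It is illustrative only: the helper name, header, and control flow are assumptions based on the usual TVMBackendGetFuncFromEnv lookup, not the literal generated LLVM IR.

    // Minimal sketch, assuming the generated code resolves packed functions
    // through TVMBackendGetFuncFromEnv (see c_backend_api.h).
    #include <tvm/runtime/c_backend_api.h>

    // One cache slot per call site (".tvm_func.<fname>"), null-initialized.
    // With this patch the slot gets internal linkage, so it is always a
    // module-local definition instead of a linkonce global that Emscripten
    // would have to resolve and initialize across modules.
    static void* tvm_func_handle = nullptr;

    static void* GetFuncHandle(void* mod_ctx, const char* fname) {
      if (tvm_func_handle == nullptr) {
        // Lazily look up the packed function and cache the handle.
        TVMBackendGetFuncFromEnv(mod_ctx, fname, &tvm_func_handle);
      }
      return tvm_func_handle;
    }

Since the cache variable is now internal, its null initializer is presumably emitted locally in every module rather than relying on Emscripten's cross-module handling of linkonce globals, which the original report suggests was not initializing it properly.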
--- src/codegen/llvm/codegen_cpu.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/codegen/llvm/codegen_cpu.cc b/src/codegen/llvm/codegen_cpu.cc index 876d1e12d8a2..54aa50427413 100644 --- a/src/codegen/llvm/codegen_cpu.cc +++ b/src/codegen/llvm/codegen_cpu.cc @@ -483,7 +483,7 @@ llvm::Value* CodeGenCPU::GetPackedFuncHandle(const std::string& fname) { // create the function handle hptr = new llvm::GlobalVariable( *module_, t_tvm_func_handle_, false, - llvm::GlobalValue::LinkOnceAnyLinkage, 0, ".tvm_func." + fname); + llvm::GlobalValue::InternalLinkage, nullptr, ".tvm_func." + fname); hptr->setAlignment(align); hptr->setInitializer(llvm::Constant::getNullValue(t_tvm_func_handle_)); func_handle_map_[fname] = hptr; From d419aaf4039b098fc786a9377b8239ce1e2ef478 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Thu, 15 Feb 2018 10:03:11 -0800 Subject: [PATCH 154/948] Add dense base scheduler (#887) * Add basic dense scheduler * Revert to put back cpp dense registration * Fix lint --- topi/python/topi/x86/nn.py | 60 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/topi/python/topi/x86/nn.py b/topi/python/topi/x86/nn.py index 49aa382589d1..4c886b48d9c2 100644 --- a/topi/python/topi/x86/nn.py +++ b/topi/python/topi/x86/nn.py @@ -1,7 +1,9 @@ +# pylint: disable=invalid-name,too-many-locals """x86 nn operators""" from __future__ import absolute_import as _abs import tvm from .. import generic +from .. import tag def _default_schedule(outs, auto_inline): """Default schedule for x86.""" @@ -54,3 +56,61 @@ def schedule_pool(outs): The computation schedule for the op. """ return _default_schedule(outs, False) + + +@generic.schedule_dense.register(["cpu"]) +def schedule_dense(outs): + """Schedule for dense + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of pool + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. 
+ """ + + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def traverse(op): + """Traverse operators from computation graph""" + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + if 'dense' in op.tag: + C = op.output(0) + x, y = C.op.axis + + # Write cache for blocks + CC = s.cache_write(C, 'global') + + # Tile + bnx = 1 + bny = 4 + _, yo, _, yi = s[C].tile(x, y, bnx, bny) + s[CC].compute_at(s[C], yo) + xc, yc = s[CC].op.axis + k, = s[CC].op.reduce_axis + ko, ki = s[CC].split(k, factor=4) + s[CC].reorder(ko, xc, ki, yc) + s[CC].unroll(ki) + s[CC].vectorize(yc) + + # Vectorization + s[C].vectorize(yi) + + # Parallelization + s[C].parallel(yo) + + traverse(outs[0].op) + return s From 4b8e1c21e3ca1949f30585c23a968f026d2b9f1f Mon Sep 17 00:00:00 2001 From: Zhixun Tan Date: Sun, 18 Feb 2018 11:45:44 -0500 Subject: [PATCH 155/948] Set total memory of emcc module to 1GB (#906) --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 7ed46fd14b05..9e15d60e8c9b 100644 --- a/Makefile +++ b/Makefile @@ -32,6 +32,7 @@ FRAMEWORKS = OBJCFLAGS = -fno-objc-arc EMCC_FLAGS= -std=c++11 -DDMLC_LOG_STACK_TRACE=0\ -Oz -s RESERVED_FUNCTION_POINTERS=2 -s MAIN_MODULE=1 -s NO_EXIT_RUNTIME=1\ + -s TOTAL_MEMORY=1073741824\ -s EXTRA_EXPORTED_RUNTIME_METHODS="['cwrap','getValue','setValue','addFunction']"\ -s USE_GLFW=3 -s USE_WEBGL2=1 -lglfw\ $(INCLUDE_FLAGS) From b24f3104d983929cf299c093b6b0782ea0e98dca Mon Sep 17 00:00:00 2001 From: masahi Date: Mon, 19 Feb 2018 03:21:47 +0900 Subject: [PATCH 156/948] [TOPI] update c++ pool and softmax (#905) * update c++ pool and softmax * clean up reduce axis --- topi/include/topi/nn/pooling.h | 130 ++++++++++++++++++++- topi/include/topi/nn/softmax.h | 71 ++++++++--- topi/src/topi.cc | 4 +- topi/tests/python_cpp/test_topi_pooling.py | 2 +- topi/tests/python_cpp/test_topi_softmax.py | 2 +- 5 files changed, 182 insertions(+), 27 deletions(-) diff --git a/topi/include/topi/nn/pooling.h b/topi/include/topi/nn/pooling.h index 4333f2749573..5d68510c3842 100644 --- a/topi/include/topi/nn/pooling.h +++ b/topi/include/topi/nn/pooling.h @@ -36,12 +36,13 @@ enum PoolType : int { * * \return The output tensor in NCHW order */ -inline Tensor pool(const Tensor& x, - const Array& kernel_size, - const Array& stride_size, - const Array& padding_size, - PoolType pool_type, - bool ceil_mode) { + +inline Tensor pool_nchw(const Tensor& x, + const Array& kernel_size, + const Array& stride_size, + const Array& padding_size, + PoolType pool_type, + bool ceil_mode) { CHECK_EQ(x->shape.size(), 4) << "Pooling input must be 4-D"; CHECK_EQ(kernel_size.size(), 2) << "Pooling kernel_size must have 2 elements"; CHECK_EQ(stride_size.size(), 2) << "Pooling stride_size must have 2 elements"; @@ -112,6 +113,123 @@ inline Tensor pool(const Tensor& x, } } +/*! 
+* \brief Perform pooling on data in NHWC order +* +* \param x The input tensor in NHWC order +* \param kernel_size Vector of two ints: {kernel_height, kernel_width} +* \param stride_size Vector of two ints: {stride_height, stride_width} +* \param padding_size Vector of two ints: {padding_height, padding_width} +* \param pool_type The type of pooling operator +* \param ceil_mode Whether to use ceil when calculating the output size +* +* \return The output tensor in NCHW order +*/ + +inline Tensor pool_nhwc(const Tensor& x, + const Array& kernel_size, + const Array& stride_size, + const Array& padding_size, + PoolType pool_type, + bool ceil_mode) { + CHECK_EQ(x->shape.size(), 4) << "Pooling input must be 4-D"; + CHECK_EQ(kernel_size.size(), 2) << "Pooling kernel_size must have 2 elements"; + CHECK_EQ(stride_size.size(), 2) << "Pooling stride_size must have 2 elements"; + CHECK_EQ(padding_size.size(), 2) << "Pooling padding_size must have 2 elements"; + + auto kernel_height = kernel_size[0]; + auto kernel_width = kernel_size[1]; + auto stride_height = stride_size[0]; + auto stride_width = stride_size[1]; + auto padding_height = padding_size[0]; + auto padding_width = padding_size[1]; + + auto batch = x->shape[0]; + auto height = x->shape[1]; + auto width = x->shape[2]; + auto channel = x->shape[3]; + + auto pad_tuple = detail::GetPadTuple(padding_height, padding_width); + auto pad_top = pad_tuple[0]; + auto pad_left = pad_tuple[1]; + auto pad_down = pad_tuple[2]; + auto pad_right = pad_tuple[3]; + + if (ceil_mode) { + // Additional padding to ensure we do ceil instead of floor when + // dividing by stride. + pad_down += stride_height - 1; + pad_right += stride_width - 1; + } + + Array pad_before{ 0, pad_top, pad_left, 0}; + Array pad_after{ 0, pad_down, pad_right, 0}; + + auto out_height = tvm::ir::Simplify( + (height - kernel_height + pad_top + pad_down) / stride_height + 1); + auto out_width = tvm::ir::Simplify( + (width - kernel_width + pad_left + pad_right) / stride_width + 1); + + auto dheight = tvm::reduce_axis(Range(0, kernel_height)); + auto dwidth = tvm::reduce_axis(Range(0, kernel_width)); + + if (pool_type == kMaxPool) { + auto temp = pad(x, pad_before, pad_after, x->dtype.min(), "pad_temp"); + return tvm::compute( + { batch, out_height, out_width, channel }, + [&](Var n, Var h, Var w, Var c) { + return tvm::max(temp(n, h * stride_height + dheight, w * stride_width + dwidth, c), + { dheight, dwidth }); + }, "tensor", "pool_max"); + } else if (pool_type == kAvgPool) { + auto temp = pad(x, pad_before, pad_after, 0, "pad_temp"); + + auto tsum = tvm::compute( + { batch, out_height, out_width, channel }, + [&](Var n, Var h, Var w, Var c) { + return tvm::sum(temp(n, h * stride_height + dheight, w * stride_width + dwidth, c), + { dheight, dwidth }); + }, "tensor", "pool_avg"); + + return tvm::compute( + { batch, out_height, out_width, channel }, + [&](Var n, Var h, Var w, Var c) { + return tsum(n, h, w, c) / (kernel_height * kernel_width); + }, "tensor", kElementWise); + } else { + LOG(ERROR) << "Unrecognized pool_type: " << pool_type; + return x; + } +} + +/*! 
+* \brief Perform pooling on data +* +* \param x The input tensor in NCHW or NHWC order +* \param kernel_size Vector of two ints: {kernel_height, kernel_width} +* \param stride_size Vector of two ints: {stride_height, stride_width} +* \param padding_size Vector of two ints: {padding_height, padding_width} +* \param pool_type The type of pooling operator +* \param ceil_mode Whether to use ceil when calculating the output size +* \param layout The input layout +* +* \return The output tensor in NCHW order +*/ + +inline Tensor pool(const Tensor& x, + const Array& kernel_size, + const Array& stride_size, + const Array& padding_size, + PoolType pool_type, + bool ceil_mode, + const std::string& layout = "NCHW") { + CHECK(layout == "NCHW" || layout == "NHWC") << "Unsupported layout."; + if (layout == "NCHW") + return pool_nchw(x, kernel_size, stride_size, padding_size, pool_type, ceil_mode); + else + return pool_nhwc(x, kernel_size, stride_size, padding_size, pool_type, ceil_mode); +} + /*! * \brief Perform global pooling on data in NCHW order * diff --git a/topi/include/topi/nn/softmax.h b/topi/include/topi/nn/softmax.h index 273bac4ff76d..d2348c9f230a 100644 --- a/topi/include/topi/nn/softmax.h +++ b/topi/include/topi/nn/softmax.h @@ -9,6 +9,7 @@ #include #include +#include "topi/reduction.h" #include "topi/tags.h" #include "tvm/tvm.h" @@ -19,34 +20,70 @@ using namespace tvm; /*! * \brief Softmax activation * -* \param x The input tensor. 2-D where softmax is performed along the second dimension +* \param x The input tensor. Can be any dimension +* \param axis The channel axis along which softmax is performed * \param name The name of the operation * \param tag The tag to mark the operation * * \return A Tensor whose op member is the softmax operation */ -inline Tensor softmax(const Tensor& x, +inline Tensor softmax(const Tensor &x, + int axis = -1, std::string name = "tensor", std::string tag = "softmax_output") { - CHECK_EQ(x->shape.size(), 2) << "Softmax requires 2-D input"; + auto input_shape = x->shape; + auto ndim = input_shape.size(); + if (axis < 0) { + axis = ndim + axis; + } + CHECK_LT(axis, ndim) << "axis parameter should be less than input dim"; - Expr m = x->shape[0]; - Expr n = x->shape[1]; + auto k1 = tvm::reduce_axis(Range(0, input_shape[axis]), "k1"); + auto k2 = tvm::reduce_axis(Range(0, input_shape[axis]), "k2"); + auto reduced_shape = MakeReduceTargetShape({axis}, x, false); - auto k = tvm::reduce_axis(Range(0, n), "k"); - auto max_elem = tvm::compute( - { m }, [&](Var i) { - return tvm::max(x(i, k), Array{ k }); }); - k = tvm::reduce_axis(Range(0, n), "k"); + auto insert_reduce_index = [axis, ndim](const Array &indices, + const IterVar &reduce_index) { + Array eval_range; + int arg_counter = 0; + for (size_t i = 0; i < ndim; ++i) { + if (i == axis) + eval_range.push_back(reduce_index); + else + eval_range.push_back(indices[arg_counter++]); + } + return eval_range; + }; - auto expsum = tvm::compute( - { m }, [&](Var i) { - return tvm::sum(tvm::exp(x(i, k) - max_elem(i)), { k }); }); + auto _compute_max = [&](const Array &indices) { + auto eval_range = insert_reduce_index(indices, k1); + return topi::MaxOp(x(eval_range), {k1}); + }; - return tvm::compute( - x->shape, [&](Var i, Var j) { - return tvm::exp(x(i, j) - max_elem(i)) / expsum(i); - }); + auto _compute_expsum = [&](const Tensor &max_elem, + const Array &indices) { + auto eval_range = insert_reduce_index(indices, k2); + return tvm::sum(tvm::exp(x(eval_range) - max_elem(indices)), {k2}); + }; + + auto _normalize = 
[&](const Tensor &max_elem, const Tensor &expsum, + const Array &indices) { + Array non_reduce_indices; + for (size_t i = 0; i < ndim; ++i) { + if (i != axis) + non_reduce_indices.push_back(indices[i]); + } + return tvm::exp(x(indices) - max_elem(non_reduce_indices)) / + expsum(non_reduce_indices); + }; + + auto max_elem = tvm::compute(reduced_shape, _compute_max); + auto expsum = tvm::compute(reduced_shape, [&](const Array &indices) { + return _compute_expsum(max_elem, indices); + }); + return tvm::compute(input_shape, [&](const Array &indices) { + return _normalize(max_elem, expsum, indices); + }); } /*! diff --git a/topi/src/topi.cc b/topi/src/topi.cc index 970e982276f6..d6b67c74bacc 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -320,7 +320,7 @@ TVM_REGISTER_GLOBAL("topi.nn.pool") .set_body([](TVMArgs args, TVMRetValue *rv) { *rv = nn::pool(args[0], args[1], args[2], args[3], static_cast(static_cast(args[4])), - args[5]); + args[5], args[6]); }); TVM_REGISTER_GLOBAL("topi.nn.global_pool") @@ -332,7 +332,7 @@ TVM_REGISTER_GLOBAL("topi.nn.global_pool") /* Ops from nn/softmax.h */ TVM_REGISTER_GLOBAL("topi.nn.softmax") .set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = nn::softmax(args[0]); + *rv = nn::softmax(args[0], args[1]); }); TVM_REGISTER_GLOBAL("topi.nn.log_softmax") diff --git a/topi/tests/python_cpp/test_topi_pooling.py b/topi/tests/python_cpp/test_topi_pooling.py index 6132fcd36469..e45b53dc0dec 100644 --- a/topi/tests/python_cpp/test_topi_pooling.py +++ b/topi/tests/python_cpp/test_topi_pooling.py @@ -16,7 +16,7 @@ def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode): ph, pw = padding A = tvm.placeholder((n, ic, ih, iw), name='A') B = topi.cpp.nn.pool(A, [kh, kw], [sh, sw], padding, - pool_code[pool_type], ceil_mode) + pool_code[pool_type], ceil_mode, "NCHW") B = topi.cpp.nn.relu(B) dtype = A.dtype diff --git a/topi/tests/python_cpp/test_topi_softmax.py b/topi/tests/python_cpp/test_topi_softmax.py index 2c2f62c18f20..2a6baaafe2f1 100644 --- a/topi/tests/python_cpp/test_topi_softmax.py +++ b/topi/tests/python_cpp/test_topi_softmax.py @@ -8,7 +8,7 @@ def verify_softmax(m, n): A = tvm.placeholder((m, n), name='A') - B = topi.cpp.nn.softmax(A) + B = topi.cpp.nn.softmax(A, 1) # confirm lower works s = tvm.create_schedule([B.op]) tvm.lower(s, [A, B], simple_mode=True) From 2d132be71660b2bbb33d112575af8a8bc3d7df58 Mon Sep 17 00:00:00 2001 From: Meghan Cowan Date: Sun, 18 Feb 2018 17:02:40 -0800 Subject: [PATCH 157/948] Add UIntImm to select rewrite (#909) --- src/pass/rewrite_unsafe_select.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pass/rewrite_unsafe_select.cc b/src/pass/rewrite_unsafe_select.cc index ecafe8eba06c..3224f47907a3 100644 --- a/src/pass/rewrite_unsafe_select.cc +++ b/src/pass/rewrite_unsafe_select.cc @@ -77,6 +77,7 @@ class UnsafeExprDetector : public ExprFunctor { return false; } bool VisitExpr_(const Variable* op) final { return false; } + bool VisitExpr_(const UIntImm* op) final { return false; } bool VisitExpr_(const IntImm* op) final { return false; } bool VisitExpr_(const FloatImm* op) final { return false; } bool VisitExpr_(const StringImm* op) final { return false; } From a7dc0def2be8e23494cd47a2776b57afe8bf0abb Mon Sep 17 00:00:00 2001 From: masahi Date: Tue, 20 Feb 2018 05:46:36 +0900 Subject: [PATCH 158/948] add missing inline (#910) --- topi/include/topi/reduction.h | 42 +++++++++++++++++------------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/topi/include/topi/reduction.h 
b/topi/include/topi/reduction.h index 3fda17130fce..a4a3e4e999ac 100644 --- a/topi/include/topi/reduction.h +++ b/topi/include/topi/reduction.h @@ -35,7 +35,7 @@ using FCommReduce = std::function< * If any input element is negative, it will be treated as an offset from the * last dimension (same as python indexing rules). */ -std::vector GetRealAxis(int ndim, const std::vector& axis) { +inline std::vector GetRealAxis(int ndim, const std::vector& axis) { std::vector real_axis; if (axis.size() == 0) { for (int i = 0; i < ndim; ++i) { @@ -59,7 +59,7 @@ std::vector GetRealAxis(int ndim, const std::vector& axis) { } /*! \brief Enumerate the axes for a reduce op */ -Array MakeReduceAxes(const std::vector& real_axis, const Tensor& data) { +inline Array MakeReduceAxes(const std::vector& real_axis, const Tensor& data) { Array reduce_axes; for (auto i : real_axis) { std::string name = "k" + std::to_string(i); @@ -70,9 +70,9 @@ Array MakeReduceAxes(const std::vector& real_axis, const Tensor& d } /*! \brief Calculate the target shape for a reduce op */ -Array MakeReduceTargetShape(const std::vector& real_axis, - const Tensor& data, - bool keepdims) { +inline Array MakeReduceTargetShape(const std::vector& real_axis, + const Tensor& data, + bool keepdims) { auto ndim = data->shape.size(); Array target_shape; if (keepdims) { @@ -107,10 +107,10 @@ Array MakeReduceTargetShape(const std::vector& real_axis, * * \return The result tensor. */ -Tensor CommReduce(const Tensor& data, - const Array& axis, - FReduce func, - bool keepdims = false) { +inline Tensor CommReduce(const Tensor& data, + const Array& axis, + FReduce func, + bool keepdims = false) { auto ndim = data->shape.size(); CHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor"; auto axis_val = detail::GetConstIntValues(axis, "axis"); @@ -159,10 +159,10 @@ Tensor CommReduce(const Tensor& data, * * \return The result tensor. */ -Tensor CommReduceIdx(const Tensor& data, - const Array& axis, - FCommReduce func, - bool keepdims = false) { +inline Tensor CommReduceIdx(const Tensor& data, + const Array& axis, + FCommReduce func, + bool keepdims = false) { auto ndim = data->shape.size(); CHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor"; auto axis_val = detail::GetConstIntValues(axis, "axis"); @@ -227,9 +227,9 @@ using FIdentity = std::function(std::vector types)>; * * \return A reducer function which creates a reduce expression over an axis. 
*/ -FCommReduce MakeCommReducer(FCombine fcombine, - FIdentity fidentity, - std::string name = "reduce") { +inline FCommReduce MakeCommReducer(FCombine fcombine, + FIdentity fidentity, + std::string name = "reduce") { return [fcombine, fidentity, &name] (Array exprs, const Array& axis, Expr* condition) { Array lhs, rhs; @@ -277,7 +277,7 @@ inline Expr MaxOp(Expr source, Array axis) { * * \return A Tensor whose op member is the sum operation */ -Tensor sum(const Tensor& data, Array axis, bool keepdims = false) { +inline Tensor sum(const Tensor& data, Array axis, bool keepdims = false) { return CommReduce(data, axis, tvm::sum, keepdims); } @@ -294,7 +294,7 @@ Tensor sum(const Tensor& data, Array axis, bool keepdims = false) { * * \return A Tensor whose op member is the min operation */ -Tensor min(const Tensor& data, Array axis, bool keepdims = false) { +inline Tensor min(const Tensor& data, Array axis, bool keepdims = false) { return CommReduce(data, axis, MinOp, keepdims); } @@ -311,7 +311,7 @@ Tensor min(const Tensor& data, Array axis, bool keepdims = false) { * * \return A Tensor whose op member is the max operation */ -Tensor max(const Tensor& data, Array axis, bool keepdims = false) { // NOLINT(*) +inline Tensor max(const Tensor& data, Array axis, bool keepdims = false) { // NOLINT(*) return CommReduce(data, axis, MaxOp, keepdims); } @@ -328,7 +328,7 @@ Tensor max(const Tensor& data, Array axis, bool keepdims = false) { // NO * * \return A Tensor whose op member is the argmin operation */ -Tensor argmin(const Tensor& data, Array axis, bool keepdims = false) { +inline Tensor argmin(const Tensor& data, Array axis, bool keepdims = false) { auto fcombine = [](Array lhs, Array rhs) { Array result; result.push_back(tvm::select(lhs[1] <= rhs[1], lhs[0], rhs[0])); // idx @@ -358,7 +358,7 @@ Tensor argmin(const Tensor& data, Array axis, bool keepdims = false) { * * \return A Tensor whose op member is the argmax operation */ -Tensor argmax(const Tensor& data, Array axis, bool keepdims = false) { +inline Tensor argmax(const Tensor& data, Array axis, bool keepdims = false) { auto fcombine = [](Array lhs, Array rhs) { Array result; result.push_back(tvm::select(lhs[1] >= rhs[1], lhs[0], rhs[0])); // idx From 7e1aaa3c2a5b9cf7528e4e640af60e541b8b86c7 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 20 Feb 2018 16:02:50 -0800 Subject: [PATCH 159/948] [RUNTIME] Support nop (#913) --- src/runtime/graph/graph_runtime.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 0c642577da59..34d6b3af155a 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -564,6 +564,9 @@ std::function GraphRuntime::CreateTVMOp( t->shape = &(arg_ptr->shape_data[i]); } } + if (param.func_name == "__nop") { + return [](){}; + } // get compiled function from module. 
tvm::runtime::PackedFunc pf = module_.GetFunction(param.func_name, false); CHECK(pf != nullptr) << "no such function in module: " << param.func_name; From e4c2af9abdcb3c7aabafba8084414d7739c17c4c Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 20 Feb 2018 16:56:59 -0800 Subject: [PATCH 160/948] [RUNTIME] More reliable runtime only detection (#914) * [RUNTIME] More reliable runtime only detection * fix lint --- python/tvm/_ffi/base.py | 4 ++++ python/tvm/build_module.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/tvm/_ffi/base.py b/python/tvm/_ffi/base.py index b17043c2911d..49348f3110ad 100644 --- a/python/tvm/_ffi/base.py +++ b/python/tvm/_ffi/base.py @@ -41,6 +41,10 @@ def _load_lib(): __version__ = libinfo.__version__ # library instance of nnvm _LIB, _LIB_NAME = _load_lib() + +# Whether we are runtime only +_RUNTIME_ONLY = "runtime" in _LIB_NAME + # The FFI mode of TVM _FFI_MODE = os.environ.get("TVM_FFI", "auto") diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index 9c442a07425d..86d150c08e5f 100755 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -6,9 +6,9 @@ from __future__ import absolute_import as _abs import warnings import types -import os from ._ffi.node import NodeBase, register_node +from ._ffi.base import _RUNTIME_ONLY from . import api from . import tensor from . import schedule @@ -226,7 +226,7 @@ def build_config(**kwargs): setattr(config, k, kwargs[k]) return config -if not os.environ.get("TVM_USE_RUNTIME_LIB", False): +if not _RUNTIME_ONLY: # BuildConfig is not available in tvm_runtime BuildConfig.current = build_config() From 0951915ace479af2e28c1cada37bfd56e680acae Mon Sep 17 00:00:00 2001 From: libing4752 Date: Fri, 23 Feb 2018 01:28:23 +0800 Subject: [PATCH 161/948] [SCHEDULE] Add factor_axis to rfactor (#895) --- include/tvm/schedule.h | 4 ++- python/tvm/schedule.py | 6 ++-- src/api/api_lang.cc | 2 +- src/schedule/schedule_dataflow_rewrite.cc | 30 +++++++++++++++----- tests/python/integration/test_reduce.py | 31 +++++++++++++++++++++ tests/python/unittest/test_lang_schedule.py | 10 +++++++ 6 files changed, 72 insertions(+), 11 deletions(-) diff --git a/include/tvm/schedule.h b/include/tvm/schedule.h index 51e27a9e94bf..568af8252f4a 100644 --- a/include/tvm/schedule.h +++ b/include/tvm/schedule.h @@ -313,10 +313,12 @@ class Schedule : public NodeRef { * * \param tensor The tensor to be factored. * \param axis The reduction axis in tensor's schedule to be factored. + * \param factor_axis The position where the new axis is placed. * \return The created factored tensors. */ EXPORT Array rfactor(const Tensor& tensor, - const IterVar& axis); + const IterVar& axis, + int factor_axis = 0); /*! * \brief Normalize the schedule. * This is needed before bound inference. diff --git a/python/tvm/schedule.py b/python/tvm/schedule.py index dda5f67d1b89..b04945292adf 100644 --- a/python/tvm/schedule.py +++ b/python/tvm/schedule.py @@ -279,7 +279,7 @@ def cache_write(self, tensor, scope): """ return _api_internal._ScheduleCacheWrite(self, tensor, scope) - def rfactor(self, tensor, axis): + def rfactor(self, tensor, axis, factor_axis=0): """ Factor a reduction axis in tensor's schedule to be an explicit axis. This will create a new stage that generated the new tensor with axis @@ -292,13 +292,15 @@ def rfactor(self, tensor, axis): The tensor to be factored. axis : IterVar The reduction axis in the schedule to be factored. + factor_axis : int + The position where the new axis is placed. 
Returns ------- tfactor : Tensor or Array of Tensor The created factored tensor. """ - factored = _api_internal._ScheduleRFactor(self, tensor, axis) + factored = _api_internal._ScheduleRFactor(self, tensor, axis, factor_axis) return factored[0] if len(factored) == 1 else factored diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc index de388cf0b51f..d1994340702d 100644 --- a/src/api/api_lang.cc +++ b/src/api/api_lang.cc @@ -432,7 +432,7 @@ TVM_REGISTER_API("_ScheduleCacheWrite") TVM_REGISTER_API("_ScheduleRFactor") .set_body([](TVMArgs args, TVMRetValue* ret) { *ret = args[0].operator Schedule() - .rfactor(args[1], args[2]); + .rfactor(args[1], args[2], args[3]); }); TVM_REGISTER_API("_CommReducerCombine") diff --git a/src/schedule/schedule_dataflow_rewrite.cc b/src/schedule/schedule_dataflow_rewrite.cc index 59d425287be0..562eff417dd2 100644 --- a/src/schedule/schedule_dataflow_rewrite.cc +++ b/src/schedule/schedule_dataflow_rewrite.cc @@ -395,7 +395,8 @@ Schedule Schedule::normalize() { // Handle reduction factor. Array Schedule::rfactor(const Tensor& tensor, - const IterVar& axis) { + const IterVar& axis, + int factor_axis) { (*this)->InvalidateCache(); using ir::Reduce; CHECK_EQ(axis->iter_type, kCommReduce) @@ -448,6 +449,9 @@ Array Schedule::rfactor(const Tensor& tensor, reduce_stage, dom_map, value_map, true, skip_bound_check); // Get the factored op node. + const int factor_axis_pos = \ + factor_axis >= 0 ? factor_axis : static_cast(compute_op->axis.size() + 1) + factor_axis; + CHECK_LE(factor_axis_pos, compute_op->axis.size()); auto n = std::make_shared(); n->name = compute_op->name + ".rf"; { @@ -458,10 +462,16 @@ Array Schedule::rfactor(const Tensor& tensor, << "Can only factor reduction domain starting from 0"; iv_node->var = axis->var; iv_node->iter_type = kDataPar; - n->axis.push_back(IterVar(iv_node)); - for (IterVar iv : compute_op->axis) { - n->axis.push_back(iv); + const int size = compute_op->axis.size(); + for (int idx = 0; idx < size; ++idx) { + if (factor_axis_pos == idx) { + n->axis.push_back(IterVar(iv_node)); + } + n->axis.push_back(compute_op->axis[idx]); + } + if (factor_axis_pos == size) { + n->axis.push_back(IterVar(iv_node)); } } // predicate generation, copy not touched axis. @@ -548,9 +558,15 @@ Array Schedule::rfactor(const Tensor& tensor, Array repl_tensors = compute(old_tensors[0]->shape, [&](const Array& i) { Array indices; - indices.push_back(repl_red_axis->var); - for (Var v : i) { - indices.push_back(v); + const int idx_size = static_cast(i.size()); + for (int idx = 0; idx < idx_size; ++idx) { + if (factor_axis_pos == idx) { + indices.push_back(repl_red_axis->var); + } + indices.push_back(i[idx]); + } + if (factor_axis_pos == idx_size) { + indices.push_back(repl_red_axis->var); } Array factor_exprs; for (int idx = 0; idx < size; ++idx) { diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py index 228786a7de61..c8fb98746bf6 100644 --- a/tests/python/integration/test_reduce.py +++ b/tests/python/integration/test_reduce.py @@ -83,6 +83,36 @@ def check_target(target="llvm"): check_target() +def test_rfactor_factor_axis(): + n = tvm.convert(1027) + A = tvm.placeholder((n,), name='A') + k = tvm.reduce_axis((0, n)) + B = tvm.compute((1,), lambda i: tvm.sum(A[k], axis=k), name='B') + # schedule + s = tvm.create_schedule(B.op) + kf, ki = s[B].split(k, nparts=4) + BF = s.rfactor(B, kf, 1) + s[BF].parallel(BF.op.axis[0]) + # one line to build the function. 
+ def check_target(target="llvm"): + if not tvm.module.enabled(target): + return + ctx = tvm.cpu(0) + fapi = tvm.lower(s, args=[A, B]) + fsum = tvm.build(fapi, + target=target, + name="mysum") + # launch the kernel. + n = 1027 + a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) + b = tvm.nd.array(np.zeros(1, dtype=B.dtype), ctx) + fsum(a, b) + res = np.sum(a.asnumpy(), axis=0) + np.testing.assert_allclose( + b.asnumpy(), res, rtol=1e-4) + + check_target() + def test_rfactor_threads(): nn = 1027 @@ -294,6 +324,7 @@ def check_target(device): if __name__ == "__main__": test_rfactor_elemwise_threads() test_rfactor_threads() + test_rfactor_factor_axis() test_rfactor() test_reduce_prims() test_argmax() diff --git a/tests/python/unittest/test_lang_schedule.py b/tests/python/unittest/test_lang_schedule.py index 6c29f1067632..b29ebec180df 100644 --- a/tests/python/unittest/test_lang_schedule.py +++ b/tests/python/unittest/test_lang_schedule.py @@ -137,6 +137,16 @@ def test_rfactor(): assert(BF.op.body[0].axis[0] == k2) assert(BF.op.body[0].axis[1].var == ko.var) assert(s[B].op.body[0].axis[0].dom.extent.value == 4) + # schedule with factor_axis + s = tvm.create_schedule(B.op) + ko, ki = s[B].split(k1, factor=4) + xo, xi = s[B].split(B.op.axis[0], factor=8) + BF = s.rfactor(B, ki, 1) + assert(n == BF.shape[0]) + assert(BF.shape[1].value == 4) + assert(BF.op.body[0].axis[0] == k2) + assert(BF.op.body[0].axis[1].var == ko.var) + assert(s[B].op.body[0].axis[0].dom.extent.value == 4) def test_tensor_intrin(): n = 16 From 4cd2352171c145fe5eeb9b54cfa9e8d262f2133b Mon Sep 17 00:00:00 2001 From: xqdan Date: Fri, 23 Feb 2018 01:30:52 +0800 Subject: [PATCH 162/948] [PASS] Support buffer reuse for different types (#891) [PASS] Support buffer reuse for different types --- src/pass/storage_rewrite.cc | 28 ++++--- .../unittest/test_pass_storage_rewrite.py | 80 ++++++++++++++++++- 2 files changed, 94 insertions(+), 14 deletions(-) diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc index 452b1a7b6abf..ac3ca2561e01 100644 --- a/src/pass/storage_rewrite.cc +++ b/src/pass/storage_rewrite.cc @@ -502,7 +502,6 @@ class StoragePlanRewriter : public IRMutator { } // Remap the index Expr RemapIndex(Type dtype, Expr index, StorageEntry* e) { - CHECK_EQ(dtype.element_of(), e->elem_type); if (e->bits_offset == 0) return index; uint64_t elem_bits = dtype.bits() * dtype.lanes(); CHECK_EQ(e->bits_offset % elem_bits, 0U); @@ -564,17 +563,22 @@ class StoragePlanRewriter : public IRMutator { Expr combo_size; for (const Allocate* op : e->allocs) { Expr sz = arith::ComputeReduce(op->extents, make_const(Int(32), 1)); - if (alloc_type.lanes() != op->type.lanes()) { - sz = (sz * make_const(sz.type(), op->type.lanes()) + - make_const(sz.type(), alloc_type.lanes() - 1)) / - make_const(sz.type(), alloc_type.lanes()); - } + // transform to bits + auto sz_nbits = sz * (op->type.bits() * op->type.lanes()); if (combo_size.defined()) { - combo_size = max(combo_size, sz); + combo_size = max(combo_size, sz_nbits); } else { - combo_size = sz; + combo_size = sz_nbits; } } + // transform to alloc bytes + auto type_bits = alloc_type.bits() * alloc_type.lanes(); + bool divided = can_prove(combo_size % type_bits == 0); + combo_size = combo_size / type_bits; + // round up for can not divided + if (!divided) { + combo_size += make_const(Int(32), 1); + } combo_size = ir::Simplify(combo_size); e->new_alloc = Allocate::make( e->alloc_var, alloc_type, {combo_size}, const_true(), @@ -784,8 +788,9 @@ class 
StoragePlanRewriter : public IRMutator { // skip plan for local variable, // compiler can do a better job with register allocation. const uint64_t match_range = 16; + uint64_t op_elem_bits = op->type.bits() * op->type.lanes(); uint64_t const_nbits = static_cast( - op->constant_allocation_size() * op->type.bits() * op->type.lanes()); + op->constant_allocation_size() * op_elem_bits); // disable reuse of small arrays, they will be lowered to registers in LLVM // This rules only apply if we are using non special memory if (scope.tag.length() == 0) { @@ -801,15 +806,18 @@ class StoragePlanRewriter : public IRMutator { auto begin = const_free_map_.lower_bound(const_nbits / match_range); auto mid = const_free_map_.lower_bound(const_nbits); auto end = const_free_map_.upper_bound(const_nbits * match_range); + // start looking at the buffer that is bigger than the required size first for (auto it = mid; it != end; ++it) { StorageEntry *e = it->second; if (e->attach_scope_ != attach_scope) continue; if (e->scope != scope) continue; - if (e->elem_type != op->type.element_of()) continue; + // when not divided, no reuse, eg, float4 vs float3 + if (e->bits_offset % op_elem_bits != 0) continue; e->const_nbits = std::max(const_nbits, e->const_nbits); const_free_map_.erase(it); return e; } + // then start looking at smaller buffers. for (auto it = mid; it != begin;) { --it; StorageEntry *e = it->second; diff --git a/tests/python/unittest/test_pass_storage_rewrite.py b/tests/python/unittest/test_pass_storage_rewrite.py index c9999f312300..9613a61b9b39 100644 --- a/tests/python/unittest/test_pass_storage_rewrite.py +++ b/tests/python/unittest/test_pass_storage_rewrite.py @@ -54,10 +54,27 @@ def stmt_generater(dtype_list, length): ib = tvm.ir_builder.create() base_dtype = dtype_list[0] global_a = tvm.placeholder((length,), name = "global_a", dtype = base_dtype) - for index, dtype in enumerate(dtype_list): - with ib.for_range(0, length, name="j") as j: - A = ib.allocate(dtype, length, name="A_" + str(index), scope="local.L0A") - A[j] = tvm.const(1, dtype = dtype) + assert len(dtype_list) == 4 + with ib.for_range(0, length, name="j") as j: + dtype = dtype_list[0] + A = ib.allocate(dtype, length, name="A", scope="local.L0A") + A[j] = tvm.const(1, dtype = dtype) + with ib.for_range(0, length, name="j") as j: + dtype = dtype_list[1] + B = ib.allocate(dtype, length, name="B", scope="local.L0A") + B[j] = tvm.const(1, dtype = dtype) + with ib.for_range(0, length, name="j") as j: + dtype = dtype_list[2] + C = ib.allocate(dtype, length, name="C", scope="local.L0A") + C[j] = tvm.const(1, dtype = dtype) + with ib.for_range(0, length, name="j") as j: + dtype = dtype_list[3] + D = ib.allocate(dtype, length, name="D", scope="local.L0A") + D[j] = tvm.const(1, dtype = dtype) + with ib.for_range(0, length, name="j") as j: + dtype = "int8" + E = ib.allocate(dtype, length, name="E", scope="local.L0A") + E[j] = A[j].astype(dtype) + B[j].astype(dtype) + C[j].astype(dtype) + D[j].astype(dtype) return ib.get() def dtype_bit_len(dtype): @@ -342,6 +359,58 @@ def verify(n): assert n.extents[0].value == 70 tvm.ir_pass.PostOrderVisit(stmt, verify) +def test_alloc_seq_type(): + ib = tvm.ir_builder.create() + n = tvm.var("n") + with ib.for_range(0, n, name="i") as i: + with ib.for_range(0, 10, name="j") as j: + A = ib.allocate("float32", 200, name="A", scope="local.L0A") + A1 = ib.allocate("float32", 200, name="A1", scope="local.L0A") + A[j] = 1.2 + A1[j] = 1.3 + B = ib.allocate("int16", 200, name="B", scope="local.L0A") + B[j] = 
tvm.const(1, "int16") + C = ib.allocate("int16", 200, name="C", scope="local.L0A") + C[j] = tvm.const(1, "int16") + D = ib.allocate("int16", 200, name="D", scope="local.L0A") + D[j] = B[j] + C[j] + A2 = ib.allocate("float32", 200, name="A2", scope="local.L0A") + A2[j] = A[j] + + body = ib.get() + body = tvm.ir_pass.StorageRewrite(body) + num_alloc = [0] + def verify(n): + if isinstance(n, tvm.stmt.Allocate): + num_alloc[0] += 1 + assert n.extents[0].value == 500 + tvm.ir_pass.PostOrderVisit(body, verify) + assert num_alloc[0] == 1 + +def test_alloc_seq_type2(): + ib = tvm.ir_builder.create() + n = tvm.var("n") + with ib.for_range(0, n, name="i") as i: + with ib.for_range(0, 10, name="j") as j: + A = ib.allocate("float32", 200, name="A", scope="local.L0A") + A[j] = 1.2 + with ib.for_range(0, 20, name="j") as j: + B = ib.allocate("int16", 400, name="B", scope="local.L0A") + B[j] = tvm.const(1, "int16") + with ib.for_range(0, 10, name="j") as j: + C = ib.allocate("float32", 200, name="C", scope="local.L0A") + C[j] = 1.2 + + body = ib.get() + body = tvm.ir_pass.StorageRewrite(body) + num_alloc = [0] + def verify(n): + if isinstance(n, tvm.stmt.Allocate): + num_alloc[0] += 1 + assert n.extents[0].value == 200 + tvm.ir_pass.PostOrderVisit(body, verify) + assert num_alloc[0] == 1 + if __name__ == "__main__": test_alloc_seq() test_alloc_different_dtypes() @@ -352,3 +421,6 @@ def verify(n): test_storage_share_gpu() test_inplace_rule2() test_inplace_rule3() + test_alloc_seq_type() + test_alloc_seq_type2() + From 12d15704d7f5d30cff7540f1fd16be64c6baca68 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 22 Feb 2018 18:26:32 -0800 Subject: [PATCH 163/948] [RUNTIME] Refactor extension type handling, now it is header only (#924) * [RUNTIME] Refactor extension type handling, now it is header only --- apps/extension/src/tvm_ext.cc | 20 +++- apps/extension/tests/test_ext.py | 6 ++ include/tvm/runtime/c_runtime_api.h | 18 ++++ include/tvm/runtime/module.h | 12 ++- include/tvm/runtime/packed_func.h | 122 ++++++++++++----------- python/tvm/_ffi/function.py | 25 +++++ python/tvm/api.py | 2 +- python/tvm/build_module.py | 32 +++--- src/runtime/c_runtime_api.cc | 8 ++ src/runtime/module.cc | 13 --- src/runtime/registry.cc | 29 ------ tests/cpp/packed_func_test.cc | 3 - tests/scripts/task_python_integration.sh | 1 + topi/src/topi.cc | 2 - 14 files changed, 166 insertions(+), 127 deletions(-) diff --git a/apps/extension/src/tvm_ext.cc b/apps/extension/src/tvm_ext.cc index 6d7f4bdf7533..8b086863f7ed 100644 --- a/apps/extension/src/tvm_ext.cc +++ b/apps/extension/src/tvm_ext.cc @@ -22,13 +22,10 @@ struct extension_class_info { } // namespace tvm } // namespace runtime - -namespace tvm_ext { - using namespace tvm; using namespace tvm::runtime; -TVM_REGISTER_EXT_TYPE(IntVector); +namespace tvm_ext { TVM_REGISTER_GLOBAL("tvm_ext.ivec_create") .set_body([](TVMArgs args, TVMRetValue *rv) { @@ -66,3 +63,18 @@ TVM_REGISTER_GLOBAL("device_api.ext_dev") *rv = (*tvm::runtime::Registry::Get("device_api.cpu"))(); }); } // namespace tvm_ext + +// This callback approach allows extension allows tvm to extract +// This way can be helpful when we want to use a header only +// minimum version of TVM Runtime. 
+extern "C" int TVMExtDeclare(TVMFunctionHandle pregister) { + const PackedFunc& fregister = + *static_cast(pregister); + auto mul = [](TVMArgs args, TVMRetValue *rv) { + int x = args[0]; + int y = args[1]; + *rv = x * y; + }; + fregister("mul", PackedFunc(mul)); + return 0; +} diff --git a/apps/extension/tests/test_ext.py b/apps/extension/tests/test_ext.py index 0bbfff14eeef..628602f0baea 100644 --- a/apps/extension/tests/test_ext.py +++ b/apps/extension/tests/test_ext.py @@ -44,8 +44,14 @@ def ivec_cb(v2): tvm.convert(ivec_cb)(ivec) +def test_extract_ext(): + fdict = tvm.extract_ext_funcs(tvm_ext._LIB.TVMExtDeclare) + assert fdict["mul"](3, 4) == 12 + + if __name__ == "__main__": test_ext_dev() test_ext_vec() test_bind_add() test_sym_add() + test_extract_ext() diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index edade00c7ed3..e4a06b39d04e 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -24,6 +24,13 @@ #define TVM_EXTERN_C #endif +// Macros to do weak linking +#ifdef _MSC_VER +#define TVM_WEAK __declspec(selectany) +#else +#define TVM_WEAK __attribute__((weak)) +#endif + #ifdef __EMSCRIPTEN__ #include #define TVM_DLL EMSCRIPTEN_KEEPALIVE @@ -313,6 +320,17 @@ typedef int (*TVMPackedCFunc)( */ typedef void (*TVMPackedCFuncFinalizer)(void* resource_handle); +/*! + * \brief Signature for extension function declarer. + * + * TVM call this function to get the extension functions + * The declarer will call register_func to register function and their name. + * + * \param resource_func_handle The register function + * \return 0 if success, -1 if failure happens + */ +typedef int (*TVMExtensionFuncDeclarer)(TVMFunctionHandle register_func_handle); + /*! * \brief Wrap a TVMPackedCFunc to become a FunctionHandle. * diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h index 3d0991034c41..f8e5069f56c5 100644 --- a/include/tvm/runtime/module.h +++ b/include/tvm/runtime/module.h @@ -38,8 +38,14 @@ class Module { * \param query_imports Whether also query dependency modules. * \return The result function. * This function will return PackedFunc(nullptr) if function do not exist. + * \note Implemented in packed_func.cc */ - TVM_DLL PackedFunc GetFunction(const std::string& name, bool query_imports = false); + inline PackedFunc GetFunction(const std::string& name, bool query_imports = false); + /*! \return internal container */ + inline ModuleNode* operator->(); + /*! \return internal container */ + inline const ModuleNode* operator->() const; + // The following functions requires link with runtime. /*! * \brief Import another module into this module. * \param other The module to be imported. @@ -57,10 +63,6 @@ class Module { */ TVM_DLL static Module LoadFromFile(const std::string& file_name, const std::string& format = ""); - /*! \return internal container */ - inline ModuleNode* operator->(); - /*! \return internal container */ - inline const ModuleNode* operator->() const; private: std::shared_ptr node_; diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index b01e662b97ab..ca2e020ba703 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -183,31 +183,17 @@ struct extension_class_info { }; /*! - * \brief Runtime function table about extension type. + * \brief Capsule structure holding extension types + * Capsule is self-contained and include + * all the information to clone and destroy the type. 
*/ -class ExtTypeVTable { - public: +struct TVMExtTypeCapsule { + /*! \brief The pointer to the object */ + void* ptr; /*! \brief function to be called to delete a handle */ void (*destroy)(void* handle); /*! \brief function to be called when clone a handle */ void* (*clone)(void* handle); - /*! - * \brief Register type - * \tparam T The type to be register. - * \return The registered vtable. - */ - template - static inline ExtTypeVTable* Register_(); - /*! - * \brief Get a vtable based on type code. - * \param type_code The type code - * \return The registered vtable. - */ - TVM_DLL static ExtTypeVTable* Get(int type_code); - - private: - // Internal registration function. - TVM_DLL static ExtTypeVTable* RegisterInternal(int type_code, const ExtTypeVTable& vt); }; /*! @@ -255,8 +241,9 @@ class TVMPODValue_ { } template const TExtension& AsExtension() const { - CHECK_LT(type_code_, kExtEnd); - return static_cast(value_.v_handle)[0]; + CHECK_EQ(type_code_, extension_class_info::code); + return static_cast( + static_cast(value_.v_handle)->ptr)[0]; } int type_code() const { return type_code_; @@ -488,14 +475,6 @@ class TVMRetValue : public TVMPODValue_ { this->Assign(other); return *this; } - template::code != 0>::type> - TVMRetValue& operator=(const T& other) { - this->SwitchToClass( - extension_class_info::code, other); - return *this; - } /*! * \brief Move the value back to front-end via C API. * This marks the current container as null. @@ -521,6 +500,11 @@ class TVMRetValue : public TVMPODValue_ { type_code_ != kStr) << "TVMRetValue.value can only be used for POD data"; return value_; } + // assign extension + template::code != 0>::type> + inline TVMRetValue& operator=(const T& other); // NodeRef related extenstions: in tvm/packed_func_ext.h templateClear(); - type_code_ = other.type_code(); - value_.v_handle = - (*(ExtTypeVTable::Get(other.type_code())->clone))( - other.value().v_handle); + TVMExtTypeCapsule cap = *other.template ptr(); + cap.ptr = cap.clone(cap.ptr); + SwitchToClass(other.type_code(), cap); } break; } @@ -600,7 +582,9 @@ class TVMRetValue : public TVMPODValue_ { case kNodeHandle: delete ptr >(); break; } if (type_code_ > kExtBegin) { - (*(ExtTypeVTable::Get(type_code_)->destroy))(value_.v_handle); + TVMExtTypeCapsule *cap = ptr(); + cap->destroy(cap->ptr); + delete cap; } type_code_ = kNull; } @@ -716,8 +700,10 @@ inline void for_each(const F& f, Args&&... args) { // NOLINT(*) /* \brief argument settter to PackedFunc */ class TVMArgsSetter { public: - TVMArgsSetter(TVMValue* values, int* type_codes) - : values_(values), type_codes_(type_codes) {} + TVMArgsSetter(TVMValue* values, + int* type_codes, + TVMExtTypeCapsule* exts) + : values_(values), type_codes_(type_codes), exts_(exts) {} // setters for POD types template inline TVMRetValue PackedFunc::operator()(Args&& ...args) const { const int kNumArgs = sizeof...(Args); + // Compiler will remove an static array when it is not touched. const int kArraySize = kNumArgs > 0 ? kNumArgs : 1; TVMValue values[kArraySize]; int type_codes[kArraySize]; - detail::for_each(TVMArgsSetter(values, type_codes), + // If the function call does not contain extension type, + // exts will get optimized away by compiler. 
+ TVMExtTypeCapsule exts[kArraySize]; + detail::for_each(TVMArgsSetter(values, type_codes, exts), std::forward(args)...); TVMRetValue rv; body_(TVMArgs(values, type_codes, kNumArgs), &rv); @@ -853,14 +845,6 @@ inline TVMRetValue::operator T() const { ::Apply(this); } -template -inline void TVMArgsSetter::operator()(size_t i, const T& value) const { - static_assert(extension_class_info::code != 0, - "Need to have extesion code"); - type_codes_[i] = extension_class_info::code; - values_[i].v_handle = const_cast(&value); -} - // extension type handling template struct ExtTypeInfo { @@ -872,16 +856,42 @@ struct ExtTypeInfo { } }; -template -inline ExtTypeVTable* ExtTypeVTable::Register_() { - const int code = extension_class_info::code; - static_assert(code != 0, - "require extension_class_info traits to be declared with non-zero code"); - ExtTypeVTable vt; - vt.clone = ExtTypeInfo::clone; - vt.destroy = ExtTypeInfo::destroy; - return ExtTypeVTable::RegisterInternal(code, vt); +template +inline TVMRetValue& TVMRetValue::operator=(const T& other) { + TVMExtTypeCapsule cap; + cap.clone = ExtTypeInfo::clone; + cap.destroy = ExtTypeInfo::destroy; + cap.ptr = new T(other); + SwitchToClass( + extension_class_info::code, cap); + return *this; +} + +template +inline void TVMArgsSetter::operator()(size_t i, const T& value) const { + static_assert(extension_class_info::code != 0, + "Need to have extesion code"); + type_codes_[i] = extension_class_info::code; + exts_[i].clone = ExtTypeInfo::clone; + exts_[i].destroy = ExtTypeInfo::destroy; + exts_[i].ptr = const_cast(&value); + values_[i].v_handle = &exts_[i]; } + +// Implement Module::GetFunction +// Put implementation in this file so we have seen the PackedFunc +inline PackedFunc Module::GetFunction(const std::string& name, bool query_imports) { + PackedFunc pf = node_->GetFunction(name, node_); + if (pf != nullptr) return pf; + if (query_imports) { + for (const Module& m : node_->imports_) { + pf = m.node_->GetFunction(name, m.node_); + if (pf != nullptr) return pf; + } + } + return pf; +} + } // namespace runtime } // namespace tvm #endif // TVM_RUNTIME_PACKED_FUNC_H_ diff --git a/python/tvm/_ffi/function.py b/python/tvm/_ffi/function.py index 2edb355fb721..526d972f6d28 100644 --- a/python/tvm/_ffi/function.py +++ b/python/tvm/_ffi/function.py @@ -234,6 +234,31 @@ def list_global_func_names(): return fnames +def extract_ext_funcs(finit): + """ + Extract the extension PackedFuncs from a C module. + + Parameters + ---------- + finit : ctypes function + a ctypes that takes signature of TVMExtensionDeclarer + + Returns + ------- + fdict : dict of str to Function + The extracted functions + """ + fdict = {} + def _list(name, func): + fdict[name] = func + myf = convert_to_tvm_func(_list) + ret = finit(myf.handle) + _ = myf + if ret != 0: + raise RuntimeError("cannot initialize with %s" % finit) + return fdict + + def _get_api(f): flocal = f flocal.is_global = True diff --git a/python/tvm/api.py b/python/tvm/api.py index 7c90b0ec9481..66c154bc9f00 100644 --- a/python/tvm/api.py +++ b/python/tvm/api.py @@ -8,7 +8,7 @@ from ._ffi.node import register_node, NodeBase from ._ffi.node import convert_to_node as _convert_to_node from ._ffi.function import Function -from ._ffi.function import _init_api, register_func, get_global_func +from ._ffi.function import _init_api, register_func, get_global_func, extract_ext_funcs from ._ffi.function import convert_to_tvm_func as _convert_tvm_func from ._ffi.runtime_ctypes import TVMType from . 
import _api_internal diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index 86d150c08e5f..d868e2e0df12 100755 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -23,16 +23,16 @@ from . import make class DumpIR(object): - """Dump IR for each pass. - With it, you can dump ir just like gcc/llvm. - - How to use: - ----------- - .. code-block:: python + """ + Dump IR for each pass. + With it, you can dump ir just like gcc/llvm. - with tvm.build_config(dump_pass_ir=True) - run() + How to use: + ----------- + .. code-block:: python + with tvm.build_config(dump_pass_ir=True) + run() """ scope_level = 0 def __init__(self): @@ -40,9 +40,9 @@ def __init__(self): self._recover_list = [] def decorate(self, func): - ''' decorate the pass function''' + """ decorate the pass function""" def dump(*args, **kwargs): - '''dump function''' + """dump function""" retv = func(*args, **kwargs) if not isinstance(retv, (_stmt.Stmt, container.LoweredFunc, container.Array)): return retv @@ -59,7 +59,7 @@ def dump(*args, **kwargs): return dump def decorate_irpass(self): - '''decorate ir_pass and ScheduleOps''' + """decorate ir_pass and ScheduleOps""" self._old_sgpass = schedule.ScheduleOps schedule.ScheduleOps = self.decorate(schedule.ScheduleOps) vset = vars(ir_pass) @@ -71,7 +71,7 @@ def recover(): vset[k] = self.decorate(v) if isinstance(v, types.FunctionType) else v def decorate_custompass(self): - ''' decorate add_lower_pass pass in BuildConfig''' + """ decorate add_lower_pass pass in BuildConfig""" cfg = BuildConfig.current self._old_custom_pass = cfg.add_lower_pass custom_pass = cfg.add_lower_pass if cfg.add_lower_pass else [] @@ -79,7 +79,7 @@ def decorate_custompass(self): BuildConfig.current.add_lower_pass = pass_list def enter(self): - '''only decorate outermost nest''' + """only decorate outermost nest""" if DumpIR.scope_level > 0: return self.decorate_irpass() @@ -88,7 +88,7 @@ def enter(self): DumpIR.scope_level += 1 def exit(self): - '''recover outermost nest''' + """recover outermost nest""" if DumpIR.scope_level > 1: return # recover decorated functions @@ -163,6 +163,7 @@ def __setattr__(self, name, value): "'%s' object cannot set attribute '%s'" % (str(type(self)), name)) return super(BuildConfig, self).__setattr__(name, value) + def build_config(**kwargs): """Configure the build behavior by setting config variables. 
@@ -226,6 +227,7 @@ def build_config(**kwargs): setattr(config, k, kwargs[k]) return config + if not _RUNTIME_ONLY: # BuildConfig is not available in tvm_runtime BuildConfig.current = build_config() @@ -352,8 +354,10 @@ def lower(sch, stmt = f(stmt) if simple_mode: return stmt + return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func) + def build(sch, args=None, target=None, diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index 5e3b3e8034ce..9a7005f2e6c7 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -347,6 +347,14 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func, API_END(); } +int TVMExtTypeFree(void* handle, int type_code) { + API_BEGIN(); + TVMExtTypeCapsule* cap = static_cast(handle); + cap->destroy(cap->ptr); + delete cap; + API_END(); +} + int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, diff --git a/src/runtime/module.cc b/src/runtime/module.cc index c0796a61a20d..d5ece65606b9 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -13,19 +13,6 @@ namespace tvm { namespace runtime { -PackedFunc Module::GetFunction( - const std::string& name, bool query_imports) { - PackedFunc pf = node_->GetFunction(name, node_); - if (pf != nullptr) return pf; - if (query_imports) { - for (const Module& m : node_->imports_) { - pf = m.node_->GetFunction(name, m.node_); - if (pf != nullptr) return pf; - } - } - return pf; -} - void Module::Import(Module other) { // specially handle rpc if (!std::strcmp((*this)->type_key(), "rpc")) { diff --git a/src/runtime/registry.cc b/src/runtime/registry.cc index d7587b6ce1a5..563731a606cf 100644 --- a/src/runtime/registry.cc +++ b/src/runtime/registry.cc @@ -22,15 +22,10 @@ struct Registry::Manager { // and the resource can become invalid because of indeterminstic order of destruction. // The resources will only be recycled during program exit. std::unordered_map fmap; - // vtable for extension type - std::array ext_vtable; // mutex std::mutex mutex; Manager() { - for (auto& x : ext_vtable) { - x.destroy = nullptr; - } } static Manager* Global() { @@ -88,24 +83,6 @@ std::vector Registry::ListNames() { return keys; } -ExtTypeVTable* ExtTypeVTable::Get(int type_code) { - CHECK(type_code > kExtBegin && type_code < kExtEnd); - Registry::Manager* m = Registry::Manager::Global(); - ExtTypeVTable* vt = &(m->ext_vtable[type_code]); - CHECK(vt->destroy != nullptr) - << "Extension type not registered"; - return vt; -} - -ExtTypeVTable* ExtTypeVTable::RegisterInternal( - int type_code, const ExtTypeVTable& vt) { - CHECK(type_code > kExtBegin && type_code < kExtEnd); - Registry::Manager* m = Registry::Manager::Global(); - std::lock_guard(m->mutex); - ExtTypeVTable* pvt = &(m->ext_vtable[type_code]); - pvt[0] = vt; - return pvt; -} } // namespace runtime } // namespace tvm @@ -120,12 +97,6 @@ struct TVMFuncThreadLocalEntry { /*! \brief Thread local store that can be used to hold return values. 
*/ typedef dmlc::ThreadLocalStore TVMFuncThreadLocalStore; -int TVMExtTypeFree(void* handle, int type_code) { - API_BEGIN(); - tvm::runtime::ExtTypeVTable::Get(type_code)->destroy(handle); - API_END(); -} - int TVMFuncRegisterGlobal( const char* name, TVMFunctionHandle f, int override) { API_BEGIN(); diff --git a/tests/cpp/packed_func_test.cc b/tests/cpp/packed_func_test.cc index 00e428f258a9..8771a04e5940 100644 --- a/tests/cpp/packed_func_test.cc +++ b/tests/cpp/packed_func_test.cc @@ -126,9 +126,6 @@ struct extension_class_info { } // runtime } // tvm -// do registration, this need to be in cc file -TVM_REGISTER_EXT_TYPE(test::IntVector); - TEST(PackedFunc, ExtensionType) { using namespace tvm; using namespace tvm::runtime; diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index d10c9a6b127f..7cdade714aa5 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -6,6 +6,7 @@ rm -rf python/tvm/*.pyc python/tvm/*/*.pyc # Test TVM make cython || exit -1 +make cython3 || exit -1 # Test extern package package cd apps/extension diff --git a/topi/src/topi.cc b/topi/src/topi.cc index d6b67c74bacc..1d73a8fc68dd 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -54,8 +54,6 @@ namespace topi { using namespace tvm; using namespace tvm::runtime; -TVM_REGISTER_EXT_TYPE(tvm::Target); - /*! \brief Canonicalize an argument that may be Array or int to Array */ Array ArrayOrInt(TVMArgValue arg) { if (arg.type_code() == kDLInt || arg.type_code() == kDLUInt) { From 3895d3ce258b9c68eb737e2745faca52e53e8ee8 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 22 Feb 2018 20:18:08 -0800 Subject: [PATCH 164/948] Revert "[RUNTIME] Refactor extension type handling, now it is header only (#924)" (#925) This reverts commit 12d15704d7f5d30cff7540f1fd16be64c6baca68. --- apps/extension/src/tvm_ext.cc | 20 +--- apps/extension/tests/test_ext.py | 6 -- include/tvm/runtime/c_runtime_api.h | 18 ---- include/tvm/runtime/module.h | 12 +-- include/tvm/runtime/packed_func.h | 122 +++++++++++------------ python/tvm/_ffi/function.py | 25 ----- python/tvm/api.py | 2 +- python/tvm/build_module.py | 32 +++--- src/runtime/c_runtime_api.cc | 8 -- src/runtime/module.cc | 13 +++ src/runtime/registry.cc | 29 ++++++ tests/cpp/packed_func_test.cc | 3 + tests/scripts/task_python_integration.sh | 1 - topi/src/topi.cc | 2 + 14 files changed, 127 insertions(+), 166 deletions(-) diff --git a/apps/extension/src/tvm_ext.cc b/apps/extension/src/tvm_ext.cc index 8b086863f7ed..6d7f4bdf7533 100644 --- a/apps/extension/src/tvm_ext.cc +++ b/apps/extension/src/tvm_ext.cc @@ -22,10 +22,13 @@ struct extension_class_info { } // namespace tvm } // namespace runtime + +namespace tvm_ext { + using namespace tvm; using namespace tvm::runtime; -namespace tvm_ext { +TVM_REGISTER_EXT_TYPE(IntVector); TVM_REGISTER_GLOBAL("tvm_ext.ivec_create") .set_body([](TVMArgs args, TVMRetValue *rv) { @@ -63,18 +66,3 @@ TVM_REGISTER_GLOBAL("device_api.ext_dev") *rv = (*tvm::runtime::Registry::Get("device_api.cpu"))(); }); } // namespace tvm_ext - -// This callback approach allows extension allows tvm to extract -// This way can be helpful when we want to use a header only -// minimum version of TVM Runtime. 
-extern "C" int TVMExtDeclare(TVMFunctionHandle pregister) { - const PackedFunc& fregister = - *static_cast(pregister); - auto mul = [](TVMArgs args, TVMRetValue *rv) { - int x = args[0]; - int y = args[1]; - *rv = x * y; - }; - fregister("mul", PackedFunc(mul)); - return 0; -} diff --git a/apps/extension/tests/test_ext.py b/apps/extension/tests/test_ext.py index 628602f0baea..0bbfff14eeef 100644 --- a/apps/extension/tests/test_ext.py +++ b/apps/extension/tests/test_ext.py @@ -44,14 +44,8 @@ def ivec_cb(v2): tvm.convert(ivec_cb)(ivec) -def test_extract_ext(): - fdict = tvm.extract_ext_funcs(tvm_ext._LIB.TVMExtDeclare) - assert fdict["mul"](3, 4) == 12 - - if __name__ == "__main__": test_ext_dev() test_ext_vec() test_bind_add() test_sym_add() - test_extract_ext() diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index e4a06b39d04e..edade00c7ed3 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -24,13 +24,6 @@ #define TVM_EXTERN_C #endif -// Macros to do weak linking -#ifdef _MSC_VER -#define TVM_WEAK __declspec(selectany) -#else -#define TVM_WEAK __attribute__((weak)) -#endif - #ifdef __EMSCRIPTEN__ #include #define TVM_DLL EMSCRIPTEN_KEEPALIVE @@ -320,17 +313,6 @@ typedef int (*TVMPackedCFunc)( */ typedef void (*TVMPackedCFuncFinalizer)(void* resource_handle); -/*! - * \brief Signature for extension function declarer. - * - * TVM call this function to get the extension functions - * The declarer will call register_func to register function and their name. - * - * \param resource_func_handle The register function - * \return 0 if success, -1 if failure happens - */ -typedef int (*TVMExtensionFuncDeclarer)(TVMFunctionHandle register_func_handle); - /*! * \brief Wrap a TVMPackedCFunc to become a FunctionHandle. * diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h index f8e5069f56c5..3d0991034c41 100644 --- a/include/tvm/runtime/module.h +++ b/include/tvm/runtime/module.h @@ -38,14 +38,8 @@ class Module { * \param query_imports Whether also query dependency modules. * \return The result function. * This function will return PackedFunc(nullptr) if function do not exist. - * \note Implemented in packed_func.cc */ - inline PackedFunc GetFunction(const std::string& name, bool query_imports = false); - /*! \return internal container */ - inline ModuleNode* operator->(); - /*! \return internal container */ - inline const ModuleNode* operator->() const; - // The following functions requires link with runtime. + TVM_DLL PackedFunc GetFunction(const std::string& name, bool query_imports = false); /*! * \brief Import another module into this module. * \param other The module to be imported. @@ -63,6 +57,10 @@ class Module { */ TVM_DLL static Module LoadFromFile(const std::string& file_name, const std::string& format = ""); + /*! \return internal container */ + inline ModuleNode* operator->(); + /*! \return internal container */ + inline const ModuleNode* operator->() const; private: std::shared_ptr node_; diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index ca2e020ba703..b01e662b97ab 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -183,17 +183,31 @@ struct extension_class_info { }; /*! - * \brief Capsule structure holding extension types - * Capsule is self-contained and include - * all the information to clone and destroy the type. + * \brief Runtime function table about extension type. */ -struct TVMExtTypeCapsule { - /*! 
\brief The pointer to the object */ - void* ptr; +class ExtTypeVTable { + public: /*! \brief function to be called to delete a handle */ void (*destroy)(void* handle); /*! \brief function to be called when clone a handle */ void* (*clone)(void* handle); + /*! + * \brief Register type + * \tparam T The type to be register. + * \return The registered vtable. + */ + template + static inline ExtTypeVTable* Register_(); + /*! + * \brief Get a vtable based on type code. + * \param type_code The type code + * \return The registered vtable. + */ + TVM_DLL static ExtTypeVTable* Get(int type_code); + + private: + // Internal registration function. + TVM_DLL static ExtTypeVTable* RegisterInternal(int type_code, const ExtTypeVTable& vt); }; /*! @@ -241,9 +255,8 @@ class TVMPODValue_ { } template const TExtension& AsExtension() const { - CHECK_EQ(type_code_, extension_class_info::code); - return static_cast( - static_cast(value_.v_handle)->ptr)[0]; + CHECK_LT(type_code_, kExtEnd); + return static_cast(value_.v_handle)[0]; } int type_code() const { return type_code_; @@ -475,6 +488,14 @@ class TVMRetValue : public TVMPODValue_ { this->Assign(other); return *this; } + template::code != 0>::type> + TVMRetValue& operator=(const T& other) { + this->SwitchToClass( + extension_class_info::code, other); + return *this; + } /*! * \brief Move the value back to front-end via C API. * This marks the current container as null. @@ -500,11 +521,6 @@ class TVMRetValue : public TVMPODValue_ { type_code_ != kStr) << "TVMRetValue.value can only be used for POD data"; return value_; } - // assign extension - template::code != 0>::type> - inline TVMRetValue& operator=(const T& other); // NodeRef related extenstions: in tvm/packed_func_ext.h template(); - cap.ptr = cap.clone(cap.ptr); - SwitchToClass(other.type_code(), cap); + this->Clear(); + type_code_ = other.type_code(); + value_.v_handle = + (*(ExtTypeVTable::Get(other.type_code())->clone))( + other.value().v_handle); } break; } @@ -582,9 +600,7 @@ class TVMRetValue : public TVMPODValue_ { case kNodeHandle: delete ptr >(); break; } if (type_code_ > kExtBegin) { - TVMExtTypeCapsule *cap = ptr(); - cap->destroy(cap->ptr); - delete cap; + (*(ExtTypeVTable::Get(type_code_)->destroy))(value_.v_handle); } type_code_ = kNull; } @@ -700,10 +716,8 @@ inline void for_each(const F& f, Args&&... args) { // NOLINT(*) /* \brief argument settter to PackedFunc */ class TVMArgsSetter { public: - TVMArgsSetter(TVMValue* values, - int* type_codes, - TVMExtTypeCapsule* exts) - : values_(values), type_codes_(type_codes), exts_(exts) {} + TVMArgsSetter(TVMValue* values, int* type_codes) + : values_(values), type_codes_(type_codes) {} // setters for POD types template inline TVMRetValue PackedFunc::operator()(Args&& ...args) const { const int kNumArgs = sizeof...(Args); - // Compiler will remove an static array when it is not touched. const int kArraySize = kNumArgs > 0 ? kNumArgs : 1; TVMValue values[kArraySize]; int type_codes[kArraySize]; - // If the function call does not contain extension type, - // exts will get optimized away by compiler. 
- TVMExtTypeCapsule exts[kArraySize]; - detail::for_each(TVMArgsSetter(values, type_codes, exts), + detail::for_each(TVMArgsSetter(values, type_codes), std::forward(args)...); TVMRetValue rv; body_(TVMArgs(values, type_codes, kNumArgs), &rv); @@ -845,6 +853,14 @@ inline TVMRetValue::operator T() const { ::Apply(this); } +template +inline void TVMArgsSetter::operator()(size_t i, const T& value) const { + static_assert(extension_class_info::code != 0, + "Need to have extesion code"); + type_codes_[i] = extension_class_info::code; + values_[i].v_handle = const_cast(&value); +} + // extension type handling template struct ExtTypeInfo { @@ -856,42 +872,16 @@ struct ExtTypeInfo { } }; -template -inline TVMRetValue& TVMRetValue::operator=(const T& other) { - TVMExtTypeCapsule cap; - cap.clone = ExtTypeInfo::clone; - cap.destroy = ExtTypeInfo::destroy; - cap.ptr = new T(other); - SwitchToClass( - extension_class_info::code, cap); - return *this; -} - -template -inline void TVMArgsSetter::operator()(size_t i, const T& value) const { - static_assert(extension_class_info::code != 0, - "Need to have extesion code"); - type_codes_[i] = extension_class_info::code; - exts_[i].clone = ExtTypeInfo::clone; - exts_[i].destroy = ExtTypeInfo::destroy; - exts_[i].ptr = const_cast(&value); - values_[i].v_handle = &exts_[i]; -} - -// Implement Module::GetFunction -// Put implementation in this file so we have seen the PackedFunc -inline PackedFunc Module::GetFunction(const std::string& name, bool query_imports) { - PackedFunc pf = node_->GetFunction(name, node_); - if (pf != nullptr) return pf; - if (query_imports) { - for (const Module& m : node_->imports_) { - pf = m.node_->GetFunction(name, m.node_); - if (pf != nullptr) return pf; - } - } - return pf; +template +inline ExtTypeVTable* ExtTypeVTable::Register_() { + const int code = extension_class_info::code; + static_assert(code != 0, + "require extension_class_info traits to be declared with non-zero code"); + ExtTypeVTable vt; + vt.clone = ExtTypeInfo::clone; + vt.destroy = ExtTypeInfo::destroy; + return ExtTypeVTable::RegisterInternal(code, vt); } - } // namespace runtime } // namespace tvm #endif // TVM_RUNTIME_PACKED_FUNC_H_ diff --git a/python/tvm/_ffi/function.py b/python/tvm/_ffi/function.py index 526d972f6d28..2edb355fb721 100644 --- a/python/tvm/_ffi/function.py +++ b/python/tvm/_ffi/function.py @@ -234,31 +234,6 @@ def list_global_func_names(): return fnames -def extract_ext_funcs(finit): - """ - Extract the extension PackedFuncs from a C module. 
- - Parameters - ---------- - finit : ctypes function - a ctypes that takes signature of TVMExtensionDeclarer - - Returns - ------- - fdict : dict of str to Function - The extracted functions - """ - fdict = {} - def _list(name, func): - fdict[name] = func - myf = convert_to_tvm_func(_list) - ret = finit(myf.handle) - _ = myf - if ret != 0: - raise RuntimeError("cannot initialize with %s" % finit) - return fdict - - def _get_api(f): flocal = f flocal.is_global = True diff --git a/python/tvm/api.py b/python/tvm/api.py index 66c154bc9f00..7c90b0ec9481 100644 --- a/python/tvm/api.py +++ b/python/tvm/api.py @@ -8,7 +8,7 @@ from ._ffi.node import register_node, NodeBase from ._ffi.node import convert_to_node as _convert_to_node from ._ffi.function import Function -from ._ffi.function import _init_api, register_func, get_global_func, extract_ext_funcs +from ._ffi.function import _init_api, register_func, get_global_func from ._ffi.function import convert_to_tvm_func as _convert_tvm_func from ._ffi.runtime_ctypes import TVMType from . import _api_internal diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index d868e2e0df12..86d150c08e5f 100755 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -23,16 +23,16 @@ from . import make class DumpIR(object): - """ - Dump IR for each pass. - With it, you can dump ir just like gcc/llvm. + """Dump IR for each pass. + With it, you can dump ir just like gcc/llvm. + + How to use: + ----------- + .. code-block:: python - How to use: - ----------- - .. code-block:: python + with tvm.build_config(dump_pass_ir=True) + run() - with tvm.build_config(dump_pass_ir=True) - run() """ scope_level = 0 def __init__(self): @@ -40,9 +40,9 @@ def __init__(self): self._recover_list = [] def decorate(self, func): - """ decorate the pass function""" + ''' decorate the pass function''' def dump(*args, **kwargs): - """dump function""" + '''dump function''' retv = func(*args, **kwargs) if not isinstance(retv, (_stmt.Stmt, container.LoweredFunc, container.Array)): return retv @@ -59,7 +59,7 @@ def dump(*args, **kwargs): return dump def decorate_irpass(self): - """decorate ir_pass and ScheduleOps""" + '''decorate ir_pass and ScheduleOps''' self._old_sgpass = schedule.ScheduleOps schedule.ScheduleOps = self.decorate(schedule.ScheduleOps) vset = vars(ir_pass) @@ -71,7 +71,7 @@ def recover(): vset[k] = self.decorate(v) if isinstance(v, types.FunctionType) else v def decorate_custompass(self): - """ decorate add_lower_pass pass in BuildConfig""" + ''' decorate add_lower_pass pass in BuildConfig''' cfg = BuildConfig.current self._old_custom_pass = cfg.add_lower_pass custom_pass = cfg.add_lower_pass if cfg.add_lower_pass else [] @@ -79,7 +79,7 @@ def decorate_custompass(self): BuildConfig.current.add_lower_pass = pass_list def enter(self): - """only decorate outermost nest""" + '''only decorate outermost nest''' if DumpIR.scope_level > 0: return self.decorate_irpass() @@ -88,7 +88,7 @@ def enter(self): DumpIR.scope_level += 1 def exit(self): - """recover outermost nest""" + '''recover outermost nest''' if DumpIR.scope_level > 1: return # recover decorated functions @@ -163,7 +163,6 @@ def __setattr__(self, name, value): "'%s' object cannot set attribute '%s'" % (str(type(self)), name)) return super(BuildConfig, self).__setattr__(name, value) - def build_config(**kwargs): """Configure the build behavior by setting config variables. 
@@ -227,7 +226,6 @@ def build_config(**kwargs): setattr(config, k, kwargs[k]) return config - if not _RUNTIME_ONLY: # BuildConfig is not available in tvm_runtime BuildConfig.current = build_config() @@ -354,10 +352,8 @@ def lower(sch, stmt = f(stmt) if simple_mode: return stmt - return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func) - def build(sch, args=None, target=None, diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index 9a7005f2e6c7..5e3b3e8034ce 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -347,14 +347,6 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func, API_END(); } -int TVMExtTypeFree(void* handle, int type_code) { - API_BEGIN(); - TVMExtTypeCapsule* cap = static_cast(handle); - cap->destroy(cap->ptr); - delete cap; - API_END(); -} - int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, diff --git a/src/runtime/module.cc b/src/runtime/module.cc index d5ece65606b9..c0796a61a20d 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -13,6 +13,19 @@ namespace tvm { namespace runtime { +PackedFunc Module::GetFunction( + const std::string& name, bool query_imports) { + PackedFunc pf = node_->GetFunction(name, node_); + if (pf != nullptr) return pf; + if (query_imports) { + for (const Module& m : node_->imports_) { + pf = m.node_->GetFunction(name, m.node_); + if (pf != nullptr) return pf; + } + } + return pf; +} + void Module::Import(Module other) { // specially handle rpc if (!std::strcmp((*this)->type_key(), "rpc")) { diff --git a/src/runtime/registry.cc b/src/runtime/registry.cc index 563731a606cf..d7587b6ce1a5 100644 --- a/src/runtime/registry.cc +++ b/src/runtime/registry.cc @@ -22,10 +22,15 @@ struct Registry::Manager { // and the resource can become invalid because of indeterminstic order of destruction. // The resources will only be recycled during program exit. std::unordered_map fmap; + // vtable for extension type + std::array ext_vtable; // mutex std::mutex mutex; Manager() { + for (auto& x : ext_vtable) { + x.destroy = nullptr; + } } static Manager* Global() { @@ -83,6 +88,24 @@ std::vector Registry::ListNames() { return keys; } +ExtTypeVTable* ExtTypeVTable::Get(int type_code) { + CHECK(type_code > kExtBegin && type_code < kExtEnd); + Registry::Manager* m = Registry::Manager::Global(); + ExtTypeVTable* vt = &(m->ext_vtable[type_code]); + CHECK(vt->destroy != nullptr) + << "Extension type not registered"; + return vt; +} + +ExtTypeVTable* ExtTypeVTable::RegisterInternal( + int type_code, const ExtTypeVTable& vt) { + CHECK(type_code > kExtBegin && type_code < kExtEnd); + Registry::Manager* m = Registry::Manager::Global(); + std::lock_guard(m->mutex); + ExtTypeVTable* pvt = &(m->ext_vtable[type_code]); + pvt[0] = vt; + return pvt; +} } // namespace runtime } // namespace tvm @@ -97,6 +120,12 @@ struct TVMFuncThreadLocalEntry { /*! \brief Thread local store that can be used to hold return values. 
*/ typedef dmlc::ThreadLocalStore TVMFuncThreadLocalStore; +int TVMExtTypeFree(void* handle, int type_code) { + API_BEGIN(); + tvm::runtime::ExtTypeVTable::Get(type_code)->destroy(handle); + API_END(); +} + int TVMFuncRegisterGlobal( const char* name, TVMFunctionHandle f, int override) { API_BEGIN(); diff --git a/tests/cpp/packed_func_test.cc b/tests/cpp/packed_func_test.cc index 8771a04e5940..00e428f258a9 100644 --- a/tests/cpp/packed_func_test.cc +++ b/tests/cpp/packed_func_test.cc @@ -126,6 +126,9 @@ struct extension_class_info { } // runtime } // tvm +// do registration, this need to be in cc file +TVM_REGISTER_EXT_TYPE(test::IntVector); + TEST(PackedFunc, ExtensionType) { using namespace tvm; using namespace tvm::runtime; diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index 7cdade714aa5..d10c9a6b127f 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -6,7 +6,6 @@ rm -rf python/tvm/*.pyc python/tvm/*/*.pyc # Test TVM make cython || exit -1 -make cython3 || exit -1 # Test extern package package cd apps/extension diff --git a/topi/src/topi.cc b/topi/src/topi.cc index 1d73a8fc68dd..d6b67c74bacc 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -54,6 +54,8 @@ namespace topi { using namespace tvm; using namespace tvm::runtime; +TVM_REGISTER_EXT_TYPE(tvm::Target); + /*! \brief Canonicalize an argument that may be Array or int to Array */ Array ArrayOrInt(TVMArgValue arg) { if (arg.type_code() == kDLInt || arg.type_code() == kDLUInt) { From 1a51fe12639a6b628484827e804ba00b8880bab6 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 23 Feb 2018 10:24:17 -0800 Subject: [PATCH 165/948] [EXT] Allow easy extraction of extern module (#926) --- apps/extension/src/tvm_ext.cc | 20 ++++++++++++++--- apps/extension/tests/test_ext.py | 6 +++++ include/tvm/runtime/c_runtime_api.h | 18 +++++++++++++++ include/tvm/runtime/module.h | 12 +++++----- include/tvm/runtime/packed_func.h | 27 +++++++++++++++++++++++ python/tvm/_ffi/function.py | 25 +++++++++++++++++++++ python/tvm/api.py | 2 +- python/tvm/build_module.py | 28 ++++++++++++------------ src/runtime/module.cc | 13 ----------- tests/scripts/task_python_integration.sh | 1 + 10 files changed, 116 insertions(+), 36 deletions(-) diff --git a/apps/extension/src/tvm_ext.cc b/apps/extension/src/tvm_ext.cc index 6d7f4bdf7533..bb8b4b694187 100644 --- a/apps/extension/src/tvm_ext.cc +++ b/apps/extension/src/tvm_ext.cc @@ -22,12 +22,11 @@ struct extension_class_info { } // namespace tvm } // namespace runtime - -namespace tvm_ext { - using namespace tvm; using namespace tvm::runtime; +namespace tvm_ext { + TVM_REGISTER_EXT_TYPE(IntVector); TVM_REGISTER_GLOBAL("tvm_ext.ivec_create") @@ -66,3 +65,18 @@ TVM_REGISTER_GLOBAL("device_api.ext_dev") *rv = (*tvm::runtime::Registry::Get("device_api.cpu"))(); }); } // namespace tvm_ext + +// This callback approach allows extension allows tvm to extract +// This way can be helpful when we want to use a header only +// minimum version of TVM Runtime. 
+extern "C" int TVMExtDeclare(TVMFunctionHandle pregister) { + const PackedFunc& fregister = + *static_cast(pregister); + auto mul = [](TVMArgs args, TVMRetValue *rv) { + int x = args[0]; + int y = args[1]; + *rv = x * y; + }; + fregister("mul", PackedFunc(mul)); + return 0; +} diff --git a/apps/extension/tests/test_ext.py b/apps/extension/tests/test_ext.py index 0bbfff14eeef..628602f0baea 100644 --- a/apps/extension/tests/test_ext.py +++ b/apps/extension/tests/test_ext.py @@ -44,8 +44,14 @@ def ivec_cb(v2): tvm.convert(ivec_cb)(ivec) +def test_extract_ext(): + fdict = tvm.extract_ext_funcs(tvm_ext._LIB.TVMExtDeclare) + assert fdict["mul"](3, 4) == 12 + + if __name__ == "__main__": test_ext_dev() test_ext_vec() test_bind_add() test_sym_add() + test_extract_ext() diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index edade00c7ed3..e4a06b39d04e 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -24,6 +24,13 @@ #define TVM_EXTERN_C #endif +// Macros to do weak linking +#ifdef _MSC_VER +#define TVM_WEAK __declspec(selectany) +#else +#define TVM_WEAK __attribute__((weak)) +#endif + #ifdef __EMSCRIPTEN__ #include #define TVM_DLL EMSCRIPTEN_KEEPALIVE @@ -313,6 +320,17 @@ typedef int (*TVMPackedCFunc)( */ typedef void (*TVMPackedCFuncFinalizer)(void* resource_handle); +/*! + * \brief Signature for extension function declarer. + * + * TVM call this function to get the extension functions + * The declarer will call register_func to register function and their name. + * + * \param resource_func_handle The register function + * \return 0 if success, -1 if failure happens + */ +typedef int (*TVMExtensionFuncDeclarer)(TVMFunctionHandle register_func_handle); + /*! * \brief Wrap a TVMPackedCFunc to become a FunctionHandle. * diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h index 3d0991034c41..f8e5069f56c5 100644 --- a/include/tvm/runtime/module.h +++ b/include/tvm/runtime/module.h @@ -38,8 +38,14 @@ class Module { * \param query_imports Whether also query dependency modules. * \return The result function. * This function will return PackedFunc(nullptr) if function do not exist. + * \note Implemented in packed_func.cc */ - TVM_DLL PackedFunc GetFunction(const std::string& name, bool query_imports = false); + inline PackedFunc GetFunction(const std::string& name, bool query_imports = false); + /*! \return internal container */ + inline ModuleNode* operator->(); + /*! \return internal container */ + inline const ModuleNode* operator->() const; + // The following functions requires link with runtime. /*! * \brief Import another module into this module. * \param other The module to be imported. @@ -57,10 +63,6 @@ class Module { */ TVM_DLL static Module LoadFromFile(const std::string& file_name, const std::string& format = ""); - /*! \return internal container */ - inline ModuleNode* operator->(); - /*! \return internal container */ - inline const ModuleNode* operator->() const; private: std::shared_ptr node_; diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index b01e662b97ab..e6cc3cddecd1 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -24,6 +24,11 @@ struct Type; struct Expr; } +// Whether use TVM runtime in header only mode. +#ifndef TVM_RUNTIME_HEADER_ONLY +#define TVM_RUNTIME_HEADER_ONLY 0 +#endif + namespace tvm { // Forward declare NodeRef and Node for extensions. 
// This header works fine without depend on NodeRef @@ -564,11 +569,15 @@ class TVMRetValue : public TVMPODValue_ { SwitchToPOD(other.type_code()); value_ = other.value_; } else { +#if TVM_RUNTIME_HEADER_ONLY + LOG(FATAL) << "Header only mode do not support ext type"; +#else this->Clear(); type_code_ = other.type_code(); value_.v_handle = (*(ExtTypeVTable::Get(other.type_code())->clone))( other.value().v_handle); +#endif } break; } @@ -600,7 +609,11 @@ class TVMRetValue : public TVMPODValue_ { case kNodeHandle: delete ptr >(); break; } if (type_code_ > kExtBegin) { +#if TVM_RUNTIME_HEADER_ONLY + LOG(FATAL) << "Header only mode do not support ext type"; +#else (*(ExtTypeVTable::Get(type_code_)->destroy))(value_.v_handle); +#endif } type_code_ = kNull; } @@ -882,6 +895,20 @@ inline ExtTypeVTable* ExtTypeVTable::Register_() { vt.destroy = ExtTypeInfo::destroy; return ExtTypeVTable::RegisterInternal(code, vt); } + +// Implement Module::GetFunction +// Put implementation in this file so we have seen the PackedFunc +inline PackedFunc Module::GetFunction(const std::string& name, bool query_imports) { + PackedFunc pf = node_->GetFunction(name, node_); + if (pf != nullptr) return pf; + if (query_imports) { + for (const Module& m : node_->imports_) { + pf = m.node_->GetFunction(name, m.node_); + if (pf != nullptr) return pf; + } + } + return pf; +} } // namespace runtime } // namespace tvm #endif // TVM_RUNTIME_PACKED_FUNC_H_ diff --git a/python/tvm/_ffi/function.py b/python/tvm/_ffi/function.py index 2edb355fb721..526d972f6d28 100644 --- a/python/tvm/_ffi/function.py +++ b/python/tvm/_ffi/function.py @@ -234,6 +234,31 @@ def list_global_func_names(): return fnames +def extract_ext_funcs(finit): + """ + Extract the extension PackedFuncs from a C module. + + Parameters + ---------- + finit : ctypes function + a ctypes that takes signature of TVMExtensionDeclarer + + Returns + ------- + fdict : dict of str to Function + The extracted functions + """ + fdict = {} + def _list(name, func): + fdict[name] = func + myf = convert_to_tvm_func(_list) + ret = finit(myf.handle) + _ = myf + if ret != 0: + raise RuntimeError("cannot initialize with %s" % finit) + return fdict + + def _get_api(f): flocal = f flocal.is_global = True diff --git a/python/tvm/api.py b/python/tvm/api.py index 7c90b0ec9481..66c154bc9f00 100644 --- a/python/tvm/api.py +++ b/python/tvm/api.py @@ -8,7 +8,7 @@ from ._ffi.node import register_node, NodeBase from ._ffi.node import convert_to_node as _convert_to_node from ._ffi.function import Function -from ._ffi.function import _init_api, register_func, get_global_func +from ._ffi.function import _init_api, register_func, get_global_func, extract_ext_funcs from ._ffi.function import convert_to_tvm_func as _convert_tvm_func from ._ffi.runtime_ctypes import TVMType from . import _api_internal diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index 86d150c08e5f..7b6fa7715a7b 100755 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -23,16 +23,16 @@ from . import make class DumpIR(object): - """Dump IR for each pass. - With it, you can dump ir just like gcc/llvm. - - How to use: - ----------- - .. code-block:: python + """ + Dump IR for each pass. + With it, you can dump ir just like gcc/llvm. - with tvm.build_config(dump_pass_ir=True) - run() + How to use: + ----------- + .. 
code-block:: python + with tvm.build_config(dump_pass_ir=True) + run() """ scope_level = 0 def __init__(self): @@ -40,9 +40,9 @@ def __init__(self): self._recover_list = [] def decorate(self, func): - ''' decorate the pass function''' + """ decorate the pass function""" def dump(*args, **kwargs): - '''dump function''' + """dump function""" retv = func(*args, **kwargs) if not isinstance(retv, (_stmt.Stmt, container.LoweredFunc, container.Array)): return retv @@ -59,7 +59,7 @@ def dump(*args, **kwargs): return dump def decorate_irpass(self): - '''decorate ir_pass and ScheduleOps''' + """decorate ir_pass and ScheduleOps""" self._old_sgpass = schedule.ScheduleOps schedule.ScheduleOps = self.decorate(schedule.ScheduleOps) vset = vars(ir_pass) @@ -71,7 +71,7 @@ def recover(): vset[k] = self.decorate(v) if isinstance(v, types.FunctionType) else v def decorate_custompass(self): - ''' decorate add_lower_pass pass in BuildConfig''' + """ decorate add_lower_pass pass in BuildConfig""" cfg = BuildConfig.current self._old_custom_pass = cfg.add_lower_pass custom_pass = cfg.add_lower_pass if cfg.add_lower_pass else [] @@ -79,7 +79,7 @@ def decorate_custompass(self): BuildConfig.current.add_lower_pass = pass_list def enter(self): - '''only decorate outermost nest''' + """only decorate outermost nest""" if DumpIR.scope_level > 0: return self.decorate_irpass() @@ -88,7 +88,7 @@ def enter(self): DumpIR.scope_level += 1 def exit(self): - '''recover outermost nest''' + """recover outermost nest""" if DumpIR.scope_level > 1: return # recover decorated functions diff --git a/src/runtime/module.cc b/src/runtime/module.cc index c0796a61a20d..d5ece65606b9 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -13,19 +13,6 @@ namespace tvm { namespace runtime { -PackedFunc Module::GetFunction( - const std::string& name, bool query_imports) { - PackedFunc pf = node_->GetFunction(name, node_); - if (pf != nullptr) return pf; - if (query_imports) { - for (const Module& m : node_->imports_) { - pf = m.node_->GetFunction(name, m.node_); - if (pf != nullptr) return pf; - } - } - return pf; -} - void Module::Import(Module other) { // specially handle rpc if (!std::strcmp((*this)->type_key(), "rpc")) { diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index d10c9a6b127f..7cdade714aa5 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -6,6 +6,7 @@ rm -rf python/tvm/*.pyc python/tvm/*/*.pyc # Test TVM make cython || exit -1 +make cython3 || exit -1 # Test extern package package cd apps/extension From d9e4ccce51dd9ade60b5d638548e42d48331e9d4 Mon Sep 17 00:00:00 2001 From: Jammy Zhou Date: Sun, 25 Feb 2018 00:34:03 +0000 Subject: [PATCH 166/948] Fix a typo for function registration (#927) --- python/tvm/target.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/target.py b/python/tvm/target.py index 5966326c71eb..ab02619f6deb 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -212,7 +212,7 @@ def _do_reg(myf): dispatch_dict[k] = myf return myf if func: - return _do_reg(myf) + return _do_reg(func) return _do_reg def dispatch_func(func, *args, **kwargs): From 30eaf463e34d7c301357c31a010945d11df16537 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 25 Feb 2018 11:45:45 -0800 Subject: [PATCH 167/948] MXNet NDArray bridge. (#930) * MXNet NDArray bridge. Support convert a tvm Function as MXNet's async NDArray function. 
* fix lint * update comment --- python/tvm/contrib/mxnet.py | 59 +++++++++++++++++++++++ src/api/api_base.cc | 5 ++ tests/python/contrib/test_mxnet_bridge.py | 48 ++++++++++++++++++ 3 files changed, 112 insertions(+) create mode 100644 python/tvm/contrib/mxnet.py create mode 100644 tests/python/contrib/test_mxnet_bridge.py diff --git a/python/tvm/contrib/mxnet.py b/python/tvm/contrib/mxnet.py new file mode 100644 index 000000000000..3a6c92f1b880 --- /dev/null +++ b/python/tvm/contrib/mxnet.py @@ -0,0 +1,59 @@ +"""MXNet bridge wrap Function MXNet's async function.""" +from __future__ import absolute_import as _abs + +from .. import api, _api_internal, ndarray +from ..module import Module + +# pylint: disable=invalid-name +_wrap_async = None + + +def to_mxnet_func(func, const_loc=None): + """Wrap a TVM function as MXNet function + + MXNet function runs asynchrously via its engine. + + Parameters + ---------- + func : Function + A TVM function that can take positional arguments + + const_loc : list of int + List of integers indicating the argument position + of read only NDArray argument. + The NDArray argument location that are not annotated + will be viewed as mutable arrays in MXNet's engine. + + Returns + ------- + async_func : Function + A function that can take MXNet NDArray as argument + in places that used to expect TVM NDArray. + Run asynchrously in MXNet's async engine. + """ + # only import mxnet when wrap get called. + # pylint: disable=import-self + import mxnet + if isinstance(func, Module): + func = func.entry_func + + def _get_bridge_func(): + """Get MXNet bridge function""" + if not mxnet.base._LIB.MXTVMBridge: + raise RuntimeError( + "MXTVMBridge not exist in mxnet package," + " please update to latest version") + + fdict = api.extract_ext_funcs(mxnet.base._LIB.MXTVMBridge) + ret = fdict["WrapAsyncCall"] + ret.is_global = True + return ret + global _wrap_async + + if _wrap_async is None: + # Register extension type in first time + _wrap_async = _get_bridge_func() + ndarray.register_extension(mxnet.nd.NDArray) + + const_loc = const_loc if const_loc else [] + return _wrap_async(func, _api_internal._TVMSetStream, len(const_loc), *const_loc) diff --git a/src/api/api_base.cc b/src/api/api_base.cc index df8469903533..cc76f6a8f50b 100644 --- a/src/api/api_base.cc +++ b/src/api/api_base.cc @@ -36,4 +36,9 @@ TVM_REGISTER_API("_load_json") TVM_REGISTER_API("_nop") .set_body([](TVMArgs args, TVMRetValue *ret) { }); + +TVM_REGISTER_API("_TVMSetStream") +.set_body([](TVMArgs args, TVMRetValue *ret) { + TVMSetStream(args[0], args[1], args[2]); + }); } // namespace tvm diff --git a/tests/python/contrib/test_mxnet_bridge.py b/tests/python/contrib/test_mxnet_bridge.py new file mode 100644 index 000000000000..2228f7305c6b --- /dev/null +++ b/tests/python/contrib/test_mxnet_bridge.py @@ -0,0 +1,48 @@ +def mxnet_check(): + """This is a simple test function for MXNet bridge + + It is not included as nosetests, because of its dependency on mxnet + + User can directly run this script to verify correctness. 
+ """ + import mxnet as mx + import topi + import tvm + import numpy as np + from tvm.contrib.mxnet import to_mxnet_func + + # build a TVM function through topi + n = 20 + shape = (20,) + scale = tvm.var("scale", dtype="float32") + x = tvm.placeholder(shape) + y = tvm.placeholder(shape) + z = topi.broadcast_add(x, y) + zz = tvm.compute(shape, lambda *i: z(*i) * scale) + + target = tvm.target.cuda() + + # build the function + with target: + s = topi.generic.schedule_injective(zz) + f = tvm.build(s, [x, y, zz, scale]) + + # get a mxnet version + mxf = to_mxnet_func(f, const_loc=[0, 1]) + + ctx = mx.gpu(0) + xx = mx.nd.uniform(shape=shape, ctx=ctx) + yy = mx.nd.uniform(shape=shape, ctx=ctx) + zz = mx.nd.empty(shape=shape, ctx=ctx) + + # invoke myf: this runs in mxnet engine + mxf(xx, yy, zz, 10.0) + mxf(xx, yy, zz, 10.0) + + + np.testing.assert_allclose( + zz.asnumpy(), (xx.asnumpy() + yy.asnumpy()) * 10) + + +if __name__ == "__main__": + mxnet_check() From 31013393255d77ab4e479e0403560d075bc2a934 Mon Sep 17 00:00:00 2001 From: Zhixun Tan Date: Mon, 26 Feb 2018 18:42:11 -0800 Subject: [PATCH 168/948] Append null terminator when converting JS string to c string. (#931) --- web/tvm_runtime.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/web/tvm_runtime.js b/web/tvm_runtime.js index c23c1f9da796..df9cba94aa3a 100644 --- a/web/tvm_runtime.js +++ b/web/tvm_runtime.js @@ -291,10 +291,11 @@ var tvm_runtime = tvm_runtime || {}; } function StringToUint8Array(str) { - var arr = new Uint8Array(str.length); + var arr = new Uint8Array(str.length + 1); for(var i = 0; i < str.length; ++i) { arr[i] = str.charCodeAt(i); } + arr[str.length] = 0; return arr; } //----------------------------------------- @@ -599,7 +600,7 @@ var tvm_runtime = tvm_runtime || {}; Module.setValue(this.value + index * SIZEOF_TVMVALUE, value, "*"); }, setString : function(index, value) { - var sdata = new CBuffer(value.length); + var sdata = new CBuffer(value.length + 1); Module.HEAPU8.set(StringToUint8Array(value), sdata.data); this.temp.push(sdata); Module.setValue(this.tcode + index * SIZEOF_INT, kStr, "i32"); From 83cc348f7a2ba3e922b5fa9d16e393b666ef5d23 Mon Sep 17 00:00:00 2001 From: nhynes Date: Tue, 27 Feb 2018 11:36:57 -0800 Subject: [PATCH 169/948] TVM SGX (#919) * Update DMLC core most recent version * Modify runtime to minimize io when building for SGX * Add SGX example app * Prefer streaming versions of packed_func function --- apps/sgx/.gitignore | 2 + apps/sgx/Makefile | 110 +++++++++++++++++++++++++ apps/sgx/README.md | 34 ++++++++ apps/sgx/app.cc | 129 ++++++++++++++++++++++++++++++ apps/sgx/enclave.cc | 62 ++++++++++++++ apps/sgx/enclave_config.xml | 11 +++ apps/sgx/enclave_private.pem | 39 +++++++++ apps/sgx/prepare_test_libs.py | 24 ++++++ apps/sgx/run_example.sh | 8 ++ apps/sgx/test_addone.edl | 7 ++ apps/sgx/tvm_runtime_pack.cc | 17 ++++ dmlc-core | 2 +- include/tvm/runtime/packed_func.h | 24 +++++- src/runtime/c_runtime_api.cc | 12 +-- src/runtime/cpu_device_api.cc | 3 + src/runtime/module.cc | 6 ++ src/runtime/module_util.cc | 6 ++ 17 files changed, 485 insertions(+), 11 deletions(-) create mode 100644 apps/sgx/.gitignore create mode 100644 apps/sgx/Makefile create mode 100644 apps/sgx/README.md create mode 100644 apps/sgx/app.cc create mode 100644 apps/sgx/enclave.cc create mode 100644 apps/sgx/enclave_config.xml create mode 100644 apps/sgx/enclave_private.pem create mode 100644 apps/sgx/prepare_test_libs.py create mode 100755 apps/sgx/run_example.sh create mode 100644 
apps/sgx/test_addone.edl create mode 100644 apps/sgx/tvm_runtime_pack.cc diff --git a/apps/sgx/.gitignore b/apps/sgx/.gitignore new file mode 100644 index 000000000000..25f8f1058cab --- /dev/null +++ b/apps/sgx/.gitignore @@ -0,0 +1,2 @@ +lib/ +bin/ diff --git a/apps/sgx/Makefile b/apps/sgx/Makefile new file mode 100644 index 000000000000..6a1eeb5b8b1e --- /dev/null +++ b/apps/sgx/Makefile @@ -0,0 +1,110 @@ +# Makefile for example to deploy TVM modules in SGX. + +TVM_ROOT := $(shell cd ../..; pwd) +NNVM_PATH := nnvm +DMLC_CORE := ${TVM_ROOT}/dmlc-core + +SGX_SDK ?= /opt/sgxsdk +SGX_MODE ?= SIM +SGX_ARCH ?= x64 +SGX_DEBUG ?= 1 + +sgx_edger8r := $(SGX_SDK)/bin/x64/sgx_edger8r +sgx_enclave_signer := $(SGX_SDK)/bin/x64/sgx_sign + +ifneq ($(SGX_MODE), HW) + sgx_sim := _sim +endif +urts_library_name := sgx_urts$(sgx_sim) +trts_library_name := sgx_trts$(sgx_sim) +tservice_library_name := sgx_tservice$(sgx_sim) +uservice_library_name := sgx_uae_service$(sgx_sim) + +pkg_cflags := -std=c++11 -O2 -fPIC\ + -I${TVM_ROOT}/include\ + -I${DMLC_CORE}/include\ + -I${TVM_ROOT}/dlpack/include\ + -I.\ + -DDMLC_LOG_STACK_TRACE=0\ + +pkg_ldflags := -L${TVM_ROOT}/lib + +enclave_include_paths := -I$(SGX_SDK)/include\ + -I$(SGX_SDK)/include/tlibc\ + -I$(SGX_SDK)/include/libcxx\ + -I$(SGX_SDK)/include/stdc++\ + +enclave_cflags := -static -nostdinc\ + -fvisibility=hidden -fpie -fstack-protector-strong\ + -ffunction-sections -fdata-sections\ + -DDMLC_CXX11_THREAD_LOCAL=0\ + $(enclave_include_paths)\ + +enclave_cxxflags := -nostdinc++ $(enclave_cflags) + +enclave_ldflags :=\ + -Wl,--no-undefined -nostdlib -nodefaultlibs -nostartfiles -L$(SGX_SDK)/lib64\ + -Wl,--whole-archive -l$(trts_library_name) -Wl,--no-whole-archive\ + -Wl,--start-group\ + -lsgx_tstdc -lsgx_tstdcxx -lsgx_tcxx -lsgx_tcrypto -lsgx_tkey_exchange -l$(tservice_library_name)\ + -Wl,--end-group\ + -Wl,-Bstatic -Wl,-Bsymbolic -Wl,--no-undefined\ + -Wl,-pie,-eenclave_entry -Wl,--export-dynamic\ + -Wl,--defsym,__ImageBase=0 -Wl,--gc-sections + +app_cflags := -I$(SGX_SDK)/include -Ilib + +app_ldflags := -L$(SGX_SDK)/lib64\ + -l$(urts_library_name) -l$(uservice_library_name) -lpthread\ + +.PHONY: clean all + +all: lib/test_addone.signed.so bin/test_addone + +# Build rule for all-in-one TVM package library +lib/tvm_runtime_pack.o: tvm_runtime_pack.cc + @mkdir -p $(@D) + $(CXX) -c $< -o $@ $(pkg_cflags) $(pkg_ldflags) $(enclave_cxxflags) -g + +# The code library built by TVM +lib/test_addone_sys.o: prepare_test_libs.py + python prepare_test_libs.py + +# EDL files +lib/test_addone_u.c: $(sgx_edger8r) test_addone.edl + $(sgx_edger8r) --untrusted test_addone.edl --untrusted-dir lib --search-path $(SGX_SDK)/include + +lib/test_addone_u.o: lib/test_addone_u.c + $(CC) $(enclave_cflags) -c $< -o $@ + +lib/test_addone_t.c: test_addone.edl + $(sgx_edger8r) --trusted $< --trusted-dir lib --search-path $(SGX_SDK)/include + +lib/test_addone_t.o: lib/test_addone_t.c + $(CC) $(enclave_cflags) -c $< -o $@ + +# The enclave library +lib/test_addone.so: enclave.cc lib/tvm_runtime_pack.o lib/test_addone_t.o lib/test_addone_sys.o + $(CXX) $^ -o $@ $(pkg_cflags) $(pkg_ldflags) $(enclave_cxxflags) $(enclave_ldflags) -g + +# The signed enclave +lib/test_addone.signed.so: lib/test_addone.so enclave_config.xml + $(sgx_enclave_signer) sign -key enclave_private.pem -enclave $< -out $@ -config enclave_config.xml + +# An app that runs the enclave +bin/test_addone: app.cc lib/test_addone_u.o + @mkdir -p $(@D) + $(CXX) $^ -o $@ $(app_cflags) $(app_ldflags) + +# Debugging runtime pack built 
without SGX (c.f. howto_deploy/tvm_runtime_pack.cc) +lib/tvm_runtime_pack_nosgx.o: tvm_runtime_pack.cc + @mkdir -p $(@D) + $(CXX) -c $< -o $@ $(pkg_cflags) $(pkg_ldflags) -g + +# Debugging binary that runs TVM without SGX +bin/addone_nosgx: enclave.cc lib/tvm_runtime_pack_nosgx.o lib/test_addone_sys.o + @mkdir -p $(@D) + $(CXX) $^ -o $@ $(pkg_cflags) $(pkg_ldflags) -g + +clean: + rm -rf lib bin diff --git a/apps/sgx/README.md b/apps/sgx/README.md new file mode 100644 index 000000000000..c041c2b6c94b --- /dev/null +++ b/apps/sgx/README.md @@ -0,0 +1,34 @@ +# TVM in Intel SGX Example + +This application demonstrates the use of a simple TVM model in the [Intel SGX](https://software.intel.com/en-us/blogs/2013/09/26/protecting-application-secrets-with-intel-sgx) trusted computing environment. + +## Prerequisites + +1. A GNU/Linux environment +2. TVM compiled with LLVM and the `tvm` Python module +3. The [Linux SGX SDK](https://github.com/intel/linux-sgx) [link to pre-built libraries](https://01.org/intel-software-guard-extensions/downloads) + +## Running the example + +`SGX_SDK=/path/to/sgxsdk bash run_example.sh` + +If everything goes well, you should see a lot of build messages and below them +the text `It works!`. + +## High-level overview + +First of all, it helps to think of an SGX enclave as a library that can be called +to perform trusted computation. +In this library, one can use other libraries like TVM. + +Building this example performs the following steps: + +1. Creates a simple TVM module that computes `x + 1` and save it as a system library. +2. Builds a minimal TVM runtime pack that can load the module. +3. Links the TVM module into an SGX enclave along with some code that runs the module. +4. Compiles and runs an executable that loads the enclave and calls a function + which invokes the TVM module. + +For more information on building, please refer to the `Makefile`. +For more information on the TVM module, please refer to `../howto_deploy`. +For more in formation on SGX enclaves, please refer to the [SGX Enclave Demo](https://github.com/intel/linux-sgx/tree/master/SampleCode/SampleEnclave/) diff --git a/apps/sgx/app.cc b/apps/sgx/app.cc new file mode 100644 index 000000000000..1516e8b4e925 --- /dev/null +++ b/apps/sgx/app.cc @@ -0,0 +1,129 @@ +#include + +#include "sgx_urts.h" +#include "sgx_eid.h" +#include "test_addone_u.h" + +#define TOKEN_FILENAME "bin/test_addone.token" +#define ENCLAVE_FILENAME "lib/test_addone.signed.so" + +sgx_enclave_id_t global_eid = 0; // global EID shared by multiple threads + +typedef struct _sgx_errlist_t { + sgx_status_t err; + const char *msg; +} sgx_errlist_t; + +/* Error code returned by sgx_create_enclave */ +static sgx_errlist_t sgx_errlist[] = { + { SGX_ERROR_DEVICE_BUSY, "SGX device was busy." }, + { SGX_ERROR_ENCLAVE_FILE_ACCESS, "Can't open enclave file." }, + { SGX_ERROR_ENCLAVE_LOST, "Power transition occurred." }, + { SGX_ERROR_INVALID_ATTRIBUTE, "Enclave was not authorized." }, + { SGX_ERROR_INVALID_ENCLAVE, "Invalid enclave image." }, + { SGX_ERROR_INVALID_ENCLAVE_ID, "Invalid enclave identification." }, + { SGX_ERROR_INVALID_METADATA, "Invalid enclave metadata." }, + { SGX_ERROR_INVALID_PARAMETER, "Invalid parameter." }, + { SGX_ERROR_INVALID_SIGNATURE, "Invalid enclave signature." }, + { SGX_ERROR_INVALID_VERSION, "Enclave version was invalid." }, + { SGX_ERROR_MEMORY_MAP_CONFLICT, "Memory map conflicted." }, + { SGX_ERROR_NO_DEVICE, "Invalid SGX device." }, + { SGX_ERROR_OUT_OF_EPC, "Out of EPC memory." 
}, + { SGX_ERROR_OUT_OF_MEMORY, "Out of memory." }, + { SGX_ERROR_UNEXPECTED, "Unexpected error occurred." }, +}; + +/* Check error conditions for loading enclave */ +void print_error_message(sgx_status_t status) +{ + size_t idx = 0; + size_t ttl = sizeof sgx_errlist/sizeof sgx_errlist[0]; + + for (idx = 0; idx < ttl; idx++) { + if(status == sgx_errlist[idx].err) { + printf("Error: %s\n", sgx_errlist[idx].msg); + break; + } + } + + if (idx == ttl) + printf("Error code is 0x%X. Please refer to the \"Intel SGX SDK Developer Reference\" for more details.\n", status); +} + +/* Initialize the enclave: + * Step 1: try to retrieve the launch token saved by last transaction + * Step 2: call sgx_create_enclave to initialize an enclave instance + * Step 3: save the launch token if it is updated + */ +int initialize_enclave(void) +{ + sgx_launch_token_t token = {0}; + sgx_status_t sgx_status = SGX_ERROR_UNEXPECTED; + int updated = 0; + + /* Step 1: try to retrieve the launch token saved by last transaction + * if there is no token, then create a new one. + */ + FILE *fp = fopen(TOKEN_FILENAME, "rb"); + if (fp == NULL && (fp = fopen(TOKEN_FILENAME, "wb")) == NULL) { + printf("Warning: Failed to create/open the launch token file \"%s\".\n", TOKEN_FILENAME); + return -1; + } + + /* read the token from saved file */ + size_t read_num = fread(token, 1, sizeof(sgx_launch_token_t), fp); + if (read_num != 0 && read_num != sizeof(sgx_launch_token_t)) { + /* if token is invalid, clear the buffer */ + memset(&token, 0x0, sizeof(sgx_launch_token_t)); + printf("Warning: Invalid launch token read from \"%s\".\n", TOKEN_FILENAME); + } + + /* Step 2: call sgx_create_enclave to initialize an enclave instance */ + /* Debug Support: set 2nd parameter to 1 */ + sgx_status = sgx_create_enclave(ENCLAVE_FILENAME, SGX_DEBUG_FLAG, &token, &updated, &global_eid, NULL); + if (sgx_status != SGX_SUCCESS) { + print_error_message(sgx_status); + if (fp != NULL) fclose(fp); + return -1; + } + + /* Step 3: save the launch token if it is updated */ + if (updated == 0 || fp == NULL) { + /* if the token is not updated, or file handler is invalid, do not perform saving */ + if (fp != NULL) fclose(fp); + return 0; + } + + /* reopen the file with write capablity */ + fp = freopen(TOKEN_FILENAME, "wb", fp); + if (fp == NULL) return 0; + size_t write_num = fwrite(token, 1, sizeof(sgx_launch_token_t), fp); + if (write_num != sizeof(sgx_launch_token_t)) + printf("Warning: Failed to save launch token to \"%s\".\n", TOKEN_FILENAME); + fclose(fp); + return 0; +} + +int SGX_CDECL main(int argc, char *argv[]) { + if(initialize_enclave() < 0){ + printf("Failed to initialize enclave.\n"); + return -1; + } + + /* Run TVM within the enclave */ + int addone_status; + sgx_status_t sgx_status = SGX_ERROR_UNEXPECTED; + sgx_status = enclave_main(global_eid, &addone_status); + if (sgx_status != SGX_SUCCESS) { + print_error_message(sgx_status); + } + + sgx_destroy_enclave(global_eid); + + if (addone_status == 1) { + printf("It works!"); + return 0; + } + printf("It doesn't work."); + return -1; +} diff --git a/apps/sgx/enclave.cc b/apps/sgx/enclave.cc new file mode 100644 index 000000000000..7588455543e9 --- /dev/null +++ b/apps/sgx/enclave.cc @@ -0,0 +1,62 @@ +#include +#include +#include +#include +#ifndef _LIBCPP_SGX_CONFIG +#include +#endif + +/* This function mirrors the one in howto_deploy except without the iostream */ +int Verify(tvm::runtime::Module mod, std::string fname) { + // Get the function from the module. 
+ tvm::runtime::PackedFunc f = mod.GetFunction(fname); + + // Allocate the DLPack data structures. + DLTensor* x; + DLTensor* y; + int ndim = 1; + int dtype_code = kDLFloat; + int dtype_bits = 32; + int dtype_lanes = 1; + int device_type = kDLCPU; + int device_id = 0; + int64_t shape[1] = {10}; + TVMArrayAlloc(shape, ndim, dtype_code, dtype_bits, dtype_lanes, + device_type, device_id, &x); + TVMArrayAlloc(shape, ndim, dtype_code, dtype_bits, dtype_lanes, + device_type, device_id, &y); + for (int i = 0; i < shape[0]; ++i) { + static_cast(x->data)[i] = i; + } + + // Invoke the function + f(x, y); + + // check the output + bool all_eq = true; + for (int i = 0; i < shape[0]; ++i) { + all_eq = all_eq && static_cast(y->data)[i] == i + 1.0f; + } + + return all_eq; +} + + +extern "C" { +int enclave_main() { + tvm::runtime::Module mod_syslib = (*tvm::runtime::Registry::Get("module._GetSystemLib"))(); + return Verify(mod_syslib, "addonesys"); +} +} + +#ifndef _LIBCPP_SGX_CONFIG +int main(void) { + tvm::runtime::Module mod_syslib = (*tvm::runtime::Registry::Get("module._GetSystemLib"))(); + if (Verify(mod_syslib, "addonesys")) { + std::cout << "It works!" << std::endl; + return 0; + } + std::cerr << "It doesn't work." << std::endl; + return -1; +} +#endif diff --git a/apps/sgx/enclave_config.xml b/apps/sgx/enclave_config.xml new file mode 100644 index 000000000000..d24da1882981 --- /dev/null +++ b/apps/sgx/enclave_config.xml @@ -0,0 +1,11 @@ + + 0 + 0 + 0x2000 + 0x1000 + 1 + 1 + 0 + 0 + 0xFFFFFFFF + diff --git a/apps/sgx/enclave_private.pem b/apps/sgx/enclave_private.pem new file mode 100644 index 000000000000..529d07be3574 --- /dev/null +++ b/apps/sgx/enclave_private.pem @@ -0,0 +1,39 @@ +-----BEGIN RSA PRIVATE KEY----- +MIIG4gIBAAKCAYEAroOogvsj/fZDZY8XFdkl6dJmky0lRvnWMmpeH41Bla6U1qLZ +AmZuyIF+mQC/cgojIsrBMzBxb1kKqzATF4+XwPwgKz7fmiddmHyYz2WDJfAjIveJ +ZjdMjM4+EytGlkkJ52T8V8ds0/L2qKexJ+NBLxkeQLfV8n1mIk7zX7jguwbCG1Pr +nEMdJ3Sew20vnje+RsngAzdPChoJpVsWi/K7cettX/tbnre1DL02GXc5qJoQYk7b +3zkmhz31TgFrd9VVtmUGyFXAysuSAb3EN+5VnHGr0xKkeg8utErea2FNtNIgua8H +ONfm9Eiyaav1SVKzPHlyqLtcdxH3I8Wg7yqMsaprZ1n5A1v/levxnL8+It02KseD +5HqV4rf/cImSlCt3lpRg8U5E1pyFQ2IVEC/XTDMiI3c+AR+w2jSRB3Bwn9zJtFlW +KHG3m1xGI4ck+Lci1JvWWLXQagQSPtZTsubxTQNx1gsgZhgv1JHVZMdbVlAbbRMC +1nSuJNl7KPAS/VfzAgEDAoIBgHRXxaynbVP5gkO0ug6Qw/E27wzIw4SmjsxG6Wpe +K7kfDeRskKxESdsA/xCrKkwGwhcx1iIgS5+Qscd1Yg+1D9X9asd/P7waPmWoZd+Z +AhlKwhdPsO7PiF3e1AzHhGQwsUTt/Y/aSI1MpHBvy2/s1h9mFCslOUxTmWw0oj/Q +ldIEgWeNR72CE2+jFIJIyml6ftnb6qzPiga8Bm48ubKh0kvySOqnkmnPzgh+JBD6 +JnBmtZbfPT97bwTT+N6rnPqOOApvfHPf15kWI8yDbprG1l4OCUaIUH1AszxLd826 +5IPM+8gINLRDP1MA6azECPjTyHXhtnSIBZCyWSVkc05vYmNXYUNiXWMajcxW9M02 +wKzFELO8NCEAkaTPxwo4SCyIjUxiK1LbQ9h8PSy4c1+gGP4LAMR8xqP4QKg6zdu9 +osUGG/xRe/uufgTBFkcjqBHtK5L5VI0jeNIUAgW/6iNbYXjBMJ0GfauLs+g1VsOm +WfdgXzsb9DYdMa0OXXHypmV4GwKBwQDUwQj8RKJ6c8cT4vcWCoJvJF00+RFL+P3i +Gx2DLERxRrDa8AVGfqaCjsR+3vLgG8V/py+z+dxZYSqeB80Qeo6PDITcRKoeAYh9 +xlT3LJOS+k1cJcEmlbbO2IjLkTmzSwa80fWexKu8/Xv6vv15gpqYl1ngYoqJM3pd +vzmTIOi7MKSZ0WmEQavrZj8zK4endE3v0eAEeQ55j1GImbypSf7Idh7wOXtjZ7WD +Dg6yWDrri+AP/L3gClMj8wsAxMV4ZR8CgcEA0fzDHkFa6raVOxWnObmRoDhAtE0a +cjUj976NM5yyfdf2MrKy4/RhdTiPZ6b08/lBC/+xRfV3xKVGzacm6QjqjZrUpgHC +0LKiZaMtccCJjLtPwQd0jGQEnKfMFaPsnhOc5y8qVkCzVOSthY5qhz0XNotHHFmJ +gffVgB0iqrMTvSL7IA2yqqpOqNRlhaYhNl8TiFP3gIeMtVa9rZy31JPgT2uJ+kfo +gV7sdTPEjPWZd7OshGxWpT6QfVDj/T9T7L6tAoHBAI3WBf2DFvxNL2KXT2QHAZ9t +k3imC4f7U+wSE6zILaDZyzygA4RUbwG0gv8/TJVn2P/Eynf76DuWHGlaiLWnCbSz +Az2DHBQBBaku409zDQym3j1ugMRjzzSQWzJg0SIyBH3hTmnYcn3+Uqcp/lEBvGW6 
+O+rsXFt3pukqJmIV8HzLGGaLm62BHUeZf3dyWm+i3p/hQAL7Xvu04QW70xuGqdr5 +afV7p5eaeQIJXyGQJ0eylV/90+qxjMKiB1XYg6WYvwKBwQCL/ddpgOdHJGN8uRom +e7Zq0Csi3hGheMKlKbN3vcxT5U7MdyHtTZZOJbTvxKNNUNYH/8uD+PqDGNneb29G +BfGzvI3EASyLIcGZF3OhKwZd0jUrWk2y7Vhob91jwp2+t73vdMbkKyI4mHOuXvGv +fg95si9oO7EBT+Oqvhccd2J+F1IVXncccYnF4u5ZGWt5lLewN/pVr7MjjykeaHqN +t+rfnQam2psA6fL4zS2zTmZPzR2tnY8Y1GBTi0Ko1OKd1HMCgcAb5cB/7/AQlhP9 +yQa04PLH9ygQkKKptZp7dy5WcWRx0K/hAHRoi2aw1wZqfm7VBNu2SLcs90kCCCxp +6C5sfJi6b8NpNbIPC+sc9wsFr7pGo9SFzQ78UlcWYK2Gu2FxlMjonhka5hvo4zvg +WxlpXKEkaFt3gLd92m/dMqBrHfafH7VwOJY2zT3WIpjwuk0ZzmRg5p0pG/svVQEH +NZmwRwlopysbR69B/n1nefJ84UO50fLh5s5Zr3gBRwbWNZyzhXk= +-----END RSA PRIVATE KEY----- diff --git a/apps/sgx/prepare_test_libs.py b/apps/sgx/prepare_test_libs.py new file mode 100644 index 000000000000..1fa9d74ef1c9 --- /dev/null +++ b/apps/sgx/prepare_test_libs.py @@ -0,0 +1,24 @@ +"""Script to prepare test_addone_sys.o""" + +from os import path as osp + +import tvm + +CWD = osp.dirname(osp.abspath(osp.expanduser(__file__))) + +def prepare_test_libs(base_path): + n = tvm.var('n') + A = tvm.placeholder((n,), name='A') + B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='B') + s = tvm.create_schedule(B.op) + + # Compile library in system library mode + fadd_syslib = tvm.build(s, [A, B], 'llvm --system-lib', name='addonesys') + syslib_path = osp.join(base_path, 'test_addone_sys.o') + fadd_syslib.save(syslib_path) + +def main(): + prepare_test_libs(osp.join(CWD, 'lib')) + +if __name__ == '__main__': + main() diff --git a/apps/sgx/run_example.sh b/apps/sgx/run_example.sh new file mode 100755 index 000000000000..3658e0bafb73 --- /dev/null +++ b/apps/sgx/run_example.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +sgx_sdk=${SGX_SDK:=/opt/sgxsdk} + +mkdir -p bin lib +make +echo "=========================" +LD_LIBRARY_PATH="$sgx_sdk/lib64":${LD_LIBRARY_PATH} bin/test_addone diff --git a/apps/sgx/test_addone.edl b/apps/sgx/test_addone.edl new file mode 100644 index 000000000000..58341a727a6b --- /dev/null +++ b/apps/sgx/test_addone.edl @@ -0,0 +1,7 @@ +enclave { + from "sgx_tstdc.edl" import sgx_thread_wait_untrusted_event_ocall, sgx_thread_set_untrusted_event_ocall, sgx_thread_setwait_untrusted_events_ocall, sgx_thread_set_multiple_untrusted_events_ocall; + + trusted { + public int enclave_main(); + }; +}; diff --git a/apps/sgx/tvm_runtime_pack.cc b/apps/sgx/tvm_runtime_pack.cc new file mode 100644 index 000000000000..52aa372f7789 --- /dev/null +++ b/apps/sgx/tvm_runtime_pack.cc @@ -0,0 +1,17 @@ +/*! + * \brief This is an all in one TVM runtime file for use in an SGX enclave. + * + * The files included here will be statically linked into the enclave. + * Please refer to the Makefile (rule lib/tvm_runtime_pack.o) for how to build. 
+ * + */ +#include "../../src/runtime/c_runtime_api.cc" +#include "../../src/runtime/cpu_device_api.cc" +#include "../../src/runtime/workspace_pool.cc" +#include "../../src/runtime/module_util.cc" +#include "../../src/runtime/module.cc" +#include "../../src/runtime/registry.cc" +#include "../../src/runtime/system_lib_module.cc" +#ifndef _LIBCPP_SGX_CONFIG +#include "../../src/runtime/file_util.cc" +#endif diff --git a/dmlc-core b/dmlc-core index c0871823b518..7e84e8b036a3 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit c0871823b518093a0d04d6cba0a3291bc7b31401 +Subproject commit 7e84e8b036a3ff5c0104a3da1f4c7eebf94396ec diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index e6cc3cddecd1..8101ecdc175c 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -640,6 +640,7 @@ inline const char* TypeCode2Str(int type_code) { } } +#ifndef _LIBCPP_SGX_NO_IOSTREAMS inline std::ostream& operator<<(std::ostream& os, TVMType t) { // NOLINT(*) os << TypeCode2Str(t.code); if (t.code == kHandle) return os; @@ -649,11 +650,23 @@ inline std::ostream& operator<<(std::ostream& os, TVMType t) { // NOLINT(*) } return os; } +#endif inline std::string TVMType2String(TVMType t) { +#ifndef _LIBCPP_SGX_NO_IOSTREAMS std::ostringstream os; os << t; return os.str(); +#else + std::string repr = ""; + repr += TypeCode2Str(t.code); + if (t.code == kHandle) return repr; + repr += std::to_string(static_cast(t.bits)); + if (t.lanes != 1) { + repr += "x" + std::to_string(static_cast(t.lanes)); + } + return repr; +#endif } inline TVMType String2TVMType(std::string s) { @@ -674,10 +687,13 @@ inline TVMType String2TVMType(std::string s) { scan = s.c_str(); LOG(FATAL) << "unknown type " << s; } - unsigned bits = t.bits, lanes = t.lanes; - sscanf(scan, "%ux%u", &bits, &lanes); - t.bits = static_cast(bits); - t.lanes = static_cast(lanes); + char* xdelim; // emulate sscanf("%ux%u", bits, lanes) + unsigned bits = strtoul(scan, &xdelim, 10); + if (bits != 0) t.bits = static_cast(bits); + if (*xdelim == 'x') { + unsigned lanes = strtoul(xdelim + 1, nullptr, 10); + t.lanes = static_cast(lanes); + } return t; } diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index 5e3b3e8034ce..37902b448021 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -324,9 +324,9 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func, int ret = func((TVMValue*)args.values, (int*)args.type_codes, // NOLINT(*) args.num_args, rv, resource_handle); if (ret != 0) { - std::ostringstream os; - os << "TVMCall CFunc Error:\n" << TVMGetLastError(); - throw dmlc::Error(os.str()); + std::string err = "TVMCall CFunc Error:\n"; + err += TVMGetLastError(); + throw dmlc::Error(err); } }); } else { @@ -338,9 +338,9 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func, int ret = func((TVMValue*)args.values, (int*)args.type_codes, // NOLINT(*) args.num_args, rv, rpack.get()); if (ret != 0) { - std::ostringstream os; - os << "TVMCall CFunc Error:\n" << TVMGetLastError(); - throw dmlc::Error(os.str()); + std::string err = "TVMCall CFunc Error:\n"; + err += TVMGetLastError(); + throw dmlc::Error(err); } }); } diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc index 7486f20a6ae1..8199988f3419 100644 --- a/src/runtime/cpu_device_api.cc +++ b/src/runtime/cpu_device_api.cc @@ -28,6 +28,9 @@ class CPUDeviceAPI final : public DeviceAPI { #if _MSC_VER ptr = _aligned_malloc(nbytes, alignment); if (ptr == nullptr) throw 
std::bad_alloc(); +#elif defined(_LIBCPP_SGX_CONFIG) + ptr = memalign(alignment, nbytes); + if (ptr == nullptr) throw std::bad_alloc(); #else int ret = posix_memalign(&ptr, alignment, nbytes); if (ret != 0) throw std::bad_alloc(); diff --git a/src/runtime/module.cc b/src/runtime/module.cc index d5ece65606b9..b3cdd9c95ba6 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -8,7 +8,9 @@ #include #include #include +#ifndef _LIBCPP_SGX_CONFIG #include "./file_util.h" +#endif namespace tvm { namespace runtime { @@ -44,6 +46,7 @@ void Module::Import(Module other) { Module Module::LoadFromFile(const std::string& file_name, const std::string& format) { +#ifndef _LIBCPP_SGX_CONFIG std::string fmt = GetFileFormat(file_name, format); CHECK(fmt.length() != 0) << "Cannot deduce format of file " << file_name; @@ -57,6 +60,9 @@ Module Module::LoadFromFile(const std::string& file_name, << load_f_name << ") is not presented."; Module m = (*f)(file_name, format); return m; +#else + LOG(FATAL) << "SGX does not support LoadFromFile"; +#endif } void ModuleNode::SaveToFile(const std::string& file_name, diff --git a/src/runtime/module_util.cc b/src/runtime/module_util.cc index 825e0e459b08..95da78d23f09 100644 --- a/src/runtime/module_util.cc +++ b/src/runtime/module_util.cc @@ -3,7 +3,9 @@ * \file module_util.cc * \brief Utilities for module. */ +#ifndef _LIBCPP_SGX_CONFIG #include +#endif #include #include #include "./module_util.h" @@ -12,6 +14,7 @@ namespace tvm { namespace runtime { void ImportModuleBlob(const char* mblob, std::vector* mlist) { +#ifndef _LIBCPP_SGX_CONFIG CHECK(mblob != nullptr); uint64_t nbytes = 0; for (size_t i = 0; i < sizeof(nbytes); ++i) { @@ -34,6 +37,9 @@ void ImportModuleBlob(const char* mblob, std::vector* mlist) { Module m = (*f)(static_cast(stream)); mlist->push_back(m); } +#else + LOG(FATAL) << "SGX does not support ImportModuleBlob"; +#endif } PackedFunc WrapPackedFunc(BackendPackedCFunc faddr, From 3921b938c2a14017c2624f149983e86a7f9a4e94 Mon Sep 17 00:00:00 2001 From: Yige Hu Date: Tue, 27 Feb 2018 17:14:58 -0600 Subject: [PATCH 170/948] Fixed a g++ explicit constructor compatibility error for unordered_set. (#935) * Fixed a g++ explicit constructor compatibility error for unordered_set. * Change std::unordered_set>() to std::unordered_set(). 
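A minimal sketch of the incompatibility this commit works around, kept separate from the patch itself; the function name below is hypothetical. Some g++ releases reject an empty braced default argument for std::unordered_set, apparently because the empty initializer list is matched against a constructor declared explicit, so the default is spelled out as an explicitly constructed empty set instead:

    #include <string>
    #include <unordered_set>

    // Accepted by the affected g++ versions: the default is an explicit, empty set.
    void ConfigureTarget(const std::unordered_set<std::string>& libs =
                             std::unordered_set<std::string>()) {
      (void)libs;  // placeholder body for the sketch
    }

    // Rejected by the same compilers:
    // void ConfigureTarget(const std::unordered_set<std::string>& libs = {});

The same substitution is applied uniformly to the Target constructor and to the helper functions in build_module.cc in the hunks below.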
--- include/tvm/build_module.h | 3 ++- src/codegen/build_module.cc | 18 ++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h index d4186e8f8167..3fb55ae169ce 100644 --- a/include/tvm/build_module.h +++ b/include/tvm/build_module.h @@ -40,7 +40,8 @@ struct Target { int thread_warp_size, const std::unordered_set& keys, const std::vector& options, - const std::unordered_set& libs = {}) : + const std::unordered_set& libs = + std::unordered_set()) : target_name(target_name), device_type(device_type), max_num_threads(max_num_threads), diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index cca09a966e21..e7f0cd41bb7b 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -85,25 +85,29 @@ namespace target { Target llvm() { std::unordered_set keys({ "llvm", "cpu" }); std::vector options; - return Target("llvm", kDLCPU, 512, 1, keys, options, {}); + return Target("llvm", kDLCPU, 512, 1, keys, options, + std::unordered_set()); } Target cuda() { std::unordered_set keys({ "cuda", "gpu" }); std::vector options; - return Target("cuda", kDLGPU, 512, 32, keys, options, {}); + return Target("cuda", kDLGPU, 512, 32, keys, options, + std::unordered_set()); } Target rocm() { std::unordered_set keys({ "rocm", "gpu" }); std::vector options; - return Target("rocm", kDLROCM, 256, 1, keys, options, {}); + return Target("rocm", kDLROCM, 256, 1, keys, options, + std::unordered_set()); } Target metal() { std::unordered_set keys({ "gpu" }); std::vector options; - return Target("metal", kDLMetal, 256, 1, keys, options, {}); + return Target("metal", kDLMetal, 256, 1, keys, options, + std::unordered_set()); } Target rasp() { @@ -114,7 +118,8 @@ Target rasp() { "-mcpu=cortex-a53", "-mattr=+neon" }); - return Target("llvm", kDLCPU, 512, 1, keys, options, {}); + return Target("llvm", kDLCPU, 512, 1, keys, options, + std::unordered_set()); } Target mali() { @@ -129,7 +134,8 @@ Target mali() { Target stackvm() { std::unordered_set keys({ "stackvm", "cpu" }); std::vector options; - return Target("stackvm", kDLCPU, 512, 1, keys, options, {}); + return Target("stackvm", kDLCPU, 512, 1, keys, options, + std::unordered_set()); } } // namespace target From 2ca47e2078b427f273ec26719ccf432a6d90d6f4 Mon Sep 17 00:00:00 2001 From: Zhixun Tan Date: Tue, 27 Feb 2018 17:08:44 -0800 Subject: [PATCH 171/948] Name all the lock guards. 
(#938) --- src/codegen/llvm/llvm_common.cc | 2 +- src/runtime/metal/metal_device_api.mm | 2 +- src/runtime/opencl/opencl_device_api.cc | 2 +- src/runtime/registry.cc | 10 +++++----- src/runtime/vulkan/vulkan_device_api.cc | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/codegen/llvm/llvm_common.cc b/src/codegen/llvm/llvm_common.cc index 03ac6f0ff227..01f2c8869dc1 100644 --- a/src/codegen/llvm/llvm_common.cc +++ b/src/codegen/llvm/llvm_common.cc @@ -24,7 +24,7 @@ struct LLVMEnv { void InitializeLLVM() { LLVMEnv* e = LLVMEnv::Global(); if (!e->all_initialized) { - std::lock_guard(e->mu); + std::lock_guard lock(e->mu); if (!e->all_initialized) { e->all_initialized = true; llvm::InitializeAllTargetInfos(); diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm index 6d376d3144ac..d87a9eac4f72 100644 --- a/src/runtime/metal/metal_device_api.mm +++ b/src/runtime/metal/metal_device_api.mm @@ -96,7 +96,7 @@ int GetWarpSize(id dev) { void MetalWorkspace::Init() { if (initialized_) return; - std::lock_guard(this->mutex); + std::lock_guard lock(this->mutex); if (initialized_) return; initialized_ = true; if (devices.size() != 0) return; diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index a07fe15f805f..4c3d206b9354 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -180,7 +180,7 @@ bool MatchPlatformInfo( void OpenCLWorkspace::Init() { if (initialized_) return; - std::lock_guard(this->mu); + std::lock_guard lock(this->mu); if (initialized_) return; initialized_ = true; if (context != nullptr) return; diff --git a/src/runtime/registry.cc b/src/runtime/registry.cc index d7587b6ce1a5..3f72828390ee 100644 --- a/src/runtime/registry.cc +++ b/src/runtime/registry.cc @@ -46,7 +46,7 @@ Registry& Registry::set_body(PackedFunc f) { // NOLINT(*) Registry& Registry::Register(const std::string& name, bool override) { // NOLINT(*) Manager* m = Manager::Global(); - std::lock_guard(m->mutex); + std::lock_guard lock(m->mutex); auto it = m->fmap.find(name); if (it == m->fmap.end()) { Registry* r = new Registry(); @@ -62,7 +62,7 @@ Registry& Registry::Register(const std::string& name, bool override) { // NOLIN bool Registry::Remove(const std::string& name) { Manager* m = Manager::Global(); - std::lock_guard(m->mutex); + std::lock_guard lock(m->mutex); auto it = m->fmap.find(name); if (it == m->fmap.end()) return false; m->fmap.erase(it); @@ -71,7 +71,7 @@ bool Registry::Remove(const std::string& name) { const PackedFunc* Registry::Get(const std::string& name) { Manager* m = Manager::Global(); - std::lock_guard(m->mutex); + std::lock_guard lock(m->mutex); auto it = m->fmap.find(name); if (it == m->fmap.end()) return nullptr; return &(it->second->func_); @@ -79,7 +79,7 @@ const PackedFunc* Registry::Get(const std::string& name) { std::vector Registry::ListNames() { Manager* m = Manager::Global(); - std::lock_guard(m->mutex); + std::lock_guard lock(m->mutex); std::vector keys; keys.reserve(m->fmap.size()); for (const auto &kv : m->fmap) { @@ -101,7 +101,7 @@ ExtTypeVTable* ExtTypeVTable::RegisterInternal( int type_code, const ExtTypeVTable& vt) { CHECK(type_code > kExtBegin && type_code < kExtEnd); Registry::Manager* m = Registry::Manager::Global(); - std::lock_guard(m->mutex); + std::lock_guard lock(m->mutex); ExtTypeVTable* pvt = &(m->ext_vtable[type_code]); pvt[0] = vt; return pvt; diff --git a/src/runtime/vulkan/vulkan_device_api.cc 
b/src/runtime/vulkan/vulkan_device_api.cc index 99b308d338f9..ef97e99431c2 100644 --- a/src/runtime/vulkan/vulkan_device_api.cc +++ b/src/runtime/vulkan/vulkan_device_api.cc @@ -650,7 +650,7 @@ std::vector GetContext(VkInstance instance) { void VulkanWorkspace::Init() { if (initialized_) return; - std::lock_guard(this->mu); + std::lock_guard lock(this->mu); if (initialized_) return; initialized_ = true; instance_ = CreateInstance(); From 9503f663b67bf0b6b7fc58db7ed05b27a859a5b2 Mon Sep 17 00:00:00 2001 From: Zhixun Tan Date: Tue, 27 Feb 2018 20:20:16 -0800 Subject: [PATCH 172/948] Add test case: Create a static WebGL library and run it in the browser. (#932) * Add test case: Create a static WebGL library and run it in the browser. * Add documentation for loadModuleFromFile * Modify emscripten.createjs --- python/tvm/contrib/emscripten.py | 2 + python/tvm/module.py | 8 +++- src/runtime/system_lib_module.cc | 22 +++++++-- tests/webgl/test_static_webgl_library.html | 55 ++++++++++++++++++++++ tests/webgl/test_static_webgl_library.py | 49 +++++++++++++++++++ web/tvm_runtime.js | 37 +++++++++++++++ 6 files changed, 169 insertions(+), 4 deletions(-) create mode 100644 tests/webgl/test_static_webgl_library.html create mode 100644 tests/webgl/test_static_webgl_library.py diff --git a/python/tvm/contrib/emscripten.py b/python/tvm/contrib/emscripten.py index d770ce1161f1..d263e472cff0 100644 --- a/python/tvm/contrib/emscripten.py +++ b/python/tvm/contrib/emscripten.py @@ -60,3 +60,5 @@ def create_js(output, msg = "Compilation error:\n" msg += out raise RuntimeError(msg) + +create_js.object_format = "bc" diff --git a/python/tvm/module.py b/python/tvm/module.py index d8b018b824f0..6459733fa15e 100644 --- a/python/tvm/module.py +++ b/python/tvm/module.py @@ -84,6 +84,8 @@ def export_library(self, fcompile : function(target, file_list, kwargs), optional Compilation function to use create dynamic library. + If fcompile has attribute object_format, will compile host library + to that format. Otherwise, will use default format "o". kwargs : dict, optiona; Additional arguments passed to fcompile @@ -95,7 +97,11 @@ def export_library(self, if self.type_key != "llvm": raise ValueError("Module[%s]: Only llvm support export shared" % self.type_key) temp = _util.tempdir() - path_obj = temp.relpath("lib.o") + if fcompile is not None and hasattr(fcompile, "object_format"): + object_format = fcompile.object_format + else: + object_format = "o" + path_obj = temp.relpath("lib." + object_format) self.save(path_obj) files = [path_obj] is_system_lib = self.get_function("__tvm_is_system_module")() diff --git a/src/runtime/system_lib_module.cc b/src/runtime/system_lib_module.cc index db06f57e8422..1f9c8ac8e152 100644 --- a/src/runtime/system_lib_module.cc +++ b/src/runtime/system_lib_module.cc @@ -13,8 +13,8 @@ namespace runtime { class SystemLibModuleNode : public ModuleNode { public: - SystemLibModuleNode() { - } + SystemLibModuleNode() = default; + const char* type_key() const final { return "system_lib"; } @@ -23,6 +23,13 @@ class SystemLibModuleNode : public ModuleNode { const std::string& name, const std::shared_ptr& sptr_to_self) final { std::lock_guard lock(mutex_); + + if (module_blob_ != nullptr) { + // If we previously recorded submodules, load them now. 
+ ImportModuleBlob(reinterpret_cast(module_blob_), &imports_); + module_blob_ = nullptr; + } + auto it = tbl_.find(name); if (it != tbl_.end()) { return WrapPackedFunc( @@ -38,7 +45,14 @@ class SystemLibModuleNode : public ModuleNode { void** ctx_addr = reinterpret_cast(ptr); *ctx_addr = this; } else if (name == symbol::tvm_dev_mblob) { - ImportModuleBlob(reinterpret_cast(ptr), &imports_); + // Record pointer to content of submodules to be loaded. + // We defer loading submodules to the first call to GetFunction(). + // The reason is that RegisterSymbol() gets called when initializing the + // syslib (i.e. library loading time), and the registeries aren't ready + // yet. Therefore, we might not have the functionality to load submodules + // now. + CHECK(module_blob_ == nullptr) << "Resetting mobule blob?"; + module_blob_ = ptr; } else { auto it = tbl_.find(name); if (it != tbl_.end()) { @@ -65,6 +79,8 @@ class SystemLibModuleNode : public ModuleNode { std::mutex mutex_; // Internal symbol table std::unordered_map tbl_; + // Module blob to be imported + void* module_blob_{nullptr}; }; TVM_REGISTER_GLOBAL("module._GetSystemLib") diff --git a/tests/webgl/test_static_webgl_library.html b/tests/webgl/test_static_webgl_library.html new file mode 100644 index 000000000000..39bcb5fff8c7 --- /dev/null +++ b/tests/webgl/test_static_webgl_library.html @@ -0,0 +1,55 @@ + + + + + TVM RPC Test Page + + + +

TVM Test Page

+
+ + + + + + + + \ No newline at end of file diff --git a/tests/webgl/test_static_webgl_library.py b/tests/webgl/test_static_webgl_library.py new file mode 100644 index 000000000000..262416c42506 --- /dev/null +++ b/tests/webgl/test_static_webgl_library.py @@ -0,0 +1,49 @@ +"""Create a static WebGL library and run it in the browser.""" + +from __future__ import absolute_import, print_function + +import os, shutil, SimpleHTTPServer, SocketServer +import tvm +from tvm.contrib import emscripten, util +import numpy as np + +def try_static_webgl_library(): + curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) + + # Change to lib/ which contains "libtvm_runtime.bc". + os.chdir(os.path.join(curr_path, "../../lib")) + + # Create OpenGL module. + n = tvm.var("n") + A = tvm.placeholder((n,), name='A', dtype="float") + B = tvm.compute((n,), lambda *i: A[i], name="B") + + s = tvm.create_schedule(B.op) + s[B].opengl() + + target_host = "llvm -target=asmjs-unknown-emscripten -system-lib" + f = tvm.build(s, [A, B], name="identity", target="opengl", + target_host=target_host) + + # Create a JS library that contains both the module and the tvm runtime. + path_dso = "identity_static.js" + f.export_library(path_dso, emscripten.create_js, options=[ + "-s", "USE_GLFW=3", + "-s", "USE_WEBGL2=1", + "-lglfw", + ]) + + # Create "tvm_runtime.js" and "identity_static.html" in lib/ + shutil.copyfile(os.path.join(curr_path, "../../web/tvm_runtime.js"), + "tvm_runtime.js") + shutil.copyfile(os.path.join(curr_path, "test_static_webgl_library.html"), + "identity_static.html") + + port = 8080 + handler = SimpleHTTPServer.SimpleHTTPRequestHandler + httpd = SocketServer.TCPServer(("", port), handler) + print("Please open http://localhost:" + str(port) + "/identity_static.html") + httpd.serve_forever() + +if __name__ == "__main__": + try_static_webgl_library() diff --git a/web/tvm_runtime.js b/web/tvm_runtime.js index df9cba94aa3a..347532f5bdbf 100644 --- a/web/tvm_runtime.js +++ b/web/tvm_runtime.js @@ -229,6 +229,14 @@ var tvm_runtime = tvm_runtime || {}; "number" // size_t nbytes ]); + var TVMModLoadFromFile = Module.cwrap + ("TVMModLoadFromFile", + "number", + ["string", // const char* file_name + "string", // const char* format + "number" // TVMModuleHandle* out + ]) + //----------------------------------------- // Static utility functions // ---------------------------------------- @@ -940,6 +948,35 @@ var tvm_runtime = tvm_runtime || {}; } return new RPCServer(counter); }; + + /** + * Load a TVM module from a library file. + * The file must be present in the Emscripten virtual file system. + * For example, you can pass "--preload-file file" or "--preload-file dir/" + * to "emcc" when compiling the TVM library, in order to populate files into + * the file system. + * For more detail, see: + * https://kripken.github.io/emscripten-site/docs/porting/files/packaging_files + * @param {string} file_name Path of the file to be loaded. The path refers + * to the Emscripten virtual file system. + * @param {string} format The format of the file. + * @return {tvm.TVMModule} The loaded module. 
+ */ + this.loadModuleFromFile = function (file_name, format) { + // alloc + var out = new RefTVMValue(); + TVM_CALL(TVMModLoadFromFile(file_name, format, out.data)); + var out_handle = out.asHandle(); + // release + out.release(); + if (out_handle != 0) { + return new TVMModule(out_handle); + } else { + return null; + } + }; + var loadModuleFromFile = this.loadModuleFromFile; + //----------------------------------------- // Class defintions // ---------------------------------------- From 3cb6152ca1d9f2364a0ee49d52298dc057ff9bcf Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 1 Mar 2018 00:54:58 +0800 Subject: [PATCH 173/948] add exclusive mode for rpc server (#941) --- python/tvm/contrib/rpc.py | 21 ++++++++++++++++----- python/tvm/exec/rpc_server.py | 5 ++++- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/python/tvm/contrib/rpc.py b/python/tvm/contrib/rpc.py index 3448c4c554d1..831481509c76 100644 --- a/python/tvm/contrib/rpc.py +++ b/python/tvm/contrib/rpc.py @@ -74,10 +74,16 @@ def _recvall(sock, nbytes): return b"".join(res) -def _listen_loop(sock): +def _listen_loop(sock, exclusive): """Lisenting loop""" + last_proc = None while True: conn, addr = sock.accept() + + if last_proc and last_proc.is_alive() and exclusive: + logging.info("Kill last call") + last_proc.terminate() + logging.info("RPCServer: connection from %s", addr) magic = struct.unpack("@i", _recvall(conn, 4))[0] if magic != RPC_MAGIC: @@ -90,9 +96,11 @@ def _listen_loop(sock): else: conn.sendall(struct.pack("@i", RPC_MAGIC)) logging.info("Connection from %s", addr) + process = multiprocessing.Process(target=_serve_loop, args=(conn, addr)) process.deamon = True process.start() + last_proc = process # close from our side. conn.close() @@ -158,6 +166,11 @@ class Server(object): This is recommended to switch on if we want to do local RPC demonstration for GPU devices to avoid fork safety issues. + exclusive : bool, optional + If this is enabled, the server will kill old connection + when new connection comes. This can make sure the current call + monopolize the hardware resource. + key : str, optional The key used to identify the server in Proxy connection. 
""" @@ -167,6 +180,7 @@ def __init__(self, port_end=9199, is_proxy=False, use_popen=False, + exclusive=False, key=""): self.host = host self.port = port @@ -201,7 +215,7 @@ def __init__(self, sock.listen(1) self.sock = sock self.proc = multiprocessing.Process( - target=_listen_loop, args=(self.sock,)) + target=_listen_loop, args=(self.sock, exclusive)) self.proc.deamon = True self.proc.start() else: @@ -210,8 +224,6 @@ def __init__(self, self.proc.deamon = True self.proc.start() - - def terminate(self): """Terminate the server process""" if self.proc: @@ -222,7 +234,6 @@ def __del__(self): self.terminate() - class RPCSession(object): """RPC Client session module diff --git a/python/tvm/exec/rpc_server.py b/python/tvm/exec/rpc_server.py index 432860f58d1e..deb830bdc583 100644 --- a/python/tvm/exec/rpc_server.py +++ b/python/tvm/exec/rpc_server.py @@ -21,6 +21,9 @@ def main(): help="Whether to load executor runtime") parser.add_argument('--load-library', type=str, default="", help="Additional library to load") + parser.add_argument('--exclusive', action='store_true', + help="If this is enabled, the server will kill old connection" + "when new connection comes") args = parser.parse_args() logging.basicConfig(level=logging.INFO) @@ -35,7 +38,7 @@ def main(): libs.append(ctypes.CDLL(file_name, ctypes.RTLD_GLOBAL)) logging.info("Load additional library %s", file_name) - server = rpc.Server(args.host, args.port, args.port_end) + server = rpc.Server(args.host, args.port, args.port_end, exclusive=args.exclusive) server.libs += libs server.proc.join() From aff6692af662c3f7e8c60bdea53ced6408ce8022 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 28 Feb 2018 08:55:07 -0800 Subject: [PATCH 174/948] Fix compiler warnings (#939) --- src/codegen/spirv/ir_builder.cc | 3 ++- src/codegen/spirv/ir_builder.h | 2 +- topi/include/topi/nn/softmax.h | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/codegen/spirv/ir_builder.cc b/src/codegen/spirv/ir_builder.cc index 500c88ae832d..2bd1d0d5da7c 100644 --- a/src/codegen/spirv/ir_builder.cc +++ b/src/codegen/spirv/ir_builder.cc @@ -159,7 +159,8 @@ Value IRBuilder::FloatImm(const SType& dtype, double value) { return GetConst_(dtype, reinterpret_cast(&value)); } else if (dtype.type.bits() == 32) { float fvalue = static_cast(value); - uint64_t data = reinterpret_cast(&fvalue)[0]; + uint32_t* ptr = reinterpret_cast(&fvalue); + uint64_t data = ptr[0]; return GetConst_(dtype, &data); } else { CHECK_EQ(dtype.type.bits(), 16); diff --git a/src/codegen/spirv/ir_builder.h b/src/codegen/spirv/ir_builder.h index bdb320154062..e652a0068278 100644 --- a/src/codegen/spirv/ir_builder.h +++ b/src/codegen/spirv/ir_builder.h @@ -344,7 +344,7 @@ class IRBuilder { * \tparams Args The positional arguments */ template - Value DeclareGlobal(spv::Op op, Args&& ...args) { + void DeclareGlobal(spv::Op op, Args&& ...args) { ib_.Begin(op).AddSeq(std::forward(args)...).Commit(&decorate_); } /*! 
diff --git a/topi/include/topi/nn/softmax.h b/topi/include/topi/nn/softmax.h index d2348c9f230a..f542f0eb0256 100644 --- a/topi/include/topi/nn/softmax.h +++ b/topi/include/topi/nn/softmax.h @@ -47,7 +47,7 @@ inline Tensor softmax(const Tensor &x, Array eval_range; int arg_counter = 0; for (size_t i = 0; i < ndim; ++i) { - if (i == axis) + if (static_cast(i) == axis) eval_range.push_back(reduce_index); else eval_range.push_back(indices[arg_counter++]); @@ -70,7 +70,7 @@ inline Tensor softmax(const Tensor &x, const Array &indices) { Array non_reduce_indices; for (size_t i = 0; i < ndim; ++i) { - if (i != axis) + if (static_cast(i) != axis) non_reduce_indices.push_back(indices[i]); } return tvm::exp(x(indices) - max_elem(non_reduce_indices)) / From 90c827a7b9d2910f7581fb6555871bedb3d5c490 Mon Sep 17 00:00:00 2001 From: Pariksheet Pinjari Date: Thu, 1 Mar 2018 09:12:50 +0530 Subject: [PATCH 175/948] Spelling mistake corrected (#945) --- python/tvm/container.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/container.py b/python/tvm/container.py index b0ec44c695cf..d1d4546fd86a 100644 --- a/python/tvm/container.py +++ b/python/tvm/container.py @@ -20,7 +20,7 @@ def __getitem__(self, i): return [self[idx] for idx in range(start, stop, step)] if i >= len(self): - raise IndexError("array index out ot range") + raise IndexError("array index out of range") return _api_internal._ArrayGetItem(self, i) def __len__(self): From 279a778b36b05dc4b64ade0b0863905b328e5be7 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 1 Mar 2018 10:59:58 -0800 Subject: [PATCH 176/948] Better error message handling for contrib (#946) * Better error message handling for contrib * fix lint * fix testcase * fix test --- python/tvm/contrib/cc.py | 9 +++++---- python/tvm/contrib/emscripten.py | 4 +++- python/tvm/contrib/ndk.py | 4 +++- python/tvm/contrib/nvcc.py | 2 +- python/tvm/contrib/rocm.py | 3 ++- python/tvm/contrib/spirv.py | 4 ++-- python/tvm/contrib/tar.py | 5 +++-- python/tvm/contrib/xcode.py | 6 ++++-- topi/python/topi/__init__.py | 5 ++++- topi/tests/python/test_topi_dilate.py | 1 + topi/tests/python/test_topi_softmax.py | 1 + topi/tests/python/test_topi_upsampling.py | 1 + topi/tests/python_cpp/test_topi_dilate.py | 1 + topi/tests/python_cpp/test_topi_softmax.py | 1 + 14 files changed, 32 insertions(+), 15 deletions(-) diff --git a/python/tvm/contrib/cc.py b/python/tvm/contrib/cc.py index 5764121c59ed..0ffa6c420243 100644 --- a/python/tvm/contrib/cc.py +++ b/python/tvm/contrib/cc.py @@ -3,8 +3,9 @@ from __future__ import absolute_import as _abs import sys import subprocess - import os + +from .._ffi.base import py_str from .util import tempdir @@ -53,7 +54,7 @@ def _linux_shared(output, objects, options, cc="g++"): (out, _) = proc.communicate() if proc.returncode != 0: msg = "Compilation error:\n" - msg += str(out) + msg += py_str(out) raise RuntimeError(msg) @@ -88,7 +89,7 @@ def _windows_shared(output, objects, options): "please run this in Vistual Studio Command Prompt.") if proc.returncode != 0: msg = "Compilation error:\n" - msg += str(out) + msg += py_str(out) raise RuntimeError(msg) link_cmd = ["link"] link_cmd += ["-dll", "-FORCE:MULTIPLE"] @@ -114,6 +115,6 @@ def _windows_shared(output, objects, options): "please run this in Vistual Studio Command Prompt.") if proc.returncode != 0: msg = "Compilation error:\n" - msg += str(out) + msg += py_str(out) raise RuntimeError(msg) diff --git a/python/tvm/contrib/emscripten.py b/python/tvm/contrib/emscripten.py index 
d263e472cff0..a722a0d673d9 100644 --- a/python/tvm/contrib/emscripten.py +++ b/python/tvm/contrib/emscripten.py @@ -1,7 +1,9 @@ """Util to invoke emscripten compilers in the system.""" # pylint: disable=invalid-name from __future__ import absolute_import as _abs + import subprocess +from .._ffi.base import py_str from .._ffi.libinfo import find_lib_path def create_js(output, @@ -58,7 +60,7 @@ def create_js(output, if proc.returncode != 0: msg = "Compilation error:\n" - msg += out + msg += py_str(out) raise RuntimeError(msg) create_js.object_format = "bc" diff --git a/python/tvm/contrib/ndk.py b/python/tvm/contrib/ndk.py index 3751a4574b47..a79aae96163d 100644 --- a/python/tvm/contrib/ndk.py +++ b/python/tvm/contrib/ndk.py @@ -1,8 +1,10 @@ """Util to invoke NDK compiler toolchain.""" # pylint: disable=invalid-name from __future__ import absolute_import as _abs + import subprocess import os +from .._ffi.base import py_str def create_shared(output, objects, @@ -43,5 +45,5 @@ def create_shared(output, if proc.returncode != 0: msg = "Compilation error:\n" - msg += out + msg += py_str(out) raise RuntimeError(msg) diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py index ac8dbf65b2bc..e3c2b7895da7 100644 --- a/python/tvm/contrib/nvcc.py +++ b/python/tvm/contrib/nvcc.py @@ -72,7 +72,7 @@ def compile_cuda(code, if proc.returncode != 0: msg = "Compilation error:\n" - msg += out + msg += py_str(out) raise RuntimeError(msg) return bytearray(open(file_target, "rb").read()) diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py index 0f72a4694a82..10cfaed83e68 100644 --- a/python/tvm/contrib/rocm.py +++ b/python/tvm/contrib/rocm.py @@ -2,6 +2,7 @@ import subprocess from os.path import join from . import util +from .._ffi.base import py_str from ..api import register_func, convert def rocm_link(in_file, out_file): @@ -24,7 +25,7 @@ def rocm_link(in_file, out_file): if proc.returncode != 0: msg = "Linking error using ld.lld:\n" - msg += str(out) + msg += py_str(out) raise RuntimeError(msg) diff --git a/python/tvm/contrib/spirv.py b/python/tvm/contrib/spirv.py index eed6240bc76c..586da1a7487f 100644 --- a/python/tvm/contrib/spirv.py +++ b/python/tvm/contrib/spirv.py @@ -2,7 +2,7 @@ import subprocess import os from . import util - +from .._ffi.base import py_str def optimize(spv_bin): """Optimize SPIRV using spirv-opt via CLI @@ -37,7 +37,7 @@ def optimize(spv_bin): if proc.returncode != 0: msg = "Opitmizationerror using spirv-opt:\n" - msg += str(out) + msg += py_str(out) raise RuntimeError(msg) return bytearray(open(tmp_out, "rb").read()) diff --git a/python/tvm/contrib/tar.py b/python/tvm/contrib/tar.py index ca3bf3478840..7e075d7a5697 100644 --- a/python/tvm/contrib/tar.py +++ b/python/tvm/contrib/tar.py @@ -6,6 +6,7 @@ import shutil import subprocess from . import util +from .._ffi.base import py_str def tar(output, files): """Create tarball containing all files in root. 
@@ -38,7 +39,7 @@ def tar(output, files): if proc.returncode != 0: msg = "Tar error:\n" - msg += out + msg += py_str(out) raise RuntimeError(msg) @@ -64,5 +65,5 @@ def untar(tar_file, directory): if proc.returncode != 0: msg = "Tar error:\n" - msg += out + msg += py_str(out) raise RuntimeError(msg) diff --git a/python/tvm/contrib/xcode.py b/python/tvm/contrib/xcode.py index e05df7181544..3456bdf1964a 100644 --- a/python/tvm/contrib/xcode.py +++ b/python/tvm/contrib/xcode.py @@ -1,9 +1,11 @@ # pylint: disable=invalid-name """Utility to invoke Xcode compiler toolchain""" from __future__ import absolute_import as _abs + import os import sys import subprocess +from .._ffi.base import py_str from . import util def xcrun(cmd): @@ -49,7 +51,7 @@ def codesign(lib): (out, _) = proc.communicate() if proc.returncode != 0: msg = "Codesign error:\n" - msg += out + msg += py_str(out) raise RuntimeError(msg) @@ -92,7 +94,7 @@ def create_dylib(output, objects, arch, sdk="macosx"): if proc.returncode != 0: msg = "Compilation error:\n" - msg += out + msg += py_str(out) raise RuntimeError(msg) diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py index f7a922c5c379..ae02211a9d6c 100644 --- a/topi/python/topi/__init__.py +++ b/topi/python/topi/__init__.py @@ -22,7 +22,10 @@ from . import rasp from . import mali from . import opengl -from . import testing from . import util from . import rocm from . import cpp +# not import testing by default +# because testing can have extra deps that are not necessary +# we can import them from test cases explicitly +# from . import testing diff --git a/topi/tests/python/test_topi_dilate.py b/topi/tests/python/test_topi_dilate.py index 778c0ba5e9c4..9cc44719745a 100644 --- a/topi/tests/python/test_topi_dilate.py +++ b/topi/tests/python/test_topi_dilate.py @@ -1,5 +1,6 @@ import tvm import topi +import topi.testing import numpy as np diff --git a/topi/tests/python/test_topi_softmax.py b/topi/tests/python/test_topi_softmax.py index c8a0f9549803..c768c395a74f 100644 --- a/topi/tests/python/test_topi_softmax.py +++ b/topi/tests/python/test_topi_softmax.py @@ -3,6 +3,7 @@ import numpy as np import tvm import topi +import topi.testing import logging from topi.util import get_const_tuple diff --git a/topi/tests/python/test_topi_upsampling.py b/topi/tests/python/test_topi_upsampling.py index 20c349c81832..7421dd4151e6 100644 --- a/topi/tests/python/test_topi_upsampling.py +++ b/topi/tests/python/test_topi_upsampling.py @@ -2,6 +2,7 @@ import numpy as np import tvm import topi +import topi.testing import math def verify_upsampling(batch, in_channel, in_height, in_width, scale): diff --git a/topi/tests/python_cpp/test_topi_dilate.py b/topi/tests/python_cpp/test_topi_dilate.py index 177ddbc3cfd7..f1924239cc77 100644 --- a/topi/tests/python_cpp/test_topi_dilate.py +++ b/topi/tests/python_cpp/test_topi_dilate.py @@ -1,5 +1,6 @@ import tvm import topi +import topi.testing import numpy as np def test_dilate(): diff --git a/topi/tests/python_cpp/test_topi_softmax.py b/topi/tests/python_cpp/test_topi_softmax.py index 2a6baaafe2f1..4d4ac387bccf 100644 --- a/topi/tests/python_cpp/test_topi_softmax.py +++ b/topi/tests/python_cpp/test_topi_softmax.py @@ -4,6 +4,7 @@ import tvm import topi import logging +import topi.testing from topi.util import get_const_tuple def verify_softmax(m, n): From e3e288fb0cb0ca76746535e069d9d4408f9158a6 Mon Sep 17 00:00:00 2001 From: Yida Wang Date: Thu, 1 Mar 2018 12:14:44 -0800 Subject: [PATCH 177/948] remove the pragma primitives for better 
performance when the threads are binded (#949) --- topi/python/topi/x86/conv2d_avx_1x1.py | 9 --------- topi/python/topi/x86/conv2d_avx_common.py | 9 --------- 2 files changed, 18 deletions(-) diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index cc264d04ac24..afd0be2e2ded 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -77,9 +77,6 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis parallel_axis = s[A1].fuse(ic_chunk, ih) s[A1].parallel(parallel_axis) - s[A1].pragma(batch, "parallel_launch_point") - s[A1].pragma(parallel_axis, "parallel_stride_pattern") - s[A1].pragma(batch, "parallel_barrier_when_finish") # schedule kernel pack oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis @@ -88,9 +85,6 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou s[W].vectorize(oc_block) parallel_axis = s[W].fuse(oc_chunk, oh) s[W].parallel(parallel_axis) - s[W].pragma(parallel_axis, "parallel_launch_point") - s[W].pragma(parallel_axis, "parallel_stride_pattern") - s[W].pragma(parallel_axis, "parallel_barrier_when_finish") C, O0, O = conv_out, output, last CC = s.cache_write(C, 'global') @@ -128,8 +122,5 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou s[O].vectorize(oc_block) s[O].parallel(parallel_axis) - s[O].pragma(batch, "parallel_launch_point") - s[O].pragma(parallel_axis, "parallel_stride_pattern") - s[O].pragma(batch, "parallel_barrier_when_finish") return s diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index 4f5be019f45a..f4c0e453e643 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -90,9 +90,6 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis parallel_axis = s[A1].fuse(ic_chunk, ih) s[A1].parallel(parallel_axis) - s[A1].pragma(batch, "parallel_launch_point") - s[A1].pragma(parallel_axis, "parallel_stride_pattern") - s[A1].pragma(batch, "parallel_barrier_when_finish") # schedule kernel pack oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis @@ -101,9 +98,6 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou s[W].vectorize(oc_block) parallel_axis = s[W].fuse(oc_chunk, oh) s[W].parallel(parallel_axis) - s[W].pragma(parallel_axis, "parallel_launch_point") - s[W].pragma(parallel_axis, "parallel_stride_pattern") - s[W].pragma(parallel_axis, "parallel_barrier_when_finish") # schedule conv C, O0, O = conv_out, output, last @@ -144,8 +138,5 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou s[O].vectorize(oc_block) s[O].parallel(parallel_axis) - s[O].pragma(batch, "parallel_launch_point") - s[O].pragma(parallel_axis, "parallel_stride_pattern") - s[O].pragma(batch, "parallel_barrier_when_finish") return s From 68aac7f89a945385c2c47400aa92bb41c3713f5b Mon Sep 17 00:00:00 2001 From: Atsushi Nukariya <34906252+AtuNuka@users.noreply.github.com> Date: Fri, 2 Mar 2018 14:07:35 +0900 Subject: [PATCH 178/948] Fix a typo for Target class (#951) --- python/tvm/target.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/target.py b/python/tvm/target.py index ab02619f6deb..7fa8998bfbe1 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -99,7 +99,7 @@ def 
__init__(self, self.libs += libs.split(",") elif item.startswith("-device="): self.device_name = item.split("=")[1] - # Target query searchs device name first + # Target query searches device name first if self.device_name: self.keys = (self.device_name,) else: @@ -122,7 +122,7 @@ def __init__(self, elif target_name in ("opengl",): self.keys += ("opengl",) elif target_name in ("stackvm", "ext_dev"): - # Do not now class for stacvm or ext_dev + # Do not now class for stackvm or ext_dev pass else: raise ValueError("Unknown target name %s" % target_name) From adba4011adad5a531bf7d06cbba5e36c04a03a4b Mon Sep 17 00:00:00 2001 From: Zhixun Tan Date: Fri, 2 Mar 2018 08:39:15 -0800 Subject: [PATCH 179/948] Add the equivalence of graph_runtime.py in tvm_runtime.js (#950) --- web/tvm_runtime.js | 103 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/web/tvm_runtime.js b/web/tvm_runtime.js index 347532f5bdbf..bc12f4f39c14 100644 --- a/web/tvm_runtime.js +++ b/web/tvm_runtime.js @@ -656,6 +656,8 @@ var tvm_runtime = tvm_runtime || {}; v = convertFunc(v); this.temp.push(v); this.setHandle(i, v._tvm_function.handle, kFuncHandle); + } else if (v instanceof TVMModule) { + this.setHandle(i, v.handle, kModuleHandle); } else { throwError("Unsupported argument type " + tp); } @@ -977,6 +979,107 @@ var tvm_runtime = tvm_runtime || {}; }; var loadModuleFromFile = this.loadModuleFromFile; + /** + * Wrapper runtime module. + * Wraps around set_input, load_params, run, and get_output. + * + * @class + * @memberof tvm + */ + function GraphModule(tvm_graph_module, ctx) { + CHECK(tvm_graph_module instanceof TVMModule, + "tvm_graph_module must be TVMModule"); + CHECK(ctx instanceof TVMContext, "ctx must be TVMContext"); + + this.tvm_graph_module = tvm_graph_module; + this.ctx = ctx; + this._set_input = tvm_graph_module.getFunction("set_input"); + this._load_params = tvm_graph_module.getFunction("load_params"); + this._run = tvm_graph_module.getFunction("run"); + this._get_output = tvm_graph_module.getFunction("get_output"); + }; + + GraphModule.prototype = { + /** + * Set input to graph module. + * + * @param {string} key The name of the input. + * @param {NDArray} value The input value. + */ + "set_input" : function(key, value) { + CHECK(typeof key == "string", "key must be string"); + CHECK(value instanceof NDArray, "value must be NDArray"); + this._set_input(key, value); + }, + + /** + * Load parameters from serialized byte array of parameter dict. + * + * @param {Uint8Array} params The serialized parameter dict. + */ + "load_params" : function(params) { + CHECK(params instanceof Uint8Array, "params must be Uint8Array"); + this._load_params(params); + }, + + /** + * Load parameters from serialized base64 string of parameter dict. + * + * @param {string} base64_params The serialized parameter dict. + */ + "load_base64_params" : function(base64_params) { + CHECK(typeof base64_params == "string", "base64_params must be string"); + var decoded_string = atob(base64_params); + var decoded_u8 = new Uint8Array(decoded_string.length); + for (var i = 0; i < decoded_string.length; i++) { + decoded_u8[i] = decoded_string[i].charCodeAt(0); + } + this.load_params(decoded_u8); + }, + + /** + * Run forward execution of the graph. + */ + "run" : function() { + this._run(); + }, + + /** + * Get index-th output to out. + * + * @param {NDArray} out The output array container. + * @return {NDArray} The output array container. 
+ */ + "get_output" : function(index, out) { + CHECK(typeof index == "number", "index must be number"); + CHECK(out instanceof NDArray, "out must be NDArray"); + this._get_output(new TVMConstant(index, "int32"), out); + return out; + } + }; + + /** + * Create a runtime executor module given a graph and a module. + * @param {string} graph_json_str The Json string of the graph. + * @param {TVMModule} libmod The TVM module. + * @param {TVMContext} ctx The context to deploy the module. + * @return {GraphModule} Runtime graph module for executing the graph. + */ + this.createGraphRuntime = function(graph_json_str, libmod, ctx) { + CHECK(typeof graph_json_str == "string", "graph_json_str must be string"); + CHECK(libmod instanceof TVMModule, "libmod must be TVMModule"); + CHECK(ctx instanceof TVMContext, "ctx must be TVMContext"); + + var fcreate = getGlobalFunc("tvm.graph_runtime.create"); + CHECK(fcreate != null, "Cannot find tvm.graph_runtime.create"); + + var tvm_graph_module = fcreate(graph_json_str, libmod, + new TVMConstant(ctx.device_type, "int32"), + new TVMConstant(ctx.device_id, "int32")); + + return new GraphModule(tvm_graph_module, ctx); + }; + //----------------------------------------- // Class defintions // ---------------------------------------- From 9c850a145c26dbe7753cfee9478369f1c153597c Mon Sep 17 00:00:00 2001 From: Chris Nuernberger Date: Fri, 2 Mar 2018 12:59:22 -0700 Subject: [PATCH 180/948] [RUNTIME] Stream API (#953) --- CONTRIBUTORS.md | 1 + include/tvm/c_dsl_api.h | 2 +- include/tvm/runtime/c_backend_api.h | 2 +- include/tvm/runtime/c_runtime_api.h | 42 +++++++++++++++++++---- include/tvm/runtime/device_api.h | 33 +++++++++++++++++- src/runtime/c_runtime_api.cc | 45 +++++++++++++++++++++++++ src/runtime/cuda/cuda_device_api.cc | 26 +++++++++++++- src/runtime/opencl/opencl_device_api.cc | 4 +-- 8 files changed, 142 insertions(+), 13 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 13b5ee1c5dd3..7a7f52b5ee82 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -37,3 +37,4 @@ List of Contributors - [Masahiro Masuda](https://github.com/masahi) - [Haolong Zhang](https://github.com/haolongzhangm) - [Cody Hao Yu](https://github.com/comaniac) +- [Chris Nuernberger](https://github.com/cnuernber) diff --git a/include/tvm/c_dsl_api.h b/include/tvm/c_dsl_api.h index f81018a7610e..0ba34ed5ccee 100644 --- a/include/tvm/c_dsl_api.h +++ b/include/tvm/c_dsl_api.h @@ -17,7 +17,7 @@ #include "./runtime/c_runtime_api.h" #ifdef __cplusplus -TVM_EXTERN_C { +extern "C" { #endif /*! \brief handle to node */ diff --git a/include/tvm/runtime/c_backend_api.h b/include/tvm/runtime/c_backend_api.h index 079ab1efb040..01e8a9486655 100644 --- a/include/tvm/runtime/c_backend_api.h +++ b/include/tvm/runtime/c_backend_api.h @@ -13,7 +13,7 @@ #include "./c_runtime_api.h" #ifdef __cplusplus -TVM_EXTERN_C { +extern "C" { #endif // Backend related functions. 
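A hedged usage sketch for the GraphModule wrapper added to web/tvm_runtime.js above (patch 179). Only the calls shown in that patch are used; `tvm` is assumed to be an initialized tvm_runtime instance, and `graph_json`, `syslib_module`, `ctx`, `input_array`, `output_array` and `params_base64` stand in for objects whose construction is outside this diff:

    // Build the executor from a graph description, a compiled module and a context.
    var graph = tvm.createGraphRuntime(graph_json, syslib_module, ctx);

    graph.set_input("data", input_array);       // "data" is a placeholder input name
    graph.load_base64_params(params_base64);    // serialized parameter dict, base64-encoded
    graph.run();                                 // forward execution of the graph
    var out = graph.get_output(0, output_array); // copy the 0-th output into output_array

This mirrors the Python graph_runtime flow the commit message refers to: set inputs, load parameters, run, then fetch the indexed output.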
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index e4a06b39d04e..f6d22534e8ab 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -18,12 +18,6 @@ #ifndef TVM_RUNTIME_C_RUNTIME_API_H_ #define TVM_RUNTIME_C_RUNTIME_API_H_ -#ifdef __cplusplus -#define TVM_EXTERN_C extern "C" -#else -#define TVM_EXTERN_C -#endif - // Macros to do weak linking #ifdef _MSC_VER #define TVM_WEAK __declspec(selectany) @@ -52,7 +46,7 @@ #include #ifdef __cplusplus -TVM_EXTERN_C { +extern "C" { #endif #include #include @@ -443,6 +437,26 @@ TVM_DLL int TVMArrayCopyFromTo(TVMArrayHandle from, TVMArrayHandle to, TVMStreamHandle stream); +/*! + * \brief Create a new runtime stream. + * + * \param device_type The device type of context + * \param device_id The device id of context + * \param out The new stream handle + * \return 0 when success, -1 when failure happens + */ +TVM_DLL int TVMStreamCreate(int device_type, int device_id, TVMStreamHandle* out); + +/*! + * \brief Free a created stream handle. + * + * \param device_type The device type of context + * \param device_id The device id of context + * \param stream The stream to be freed + * \return 0 when success, -1 when failure happens + */ +TVM_DLL int TVMStreamFree(int device_type, int device_id, TVMStreamHandle stream); + /*! * \brief Set the runtime stream of current thread to be stream. * The subsequent calls to the same device_type @@ -466,6 +480,20 @@ TVM_DLL int TVMSetStream(int device_type, int device_id, TVMStreamHandle handle) */ TVM_DLL int TVMSynchronize(int device_type, int device_id, TVMStreamHandle stream); +/*! + * \brief Synchronize two streams of execution. + * + * \param device_type The device type of context + * \param device_id The device id of context + * \param src The source stream to synchronize. + * \param dst The destination stream to synchronize. + * \return 0 when success, -1 when failure happens + */ +TVM_DLL int TVMStreamStreamSynchronize(int device_type, + int device_id, + TVMStreamHandle src, + TVMStreamHandle dst); + #ifdef __cplusplus } // TVM_EXTERN_C #endif diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index 45009f1d3af3..ff3fe8062920 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -19,7 +19,7 @@ enum DeviceAttrKind : int { kExist = 0, kMaxThreadsPerBlock = 1, kWarpSize = 2, - kComputeVersion = 3 + kComputeVersion = 3, }; /*! \brief Number of bytes each allocation must align to */ @@ -90,6 +90,21 @@ class DeviceAPI { TVMContext ctx_from, TVMContext ctx_to, TVMStreamHandle stream) = 0; + /*! + * \brief Create a new stream of execution. + * + * \param ctx The context of allocation. + */ + TVM_DLL virtual TVMStreamHandle CreateStream(TVMContext ctx); + + /*! + * \brief Free a stream of execution + * + * \param ctx The context of the stream + * \param stream The pointer to be freed. + */ + TVM_DLL virtual void FreeStream(TVMContext ctx, TVMStreamHandle stream); + /*! * \brief Synchronize the stream * \param ctx The context to perform operation. @@ -102,6 +117,21 @@ class DeviceAPI { * \param stream The stream to be set. */ virtual void SetStream(TVMContext ctx, TVMStreamHandle stream) {} + /*! + * \brief Synchronize 2 streams of execution. + * + * An event is created in event_src stream that the second then + * stream waits on. Neither event_src or event_dst need to be of + * the same device ID as the context, but they must be of the same + * device type. 
+ * + * \param ctx The context of the streams. + * \param event_src The source stream to synchronize. + * \param event_dst The destination stream to synchronize. + */ + TVM_DLL virtual void SyncStreamFromTo(TVMContext ctx, + TVMStreamHandle event_src, + TVMStreamHandle event_dst); /*! * \brief Allocate temporal workspace for backend execution. * @@ -128,6 +158,7 @@ class DeviceAPI { * \param ptr The pointer to be freed. */ TVM_DLL virtual void FreeWorkspace(TVMContext ctx, void* ptr); + /*! * \brief Get device API base don context. * \param ctx The context diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index 37902b448021..7d65a04bc040 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -106,6 +106,21 @@ void DeviceAPI::FreeWorkspace(TVMContext ctx, void* ptr) { FreeDataSpace(ctx, ptr); } +TVMStreamHandle DeviceAPI::CreateStream(TVMContext ctx) { + LOG(FATAL) << "Device does not support stream api."; + return 0; +} + +void DeviceAPI::FreeStream(TVMContext ctx, TVMStreamHandle stream) { + LOG(FATAL) << "Device does not support stream api."; +} + +void DeviceAPI::SyncStreamFromTo(TVMContext ctx, + TVMStreamHandle event_src, + TVMStreamHandle event_dst) { + LOG(FATAL) << "Device does not support stream api."; +} + inline TVMArray* TVMArrayCreate_() { TVMArray* arr = new TVMArray(); arr->shape = nullptr; @@ -448,6 +463,24 @@ int TVMArrayCopyToBytes(TVMArrayHandle handle, API_END(); } +int TVMStreamCreate(int device_type, int device_id, TVMStreamHandle* out) { + API_BEGIN(); + TVMContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + *out = DeviceAPIManager::Get(ctx)->CreateStream(ctx); + API_END(); +} + +int TVMStreamFree(int device_type, int device_id, TVMStreamHandle stream) { + API_BEGIN(); + TVMContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + DeviceAPIManager::Get(ctx)->FreeStream(ctx, stream); + API_END(); +} + int TVMSetStream(int device_type, int device_id, TVMStreamHandle stream) { API_BEGIN(); TVMContext ctx; @@ -466,6 +499,18 @@ int TVMSynchronize(int device_type, int device_id, TVMStreamHandle stream) { API_END(); } +int TVMStreamStreamSynchronize(int device_type, + int device_id, + TVMStreamHandle src, + TVMStreamHandle dst) { + API_BEGIN(); + TVMContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + DeviceAPIManager::Get(ctx)->SyncStreamFromTo(ctx, src, dst); + API_END(); +} + int TVMCbArgToReturn(TVMValue* value, int code) { API_BEGIN(); tvm::runtime::TVMRetValue rv; diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index 7885aa7705ed..9b0fee5023f1 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -30,7 +30,7 @@ class CUDADeviceAPI final : public DeviceAPI { &value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id) == cudaSuccess); break; - case kMaxThreadsPerBlock: { + case kMaxThreadsPerBlock: { CUDA_CALL(cudaDeviceGetAttribute( &value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id)); break; @@ -102,6 +102,30 @@ class CUDADeviceAPI final : public DeviceAPI { } } + TVMStreamHandle CreateStream(TVMContext ctx) { + CUDA_CALL(cudaSetDevice(ctx.device_id)); + cudaStream_t retval; + CUDA_CALL(cudaStreamCreate(&retval)); + return static_cast(retval); + } + + void FreeStream(TVMContext ctx, TVMStreamHandle stream) { + CUDA_CALL(cudaSetDevice(ctx.device_id)); + cudaStream_t cu_stream = static_cast(stream); + CUDA_CALL(cudaStreamDestroy(cu_stream)); + 
} + + void SyncStreamFromTo(TVMContext ctx, TVMStreamHandle event_src, TVMStreamHandle event_dst) { + CUDA_CALL(cudaSetDevice(ctx.device_id)); + cudaStream_t src_stream = static_cast(event_src); + cudaStream_t dst_stream = static_cast(event_dst); + cudaEvent_t evt; + CUDA_CALL(cudaEventCreate(&evt)); + CUDA_CALL(cudaEventRecord(evt, src_stream)); + CUDA_CALL(cudaStreamWaitEvent(dst_stream, evt, 0)); + CUDA_CALL(cudaEventDestroy(evt)); + } + void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { CUDA_CALL(cudaSetDevice(ctx.device_id)); CUDA_CALL(cudaStreamSynchronize(static_cast(stream))); diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 4c3d206b9354..6341c9f4b83d 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -45,8 +45,8 @@ void OpenCLWorkspace::GetAttr( *rv = 1; break; } - case kComputeVersion: return; - case kExist: break; + case kComputeVersion: return; + case kExist: break; } } From 8c5bc873838ba89e3e1c11205ed70a20be781f2a Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 2 Mar 2018 13:22:20 -0800 Subject: [PATCH 181/948] [CODEGEN] Fix alignment generation (#955) --- src/codegen/llvm/codegen_llvm.cc | 13 +++++++++---- tests/python/unittest/test_codegen_llvm.py | 15 +++++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index e95407a5c057..934398d9ce09 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -339,6 +339,7 @@ void CodeGenLLVM::GetAlignment(Type t, } arith::ModularEntry me = arith::EvalModular(index, align_map_); + int align_bits = t.bits(); while (align_bits < max_align_bits && me.base % 2 == 0 && @@ -814,13 +815,13 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const Let* op) { llvm::Value* CodeGenLLVM::VisitExpr_(const Load* op) { Type t = op->type; - int alignment, native_bits; bool is_volatile = volatile_buf_.count(op->buffer_var.get()); - GetAlignment(t, op->buffer_var.get(), op->index, &alignment, &native_bits); llvm::Value* buffer = MakeValue(op->buffer_var); llvm::Value* index = MakeValue(op->index); if (t.lanes() == 1) { + int alignment, native_bits; + GetAlignment(t, op->buffer_var.get(), op->index, &alignment, &native_bits); llvm::Value* ptr = CreateBufferPtr(t, buffer, index); llvm::LoadInst* load = builder_->CreateAlignedLoad(ptr, alignment, is_volatile); AddAliasInfo(load, op->buffer_var.get(), op->index, t); @@ -831,6 +832,8 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const Load* op) { buffer->getType())->getAddressSpace(); if (const Ramp* ramp = op->index.as()) { if (is_one(ramp->stride)) { + int alignment, native_bits; + GetAlignment(t, op->buffer_var.get(), ramp->base, &alignment, &native_bits); CHECK_EQ(ramp->lanes, t.lanes()); llvm::Value* ptr = CreateBufferPtr( t.element_of(), buffer, MakeValue(ramp->base)); @@ -885,14 +888,14 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const Broadcast* op) { void CodeGenLLVM::VisitStmt_(const Store* op) { CHECK(is_one(op->predicate)); Type t = op->value.type(); - int alignment, native_bits; bool is_volatile = volatile_buf_.count(op->buffer_var.get()); - GetAlignment(t, op->buffer_var.get(), op->index, &alignment, &native_bits); llvm::Value* buffer = MakeValue(op->buffer_var); llvm::Value* index = MakeValue(op->index); llvm::Value* value = MakeValue(op->value); if (t.lanes() == 1) { + int alignment, native_bits; + GetAlignment(t, op->buffer_var.get(), op->index, &alignment, &native_bits); 
llvm::Value* ptr = CreateBufferPtr(t, buffer, index); llvm::StoreInst* store = builder_->CreateAlignedStore(value, ptr, alignment, is_volatile); AddAliasInfo(store, op->buffer_var.get(), op->index, op->value.type()); @@ -903,6 +906,8 @@ void CodeGenLLVM::VisitStmt_(const Store* op) { buffer->getType())->getAddressSpace(); if (const Ramp* ramp = op->index.as()) { if (is_one(ramp->stride)) { + int alignment, native_bits; + GetAlignment(t, op->buffer_var.get(), ramp->base, &alignment, &native_bits); CHECK_EQ(ramp->lanes, t.lanes()); llvm::Value* ptr = CreateBufferPtr( t.element_of(), buffer, MakeValue(ramp->base)); diff --git a/tests/python/unittest/test_codegen_llvm.py b/tests/python/unittest/test_codegen_llvm.py index 24996c842249..f05fad10d273 100644 --- a/tests/python/unittest/test_codegen_llvm.py +++ b/tests/python/unittest/test_codegen_llvm.py @@ -297,7 +297,22 @@ def check_llvm(n): check_llvm(64) +def test_alignment(): + n = tvm.convert(1024) + A = tvm.placeholder((n,), name='A') + B = tvm.compute(A.shape, lambda i: A[i] * 3, name='B') + s = tvm.create_schedule(B.op) + bx, tx = s[B].split(B.op.axis[0], factor=8) + s[B].vectorize(tx) + f = tvm.build(s, [A, B], "llvm") + + for l in f.get_source().split("\n"): + if "align" in l and "4 x float" in l: + assert "align 32" in l + + if __name__ == "__main__": + test_alignment() test_rank_zero() test_llvm_bool() test_llvm_persist_parallel() From e2bbd9f46c6796e08fe60479450178e632182aea Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 2 Mar 2018 15:19:13 -0800 Subject: [PATCH 182/948] explicit import testing (#956) * explicit import testing * Enable init api for extension modules --- python/tvm/_ffi/function.py | 20 +++++++++++++------ topi/tests/python/test_topi_conv2d.py | 1 + topi/tests/python/test_topi_conv2d_hwcn.py | 1 + topi/tests/python/test_topi_conv2d_nhwc.py | 1 + .../python/test_topi_conv2d_transpose_nchw.py | 1 + topi/tests/python/test_topi_dense.py | 1 + .../python/test_topi_depthwise_conv2d.py | 1 + .../test_topi_depthwise_conv2d_back_input.py | 1 + .../test_topi_depthwise_conv2d_back_weight.py | 1 + topi/tests/python/test_topi_math.py | 1 + 10 files changed, 23 insertions(+), 6 deletions(-) diff --git a/python/tvm/_ffi/function.py b/python/tvm/_ffi/function.py index 526d972f6d28..90a0c18dec5d 100644 --- a/python/tvm/_ffi/function.py +++ b/python/tvm/_ffi/function.py @@ -280,15 +280,23 @@ def my_api_func(*args): return flocal(*args) return my_api_func -def _init_api(namespace): +def _init_api(namespace, target_module_name=None): """Initialize api for a given module name - mod : str - The name of the module. 
+ namespace : str + The namespace of the source registry + + target_module_name : str + The target module name if different from namespace """ - assert namespace.startswith("tvm.") - prefix = namespace[4:] - _init_api_prefix(namespace, prefix) + target_module_name = ( + target_module_name if target_module_name else namespace) + if namespace.startswith("tvm."): + prefix = namespace[4:] + _init_api_prefix(target_module_name, namespace[4:]) + else: + _init_api_prefix(target_module_name, namespace) + def _init_api_prefix(module_name, prefix): module = sys.modules[module_name] diff --git a/topi/tests/python/test_topi_conv2d.py b/topi/tests/python/test_topi_conv2d.py index 3e6978752b29..e7ea956eea78 100644 --- a/topi/tests/python/test_topi_conv2d.py +++ b/topi/tests/python/test_topi_conv2d.py @@ -3,6 +3,7 @@ import numpy as np import tvm import topi +import topi.testing from tvm.contrib.pickle_memoize import memoize from topi.util import get_const_tuple diff --git a/topi/tests/python/test_topi_conv2d_hwcn.py b/topi/tests/python/test_topi_conv2d_hwcn.py index 7761a90a0eed..84962a0b4789 100644 --- a/topi/tests/python/test_topi_conv2d_hwcn.py +++ b/topi/tests/python/test_topi_conv2d_hwcn.py @@ -3,6 +3,7 @@ import numpy as np import tvm import topi +import topi.testing from tvm.contrib.pickle_memoize import memoize from topi.util import get_const_tuple diff --git a/topi/tests/python/test_topi_conv2d_nhwc.py b/topi/tests/python/test_topi_conv2d_nhwc.py index 7fc5b841908f..40aa3e550a86 100644 --- a/topi/tests/python/test_topi_conv2d_nhwc.py +++ b/topi/tests/python/test_topi_conv2d_nhwc.py @@ -3,6 +3,7 @@ import numpy as np import tvm import topi +import topi.testing from tvm.contrib.pickle_memoize import memoize from topi.util import get_const_tuple diff --git a/topi/tests/python/test_topi_conv2d_transpose_nchw.py b/topi/tests/python/test_topi_conv2d_transpose_nchw.py index 712f64ca2c87..dcdddca9107b 100644 --- a/topi/tests/python/test_topi_conv2d_transpose_nchw.py +++ b/topi/tests/python/test_topi_conv2d_transpose_nchw.py @@ -2,6 +2,7 @@ import numpy as np import tvm import topi +import topi.testing from tvm.contrib.pickle_memoize import memoize from topi.util import get_const_tuple diff --git a/topi/tests/python/test_topi_dense.py b/topi/tests/python/test_topi_dense.py index cbae2055a3cd..779e1af1f35e 100644 --- a/topi/tests/python/test_topi_dense.py +++ b/topi/tests/python/test_topi_dense.py @@ -2,6 +2,7 @@ import numpy as np import tvm import topi +import topi.testing from topi.util import get_const_tuple from tvm.contrib.pickle_memoize import memoize diff --git a/topi/tests/python/test_topi_depthwise_conv2d.py b/topi/tests/python/test_topi_depthwise_conv2d.py index 8ecfdd063a94..5a12d7abb208 100644 --- a/topi/tests/python/test_topi_depthwise_conv2d.py +++ b/topi/tests/python/test_topi_depthwise_conv2d.py @@ -1,5 +1,6 @@ import tvm import topi +import topi.testing import numpy as np from scipy import signal from topi.util import get_const_tuple diff --git a/topi/tests/python/test_topi_depthwise_conv2d_back_input.py b/topi/tests/python/test_topi_depthwise_conv2d_back_input.py index 4280e92bdcb9..663364c5345b 100644 --- a/topi/tests/python/test_topi_depthwise_conv2d_back_input.py +++ b/topi/tests/python/test_topi_depthwise_conv2d_back_input.py @@ -5,6 +5,7 @@ from scipy import signal from topi.util import get_const_tuple from topi.nn.util import get_pad_tuple +import topi.testing from topi.cuda.depthwise_conv2d import schedule_depthwise_conv2d_backward_input_nhwc diff --git 
a/topi/tests/python/test_topi_depthwise_conv2d_back_weight.py b/topi/tests/python/test_topi_depthwise_conv2d_back_weight.py index 5aa65d2c4039..25d08e1b1882 100644 --- a/topi/tests/python/test_topi_depthwise_conv2d_back_weight.py +++ b/topi/tests/python/test_topi_depthwise_conv2d_back_weight.py @@ -1,5 +1,6 @@ import tvm import topi +import topi.testing import numpy as np from tvm.contrib.pickle_memoize import memoize from scipy import signal diff --git a/topi/tests/python/test_topi_math.py b/topi/tests/python/test_topi_math.py index 2c31d0fea62c..dc3c015d4f25 100644 --- a/topi/tests/python/test_topi_math.py +++ b/topi/tests/python/test_topi_math.py @@ -1,6 +1,7 @@ import numpy as np import tvm import topi +import topi.testing from topi import util From e38500f43c66021520a87df301315c26e075465d Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 2 Mar 2018 15:39:52 -0800 Subject: [PATCH 183/948] Update function.py --- python/tvm/_ffi/function.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/tvm/_ffi/function.py b/python/tvm/_ffi/function.py index 90a0c18dec5d..e0f85be6f1a9 100644 --- a/python/tvm/_ffi/function.py +++ b/python/tvm/_ffi/function.py @@ -292,7 +292,6 @@ def _init_api(namespace, target_module_name=None): target_module_name = ( target_module_name if target_module_name else namespace) if namespace.startswith("tvm."): - prefix = namespace[4:] _init_api_prefix(target_module_name, namespace[4:]) else: _init_api_prefix(target_module_name, namespace) From 17fd2ad90f23b8c1e8c8a1150a035c1274d7822c Mon Sep 17 00:00:00 2001 From: Chris Nuernberger Date: Sun, 4 Mar 2018 20:45:06 -0700 Subject: [PATCH 184/948] Small refactor for clarity in arraycopyfromto (#960) --- src/runtime/c_runtime_api.cc | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index 7d65a04bc040..c7fd84362f47 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -413,19 +413,22 @@ int TVMArrayCopyFromTo(TVMArrayHandle from, size_t from_size = GetDataSize(from); size_t to_size = GetDataSize(to); CHECK_EQ(from_size, to_size) - << "TVMArrayCopyFromTo: The size must exactly match"; - TVMContext ctx = from->ctx; - if (ctx.device_type == kDLCPU) { - ctx = to->ctx; - } else { - CHECK(to->ctx.device_type == kDLCPU || - to->ctx.device_type == from->ctx.device_type) - << "Can not copy across different ctx types directly"; - } + << "TVMArrayCopyFromTo: The size must exactly match"; + + CHECK(from->ctx.device_type == to->ctx.device_type + || from->ctx.device_type == kDLCPU + || to->ctx.device_type == kDLCPU) + << "Can not copy across different ctx types directly"; + + // Use the context that is *not* a cpu context to get the correct device + // api manager. + TVMContext ctx = from->ctx.device_type != kDLCPU ? 
from->ctx : to->ctx; + DeviceAPIManager::Get(ctx)->CopyDataFromTo( - from->data, static_cast(from->byte_offset), - to->data, static_cast(to->byte_offset), - from_size, from->ctx, to->ctx, stream); + from->data, static_cast(from->byte_offset), + to->data, static_cast(to->byte_offset), + from_size, from->ctx, to->ctx, stream); + API_END(); } From 5fc0e69df7856ff07fd378117b10628b50eff832 Mon Sep 17 00:00:00 2001 From: nhynes Date: Mon, 5 Mar 2018 10:58:22 -0800 Subject: [PATCH 185/948] SGXify graph runtime (#937) --- dmlc-core | 2 +- src/runtime/graph/graph_runtime.cc | 20 ++++++++------------ 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/dmlc-core b/dmlc-core index 7e84e8b036a3..d3f7fbb53e5b 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 7e84e8b036a3ff5c0104a3da1f4c7eebf94396ec +Subproject commit d3f7fbb53e5b037c0f5bf6bd21871ccc720690cc diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 34d6b3af155a..89bb3a052b0f 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -64,7 +64,11 @@ class GraphRuntime : public ModuleNode { void Init(const std::string& graph_json, tvm::runtime::Module module, TVMContext ctx) { +#ifndef _LIBCPP_SGX_NO_IOSTREAMS std::istringstream is(graph_json); +#else + std::string is = graph_json; +#endif dmlc::JSONReader reader(&is); this->Load(&reader); module_ = module; @@ -198,27 +202,19 @@ class GraphRuntime : public ModuleNode { std::string key, value; reader->BeginObject(); while (reader->NextObjectItem(&key)) { + reader->Read(&value); if (key == "func_name") { - reader->Read(&value); param->func_name = value; bitmask |= 1; } else if (key == "num_inputs") { - reader->Read(&value); - std::istringstream is(value); - is >> param->num_inputs; + param->num_inputs = strtoul(value.c_str(), nullptr, 10); bitmask |= 2; } else if (key == "num_outputs") { - reader->Read(&value); - std::istringstream is(value); - is >> param->num_outputs; + param->num_outputs = strtoul(value.c_str(), nullptr, 10); bitmask |= 4; } else if (key == "flatten_data") { - reader->Read(&value); - std::istringstream is(value); - is >> param->flatten_data; + param->flatten_data = strtoul(value.c_str(), nullptr, 10); bitmask |= 8; - } else { - reader->Read(&value); } } CHECK_EQ(bitmask, 1|2|4|8) << "invalid format"; From 9b62760f3f5a37035e460ba15a67dab035cde308 Mon Sep 17 00:00:00 2001 From: eqy Date: Mon, 5 Mar 2018 13:00:22 -0800 Subject: [PATCH 186/948] prevent starting of RPC server w/o RPC support (#962) * prevent starting of RPC server w/o RPC support * fix indent --- python/tvm/contrib/rpc.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/tvm/contrib/rpc.py b/python/tvm/contrib/rpc.py index 831481509c76..56f9fb1b4058 100644 --- a/python/tvm/contrib/rpc.py +++ b/python/tvm/contrib/rpc.py @@ -182,6 +182,11 @@ def __init__(self, use_popen=False, exclusive=False, key=""): + try: + if _ServerLoop is None: + raise RuntimeError("Please compile with USE_RPC=1") + except NameError: + raise RuntimeError("Please compile with USE_RPC=1") self.host = host self.port = port self.libs = [] From 678b3b3f71eedb68bb611c2c540539612dc914eb Mon Sep 17 00:00:00 2001 From: nhynes Date: Tue, 6 Mar 2018 08:57:18 -0800 Subject: [PATCH 187/948] Add SGX runtime (#963) --- apps/sgx/tvm_runtime_pack.cc | 8 +------- sgx/sgx_runtime.cc | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 7 deletions(-) create mode 100644 sgx/sgx_runtime.cc diff --git a/apps/sgx/tvm_runtime_pack.cc 
b/apps/sgx/tvm_runtime_pack.cc index 52aa372f7789..709386b78931 100644 --- a/apps/sgx/tvm_runtime_pack.cc +++ b/apps/sgx/tvm_runtime_pack.cc @@ -5,13 +5,7 @@ * Please refer to the Makefile (rule lib/tvm_runtime_pack.o) for how to build. * */ -#include "../../src/runtime/c_runtime_api.cc" -#include "../../src/runtime/cpu_device_api.cc" -#include "../../src/runtime/workspace_pool.cc" -#include "../../src/runtime/module_util.cc" -#include "../../src/runtime/module.cc" -#include "../../src/runtime/registry.cc" -#include "../../src/runtime/system_lib_module.cc" +#include "../../sgx/sgx_runtime.cc" #ifndef _LIBCPP_SGX_CONFIG #include "../../src/runtime/file_util.cc" #endif diff --git a/sgx/sgx_runtime.cc b/sgx/sgx_runtime.cc new file mode 100644 index 000000000000..0bc6a3189e7a --- /dev/null +++ b/sgx/sgx_runtime.cc @@ -0,0 +1,25 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file sgx_runtime.cc + */ +#include "../../src/runtime/c_runtime_api.cc" +#include "../../src/runtime/cpu_device_api.cc" +#include "../../src/runtime/workspace_pool.cc" +#include "../../src/runtime/module_util.cc" +#include "../../src/runtime/module.cc" +#include "../../src/runtime/registry.cc" +#include "../../src/runtime/system_lib_module.cc" + +// dummy parallel runtime +int TVMBackendParallelLaunch( + FTVMParallelLambda flambda, + void* cdata, + int num_task) { + TVMAPISetLastError("Parallel is not (yet) supported in SGX runtime"); + return -1; +} + +int TVMBackendParallelBarrier(int task_id, TVMParallelGroupEnv* penv) { + return 0; +} + From 0c8c97ea8aa5b6cf308d2bb0f8b1cad751462eac Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Tue, 6 Mar 2018 17:51:13 -0800 Subject: [PATCH 188/948] MPS conv (#822) --- make/contrib/mps.mk | 10 +- python/tvm/contrib/mps.py | 44 ++++++- python/tvm/contrib/rpc_proxy.py | 2 + src/contrib/mps/conv.mm | 154 +++++++++++++++++++++++++ src/contrib/mps/gemm.mm | 158 +++++++++++++------------- src/contrib/mps/mps_utils.cc | 58 ---------- src/contrib/mps/mps_utils.h | 15 ++- src/contrib/mps/mps_utils.mm | 80 +++++++++++++ src/runtime/metal/metal_device_api.mm | 12 +- tests/python/contrib/test_mps.py | 90 +++++++++++---- 10 files changed, 448 insertions(+), 175 deletions(-) create mode 100644 src/contrib/mps/conv.mm delete mode 100644 src/contrib/mps/mps_utils.cc create mode 100644 src/contrib/mps/mps_utils.mm diff --git a/make/contrib/mps.mk b/make/contrib/mps.mk index 501e62b2a671..0fe8a7f12889 100644 --- a/make/contrib/mps.mk +++ b/make/contrib/mps.mk @@ -1,4 +1,4 @@ -MPS_CONTRIB_SRC = $(wildcard src/contrib/mps/*.mm, src/contrib/mps/*.cc) +MPS_CONTRIB_SRC = $(wildcard src/contrib/mps/*.mm) MPS_CONTRIB_OBJ = $(patsubst src/%.mm, build/%.o, $(MPS_CONTRIB_SRC)) ifeq ($(USE_MPS), 1) @@ -6,9 +6,15 @@ FRAMEWORKS += -framework MetalPerformanceShaders CFLAGS += ADD_LDFLAGS += RUNTIME_DEP += $(MPS_CONTRIB_OBJ) +CONTRIB_OBJ += $(MPS_CONTRIB_OBJ) endif -build/contrib/mps/%.o: src/contrib/mps/%.mm src/contrib/mps/%.cc +build/contrib/mps/%.o: src/contrib/mps/%.mm + @mkdir -p $(@D) + $(CXX) $(OBJCFLAGS) $(CFLAGS) -MM -MT build/contrib/mps/$*.o $< >build/contrib/mps/$*.d + $(CXX) $(OBJCFLAGS) -c $(CFLAGS) -c $< -o $@ + +build/contrib/mps/%.o: src/contrib/mps/%.cc @mkdir -p $(@D) $(CXX) $(OBJCFLAGS) $(CFLAGS) -MM -MT build/contrib/mps/$*.o $< >build/contrib/mps/$*.d $(CXX) $(OBJCFLAGS) -c $(CFLAGS) -c $< -o $@ diff --git a/python/tvm/contrib/mps.py b/python/tvm/contrib/mps.py index d214d4b93631..43b3b9fb48db 100644 --- a/python/tvm/contrib/mps.py +++ b/python/tvm/contrib/mps.py @@ -1,9 +1,9 @@ 
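# --- Editor's sketch; illustration only, not part of the diff below. The MPS wrappers
# added in this file follow the extern-op pattern: declare an output tensor whose body
# is a packed call into the routine registered from gemm.mm / conv.mm. A minimal,
# hand-written equivalent of what mps.matmul builds (the shapes are arbitrary):
import tvm

A = tvm.placeholder((1024, 128), name="A")
B = tvm.placeholder((128, 256), name="B")
C = tvm.extern(
    (1024, 256), [A, B],
    lambda ins, outs: tvm.call_packed(
        "tvm.contrib.mps.matmul", ins[0], ins[1], outs[0], False, False),
    name="C")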
"""External function interface to MPS libraroes.""" from __future__ import absolute_import as _abs - from .. import api as _api from .. import intrin as _intrin +# pylint: disable=C0103,W0612 def matmul(lhs, rhs, transa=False, transb=False): """Create an extern op that compute matrix mult of A and rhs with CrhsLAS @@ -26,10 +26,46 @@ def matmul(lhs, rhs, transa=False, transb=False): C : Tensor The result tensor. """ - m = lhs.shape[0] - n = rhs.shape[1] + m = lhs.shape[0] if transa is False else lhs.shape[1] + n = rhs.shape[1] if transb is False else rhs.shape[0] + if transa: + m = b + if transb: + n = c return _api.extern( - (n, m), [lhs, rhs], + (m, n), [lhs, rhs], lambda ins, outs: _intrin.call_packed( "tvm.contrib.mps.matmul", ins[0], ins[1], outs[0], transa, transb), name="C") + +def conv2d(data, weight, pad='SAME', stride=1): + """ + Create an extern op that compute data * weight and return result in output + + Parameters: + ---------- + data: Tensor + The input data, format NHWC + weight: Tensor + The conv weight, format output_feature * kH * kW * input_feature + pad: str + Padding method, 'SAME' or 'VALID' + stride: int + convolution stride + + Returns + ------- + output: Tensor + The result tensor + """ + n, hi, wi, ci = data.shape + co, kh, kw, ciw = weight.shape + padding = 0 if pad == 'SAME' else 1 + ho = hi // stride + wo = wi // stride + + return _api.extern( + (n, ho, wo, co), [data, weight], + lambda ins, outs: _intrin.call_packed( + "tvm.contrib.mps.conv2d", ins[0], ins[1], outs[0], padding, stride), + name="C") diff --git a/python/tvm/contrib/rpc_proxy.py b/python/tvm/contrib/rpc_proxy.py index fe289935e7cb..9634c258b39f 100644 --- a/python/tvm/contrib/rpc_proxy.py +++ b/python/tvm/contrib/rpc_proxy.py @@ -70,6 +70,7 @@ def on_start(self): ProxyServerHandler.current.handler_ready(self) def on_data(self, message): + """on data""" assert isinstance(message, bytes) if self.forward_proxy: self.forward_proxy.send_data(message) @@ -98,6 +99,7 @@ def close_pair(self): self.close() def on_close_event(self): + """on close event""" assert not self._done logging.info("RPCProxy:on_close %s ...", self.name()) self._done = True diff --git a/src/contrib/mps/conv.mm b/src/contrib/mps/conv.mm new file mode 100644 index 000000000000..fa279bd5cc95 --- /dev/null +++ b/src/contrib/mps/conv.mm @@ -0,0 +1,154 @@ +#include "mps_utils.h" + +namespace tvm { +namespace contrib { + +using namespace runtime; + +TVM_REGISTER_GLOBAL("tvm.contrib.mps.buffer2img") +.set_body([](TVMArgs args, TVMRetValue *ret) { + DLTensor *buf = args[0]; + DLTensor *img = args[1]; + // copy to temp + id mtlbuf = (__bridge id)(buf->data); + MetalThreadEntry *entry_ptr = MetalThreadEntry::ThreadLocal(); + runtime::metal::MetalThreadEntry *rt = + runtime::metal::MetalThreadEntry::ThreadLocal(); + id dev = entry_ptr->metal_api->GetDevice(buf->ctx); + id temp = rt->GetTempBuffer(buf->ctx, [mtlbuf length]); + entry_ptr->metal_api->CopyDataFromTo( + (__bridge void *)mtlbuf, 0, (__bridge void *)temp, 0, [mtlbuf length], + buf->ctx, buf->ctx, nullptr + ); + + MPSImageDescriptor *desc = [MPSImageDescriptor + imageDescriptorWithChannelFormat:MPSImageFeatureChannelFormatFloat32 + width:buf->shape[2] + height:buf->shape[1] + featureChannels:buf->shape[3]]; + + MPSImage *mpsimg = entry_ptr->AllocMPSImage(dev, desc); + + [mpsimg writeBytes:[temp contents] + dataLayout:MPSDataLayoutHeightxWidthxFeatureChannels + imageIndex:0]; + + img->data = (__bridge void *)mpsimg; + + [mpsimg readBytes:[temp contents] + 
dataLayout:MPSDataLayoutHeightxWidthxFeatureChannels + imageIndex:0]; + + }); + +TVM_REGISTER_GLOBAL("tvm.contrib.mps.img2buffer") +.set_body([](TVMArgs args, TVMRetValue *ret) { + DLTensor *img = args[0]; + DLTensor *buf = args[1]; + id mtlbuf = (__bridge id)(buf->data); + MPSImage *mpsimg = (__bridge MPSImage *)(img->data); + MetalThreadEntry *entry_ptr = MetalThreadEntry::ThreadLocal(); + runtime::metal::MetalThreadEntry *rt = + runtime::metal::MetalThreadEntry::ThreadLocal(); + id temp = rt->GetTempBuffer(buf->ctx, [mtlbuf length]); + + [mpsimg readBytes:[temp contents] + dataLayout:MPSDataLayoutHeightxWidthxFeatureChannels + imageIndex:0]; + + entry_ptr->metal_api->CopyDataFromTo( + (__bridge void *)temp, 0, (__bridge void *)mtlbuf, 0, [mtlbuf length], + buf->ctx, buf->ctx, nullptr); + + }); + +TVM_REGISTER_GLOBAL("tvm.contrib.mps.conv2d") +.set_body([](TVMArgs args, TVMRetValue *ret) { + // MPS-NHWC + DLTensor *data = args[0]; + DLTensor *weight = args[1]; + DLTensor *output = args[2]; + int pad = args[3]; + int stride = args[4]; + + CHECK_EQ(data->ndim, 4); + CHECK_EQ(weight->ndim, 4); + CHECK_EQ(output->ndim, 4); + CHECK(output->strides == nullptr); + CHECK(weight->strides == nullptr); + CHECK(data->strides == nullptr); + + CHECK_EQ(data->shape[0], 1); + CHECK_EQ(output->shape[0], 1); + + int oCh = weight->shape[0]; + int kH = weight->shape[1]; + int kW = weight->shape[2]; + int iCh = weight->shape[3]; + + auto f_buf2img = runtime::Registry::Get("tvm.contrib.mps.buffer2img"); + auto f_img2buf = runtime::Registry::Get("tvm.contrib.mps.img2buffer"); + // Get Metal device API + MetalThreadEntry *entry_ptr = MetalThreadEntry::ThreadLocal(); + runtime::metal::MetalThreadEntry *rt = + runtime::metal::MetalThreadEntry::ThreadLocal(); + id dev = entry_ptr->metal_api->GetDevice(data->ctx); + id queue = + entry_ptr->metal_api->GetCommandQueue(data->ctx); + id cb = [queue commandBuffer]; + // data to MPSImage + DLTensor tmp_in; + (*f_buf2img)(data, &tmp_in); + MPSImage *tempA = (__bridge MPSImage *)tmp_in.data; + // weight to temp memory + id bufB = (__bridge id)(weight->data); + id tempB = rt->GetTempBuffer(weight->ctx, [bufB length]); + entry_ptr->metal_api->CopyDataFromTo( + (__bridge void *)bufB, 0, (__bridge void *)tempB, 0, [bufB length], + weight->ctx, weight->ctx, nullptr); + float *ptr_w = (float *)[tempB contents]; + // output to MPSImage + DLTensor tmp_out; + (*f_buf2img)(output, &tmp_out); + MPSImage *tempC = (__bridge MPSImage *)tmp_out.data; + // conv desc + + MPSCNNConvolutionDescriptor *conv_desc = [MPSCNNConvolutionDescriptor + cnnConvolutionDescriptorWithKernelWidth:kW + kernelHeight:kH + inputFeatureChannels:iCh + outputFeatureChannels:oCh]; + [conv_desc setStrideInPixelsX:stride]; + [conv_desc setStrideInPixelsY:stride]; + + MPSCNNConvolution *conv = + [[MPSCNNConvolution alloc] initWithDevice:dev + convolutionDescriptor:conv_desc + kernelWeights:ptr_w + biasTerms:nil + flags:MPSCNNConvolutionFlagsNone]; + if (pad == 0) { + conv.padding = [MPSNNDefaultPadding + paddingWithMethod:MPSNNPaddingMethodAddRemainderToTopLeft | + MPSNNPaddingMethodAlignCentered | + MPSNNPaddingMethodSizeSame]; + } else if (pad == 1) { + conv.padding = [MPSNNDefaultPadding + paddingWithMethod:MPSNNPaddingMethodAddRemainderToTopLeft | + MPSNNPaddingMethodAlignCentered | + MPSNNPaddingMethodSizeValidOnly]; + } + [conv encodeToCommandBuffer:cb sourceImage:tempA destinationImage:tempC]; + + [cb commit]; + id encoder = [cb blitCommandEncoder]; + [encoder synchronizeResource:tempC.texture]; + [encoder 
endEncoding]; + [cb waitUntilCompleted]; + + (*f_img2buf)(&tmp_out, output); + + }); + +} // namespace contrib +} // namespace tvm diff --git a/src/contrib/mps/gemm.mm b/src/contrib/mps/gemm.mm index f877cb8b0ea1..1d92ad2851d0 100644 --- a/src/contrib/mps/gemm.mm +++ b/src/contrib/mps/gemm.mm @@ -1,9 +1,5 @@ -#include "../../runtime/metal/metal_common.h" -#include -#include -#include -#include -#include + +#include "mps_utils.h" namespace tvm { namespace contrib { @@ -11,83 +7,81 @@ using namespace runtime; TVM_REGISTER_GLOBAL("tvm.contrib.mps.matmul") - .set_body([](TVMArgs args, TVMRetValue *ret) { - DLTensor *A = args[0]; - DLTensor *B = args[1]; - DLTensor *C = args[2]; - bool transa = args[3]; - bool transb = args[4]; - // call gemm for simple compact code. - CHECK_EQ(A->ndim, 2); - CHECK_EQ(B->ndim, 2); - CHECK_EQ(C->ndim, 2); - CHECK(C->strides == nullptr); - CHECK(B->strides == nullptr); - CHECK(A->strides == nullptr); - CHECK(TypeMatch(A->dtype, kDLFloat, 32)); - CHECK(TypeMatch(B->dtype, kDLFloat, 32)); - CHECK(TypeMatch(C->dtype, kDLFloat, 32)); - // Get Metal device API - MetalThreadEntry* entry_ptr = MetalThreadEntry::ThreadLocal(); - CHECK_EQ(A->ctx, B->ctx); - CHECK_EQ(A->ctx, C->ctx); - id dev = entry_ptr->metal_api->GetDevice(A->ctx); - id queue = entry_ptr->metal_api->GetCommandQueue(A->ctx); - id cb = [queue commandBuffer]; - NSUInteger M = A->shape[0 + transa?1:0]; - NSUInteger N = B->shape[1 - transb?1:0]; - NSUInteger K = B->shape[0 + transb?1:0]; - CHECK_EQ(A->shape[1-transa?1:0], K); - // mps a - MPSDataType dtype = MPSType::DLTypeToMPSType(A->dtype); - MPSMatrixDescriptor *descA = [MPSMatrixDescriptor - matrixDescriptorWithDimensions:M - columns:K - rowBytes:M * sizeof(dtype) - dataType:dtype]; - id bufA = (__bridge id)(A->data); - MPSMatrix *matrixA = - [[MPSMatrix alloc] initWithBuffer:bufA descriptor:descA]; - // mps b - MPSMatrixDescriptor *descB = [MPSMatrixDescriptor - matrixDescriptorWithDimensions:K - columns:N - rowBytes:K * sizeof(dtype) - dataType:dtype]; - id bufB = (__bridge id)(B->data); - MPSMatrix *matrixB = - [[MPSMatrix alloc] initWithBuffer:bufB descriptor:descB]; - // mps c - MPSMatrixDescriptor *descC = [MPSMatrixDescriptor - matrixDescriptorWithDimensions:M - columns:N - rowBytes:M * sizeof(dtype) - dataType:dtype]; - id bufC = (__bridge id)(C->data); - MPSMatrix *matrixC = - [[MPSMatrix alloc] initWithBuffer:bufC descriptor:descC]; - // kernel +.set_body([](TVMArgs args, TVMRetValue *ret) { + DLTensor *A = args[0]; + DLTensor *B = args[1]; + DLTensor *C = args[2]; + bool transa = args[3]; + bool transb = args[4]; + // call gemm for simple compact code. + CHECK_EQ(A->ndim, 2); + CHECK_EQ(B->ndim, 2); + CHECK_EQ(C->ndim, 2); + CHECK(C->strides == nullptr); + CHECK(B->strides == nullptr); + CHECK(A->strides == nullptr); + CHECK(TypeMatch(A->dtype, kDLFloat, 32)); + CHECK(TypeMatch(B->dtype, kDLFloat, 32)); + CHECK(TypeMatch(C->dtype, kDLFloat, 32)); + // Get Metal device API + MetalThreadEntry *entry_ptr = MetalThreadEntry::ThreadLocal(); + // CHECK_EQ(A->ctx, B->ctx); + // CHECK_EQ(A->ctx, C->ctx); + id dev = entry_ptr->metal_api->GetDevice(A->ctx); + id queue = entry_ptr->metal_api->GetCommandQueue(A->ctx); + id cb = [queue commandBuffer]; + NSUInteger M = A->shape[0 + (transa ? 1 : 0)]; + NSUInteger N = B->shape[1 - (transb ? 1 : 0)]; + NSUInteger K = B->shape[0 + (transb ? 1 : 0)]; + + CHECK_EQ(A->shape[1 - (transa ? 
1 : 0)], K); + // mps a + MPSDataType dtype = MPSType::DLTypeToMPSType(A->dtype); + MPSMatrixDescriptor *descA = [MPSMatrixDescriptor + matrixDescriptorWithDimensions:M + columns:K + rowBytes:K * sizeof(MPSDataTypeFloat32) + dataType:MPSDataTypeFloat32]; + id bufA = (__bridge id)(A->data); + MPSMatrix *matrixA = + [[MPSMatrix alloc] initWithBuffer:bufA descriptor:descA]; + // mps b + MPSMatrixDescriptor *descB = + [MPSMatrixDescriptor matrixDescriptorWithDimensions:K + columns:N + rowBytes:N * sizeof(dtype) + dataType:dtype]; + id bufB = (__bridge id)(B->data); + MPSMatrix *matrixB = + [[MPSMatrix alloc] initWithBuffer:bufB descriptor:descB]; + // mps c + MPSMatrixDescriptor *descC = + [MPSMatrixDescriptor matrixDescriptorWithDimensions:M + columns:N + rowBytes:N * sizeof(dtype) + dataType:dtype]; + id bufC = (__bridge id)(C->data); + MPSMatrix *matrixC = + [[MPSMatrix alloc] initWithBuffer:bufC descriptor:descC]; + // kernel - MPSMatrixMultiplication *mul_obj = [[MPSMatrixMultiplication alloc] init]; - MPSMatrixMultiplication *sgemm = [mul_obj initWithDevice:dev - transposeLeft:transa - transposeRight:transb - resultRows:M - resultColumns:N - interiorColumns:K - alpha:1.0f - beta:0.0f]; - CHECK(sgemm != nil); - [sgemm encodeToCommandBuffer:cb - leftMatrix:matrixA - rightMatrix:matrixB - resultMatrix:matrixC]; - [cb commit]; - [mul_obj dealloc]; - [matrixA dealloc]; - [matrixB dealloc]; - [matrixC dealloc]; - }); + MPSMatrixMultiplication *mul_obj = [[MPSMatrixMultiplication alloc] init]; + MPSMatrixMultiplication *sgemm = [mul_obj initWithDevice:dev + transposeLeft:transa + transposeRight:transb + resultRows:M + resultColumns:N + interiorColumns:K + alpha:1.0f + beta:0.0f]; + CHECK(sgemm != nil); + [sgemm encodeToCommandBuffer:cb + leftMatrix:matrixA + rightMatrix:matrixB + resultMatrix:matrixC]; + [cb commit]; + + }); } // namespace contrib } // namespace tvm diff --git a/src/contrib/mps/mps_utils.cc b/src/contrib/mps/mps_utils.cc deleted file mode 100644 index 2e3ca6218bb4..000000000000 --- a/src/contrib/mps/mps_utils.cc +++ /dev/null @@ -1,58 +0,0 @@ -/*! 
- * Copyright (c) 2017 by Contributors - * \file Use external mps utils function - */ -#include "mps_utils.h" -#include -#include -#include - - -namespace tvm { -namespace contrib { - -// MPS Data Type -MPSDataType MPSType::DLTypeToMPSType(const DLDataType &dtype) { - switch (dtype.code) { - case kDLInt: - if (dtype.bits == 8 && dtype.lanes == 1) return MPSDataTypeInt8; - else if (dtype.bits == 16 && dtype.lanes == 1) return MPSDataTypeInt16; - else - LOG(FATAL) << "Unsupported type"; - break; - case kDLUInt: - if (dtype.bits == 8 && dtype.lanes == 1) return MPSDataTypeUInt8; - else if (dtype.bits == 16 && dtype.lanes == 1) return MPSDataTypeUInt16; - else if (dtype.bits == 32 && dtype.lanes == 1) return MPSDataTypeUInt32; - LOG(FATAL) << "Unsupported type"; - break; - case kDLFloat: - if (dtype.bits == 16 && dtype.lanes == 1) return MPSDataTypeFloat16; - else if (dtype.bits == 32 && dtype.lanes == 1) return MPSDataTypeFloat32; - else - LOG(FATAL) << "Unsupported type"; - break; - default: - LOG(FATAL) << "Unsupported type"; - } -} - -// MetalThreadEntry - -MetalThreadEntry::MetalThreadEntry() { - auto func = runtime::Registry::Get("device_api.metal"); - void *ret = (*func)(); - metal_api = static_cast(ret); -} - -MetalThreadEntry::~MetalThreadEntry() { -} - -typedef dmlc::ThreadLocalStore MetalThreadStore; - -MetalThreadEntry* MetalThreadEntry::ThreadLocal() { - return MetalThreadStore::Get(); -} - -} // namespace contrib -} // namespace tvm diff --git a/src/contrib/mps/mps_utils.h b/src/contrib/mps/mps_utils.h index 91336ce44edd..f07156a252a3 100644 --- a/src/contrib/mps/mps_utils.h +++ b/src/contrib/mps/mps_utils.h @@ -6,11 +6,15 @@ #ifndef TVM_CONTRIB_MPS_MPS_UTILS_H_ #define TVM_CONTRIB_MPS_MPS_UTILS_H_ +#import #include +#include #include +#include +#include +#include #include "../../runtime/metal/metal_common.h" - namespace tvm { namespace contrib { @@ -19,12 +23,15 @@ struct MPSType { static MPSDataType DLTypeToMPSType(const DLDataType &dtype); }; // struct MPSType - struct MetalThreadEntry { MetalThreadEntry(); ~MetalThreadEntry(); - runtime::MetalWorkspace *metal_api{nullptr}; - static MetalThreadEntry* ThreadLocal(); + MPSImage *AllocMPSImage(id dev, MPSImageDescriptor *desc); + MPSTemporaryImage *AllocTempImage(id cb, + MPSImageDescriptor *desc); + runtime::metal::MetalWorkspace *metal_api{nullptr}; + static MetalThreadEntry *ThreadLocal(); + std::vector img_table; }; // MetalThreadEntry } // namespace contrib diff --git a/src/contrib/mps/mps_utils.mm b/src/contrib/mps/mps_utils.mm new file mode 100644 index 000000000000..bed8278a1d50 --- /dev/null +++ b/src/contrib/mps/mps_utils.mm @@ -0,0 +1,80 @@ +/*! 
+ * Copyright (c) 2017 by Contributors + * \file Use external mps utils function + */ +#include "mps_utils.h" + +namespace tvm { +namespace contrib { + +// MPS Data Type +MPSDataType MPSType::DLTypeToMPSType(const DLDataType &dtype) { + switch (dtype.code) { + case kDLInt: + if (dtype.bits == 8 && dtype.lanes == 1) + return MPSDataTypeInt8; + else if (dtype.bits == 16 && dtype.lanes == 1) + return MPSDataTypeInt16; + else + LOG(FATAL) << "Unsupported type"; + break; + case kDLUInt: + if (dtype.bits == 8 && dtype.lanes == 1) + return MPSDataTypeUInt8; + else if (dtype.bits == 16 && dtype.lanes == 1) + return MPSDataTypeUInt16; + else if (dtype.bits == 32 && dtype.lanes == 1) + return MPSDataTypeUInt32; + LOG(FATAL) << "Unsupported type"; + break; + case kDLFloat: + if (dtype.bits == 16 && dtype.lanes == 1) + return MPSDataTypeFloat16; + else if (dtype.bits == 32 && dtype.lanes == 1) + return MPSDataTypeFloat32; + else + LOG(FATAL) << "Unsupported type"; + break; + default: + LOG(FATAL) << "Unsupported type"; + } + return MPSDataTypeFloat32; +} + +// MetalThreadEntry + +MPSImage *MetalThreadEntry::AllocMPSImage(id dev, + MPSImageDescriptor *desc) { + MPSImage *mpsimg = [[MPSImage alloc] initWithDevice:dev imageDescriptor:desc]; + img_table.push_back(mpsimg); + return mpsimg; +} + +MPSTemporaryImage *MetalThreadEntry::AllocTempImage(id cb, + MPSImageDescriptor *desc) { + MPSTemporaryImage *mpsimg = + [MPSTemporaryImage temporaryImageWithCommandBuffer:cb + imageDescriptor:desc]; + return mpsimg; +} + +MetalThreadEntry::MetalThreadEntry() { + auto func = runtime::Registry::Get("device_api.metal"); + void *ret = (*func)(); + metal_api = static_cast(ret); +} + +MetalThreadEntry::~MetalThreadEntry() { + for (int i = 0; i < img_table.size(); ++i) { + [img_table[i] dealloc]; + } +} + +typedef dmlc::ThreadLocalStore MetalThreadStore; + +MetalThreadEntry *MetalThreadEntry::ThreadLocal() { + return MetalThreadStore::Get(); +} + +} // namespace contrib +} // namespace tvm diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm index d87a9eac4f72..1768d6334b5c 100644 --- a/src/runtime/metal/metal_device_api.mm +++ b/src/runtime/metal/metal_device_api.mm @@ -126,10 +126,18 @@ int GetWarpSize(id dev) { TVMContext ctx, size_t nbytes, size_t alignment, TVMType type_hint) { this->Init(); id dev = GetDevice(ctx); - // allocate buffer in GPU only mode. + // GPU memory only + MTLResourceOptions storage_mode = MTLResourceStorageModePrivate; + /* + #if TARGET_OS_IPHONE + storage_mode = MTLResourceStorageModeShared; + #else + storage_mode = MTLResourceStorageModeManaged; + #endif + */ id buf = [ dev newBufferWithLength:nbytes - options:MTLResourceStorageModePrivate]; + options:storage_mode]; CHECK(buf != nil); return (__bridge void*)([buf retain]); } diff --git a/tests/python/contrib/test_mps.py b/tests/python/contrib/test_mps.py index 68dcb135e908..25437605525b 100644 --- a/tests/python/contrib/test_mps.py +++ b/tests/python/contrib/test_mps.py @@ -2,39 +2,83 @@ import numpy as np from tvm.contrib import mps -def test_matmul_add(): +def test_matmul(): + if not tvm.module.enabled("metal"): + print("skip because %s is not enabled..." 
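# --- Editor's note (sketch, not part of the patch): the guard idiom the rewritten tests
# below rely on. tvm.module.enabled checks that the Metal runtime was compiled in, and
# tvm.get_global_func(name, True) returns None instead of raising when the packed
# function is missing, so both MPS entry points can be probed before running anything.
import tvm

def mps_available():
    return bool(tvm.module.enabled("metal")
                and tvm.get_global_func("tvm.contrib.mps.matmul", True)
                and tvm.get_global_func("tvm.contrib.mps.conv2d", True))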
% "metal") + return n = 1024 l = 128 - m = 235 - bias = tvm.var('bias', dtype=tvm.float32) + m = 256 A = tvm.placeholder((n, l), name='A') B = tvm.placeholder((l, m), name='B') - C1 = mps.matmul(A, B) - C2 = mps.matmul(B, A, True, True) - D1 = tvm.compute(C1.shape, lambda i, j: C1[i,j] + bias, name="D1") - D2 = tvm.compute(C2.shape, lambda i, j: C2[i,j] + bias, name="D2") - s1 = tvm.create_schedule(D1.op) - s2 = tvm.create_schedule(D2.op) - - def verify(A, B, D, s, bias, target="llvm"): - if not tvm.module.enabled(target): - print("skip because %s is not enabled..." % target) - return + C = mps.matmul(A, B) + D = tvm.compute( + C.shape, + lambda *i: C(*i) + 1. + ) + s = tvm.create_schedule(D.op) + yo, xo = D.op.axis + block_y = tvm.thread_axis("blockIdx.y") + block_x = tvm.thread_axis("blockIdx.x") + thread_y = tvm.thread_axis("threadIdx.y") + thread_x = tvm.thread_axis("threadIdx.x") + by, ty = s[D].split(yo, factor=16) + bx, tx = s[D].split(xo, factor=16) + s[D].bind(by, block_y) + s[D].bind(bx, block_x) + s[D].bind(ty, thread_y) + s[D].bind(tx, thread_x) + + + + def verify(A, B, D, s, target="metal"): if not tvm.get_global_func("tvm.contrib.mps.matmul", True): print("skip because extern function is not avalable") return - ctx = tvm.cpu(0) - f = tvm.build(s, [A, B, D, bias], target) + ctx = tvm.metal(0) + f = tvm.build(s, [A, B, D], "metal") a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx) - d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx) - bb = 10.0 - f(a, b, d, bb) + c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) + f(a, b, c) np.testing.assert_allclose( - d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + bb, rtol=1e-5) - verify(A, B, D1, s1, bias) - verify(A, B, D2, s2, bias) + c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 1, rtol=1e-5) + verify(A, B, D, s) + +def test_conv2d(): + if not tvm.module.enabled("metal"): + print("skip because %s is not enabled..." % "metal") + return + n = 1 + h = 14 + w = 14 + ci = 2 + co = 4 + kh = 3 + kw = 3 + stride = 2 + A = tvm.placeholder((n, h, w, ci), name="x") + B = tvm.placeholder((co, kh, kw, ci), name="w") + C = mps.conv2d(A, B, 'SAME', 2) + s1 = tvm.create_schedule(C.op) + + def verify(A, B, C, target="llvm"): + if not tvm.get_global_func("tvm.contrib.mps.conv2d", True): + print("skip because extern function is not avalable") + return + ctx = tvm.metal(0) + f = tvm.build(s1, [A, B, C], "metal") + a = tvm.nd.array(np.random.uniform(size=(n, h, w, ci)).astype(A.dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=(co, kh, kw, ci)).astype(B.dtype), ctx) + c = tvm.nd.array(np.zeros((n, h // stride, w // stride, co), dtype=C.dtype), ctx) + f(a, b, c) + # print(c.asnumpy()) + # print(c.shape) + + verify(A, B, C, s1) if __name__ == "__main__": - test_matmul_add() + #test_matmul() + test_conv2d() + From ac771c19caeffe8e7f69cceb5300a0fc3c3ff304 Mon Sep 17 00:00:00 2001 From: libing4752 Date: Thu, 8 Mar 2018 12:04:21 +0800 Subject: [PATCH 189/948] enhance access_ptr that args can support Expr (#970) --- include/tvm/buffer.h | 2 +- python/tvm/schedule.py | 29 +++++++++++++++++++++-- src/lang/buffer.cc | 2 +- tests/python/unittest/test_lang_buffer.py | 9 +++++++ 4 files changed, 38 insertions(+), 4 deletions(-) diff --git a/include/tvm/buffer.h b/include/tvm/buffer.h index d737341e1c0e..8b04bf550cb2 100644 --- a/include/tvm/buffer.h +++ b/include/tvm/buffer.h @@ -55,7 +55,7 @@ class Buffer : public NodeRef { * \param offset The offset of ptr. 
*/ TVM_DLL Expr access_ptr(int access_mask, Type ptr_type = Handle(), - int content_lanes = 1, int offset = 0) const; + int content_lanes = 1, Expr offset = make_const(Int(32), 0)) const; /*! * \brief Create an Expr that does a vector load at begin index. * \param begin The beginning index diff --git a/python/tvm/schedule.py b/python/tvm/schedule.py index b04945292adf..236570c2417c 100644 --- a/python/tvm/schedule.py +++ b/python/tvm/schedule.py @@ -2,12 +2,34 @@ from __future__ import absolute_import as _abs from ._ffi.base import string_types from ._ffi.node import NodeBase, register_node -from ._ffi.function import _init_api +from ._ffi.node import convert_to_node as _convert_to_node +from ._ffi.function import _init_api, Function +from ._ffi.function import convert_to_tvm_func as _convert_tvm_func from . import _api_internal from . import tensor as _tensor from . import expr as _expr from . import container as _container +def convert(value): + """Convert value to TVM node or function. + + Parameters + ---------- + value : python value + + Returns + ------- + tvm_val : Node or Function + Converted value in TVM + """ + if isinstance(value, (Function, NodeBase)): + return value + + if callable(value): + return _convert_tvm_func(value) + + return _convert_to_node(value) + @register_node class Buffer(NodeBase): """Symbolic data buffer in TVM. @@ -45,7 +67,7 @@ def access_ptr(self, access_mask, ptr_type="handle", content_lanes=1, offset=0): The number of lanes for the data type. This value is greater than one for vector types. - offset: int, optional + offset: Expr, optional The offset of pointer. We can use it to offset by the number of elements from the address of ptr. @@ -60,6 +82,8 @@ def access_ptr(self, access_mask, ptr_type="handle", content_lanes=1, offset=0): buffer.access_ptr(Buffer.READ | Buffer.WRITE) # Get access ptr for read/write with str flag buffer.access_ptr("rw") + # Get access ptr for read with offset + buffer.access_ptr("r", offset = 100) """ if isinstance(access_mask, string_types): mask = 0 @@ -71,6 +95,7 @@ def access_ptr(self, access_mask, ptr_type="handle", content_lanes=1, offset=0): else: raise ValueError("Unknown access_mask %s" % access_mask) access_mask = mask + offset = convert(offset) return _api_internal._BufferAccessPtr(self, access_mask, ptr_type, content_lanes, offset) diff --git a/src/lang/buffer.cc b/src/lang/buffer.cc index 07e455e25384..39566df45ae6 100644 --- a/src/lang/buffer.cc +++ b/src/lang/buffer.cc @@ -335,7 +335,7 @@ Buffer Buffer::MakeSlice(Array begins, Array extents) const { 0); } -Expr Buffer::access_ptr(int access_mask, Type ptr_type, int content_lanes, int offset) const { +Expr Buffer::access_ptr(int access_mask, Type ptr_type, int content_lanes, Expr offset) const { const BufferNode* self = operator->(); Expr e_dtype; Expr extent; diff --git a/tests/python/unittest/test_lang_buffer.py b/tests/python/unittest/test_lang_buffer.py index fe0f1f0b759c..a5a8f5d065a6 100644 --- a/tests/python/unittest/test_lang_buffer.py +++ b/tests/python/unittest/test_lang_buffer.py @@ -31,6 +31,15 @@ def test_buffer_access_ptr_offset(): offset = tvm.ir_pass.Simplify(aptr.args[2]) assert tvm.ir_pass.Equal(offset, 100) assert aptr.args[4].value == Buffer.READ | Buffer.WRITE + v = tvm.var('int32') + aptr = Ab.access_ptr("rw", offset=100 + 100 + v) + offset = tvm.ir_pass.Simplify(aptr.args[2]) + assert tvm.ir_pass.Equal(offset, 200 + v) + assert aptr.args[4].value == Buffer.READ | Buffer.WRITE + aptr = Ab.access_ptr("rw", offset=tvm.call_extern('int32', 
"test_call", 100 + 100 + v)) + offset = tvm.ir_pass.Simplify(aptr.args[2]) + assert tvm.ir_pass.Equal(offset, tvm.call_extern('int32', "test_call", 200 + v)) + assert aptr.args[4].value == Buffer.READ | Buffer.WRITE def test_buffer_index_merge_mult_mod(): m = tvm.var('m') From e1fd9206e0048ff35b0e0a7121cb11be265a58b5 Mon Sep 17 00:00:00 2001 From: Pariksheet Pinjari Date: Thu, 8 Mar 2018 23:27:21 +0530 Subject: [PATCH 190/948] Yolo2 operators (#911) --- topi/python/topi/__init__.py | 1 + topi/python/topi/cuda/__init__.py | 1 + topi/python/topi/cuda/vision.py | 65 +++++++++++++++++++ topi/python/topi/generic/__init__.py | 1 + topi/python/topi/generic/vision.py | 67 ++++++++++++++++++++ topi/python/topi/rocm/__init__.py | 1 + topi/python/topi/rocm/vision.py | 22 +++++++ topi/python/topi/testing/__init__.py | 3 + topi/python/topi/testing/region_python.py | 69 +++++++++++++++++++++ topi/python/topi/testing/reorg_python.py | 42 +++++++++++++ topi/python/topi/testing/shortcut_python.py | 47 ++++++++++++++ topi/python/topi/vision/__init__.py | 7 +++ topi/python/topi/vision/reorg.py | 39 ++++++++++++ topi/python/topi/vision/shortcut.py | 45 ++++++++++++++ topi/python/topi/vision/yolo2/__init__.py | 5 ++ topi/python/topi/vision/yolo2/region.py | 63 +++++++++++++++++++ topi/tests/python/test_topi_region.py | 44 +++++++++++++ topi/tests/python/test_topi_reorg.py | 47 ++++++++++++++ topi/tests/python/test_topi_shortcut.py | 48 ++++++++++++++ 19 files changed, 617 insertions(+) create mode 100644 topi/python/topi/cuda/vision.py create mode 100644 topi/python/topi/generic/vision.py create mode 100644 topi/python/topi/rocm/vision.py create mode 100644 topi/python/topi/testing/region_python.py create mode 100644 topi/python/topi/testing/reorg_python.py create mode 100644 topi/python/topi/testing/shortcut_python.py create mode 100644 topi/python/topi/vision/__init__.py create mode 100644 topi/python/topi/vision/reorg.py create mode 100644 topi/python/topi/vision/shortcut.py create mode 100644 topi/python/topi/vision/yolo2/__init__.py create mode 100644 topi/python/topi/vision/yolo2/region.py create mode 100644 topi/tests/python/test_topi_region.py create mode 100644 topi/tests/python/test_topi_reorg.py create mode 100644 topi/tests/python/test_topi_shortcut.py diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py index ae02211a9d6c..c9a9b7bc01b6 100644 --- a/topi/python/topi/__init__.py +++ b/topi/python/topi/__init__.py @@ -25,6 +25,7 @@ from . import util from . import rocm from . import cpp +from . import vision # not import testing by default # because testing can have extra deps that are not necessary # we can import them from test cases explicitly diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index f829a9895fd2..174a37b1d451 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -15,3 +15,4 @@ from .pooling import schedule_pool, schedule_global_pool from .conv2d_transpose_nchw import schedule_conv2d_transpose_nchw from .extern import schedule_extern +from .vision import schedule_region diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py new file mode 100644 index 000000000000..bfd453f807c2 --- /dev/null +++ b/topi/python/topi/cuda/vision.py @@ -0,0 +1,65 @@ +# pylint: disable=invalid-name, unused-variable +"""Schedule for vision operators""" +from __future__ import absolute_import as _abs +import tvm +from .. import tag +from .. 
import generic + +@generic.schedule_region.register(["cuda", "gpu"]) +def schedule_region(outs): + """Schedule for region operator. + Parameters + ---------- + outs: Array of Tensor + The computation graph description of region + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for region. + """ + + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + output = outs[0].op.output(0) + #thread = 64 for higher size tensors, give resource_unavailable error for higher values + num_thread = 64 + def _schedule_softmax(softmax_op): + softmax = softmax_op.input_tensors[0] + max_elem = softmax_op.input_tensors[1] + expsum = softmax_op.input_tensors[2] + block_x = tvm.thread_axis("blockIdx.x") + thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x") + s[max_elem].bind(max_elem.op.axis[0], block_x) + k = expsum.op.reduce_axis[0] + ko, ki = s[expsum].split(k, factor=num_thread) + ef = s.rfactor(expsum, ki) + s[expsum].bind(s[expsum].op.axis[0], block_x) + s[expsum].bind(s[expsum].op.reduce_axis[0], thread_x) + s[ef].compute_at(s[expsum], s[expsum].op.reduce_axis[0]) + s[expsum].set_store_predicate(thread_x.var.equal(0)) + tx, xi = s[softmax_op].split(softmax_op.axis[1], nparts=num_thread) + s[softmax_op].bind(softmax_op.axis[0], block_x) + s[softmax_op].bind(tx, thread_x) + return max_elem.op.input_tensors[0] + + def _traverse(op): + if tag.is_injective(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + _traverse(tensor.op) + elif op.tag == 'softmax_output': + tensor = _schedule_softmax(op) + if tensor.op.input_tensors: + _traverse(tensor.op) + else: + raise RuntimeError("Unsupported operator: %s" % op.tag) + _traverse(outs[0].op) + k = output.op.axis[0] + bx, tx = s[output].split(k, factor=num_thread) + s[output].bind(bx, tvm.thread_axis("blockIdx.x")) + s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + return s diff --git a/topi/python/topi/generic/__init__.py b/topi/python/topi/generic/__init__.py index 8fc9143c3f86..8450e2d4c4e2 100644 --- a/topi/python/topi/generic/__init__.py +++ b/topi/python/topi/generic/__init__.py @@ -18,3 +18,4 @@ from .nn import * from .injective import * from .extern import * +from .vision import * diff --git a/topi/python/topi/generic/vision.py b/topi/python/topi/generic/vision.py new file mode 100644 index 000000000000..139491b6213e --- /dev/null +++ b/topi/python/topi/generic/vision.py @@ -0,0 +1,67 @@ +"""Generic vision operators""" +from __future__ import absolute_import as _abs +import tvm + +def _default_schedule(outs, auto_inline): + """Default schedule for llvm.""" + target = tvm.target.current_target(allow_none=False) + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + if target.target_name != "llvm": + raise RuntimeError("schedule not registered for '%s'" % target) + s = tvm.create_schedule([x.op for x in outs]) + if auto_inline: + x = outs[0] + tvm.schedule.AutoInlineInjective(s) + s[x].fuse(s[x].op.axis) + return s + +@tvm.target.generic_func +def schedule_shortcut(outs): + """Schedule for shortcut + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of shortcut + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for the op. 
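# --- Editor's sketch of the dispatch pattern this patch uses; not part of the new file.
# A @tvm.target.generic_func definition acts as the llvm fallback, a backend overrides it
# via .register([...]), and the override is selected by whichever target is current at
# schedule/build time. schedule_example and the tiny compute are hypothetical names used
# only for illustration.
import tvm

@tvm.target.generic_func
def schedule_example(outs):
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    return tvm.create_schedule([x.op for x in outs])          # llvm fallback

@schedule_example.register(["cuda", "gpu"])
def schedule_example_cuda(outs):
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])
    s[outs[0]].bind(outs[0].op.axis[0], tvm.thread_axis("blockIdx.x"))
    return s

A = tvm.placeholder((16,), name="A")
B = tvm.compute(A.shape, lambda i: A[i] + 1, name="B")
with tvm.target.create("cuda"):
    s = schedule_example([B])          # resolves to the cuda/gpu override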
+ """ + return _default_schedule(outs, False) + +@tvm.target.generic_func +def schedule_reorg(outs): + """Schedule for reorg + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of reorg + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + +@tvm.target.generic_func +def schedule_region(outs): + """Schedule for region + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of region + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) diff --git a/topi/python/topi/rocm/__init__.py b/topi/python/topi/rocm/__init__.py index 3fddd53a3b36..a5b4ee30dc37 100644 --- a/topi/python/topi/rocm/__init__.py +++ b/topi/python/topi/rocm/__init__.py @@ -4,3 +4,4 @@ from .conv2d import * from .dense import * +from .vision import * diff --git a/topi/python/topi/rocm/vision.py b/topi/python/topi/rocm/vision.py new file mode 100644 index 000000000000..465bfd1e9cce --- /dev/null +++ b/topi/python/topi/rocm/vision.py @@ -0,0 +1,22 @@ +# pylint: disable=invalid-name, unused-variable +"""Schedule for vision operator""" +from __future__ import absolute_import as _abs +import topi +from .. import generic + +@generic.schedule_region.register(["rocm"]) +def schedule_region(outs): + """Schedule for region operator. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of region + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for region. + """ + return topi.cuda.schedule_region(outs) diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py index 2a20a1c4f622..66a865724dc3 100644 --- a/topi/python/topi/testing/__init__.py +++ b/topi/python/topi/testing/__init__.py @@ -12,3 +12,6 @@ from .dilate_python import dilate_python from .softmax_python import softmax_python, log_softmax_python from .upsampling_python import upsampling_python +from .reorg_python import reorg_python +from .region_python import region_python +from .shortcut_python import shortcut_python diff --git a/topi/python/topi/testing/region_python.py b/topi/python/topi/testing/region_python.py new file mode 100644 index 000000000000..3bab53892607 --- /dev/null +++ b/topi/python/topi/testing/region_python.py @@ -0,0 +1,69 @@ +# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals +"""Region in python""" +import numpy as np + +def entry_index(batch, w, h, outputs, classes, coords, location, entry): + n = int(location/(w*h)) + loc = location%(w*h) + return batch*outputs + n*w*h*(coords+classes+1) + entry*w*h + loc + +def region_python(a_np, N, classes, coords, background, softmax): + """Region operator + Parameters + ---------- + a_np : numpy.ndarray + 4-D with shape [batch, in_channel, in_height, in_width] + + N : int + Darknet layer parameter n + + classes : int + Darknet layer parameter classes + + coords : int + Darknet layer parameter coords + + background : int + Darknet layer parameter background + + softmax : int + Darknet layer parameter softmax + + Returns + ------- + b_np : np.ndarray + 4-D with shape [batch, out_channel, out_height, out_width] + """ + + batch, in_channel, in_height, in_width = a_np.shape + a_np_temp = np.reshape(a_np, batch*in_channel*in_height*in_width) + outputs = batch*in_channel*in_height*in_width + b_np = 
np.zeros(batch*in_channel*in_height*in_width) + for i in range(batch*in_channel*in_height*in_width): + b_np[i] = a_np_temp[i] + for b in range(batch): + for n in range(N): + index = entry_index(b, in_width, in_height, outputs, classes, coords, n*in_width*in_height, 0) + b_np[index: index+2*in_width*in_height] = 1/(1+np.exp(-1*b_np[index: index+2*in_width*in_height])) + index = entry_index(b, in_width, in_height, outputs, classes, coords, n*in_width*in_height, coords) + if not background: + b_np[index: index+in_width*in_height] = 1/(1+np.exp(-1*b_np[index: index+in_width*in_height])) + + b_np = np.reshape(b_np, (batch, in_channel, in_height, in_width)) + def local_softmax(data_in): + data_c, data_h, data_w = data_in.shape + largest = np.max(data_in, axis=1) + data_out = np.zeros((data_c, data_h, data_w)) + for i in range(data_h): + for j in range(data_w): + data_out[:, i, j] = np.exp(data_in[:, i, j] - largest[i, j]) + return data_out/data_out.sum(axis=0) + + if softmax: + index = coords + int(not background) + for b in range(batch): + for i in range(N): + b_np_index = int(i*(in_channel/N) + index) + b_np[b, b_np_index: b_np_index + classes+background, :, :] = local_softmax(b_np[b, b_np_index:b_np_index + classes+background, :, :]) + + return b_np diff --git a/topi/python/topi/testing/reorg_python.py b/topi/python/topi/testing/reorg_python.py new file mode 100644 index 000000000000..185e5566e5bc --- /dev/null +++ b/topi/python/topi/testing/reorg_python.py @@ -0,0 +1,42 @@ +# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals +"""Reorg in python""" +import numpy as np + +def reorg_python(a_np, stride): + """Reorg operator + + Parameters + ---------- + a_np : numpy.ndarray + 4-D with shape [batch, in_channel, in_height, in_width] + + stride : int + Stride size + + Returns + ------- + b_np : np.ndarray + 4-D with shape [batch, out_channel, out_height, out_width] + """ + + batch, in_channel, in_height, in_width = a_np.shape + a_np = np.reshape(a_np, batch*in_channel*in_height*in_width) + out_c = int(in_channel/(stride*stride)) + out_channel = in_channel*stride*stride + out_height = int(in_height/stride) + out_width = int(in_width/stride) + b_np = np.zeros(batch*out_channel*out_height*out_width) + cnt = 0 + for b in range(batch): + for k in range(in_channel): + for j in range(in_height): + for i in range(in_width): + c2 = k % out_c + offset = int(k / out_c) + w2 = int(i*stride + offset % stride) + h2 = int(j*stride + offset / stride) + out_index = int(w2 + in_width*stride*(h2 + in_height*stride*(c2 + out_c*b))) + b_np[cnt] = a_np[int(out_index)] + cnt = cnt+1 + b_np = np.reshape(b_np, (batch, out_channel, out_height, out_width)) + return b_np diff --git a/topi/python/topi/testing/shortcut_python.py b/topi/python/topi/testing/shortcut_python.py new file mode 100644 index 000000000000..575c28b61c2c --- /dev/null +++ b/topi/python/topi/testing/shortcut_python.py @@ -0,0 +1,47 @@ +# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals +"""Shortcut in python""" +import numpy as np + +def shortcut_python(a_np1, a_np2): + """Reorg operator + + Parameters + ---------- + a_np1 : numpy.ndarray + 4-D with shape [batch1, in_channel1, in_height1, in_width1] + + a_np2 : numpy.ndarray + 4-D with shape [batch2, in_channel2, in_height2, in_width2] + + Returns + ------- + b_np : np.ndarray + 4-D with shape [batch1, out_channel1, out_height1, out_width1] + """ + + batch1, in_channel1, in_height1, in_width1 = a_np1.shape + batch2, in_channel2, in_height2, 
in_width2 = a_np2.shape + a_np1_temp = np.reshape(a_np1, batch1*in_channel1*in_height1*in_width1) + a_np2_temp = np.reshape(a_np2, batch2*in_channel2*in_height2*in_width2) + b_np = np.zeros(batch1*in_channel1*in_height1*in_width1) + stride = int(in_width1/in_width2) + sample = int(in_width2/in_width1) + if stride < 1: + stride = 1 + if sample < 1: + sample = 1 + minw = min(in_width1, in_width2) + minh = min(in_height1, in_height2) + minc = min(in_channel1, in_channel2) + + for i in range((batch1*in_channel1*in_height1*in_width1)): + b_np[i] = a_np1_temp[i] + for b in range(batch1): + for k in range(minc): + for j in range(minh): + for i in range(minw): + out_index = i*sample + in_width2*(j*sample + in_height2*(k + in_channel2*b)) + add_index = i*stride + in_width1*(j*stride + in_height1*(k + in_channel1*b)) + b_np[out_index] = a_np1_temp[out_index] + a_np2_temp[add_index] + b_np = np.reshape(b_np, (batch1, in_channel1, in_height1, in_width1)) + return b_np diff --git a/topi/python/topi/vision/__init__.py b/topi/python/topi/vision/__init__.py new file mode 100644 index 000000000000..0f164fdafc2b --- /dev/null +++ b/topi/python/topi/vision/__init__.py @@ -0,0 +1,7 @@ +# pylint: disable=wildcard-import +"""VISION network operators""" +from __future__ import absolute_import as _abs + +from . import yolo2 +from .shortcut import * +from .reorg import * diff --git a/topi/python/topi/vision/reorg.py b/topi/python/topi/vision/reorg.py new file mode 100644 index 000000000000..761a1718fd23 --- /dev/null +++ b/topi/python/topi/vision/reorg.py @@ -0,0 +1,39 @@ +""" +REORG Operator +==================== +Reorg operator, used in darknet. +""" +from __future__ import absolute_import as _abs +import tvm +from .. import util +from .. import transform + +@tvm.target.generic_func +def reorg(data, stride): + """Reorg forward operators. + + Parameters + ---------- + Input : tvm.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] + + stride : int + Stride value for reorganization + + Returns + ------- + Output : tvm.Tensor + 4-D with shape [batch, out_channel, out_height, out_width] + """ + batch, c_in, h_in, w_in = util.get_const_tuple(data.shape) + out_c = int(c_in / (stride * stride)) + out = tvm.compute((batch, c_in, h_in, w_in), lambda b, k, j, i: + data[b * stride * stride, + (k % out_c) * stride * stride, + (j*stride + (k / out_c) / stride) * stride, + (i*stride + (k / out_c) % stride)], + tag="reorg") + out_c = int(c_in * stride * stride) + out_h = int(h_in / stride) + out_w = int(w_in / stride) + return transform.reshape(out, (batch, out_c, out_h, out_w)) diff --git a/topi/python/topi/vision/shortcut.py b/topi/python/topi/vision/shortcut.py new file mode 100644 index 000000000000..529360190a4e --- /dev/null +++ b/topi/python/topi/vision/shortcut.py @@ -0,0 +1,45 @@ +"""Shortcut operators (short-cut connections).""" +from __future__ import absolute_import as _abs +import tvm +from .. import util +from .. import transform + +@tvm.target.generic_func +def shortcut(inp1, inp2): + """Shortcut forward operators. 
+ + Parameters + ---------- + First Input : tvm.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] + + Second Input : tvm.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] + + Returns + ------- + Output : tvm.Tensor + 4-D with shape [batch, out_channel, out_height, out_width] + """ + + _, inp1_c, inp1_h, inp1_w = util.get_const_tuple(inp1.shape) + batch, inp2_c, inp2_h, inp2_w = util.get_const_tuple(inp2.shape) + + stride = int(max(inp2_w / inp1_w, 1)) + sample = int(max(inp1_w / inp2_w, 1)) + minc = min(inp2_c, inp1_c) + minh = min(inp2_h, inp1_h) + minw = min(inp2_w, inp1_w) + + out = tvm.compute((batch, minc, minh, minw), lambda b, c, h, w: + inp1[b, c, h * sample, w * sample] + + inp2[b, c, h * stride, w * stride], + tag="shortcut") + + split_indices = int(inp1_c / minc) + if split_indices > 1: + split_res = transform.split(inp1, split_indices, 1) + split_res[0] = out + out = transform.concatenate(split_res, 1) + + return out diff --git a/topi/python/topi/vision/yolo2/__init__.py b/topi/python/topi/vision/yolo2/__init__.py new file mode 100644 index 000000000000..c0e9899a41aa --- /dev/null +++ b/topi/python/topi/vision/yolo2/__init__.py @@ -0,0 +1,5 @@ +# pylint: disable=wildcard-import +"""VISION network operators""" +from __future__ import absolute_import as _abs + +from .region import * diff --git a/topi/python/topi/vision/yolo2/region.py b/topi/python/topi/vision/yolo2/region.py new file mode 100644 index 000000000000..8f31d88fa941 --- /dev/null +++ b/topi/python/topi/vision/yolo2/region.py @@ -0,0 +1,63 @@ +# pylint: disable=invalid-name, unused-variable +""" +REGION Operator +==================== +Region operator, used in darknet. +""" +from __future__ import absolute_import as _abs +import tvm +from ... import transform +from ... import util +from ... import math +from ... import nn + +@tvm.target.generic_func +def region(data, num, classes, coords, background, softmax=True): + """Region forward operators. 
+ Parameters + ---------- + data : tvm.Tensor + 4-D with shape [batch, c_in, h_in, w_in] + + num : int + Darknet layer parameter n + + classes : int + Darknet layer parameter classes + + coords : int + Darknet layer parameter coords + + background : int + Darknet layer parameter background + + softmax : boolean + Darknet layer parameter softmax + + Returns + ------- + out : tvm.Tensor + 4-D with shape [batch, c_in, h_in, w_in] + """ + + batch, c_in, h_in, w_in = util.get_const_tuple(data.shape) + split_indices = classes+coords+1 + data_block = transform.reshape(data, (batch, num, split_indices, h_in, w_in)) + split_res = transform.split(data_block, split_indices, 2) + split_res[0] = math.sigmoid(split_res[0]) + split_res[1] = math.sigmoid(split_res[1]) + if not background: + split_res[coords] = math.sigmoid(split_res[coords]) + + if softmax: + offset = coords + int(not background) + data_block_1 = [] + data_block_1.append(transform.concatenate(split_res[0:offset], 2)) + temp_out = transform.concatenate(split_res[offset:split_indices], 2) + temp_out = nn.softmax(temp_out, axis=2) + data_block_1.append(temp_out) + split_res = data_block_1 + + out = transform.concatenate(split_res, 2) + out = transform.reshape(out, data.shape) + return out diff --git a/topi/tests/python/test_topi_region.py b/topi/tests/python/test_topi_region.py new file mode 100644 index 000000000000..b2ef02530ba6 --- /dev/null +++ b/topi/tests/python/test_topi_region.py @@ -0,0 +1,44 @@ +"""Example code to do region.""" +import numpy as np +import topi +from topi.util import get_const_tuple +import tvm +def verify_region(batch, in_size, in_channel, n, classes, coords, background, l_softmax): + '''Verify region operator by comparing outputs from tvm and numpy implementation''' + in_height = in_width = in_size + + A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') + B = topi.vision.yolo2.region(A, n, classes, coords, background, l_softmax) + + a_shape = get_const_tuple(A.shape) + dtype = A.dtype + + def get_ref_data_region(): + a_np = np.random.uniform(size=a_shape).astype(dtype) + b_np = topi.testing.region_python(a_np, n, classes, coords, background, l_softmax) + return a_np, b_np + + a_np, b_np = get_ref_data_region() + def check_device(device): + '''Cheching devices is enabled or not''' + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.vision.schedule_region([B]) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + func = tvm.build(s, [A, B], device) + func(a, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + + for device in ['llvm', 'cuda']: + check_device(device) + +def test_region(): + verify_region(1, 19, 425, 5, 80, 4, 0, 1) + +if __name__ == "__main__": + test_region() diff --git a/topi/tests/python/test_topi_reorg.py b/topi/tests/python/test_topi_reorg.py new file mode 100644 index 000000000000..36a4d71cb5d4 --- /dev/null +++ b/topi/tests/python/test_topi_reorg.py @@ -0,0 +1,47 @@ +"""Example code to do reorg.""" +import numpy as np +import topi +from topi.util import get_const_tuple +import tvm + +def verify_reorg(batch, in_size, in_channel, stride): + '''Verify reorg operator by comparing outputs from tvm and numpy implementation''' + in_height = in_width = in_size + + A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') + B = topi.vision.reorg(A, 
stride) + + a_shape = get_const_tuple(A.shape) + dtype = A.dtype + + def get_ref_data_reorg(): + a_np = np.random.uniform(size=a_shape).astype(dtype) + b_np = topi.testing.reorg_python(a_np, stride) + return a_np, b_np + + a_np, b_np = get_ref_data_reorg() + + def check_device(device): + '''Cheching devices is enabled or not''' + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_injective([B]) + + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + func = tvm.build(s, [A, B], device) + func(a, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + + for device in ['llvm', 'cuda']: + check_device(device) + +def test_reorg(): + verify_reorg(1, 20, 8, 2) + +if __name__ == "__main__": + test_reorg() diff --git a/topi/tests/python/test_topi_shortcut.py b/topi/tests/python/test_topi_shortcut.py new file mode 100644 index 000000000000..b5840fe8e7b2 --- /dev/null +++ b/topi/tests/python/test_topi_shortcut.py @@ -0,0 +1,48 @@ +"""Example code to do shortcut.""" +import numpy as np +import topi +from topi.util import get_const_tuple +import tvm + +def verify_shortcut(batch, in_size, in_channel): + '''Verify shortcut operator by comparing outputs from tvm and numpy implementation''' + in_height = in_width = in_size + + A1 = tvm.placeholder((batch, in_channel, in_height, in_width), name='A1') + A2 = tvm.placeholder((batch, in_channel, in_height, in_width), name='A2') + B = topi.vision.shortcut(A1, A2) + + a_shape = get_const_tuple(A1.shape) + dtype = A1.dtype + def get_ref_data_shortcut(): + a_np1 = np.random.uniform(size=a_shape).astype(dtype) + a_np2 = np.random.uniform(size=a_shape).astype(dtype) + b_np = topi.testing.shortcut_python(a_np1, a_np2) + return a_np1, a_np2, b_np + + a_np1, a_np2, b_np = get_ref_data_shortcut() + def check_device(device): + '''Cheching devices is enabled or not''' + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_injective([B]) + + a1 = tvm.nd.array(a_np1, ctx) + a2 = tvm.nd.array(a_np2, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + func = tvm.build(s, [A1, A2, B], device) + func(a1, a2, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + + for device in ['llvm', 'cuda']: + check_device(device) + +def test_shortcut(): + verify_shortcut(1, 144, 32) + +if __name__ == "__main__": + test_shortcut() From 74e2ab433cfeb741410aa59e0658a1bc46444e7d Mon Sep 17 00:00:00 2001 From: nhynes Date: Thu, 8 Mar 2018 16:00:44 -0800 Subject: [PATCH 191/948] Use single-threaded SGX parallel (#975) --- sgx/sgx_runtime.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sgx/sgx_runtime.cc b/sgx/sgx_runtime.cc index 0bc6a3189e7a..6a0d0dfb224c 100644 --- a/sgx/sgx_runtime.cc +++ b/sgx/sgx_runtime.cc @@ -10,13 +10,13 @@ #include "../../src/runtime/registry.cc" #include "../../src/runtime/system_lib_module.cc" -// dummy parallel runtime +// dummy parallel runtime (for now) int TVMBackendParallelLaunch( FTVMParallelLambda flambda, void* cdata, int num_task) { - TVMAPISetLastError("Parallel is not (yet) supported in SGX runtime"); - return -1; + TVMParallelGroupEnv env = { nullptr /* sync_handle */, 1 /* num_task */ }; + return flambda(0 /* 
task_id */, &env, cdata); } int TVMBackendParallelBarrier(int task_id, TVMParallelGroupEnv* penv) { From 65ad02eb3fb615d2aad578e14086b65e8c133cea Mon Sep 17 00:00:00 2001 From: Chris Nuernberger Date: Thu, 8 Mar 2018 22:14:46 -0300 Subject: [PATCH 192/948] Assert dont crash on null strides (#976) --- src/pass/arg_binder.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/pass/arg_binder.cc b/src/pass/arg_binder.cc index cdd344670725..8139a5644bb6 100644 --- a/src/pass/arg_binder.cc +++ b/src/pass/arg_binder.cc @@ -194,6 +194,9 @@ void ArgBinder::BindDLTensor(const Buffer& buffer, init_nest_.emplace_back(LetStmt::make( v_strides, TVMArrayGet(Handle(), handle, intrinsic::kArrStrides), nop)); + Expr is_null = Call::make( + Bool(1), intrinsic::tvm_handle_is_null, + {v_strides}, Call::PureIntrinsic); if (buffer->strides.size() == 0) { // Assert the buffer is compact Type stype = buffer->DefaultIndexType(); @@ -215,13 +218,14 @@ void ArgBinder::BindDLTensor(const Buffer& buffer, Stmt check = AssertStmt::make(arith::ComputeReduce(conds, Expr()), stride_err_msg.str(), Evaluate::make(0)); - Expr is_null = Call::make( - Bool(1), intrinsic::tvm_handle_is_null, - {v_strides}, Call::PureIntrinsic); check = IfThenElse::make(Not::make(is_null), check, Stmt()); init_nest_.emplace_back(Block::make(check, Evaluate::make(0))); } } else { + std::ostringstream stride_null_err_msg; + stride_null_err_msg << arg_name << ".strides: expected non-null strides."; + asserts_.emplace_back(AssertStmt::make(Not::make(is_null), stride_null_err_msg.str(), nop)); + for (size_t k = 0; k < buffer->strides.size(); ++k) { std::ostringstream field_name; field_name << v_strides->name_hint << '[' << k << ']'; From 5aa0c74c56c34a7d4659cd2fd63c95c59acc1f72 Mon Sep 17 00:00:00 2001 From: Yida Wang Date: Thu, 8 Mar 2018 20:08:31 -0800 Subject: [PATCH 193/948] [RUNTIME] Better scalability for multi-thread parallelization of CPUs (#971) --- src/runtime/thread_pool.cc | 211 ++++++++++++++++++++++++------------- 1 file changed, 139 insertions(+), 72 deletions(-) diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc index 7caedaa15538..bde16d8f56a8 100644 --- a/src/runtime/thread_pool.cc +++ b/src/runtime/thread_pool.cc @@ -17,6 +17,11 @@ #include #include #include +#if defined(__linux__) +#include +#endif + +const constexpr int kL1CacheBytes = 64; namespace tvm { namespace runtime { @@ -127,99 +132,124 @@ class ParallelLauncher { std::vector par_errors_; }; -/*! \brief Working queue for each thread */ -class ParallelTaskQueue { +/*! \brief Lock-free single-producer-single-consumer queue for each thread */ +class SpscTaskQueue { public: /*! \brief The task entry */ struct Task { ParallelLauncher* launcher; int32_t task_id; }; - ParallelTaskQueue() { - ring_.resize(2); + + SpscTaskQueue() : + buffer_(new Task[kRingSize]), + head_(0), + tail_(0) { } - /*! - * \brief Signal to kill the job. - */ - void SignalForKill() { - std::lock_guard lock(mutex_); - exit_now_.store(true); - cv_.notify_all(); + + ~SpscTaskQueue() { + delete[] buffer_; } + /*! - * \brief Push task into the queue. - * \param task The task to be pushed. + * \brief Push a task into the queue and notify the comsumer if it is on wait. + * \param input The task to be dequeued. 
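
The ArgBinder change above replaces a potential crash with an explicit assert when a kernel that expects strided buffers is handed a DLTensor whose strides pointer is NULL. A minimal sketch of how such a kernel can be produced from Python (the buffer names and stride variables are invented; NDArrays allocated by TVM are compact and carry NULL strides, so the call is expected to trip the new assertion):

    import numpy as np
    import tvm

    n = 8
    A = tvm.placeholder((n, n), name="A")
    B = tvm.compute((n, n), lambda i, j: A[i, j] + 1.0, name="B")
    s = tvm.create_schedule(B.op)

    # Bind explicitly strided buffers so the generated entry point must read A.strides.
    Ab = tvm.decl_buffer(A.shape, A.dtype, name="Ab",
                         strides=[tvm.var("s0"), tvm.var("s1")])
    Bb = tvm.decl_buffer(B.shape, B.dtype, name="Bb",
                         strides=[tvm.var("t0"), tvm.var("t1")])
    f = tvm.build(s, [A, B], "llvm", binds={A: Ab, B: Bb})

    a = tvm.nd.array(np.zeros((n, n), dtype="float32"))
    b = tvm.nd.array(np.zeros((n, n), dtype="float32"))
    try:
        f(a, b)  # compact arrays have NULL strides, so the non-null strides check fires
    except Exception as err:
        print(err)
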
*/ - void Push(Task task) { - std::unique_lock lock(mutex_); - if (num_pending_ < ring_.size()) { - CHECK_NE(ring_.size(), 0U); - ring_[(head_ + num_pending_) % ring_.size()] = task; - ++num_pending_; - } else { - size_t old_size = ring_.size(); - ring_.resize(old_size * 2); - if (head_ + num_pending_ > old_size) { - // copy the ring overflow part into the tail. - size_t ncopy = head_ + num_pending_ - old_size; - memcpy(&ring_[0] + old_size, &ring_[0], ncopy * sizeof(Task)); - } - ring_[(head_ + num_pending_) % ring_.size()] = task; - ++num_pending_; + void Push(const Task& input) { + while (!Enqueue(input)) { + std::this_thread::yield(); } - if (nwait_consumer_ != 0) { - lock.unlock(); + if (pending_.fetch_add(1) == -1) { + std::unique_lock lock(mutex_); cv_.notify_one(); } } + /*! - * \brief Pop task from the queue - * \param task The task to be poped. - * \param timeout The number of cycles to spin before sleep. - * \return Whether pop is successful or we need to exit now. + * \brief Pop a task out of the queue and condition wait if no tasks. + * \param output The pointer to the task to be dequeued. + * \param spin_count The number of iterations to spin before sleep. + * \return Whether pop is successful (true) or we need to exit now (false). */ - bool Pop(Task* task, int timeout = 10) { - std::unique_lock lock(mutex_); - if (num_pending_ != 0) { - *task = ring_[head_]; - head_ = (head_ + 1) % ring_.size(); - --num_pending_; - if (exit_now_.load()) return false; - } else { - lock.unlock(); - // do a bit spin and busy waiting before sleep. - for (int i = 0; i < timeout && num_pending_ == 0; ++i) { + bool Pop(Task* output, uint32_t spin_count = 300000) { + // Busy wait a bit when the queue is empty. + // If a new task comes to the queue quickly, this wait avoid the worker from sleeping. + // The default spin count is set by following the typical omp convention + for (uint32_t i = 0; i < spin_count && pending_.load() == 0; ++i) { std::this_thread::yield(); } - lock.lock(); - ++nwait_consumer_; + if (pending_.fetch_sub(1) == 0) { + std::unique_lock lock(mutex_); cv_.wait(lock, [this] { - return num_pending_ != 0 || exit_now_.load(); + return pending_.load() >= 0 || exit_now_.load(); }); - --nwait_consumer_; - *task = ring_[head_]; - head_ = (head_ + 1) % ring_.size(); - --num_pending_; - if (exit_now_.load()) return false; } + if (exit_now_.load(std::memory_order_relaxed)) { + return false; + } + const uint32_t head = head_.load(std::memory_order_relaxed); + // sanity check if the queue is empty + CHECK(tail_.load(std::memory_order_acquire) != head); + *output = buffer_[head]; + head_.store((head + 1) % kRingSize, std::memory_order_release); return true; } + /*! + * \brief Signal to terminate the worker. + */ + void SignalForKill() { + std::lock_guard lock(mutex_); + exit_now_.store(true); + cv_.notify_all(); + } + private: - // Number of the elments in the queue - uint32_t num_pending_{0}; - // Queue head - uint32_t head_{0}; - // Number of consumers to wait. - uint32_t nwait_consumer_{0}; + /*! + * \brief Lock-free enqueue. + * \param input The task to be enqueued. + * \return Whether the task is enqueued. 
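
Purely to make the head/tail arithmetic of SpscTaskQueue concrete, a plain-Python toy with the same ring layout (the real queue additionally uses atomics, cache-line padding and a sleep/wake protocol, none of which are modelled here):

    class SpscRing(object):
        """Toy single-producer / single-consumer ring; holds size - 1 items at most."""
        def __init__(self, size=2):
            self.size = size
            self.buf = [None] * size
            self.head = 0  # consumer position
            self.tail = 0  # producer position

        def enqueue(self, item):  # called only by the producer
            nxt = (self.tail + 1) % self.size
            if nxt == self.head:  # full
                return False
            self.buf[self.tail] = item
            self.tail = nxt
            return True

        def dequeue(self):  # called only by the consumer
            if self.head == self.tail:  # empty
                return None
            item = self.buf[self.head]
            self.head = (self.head + 1) % self.size
            return item

    q = SpscRing()
    assert q.enqueue("task0")
    assert not q.enqueue("task1")  # a size-2 ring holds one task, like the worker queues
    assert q.dequeue() == "task0"
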
+ */ + bool Enqueue(const Task& input) { + const uint32_t tail = tail_.load(std::memory_order_relaxed); + + if ((tail + 1) % kRingSize != (head_.load(std::memory_order_acquire))) { + buffer_[tail] = input; + tail_.store((tail + 1) % kRingSize, std::memory_order_release); + return true; + } + return false; + } + + // the cache line paddings are used for avoid false sharing between atomic variables + typedef char cache_line_pad_t[kL1CacheBytes]; + cache_line_pad_t pad0_; + // size of the queue, the queue can host size_ - 1 items at most + // define it as a constant for better compiler optimization + static constexpr const int kRingSize = 2; + // pointer to access the item + Task* const buffer_; + + cache_line_pad_t pad1_; + // queue head, where one gets a task from the queue + std::atomic head_; + + cache_line_pad_t pad2_; + // queue tail, when one puts a task to the queue + std::atomic tail_; + + cache_line_pad_t pad3_; + // pending tasks in the queue + std::atomic pending_{0}; + + cache_line_pad_t pad4_; + // signal for exit now + std::atomic exit_now_{false}; + // internal mutex std::mutex mutex_; // cv for consumer std::condition_variable cv_; - // signal for exit now - std::atomic exit_now_{false}; - // The internal ring. - std::vector ring_; }; // The thread pool @@ -244,7 +274,7 @@ class ThreadPool { this->Init(); } ~ThreadPool() { - for (std::unique_ptr& q : queues_) { + for (std::unique_ptr& q : queues_) { q->SignalForKill(); } for (std::thread& t : threads_) { @@ -267,13 +297,14 @@ class ThreadPool { << " workers=" << num_workers_ << " request=" << num_task; } launcher->Init(flambda, cdata, num_task, need_sync != 0); - ParallelTaskQueue::Task tsk; + SpscTaskQueue::Task tsk; tsk.launcher = launcher; for (int i = 0; i < num_task; ++i) { tsk.task_id = i; queues_[i]->Push(tsk); } - return launcher->WaitForJobs(); + int res = launcher->WaitForJobs(); + return res; } static ThreadPool* Global() { @@ -285,8 +316,9 @@ class ThreadPool { // Initialize the pool. void Init() { for (int i = 0; i < num_workers_; ++i) { + // The SpscTaskQueue only host ONE item at a time queues_.emplace_back( - std::unique_ptr(new ParallelTaskQueue())); + std::unique_ptr(new SpscTaskQueue())); } threads_.resize(num_workers_); for (int i = 0; i < num_workers_; ++i) { @@ -294,10 +326,20 @@ class ThreadPool { this->RunWorker(queues_[i].get()); }); } + const char *val = getenv("TVM_BIND_THREADS"); + if (val == nullptr || atoi(val) == 1) { + if (num_workers_ <= std::thread::hardware_concurrency()) { + SetThreadAffinity(); + } else { + LOG(WARNING) + << "The thread affinity cannot be set when the number of workers is larger " + << "than the number of available cores in the system."; + } + } } // Internal worker function. 
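
The worker count and the core pinning configured above can be steered from the environment. A small end-to-end sketch, assuming an LLVM-enabled build (TVM_NUM_THREADS and TVM_BIND_THREADS are the variables the pool reads; the kernel itself is a generic vector add):

    import os
    os.environ["TVM_NUM_THREADS"] = "4"   # cap the pool at 4 workers
    os.environ["TVM_BIND_THREADS"] = "1"  # pin each worker to its own core (default)

    import numpy as np
    import tvm

    n = 1 << 20
    A = tvm.placeholder((n,), name="A")
    B = tvm.compute((n,), lambda i: A[i] + 1.0, name="B")
    s = tvm.create_schedule(B.op)
    s[B].parallel(B.op.axis[0])  # lowered to a TVMBackendParallelLaunch call

    f = tvm.build(s, [A, B], "llvm")
    ctx = tvm.cpu(0)
    a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx)
    b = tvm.nd.array(np.zeros(n, dtype="float32"), ctx)
    f(a, b)
    np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0, rtol=1e-5)
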
- void RunWorker(ParallelTaskQueue* queue) { - ParallelTaskQueue::Task task; + void RunWorker(SpscTaskQueue* queue) { + SpscTaskQueue::Task task; ParallelLauncher::ThreadLocal()->is_worker = true; while (queue->Pop(&task)) { CHECK(task.launcher != nullptr); @@ -310,9 +352,33 @@ class ThreadPool { } } } + // bind worker threads to disjoint cores + void SetThreadAffinity() { +#if defined(__ANDROID__) + #define CPU_SETSIZE 1024 + #define __NCPUBITS (8 * sizeof (uint64_t)) + typedef struct { + uint64_t __bits[CPU_SETSIZE / __NCPUBITS]; + } cpu_set_t; + + #define CPU_SET(cpu, cpusetp) \ + ((cpusetp)->__bits[(cpu)/__NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS))) + #define CPU_ZERO(cpusetp) \ + memset((cpusetp), 0, sizeof(cpu_set_t)) +#endif + for (int i=0; i < num_workers_; ++i) { +#if defined(__linux__) || defined(__ANDROID__) + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(i, &cpuset); + pthread_setaffinity_np(threads_[i].native_handle(), + sizeof(cpu_set_t), &cpuset); +#endif + } + } // Number of workers int num_workers_; - std::vector > queues_; + std::vector > queues_; std::vector threads_; }; @@ -323,8 +389,9 @@ int TVMBackendParallelLaunch( FTVMParallelLambda flambda, void* cdata, int num_task) { - return tvm::runtime::ThreadPool::Global()->Launch( + int res = tvm::runtime::ThreadPool::Global()->Launch( flambda, cdata, num_task, 1); + return res; } int TVMBackendParallelBarrier(int task_id, TVMParallelGroupEnv* penv) { From d5c42b0c29ac4793c9cd0f09d8a733981b56eb50 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 9 Mar 2018 11:12:57 -0800 Subject: [PATCH 194/948] [IOS] Improve the iOS RPC with exclusive filesys lock (#981) --- .gitignore | 1 + apps/ios_rpc/tests/ios_rpc_test.py | 1 - include/tvm/runtime/c_runtime_api.h | 2 +- include/tvm/runtime/packed_func.h | 7 +++--- python/tvm/contrib/rpc_proxy.py | 2 +- python/tvm/contrib/util.py | 39 ++++++++++++++++++++++++++++- python/tvm/contrib/xcode.py | 36 ++++++++++++++++++++------ src/runtime/graph/graph_runtime.cc | 4 +-- 8 files changed, 74 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index b645f6693b09..4f44aeb8d952 100644 --- a/.gitignore +++ b/.gitignore @@ -139,6 +139,7 @@ xcuserdata/ .DS_Store tags cscope* +*.lock # vim temporary files *.swp diff --git a/apps/ios_rpc/tests/ios_rpc_test.py b/apps/ios_rpc/tests/ios_rpc_test.py index 80317a6e350d..a3df1d3a9043 100644 --- a/apps/ios_rpc/tests/ios_rpc_test.py +++ b/apps/ios_rpc/tests/ios_rpc_test.py @@ -59,7 +59,6 @@ def test_rpc_module(): # Start RPC test server that contains the compiled library. server = xcode.popen_test_rpc(proxy_host, proxy_port, key, destination=destination, - options=['-quiet'], libs=[path_dso1, path_dso2]) # connect to the proxy diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index f6d22534e8ab..af89860b5784 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -320,7 +320,7 @@ typedef void (*TVMPackedCFuncFinalizer)(void* resource_handle); * TVM call this function to get the extension functions * The declarer will call register_func to register function and their name. 
* - * \param resource_func_handle The register function + * \param register_func_handle The register function * \return 0 if success, -1 if failure happens */ typedef int (*TVMExtensionFuncDeclarer)(TVMFunctionHandle register_func_handle); diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index 8101ecdc175c..c5c86535cee2 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -688,11 +688,10 @@ inline TVMType String2TVMType(std::string s) { LOG(FATAL) << "unknown type " << s; } char* xdelim; // emulate sscanf("%ux%u", bits, lanes) - unsigned bits = strtoul(scan, &xdelim, 10); - if (bits != 0) t.bits = static_cast(bits); + uint8_t bits = static_cast(strtoul(scan, &xdelim, 10)); + if (bits != 0) t.bits = bits; if (*xdelim == 'x') { - unsigned lanes = strtoul(xdelim + 1, nullptr, 10); - t.lanes = static_cast(lanes); + t.lanes = static_cast(strtoul(xdelim + 1, nullptr, 10)); } return t; } diff --git a/python/tvm/contrib/rpc_proxy.py b/python/tvm/contrib/rpc_proxy.py index 9634c258b39f..5d67556ad1ce 100644 --- a/python/tvm/contrib/rpc_proxy.py +++ b/python/tvm/contrib/rpc_proxy.py @@ -392,7 +392,7 @@ def __init__(self, port=9091, port_end=9199, web_port=0, - timeout_client=240, + timeout_client=600, timeout_server=600, index_page=None, resource_files=None): diff --git a/python/tvm/contrib/util.py b/python/tvm/contrib/util.py index 93d5b897c7c1..338567d6f619 100644 --- a/python/tvm/contrib/util.py +++ b/python/tvm/contrib/util.py @@ -2,6 +2,7 @@ from __future__ import absolute_import as _abs import os import tempfile +import fcntl import shutil class TempDirectory(object): @@ -38,7 +39,7 @@ def relpath(self, name): return os.path.join(self.temp_dir, name) def listdir(self): - """"List contents in the dir. + """List contents in the dir. Returns ------- @@ -47,6 +48,7 @@ def listdir(self): """ return os.listdir(self.temp_dir) + def tempdir(): """Create temp dir which deletes the contents when exit. 
@@ -56,3 +58,38 @@ def tempdir(): The temp directory object """ return TempDirectory() + + +class FileLock(object): + """File lock object + + Parameters + ---------- + path : str + The path to the lock + """ + def __init__(self, path): + self.lock_file = open(path, "w") + fcntl.lockf(self.lock_file, fcntl.LOCK_EX) + + + def release(self): + """Release the lock""" + if self.lock_file: + fcntl.lockf(self.lock_file, fcntl.LOCK_UN) + self.lock_file.close() + self.lock_file = None + +def filelock(path): + """Create a file lock which locks on path + + Parameters + ---------- + path : str + The path to the lock + + Returns + ------- + lock : File lock object + """ + return FileLock(path) diff --git a/python/tvm/contrib/xcode.py b/python/tvm/contrib/xcode.py index 3456bdf1964a..63fbad2a58cf 100644 --- a/python/tvm/contrib/xcode.py +++ b/python/tvm/contrib/xcode.py @@ -146,6 +146,28 @@ def compile_metal(code, path_target=None, sdk="macosx"): return libbin +class XCodeRPCServer(object): + """Wrapper for RPC server + + Parameters + ---------- + cmd : list of str + The command to run + + lock: FileLock + Lock on the path + """ + def __init__(self, cmd, lock): + self.proc = subprocess.Popen(cmd) + self.lock = lock + + def join(self): + """Wait server to finish and release its resource + """ + self.proc.wait() + self.lock.release() + + def popen_test_rpc(host, port, key, @@ -190,6 +212,10 @@ def popen_test_rpc(host, if not os.path.exists(proj_path): raise RuntimeError("Cannot find tvmrpc.xcodeproj in %s," + (" please set env TVM_IOS_RPC_ROOT correctly" % rpc_root)) + + # Lock the path so only one file can run + lock = util.filelock(os.path.join(rpc_root, "ios_rpc.lock")) + with open(os.path.join(rpc_root, "rpc_config.txt"), "w") as fo: fo.write("%s %d %s\n" % (host, port, key)) libs = libs if libs else [] @@ -203,11 +229,5 @@ def popen_test_rpc(host, if options: cmd += options cmd += ["test"] - if "-quiet" in options: - with open(os.devnull, 'w') as devnull: - proc = subprocess.Popen(cmd, - stderr=subprocess.STDOUT, - stdout=devnull) - else: - proc = subprocess.Popen(cmd) - return proc + + return XCodeRPCServer(cmd, lock) diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 89bb3a052b0f..b692ab729566 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -57,7 +57,7 @@ class GraphRuntime : public ModuleNode { } /*! * \brief Initialize the graph executor with graph and context. - * \param graph The execution graph. + * \param graph_json The execution graph. * \param module The module containing the compiled functions. * \param ctx The context where the graph should sit on */ @@ -94,7 +94,7 @@ class GraphRuntime : public ModuleNode { /*! * \brief set index-th input to the graph. * \param index The input index. - * \param data The input data. + * \param data_in The input data. 
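
A short usage sketch for the file lock helper introduced above (tvm.contrib.util.filelock); the lock file name is arbitrary, and a POSIX system is assumed since the implementation relies on fcntl:

    from tvm.contrib import util

    temp = util.tempdir()
    lock = util.filelock(temp.relpath("build.lock"))  # blocks until the exclusive lock is held
    try:
        # work that must not run concurrently, e.g. driving a single attached device
        pass
    finally:
        lock.release()
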
*/ void SetInput(int index, DLTensor* data_in) { CHECK_LT(static_cast(index), input_nodes_.size()); From e4ead4a33288ce5ec4abbeab2465baa896092271 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sat, 10 Mar 2018 03:39:38 +0800 Subject: [PATCH 195/948] fix keeping trivial loop (#982) --- src/schedule/schedule_ops.cc | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/schedule/schedule_ops.cc b/src/schedule/schedule_ops.cc index 1fbffb61fc7f..b9b02050a556 100644 --- a/src/schedule/schedule_ops.cc +++ b/src/schedule/schedule_ops.cc @@ -57,8 +57,10 @@ class InjectAttach : public IRMutator { public: InjectAttach(const Stage& stage, const Stage& attach_spec, - const std::unordered_map& dom_map) - : stage_(stage), attach_spec_(attach_spec), dom_map_(dom_map) {} + const std::unordered_map& dom_map, + bool del_trivial_loop) + : stage_(stage), attach_spec_(attach_spec), dom_map_(dom_map), + del_trivial_loop_(del_trivial_loop) {} Stmt Mutate(Stmt stmt) final { CHECK(stmt.defined()); @@ -74,7 +76,7 @@ class InjectAttach : public IRMutator { found_attach = true; stmt = AttrStmt::make( op->node, op->attr_key, op->value, - MakePipeline(stage_, dom_map_, op->body, true)); + MakePipeline(stage_, dom_map_, op->body, del_trivial_loop_)); } } return stmt; @@ -89,6 +91,8 @@ class InjectAttach : public IRMutator { const Stage& attach_spec_; // domain map const std::unordered_map& dom_map_; + // whether delete trivial loops with extent of 1 + bool del_trivial_loop_; }; // inject the operator's realization on the stmt. @@ -97,9 +101,10 @@ class InjectScanStep : public IRMutator { InjectScanStep(const Stage& stage, const Operation& scan_op, const std::unordered_map& dom_map, - bool is_init) + bool is_init, + bool del_trivial_loop) : stage_(stage), scan_op_(scan_op), - dom_map_(dom_map), is_init_(is_init) {} + dom_map_(dom_map), is_init_(is_init), del_trivial_loop_(del_trivial_loop) {} Stmt Mutate(Stmt stmt) final { CHECK(stmt.defined()); @@ -113,7 +118,7 @@ class InjectScanStep : public IRMutator { found_attach = true; stmt = AttrStmt::make( op->node, op->attr_key, op->value, - MakePipeline(stage_, dom_map_, op->body, true)); + MakePipeline(stage_, dom_map_, op->body, del_trivial_loop_)); } } return stmt; @@ -130,6 +135,8 @@ class InjectScanStep : public IRMutator { const std::unordered_map& dom_map_; // whether it is init. 
bool is_init_; + // whether delete trivial loops with extent of 1 + bool del_trivial_loop_; }; // Postprocessing of schedule op @@ -365,14 +372,14 @@ Stmt ScheduleOps( if (scan_init.count(s->op)) { CHECK(body.defined()); - InjectScanStep mu(s, scan_init.at(s->op), dom_map, true); + InjectScanStep mu(s, scan_init.at(s->op), dom_map, true, del_trivial_loop); body = mu.Mutate(body); CHECK(mu.found_attach) << "did not find attachment point for scan.init"; } else if (attach_spec->attach_type == kScanUpdate) { // Handle scan update CHECK(body.defined()); - InjectScanStep mu(s, attach_spec->attach_stage->op, dom_map, false); + InjectScanStep mu(s, attach_spec->attach_stage->op, dom_map, false, del_trivial_loop); body = mu.Mutate(body); CHECK(mu.found_attach) << "did not find attachment point for scan.update"; @@ -384,7 +391,7 @@ Stmt ScheduleOps( } else { CHECK_EQ(attach_spec->attach_type, kScope); CHECK(body.defined()); - InjectAttach mutator(s, attach_spec, dom_map); + InjectAttach mutator(s, attach_spec, dom_map, del_trivial_loop); body = mutator.Mutate(body); CHECK(mutator.found_attach) << "did not find attachment point for " << s << " in " From ba143fb8a64c82a39d1965f59b92b527bf0a8901 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 9 Mar 2018 11:39:58 -0800 Subject: [PATCH 196/948] prevent aggressive unrolling in vthread (#983) --- src/pass/inject_virtual_thread.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pass/inject_virtual_thread.cc b/src/pass/inject_virtual_thread.cc index c0e7b4b3ff9e..833513756053 100644 --- a/src/pass/inject_virtual_thread.cc +++ b/src/pass/inject_virtual_thread.cc @@ -418,7 +418,8 @@ class VTInjector : public IRMutator { // reset the flags after processing. vt_loop_injected_ = false; visit_touched_var_ = false; - if (max_loop_depth_ == 0) { + // only unroll if number of vthreads are small + if (max_loop_depth_ == 0 && num_threads_ < 16) { // do unrolling if it is inside innermost content. 
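
The unrolling capped above only concerns loops bound to the "vthread" axis. A minimal schedule that produces such a loop, with sizes chosen arbitrarily (four virtual threads, i.e. below the new threshold of 16):

    import tvm

    n = 1024
    A = tvm.placeholder((n,), name="A")
    B = tvm.compute((n,), lambda i: A[i] + 1.0, name="B")
    s = tvm.create_schedule(B.op)

    bx, rest = s[B].split(B.op.axis[0], factor=256)
    vx, tx = s[B].split(rest, nparts=4)
    s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[B].bind(vx, tvm.thread_axis("vthread"))
    s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
    print(tvm.lower(s, [A, B], simple_mode=True))
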
Stmt blk = Substitute(stmt, {{var_, make_zero(var_.type())}}); for (int i = 1; i < num_threads_; ++i) { From 7c29d0ad61bb55de4ee761c9af10699b679f6f15 Mon Sep 17 00:00:00 2001 From: Pariksheet Pinjari Date: Sat, 10 Mar 2018 23:10:26 +0530 Subject: [PATCH 197/948] Bug fix for Android platforms (https://github.com/dmlc/tvm/pull/971) (#986) --- src/runtime/thread_pool.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc index bde16d8f56a8..74795f004811 100644 --- a/src/runtime/thread_pool.cc +++ b/src/runtime/thread_pool.cc @@ -355,6 +355,7 @@ class ThreadPool { // bind worker threads to disjoint cores void SetThreadAffinity() { #if defined(__ANDROID__) +#ifndef CPU_SET #define CPU_SETSIZE 1024 #define __NCPUBITS (8 * sizeof (uint64_t)) typedef struct { @@ -365,14 +366,19 @@ class ThreadPool { ((cpusetp)->__bits[(cpu)/__NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS))) #define CPU_ZERO(cpusetp) \ memset((cpusetp), 0, sizeof(cpu_set_t)) +#endif #endif for (int i=0; i < num_workers_; ++i) { #if defined(__linux__) || defined(__ANDROID__) cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(i, &cpuset); +#if defined(__ANDROID__) + sched_setaffinity(threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset); +#else pthread_setaffinity_np(threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset); +#endif #endif } } From f5e6431f0d3a793158c61662a5f8a1ddbb0865ed Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 10 Mar 2018 09:40:52 -0800 Subject: [PATCH 198/948] [FFI] Fix global free destruction (#985) --- python/tvm/_ffi/_ctypes/function.py | 2 +- python/tvm/_ffi/_ctypes/node.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/tvm/_ffi/_ctypes/function.py b/python/tvm/_ffi/_ctypes/function.py index ec278bc2045c..189d9964baf5 100644 --- a/python/tvm/_ffi/_ctypes/function.py +++ b/python/tvm/_ffi/_ctypes/function.py @@ -165,7 +165,7 @@ def __init__(self, handle, is_global): self.is_global = is_global def __del__(self): - if not self.is_global: + if not self.is_global and _LIB is not None: check_call(_LIB.TVMFuncFree(self.handle)) def __call__(self, *args): diff --git a/python/tvm/_ffi/_ctypes/node.py b/python/tvm/_ffi/_ctypes/node.py index 08efc3913084..cb32b83291d1 100644 --- a/python/tvm/_ffi/_ctypes/node.py +++ b/python/tvm/_ffi/_ctypes/node.py @@ -44,7 +44,8 @@ def __init__(self, handle): self.handle = handle def __del__(self): - check_call(_LIB.TVMNodeFree(self.handle)) + if _LIB is not None: + check_call(_LIB.TVMNodeFree(self.handle)) def __getattr__(self, name): ret_val = TVMValue() From eb86aa8ec1016883309f104e8bc956af45cacda9 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 10 Mar 2018 14:25:45 -0800 Subject: [PATCH 199/948] [CYTHON] Correct backtrace print for python3 (#989) --- python/tvm/_ffi/_cython/base.pxi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/_ffi/_cython/base.pxi b/python/tvm/_ffi/_cython/base.pxi index 0bbcc7064960..b84391648f35 100644 --- a/python/tvm/_ffi/_cython/base.pxi +++ b/python/tvm/_ffi/_cython/base.pxi @@ -37,7 +37,7 @@ cdef extern from "tvm/runtime/c_runtime_api.h": DLDataType dtype int64_t* shape int64_t* strides - uint64_t byte_offset; + uint64_t byte_offset ctypedef struct TVMValue: int64_t v_int64 @@ -128,7 +128,7 @@ cdef inline c_str(pystr): cdef inline CALL(int ret): if ret != 0: - raise TVMError(TVMGetLastError()) + raise TVMError(py_str(TVMGetLastError())) cdef inline object ctypes_handle(void* chandle): From 
6287a99b9c65ae3a2453047a69d5db73838bd932 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 10 Mar 2018 15:32:32 -0800 Subject: [PATCH 200/948] [RUNTIME] Update graph runtime to rely on smarter planner, add get_input (#990) --- python/tvm/contrib/graph_runtime.py | 15 +++++++++++++++ src/runtime/graph/graph_runtime.cc | 26 ++++++++++++++++++-------- src/runtime/thread_pool.cc | 2 +- 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index ddabac004993..a303895ceb1d 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -72,6 +72,7 @@ def __init__(self, module, ctx): self._set_input = module["set_input"] self._run = module["run"] self._get_output = module["get_output"] + self._get_input = module["get_input"] try: self._debug_get_output = module["debug_get_output"] except AttributeError: @@ -111,6 +112,20 @@ def run(self, **input_dict): self.set_input(**input_dict) self._run() + def get_input(self, index, out): + """Get index-th input to out + + Parameters + ---------- + index : int + The input index + + out : NDArray + The output array container + """ + self._get_input(index, out) + return out + def get_output(self, index, out): """Get index-th output to out diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index b692ab729566..892967b310e2 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -101,6 +101,16 @@ class GraphRuntime : public ModuleNode { uint32_t eid = this->entry_id(input_nodes_[index], 0); TVM_CCALL(TVMArrayCopyFromTo(data_in, &data_entry_[eid], nullptr)); } + /*! + * \brief Copy index-th input to data_out + * \param index The input index. + * \param data_out The output + */ + void GetInput(int index, DLTensor* data_out) { + CHECK_LT(static_cast(index), input_nodes_.size()); + uint32_t eid = this->entry_id(input_nodes_[index], 0); + TVM_CCALL(TVMArrayCopyFromTo(&data_entry_[eid], data_out, nullptr)); + } /*! * \brief Copy index-th output to data_out. * \param index The output index. @@ -463,14 +473,6 @@ void GraphRuntime::SetupStorage() { vtype.push_back(tvm::runtime::String2TVMType(s_type)); } data_entry_.resize(num_node_entries()); - // Find the maximum space size. - int max_id = 0; - for (size_t i = 0; i < attrs_.shape.size(); ++i) { - max_id = std::max(attrs_.storage_id[i] + 1, max_id); - } - for (uint32_t nid : input_nodes_) { - attrs_.storage_id[this->entry_id(nid, 0)] = max_id++; - } // size of each storage pool entry std::vector pool_entry_bytes; // Find the maximum space size. @@ -592,6 +594,14 @@ PackedFunc GraphRuntime::GetFunction( return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->GetOutput(args[0], args[1]); }); + } else if (name == "get_input") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + if (args[0].type_code() == kStr) { + this->GetInput(this->GetInputIndex(args[0]), args[1]); + } else { + this->GetInput(args[0], args[1]); + } + }); #ifdef TVM_GRAPH_RUNTIME_DEBUG } else if (name == "debug_get_output") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc index 74795f004811..4e13fdd14151 100644 --- a/src/runtime/thread_pool.cc +++ b/src/runtime/thread_pool.cc @@ -204,7 +204,7 @@ class SpscTaskQueue { cv_.notify_all(); } - private: + protected: /*! * \brief Lock-free enqueue. * \param input The task to be enqueued. 
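
A usage sketch for the get_input method added above. The artifact names (deploy_graph.json, deploy_lib.so), the input name "data" and the tensor shapes are all hypothetical placeholders for whatever an earlier compilation step produced:

    import numpy as np
    import tvm
    from tvm.contrib import graph_runtime

    graph_json = open("deploy_graph.json").read()  # hypothetical compiled graph
    lib = tvm.module.load("deploy_lib.so")         # hypothetical compiled operators

    m = graph_runtime.create(graph_json, lib, tvm.cpu(0))
    m.set_input("data", np.zeros((1, 3, 224, 224), dtype="float32"))
    m.run()

    # read back what is actually stored for input 0 and output 0
    x = m.get_input(0, tvm.nd.empty((1, 3, 224, 224), "float32"))
    y = m.get_output(0, tvm.nd.empty((1, 1000), "float32"))
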
From a333b4a538cc4028f7831893479d6129567db7bb Mon Sep 17 00:00:00 2001 From: Atsushi Nukariya <34906252+AtuNuka@users.noreply.github.com> Date: Tue, 13 Mar 2018 13:59:56 +0900 Subject: [PATCH 201/948] [BUILD] Windows build with cuDNN support (#999) --- CMakeLists.txt | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index e072ced6fda1..eb52d1f82723 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,8 @@ tvm_option(USE_RTTI "Build with RTTI" ON) tvm_option(USE_MSVC_MT "Build with MT" OFF) tvm_option(INSTALL_DEV "Install compiler infrastructure" OFF) +tvm_option(USE_CUDNN "Build with cuDNN" OFF) + include_directories("include") include_directories("HalideIR/src") include_directories("dlpack/include") @@ -126,6 +128,24 @@ find_library(CUDA_NVRTC_LIBRARIES nvrtc ${CUDA_TOOLKIT_ROOT_DIR}/lib) list(APPEND TVM_LINKER_LIBS ${CUDA_NVRTC_LIB}) endif(MSVC) + + if(USE_CUDNN) + message(STATUS "Build with cuDNN support") + file(GLOB CONTRIB_CUDNN_SRCS src/contrib/cudnn/*.cc) + list(APPEND RUNTIME_SRCS ${CONTRIB_CUDNN_SRCS}) + if(MSVC) + find_library(CUDA_CUDNN_LIB cudnn + ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib/win32) + list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDNN_LIB}) + else(MSVC) + find_library(CUDA_CUDNN_LIB cudnn + ${CUDA_TOOLKIT_ROOT_DIR}/lib64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib) + list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDNN_LIB}) + endif(MSVC) + endif(USE_CUDNN) + add_definitions(-DTVM_CUDA_RUNTIME=1) else(USE_CUDA) add_definitions(-DTVM_CUDA_RUNTIME=0) From 71c2086c777cfa5424871a5cfc9ba38b018d924c Mon Sep 17 00:00:00 2001 From: Ding <37059654+dingobye@users.noreply.github.com> Date: Wed, 14 Mar 2018 02:52:21 +1100 Subject: [PATCH 202/948] [PASS] Add VerifyMemory pass and test cases (#410) (#993) --- include/tvm/ir_pass.h | 14 ++ python/tvm/build_module.py | 6 +- src/api/api_pass.cc | 1 + src/codegen/build_module.cc | 6 +- src/pass/verify_memory.cc | 168 ++++++++++++++++++ .../unittest/test_pass_verify_memory.py | 96 ++++++++++ 6 files changed, 289 insertions(+), 2 deletions(-) create mode 100644 src/pass/verify_memory.cc create mode 100644 tests/python/unittest/test_pass_verify_memory.py diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h index d7032ffdb1ae..572385d9a895 100644 --- a/include/tvm/ir_pass.h +++ b/include/tvm/ir_pass.h @@ -440,6 +440,20 @@ LoweredFunc PointerValueTypeRewrite(LoweredFunc f); * \return Transformed function. */ LoweredFunc LowerIntrin(LoweredFunc f, const std::string& target); + +/*! + * \brief Verify if memory accesses are legal for a specific target device type. + * + * In the case that tgt is cuda, if not all workload is bound with + * threads, CPU code is generated that tries to access GPU memory, + * which is illegal. This pass performs verification for this case. + * + * \param func The function to be verified. + * \param device_type The target device type. + * \return Success of memory verification. 
+ */ +bool VerifyMemory(LoweredFunc func, int device_type); + } // namespace ir } // namespace tvm diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index 7b6fa7715a7b..03a79860e9ee 100755 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -424,10 +424,15 @@ def build(sch, target = _target.current_target() if target is None else target target = _target.create(target) if target else _target.create("llvm") + device_type = ndarray.context(target.target_name, 0).device_type fhost = [] fdevice = [] for func in flist: + if not ir_pass.VerifyMemory(func, device_type): + raise ValueError( + "Direct host side access to device memory is detected in %s. " + "Did you forget to bind?" % func.name) if func.func_type == container.LoweredFunc.MixedFunc: if BuildConfig.current.detect_global_barrier: func = ir_pass.ThreadSync(func, "global") @@ -449,7 +454,6 @@ def build(sch, warnings.warn( "Specified target %s, but cannot find device code, did you do bind?" % target) - device_type = ndarray.context(target.target_name, 0).device_type fhost = [ir_pass.BindDeviceType(x, device_type) for x in fhost] fhost = [ir_pass.LowerTVMBuiltin(x) for x in fhost] diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc index 06c6b621abde..7ec6ef4009e4 100644 --- a/src/api/api_pass.cc +++ b/src/api/api_pass.cc @@ -128,5 +128,6 @@ REGISTER_PASS2(LowerThreadAllreduce); REGISTER_PASS2(LowerIntrin); REGISTER_PASS1(LowerTVMBuiltin); REGISTER_PASS1(CombineContextCall); +REGISTER_PASS2(VerifyMemory); } // namespace ir } // namespace tvm diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index e7f0cd41bb7b..dc0971b2d4c3 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -269,7 +269,11 @@ runtime::Module build(const Array& funcs, Array fhost; Array fdevice; - for (const auto &x : funcs) { + for (const auto& x : funcs) { + CHECK(ir::VerifyMemory(x, target.device_type)) + << "Direct host side access to device memory is detected in " << x->func_name() + << ". Did you forget to bind?"; + if (x->func_type == kMixedFunc) { auto func = x; if (config->detect_global_barrier) { diff --git a/src/pass/verify_memory.cc b/src/pass/verify_memory.cc new file mode 100644 index 000000000000..e928bedf266c --- /dev/null +++ b/src/pass/verify_memory.cc @@ -0,0 +1,168 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file verify_memory.cc + * \brief Pass to check if memory accesses are legal. + */ +#include +#include +#include + +namespace tvm { +namespace ir { +namespace { + +/*! + * \brief Verify if memory accesses are legal. + * + * In the case that tgt is cuda, if workload is not bound with + * threads, CPU code is generated that tries to access GPU memory, + * which is illegal. + * + * This pass performs such verification by checking if all Producer/Consumer + * with memory accesses are bound with threads when device type is GPU. 
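
The build-side check wired in above is easiest to see on a schedule that forgot its thread bindings. A minimal sketch; no GPU is required, since the verification runs before device code generation:

    import tvm

    n = tvm.var("n")
    A = tvm.placeholder((n,), name="A")
    B = tvm.compute(A.shape, lambda i: A[i] + 1.0, name="B")
    s = tvm.create_schedule(B.op)  # note: nothing bound to blockIdx/threadIdx

    try:
        tvm.build(s, [A, B], "cuda")
    except ValueError as err:
        print(err)  # reports host-side access to device memory: "Did you forget to bind?"
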
+ */ +class MemoryAccessVerifier final : protected IRVisitor { + public: + /// Special member functions + //@{ + explicit MemoryAccessVerifier(LoweredFunc f, int device_type) + : func_(f), dev_type_(device_type) {} + virtual ~MemoryAccessVerifier() = default; + MemoryAccessVerifier(const MemoryAccessVerifier &) = delete; + MemoryAccessVerifier(MemoryAccessVerifier &&) = delete; + MemoryAccessVerifier &operator=(const MemoryAccessVerifier &) = delete; + MemoryAccessVerifier &operator=(MemoryAccessVerifier &&) = delete; + //@} + + /// Interface to perform memory access verification + void Run() { + if (!IsGPUDevice(dev_type_)) return; + IRVisitor::Visit(func_->body); + } + + /// Verification result + bool Failed() const { return failure_; } + + protected: + /// Visitor implementation + //@{ + void Visit(const NodeRef &n) final { + if (Failed()) return; + IRVisitor::Visit(n); + } + + void Visit_(const LetStmt *op) final { + // Book keep definitions + defs_[op->var.get()] = op->value; + return IRVisitor::Visit_(op); + } + + void Visit_(const AttrStmt *op) final { + if (!InThreadEnv() && (op->attr_key == attr::thread_extent || + op->attr_key == attr::pipeline_exec_scope)) { + EnterThreadEnv(); + IRVisitor::Visit_(op); + ExitThreadEnv(); + } else { + IRVisitor::Visit_(op); + } + } + + void Visit_(const ProducerConsumer *op) final { + EnterProducerConsumer(op); + IRVisitor::Visit_(op); + ExitProducerConsumer(); + } + + void Visit_(const Load *op) final { + HandleLoadStoreToVariable(op->buffer_var); + return IRVisitor::Visit_(op); + } + + void Visit_(const Store *op) final { + HandleLoadStoreToVariable(op->buffer_var); + return IRVisitor::Visit_(op); + } + //@} + + /// Check if the value of a Variable comes from function argument. + bool IsFromFunctionArgs(const Variable *var) const { + const Variable *V = var; + while (true) { + CHECK(V) << "Invalid Variable\n"; + + // Variable is from function args. Return true. + if (V == func_->args[0].node_.get()) return true; + + // The value is expected to come from a tvm_struct_get Call. + // Get the first argument of tvm_struct_get, and continue. + const auto &iter = defs_.find(V); + if (iter == defs_.end()) return false; + const Call *C = iter->second.as(); + if (!C || C->name != intrinsic::tvm_struct_get) return false; + V = C->args[0].as(); + } + return false; + } + + /// Handle memory access to a Variable + void HandleLoadStoreToVariable(const VarExpr &var) { + // We skip the access within thread env. + if (InThreadEnv()) return; + + // We only check access within a producer/consumer. + // Because for load/store out side of producer/consumer, + // they don't have to be in thread env to stay legal (e.g. Load of args). + if (!InProducerConsumer()) return; + + // We only handle the variable from function argument. + // If it does not come from args, then it could be allocated internally, + // it may possibly be in host or device address space. + // We do not handle this case, and skip it conservatively. + if (!IsFromFunctionArgs(var.get())) return; + + // The verification fails in this case. 
+ SetFailure(); + } + + /// Status getter/setter + //@{ + bool InThreadEnv() const { return in_thread_env_; } + void EnterThreadEnv() { in_thread_env_ = true; } + void ExitThreadEnv() { in_thread_env_ = false; } + bool InProducerConsumer() const { return pc_ != nullptr; } + const ProducerConsumer *GetCurrentProducerConsumer() const { return pc_; } + void EnterProducerConsumer(const ProducerConsumer *pc) { this->pc_ = pc; } + void ExitProducerConsumer() { pc_ = nullptr; } + void SetFailure() { failure_ = true; } + //@} + + /// Check if a given DLDeviceType/TVMDeviceExtType value denotes GPU device. + static bool IsGPUDevice(int dev_type) { + return kDLGPU == dev_type || kDLOpenCL == dev_type || + kDLVulkan == dev_type || kDLMetal == dev_type || + kDLROCM == dev_type || kOpenGL == dev_type; + } + + private: + /// Status of visitor + //@{ + bool in_thread_env_{false}; + const ProducerConsumer *pc_{nullptr}; + bool failure_{false}; ///< If the verification fails (i.e. has illegal access) + //@} + LoweredFunc func_{nullptr}; ///< Function to be verified. + int dev_type_{kDLCPU}; ///< Device type + std::unordered_map defs_; ///< Variable definitions +}; +} // namespace + +/// Interface of VerifyMemory pass +bool VerifyMemory(LoweredFunc func, int device_type) { + MemoryAccessVerifier v(func, device_type); + v.Run(); + return !v.Failed(); +} + +} // namespace ir +} // namespace tvm diff --git a/tests/python/unittest/test_pass_verify_memory.py b/tests/python/unittest/test_pass_verify_memory.py new file mode 100644 index 000000000000..d1f5d4326621 --- /dev/null +++ b/tests/python/unittest/test_pass_verify_memory.py @@ -0,0 +1,96 @@ +import tvm + +# The following DLDeviceType/TVMDeviceExtType values +# are originally defined in dlpack.h and c_runtime_api.h. +gpu_devices = [2, 4, 7, 8, 10, 11] +other_devices = [1, 3, 9, 12] + + +def lower(sch, args): + binds = {} + arg_list = [] + for x in args: + if isinstance(x, tvm.tensor.Tensor): + buf = tvm.decl_buffer(x.shape, dtype=x.dtype, name=x.name) + assert x not in binds + binds[x] = buf + arg_list.append(buf) + else: + raise ValueError("args must be Tensor, Buffer or Var") + sch = sch.normalize() + bounds = tvm.schedule.InferBound(sch) + stmt = tvm.schedule.ScheduleOps(sch, bounds) + stmt = tvm.ir_pass.LoopPartition(stmt, False) + stmt = tvm.ir_pass.StorageFlatten(stmt, binds, 64) + func = tvm.ir_pass.MakeAPI(stmt, "myadd", arg_list, 0, True) + return func + + +# All computations are bound. +# So VerifyMemory pass is expected to succeed. +# +def test_verify_memory_all_bind(): + n = tvm.var("n") + A = tvm.placeholder((n,), name='A') + B = tvm.compute(A.shape, lambda i: A[i] + 1.0, name="B") + + # B is bound to threads. + s = tvm.create_schedule(B.op) + bx, tx = s[B].split(B.op.axis[0], factor=64) + s[B].bind(bx, tvm.thread_axis("blockIdx.x")) + s[B].bind(tx, tvm.thread_axis("threadIdx.x")) + + func = lower(s, [A, B]) + + for dev_type in gpu_devices + other_devices: + assert tvm.ir_pass.VerifyMemory(func, dev_type) + + +# Computations are not bound. +# So VerifyMemory pass fails when device type is GPU. +# +def test_verify_memory_not_bind(): + n = tvm.var("n") + A = tvm.placeholder((n,), name='A') + B = tvm.compute(A.shape, lambda i: A[i] + 1.0, name="B") + + # B is not bound to threads. + s = tvm.create_schedule(B.op) + + func = lower(s, [A, B]) + + for dev_type in gpu_devices: + assert not tvm.ir_pass.VerifyMemory(func, dev_type) + for dev_type in other_devices: + assert tvm.ir_pass.VerifyMemory(func, dev_type) + + +# Computations are partially bound. 
+# So VerifyMemory pass fails when device type is GPU. +# +def test_verify_memory_partially_bind(): + n = tvm.var("n") + A = tvm.placeholder((n,), name='A') + B = tvm.compute(A.shape, lambda i: A[i] + 1.0, name="B") + C = tvm.compute(B.shape, lambda i: B[i] + 2.0, name="C") + D = tvm.compute(C.shape, lambda i: C[i] + 2.0, name="D") + + # C is bound to threads, but B and D are not. + s = tvm.create_schedule([B.op, C.op, D.op]) + bx, tx = s[C].split(C.op.axis[0], factor=64) + s[C].bind(bx, tvm.thread_axis("blockIdx.x")) + s[C].bind(tx, tvm.thread_axis("threadIdx.x")) + + func = lower(s, [A, B, C, D]) + + for dev_type in gpu_devices: + assert not tvm.ir_pass.VerifyMemory(func, dev_type) + for dev_type in other_devices: + assert tvm.ir_pass.VerifyMemory(func, dev_type) + + +if __name__ == "__main__": + test_verify_memory_all_bind() + test_verify_memory_not_bind() + test_verify_memory_partially_bind() + From 1d6df5a187127f514ea48f6a8a74c77ff59c89f5 Mon Sep 17 00:00:00 2001 From: Pariksheet Pinjari Date: Wed, 14 Mar 2018 09:20:31 +0530 Subject: [PATCH 203/948] Updated documentation error (#1001) --- docs/how_to/install.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/how_to/install.md b/docs/how_to/install.md index 54db42281623..6e81c0407550 100644 --- a/docs/how_to/install.md +++ b/docs/how_to/install.md @@ -58,7 +58,7 @@ The configuration of tvm can be modified by ```config.mk``` any local modification will be ignored by git, then modify the according flags. - TVM optionally depends on LLVM. LLVM is required for CPU codegen that needs LLVM. - LLVM 4.0 or higher is needed for build with LLVM. Note that verison of LLVM from default apt may lower than 4.0. - - Since LLVM takes long time to build from source, you can download pre-built version of LLVM frorm + - Since LLVM takes long time to build from source, you can download pre-built version of LLVM from [LLVM Download Page](http://releases.llvm.org/download.html). 
- Unzip to a certain location, modify ```config.mk``` to add ```LLVM_CONFIG=/path/to/your/llvm/bin/llvm-config``` - You can also use [LLVM Nightly Ubuntu Build](https://apt.llvm.org/) From 19add5f9d8a969152cc4b5934d4074ecabdecbcf Mon Sep 17 00:00:00 2001 From: nhynes Date: Thu, 15 Mar 2018 19:06:46 -0700 Subject: [PATCH 204/948] Pluggable Thread Launching Mechanism (#991) --- apps/howto_deploy/tvm_runtime_pack.cc | 1 + apps/sgx/Makefile | 9 +- apps/sgx/app.cc | 21 +++-- apps/sgx/enclave.cc | 6 +- apps/sgx/enclave_config.xml | 6 +- apps/sgx/prepare_test_libs.py | 2 + apps/sgx/test_addone.edl | 6 +- apps/sgx/tvm_runtime_pack.cc | 6 +- include/tvm/runtime/threading_backend.h | 65 +++++++++++++ sgx/{sgx_runtime.cc => runtime_t.cc} | 20 ++-- sgx/runtime_u.cc | 34 +++++++ sgx/threading_backend.cc | 71 ++++++++++++++ sgx/tvm.edl | 15 +++ src/runtime/thread_pool.cc | 120 ++++++------------------ src/runtime/threading_backend.cc | 113 ++++++++++++++++++++++ 15 files changed, 374 insertions(+), 121 deletions(-) create mode 100644 include/tvm/runtime/threading_backend.h rename sgx/{sgx_runtime.cc => runtime_t.cc} (52%) create mode 100644 sgx/runtime_u.cc create mode 100644 sgx/threading_backend.cc create mode 100644 sgx/tvm.edl create mode 100644 src/runtime/threading_backend.cc diff --git a/apps/howto_deploy/tvm_runtime_pack.cc b/apps/howto_deploy/tvm_runtime_pack.cc index 9a090d863729..445768128413 100644 --- a/apps/howto_deploy/tvm_runtime_pack.cc +++ b/apps/howto_deploy/tvm_runtime_pack.cc @@ -25,6 +25,7 @@ #include "../../src/runtime/module.cc" #include "../../src/runtime/registry.cc" #include "../../src/runtime/file_util.cc" +#include "../../src/runtime/threading_backend.cc" #include "../../src/runtime/thread_pool.cc" // NOTE: all the files after this are optional modules diff --git a/apps/sgx/Makefile b/apps/sgx/Makefile index 6a1eeb5b8b1e..fd1d0cc8f2d5 100644 --- a/apps/sgx/Makefile +++ b/apps/sgx/Makefile @@ -26,6 +26,7 @@ pkg_cflags := -std=c++11 -O2 -fPIC\ -I${TVM_ROOT}/dlpack/include\ -I.\ -DDMLC_LOG_STACK_TRACE=0\ + -fmax-errors=4 pkg_ldflags := -L${TVM_ROOT}/lib @@ -40,7 +41,7 @@ enclave_cflags := -static -nostdinc\ -DDMLC_CXX11_THREAD_LOCAL=0\ $(enclave_include_paths)\ -enclave_cxxflags := -nostdinc++ $(enclave_cflags) +enclave_cxxflags := -nostdinc++ $(enclave_cflags) -DTVM_SGX_MAX_CONCURRENCY=4 enclave_ldflags :=\ -Wl,--no-undefined -nostdlib -nodefaultlibs -nostartfiles -L$(SGX_SDK)/lib64\ @@ -62,7 +63,7 @@ app_ldflags := -L$(SGX_SDK)/lib64\ all: lib/test_addone.signed.so bin/test_addone # Build rule for all-in-one TVM package library -lib/tvm_runtime_pack.o: tvm_runtime_pack.cc +lib/tvm_runtime_pack.o: tvm_runtime_pack.cc lib/test_addone_t.o @mkdir -p $(@D) $(CXX) -c $< -o $@ $(pkg_cflags) $(pkg_ldflags) $(enclave_cxxflags) -g @@ -94,7 +95,7 @@ lib/test_addone.signed.so: lib/test_addone.so enclave_config.xml # An app that runs the enclave bin/test_addone: app.cc lib/test_addone_u.o @mkdir -p $(@D) - $(CXX) $^ -o $@ $(app_cflags) $(app_ldflags) + $(CXX) $^ -o $@ $(app_cflags) $(app_ldflags) $(pkg_cflags) -g # Debugging runtime pack built without SGX (c.f. 
howto_deploy/tvm_runtime_pack.cc) lib/tvm_runtime_pack_nosgx.o: tvm_runtime_pack.cc @@ -104,7 +105,7 @@ lib/tvm_runtime_pack_nosgx.o: tvm_runtime_pack.cc # Debugging binary that runs TVM without SGX bin/addone_nosgx: enclave.cc lib/tvm_runtime_pack_nosgx.o lib/test_addone_sys.o @mkdir -p $(@D) - $(CXX) $^ -o $@ $(pkg_cflags) $(pkg_ldflags) -g + $(CXX) $^ -o $@ $(pkg_cflags) $(pkg_ldflags) -g -lpthread clean: rm -rf lib bin diff --git a/apps/sgx/app.cc b/apps/sgx/app.cc index 1516e8b4e925..d008bfb3777d 100644 --- a/apps/sgx/app.cc +++ b/apps/sgx/app.cc @@ -1,13 +1,15 @@ #include +#include #include "sgx_urts.h" #include "sgx_eid.h" #include "test_addone_u.h" +#include "../../sgx/runtime_u.cc" #define TOKEN_FILENAME "bin/test_addone.token" #define ENCLAVE_FILENAME "lib/test_addone.signed.so" -sgx_enclave_id_t global_eid = 0; // global EID shared by multiple threads +sgx_enclave_id_t tvm_sgx_eid; typedef struct _sgx_errlist_t { sgx_status_t err; @@ -80,7 +82,7 @@ int initialize_enclave(void) /* Step 2: call sgx_create_enclave to initialize an enclave instance */ /* Debug Support: set 2nd parameter to 1 */ - sgx_status = sgx_create_enclave(ENCLAVE_FILENAME, SGX_DEBUG_FLAG, &token, &updated, &global_eid, NULL); + sgx_status = sgx_create_enclave(ENCLAVE_FILENAME, SGX_DEBUG_FLAG, &token, &updated, &tvm_sgx_eid, NULL); if (sgx_status != SGX_SUCCESS) { print_error_message(sgx_status); if (fp != NULL) fclose(fp); @@ -105,7 +107,7 @@ int initialize_enclave(void) } int SGX_CDECL main(int argc, char *argv[]) { - if(initialize_enclave() < 0){ + if(initialize_enclave() < 0) { printf("Failed to initialize enclave.\n"); return -1; } @@ -113,12 +115,13 @@ int SGX_CDECL main(int argc, char *argv[]) { /* Run TVM within the enclave */ int addone_status; sgx_status_t sgx_status = SGX_ERROR_UNEXPECTED; - sgx_status = enclave_main(global_eid, &addone_status); + sgx_status = tvm_ecall_run_module(tvm_sgx_eid, nullptr, &addone_status); if (sgx_status != SGX_SUCCESS) { print_error_message(sgx_status); } - - sgx_destroy_enclave(global_eid); + tvm_ecall_shutdown(tvm_sgx_eid); + tvm::runtime::sgx::Shutdown(); + sgx_destroy_enclave(tvm_sgx_eid); if (addone_status == 1) { printf("It works!"); @@ -127,3 +130,9 @@ int SGX_CDECL main(int argc, char *argv[]) { printf("It doesn't work."); return -1; } + +extern "C" { +void ocall_println(const char* str) { + std::cout << "Enclave says: " << str << std::endl; +} +} diff --git a/apps/sgx/enclave.cc b/apps/sgx/enclave.cc index 7588455543e9..d43107288ecb 100644 --- a/apps/sgx/enclave.cc +++ b/apps/sgx/enclave.cc @@ -6,6 +6,8 @@ #include #endif +extern void Shutdown(); + /* This function mirrors the one in howto_deploy except without the iostream */ int Verify(tvm::runtime::Module mod, std::string fname) { // Get the function from the module. 
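
The Verify() routine has a close Python analogue, sketched below. It assumes a process that links a system-lib build containing the addonesys kernel, and that the tvm.module.system_lib() wrapper of this codebase is available; neither holds in a generic Python session, so this is illustration only:

    import numpy as np
    import tvm

    mod = tvm.module.system_lib()  # same registry entry as module._GetSystemLib
    fadd = mod["addonesys"]        # look up the packed function by name
    x = tvm.nd.array(np.arange(10, dtype="float32"))
    y = tvm.nd.array(np.zeros(10, dtype="float32"))
    fadd(x, y)
    np.testing.assert_allclose(y.asnumpy(), x.asnumpy() + 1.0)
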
@@ -43,9 +45,9 @@ int Verify(tvm::runtime::Module mod, std::string fname) { extern "C" { -int enclave_main() { +void tvm_ecall_run_module(const void* tvm_args, void* tvm_return_value) { tvm::runtime::Module mod_syslib = (*tvm::runtime::Registry::Get("module._GetSystemLib"))(); - return Verify(mod_syslib, "addonesys"); + *(int*)tvm_return_value = Verify(mod_syslib, "addonesys"); } } diff --git a/apps/sgx/enclave_config.xml b/apps/sgx/enclave_config.xml index d24da1882981..f7fc129d6e63 100644 --- a/apps/sgx/enclave_config.xml +++ b/apps/sgx/enclave_config.xml @@ -1,9 +1,9 @@ 0 0 - 0x2000 - 0x1000 - 1 + 0x100000 + 0x100000 + 5 1 0 0 diff --git a/apps/sgx/prepare_test_libs.py b/apps/sgx/prepare_test_libs.py index 1fa9d74ef1c9..715880e61c31 100644 --- a/apps/sgx/prepare_test_libs.py +++ b/apps/sgx/prepare_test_libs.py @@ -11,6 +11,8 @@ def prepare_test_libs(base_path): A = tvm.placeholder((n,), name='A') B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='B') s = tvm.create_schedule(B.op) + s[B].parallel(s[B].op.axis[0]) + print(tvm.lower(s, [A, B], simple_mode=True)) # Compile library in system library mode fadd_syslib = tvm.build(s, [A, B], 'llvm --system-lib', name='addonesys') diff --git a/apps/sgx/test_addone.edl b/apps/sgx/test_addone.edl index 58341a727a6b..0127a581803a 100644 --- a/apps/sgx/test_addone.edl +++ b/apps/sgx/test_addone.edl @@ -1,7 +1,7 @@ enclave { - from "sgx_tstdc.edl" import sgx_thread_wait_untrusted_event_ocall, sgx_thread_set_untrusted_event_ocall, sgx_thread_setwait_untrusted_events_ocall, sgx_thread_set_multiple_untrusted_events_ocall; + from "../../sgx/tvm.edl" import *; - trusted { - public int enclave_main(); + untrusted { + void ocall_println([in, string] const char *str); }; }; diff --git a/apps/sgx/tvm_runtime_pack.cc b/apps/sgx/tvm_runtime_pack.cc index 709386b78931..0d88af03a65d 100644 --- a/apps/sgx/tvm_runtime_pack.cc +++ b/apps/sgx/tvm_runtime_pack.cc @@ -5,7 +5,11 @@ * Please refer to the Makefile (rule lib/tvm_runtime_pack.o) for how to build. * */ -#include "../../sgx/sgx_runtime.cc" +#ifdef _LIBCPP_SGX_CONFIG +#include "lib/test_addone_t.h" +#endif +#include "../../sgx/runtime_t.cc" + #ifndef _LIBCPP_SGX_CONFIG #include "../../src/runtime/file_util.cc" #endif diff --git a/include/tvm/runtime/threading_backend.h b/include/tvm/runtime/threading_backend.h new file mode 100644 index 000000000000..6c8c4f5eb084 --- /dev/null +++ b/include/tvm/runtime/threading_backend.h @@ -0,0 +1,65 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file threading_backend.h + * \brief Utilities for manipulating thread pool threads. + */ +#ifndef TVM_RUNTIME_THREADING_BACKEND_H_ +#define TVM_RUNTIME_THREADING_BACKEND_H_ + +#include +#include +#include + +namespace tvm { +namespace runtime { +namespace threading { + +/*! + * \brief A platform-agnostic abstraction for managing a collection of + * thread pool threads. + */ +class ThreadGroup { + public: + class Impl; + + /*! + * \brief Creates a collection of threads which run a provided function. + * + * \param num_workers The total number of worker threads in this group. + Includes main thread if `exclude_worker0 = true` + * \param worker_callback A callback which is run in its own thread. + Receives the worker_id as an argument. + * \param exclude_worker0 Whether to use the main thread as a worker. + * If `true`, worker0 will not be launched in a new thread and + * `worker_callback` will only be called for values >= 1. This + * allows use of the main thread as a worker. 
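
The ThreadGroup contract documented above (the callback runs once per worker id, and worker 0 may stay on the calling thread) can be mimicked in a few lines of Python, purely as an illustration of the intended semantics; ToyThreadGroup is an invented name:

    import threading

    class ToyThreadGroup(object):
        def __init__(self, num_workers, worker_callback, exclude_worker0=False):
            self.threads = []
            first = 1 if exclude_worker0 else 0
            for worker_id in range(first, num_workers):
                t = threading.Thread(target=worker_callback, args=(worker_id,))
                t.start()
                self.threads.append(t)

        def join(self):
            for t in self.threads:
                t.join()

    def work(worker_id):
        print("worker", worker_id, "running")

    group = ToyThreadGroup(4, work, exclude_worker0=True)
    work(0)      # the excluded worker runs inline on the main thread
    group.join()
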
+ */ + ThreadGroup(int num_workers, + std::function worker_callback, + bool exclude_worker0 = false); + ~ThreadGroup(); + + /*! + * \brief Blocks until all non-main threads in the pool finish. + */ + void Join(); + + private: + Impl* impl_; +}; + +/*! + * \brief Platform-agnostic no-op. + */ +void Yield(); + +/*! + * \return the maximum number of effective workers for this system. + */ +int MaxConcurrency(); + +} // namespace threading +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_THREADING_BACKEND_H_ diff --git a/sgx/sgx_runtime.cc b/sgx/runtime_t.cc similarity index 52% rename from sgx/sgx_runtime.cc rename to sgx/runtime_t.cc index 6a0d0dfb224c..5f280ffce8e4 100644 --- a/sgx/sgx_runtime.cc +++ b/sgx/runtime_t.cc @@ -9,17 +9,15 @@ #include "../../src/runtime/module.cc" #include "../../src/runtime/registry.cc" #include "../../src/runtime/system_lib_module.cc" +#ifndef _LIBCPP_SGX_CONFIG +#include "../../src/runtime/threading_backend.cc" +#else +#include "threading_backend.cc" +#endif +#include "../../src/runtime/thread_pool.cc" -// dummy parallel runtime (for now) -int TVMBackendParallelLaunch( - FTVMParallelLambda flambda, - void* cdata, - int num_task) { - TVMParallelGroupEnv env = { nullptr /* sync_handle */, 1 /* num_task */ }; - return flambda(0 /* task_id */, &env, cdata); +extern "C" { +void tvm_ecall_shutdown() { + tvm::runtime::ThreadPool::Global()->Shutdown(); } - -int TVMBackendParallelBarrier(int task_id, TVMParallelGroupEnv* penv) { - return 0; } - diff --git a/sgx/runtime_u.cc b/sgx/runtime_u.cc new file mode 100644 index 000000000000..0acccf614b40 --- /dev/null +++ b/sgx/runtime_u.cc @@ -0,0 +1,34 @@ +#include +#include "../../src/runtime/threading_backend.cc" +#include + +extern sgx_enclave_id_t tvm_sgx_eid; +extern "C" { +sgx_status_t tvm_ecall_run_worker(sgx_enclave_id_t eid, const void* cb); +} + +namespace tvm { +namespace runtime { +namespace sgx { + +static std::unique_ptr sgx_thread_group; + +extern "C" { +void tvm_ocall_thread_pool_launch(int num_tasks, void* cb) { + std::function runner = [cb](int _worker_id) { + sgx_status_t sgx_status = SGX_ERROR_UNEXPECTED; + sgx_status = tvm_ecall_run_worker(tvm_sgx_eid, cb); + CHECK(sgx_status == SGX_SUCCESS) << "SGX Error: " << sgx_status; + }; + sgx_thread_group.reset(new tvm::runtime::threading::ThreadGroup( + num_tasks, runner, false /* include_main_thread */)); +} +} + +void Shutdown() { + sgx_thread_group->Join(); +} + +} // namespace sgx +} // namespace runtime +} // namespace tvm diff --git a/sgx/threading_backend.cc b/sgx/threading_backend.cc new file mode 100644 index 000000000000..7f820ab51581 --- /dev/null +++ b/sgx/threading_backend.cc @@ -0,0 +1,71 @@ +/*! 
+ * Copyright (c) 2018 by Contributors + * \file sgx/threading_backend.cc + * \brief SGX threading backend + */ +#include +#include +#include +#include +#include + +extern "C" { +sgx_status_t SGX_CDECL tvm_ocall_thread_pool_launch(int num_workers, void* cb); +} + +#ifndef TVM_SGX_MAX_CONCURRENCY +#define TVM_SGX_MAX_CONCURRENCY 1 +#endif + +namespace tvm { +namespace runtime { +namespace threading { + +class ThreadGroup::Impl { + public: + Impl(int num_workers, std::function worker_callback, + bool exclude_worker0) + : num_workers_(num_workers), + worker_callback_(worker_callback), + next_task_id_(exclude_worker0) { + CHECK(num_workers <= TVM_SGX_MAX_CONCURRENCY) + << "Tried spawning more threads than allowed by TVM_SGX_MAX_CONCURRENCY."; + sgx_status_t sgx_status = SGX_ERROR_UNEXPECTED; + sgx_status = tvm_ocall_thread_pool_launch(num_workers, this); + CHECK(sgx_status == SGX_SUCCESS) << "SGX Error: " << sgx_status; + } + + void RunTask() { + int task_id = next_task_id_++; + CHECK(task_id < num_workers_) + << "More workers entered enclave than allowed by TVM_SGX_MAX_CONCURRENCY"; + worker_callback_(task_id); + } + + private: + int num_workers_; + std::function worker_callback_; + std::atomic next_task_id_; +}; + +ThreadGroup::ThreadGroup(int num_workers, + std::function worker_callback, + bool exclude_worker0) + : impl_(new ThreadGroup::Impl(num_workers, worker_callback, exclude_worker0)) {} +void ThreadGroup::Join() {} +ThreadGroup::~ThreadGroup() { delete impl_; } + +void Yield() {} + +int MaxConcurrency() { return TVM_SGX_MAX_CONCURRENCY; } + +extern "C" { +void tvm_ecall_run_worker(const void* impl) { + if (!sgx_is_within_enclave(impl, sizeof(ThreadGroup::Impl))) return; + ((ThreadGroup::Impl*)impl)->RunTask(); +} +} + +} // namespace threading +} // namespace runtime +} // namespace tvm diff --git a/sgx/tvm.edl b/sgx/tvm.edl new file mode 100644 index 000000000000..e88ac0ac7f9c --- /dev/null +++ b/sgx/tvm.edl @@ -0,0 +1,15 @@ +enclave { + from "sgx_tstdc.edl" import *; + + trusted { + public void tvm_ecall_run_module([user_check] const void* tvm_args, + [user_check] void* tvm_ret_value); + public void tvm_ecall_run_worker([user_check] const void* cb); + public void tvm_ecall_shutdown(); + }; + + untrusted { + void tvm_ocall_thread_pool_launch(int num_workers, [user_check] void* cb); + }; +}; + diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc index 4e13fdd14151..ac42a0c032c5 100644 --- a/src/runtime/thread_pool.cc +++ b/src/runtime/thread_pool.cc @@ -5,6 +5,7 @@ */ #include #include +#include #include #include #include @@ -17,9 +18,6 @@ #include #include #include -#if defined(__linux__) -#include -#endif const constexpr int kL1CacheBytes = 64; @@ -73,14 +71,14 @@ class ParallelLauncher { return num_pending_ == 0; }); if (!has_error_) return 0; - std::ostringstream os; + std::string err(""); for (size_t i = 0; i < par_errors_.size(); ++i) { if (par_errors_[i].length() != 0) { - os << "Task " << i << " error: " << par_errors_[i] << '\n'; + err += "Task " + std::to_string(i) + " error: " + par_errors_[i] + '\n'; par_errors_[i].clear(); } } - TVMAPISetLastError(os.str().c_str()); + TVMAPISetLastError(err.c_str()); return -1; } // Signal that one job has finished. 
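As a rough illustration of how the ThreadGroup interface declared in include/tvm/runtime/threading_backend.h above is meant to be driven, here is a minimal sketch; the accumulator workload and variable names are invented for the example and are not part of this patch.

#include <tvm/runtime/threading_backend.h>
#include <atomic>

void ExampleLaunch() {
  using namespace tvm::runtime::threading;
  int num_workers = MaxConcurrency();
  std::atomic<int> acc(0);
  // With exclude_worker0 = false, every worker id in [0, num_workers) gets its
  // own thread and the callback is invoked once per id.
  ThreadGroup group(num_workers,
                    [&acc](int worker_id) { acc += worker_id; },
                    /*exclude_worker0=*/false);
  group.Join();  // block until all launched worker threads have finished
}

Passing true for exclude_worker0 instead skips spawning a thread for worker 0, leaving that slot for the calling thread, as the header comment above describes.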
@@ -157,7 +155,7 @@ class SpscTaskQueue { */ void Push(const Task& input) { while (!Enqueue(input)) { - std::this_thread::yield(); + tvm::runtime::threading::Yield(); } if (pending_.fetch_add(1) == -1) { std::unique_lock lock(mutex_); @@ -176,8 +174,8 @@ class SpscTaskQueue { // If a new task comes to the queue quickly, this wait avoid the worker from sleeping. // The default spin count is set by following the typical omp convention for (uint32_t i = 0; i < spin_count && pending_.load() == 0; ++i) { - std::this_thread::yield(); - } + tvm::runtime::threading::Yield(); + } if (pending_.fetch_sub(1) == 0) { std::unique_lock lock(mutex_); cv_.wait(lock, [this] { @@ -211,6 +209,8 @@ class SpscTaskQueue { * \return Whether the task is enqueued. */ bool Enqueue(const Task& input) { + if (exit_now_.load(std::memory_order_relaxed)) return false; + const uint32_t tail = tail_.load(std::memory_order_relaxed); if ((tail + 1) % kRingSize != (head_.load(std::memory_order_acquire))) { @@ -255,32 +255,17 @@ class SpscTaskQueue { // The thread pool class ThreadPool { public: - ThreadPool() { - const char *val = getenv("TVM_NUM_THREADS"); - if (val == nullptr) { - val = getenv("OMP_NUM_THREADS"); - } - if (val != nullptr) { - num_workers_ = atoi(val); - } else { -#if defined(_M_X64) || defined(__x86_64__) - // Half to not count hyper threading. - num_workers_ = std::thread::hardware_concurrency() / 2; -#else - num_workers_ = std::thread::hardware_concurrency(); -#endif - } - num_workers_ = std::max(num_workers_, 1); - this->Init(); - } - ~ThreadPool() { - for (std::unique_ptr& q : queues_) { - q->SignalForKill(); - } - for (std::thread& t : threads_) { - t.join(); + ThreadPool(): num_workers_(tvm::runtime::threading::MaxConcurrency()) { + for (int i = 0; i < num_workers_; ++i) { + // The SpscTaskQueue only host ONE item at a time + queues_.emplace_back(std::unique_ptr(new SpscTaskQueue())); } + threads_ = std::unique_ptr( + new tvm::runtime::threading::ThreadGroup( + num_workers_, [this](int worker_id) { this->RunWorker(worker_id); }, + false /* include_main_thread */)); } + ~ThreadPool() { Shutdown(); } int Launch(FTVMParallelLambda flambda, void* cdata, int num_task, @@ -307,38 +292,22 @@ class ThreadPool { return res; } + void Shutdown() { + for (std::unique_ptr& q : queues_) { + q->SignalForKill(); + } + threads_.reset(); + } + static ThreadPool* Global() { static ThreadPool inst; return &inst; } private: - // Initialize the pool. - void Init() { - for (int i = 0; i < num_workers_; ++i) { - // The SpscTaskQueue only host ONE item at a time - queues_.emplace_back( - std::unique_ptr(new SpscTaskQueue())); - } - threads_.resize(num_workers_); - for (int i = 0; i < num_workers_; ++i) { - threads_[i] = std::thread([this, i] { - this->RunWorker(queues_[i].get()); - }); - } - const char *val = getenv("TVM_BIND_THREADS"); - if (val == nullptr || atoi(val) == 1) { - if (num_workers_ <= std::thread::hardware_concurrency()) { - SetThreadAffinity(); - } else { - LOG(WARNING) - << "The thread affinity cannot be set when the number of workers is larger " - << "than the number of available cores in the system."; - } - } - } // Internal worker function. 
- void RunWorker(SpscTaskQueue* queue) { + void RunWorker(int worker_id) { + SpscTaskQueue* queue = queues_[worker_id].get(); SpscTaskQueue::Task task; ParallelLauncher::ThreadLocal()->is_worker = true; while (queue->Pop(&task)) { @@ -352,40 +321,9 @@ class ThreadPool { } } } - // bind worker threads to disjoint cores - void SetThreadAffinity() { -#if defined(__ANDROID__) -#ifndef CPU_SET - #define CPU_SETSIZE 1024 - #define __NCPUBITS (8 * sizeof (uint64_t)) - typedef struct { - uint64_t __bits[CPU_SETSIZE / __NCPUBITS]; - } cpu_set_t; - - #define CPU_SET(cpu, cpusetp) \ - ((cpusetp)->__bits[(cpu)/__NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS))) - #define CPU_ZERO(cpusetp) \ - memset((cpusetp), 0, sizeof(cpu_set_t)) -#endif -#endif - for (int i=0; i < num_workers_; ++i) { -#if defined(__linux__) || defined(__ANDROID__) - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(i, &cpuset); -#if defined(__ANDROID__) - sched_setaffinity(threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset); -#else - pthread_setaffinity_np(threads_[i].native_handle(), - sizeof(cpu_set_t), &cpuset); -#endif -#endif - } - } - // Number of workers int num_workers_; std::vector > queues_; - std::vector threads_; + std::unique_ptr threads_; }; } // namespace runtime @@ -411,7 +349,7 @@ int TVMBackendParallelBarrier(int task_id, TVMParallelGroupEnv* penv) { if (i != task_id) { while (sync_counter[i * kSyncStride].load( std::memory_order_relaxed) <= old_counter) { - std::this_thread::yield(); + tvm::runtime::threading::Yield(); } } } diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc new file mode 100644 index 000000000000..19ba9bf2d776 --- /dev/null +++ b/src/runtime/threading_backend.cc @@ -0,0 +1,113 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file threading_backend.cc + * \brief Native threading backend + */ +#include +#include +#include +#if defined(__linux__) +#include +#endif + +namespace tvm { +namespace runtime { +namespace threading { + +class ThreadGroup::Impl { + public: + Impl(int num_workers, + std::function worker_callback, + bool exclude_worker0) + : num_workers_(num_workers) { + CHECK_GE(num_workers, 1) + << "Requested a non-positive number of worker threads."; + for (int i = exclude_worker0; i < num_workers_; ++i) { + threads_.emplace_back([worker_callback, i] { worker_callback(i); }); + } + const char *val = getenv("TVM_BIND_THREADS"); + if (val == nullptr || atoi(val) == 1) { + if (num_workers_ <= std::thread::hardware_concurrency()) { + SetAffinity(); + } else { + LOG(WARNING) + << "The thread affinity cannot be set when the number of workers" + << "is larger than the number of available cores in the system."; + } + } + } + ~Impl() { Join(); } + + void Join() { + for (auto& t : threads_) { + if (t.joinable()) t.join(); + } + } + + private: + // bind worker threads to disjoint cores + void SetAffinity() { +#if defined(__ANDROID__) +#ifndef CPU_SET +#define CPU_SETSIZE 1024 +#define __NCPUBITS (8 * sizeof (uint64_t)) + typedef struct { + uint64_t __bits[CPU_SETSIZE / __NCPUBITS]; + } cpu_set_t; + +#define CPU_SET(cpu, cpusetp) \ + ((cpusetp)->__bits[(cpu)/__NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS))) +#define CPU_ZERO(cpusetp) \ + memset((cpusetp), 0, sizeof(cpu_set_t)) +#endif +#endif + for (unsigned i=0; i < threads_.size(); ++i) { +#if defined(__linux__) || defined(__ANDROID__) + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(i, &cpuset); +#if defined(__ANDROID__) + sched_setaffinity(threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset); +#else + 
pthread_setaffinity_np(threads_[i].native_handle(), + sizeof(cpu_set_t), &cpuset); +#endif +#endif + } + } + + int num_workers_; + std::vector threads_; +}; + +ThreadGroup::ThreadGroup(int num_workers, + std::function worker_callback, + bool exclude_worker0) + : impl_(new ThreadGroup::Impl(num_workers, worker_callback, exclude_worker0)) {} +ThreadGroup::~ThreadGroup() { delete impl_; } +void ThreadGroup::Join() { impl_->Join(); } + +void Yield() { + std::this_thread::yield(); +} + +int MaxConcurrency() { + int max_concurrency = 1; + const char *val = getenv("TVM_NUM_THREADS"); + if (val == nullptr) { + val = getenv("OMP_NUM_THREADS"); + } + if (val != nullptr) { + max_concurrency = atoi(val); + } else { + max_concurrency = std::thread::hardware_concurrency(); +#if defined(_M_X64) || defined(__x86_64__) + max_concurrency /= 2; // ignore hyper-threading +#endif + } + return std::max(max_concurrency, 1); +} + +} // namespace threading +} // namespace runtime +} // namespace tvm From df66a8581794d3ead389b929028160df7fea60af Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 15 Mar 2018 19:16:45 -0700 Subject: [PATCH 205/948] [CONTRIB] windows compatiblity (#1009) --- python/tvm/contrib/util.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/python/tvm/contrib/util.py b/python/tvm/contrib/util.py index 338567d6f619..e2b4011c4c0a 100644 --- a/python/tvm/contrib/util.py +++ b/python/tvm/contrib/util.py @@ -2,8 +2,12 @@ from __future__ import absolute_import as _abs import os import tempfile -import fcntl import shutil +try: + import fcntl +except ImportError: + fcntl = None + class TempDirectory(object): """Helper object to manage temp directory during testing. @@ -70,13 +74,15 @@ class FileLock(object): """ def __init__(self, path): self.lock_file = open(path, "w") - fcntl.lockf(self.lock_file, fcntl.LOCK_EX) + if fcntl: + fcntl.lockf(self.lock_file, fcntl.LOCK_EX) def release(self): """Release the lock""" if self.lock_file: - fcntl.lockf(self.lock_file, fcntl.LOCK_UN) + if fcntl: + fcntl.lockf(self.lock_file, fcntl.LOCK_UN) self.lock_file.close() self.lock_file = None From eafd0c00a3c940219e4c7ae927ab64e432fea32b Mon Sep 17 00:00:00 2001 From: Ding <37059654+dingobye@users.noreply.github.com> Date: Fri, 16 Mar 2018 13:30:20 +1100 Subject: [PATCH 206/948] [LANGUAGE] Verify Compute with respect to Reduce operations (#1006) --- src/op/compute_op.cc | 94 ++++++++++++++++--- .../unittest/test_lang_verify_compute.py | 64 +++++++++++++ 2 files changed, 143 insertions(+), 15 deletions(-) create mode 100644 tests/python/unittest/test_lang_verify_compute.py diff --git a/src/op/compute_op.cc b/src/op/compute_op.cc index 8b8bfbfe602e..f3f8335c195c 100644 --- a/src/op/compute_op.cc +++ b/src/op/compute_op.cc @@ -24,6 +24,9 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) TVM_REGISTER_NODE_TYPE(ComputeOpNode); +/// Verify if ComputeOp is valid with respect to Reduce operations. 
+static void VerifyComputeOp(const ComputeOpNode *op); + inline bool ReduceEqual(const ir::Reduce* a, const ir::Reduce* b) { return (a->combiner.same_as(b->combiner)) && (a->source.same_as(b->source)) && @@ -116,15 +119,9 @@ Operation ComputeOpNode::make(std::string name, n->body = body; if (n->body[0]->is_type()) { const ir::Reduce* reduce = n->body[0].as(); - for (size_t i = 1; i < n->body.size(); ++i) { - const ir::Reduce* reduce_ = n->body[i].as(); - CHECK(reduce_); - CHECK(ReduceEqual(reduce_, reduce)) - << "The Reduce inputs of ComputeOp should " - << "have the same attribute except value_index"; - } n->reduce_axis = reduce->axis; } + VerifyComputeOp(n.get()); return Operation(n); } @@ -151,18 +148,11 @@ Operation ComputeOpNode::ReplaceInputs( const Operation& self, const std::unordered_map& rmap) const { CHECK_EQ(self.operator->(), this); + VerifyComputeOp(this); Array arr; if (this->body[0]->is_type()) { // Specially handle reduce so the replaced op // still share all the components - const ir::Reduce* reduce = this->body[0].as(); - for (size_t i = 1; i < this->body.size(); ++i) { - const ir::Reduce* reduce_ = this->body[i].as(); - CHECK(reduce_); - CHECK(ReduceEqual(reduce_, reduce)) - << "The Reduce inputs of ComputeOp should " - << "have the same attribute except value_index"; - }\ Expr new_reduce = op::ReplaceTensor(this->body[0], rmap); if (!new_reduce.same_as(this->body[0])) { const ir::Reduce* r = new_reduce.as(); @@ -466,4 +456,78 @@ ComputeLoopNest ComputeLoopNest::make( // copy elison here. return ret; } + +namespace { +/*! + * \brief Verify if ComputeOp is valid with respect to Reduce operations. + * + * The following two properties are verified: + * (1) All Reduce operations must exist at top level. + * (2) For a list of operations, if one is Reduce, then the others + * must be Reduce as well; and their inputs should have the + * same attribute except value_index. + */ +class ComputeVerifier final : protected ir::IRVisitor { + public: + /// Special member functions + //@{ + explicit ComputeVerifier(const ComputeOpNode* compute) + : compute_(compute), reduce_(compute->body[0].as()) {} + virtual ~ComputeVerifier() = default; + ComputeVerifier(const ComputeVerifier&) = delete; + ComputeVerifier(ComputeVerifier&&) = delete; + ComputeVerifier& operator=(const ComputeVerifier&) = delete; + ComputeVerifier& operator=(ComputeVerifier&&) = delete; + //@} + + /// Interface to perform compute verification + void Run() { + for (const Expr e : compute_->body) { + // Check for consistency of top level reductions + const ir::Reduce* reduce = e.as(); + CHECK((reduce && reduce_) || (!reduce && !reduce_)) + << "All ComputeOp should be consistent " + << "with being Reduce operation or not."; + + if (reduce && reduce_) { + CHECK(ReduceEqual(reduce, reduce_)) + << "The Reduce inputs of ComputeOp should " + << "have the same attribute except value_index"; + } + + level_ = 0; + ir::IRVisitor::Visit(e); + } + } + + protected: + /// Visitor implementation + //@{ + void Visit(const NodeRef& n) final { + ++level_; + ir::IRVisitor::Visit(n); + --level_; + } + + void Visit_(const ir::Reduce* op) final { + // Check for non top level reductions + CHECK(0 == level_) + << "Reductions are only allowed at the top level of compute. 
" + << "Please create another tensor for further composition."; + } + //@} + + private: + const ComputeOpNode* compute_{nullptr}; ///< ComputeOpNode to verify + const ir::Reduce* reduce_{nullptr}; ///< Top level Reduce operation + int level_{0}; ///< Level of op being processed +}; +} // namespace + +/// Verify if ComputeOp is valid with respect to Reduce operations. +static void VerifyComputeOp(const ComputeOpNode* op) { + ComputeVerifier v(op); + v.Run(); +} + } // namespace tvm diff --git a/tests/python/unittest/test_lang_verify_compute.py b/tests/python/unittest/test_lang_verify_compute.py new file mode 100644 index 000000000000..1b9ecf453267 --- /dev/null +++ b/tests/python/unittest/test_lang_verify_compute.py @@ -0,0 +1,64 @@ +import tvm + +def test_verify_compute(): + n = tvm.var("n") + m = tvm.var("m") + A = tvm.placeholder((n, m), name='A') + k = tvm.reduce_axis((0, m), "k") + k_ = tvm.reduce_axis((0, m-1), "k_") + f1 = lambda i: tvm.sum(A[i, k], axis=k) + f2 = lambda i: A[i,0] + 1 + f3 = lambda i: tvm.sum(A[i, k], axis=k) + 1 + f4 = lambda i: A[i,0] * (tvm.sum(A[i, k], axis=k) + 1) + f5 = lambda i: (tvm.sum(A[i, k], axis=k), A[i,0] + 1) + f6 = lambda i: (tvm.sum(A[i, k], axis=k), tvm.sum(A[i, k_], axis=k_)) + + # + # Valid compute + try: + B = tvm.compute((n,), f1, name="B") + except tvm._ffi.base.TVMError as ex: + assert False + + # + # Valid compute + try: + B = tvm.compute((n,), f2, name="B") + except tvm._ffi.base.TVMError as ex: + assert False + + # + # Invalid compute with non top level reduction + try: + B = tvm.compute((n,), f3, name="B") + assert False + except tvm._ffi.base.TVMError as ex: + pass + + # + # Invalid compute with non top level reduction + try: + B = tvm.compute((n,), f4, name="B") + assert False + except tvm._ffi.base.TVMError as ex: + pass + + # + # Invalid compute with reduction and non-reduction batch ops + try: + B0, B1 = tvm.compute((n,), f5, name="B") + assert False + except tvm._ffi.base.TVMError as ex: + pass + + # + # Invalid compute with unequal batch reduction ops + try: + B0, B1 = tvm.compute((n,), f6, name="B") + assert False + except tvm._ffi.base.TVMError as ex: + pass + + +if __name__ == "__main__": + test_verify_compute() \ No newline at end of file From 3f5d090948613ee8cc55a14b670d28d044728d97 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 15 Mar 2018 19:50:55 -0700 Subject: [PATCH 207/948] Update contributor guide (#1010) --- CONTRIBUTORS.md | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 7a7f52b5ee82..e261aebd7245 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -1,14 +1,32 @@ Contributors of TVM =================== -TVM adopts Apache style committer model. The package is developed and used by the community. +TVM adopts Apache style committer model. +The package is developed and used by the community. + +We actively seek committers that come from community contributors who: +- Made substantial contributions to the project. + - All forms of contributions are valued (see detail in next section). +- Willing to spend time on maintaining and lead the project. + +Contributions +------------- +We value all forms of contributions, here is a non-comprehensive +list of contributions that are welcomed + +- Documentation and usage examples +- Community participation, answering questions and issues. 
+- Code readability and developer guide + - We welcome contributions that add code comments + to improve readability + - We also welcome contributions to docs to explain the + design choices of the internal. +- Test cases to make the codebase more robust +- Tutorials, blog posts, talks that promote the project. -We actively seek committers that comes from contributors who: -- Made substantial contribution to the project. -- Willing to spent time on maintaining and lead the project. How to Contribute ----------------- -See [Contributor guide](docs/how_to/contribute.md) on how to contribute +See [Contributor guide](docs/how_to/contribute.md) on how to contribute. Committers ---------- @@ -28,6 +46,7 @@ and are qualified to lead development and review changes of the owned module. - [Yuwei Hu](https://github.com/Huyuwei) TOPI - [Yizhi Liu](https://github.com/yzhliu) JVM package + List of Contributors -------------------- - [Full List of Contributors](https://github.com/dmlc/tvm/graphs/contributors) From 8f883d88335a8340d9d4ab5f7f0c3691f5a5875c Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 15 Mar 2018 21:22:29 -0700 Subject: [PATCH 208/948] [PASS] Fix reuse small buffer in storage rewrite (#1012) --- src/pass/storage_rewrite.cc | 1 + .../unittest/test_pass_storage_rewrite.py | 48 +++++++++++++++---- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc index ac3ca2561e01..0a8782366193 100644 --- a/src/pass/storage_rewrite.cc +++ b/src/pass/storage_rewrite.cc @@ -824,6 +824,7 @@ class StoragePlanRewriter : public IRMutator { if (e->attach_scope_ != attach_scope) continue; if (e->scope != scope) continue; if (e->elem_type != op->type.element_of()) continue; + e->const_nbits = std::max(const_nbits, e->const_nbits); const_free_map_.erase(it); return e; } diff --git a/tests/python/unittest/test_pass_storage_rewrite.py b/tests/python/unittest/test_pass_storage_rewrite.py index 9613a61b9b39..994db4ce5f15 100644 --- a/tests/python/unittest/test_pass_storage_rewrite.py +++ b/tests/python/unittest/test_pass_storage_rewrite.py @@ -76,7 +76,7 @@ def stmt_generater(dtype_list, length): E = ib.allocate(dtype, length, name="E", scope="local.L0A") E[j] = A[j].astype(dtype) + B[j].astype(dtype) + C[j].astype(dtype) + D[j].astype(dtype) return ib.get() - + def dtype_bit_len(dtype): index = 0 for i in dtype: @@ -109,7 +109,7 @@ def verify(n): dtype_list = ["float64", "int32", "uint16", "int8"] dtype_test(dtype_list, length) - + dtype_list = ["int8", "int32", "uint16", "uint8"] dtype_test(dtype_list, length) @@ -313,7 +313,7 @@ def mem_info_inp_buffer(): B2L = s.cache_read(B2, scope_tb, [B7, B9]) B4L = s.cache_read(B4, scope_tb, [B7, B12]) B3L = s.cache_read(B3, scope_tb, [B9, B13]) - B0L = s.cache_read(B0, scope_tb, [B10, B12]) + B0L = s.cache_read(B0, scope_tb, [B10, B12]) B8L = s.cache_write(B8, scope_tb) B11L = s.cache_write(B11, scope_tb) @@ -324,7 +324,7 @@ def mem_info_inp_buffer(): B10L = s.cache_write(B10, scope_tb) B12L = s.cache_write(B12, scope_tb) B13L = s.cache_write(B13, scope_tb) - + s[B12].compute_inline() s[B13].compute_inline() s[B8].compute_inline() @@ -334,12 +334,12 @@ def mem_info_inp_buffer(): s[B7].compute_inline() s[B9].compute_inline() s[B10].compute_inline() - - s = s.normalize() + + s = s.normalize() bounds = tvm.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) stmt = tvm.schedule.ScheduleOps(s, bounds) - + B0a = tvm.decl_buffer(B0.shape, B0.dtype, name='B0') B1a = tvm.decl_buffer(B1.shape, 
B1.dtype, name='B1') B2a = tvm.decl_buffer(B2.shape, B2.dtype, name='B2') @@ -411,6 +411,38 @@ def verify(n): tvm.ir_pass.PostOrderVisit(body, verify) assert num_alloc[0] == 1 + +def test_reuse_small_buffer(): + ib = tvm.ir_builder.create() + n = tvm.var("n") + with ib.for_range(0, n, name="i") as i: + with ib.for_range(0, 10, name="j") as j: + A = ib.allocate("int16", 200, name="A", scope="local.L0A") + A[j] = tvm.const(1, "int16") + B = ib.allocate("int16", 200, name="B", scope="local.L0A") + B[j] = tvm.const(1, "int16") + B1 = ib.allocate("int16", 200, name="B1", scope="local.L0A") + B1[j] = A[j] + B[j] + C = ib.allocate("int16", 400, name="C", scope="local.L0A") + C[j] = tvm.const(1, "int16") + D = ib.allocate("int16", 400, name="D", scope="local.L0A") + D[j] = tvm.const(1, "int16") + E = ib.allocate("int16", 400, name="E", scope="local.L0A") + E[j] = C[j] + + body = ib.get() + body = tvm.ir_pass.StorageRewrite(body) + + num_alloc = [0] + + def verify(n): + if isinstance(n, tvm.stmt.Allocate): + num_alloc[0] += 1 + assert n.extents[0].value == 800 + tvm.ir_pass.PostOrderVisit(body, verify) + assert num_alloc[0] == 1 + + if __name__ == "__main__": test_alloc_seq() test_alloc_different_dtypes() @@ -423,4 +455,4 @@ def verify(n): test_inplace_rule3() test_alloc_seq_type() test_alloc_seq_type2() - + test_reuse_small_buffer() From 82c563424be3cda45ee26edb47da87b87bca2e39 Mon Sep 17 00:00:00 2001 From: nhynes Date: Fri, 16 Mar 2018 10:36:14 -0700 Subject: [PATCH 209/948] Simplify enclave lifecycle management (#1013) --- apps/sgx/app.cc | 3 +-- apps/sgx/enclave.cc | 2 -- sgx/runtime_t.cc | 6 ------ sgx/runtime_u.cc | 8 +++++--- sgx/threading_backend.cc | 9 +++++++-- sgx/tvm.edl | 4 ++-- src/runtime/thread_pool.cc | 14 ++++++-------- 7 files changed, 21 insertions(+), 25 deletions(-) diff --git a/apps/sgx/app.cc b/apps/sgx/app.cc index d008bfb3777d..c130b1d2f3e9 100644 --- a/apps/sgx/app.cc +++ b/apps/sgx/app.cc @@ -119,8 +119,7 @@ int SGX_CDECL main(int argc, char *argv[]) { if (sgx_status != SGX_SUCCESS) { print_error_message(sgx_status); } - tvm_ecall_shutdown(tvm_sgx_eid); - tvm::runtime::sgx::Shutdown(); + sgx_destroy_enclave(tvm_sgx_eid); if (addone_status == 1) { diff --git a/apps/sgx/enclave.cc b/apps/sgx/enclave.cc index d43107288ecb..342e5984e0f9 100644 --- a/apps/sgx/enclave.cc +++ b/apps/sgx/enclave.cc @@ -6,8 +6,6 @@ #include #endif -extern void Shutdown(); - /* This function mirrors the one in howto_deploy except without the iostream */ int Verify(tvm::runtime::Module mod, std::string fname) { // Get the function from the module. 
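The diffs that follow remove the explicit shutdown ecall because thread-group teardown now happens in destructors; a condensed sketch of that RAII shape is shown here (the class and member names are stand-ins for illustration, not the actual enclave code).

#include <functional>
#include <thread>
#include <vector>

// Stand-in for the ThreadGroup::Impl pattern used by the SGX backend: launch
// workers in the constructor and join them in the destructor, so callers only
// need to let the object go out of scope (or destroy the enclave) to clean up.
class ScopedWorkers {
 public:
  ScopedWorkers(int num_workers, std::function<void(int)> body) {
    for (int i = 0; i < num_workers; ++i) {
      threads_.emplace_back(body, i);
    }
  }
  ~ScopedWorkers() {
    for (std::thread& t : threads_) {
      if (t.joinable()) t.join();  // no separate Shutdown() entry point needed
    }
  }

 private:
  std::vector<std::thread> threads_;
};

This is why app.cc above can drop the tvm_ecall_shutdown call and simply destroy the enclave.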
diff --git a/sgx/runtime_t.cc b/sgx/runtime_t.cc index 5f280ffce8e4..8ab326f862c0 100644 --- a/sgx/runtime_t.cc +++ b/sgx/runtime_t.cc @@ -15,9 +15,3 @@ #include "threading_backend.cc" #endif #include "../../src/runtime/thread_pool.cc" - -extern "C" { -void tvm_ecall_shutdown() { - tvm::runtime::ThreadPool::Global()->Shutdown(); -} -} diff --git a/sgx/runtime_u.cc b/sgx/runtime_u.cc index 0acccf614b40..08ea2acf643c 100644 --- a/sgx/runtime_u.cc +++ b/sgx/runtime_u.cc @@ -14,7 +14,8 @@ namespace sgx { static std::unique_ptr sgx_thread_group; extern "C" { -void tvm_ocall_thread_pool_launch(int num_tasks, void* cb) { + +void tvm_ocall_thread_group_launch(int num_tasks, void* cb) { std::function runner = [cb](int _worker_id) { sgx_status_t sgx_status = SGX_ERROR_UNEXPECTED; sgx_status = tvm_ecall_run_worker(tvm_sgx_eid, cb); @@ -23,12 +24,13 @@ void tvm_ocall_thread_pool_launch(int num_tasks, void* cb) { sgx_thread_group.reset(new tvm::runtime::threading::ThreadGroup( num_tasks, runner, false /* include_main_thread */)); } -} -void Shutdown() { +void tvm_ocall_thread_group_join() { sgx_thread_group->Join(); } +} + } // namespace sgx } // namespace runtime } // namespace tvm diff --git a/sgx/threading_backend.cc b/sgx/threading_backend.cc index 7f820ab51581..c26cf08b95e2 100644 --- a/sgx/threading_backend.cc +++ b/sgx/threading_backend.cc @@ -10,7 +10,8 @@ #include extern "C" { -sgx_status_t SGX_CDECL tvm_ocall_thread_pool_launch(int num_workers, void* cb); +sgx_status_t SGX_CDECL tvm_ocall_thread_group_launch(int num_workers, void* cb); +sgx_status_t SGX_CDECL tvm_ocall_thread_group_join(); } #ifndef TVM_SGX_MAX_CONCURRENCY @@ -31,10 +32,14 @@ class ThreadGroup::Impl { CHECK(num_workers <= TVM_SGX_MAX_CONCURRENCY) << "Tried spawning more threads than allowed by TVM_SGX_MAX_CONCURRENCY."; sgx_status_t sgx_status = SGX_ERROR_UNEXPECTED; - sgx_status = tvm_ocall_thread_pool_launch(num_workers, this); + sgx_status = tvm_ocall_thread_group_launch(num_workers, this); CHECK(sgx_status == SGX_SUCCESS) << "SGX Error: " << sgx_status; } + ~Impl() { + tvm_ocall_thread_group_join(); + } + void RunTask() { int task_id = next_task_id_++; CHECK(task_id < num_workers_) diff --git a/sgx/tvm.edl b/sgx/tvm.edl index e88ac0ac7f9c..f580fecfec89 100644 --- a/sgx/tvm.edl +++ b/sgx/tvm.edl @@ -5,11 +5,11 @@ enclave { public void tvm_ecall_run_module([user_check] const void* tvm_args, [user_check] void* tvm_ret_value); public void tvm_ecall_run_worker([user_check] const void* cb); - public void tvm_ecall_shutdown(); }; untrusted { - void tvm_ocall_thread_pool_launch(int num_workers, [user_check] void* cb); + void tvm_ocall_thread_group_launch(int num_workers, [user_check] void* cb); + void tvm_ocall_thread_group_join(); }; }; diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc index ac42a0c032c5..d70f03e08b64 100644 --- a/src/runtime/thread_pool.cc +++ b/src/runtime/thread_pool.cc @@ -265,7 +265,12 @@ class ThreadPool { num_workers_, [this](int worker_id) { this->RunWorker(worker_id); }, false /* include_main_thread */)); } - ~ThreadPool() { Shutdown(); } + ~ThreadPool() { + for (std::unique_ptr& q : queues_) { + q->SignalForKill(); + } + threads_.reset(); + } int Launch(FTVMParallelLambda flambda, void* cdata, int num_task, @@ -292,13 +297,6 @@ class ThreadPool { return res; } - void Shutdown() { - for (std::unique_ptr& q : queues_) { - q->SignalForKill(); - } - threads_.reset(); - } - static ThreadPool* Global() { static ThreadPool inst; return &inst; From 86afdd4c0e83cbe0f020c913b40bf5c436b55b97 
Mon Sep 17 00:00:00 2001 From: Salem Derisavi <33945117+derisavi-huawei@users.noreply.github.com> Date: Fri, 16 Mar 2018 18:09:38 -0700 Subject: [PATCH 210/948] file include/tvm/logging.h from AutoTensorize work (#1015) --- include/tvm/logging.h | 99 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 include/tvm/logging.h diff --git a/include/tvm/logging.h b/include/tvm/logging.h new file mode 100644 index 000000000000..fdd063082a88 --- /dev/null +++ b/include/tvm/logging.h @@ -0,0 +1,99 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file logging.h + * \brief logging utilities on top of dmlc-core + */ +#ifndef TVM_LOGGING_H_ +#define TVM_LOGGING_H_ + +// a technique that enables overriding macro names on the number of parameters. This is used +// to define other macros below +#define GET_MACRO(_1, _2, _3, _4, _5, NAME, ...) NAME + +/*! + * \brief COND_X calls COND_X_N where N is the number of parameters passed to COND_X + * X can be any of CHECK_GE, CHECK_EQ, CHECK, or LOG (defined dmlc-core/include/dmlc/logging.h.) + * COND_X (but not COND_X_N) are supposed to be used outside this file. + * The first parameter of COND_X (and therefore, COND_X_N), which we call 'quit_on_assert', + * is a boolean. The rest of the parameters of COND_X is the same as the parameters of X. + * quit_on_assert determines the overall behaviour of COND_X. If it's true COND_X + * quits the program on assertion failure. If it's false, then it moves on and somehow reports + * the assertion failure back to the macro caller in an appropriate manner (e.g, 'return false' + * in a function, or 'continue' or 'break' in a loop) + * The default behavior when quit_on_assertion is false, is to 'return false'. If this is not + * desirable, the macro caller can pass one more last parameter to COND_X to tell COND_X what + * to do when when quit_on_assertion is false and the assertion fails. + * + * Rationale: These macros were designed to implement functions that have two behaviours + * in a concise way. Those behaviours are quitting on assertion failures, or trying to + * move on from assertion failures. Note that these macros hide lots of control flow in them, + * and therefore, makes the logic of the whole code slightly harder to understand. However, + * in pieces of code that use these macros frequently, it will significantly shorten the + * amount of code needed to be read, and we won't need to clutter the main logic of the + * function by repetitive control flow structure. The first problem + * mentioned will be improved over time as the developer gets used to the macro. + * + * Here is an example of how to use it + * \code + * bool f(..., bool quit_on_assertion) { + * int a = 0, b = 0; + * ... + * a = ... + * b = ... + * // if quit_on_assertion is true, if a==b, continue, otherwise quit. + * // if quit_on_assertion is false, if a==b, continue, otherwise 'return false' (default behaviour) + * COND_CHECK_EQ(quit_on_assertion, a, b) << "some error message when quiting" + * ... + * for (int i = 0; i < N; i++) { + * a = ... + * b = ... + * // if quit_on_assertion is true, if a==b, continue, otherwise quit. + * // if quit_on_assertion is false, if a==b, continue, otherwise 'break' (non-default + * // behaviour, therefore, has to be explicitly specified) + * COND_CHECK_EQ(quit_on_assertion, a, b, break) << "some error message when quiting" + * } + * } + * \endcode + */ +#define COND_CHECK_GE(...) 
\ + GET_MACRO(__VA_ARGS__, COND_CHECK_GE_5, COND_CHECK_GE_4, COND_CHECK_GE_3)(__VA_ARGS__) +#define COND_CHECK_EQ(...) \ + GET_MACRO(__VA_ARGS__, COND_CHECK_EQ_5, COND_CHECK_EQ_4, COND_CHECK_EQ_3)(__VA_ARGS__) +#define COND_CHECK(...) \ + GET_MACRO(__VA_ARGS__, COND_CHECK_5, COND_CHECK_4, COND_CHECK_3, COND_CHECK_2)(__VA_ARGS__) +#define COND_LOG(...) \ + GET_MACRO(__VA_ARGS__, COND_LOG_5, COND_LOG_4, COND_LOG_3, COND_LOG_2)(__VA_ARGS__) + +// Not supposed to be used by users directly. +#define COND_CHECK_OP(quit_on_assert, x, y, what, op) \ + if (!quit_on_assert) { \ + if (!((x) op (y))) \ + what; \ + } \ + else /* NOLINT(*) */ \ + CHECK_##op(x, y) + +#define COND_CHECK_EQ_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, ==) +#define COND_CHECK_GE_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, >=) + +#define COND_CHECK_3(quit_on_assert, x, what) \ + if (!quit_on_assert) { \ + if (!(x)) \ + what; \ + } \ + else /* NOLINT(*) */ \ + CHECK(x) + +#define COND_LOG_3(quit_on_assert, x, what) \ + if (!quit_on_assert) { \ + what; \ + } \ + else /* NOLINT(*) */ \ + LOG(x) + +#define COND_CHECK_EQ_3(quit_on_assert, x, y) COND_CHECK_EQ_4(quit_on_assert, x, y, return false) +#define COND_CHECK_GE_3(quit_on_assert, x, y) COND_CHECK_GE_4(quit_on_assert, x, y, return false) +#define COND_CHECK_2(quit_on_assert, x) COND_CHECK_3(quit_on_assert, x, return false) +#define COND_LOG_2(quit_on_assert, x) COND_LOG_3(quit_on_assert, x, return false) + +#endif // TVM_LOGGING_H_ From 5c06ffa825ee867f94543523c78f661c0416a129 Mon Sep 17 00:00:00 2001 From: Pariksheet Pinjari Date: Sat, 17 Mar 2018 11:15:17 +0530 Subject: [PATCH 211/948] Documentation issues fixed (#1016) --- tutorials/optimize/opt_gemm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py index f1060bf46f45..44ee53a73399 100644 --- a/tutorials/optimize/opt_gemm.py +++ b/tutorials/optimize/opt_gemm.py @@ -13,9 +13,9 @@ In this tutorial, we will demonstrate how to use TVM to optimize square matrix multiplication and achieve 200 times faster than baseline by simply adding 18 extra lines of code. -There are two important optmizations on intense computation applications executed on CPU: +There are two important optimizations on intense computation applications executed on CPU: 1. Increase the cache hit rate of memory access. Both complex numerical computation and hot-spot - memory access can be acclerated from high cache hit rate. This requires us to transform the + memory access can be accelerated from high cache hit rate. This requires us to transform the origin memory access pattern to the pattern fits the cache policy. 2. SIMD (Single instruction multi-data), or we call it vector processing unit. Every time, a small batch of data, rather than a single grid, will be processed. This requires us to @@ -26,7 +26,7 @@ `repo `_. Some of them have been applied by TVM abstraction automatically, but some of them cannot be simply applied due to TVM constraints. -All the experiment results mentioned below, are executed on 2015's 15' MacBook equiped with +All the experiment results mentioned below, are executed on 2015's 15' MacBook equipped with Intel i7-4770HQ CPU. The cache line size should be 64 bytes for all the x86 CPUs. 
""" From 8595d19261a9434fffbaeca906d3d4c58f9e70d2 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 17 Mar 2018 12:22:10 -0700 Subject: [PATCH 212/948] [RUNTIME] More reliable thread enumeration (#1017) --- src/codegen/llvm/codegen_amdgpu.cc | 4 +-- src/codegen/llvm/codegen_nvptx.cc | 4 +-- src/codegen/spirv/codegen_spirv.cc | 4 +-- src/pass/storage_access.cc | 3 +- src/pass/storage_access.h | 1 + src/pass/storage_flatten.cc | 4 ++- src/pass/storage_rewrite.cc | 6 ++-- src/pass/storage_sync.cc | 17 ++++----- src/runtime/thread_storage_scope.h | 55 ++++++++++++++++++++++++++---- src/schedule/bound.cc | 12 ++++--- 10 files changed, 80 insertions(+), 30 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index 9b8995bf5516..fa42cefa07a7 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -44,7 +44,7 @@ class CodeGenAMDGPU : public CodeGenLLVM { if (info.alignment > 16) { info.alignment = 16; } - if (info.scope.rank == 2) { + if (info.scope.rank == runtime::StorageRank::kLocal) { // const int local_address_space = 5; // TODO(tqchen): for higher version of LLVM, local address space can be set. llvm::AllocaInst* alloca = builder_->CreateAlloca( @@ -54,7 +54,7 @@ class CodeGenAMDGPU : public CodeGenLLVM { } buf = alloca; } else { - CHECK_EQ(info.scope.rank, 1) + CHECK(info.scope.rank == runtime::StorageRank::kShared) << "Can only allocate shared or local memory inside kernel"; // Shared memory: address space == 3 const unsigned shared_address_space = 3; diff --git a/src/codegen/llvm/codegen_nvptx.cc b/src/codegen/llvm/codegen_nvptx.cc index c0002873d5fc..d354e3b9eaf0 100644 --- a/src/codegen/llvm/codegen_nvptx.cc +++ b/src/codegen/llvm/codegen_nvptx.cc @@ -47,7 +47,7 @@ class CodeGenNVPTX : public CodeGenLLVM { if (info.alignment > 16) { info.alignment = 16; } - if (info.scope.rank == 2) { + if (info.scope.rank == runtime::StorageRank::kLocal) { // const int local_address_space = 5; // TODO(tqchen): for higher version of LLVM, local address space can be set. 
llvm::AllocaInst* alloca = builder_->CreateAlloca( @@ -57,7 +57,7 @@ class CodeGenNVPTX : public CodeGenLLVM { } buf = alloca; } else { - CHECK_EQ(info.scope.rank, 1) + CHECK(info.scope.rank == runtime::StorageRank::kShared) << "Can only allocate shared or local memory inside kernel"; // Shared memory: address space == 3 const unsigned shared_address_space = 3; diff --git a/src/codegen/spirv/codegen_spirv.cc b/src/codegen/spirv/codegen_spirv.cc index 4d7a9b21ba5b..d844a7b11390 100644 --- a/src/codegen/spirv/codegen_spirv.cc +++ b/src/codegen/spirv/codegen_spirv.cc @@ -561,13 +561,13 @@ void CodeGenSPIRV::VisitStmt_(const Allocate* op) { spirv::Value buf; StorageInfo& info = storage_info_[op->buffer_var.get()]; spirv::SType etype = builder_->GetSType(op->type); - if (info.scope.rank == 2) { + if (info.scope.rank == runtime::StorageRank::kLocal) { buf = builder_->Allocate( etype, static_cast(constant_size), spv::StorageClassFunction); } else { // shared memory - CHECK_EQ(info.scope.rank, 1) + CHECK(info.scope.rank == runtime::StorageRank::kShared) << "Can only allocate shared or local memory inside kernel"; // Shared memory buf = builder_->Allocate( diff --git a/src/pass/storage_access.cc b/src/pass/storage_access.cc index 9211f3f71de0..09be1a53da42 100644 --- a/src/pass/storage_access.cc +++ b/src/pass/storage_access.cc @@ -210,7 +210,8 @@ void StorageAccessVisitor::Visit_(const Call* op) { StorageScope StorageAccessVisitor::GetScope(const Variable* buf) const { auto it = storage_scope_.find(buf); - StorageScope s; s.rank = 0; + StorageScope s; + s.rank = StorageRank::kGlobal; if (it == storage_scope_.end()) return s; return it->second; } diff --git a/src/pass/storage_access.h b/src/pass/storage_access.h index 7268bb668342..4f313f8e7c24 100644 --- a/src/pass/storage_access.h +++ b/src/pass/storage_access.h @@ -17,6 +17,7 @@ namespace tvm { namespace ir { using runtime::StorageScope; +using runtime::StorageRank; /*! 
* \brief Base class of storage access analysis */ diff --git a/src/pass/storage_flatten.cc b/src/pass/storage_flatten.cc index 94332ff5cb7e..f5cb98495ff9 100644 --- a/src/pass/storage_flatten.cc +++ b/src/pass/storage_flatten.cc @@ -23,6 +23,7 @@ namespace tvm { namespace ir { using HalideIR::Internal::Region; +using runtime::StorageRank; using runtime::StorageScope; using runtime::ThreadScope; using intrinsic::tvm_address_of; @@ -141,7 +142,8 @@ class StorageFlattener : public IRMutator { const std::string& strkey = it->second; if (strkey.length() == 0) { if (curr_thread_scope_.size() != 0) { - skey.rank = curr_thread_scope_.back().rank + 1; + skey.rank = runtime::DefaultStorageRank( + curr_thread_scope_.back().rank); } } else { skey = StorageScope::make(strkey); diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc index 0a8782366193..998df034e5a1 100644 --- a/src/pass/storage_rewrite.cc +++ b/src/pass/storage_rewrite.cc @@ -19,6 +19,7 @@ namespace tvm { namespace ir { +using runtime::StorageRank; using runtime::StorageScope; // Find a linear pattern of storage acess @@ -794,7 +795,7 @@ class StoragePlanRewriter : public IRMutator { // disable reuse of small arrays, they will be lowered to registers in LLVM // This rules only apply if we are using non special memory if (scope.tag.length() == 0) { - if (scope.rank > 1 || op->type.is_handle()) { + if (scope.rank >= StorageRank::kWarp || op->type.is_handle()) { return NewAlloc(op, attach_scope, scope, const_nbits); } if (const_nbits > 0 && const_nbits <= 32) { @@ -853,7 +854,8 @@ class StoragePlanRewriter : public IRMutator { // This rules only apply if we are using non special memory if (e->scope.tag.length() == 0) { // Disable sharing of local memory. - if (e->scope.rank > 1 || e->allocs[0]->type.is_handle()) return; + if (e->scope.rank >= StorageRank::kWarp || + e->allocs[0]->type.is_handle()) return; // disable reuse of small arrays if (e->const_nbits > 0 && e->const_nbits <= 32) return; } diff --git a/src/pass/storage_sync.cc b/src/pass/storage_sync.cc index af3dc1f128e5..6e2d1020a6b5 100644 --- a/src/pass/storage_sync.cc +++ b/src/pass/storage_sync.cc @@ -189,7 +189,7 @@ class ThreadSyncInserter : public IRMutator { if (syncs_.size() == 0) return stmt; if (syncs_.count(stmt.get())) { Stmt barrier; - if (sync_scope_.rank == 0) { + if (sync_scope_.rank == StorageRank::kGlobal) { barrier = MakeGlobalBarrier(); } else { barrier = Evaluate::make( @@ -206,15 +206,15 @@ class ThreadSyncInserter : public IRMutator { return stmt; } Expr Mutate_(const Load* op, const Expr& e) final { - if (sync_scope_.rank == 0 && - GetScope(op->buffer_var.get()).rank == 0) { + if (sync_scope_.rank == StorageRank::kGlobal && + GetScope(op->buffer_var.get()).rank == StorageRank::kGlobal) { ++rw_stats_[op->buffer_var].read_count; } return IRMutator::Mutate_(op, e); } Stmt Mutate_(const Store* op, const Stmt& s) final { - if (sync_scope_.rank == 0 && - GetScope(op->buffer_var.get()).rank == 0) { + if (sync_scope_.rank == StorageRank::kGlobal && + GetScope(op->buffer_var.get()).rank == StorageRank::kGlobal) { ++rw_stats_[op->buffer_var].write_count; } return IRMutator::Mutate_(op, s); @@ -228,7 +228,7 @@ class ThreadSyncInserter : public IRMutator { thread_extents_.pop_back(); std::swap(temp, in_thread_env_); // first thread scope. 
- if (!in_thread_env_ && sync_scope_.rank == 0) { + if (!in_thread_env_ && sync_scope_.rank == StorageRank::kGlobal) { ret = InitGlobalBarrier(ret.as()); num_blocks_ = Expr(); is_lead_ = Expr(); @@ -253,7 +253,8 @@ class ThreadSyncInserter : public IRMutator { // Get current storage scope. StorageScope GetScope(const Variable* buf) const { auto it = storage_scope_.find(buf); - StorageScope s; s.rank = 0; + StorageScope s; + s.rank = StorageRank::kGlobal; if (it == storage_scope_.end()) return s; return it->second; } @@ -279,7 +280,7 @@ class ThreadSyncInserter : public IRMutator { return Block::make(prep, body); } Stmt MakeGlobalBarrier() { - CHECK_EQ(sync_scope_.rank, 0); + CHECK(sync_scope_.rank == StorageRank::kGlobal); if (!num_blocks_.defined()) { CHECK(!is_lead_.defined()); num_work_dim_ = thread_extents_.size(); diff --git a/src/runtime/thread_storage_scope.h b/src/runtime/thread_storage_scope.h index 48b5e8f1ef16..647bbb82ea34 100644 --- a/src/runtime/thread_storage_scope.h +++ b/src/runtime/thread_storage_scope.h @@ -13,10 +13,47 @@ namespace tvm { namespace runtime { +/*! + * \brief Memory hierachy rank in the storage system + * \note The global rank and shared rank have one to one + * correspondence to the thread rank. + */ +enum class StorageRank { + /*! \brief global memory */ + kGlobal = 0, + /*! \brief shared memory among thread group */ + kShared = 1, + /*! + * \brief reserved for warp memory. + * This is only used by programming model. + * There is no such memory usually in GPU. + * Instead, we can simulate it by registers and shuffle. + */ + kWarp = 2, + /*! \brief thread local memory */ + kLocal = 3 +}; + +/*! + * \param thread_scope_rank The thread scope rank + * \return default storage rank given the thread scope + */ +inline StorageRank DefaultStorageRank(int thread_scope_rank) { + switch (thread_scope_rank) { + case -1: return StorageRank::kGlobal; + case 0: return StorageRank::kShared; + case 1: return StorageRank::kLocal; + default: { + LOG(FATAL) << "unknown rank"; + return StorageRank::kGlobal; + } + } +} + /*! \brief class to represent storage scope */ struct StorageScope { /*! \brief The rank of the storage */ - int rank{0}; + StorageRank rank{StorageRank::kGlobal}; /*! \brief tag for special purpose memory. 
*/ std::string tag; // comparator @@ -29,9 +66,10 @@ struct StorageScope { inline std::string to_string() const { std::string ret; switch (rank) { - case 0: return "global" + tag; - case 1: return "shared" + tag; - case 2: return "local" + tag; + case StorageRank::kGlobal: return "global" + tag; + case StorageRank::kShared: return "shared" + tag; + case StorageRank::kWarp: return "warp" + tag; + case StorageRank::kLocal: return "local" + tag; default: LOG(FATAL) << "unknown storage scope"; return ""; } } @@ -43,13 +81,16 @@ struct StorageScope { static StorageScope make(const std::string& s) { StorageScope r; if (s.compare(0, 6, "global") == 0) { - r.rank = 0; + r.rank = StorageRank::kGlobal; r.tag = s.substr(6, std::string::npos); } else if (s.compare(0, 6, "shared") == 0) { - r.rank = 1; + r.rank = StorageRank::kShared; r.tag = s.substr(6, std::string::npos); + } else if (s.compare(0, 4, "warp") == 0) { + r.rank = StorageRank::kWarp; + r.tag = s.substr(4, std::string::npos); } else if (s.compare(0, 5, "local") == 0) { - r.rank = 2; + r.rank = StorageRank::kLocal; r.tag = s.substr(5, std::string::npos); } else { LOG(FATAL) << "unknown storage scope " << s; diff --git a/src/schedule/bound.cc b/src/schedule/bound.cc index 1a06970e52e4..908b579ec9a4 100644 --- a/src/schedule/bound.cc +++ b/src/schedule/bound.cc @@ -16,8 +16,9 @@ namespace tvm { namespace schedule { -using runtime::ThreadScope; +using runtime::StorageRank; using runtime::StorageScope; +using runtime::ThreadScope; /*! \brief The graph context used during bound inference. */ struct GraphContext { @@ -41,7 +42,7 @@ bool NeedRelax(const IterVar& iv, if (tag.length() == 0 || tag == "pipeline") { return !found_attach; } - return scope.rank <= ThreadScope::make(tag).rank; + return static_cast(scope.rank) <= ThreadScope::make(tag).rank; } // infer storage scope, if not given @@ -50,16 +51,17 @@ StorageScope InferStorageScope( if (stage->scope.length() != 0) { return StorageScope::make(stage->scope); } - int max_rank = 0; + int max_rank = -1; for (IterVar iv : ctx.attach_path.at(stage->op)) { auto it = ctx.bind_map.find(iv); const std::string& tag = ( it != ctx.bind_map.end() ? 
it->second->thread_tag : iv->thread_tag); if (tag != "pipeline" && tag.length() != 0) { - max_rank = std::max(max_rank, ThreadScope::make(tag).rank + 1); + max_rank = std::max(max_rank, ThreadScope::make(tag).rank); } } - StorageScope s; s.rank = max_rank; + StorageScope s; + s.rank = runtime::DefaultStorageRank(max_rank); return s; } From 4afc2f9bcb30e3f744ad6a7653bf295a42b50dcc Mon Sep 17 00:00:00 2001 From: alex-weaver Date: Mon, 19 Mar 2018 07:17:25 +0000 Subject: [PATCH 213/948] Implement C++ registry to back Python target.generic_func (#892) --- include/tvm/build_module.h | 273 ++++++++++++-- python/tvm/target.py | 284 +++++++++----- src/codegen/build_module.cc | 441 ++++++++++++++++++---- src/runtime/threading_backend.cc | 1 + tests/cpp/build_module_test.cc | 2 +- tests/python/unittest/test_lang_target.py | 15 +- topi/include/topi/cuda/dense.h | 19 +- topi/include/topi/cuda/extern.h | 2 +- topi/include/topi/cuda/injective.h | 2 +- topi/include/topi/cuda/pooling.h | 2 +- topi/include/topi/cuda/reduction.h | 4 +- topi/include/topi/nn/dense.h | 13 +- topi/include/topi/rocm/dense.h | 19 +- topi/python/topi/__init__.py | 5 +- topi/python/topi/generic/injective.py | 2 +- topi/python/topi/generic/nn.py | 14 +- topi/python/topi/nn/dense.py | 2 +- topi/src/topi.cc | 133 +++++-- 18 files changed, 954 insertions(+), 279 deletions(-) diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h index 3fb55ae169ce..0cbc97d71e46 100644 --- a/include/tvm/build_module.h +++ b/include/tvm/build_module.h @@ -8,81 +8,146 @@ #include #include -#include "./tvm/runtime/packed_func.h" -#include "./tvm/schedule_pass.h" -#include "./tvm/lowered_func.h" +#include "./runtime/packed_func.h" +#include "./schedule_pass.h" +#include "./lowered_func.h" namespace tvm { +using namespace tvm::runtime; /*! * \brief Container for target device information. * Use target::llvm, target::cuda etc functions instead of constructing directly. */ -struct Target { +class TargetNode : public Node { + public: /*! \brief The name of the target device */ std::string target_name; /*! \brief The type of the target device */ - DLDeviceType device_type; + int device_type; /*! \brief The maximum threads that a schedule should use for this device */ int max_num_threads = 1; /*! \brief The warp size that should be used by the LowerThreadAllreduce pass */ int thread_warp_size = 1; /*! \brief Keys for this target */ - std::unordered_set keys; + Array keys_array; /*! \brief Options for this target */ - std::vector options; - /*! \brief Set of imported libs */ - std::unordered_set libs; - - Target(const std::string& target_name, - DLDeviceType device_type, - int max_num_threads, - int thread_warp_size, - const std::unordered_set& keys, - const std::vector& options, - const std::unordered_set& libs = - std::unordered_set()) : - target_name(target_name), - device_type(device_type), - max_num_threads(max_num_threads), - thread_warp_size(thread_warp_size), - keys(keys), - options(options), - libs(libs) { - } + Array options_array; + /*! \brief Collection of imported libs */ + Array libs_array; /*! \return the full device string to pass to codegen::Build */ EXPORT std::string str() const; + void VisitAttrs(AttrVisitor* v) final { + v->Visit("target_name", &target_name); + v->Visit("device_type", &device_type); + v->Visit("max_num_threads", &max_num_threads); + v->Visit("thread_warp_size", &thread_warp_size); + v->Visit("keys_array", &keys_array); + v->Visit("options_array", &options_array); + v->Visit("libs_array", &libs_array); + } + + /*! 
\brief Get the keys for this target as a vector of string */ + EXPORT std::vector keys() const; + + /*! \brief Get the options for this target as a vector of string */ + EXPORT std::vector options() const; + + /*! \brief Get the keys for this target as an unordered_set of string */ + EXPORT std::unordered_set libs() const; + + static constexpr const char* _type_key = "Target"; + TVM_DECLARE_NODE_TYPE_INFO(TargetNode, Node); +}; + +class Target : public NodeRef { + public: + Target() {} + explicit Target(std::shared_ptr n) : NodeRef(n) {} + /*! - * \brief Create a Target given a string - * \param target_str the string to parse - */ + * \brief Create a Target given a string + * \param target_str the string to parse + */ EXPORT static Target create(const std::string& target_str); + + /*! + * \brief Push a new target context onto the thread local stack. The Target on top of + * the stack is used to determine which specialization to use when invoking a GenericFunc. + * \param target The target to set as the current context. + */ + EXPORT static void EnterTargetScope(const tvm::Target& target); + + /*! + * \brief Pop a target off the thread local context stack, restoring the previous target + * as the current context. + */ + EXPORT static void ExitTargetScope(); + + /*! + * \brief Get the current target context from thread local storage. + * \param allow_not_defined If the context stack is empty and this is set to true, an + * undefined Target will be returned. Otherwise, an empty context stack will cause a + * runtime error. + * \return The target that is the current context. The target may not be defined if + * allow_not_defined is true. + */ + EXPORT static tvm::Target current_target(bool allow_not_defined = true); + + inline const TargetNode* operator->() const { + return static_cast(node_.get()); + } + + using ContainerType = TargetNode; +}; + +/*! + * \brief RAII container to provide a scoped target context. Pushes a target onto the + * context stack when constructed, and pops it when destructed. + */ +struct TargetContext { + /*! + * \brief Enter a new target context. The given target becomes the new current context. + * When the TargetContext is destructed, the previous context is restored. + * \param target The target to set as the new current context. + */ + explicit TargetContext(const tvm::Target& target) { + Target::EnterTargetScope(target); + } + + /*! \brief Destructor. Pops the context off the thread local stack. */ + ~TargetContext() { + Target::ExitTargetScope(); + } }; /*! \brief This namespace provides functions to construct Target instances */ namespace target { /*! \return A target for LLVM */ -EXPORT Target llvm(); +EXPORT Target llvm(const std::unordered_set& options = {}); /*! \return A target for CUDA */ -EXPORT Target cuda(); +EXPORT Target cuda(const std::unordered_set& options = {}); /*! \return A target for ROCm */ -EXPORT Target rocm(); +EXPORT Target rocm(const std::unordered_set& options = {}); + +/*! \return A target for OpenCL */ +EXPORT Target opencl(const std::unordered_set& options = {}); /*! \return A target for Metal */ -EXPORT Target metal(); +EXPORT Target metal(const std::unordered_set& options = {}); /*! \return A target for rasp */ -EXPORT Target rasp(); +EXPORT Target rasp(const std::unordered_set& options = {}); /*! \return A target for Mali */ -EXPORT Target mali(); +EXPORT Target mali(const std::unordered_set& options = {}); /*! 
\return A target for stackvm */ -EXPORT Target stackvm(); +EXPORT Target stackvm(const std::unordered_set& options = {}); } // namespace target @@ -174,15 +239,147 @@ EXPORT Array lower(Schedule sch, * \brief Build a device and host module for a specific target from an array of lowered functions. * \param funcs The functions to be built. * \param target The target device to build for. -* \param target_host The target for building host code. If null, a suitable default will be used. +* \param target_host The target for building host code. To use the default, pass Target() * \param config The build configuration. * \return The built module. */ EXPORT runtime::Module build(const Array& funcs, const Target& target, - Target* target_host, + const Target& target_host, const BuildConfig& config); +class GenericFuncNode; + +/*! + * \brief Generic function that can be specialized on a per-target basis. + */ +class GenericFunc : public NodeRef { + public: + GenericFunc() {} + explicit GenericFunc(std::shared_ptr n) : NodeRef(n) {} + + /*! + * \brief Set the default function implementaiton. + * \param value The default function + * \param allow_override If true, this call may override a previously registered function. If + * false, an error will be logged if the call would override a previously registered function. + * \return reference to self. + */ + TVM_DLL GenericFunc& set_default(const PackedFunc value, + bool allow_override = false); + /*! + * \brief Register a specialized function + * \param tags The tags for this specialization + * \param value The specialized function + * \param allow_override If true, this call may override previously registered tags. If false, + * an error will be logged if the call would override previously registered tags. + * \return reference to self. + */ + TVM_DLL GenericFunc& register_func(const std::vector& tags, + const PackedFunc value, + bool allow_override = false); + /*! + * \brief Call generic function by directly passing in unpacked format. + * \param args Arguments to be passed. + * \tparam Args arguments to be passed. + * + * \code + * // Example code on how to call generic function + * void CallGeneirc(GenericFunc f) { + * // call like normal functions by pass in arguments + * // return value is automatically converted back + * int rvalue = f(1, 2.0); + * } + * \endcode + */ + template + inline TVMRetValue operator()(Args&& ...args) const; + /*! + * \brief Invoke the relevant function for the current target context, set by set_target_context. + * Arguments are passed in packed format. + * \param args The arguments to pass to the function. + * \param ret The return value + */ + TVM_DLL void CallPacked(TVMArgs args, TVMRetValue* ret) const; + + /*! + * \brief Find or register the GenericFunc instance corresponding to the give name + * \param name The name of the registered GenericFunc + * \return The GenericFunc instance + */ + TVM_DLL static GenericFunc Get(const std::string& name); + + /*! + * \brief Add a GenericFunc instance to the registry + * \param func The GenericFunc instance + * \param name The name of the registered GenericFunc + */ + TVM_DLL static void RegisterGenericFunc(GenericFunc func, const std::string& name); + + /*! + * \brief access the internal node container + * \return the pointer to the internal node container + */ + inline GenericFuncNode* operator->(); + + // declare container type + using ContainerType = GenericFuncNode; + + // Internal class. 
+ struct Manager; + + private: + friend struct Manager; +}; + +template +inline TVMRetValue GenericFunc::operator()(Args&& ...args) const { + const int kNumArgs = sizeof...(Args); + const int kArraySize = kNumArgs > 0 ? kNumArgs : 1; + TVMValue values[kArraySize]; + int type_codes[kArraySize]; + detail::for_each(TVMArgsSetter(values, type_codes), + std::forward(args)...); + TVMRetValue rv; + CallPacked(TVMArgs(values, type_codes, kNumArgs), &rv); + return rv; +} + +/*! + * \brief Represents a generic function that can be specialized on a per-target basis. + */ +class GenericFuncNode : public Node { + public: + /*! \brief name of the function */ + std::string name_; + /* \brief the generic builder */ + PackedFunc generic_func_; + /* \brief map from keys to registered functions */ + std::unordered_map dispatch_dict_; + + static constexpr const char* _type_key = "GenericFunc"; + TVM_DECLARE_NODE_TYPE_INFO(GenericFuncNode, Node); +}; + +inline GenericFuncNode* GenericFunc::operator->() { + return static_cast(node_.get()); +} + +#define TVM_GENERIC_FUNC_REG_VAR_DEF \ + static TVM_ATTRIBUTE_UNUSED ::tvm::GenericFunc& __mk_ ## TVM + +/*! + * \def TVM_REGISTER_GENERIC_FUNC + * \brief Register a new generic function, or set a device-specific variant + * of the corresponding function. + * + * \param name The name of the function + */ +#define TVM_REGISTER_GENERIC_FUNC(name) \ + TVM_STR_CONCAT(TVM_GENERIC_FUNC_REG_VAR_DEF, __COUNTER__) = \ + ::tvm::GenericFunc::Get(#name) + + } // namespace tvm #endif // TVM_BUILD_MODULE_H_ diff --git a/python/tvm/target.py b/python/tvm/target.py index 7fa8998bfbe1..77fa50efe01c 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -40,8 +40,9 @@ """ from __future__ import absolute_import -import warnings from ._ffi.base import _LIB_NAME +from ._ffi.node import NodeBase, register_node +from . import _api_internal try: from decorator import decorate @@ -62,17 +63,10 @@ def _merge_opts(opts, new_opts): return opts -class Target(object): +@register_node +class Target(NodeBase): """Target device information, use through TVM API. - Parameters - ---------- - target_name : {"llvm", "cuda", "opencl", "metal", "rocm", "stackvm", "opengl", "ext_dev"} - The major target name. - - options : list of str, optional - Additional arguments appended to the target. 
- Note ---- Do not use class constructor, you can create target using the following functions @@ -83,68 +77,190 @@ class Target(object): - :any:`tvm.target.rocm` create ROCM target - :any:`tvm.target.mali` create Mali target """ - current = None - - def __init__(self, - target_name, - options=None): - self.target_name = target_name - self.options = _merge_opts([], options) - self.device_name = "" - self.libs = [] - # Parse device option - for item in self.options: - if item.startswith("-libs="): - libs = item.split("=")[1] - self.libs += libs.split(",") - elif item.startswith("-device="): - self.device_name = item.split("=")[1] - # Target query searches device name first - if self.device_name: - self.keys = (self.device_name,) - else: - self.keys = () - # Target configuration handling - self.thread_warp_size = 1 - if target_name in ("llvm", ): - self.keys += ("cpu",) - elif target_name in ("cuda", "nvptx"): - self.keys += ("cuda", "gpu") - self.max_num_threads = 512 - self.thread_warp_size = 32 - elif target_name in ("rocm", "opencl"): - # For now assume rocm schedule for opencl - self.keys += ("rocm", "gpu") - self.max_num_threads = 256 - elif target_name in ("metal", "vulkan"): - self.keys += (target_name, "gpu",) - self.max_num_threads = 256 - elif target_name in ("opengl",): - self.keys += ("opengl",) - elif target_name in ("stackvm", "ext_dev"): - # Do not now class for stackvm or ext_dev - pass - else: - raise ValueError("Unknown target name %s" % target_name) - - def __str__(self): - return " ".join([self.target_name] + self.options) - - def __repr__(self): - return self.__str__() + def __init__(self, handle): + super(Target, self).__init__(handle) + self._keys = None + self._options = None + self._libs = None + + @property + def keys(self): + if not self._keys: + self._keys = [k.value for k in self.keys_array] + return self._keys + + @property + def options(self): + if not self._options: + self._options = [o.value for o in self.options_array] + return self._options + + @property + def libs(self): + if not self._libs: + self._libs = [l.value for l in self.libs_array] + return self._libs def __enter__(self): - self._old_target = Target.current - if self._old_target is not None and str(self) != str(self._old_target): - warnings.warn( - "Override target '%s' with new target scope '%s'" % ( - self._old_target, self)) - Target.current = self + _api_internal._EnterTargetScope(self) return self def __exit__(self, ptype, value, trace): - Target.current = self._old_target + _api_internal._ExitTargetScope() + +@register_node +class GenericFunc(NodeBase): + """GenericFunc node reference. This represents a generic function + that may be specialized for different targets. When this object is + called, a specialization is chosen based on the current target. + + Note + ---- + Do not construct an instance of this object, it should only ever be + used as a return value from calling into C++. + """ + def __call__(self, *args): + return _api_internal._GenericFuncCallFunc(self, *args) + + def set_default(self, func, allow_override=False): + """Set the default function to be used if no specializations match + the current target. + + Parameters + ---------- + func : function + The default function + + allow_override : bool + Whether to allow the current default to be overridden + """ + _api_internal._GenericFuncSetDefault(self, func, allow_override) + + def register(self, func, key_list, allow_override=False): + """Register a specialization for this GenericFunc. 
+ + Parameters + ---------- + func : function + The function to be registered. + + key : str or list of str + The key to be registered. + + allow_override : bool, optional + Whether to allow existing keys to be overridden. + """ + key_list = [key_list] if isinstance(key_list, str) else key_list + _api_internal._GenericFuncRegisterFunc(self, func, key_list, allow_override) + +def get_native_generic_func(name): + """Get a generic function from the global registry. If no + function is registered under the given name, a new generic + function is created. + + Parameters + ---------- + name : string + The name of the generic function to get + + Returns + ------- + func : GenericFunc + The generic function for the given name + """ + return _api_internal._GenericFuncGetGlobal(name) + +def override_native_generic_func(func_name): + """Override a generic function defined in C++ + + Generic function allows registration of further functions + that can be dispatched on current target context. + If no registered dispatch is matched, the fdefault will be called. + + Parameters + ---------- + func_name : string + The name of the generic func to be overridden + + Returns + ------- + fgeneric : function + A wrapped generic function. + + Example + ------- + .. code-block:: python + + import tvm + # wrap function as target generic + @tvm.target.override_native_generic_func("my_func") + def my_func(a): + return a + 1 + # register specialization of my_func under target cuda + @my_func.register("cuda") + def my_func_cuda(a): + return a + 2 + # displays 3, because my_func is called + print(my_func(2)) + # displays 4, because my_func_cuda is called + with tvm.target.cuda(): + print(my_func(2)) + """ + generic_func_node = get_native_generic_func(func_name) + + def fdecorate(fdefault): + """Wrap a target generic function, overriding the previous + default that was set for the generic function. + + Parameters + ---------- + fdefault : function + The default function. + + Returns + ------- + fgeneric : function + A wrapped generic function. + """ + generic_func_node.set_default(fdefault, allow_override=True) + + def register(key, func=None, override=True): + """Register function to be the dispatch function. + + Parameters + ---------- + key : str or list of str + The key to be registered. + + func : function + The function to be registered. + + override : bool, optional + Whether override existing registration. + + Returns + ------- + The register function is necessary. + """ + def _do_reg(myf): + generic_func_node.register(myf, key, override) + return myf + if func: + return _do_reg(func) + return _do_reg + + def dispatch_func(func, *args, **kwargs): + #pylint: disable=unused-argument + """The wrapped dispath function""" + if kwargs: + raise RuntimeError( + "Keyword arguments cannot be used when invoking generic_func %s" % func_name) + return generic_func_node(*args) + fresult = decorate(fdefault, dispatch_func) + fresult.register = register + return fresult + return fdecorate def generic_func(fdefault): """Wrap a target generic function. @@ -228,7 +344,6 @@ def dispatch_func(func, *args, **kwargs): fdecorate.register = register return fdecorate - def cuda(options=None): """Returns a cuda target. 
@@ -237,7 +352,8 @@ def cuda(options=None): options : list of str Additional options """ - return Target("cuda", options) + options = options if options else [] + return _api_internal._TargetCreate("cuda", *options) def rocm(options=None): @@ -248,7 +364,8 @@ def rocm(options=None): options : list of str Additional options """ - return Target("rocm", options) + options = options if options else [] + return _api_internal._TargetCreate("rocm", *options) def rasp(options=None): @@ -264,7 +381,7 @@ def rasp(options=None): "-mcpu=cortex-a53", "-mattr=+neon"] opts = _merge_opts(opts, options) - return Target("llvm", opts) + return _api_internal._TargetCreate("llvm", *opts) def mali(options=None): @@ -277,7 +394,7 @@ def mali(options=None): """ opts = ["-device=mali"] opts = _merge_opts(opts, options) - return Target("opencl", opts) + return _api_internal._TargetCreate("opencl", *opts) def opengl(options=None): @@ -288,7 +405,8 @@ def opengl(options=None): options : list of str Additional options """ - return Target("opengl", options) + options = options if options else [] + return _api_internal._TargetCreate("opengl", *options) def create(target_str): @@ -312,17 +430,8 @@ def create(target_str): return target_str if not isinstance(target_str, str): raise ValueError("target_str has to be string type") - arr = target_str.split() - # Parse device option - device_name = "" - for item in arr[1:]: - if item.startswith("-device="): - device_name = item.split("=")[1] - if device_name == "rasp": - return rasp(arr[1:]) - if device_name == "mali": - return mali(arr[1:]) - return Target(arr[0], arr[1:]) + + return _api_internal._TargetFromString(target_str) def current_target(allow_none=True): @@ -337,10 +446,5 @@ def current_target(allow_none=True): ------ ValueError if current target is not set. """ - if Target.current: - return Target.current - if not allow_none: - raise RuntimeError( - "Requires a current target in generic function, but it is not set. " - "Please set it using `with TargetObject:`") - return Target.current + target_str = _api_internal._GetCurrentTarget(allow_none) + return create(target_str) if target_str is not None else None diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index dc0971b2d4c3..43bc3e32aad6 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -3,40 +3,147 @@ * Compile executable modules. * \file build_module.cc */ +#include #include #include #include #include +#include +#include +#include namespace tvm { -std::string Target::str() const { +TVM_REGISTER_NODE_TYPE(TargetNode); + +TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) +.set_dispatch([](const TargetNode *op, IRPrinter *p) { + p->stream << op->str(); + }); + + +/*! +* \brief Construct a Target node from the given name and options. +* \param target_name The major target name. 
Should be one of +* {"llvm", "cuda", "opencl", "metal", "rocm", "stackvm", "opengl", "ext_dev"} +* \param options Additional options appended to the target +* \return The constructed Target +*/ +Target CreateTarget(const std::string& target_name, + const std::unordered_set& options) { + auto target = Target(std::make_shared()); + auto t = static_cast(target.node_.get()); + + t->target_name = target_name; + + std::string device_name = ""; + + std::string libs_flag = "-libs="; + std::string device_flag = "-device="; + for (auto& item : options) { + t->options_array.push_back(ir::StringImm::make(item)); + + if (item.find(libs_flag) == 0) { + std::stringstream ss(item.substr(libs_flag.length())); + std::string lib_item; + while (std::getline(ss, lib_item, ',')) { + t->libs_array.push_back(ir::StringImm::make(lib_item)); + } + } else if (item.find(device_flag) == 0) { + device_name = item.substr(device_flag.length()); + } + } + + if (device_name.length() > 0) { + t->keys_array.push_back(ir::StringImm::make(device_name)); + } + + t->device_type = kDLCPU; + t->thread_warp_size = 1; + if (target_name == "llvm") { + t->keys_array.push_back(ir::StringImm::make("cpu")); + } else if (target_name == "cuda" || target_name == "nvptx") { + t->device_type = kDLGPU; + t->keys_array.push_back(ir::StringImm::make("cuda")); + t->keys_array.push_back(ir::StringImm::make("gpu")); + t->max_num_threads = 512; + t->thread_warp_size = 32; + } else if (target_name == "rocm" || target_name == "opencl") { + // For now assume rocm schedule for opencl + t->device_type = static_cast(target_name == "rocm" ? kDLROCM : kDLOpenCL); + t->keys_array.push_back(ir::StringImm::make("rocm")); + t->keys_array.push_back(ir::StringImm::make("gpu")); + t->max_num_threads = 256; + } else if (target_name == "metal" || target_name == "vulkan") { + t->device_type = static_cast(target_name == "metal" ? 
kDLMetal : kDLVulkan);
+    t->keys_array.push_back(ir::StringImm::make(target_name));
+    t->keys_array.push_back(ir::StringImm::make("gpu"));
+    t->max_num_threads = 256;
+  } else if (target_name == "opengl") {
+    t->device_type = kDLGPU;
+    t->keys_array.push_back(ir::StringImm::make("opengl"));
+  } else if (target_name == "stackvm" || target_name == "ext_dev") {
+  } else {
+    LOG(ERROR) << "Unknown target name " << target_name;
+    return target::stackvm();
+  }
+
+  return target;
+}
+
+TVM_REGISTER_API("_TargetCreate")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+  std::string target_name = args[0];
+  std::unordered_set<std::string> options;
+  for (int i = 1; i < args.num_args; ++i) {
+    std::string arg = args[i];
+    options.insert(arg);
+  }
+
+  *ret = CreateTarget(target_name, options);
+  });
+
+TVM_REGISTER_API("_TargetFromString")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+  std::string target_str = args[0];
+
+  *ret = Target::create(target_str);
+  });
+
+std::vector<std::string> TargetNode::keys() const {
+  std::vector<std::string> result;
+  for (auto& expr : keys_array) {
+    result.push_back(expr.as<ir::StringImm>()->value);
+  }
+  return result;
+}
+
+std::vector<std::string> TargetNode::options() const {
+  std::vector<std::string> result;
+  for (auto& expr : options_array) {
+    result.push_back(expr.as<ir::StringImm>()->value);
+  }
+  return result;
+}
+
+std::unordered_set<std::string> TargetNode::libs() const {
+  std::unordered_set<std::string> result;
+  for (auto& expr : libs_array) {
+    result.insert(expr.as<ir::StringImm>()->value);
+  }
+  return result;
+}
+
+std::string TargetNode::str() const {
   std::ostringstream result;
   result << target_name;
-  for (const auto &x : options) {
+  for (const auto &x : options()) {
     result << " " << x;
   }
   return result.str();
 }
-Target TargetFromName(const std::string& name) {
-  if (name == "llvm") {
-    return target::llvm();
-  } else if (name == "cuda" || name == "nvptx") {
-    return target::cuda();
-  } else if (name == "rocm" || name == "opencl") {
-    /* For now, assume rocm schedule for opencl */
-    return target::rocm();
-  } else if (name == "metal") {
-    return target::metal();
-  } else if (name == "stackvm" || name == "ext_dev") {
-    return target::stackvm();
-  } else {
-    LOG(ERROR) << "Unknown target name " << name;
-    return target::stackvm();
-  }
-}
 bool StartsWith(const std::string& str, const std::string& pattern) {
   return str.compare(0, pattern.length(), pattern) == 0;
@@ -68,74 +175,99 @@ Target Target::create(const std::string& target_str) {
   ss >> target_name;
   auto device_name = GetDeviceName(target_str);
-  auto result = device_name == "rasp" ?
-    target::rasp() :
-    (device_name == "mali" ? target::mali() :
-    TargetFromName(target_name));
-
+  std::unordered_set<std::string> options;
   std::string item;
   while (ss >> item) {
-    result.options.push_back(item);
+    options.insert(item);
   }
-  return result;
+  if (device_name == "rasp") {
+    return target::rasp(options);
+  } else if (device_name == "mali") {
+    return target::mali(options);
+  } else {
+    return CreateTarget(target_name, options);
+  }
+}
+
+/*! \brief Entry to hold the Target context stack. */
+struct TVMTargetThreadLocalEntry {
+  /*! \brief The current target context */
+  std::stack<tvm::Target> context_stack;
+
+  TVMTargetThreadLocalEntry() {
+  }
+};
+
+/*! \brief Thread local store to hold the Target context stack.
*/ +typedef dmlc::ThreadLocalStore TVMTargetThreadLocalStore; + +void Target::EnterTargetScope(const tvm::Target& target) { + TVMTargetThreadLocalEntry *entry = TVMTargetThreadLocalStore::Get(); + entry->context_stack.push(target); +} + +void Target::ExitTargetScope() { + TVMTargetThreadLocalEntry *entry = TVMTargetThreadLocalStore::Get(); + entry->context_stack.pop(); +} + +tvm::Target Target::current_target(bool allow_not_defined) { + TVMTargetThreadLocalEntry *entry = TVMTargetThreadLocalStore::Get(); + if (entry->context_stack.size() > 0) { + return entry->context_stack.top(); + } + CHECK(allow_not_defined) + << "Target context required. Please set it by constructing a TargetContext"; + + return Target(); } namespace target { -Target llvm() { - std::unordered_set keys({ "llvm", "cpu" }); - std::vector options; - return Target("llvm", kDLCPU, 512, 1, keys, options, - std::unordered_set()); +std::unordered_set MergeOptions(std::unordered_set opts, + const std::unordered_set& new_opts) { + opts.insert(new_opts.begin(), new_opts.end()); + return opts; +} + +Target llvm(const std::unordered_set& options) { + return CreateTarget("llvm", options); } -Target cuda() { - std::unordered_set keys({ "cuda", "gpu" }); - std::vector options; - return Target("cuda", kDLGPU, 512, 32, keys, options, - std::unordered_set()); +Target cuda(const std::unordered_set& options) { + return CreateTarget("cuda", options); } -Target rocm() { - std::unordered_set keys({ "rocm", "gpu" }); - std::vector options; - return Target("rocm", kDLROCM, 256, 1, keys, options, - std::unordered_set()); +Target rocm(const std::unordered_set& options) { + return CreateTarget("rocm", options); } -Target metal() { - std::unordered_set keys({ "gpu" }); - std::vector options; - return Target("metal", kDLMetal, 256, 1, keys, options, - std::unordered_set()); +Target opencl(const std::unordered_set& options) { + return CreateTarget("opencl", options); } -Target rasp() { - std::unordered_set keys({ "llvm", "cpu" }); - std::vector options({ +Target metal(const std::unordered_set& options) { + return CreateTarget("metal", options); +} + +Target rasp(const std::unordered_set& options) { + return CreateTarget("llvm", MergeOptions(options, { "-device=rasp", "-mtriple=armv7l-none-linux-gnueabihf", "-mcpu=cortex-a53", "-mattr=+neon" - }); - return Target("llvm", kDLCPU, 512, 1, keys, options, - std::unordered_set()); + })); } -Target mali() { - std::unordered_set keys({ "rocm", "gpu" }); - std::vector options({ +Target mali(const std::unordered_set& options) { + return CreateTarget("opencl", MergeOptions(options, { "-device=mali" - }); - return Target("opencl", kDLOpenCL, 256, 1, keys, options); + })); } -Target stackvm() { - std::unordered_set keys({ "stackvm", "cpu" }); - std::vector options; - return Target("stackvm", kDLCPU, 512, 1, keys, options, - std::unordered_set()); +Target stackvm(const std::unordered_set& options) { + return CreateTarget("stackvm", options); } } // namespace target @@ -146,7 +278,7 @@ bool LLVMEnabled() { /*! 
\return The default host target for a given device target */ Target DefaultTargetHost(Target target) { - if (target.device_type == kDLCPU) { + if (target->device_type == kDLCPU) { return target; } else { if (LLVMEnabled()) { @@ -254,7 +386,7 @@ Array lower(Schedule sch, runtime::Module build(const Array& funcs, const Target& target, - Target* target_host, + const Target& target_host, const BuildConfig& config) { std::unordered_set all_names; for (const auto &x : funcs) { @@ -262,15 +394,13 @@ runtime::Module build(const Array& funcs, all_names.insert(x->name); } - Target target_host_val = target_host == nullptr ? - DefaultTargetHost(target) : - *target_host; + auto target_host_val = target_host.defined() ? target_host : DefaultTargetHost(target); Array fhost; Array fdevice; for (const auto& x : funcs) { - CHECK(ir::VerifyMemory(x, target.device_type)) + CHECK(ir::VerifyMemory(x, target->device_type)) << "Direct host side access to device memory is detected in " << x->func_name() << ". Did you forget to bind?"; @@ -281,7 +411,7 @@ runtime::Module build(const Array& funcs, } func = ir::ThreadSync(func, "shared"); - func = ir::LowerThreadAllreduce(func, target.thread_warp_size); + func = ir::LowerThreadAllreduce(func, target->thread_warp_size); auto fsplits = ir::SplitHostDevice(func); fhost.push_back(fsplits[0]); for (auto f = fsplits.begin() + 1; f != fsplits.end(); ++f) { @@ -296,14 +426,17 @@ runtime::Module build(const Array& funcs, } } - if (target.keys.count("gpu") > 0 && fdevice.size() == 0) { - LOG(WARNING) << "Specified target " + target.str() + + auto keys = target->keys(); + bool target_is_gpu = + std::find(keys.begin(), keys.end(), "gpu") != keys.end(); + if (target_is_gpu && fdevice.size() == 0) { + LOG(WARNING) << "Specified target " + target->str() + " but cannot find device code. 
Did you forget to bind?";
   }
   for (size_t i = 0; i < fhost.size(); ++i) {
     auto func = fhost[i];
-    func = ir::BindDeviceType(func, target.device_type);
+    func = ir::BindDeviceType(func, target->device_type);
     func = ir::LowerTVMBuiltin(func);
     fhost.Set(i, func);
   }
@@ -311,21 +444,21 @@
   for (size_t i = 0; i < fdevice.size(); ++i) {
     auto func = fdevice[i];
-    func = ir::LowerIntrin(func, target.target_name);
+    func = ir::LowerIntrin(func, target->target_name);
     fdevice.Set(i, func);
   }
   for (size_t i = 0; i < fhost.size(); ++i) {
     auto func = fhost[i];
-    func = ir::LowerIntrin(func, target_host_val.target_name);
+    func = ir::LowerIntrin(func, target_host_val->target_name);
     func = ir::CombineContextCall(func);
     fhost.Set(i, func);
   }
-  auto mhost = codegen::Build(fhost, target_host_val.str());
+  auto mhost = codegen::Build(fhost, target_host_val->str());
   if (fdevice.size() > 0) {
-    auto mdev = codegen::Build(fdevice, target.str());
+    auto mdev = codegen::Build(fdevice, target->str());
     mhost.Import(mdev);
   }
@@ -354,4 +487,160 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
     p->stream << ")";
   });
+struct GenericFunc::Manager {
+  std::unordered_map<std::string, std::shared_ptr<Node> > fmap;
+  // mutex
+  std::mutex mutex;
+
+  Manager() {
+  }
+
+  static Manager* Global() {
+    static Manager inst;
+    return &inst;
+  }
+};
+
+GenericFunc GenericFunc::Get(const std::string& name) {
+  Manager* m = Manager::Global();
+  std::lock_guard<std::mutex> lock(m->mutex);
+  auto it = m->fmap.find(name);
+  if (it == m->fmap.end()) {
+    auto f = std::make_shared<GenericFuncNode>();
+    f->name_ = name;
+    m->fmap[name] = f;
+    return GenericFunc(f);
+  } else {
+    return GenericFunc(it->second);
+  }
+}
+
+void GenericFunc::RegisterGenericFunc(GenericFunc func, const std::string& name) {
+  Manager* m = Manager::Global();
+  std::lock_guard<std::mutex> lock(m->mutex);
+  auto it = m->fmap.find(name);
+  CHECK(it == m->fmap.end()) << "GenericFunc already registered " << name;
+  func->name_ = name;
+  m->fmap[name] = func.node_;
+}
+
+GenericFunc& GenericFunc::set_default(const PackedFunc value,
+                                      bool allow_override) {
+  auto node = static_cast<GenericFuncNode*>(node_.get());
+  if (!allow_override) {
+    CHECK(node->generic_func_ == nullptr)
+      << "Generic function already registered for " << node->name_;
+  }
+  node->generic_func_ = value;
+  return *this;
+}
+
+GenericFunc& GenericFunc::register_func(const std::vector<std::string>& tags,
+                                        const PackedFunc value,
+                                        bool allow_override) {
+  for (auto &t : tags) {
+    if (!allow_override) {
+      auto iter = (*this)->dispatch_dict_.find(t);
+      CHECK(iter == (*this)->dispatch_dict_.end())
+        << "Tag " << t << " already registered for schedule factory " << (*this)->name_;
+    }
+    (*this)->dispatch_dict_[t] = value;
+  }
+  return *this;
+}
+
+void GenericFunc::CallPacked(TVMArgs args, TVMRetValue* ret) const {
+  auto node = static_cast<GenericFuncNode*>(node_.get());
+  auto target = Target::current_target(true);
+  PackedFunc func;
+
+  if (target.defined()) {
+    for (auto &k : target->keys()) {
+      auto iter = node->dispatch_dict_.find(k);
+      if (iter != node->dispatch_dict_.end()) {
+        func = iter->second;
+        break;
+      }
+    }
+  }
+
+  if (func == nullptr) {
+    CHECK(node->generic_func_ != nullptr) << "No generic function registered for " << node->name_;
+    func = node->generic_func_;
+  }
+
+  func.CallPacked(args, ret);
+}
+
+
+TVM_REGISTER_API("_GenericFuncCreate")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+  *ret = GenericFunc(std::make_shared<GenericFuncNode>());
+  });
+
+TVM_REGISTER_API("_GenericFuncGetGlobal")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+  std::string func_name = args[0];
+  *ret =
GenericFunc::Get(func_name); + }); + +TVM_REGISTER_API("_GenericFuncSetDefault") +.set_body([](TVMArgs args, TVMRetValue* ret) { + GenericFunc generic_func = args[0]; + // Intentionally copy and not de-allocate it, to avoid free pyobject during shutdown + PackedFunc* func = new PackedFunc(args[1].operator PackedFunc()); + bool allow_override = args[2]; + + generic_func + .set_default(*func, allow_override); + }); + +TVM_REGISTER_API("_GenericFuncRegisterFunc") +.set_body([](TVMArgs args, TVMRetValue* ret) { + GenericFunc generic_func = args[0]; + // Intentionally copy and not de-allocate it, to avoid free pyobject during shutdown + PackedFunc* func = new PackedFunc(args[1].operator PackedFunc()); + Array tags = args[2]; + bool allow_override = args[3]; + + std::vector tags_vector; + for (auto& tag : tags) { + tags_vector.push_back(tag.as()->value); + } + + generic_func + .register_func(tags_vector, *func, allow_override); + }); + +TVM_REGISTER_API("_GenericFuncCallFunc") +.set_body([](TVMArgs args, TVMRetValue* ret) { + GenericFunc generic_func = args[0]; + TVMArgs func_args(&args.values[1], &args.type_codes[1], args.num_args - 1); + + generic_func + .CallPacked(func_args, ret); + }); + +TVM_REGISTER_API("_GetCurrentTarget") +.set_body([](TVMArgs args, TVMRetValue* ret) { + bool allow_not_defined = args[0]; + *ret = Target::current_target(allow_not_defined); + }); + +TVM_REGISTER_API("_EnterTargetScope") +.set_body([](TVMArgs args, TVMRetValue* ret) { + Target target = args[0]; + auto current = Target::current_target(); + if (current.defined() && target->str() != current->str()) { + LOG(WARNING) << "Overriding target " << current->str() + << " with new target scope " << target->str(); + } + Target::EnterTargetScope(target); + }); + +TVM_REGISTER_API("_ExitTargetScope") +.set_body([](TVMArgs args, TVMRetValue* ret) { + Target::ExitTargetScope(); + }); + } // namespace tvm diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc index 19ba9bf2d776..c91796891e24 100644 --- a/src/runtime/threading_backend.cc +++ b/src/runtime/threading_backend.cc @@ -6,6 +6,7 @@ #include #include #include +#include #if defined(__linux__) #include #endif diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc index fe0a9151cc2c..a1aed5b00b96 100644 --- a/tests/cpp/build_module_test.cc +++ b/tests/cpp/build_module_test.cc @@ -31,7 +31,7 @@ TEST(BuildModule, Basic) { auto target = target::llvm(); auto lowered = lower(s, args, "func", binds, config); - auto module = build(lowered, target, nullptr, config); + auto module = build(lowered, target, Target(), config); } diff --git a/tests/python/unittest/test_lang_target.py b/tests/python/unittest/test_lang_target.py index 4e13c76d5f39..812da0fd2710 100644 --- a/tests/python/unittest/test_lang_target.py +++ b/tests/python/unittest/test_lang_target.py @@ -34,11 +34,16 @@ def test_target_dispatch(): with tvm.target.create("metal"): assert mygeneric(1) == 3 - try: - mygeneric(0) - raise RuntimeError("not reached") - except RuntimeError: - pass + assert tvm.target.current_target() == None + +def test_target_string_parse(): + target = tvm.target.create("cuda -libs=cublas,cudnn") + + assert target.target_name == "cuda" + assert target.options == ['-libs=cublas,cudnn'] + assert target.keys == ['cuda', 'gpu'] + assert target.libs == ['cublas', 'cudnn'] if __name__ == "__main__": test_target_dispatch() + test_target_string_parse() diff --git a/topi/include/topi/cuda/dense.h b/topi/include/topi/cuda/dense.h index 
a07aafea6e30..77e29fc7fdbb 100644 --- a/topi/include/topi/cuda/dense.h +++ b/topi/include/topi/cuda/dense.h @@ -24,31 +24,30 @@ namespace cuda { * \param target The target device * \param data Tensor with shape [batch, in_dim] * \param weight Tensor with shape [out_dim, in_dim] -* \param bias Tensor with shape [out_dim] (optional) +* \param bias Tensor with shape [out_dim]. Optional; to omit bias, pass Tensor() * * \return Tensor with shape [batch, out_dim] */ inline tvm::Tensor dense_cuda(const Target& target, const tvm::Tensor& data, const tvm::Tensor& weight, - tvm::Tensor* bias) { + const tvm::Tensor& bias) { CHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data"; CHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight"; - if (bias != nullptr) { - CHECK_EQ((*bias)->shape.size(), 1) << "dense requires 1-D bias"; + if (bias.defined()) { + CHECK_EQ(bias->shape.size(), 1) << "dense requires 1-D bias"; } auto batch = data->shape[0]; auto in_dim = data->shape[1]; auto out_dim = weight->shape[0]; - if (target.libs.count("cublas") > 0) { + if (target->libs().count("cublas")) { auto mm = topi::contrib::cublas_matmul(data, weight, false, true); - if (bias != nullptr) { - auto bias_val = *bias; + if (bias.defined()) { mm = tvm::compute({ batch, out_dim }, [&](Var i, Var j) { - return mm(i, j) + bias_val(j); + return mm(i, j) + bias(j); }, "tensor", kBroadcast); } @@ -67,8 +66,8 @@ inline tvm::Tensor dense_cuda(const Target& target, * \return A schedule for the given ops. */ inline Schedule schedule_dense(const Target &target, const Array& outs) { - if (target.target_name == "cuda" && - target.libs.count("cublas") > 0) { + if (target->target_name == "cuda" && + target->libs().count("cublas")) { return topi::generic::schedule_extern(target, outs); } diff --git a/topi/include/topi/cuda/extern.h b/topi/include/topi/cuda/extern.h index 1c2f9a79ab00..475ab6ba8a19 100644 --- a/topi/include/topi/cuda/extern.h +++ b/topi/include/topi/cuda/extern.h @@ -28,7 +28,7 @@ namespace cuda { inline Schedule ScheduleOutputForExtern(Target target, Operation op, Schedule sch) { auto x = op.output(0); auto fused = detail::Fuse(sch[x], sch[x]->op.as()->axis); - auto num_thread = target.max_num_threads; + auto num_thread = target->max_num_threads; IterVar bx, tx; sch[x].split(fused, num_thread, &bx, &tx); sch[x].bind(bx, tvm::thread_axis(Range(), "blockIdx.x")); diff --git a/topi/include/topi/cuda/injective.h b/topi/include/topi/cuda/injective.h index e8e60fb6809e..91c6df3a2a3c 100644 --- a/topi/include/topi/cuda/injective.h +++ b/topi/include/topi/cuda/injective.h @@ -25,7 +25,7 @@ namespace cuda { inline void ScheduleInjectiveOp(const Target &target, Operation op, Schedule s) { auto x = op.output(0); auto fused = detail::Fuse(s[x], s[x]->op.as()->axis); - auto num_thread = target.max_num_threads; + auto num_thread = target->max_num_threads; IterVar bx, tx; s[x].split(fused, num_thread, &bx, &tx); s[x].bind(bx, thread_axis(Range(), "blockIdx.x")); diff --git a/topi/include/topi/cuda/pooling.h b/topi/include/topi/cuda/pooling.h index d7536f315dba..f8730c9d0299 100644 --- a/topi/include/topi/cuda/pooling.h +++ b/topi/include/topi/cuda/pooling.h @@ -34,7 +34,7 @@ inline Schedule schedule_pool(const Target &target, const Array& outs) { auto _schedule = [&](const Tensor& padded_input, const Tensor& pool) { s[padded_input].compute_inline(); - auto num_thread = target.max_num_threads; + auto num_thread = target->max_num_threads; Tensor out; Tensor OL; if (detail::contains(s->outputs, pool->op)) { diff --git 
a/topi/include/topi/cuda/reduction.h b/topi/include/topi/cuda/reduction.h index e7a41d9274fa..a1670873cc31 100644 --- a/topi/include/topi/cuda/reduction.h +++ b/topi/include/topi/cuda/reduction.h @@ -51,7 +51,7 @@ Schedule ScheduleReduce(const Target& target, if (out_stage->op.as()->axis.size() > 0) { all_reduce = false; num_thread = 32; - if (target.target_name == "opencl") { + if (target->target_name == "opencl") { // Without this, CL_INVALID_WORK_GROUP_SIZE occurs with python tests. // Don't know why. num_thread = 16; @@ -61,7 +61,7 @@ Schedule ScheduleReduce(const Target& target, thread_y = tvm::thread_axis(Range(0, num_thread), "threadIdx.y"); } else { all_reduce = true; - num_thread = target.max_num_threads; + num_thread = target->max_num_threads; thread_x = tvm::thread_axis(Range(0, num_thread), "threadIdx.x"); } diff --git a/topi/include/topi/nn/dense.h b/topi/include/topi/nn/dense.h index cdc7fde158a6..695b8e187856 100644 --- a/topi/include/topi/nn/dense.h +++ b/topi/include/topi/nn/dense.h @@ -20,17 +20,17 @@ using namespace tvm; * * \param data Tensor with shape [batch, in_dim] * \param weight Tensor with shape [out_dim, in_dim] -* \param bias Tensor with shape [out_dim] (optional) +* \param bias Tensor with shape [out_dim]. Optional; to omit bias, pass Tensor() * * \return Tensor with shape [batch, out_dim] */ inline tvm::Tensor dense(const tvm::Tensor& data, const tvm::Tensor& weight, - tvm::Tensor* bias) { + const tvm::Tensor& bias) { CHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data"; CHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight"; - if (bias != nullptr) { - CHECK_EQ((*bias)->shape.size(), 1) << "dense requires 1-D bias"; + if (bias.defined()) { + CHECK_EQ(bias->shape.size(), 1) << "dense requires 1-D bias"; } auto batch = data->shape[0]; @@ -44,12 +44,11 @@ inline tvm::Tensor dense(const tvm::Tensor& data, return tvm::sum(data(i, k) * weight(j, k), { k }); }, "tensor", "dense"); - if (bias != nullptr) { - auto bias_val = *bias; + if (bias.defined()) { matmul = tvm::compute( { batch, out_dim }, [&](Var i, Var j) { - return matmul(i, j) + bias_val(j); + return matmul(i, j) + bias(j); }, "tensor", kBroadcast); } diff --git a/topi/include/topi/rocm/dense.h b/topi/include/topi/rocm/dense.h index 8256ba983eee..6f171f6780fc 100644 --- a/topi/include/topi/rocm/dense.h +++ b/topi/include/topi/rocm/dense.h @@ -25,31 +25,30 @@ namespace rocm { * \param target The target device * \param data Tensor with shape [batch, in_dim] * \param weight Tensor with shape [out_dim, in_dim] -* \param bias Tensor with shape [out_dim] (optional) +* \param bias Tensor with shape [out_dim]. 
Optional; to omit bias, pass Tensor() * * \return Tensor with shape [batch, out_dim] */ inline tvm::Tensor dense_rocm(const Target& target, const tvm::Tensor& data, const tvm::Tensor& weight, - tvm::Tensor* bias) { + const tvm::Tensor& bias) { CHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data"; CHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight"; - if (bias != nullptr) { - CHECK_EQ((*bias)->shape.size(), 1) << "dense requires 1-D bias"; + if (bias.defined()) { + CHECK_EQ(bias->shape.size(), 1) << "dense requires 1-D bias"; } auto batch = data->shape[0]; auto in_dim = data->shape[1]; auto out_dim = weight->shape[0]; - if (target.libs.count("rocblas") > 0) { + if (target->libs().count("rocblas")) { auto mm = topi::contrib::rocblas_matmul(data, weight, false, true); - if (bias != nullptr) { - auto bias_val = *bias; + if (bias.defined()) { mm = tvm::compute({ batch, out_dim }, [&](Var i, Var j) { - return mm(i, j) + bias_val(j); + return mm(i, j) + bias(j); }, "tensor", kBroadcast); } @@ -68,8 +67,8 @@ inline tvm::Tensor dense_rocm(const Target& target, * \return A schedule for the given ops. */ inline Schedule schedule_dense(const Target &target, const Array& outs) { - if (target.target_name == "rocm" && - target.libs.count("rocblas") > 0) { + if (target->target_name == "rocm" && + target->libs().count("rocblas")) { return topi::generic::schedule_extern(target, outs); } diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py index c9a9b7bc01b6..9666b88b80eb 100644 --- a/topi/python/topi/__init__.py +++ b/topi/python/topi/__init__.py @@ -11,6 +11,10 @@ from tvm._ffi.libinfo import __version__ +# Ensure C++ schedules get registered first, so python schedules can +# override them. +from . import cpp + from .math import * from .tensor import * from .reduction import * @@ -24,7 +28,6 @@ from . import opengl from . import util from . import rocm -from . import cpp from . import vision # not import testing by default # because testing can have extra deps that are not necessary diff --git a/topi/python/topi/generic/injective.py b/topi/python/topi/generic/injective.py index 765f4e4f518d..0a9e394661af 100644 --- a/topi/python/topi/generic/injective.py +++ b/topi/python/topi/generic/injective.py @@ -4,7 +4,7 @@ import tvm -@tvm.target.generic_func +@tvm.target.override_native_generic_func("schedule_injective") def schedule_injective(outs): """Schedule for injective op. 
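As a rough illustration of the dispatch behaviour this wiring enables (the name my_schedule_injective and its trivial schedule body are hypothetical, not part of the patch):

import tvm
import topi

n = tvm.var("n")
A = tvm.placeholder((n,), name="A")
B = tvm.compute((n,), lambda i: A[i] + 1.0, name="B")

# Register a Python specialization for any target carrying the "cpu" key;
# override=True lets it replace the variant registered from C++ for that key.
@topi.generic.schedule_injective.register(["cpu"], override=True)
def my_schedule_injective(outs):
    return tvm.create_schedule([x.op for x in outs])

with tvm.target.create("llvm"):
    # "llvm" carries the "cpu" key, so the override above is selected.
    s = topi.generic.schedule_injective([B])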
diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 5c580aad24c4..7fe76e1739f0 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -106,7 +106,7 @@ def schedule_depthwise_conv2d_nhwc(outs): return _default_schedule(outs, False) -@tvm.target.generic_func +@tvm.target.override_native_generic_func("schedule_reduce") def schedule_reduce(outs): """Schedule for reduction @@ -124,7 +124,7 @@ def schedule_reduce(outs): return _default_schedule(outs, True) -@tvm.target.generic_func +@tvm.target.override_native_generic_func("schedule_softmax") def schedule_softmax(outs): """Schedule for softmax @@ -142,7 +142,7 @@ def schedule_softmax(outs): return _default_schedule(outs, False) -@tvm.target.generic_func +@tvm.target.override_native_generic_func("schedule_dense") def schedule_dense(outs): """Schedule for dense @@ -160,7 +160,7 @@ def schedule_dense(outs): return _default_schedule(outs, False) -@tvm.target.generic_func +@tvm.target.override_native_generic_func("schedule_pool") def schedule_pool(outs): """Schedule for pool @@ -178,7 +178,7 @@ def schedule_pool(outs): return _default_schedule(outs, False) -@tvm.target.generic_func +@tvm.target.override_native_generic_func("schedule_global_pool") def schedule_global_pool(outs): """Schedule for global pool @@ -195,7 +195,7 @@ def schedule_global_pool(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func +@tvm.target.override_native_generic_func("schedule_binarize_pack") def schedule_binarize_pack(outs): """Schedule for binarize_pack @@ -213,7 +213,7 @@ def schedule_binarize_pack(outs): return _default_schedule(outs, False) -@tvm.target.generic_func +@tvm.target.override_native_generic_func("schedule_binary_dense") def schedule_binary_dense(outs): """Schedule for binary_dense diff --git a/topi/python/topi/nn/dense.py b/topi/python/topi/nn/dense.py index 11cc6097c250..abd424a64aeb 100644 --- a/topi/python/topi/nn/dense.py +++ b/topi/python/topi/nn/dense.py @@ -39,7 +39,7 @@ def dense_default(data, weight, bias=None): return matmul -@tvm.target.generic_func +@tvm.target.override_native_generic_func("dense") def dense(data, weight, bias=None): """Applies a linear transformation: :math:`Y = XW^T + b`. 
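The topi.cc registrations that follow rely on the GenericFunc and TargetContext machinery declared in build_module.h. A minimal hypothetical sketch of that pattern (my_example_func and its lambdas are illustrative only, not part of the patch):

#include <tvm/build_module.h>
#include <tvm/runtime/packed_func.h>

using tvm::runtime::PackedFunc;
using tvm::runtime::TVMArgs;
using tvm::runtime::TVMRetValue;

// A default implementation plus a "gpu"-keyed specialization, mirroring the
// TVM_REGISTER_GENERIC_FUNC registrations below.
TVM_REGISTER_GENERIC_FUNC(my_example_func)
.set_default(PackedFunc([](TVMArgs args, TVMRetValue* ret) {
    int v = args[0];
    *ret = v + 1;   // fallback used when no target key matches
  }))
.register_func({ "gpu" }, PackedFunc([](TVMArgs args, TVMRetValue* ret) {
    int v = args[0];
    *ret = v + 2;   // chosen when the current target carries the "gpu" key
  }));

int Demo() {
  tvm::TargetContext ctx(tvm::target::cuda());          // push a target scope (RAII)
  return tvm::GenericFunc::Get("my_example_func")(3);   // dispatches to the "gpu" variant -> 5
}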
diff --git a/topi/src/topi.cc b/topi/src/topi.cc index d6b67c74bacc..2b0191cd72d7 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -51,6 +51,7 @@ struct extension_class_info { } // namespace runtime namespace topi { + using namespace tvm; using namespace tvm::runtime; @@ -281,15 +282,7 @@ TVM_REGISTER_GLOBAL("topi.nn.binary_dense") /* Ops from nn/dense.h */ TVM_REGISTER_GLOBAL("topi.nn.dense") .set_body([](TVMArgs args, TVMRetValue *rv) { - Tensor bias_val; - Tensor *bias; - if (args[2].type_code() == kNull) { - bias = nullptr; - } else { - bias_val = args[2]; - bias = &bias_val; - } - *rv = nn::dense(args[0], args[1], bias); + *rv = nn::dense(args[0], args[1], args[2]); }); /* Ops from nn/dilate.h */ @@ -388,15 +381,7 @@ TVM_REGISTER_GLOBAL("topi.x86.schedule_injective") /* ROCm schedules */ TVM_REGISTER_GLOBAL("topi.rocm.dense_cuda") .set_body([](TVMArgs args, TVMRetValue *rv) { - Tensor bias_val; - Tensor *bias; - if (args[3].type_code() == kNull) { - bias = nullptr; - } else { - bias_val = args[3]; - bias = &bias_val; - } - *rv = rocm::dense_rocm(args[0], args[1], args[2], bias); + *rv = rocm::dense_rocm(args[0], args[1], args[2], args[3]); }); TVM_REGISTER_GLOBAL("topi.rocm.schedule_dense") @@ -407,15 +392,7 @@ TVM_REGISTER_GLOBAL("topi.rocm.schedule_dense") /* CUDA schedules */ TVM_REGISTER_GLOBAL("topi.cuda.dense_cuda") .set_body([](TVMArgs args, TVMRetValue *rv) { - Tensor bias_val; - Tensor *bias; - if (args[3].type_code() == kNull) { - bias = nullptr; - } else { - bias_val = args[3]; - bias = &bias_val; - } - *rv = cuda::dense_cuda(args[0], args[1], args[2], bias); + *rv = cuda::dense_cuda(args[0], args[1], args[2], args[3]); }); TVM_REGISTER_GLOBAL("topi.cuda.schedule_dense") @@ -453,4 +430,106 @@ TVM_REGISTER_GLOBAL("topi.cuda.schedule_softmax") *rv = topi::cuda::schedule_softmax(args[0], args[1]); }); +/*! \brief Builder function for instantiating schedules. */ +using FTVMScheduleBuilder = std::function< + tvm::Schedule(const tvm::Target& target, const tvm::Array& outs)>; + +/*! + * \brief Helper function for registering generic functions matching the + * FTVMScheduleBuilder signature. The schedule builder function is wrapped + * with a PackedFunc suitable for passing to a tvm::GenericFunc. + * + * \param builder The schedule builder to wrap. 
+ * + * \return The wrapped schedule builder + */ +inline PackedFunc WrapSchedule(FTVMScheduleBuilder builder) { + return PackedFunc([builder](TVMArgs args, TVMRetValue* ret) { + auto target = Target::current_target(false); + Array outs; + NodeRef argNodeRef = args[0]; + if (argNodeRef->type_index() == outs->type_index()) { + outs = args[0]; + } else { + outs = Array { args[0] }; + } + + *ret = builder(target, outs); + }); +} + +TVM_REGISTER_GENERIC_FUNC(schedule_injective) +.set_default(WrapSchedule(topi::generic::schedule_injective)) +.register_func({ "cpu" }, WrapSchedule(topi::x86::schedule_injective)) +.register_func({ "cuda", "gpu" }, WrapSchedule(topi::cuda::schedule_injective)); + +TVM_REGISTER_GENERIC_FUNC(schedule_softmax) +.set_default(WrapSchedule(topi::generic::default_schedule)) +.register_func({ "cpu" }, WrapSchedule(topi::x86::default_schedule)) +.register_func({ "cuda", "gpu" }, WrapSchedule(topi::cuda::schedule_softmax)); + +TVM_REGISTER_GENERIC_FUNC(schedule_dense) +.set_default(WrapSchedule(topi::generic::default_schedule)) +.register_func({ "cuda", "gpu" }, WrapSchedule(topi::cuda::schedule_dense)) +.register_func({ "rocm" }, WrapSchedule(topi::rocm::schedule_dense)); + +TVM_REGISTER_GENERIC_FUNC(schedule_pool) +.set_default(WrapSchedule(topi::generic::default_schedule)) +.register_func({ "cpu" }, WrapSchedule(topi::x86::default_schedule)) +.register_func({ "cuda", "gpu" }, WrapSchedule(topi::cuda::schedule_pool)); + +TVM_REGISTER_GENERIC_FUNC(schedule_global_pool) +.set_default(WrapSchedule(topi::generic::default_schedule)) +.register_func({ "cpu" }, WrapSchedule(topi::x86::default_schedule)) +.register_func({ "cuda", "gpu" }, WrapSchedule(topi::cuda::schedule_global_pool)); + +TVM_REGISTER_GENERIC_FUNC(schedule_reduce) +.set_default(WrapSchedule(topi::generic::default_schedule_auto_inline)) +.register_func({ "cpu" }, WrapSchedule(topi::x86::default_schedule_auto_inline)) +.register_func({ "cuda", "gpu" }, WrapSchedule(topi::cuda::schedule_reduce)); + +TVM_REGISTER_GENERIC_FUNC(schedule_binarize_pack) +.set_default(WrapSchedule(topi::generic::default_schedule)) +.register_func({ "cpu" }, WrapSchedule(topi::x86::schedule_binarize_pack)); + +TVM_REGISTER_GENERIC_FUNC(schedule_binary_dense) +.set_default(WrapSchedule(topi::generic::default_schedule)) +.register_func({ "cpu" }, WrapSchedule(topi::x86::schedule_binary_dense)); + +/*! \brief Builder function for instantiating dense ops. */ +using FTVMDenseOpBuilder = std::function; + +/*! +* \brief Helper function for registering dense ops matching the +* FTVMDenseOpBuilder signature. The op builder function is wrapped +* with a PackedFunc suitable for passing to a tvm::GenericFunc. +* +* \param builder The op builder to wrap. 
+* +* \return The wrapped op builder +*/ +inline PackedFunc WrapDenseOp(FTVMDenseOpBuilder builder) { + return PackedFunc([builder](TVMArgs args, TVMRetValue* ret) { + auto target = Target::current_target(false); + Tensor data = args[0]; + Tensor weight = args[1]; + Tensor bias = args[2]; + + *ret = builder(target, data, weight, bias); + }); +} + +TVM_REGISTER_GENERIC_FUNC(dense) +.set_default(WrapDenseOp([](const Target& target, + const tvm::Tensor& data, + const tvm::Tensor& weight, + const tvm::Tensor& bias) { + return topi::nn::dense(data, weight, bias); +})) +.register_func({ "cuda", "gpu" }, WrapDenseOp(topi::cuda::dense_cuda)) +.register_func({ "rocm" }, WrapDenseOp(topi::rocm::dense_rocm)); + } // namespace topi From 56d71d3dac3e73e619cb34307ab97f6663d0743f Mon Sep 17 00:00:00 2001 From: Pariksheet Pinjari Date: Tue, 20 Mar 2018 01:03:00 +0530 Subject: [PATCH 214/948] JVM NDArray fatal exception fix (#1022) --- jvm/core/src/main/java/ml/dmlc/tvm/NDArray.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/NDArray.java b/jvm/core/src/main/java/ml/dmlc/tvm/NDArray.java index 3b9a4bb12e5a..1aea1a35c96d 100644 --- a/jvm/core/src/main/java/ml/dmlc/tvm/NDArray.java +++ b/jvm/core/src/main/java/ml/dmlc/tvm/NDArray.java @@ -53,7 +53,7 @@ public void copyFrom(double[] sourceArray) { } NDArray tmpArr = empty(shape(), this.dtype); Base.checkCall(Base._LIB.tvmArrayCopyFromJArray(nativeArr, tmpArr.handle, handle)); - Base.checkCall(Base._LIB.tvmArrayFree(tmpArr.handle)); + tmpArr.release(); } /** @@ -72,7 +72,7 @@ public void copyFrom(float[] sourceArray) { } NDArray tmpArr = empty(shape(), this.dtype); Base.checkCall(Base._LIB.tvmArrayCopyFromJArray(nativeArr, tmpArr.handle, handle)); - Base.checkCall(Base._LIB.tvmArrayFree(tmpArr.handle)); + tmpArr.release(); } /** @@ -91,7 +91,7 @@ public void copyFrom(long[] sourceArray) { } NDArray tmpArr = empty(shape(), this.dtype); Base.checkCall(Base._LIB.tvmArrayCopyFromJArray(nativeArr, tmpArr.handle, handle)); - Base.checkCall(Base._LIB.tvmArrayFree(tmpArr.handle)); + tmpArr.release(); } /** @@ -110,7 +110,7 @@ public void copyFrom(int[] sourceArray) { } NDArray tmpArr = empty(shape(), this.dtype); Base.checkCall(Base._LIB.tvmArrayCopyFromJArray(nativeArr, tmpArr.handle, handle)); - Base.checkCall(Base._LIB.tvmArrayFree(tmpArr.handle)); + tmpArr.release(); } /** @@ -129,7 +129,7 @@ public void copyFrom(short[] sourceArray) { } NDArray tmpArr = empty(shape(), this.dtype); Base.checkCall(Base._LIB.tvmArrayCopyFromJArray(nativeArr, tmpArr.handle, handle)); - Base.checkCall(Base._LIB.tvmArrayFree(tmpArr.handle)); + tmpArr.release(); } /** @@ -161,7 +161,7 @@ public void copyFrom(char[] sourceArray) { } NDArray tmpArr = empty(shape(), this.dtype); Base.checkCall(Base._LIB.tvmArrayCopyFromJArray(nativeArr, tmpArr.handle, handle)); - Base.checkCall(Base._LIB.tvmArrayFree(tmpArr.handle)); + tmpArr.release(); } private void checkCopySize(int sourceLength) { @@ -179,7 +179,7 @@ private void checkCopySize(int sourceLength) { public void copyFromRaw(byte[] sourceArray) { NDArray tmpArr = empty(shape(), this.dtype); Base.checkCall(Base._LIB.tvmArrayCopyFromJArray(sourceArray, tmpArr.handle, handle)); - Base.checkCall(Base._LIB.tvmArrayFree(tmpArr.handle)); + tmpArr.release(); } /** From 13e63a418cf62d1d97868900f4b900fb5934deb4 Mon Sep 17 00:00:00 2001 From: Yida Wang Date: Tue, 20 Mar 2018 03:13:31 -0700 Subject: [PATCH 215/948] [RUNTIME] better parallel launcher and task distribution (#1026) 
--- src/runtime/thread_pool.cc | 56 +++++++++++++++----------------- src/runtime/threading_backend.cc | 20 +++++++++--- 2 files changed, 41 insertions(+), 35 deletions(-) diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc index d70f03e08b64..316baedff425 100644 --- a/src/runtime/thread_pool.cc +++ b/src/runtime/thread_pool.cc @@ -37,12 +37,11 @@ class ParallelLauncher { void* cdata, int num_task, bool need_sync) { - std::lock_guard lock(mutex_); - num_pending_ = num_task; + num_pending_.store(num_task); this->cdata = cdata; this->flambda = flambda; this->env.num_task = num_task; - has_error_ = false; + has_error_.store(false); // reshape if (static_cast(num_task) > par_errors_.size()) { par_errors_.resize(num_task + 1); @@ -66,11 +65,10 @@ class ParallelLauncher { } // Wait n jobs to finish int WaitForJobs() { - std::unique_lock lock(mutex_); - cv_.wait(lock, [this] { - return num_pending_ == 0; - }); - if (!has_error_) return 0; + while (num_pending_.load() != 0) { + tvm::runtime::threading::Yield(); + } + if (!has_error_.load()) return 0; std::string err(""); for (size_t i = 0; i < par_errors_.size(); ++i) { if (par_errors_[i].length() != 0) { @@ -83,23 +81,13 @@ class ParallelLauncher { } // Signal that one job has finished. void SignalJobError(int task_id) { - std::unique_lock lock(mutex_); - --num_pending_; + num_pending_.fetch_sub(1); par_errors_[task_id] = TVMGetLastError(); - has_error_ = true; - if (num_pending_ == 0) { - lock.unlock(); - cv_.notify_one(); - } + has_error_.store(true); } // Signal that one job has finished. void SignalJobFinish() { - std::unique_lock lock(mutex_); - --num_pending_; - if (num_pending_ == 0) { - lock.unlock(); - cv_.notify_one(); - } + num_pending_.fetch_sub(1); } // Get thread local version of the store. static ParallelLauncher* ThreadLocal() { @@ -116,14 +104,10 @@ class ParallelLauncher { bool is_worker{false}; private: - // The mutex to access local env. - std::mutex mutex_; - // The conditional variable. - std::condition_variable cv_; // The pending jobs. - uint32_t num_pending_; + std::atomic num_pending_; // Whether error has been countered. - bool has_error_; + std::atomic has_error_; // The counter page. 
std::atomic* sync_counter_{nullptr}; // The error message @@ -257,13 +241,13 @@ class ThreadPool { public: ThreadPool(): num_workers_(tvm::runtime::threading::MaxConcurrency()) { for (int i = 0; i < num_workers_; ++i) { - // The SpscTaskQueue only host ONE item at a time + // The SpscTaskQueue only hosts ONE item at a time queues_.emplace_back(std::unique_ptr(new SpscTaskQueue())); } threads_ = std::unique_ptr( new tvm::runtime::threading::ThreadGroup( num_workers_, [this](int worker_id) { this->RunWorker(worker_id); }, - false /* include_main_thread */)); + exclude_worker0_ /* include_main_thread */)); } ~ThreadPool() { for (std::unique_ptr& q : queues_) { @@ -289,10 +273,20 @@ class ThreadPool { launcher->Init(flambda, cdata, num_task, need_sync != 0); SpscTaskQueue::Task tsk; tsk.launcher = launcher; - for (int i = 0; i < num_task; ++i) { + // if worker0 is taken by the master, queues_[0] is abandoned + for (int i = exclude_worker0_; i < num_task; ++i) { tsk.task_id = i; queues_[i]->Push(tsk); } + // use the master thread to run task 0 + if (exclude_worker0_) { + TVMParallelGroupEnv* penv = &(tsk.launcher->env); + if ((*tsk.launcher->flambda)(0, penv, cdata) == 0) { + tsk.launcher->SignalJobFinish(); + } else { + tsk.launcher->SignalJobError(tsk.task_id); + } + } int res = launcher->WaitForJobs(); return res; } @@ -320,6 +314,8 @@ class ThreadPool { } } int num_workers_; + // if excluding worker 0 and using master to run task 0 + bool exclude_worker0_{true}; std::vector > queues_; std::unique_ptr threads_; }; diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc index c91796891e24..11df12837860 100644 --- a/src/runtime/threading_backend.cc +++ b/src/runtime/threading_backend.cc @@ -29,7 +29,7 @@ class ThreadGroup::Impl { const char *val = getenv("TVM_BIND_THREADS"); if (val == nullptr || atoi(val) == 1) { if (num_workers_ <= std::thread::hardware_concurrency()) { - SetAffinity(); + SetAffinity(exclude_worker0); } else { LOG(WARNING) << "The thread affinity cannot be set when the number of workers" @@ -47,7 +47,9 @@ class ThreadGroup::Impl { private: // bind worker threads to disjoint cores - void SetAffinity() { + // if worker 0 is offloaded to master, i.e. exclude_worker0 is true, + // the master thread is bound to core 0. 
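+  // For example, with exclude_worker0 == true and three worker threads,
+  // workers 0..2 are pinned to cores 1..3, while the calling (master)
+  // thread, which runs task 0 in-line, is pinned to core 0.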
+ void SetAffinity(bool exclude_worker0) { #if defined(__ANDROID__) #ifndef CPU_SET #define CPU_SETSIZE 1024 @@ -62,19 +64,27 @@ class ThreadGroup::Impl { memset((cpusetp), 0, sizeof(cpu_set_t)) #endif #endif - for (unsigned i=0; i < threads_.size(); ++i) { #if defined(__linux__) || defined(__ANDROID__) + for (unsigned i = 0; i < threads_.size(); ++i) { + unsigned core_id = i + exclude_worker0; cpu_set_t cpuset; CPU_ZERO(&cpuset); - CPU_SET(i, &cpuset); + CPU_SET(core_id, &cpuset); #if defined(__ANDROID__) sched_setaffinity(threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset); #else pthread_setaffinity_np(threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset); -#endif #endif } + if (exclude_worker0) { // bind the master thread to core 0 + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + pthread_setaffinity_np(pthread_self(), + sizeof(cpu_set_t), &cpuset); + } +#endif } int num_workers_; From ebac59b4ed8dd559b995c372d560a2b43441d896 Mon Sep 17 00:00:00 2001 From: cjjia <33363657+jiacunjiang1215@users.noreply.github.com> Date: Fri, 23 Mar 2018 16:59:30 +0800 Subject: [PATCH 216/948] Fix the issue #1033 (#1037) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix the issue #1033 fix the issue #1033 "converting to ‘const std::unordered_set >’from initializer" * Fix the issue #1033 fix the issue #1033 "converting to ‘const std::unordered_set’from initializer". --- include/tvm/build_module.h | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h index 0cbc97d71e46..d2f21c9c87ce 100644 --- a/include/tvm/build_module.h +++ b/include/tvm/build_module.h @@ -126,28 +126,36 @@ struct TargetContext { /*! \brief This namespace provides functions to construct Target instances */ namespace target { /*! \return A target for LLVM */ -EXPORT Target llvm(const std::unordered_set& options = {}); +EXPORT Target llvm(const std::unordered_set& options = + std::unordered_set()); /*! \return A target for CUDA */ -EXPORT Target cuda(const std::unordered_set& options = {}); +EXPORT Target cuda(const std::unordered_set& options = + std::unordered_set()); /*! \return A target for ROCm */ -EXPORT Target rocm(const std::unordered_set& options = {}); +EXPORT Target rocm(const std::unordered_set& options = + std::unordered_set()); /*! \return A target for OpenCL */ -EXPORT Target opencl(const std::unordered_set& options = {}); +EXPORT Target opencl(const std::unordered_set& options = + std::unordered_set()); /*! \return A target for Metal */ -EXPORT Target metal(const std::unordered_set& options = {}); +EXPORT Target metal(const std::unordered_set& options = + std::unordered_set()); /*! \return A target for rasp */ -EXPORT Target rasp(const std::unordered_set& options = {}); +EXPORT Target rasp(const std::unordered_set& options = + std::unordered_set()); /*! \return A target for Mali */ -EXPORT Target mali(const std::unordered_set& options = {}); +EXPORT Target mali(const std::unordered_set& options = + std::unordered_set()); /*! 
\return A target for stackvm */ -EXPORT Target stackvm(const std::unordered_set& options = {}); +EXPORT Target stackvm(const std::unordered_set& options = + std::unordered_set()); } // namespace target From 5922b7e19178887f82cebaed79dd70f2efd93b7a Mon Sep 17 00:00:00 2001 From: ZhangXinqian Date: Sat, 24 Mar 2018 10:42:05 -0500 Subject: [PATCH 217/948] Fix the issue #1036 (#1040) --- apps/howto_deploy/tvm_runtime_pack.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/howto_deploy/tvm_runtime_pack.cc b/apps/howto_deploy/tvm_runtime_pack.cc index 445768128413..e35aa2f90561 100644 --- a/apps/howto_deploy/tvm_runtime_pack.cc +++ b/apps/howto_deploy/tvm_runtime_pack.cc @@ -56,7 +56,7 @@ // Uncomment the following lines to enable CUDA // #include "../../src/runtime/cuda/cuda_device_api.cc" -// #include "../../src/runtime/cuda/cuda_runtime.cc" +// #include "../../src/runtime/cuda/cuda_module.cc" // Uncomment the following lines to enable OpenCL // #include "../../src/runtime/opencl/opencl_device_api.cc" From 8a61bb1f13d8843586ae8a848b240c6cd1307fb8 Mon Sep 17 00:00:00 2001 From: Kirill Mavreshko Date: Sat, 24 Mar 2018 22:12:36 +0500 Subject: [PATCH 218/948] Fix fatal error when building with CMake (#1045) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index eb52d1f82723..ff74458f27d2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -154,7 +154,7 @@ endif(USE_CUDA) if(USE_OPENCL) find_package(OpenCL QUIET REQUIRED) message(STATUS "Build with OpenCL support") - include_directories(${OPENCL_INCLUDE_DIRS}) + include_directories(${OpenCL_INCLUDE_DIRS}) list(APPEND TVM_RUNTIME_LINKER_LIBS ${OpenCL_LIBRARIES}) list(APPEND RUNTIME_SRCS ${RUNTIME_OPENCL_SRCS}) add_definitions(-DTVM_OPENCL_RUNTIME=1) From b59683841f648f483808482a9778f27ec2a64377 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 24 Mar 2018 18:34:53 -0700 Subject: [PATCH 219/948] [CYTHON] Fix exception propagation for cython3 (#1046) --- python/tvm/_ffi/_cython/base.pxi | 4 ++-- python/tvm/_ffi/_cython/function.pxi | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/python/tvm/_ffi/_cython/base.pxi b/python/tvm/_ffi/_cython/base.pxi index b84391648f35..c027c723de08 100644 --- a/python/tvm/_ffi/_cython/base.pxi +++ b/python/tvm/_ffi/_cython/base.pxi @@ -64,8 +64,8 @@ ctypedef int (*TVMPackedCFunc)( ctypedef void (*TVMPackedCFuncFinalizer)(void* resource_handle) cdef extern from "tvm/runtime/c_runtime_api.h": - void TVMAPISetLastError(const char* msg); - const char *TVMGetLastError(); + void TVMAPISetLastError(const char* msg) + const char *TVMGetLastError() int TVMFuncCall(TVMFunctionHandle func, TVMValue* arg_values, int* type_codes, diff --git a/python/tvm/_ffi/_cython/function.pxi b/python/tvm/_ffi/_cython/function.pxi index 7cadf22a1cc7..50f89a5f4aaa 100644 --- a/python/tvm/_ffi/_cython/function.pxi +++ b/python/tvm/_ffi/_cython/function.pxi @@ -74,10 +74,10 @@ def convert_to_tvm_func(object pyfunc): return ret -cdef inline void make_arg(object arg, - TVMValue* value, - int* tcode, - list temp_args): +cdef inline int make_arg(object arg, + TVMValue* value, + int* tcode, + list temp_args) except -1: """Pack arguments into c args tvm call accept""" cdef unsigned long long ptr if isinstance(arg, NodeBase): @@ -152,6 +152,7 @@ cdef inline void make_arg(object arg, temp_args.append(arg) else: raise TypeError("Don't know how to handle type %s" % type(arg)) + return 0 cdef inline bytearray make_ret_bytes(void* chandle): 
handle = ctypes_handle(chandle) From 9907c080a6f46a39cfae646bb33369a6572f8757 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 24 Mar 2018 18:58:55 -0700 Subject: [PATCH 220/948] Fix verilog testcase (#1047) --- tests/verilog/integration/test_codegen_verilog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/verilog/integration/test_codegen_verilog.py b/tests/verilog/integration/test_codegen_verilog.py index e25ff6d9dff4..26c0a9e36c9d 100644 --- a/tests/verilog/integration/test_codegen_verilog.py +++ b/tests/verilog/integration/test_codegen_verilog.py @@ -18,7 +18,7 @@ def lower(s, args, name): stmt = tvm.ir_pass.CanonicalSimplify(stmt) stmt = tvm.ir_pass.Simplify(stmt) stmt = tvm.ir_pass.SplitPipeline(stmt, True) - fapi = tvm.ir_pass.MakeAPI(stmt, name, arg_list, 0) + fapi = tvm.ir_pass.MakeAPI(stmt, name, arg_list, 0, True) return fapi @tvm.register_func From 2d46d88fb6bc73305737d9fc88eab1d5bfd20006 Mon Sep 17 00:00:00 2001 From: HungMingWu Date: Sun, 25 Mar 2018 21:32:56 +0800 Subject: [PATCH 221/948] LLVM 7.0 support (#1048) --- src/codegen/llvm/llvm_module.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/codegen/llvm/llvm_module.cc b/src/codegen/llvm/llvm_module.cc index 439c132d09a8..33fbf569726c 100644 --- a/src/codegen/llvm/llvm_module.cc +++ b/src/codegen/llvm/llvm_module.cc @@ -68,7 +68,11 @@ class LLVMModuleNode final : public runtime::ModuleNode { CHECK_EQ(ecode.value(), 0) << "Cannot open file: " << file_name << " " << ecode.message(); if (fmt == "o" || fmt == "obj") { +#if TVM_LLVM_VERSION <= 60 std::unique_ptr m = llvm::CloneModule(mptr_); +#else + std::unique_ptr m = llvm::CloneModule(*mptr_); +#endif llvm::legacy::PassManager pass; CHECK(tm_); CHECK(tm_->addPassesToEmitFile( @@ -76,7 +80,11 @@ class LLVMModuleNode final : public runtime::ModuleNode { << "Cannot emit target CGFT_ObjectFile"; pass.run(*m); } else if (fmt == "s" || fmt == "asm") { +#if TVM_LLVM_VERSION <= 60 std::unique_ptr m = llvm::CloneModule(mptr_); +#else + std::unique_ptr m = llvm::CloneModule(*mptr_); +#endif llvm::legacy::PassManager pass; CHECK(tm_); CHECK(tm_->addPassesToEmitFile( @@ -86,7 +94,11 @@ class LLVMModuleNode final : public runtime::ModuleNode { } else if (fmt == "ll") { mptr_->print(dest, nullptr); } else if (fmt == "bc") { +#if TVM_LLVM_VERSION <= 60 llvm::WriteBitcodeToFile(mptr_, dest); +#else + llvm::WriteBitcodeToFile(*mptr_, dest); +#endif } else { LOG(FATAL) << "Do not know how to save file " << file_name << " with format=\'"<< format << "\'"; From 567a10bb0947180b067f39a97c76d7fe7a3ca1f2 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 25 Mar 2018 15:07:57 -0400 Subject: [PATCH 222/948] [CONTRIB] Patch nnvcc to generate error when build the empty result (#1049) --- README.md | 2 +- python/tvm/contrib/nvcc.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 07e550d76043..a25246245c66 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -TVM: Tensor IR Stack for Deep Learning Systems + Tensor IR Stack for Deep Learning Systems ============================================== [![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE) diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py index e3c2b7895da7..f08d639cf887 100644 --- a/python/tvm/contrib/nvcc.py +++ b/python/tvm/contrib/nvcc.py @@ -75,8 +75,11 @@ def compile_cuda(code, msg += py_str(out) raise RuntimeError(msg) - return bytearray(open(file_target, "rb").read()) - + data = 
bytearray(open(file_target, "rb").read()) + if not data: + raise RuntimeError( + "Compilation error: empty result is generated") + return data def find_cuda_path(): """Utility function to find cuda path From bbe4974ef4c5b2c4e4f11c9dd4396c4c0e8e669b Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 26 Mar 2018 10:43:40 -0400 Subject: [PATCH 223/948] [SCHEDULE][PASS] Enable Warp memory and lower to shuffle (#1050) * [SCHEDULE][PASS] Enable Warp memory and lower to shuffle * OpenCL dispatches for now to intel shuffle --- include/tvm/ir.h | 8 + include/tvm/ir_pass.h | 9 + python/tvm/build_module.py | 4 + src/api/api_pass.cc | 1 + src/codegen/codegen_c.cc | 14 +- src/codegen/intrin_rule_cuda.cc | 10 + src/codegen/intrin_rule_opencl.cc | 11 + src/pass/ir_util.h | 17 + src/pass/lower_warp_memory.cc | 317 ++++++++++++++++++ src/schedule/bound.cc | 11 +- tests/python/integration/test_ewise.py | 40 +++ .../unittest/test_pass_lower_warp_memory.py | 27 ++ .../unittest/test_schedule_bound_inference.py | 24 ++ 13 files changed, 481 insertions(+), 12 deletions(-) create mode 100644 src/pass/lower_warp_memory.cc create mode 100644 tests/python/unittest/test_pass_lower_warp_memory.py diff --git a/include/tvm/ir.h b/include/tvm/ir.h index f36d914e621f..9e3c8cbc2be1 100644 --- a/include/tvm/ir.h +++ b/include/tvm/ir.h @@ -411,6 +411,14 @@ constexpr const char* tvm_call_packed_lowered = "tvm_call_packed_lowered"; * } */ constexpr const char* tvm_storage_sync = "tvm_storage_sync"; +/*! + * \brief See pseudo code + * + * Type tvm_warp_shuffle(Type value, warp_id) { + * return (value passed in by warp indicated by warp_id); + * } + */ +constexpr const char* tvm_warp_shuffle = "tvm_warp_shuffle"; /*! * \brief Initialize the global barrier. * Call this at beginning of kernel that need global barrier. diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h index 572385d9a895..1ae41032cbb8 100644 --- a/include/tvm/ir_pass.h +++ b/include/tvm/ir_pass.h @@ -407,6 +407,15 @@ LoweredFunc ThreadSync(LoweredFunc stmt, std::string storage_scope); */ LoweredFunc LowerThreadAllreduce(LoweredFunc f, int warp_size); +/*! + * \brief Lower warp memory in stmt. + * \param f The device function to be lowered. + * \param warp_size the size of warp where no sync is needed. + * this function will only take in effect if warp_size is bigger than one. + * \return Transformed function. + */ +LoweredFunc LowerWarpMemory(LoweredFunc f, int warp_size); + /*! * \brief Lower packed function call. * \param f The function to be lowered. diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index 03a79860e9ee..0b86cde626af 100755 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -450,6 +450,10 @@ def build(sch, else: raise ValueError("unknown function type %d" % func.func_type) + for i, func in enumerate(fdevice): + warp_size = target.thread_warp_size + fdevice[i] = ir_pass.LowerWarpMemory(func, warp_size) + if "gpu" in target.keys and not fdevice: warnings.warn( "Specified target %s, but cannot find device code, did you do bind?" 
% target) diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc index 7ec6ef4009e4..6d59cb3ae505 100644 --- a/src/api/api_pass.cc +++ b/src/api/api_pass.cc @@ -125,6 +125,7 @@ REGISTER_PASS2(SplitPipeline); REGISTER_PASS2(LiftAttrScope); REGISTER_PASS1(NarrowChannelAccess); REGISTER_PASS2(LowerThreadAllreduce); +REGISTER_PASS2(LowerWarpMemory); REGISTER_PASS2(LowerIntrin); REGISTER_PASS1(LowerTVMBuiltin); REGISTER_PASS1(CombineContextCall); diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc index e00cd82abe48..9732f0ef65af 100644 --- a/src/codegen/codegen_c.cc +++ b/src/codegen/codegen_c.cc @@ -5,6 +5,7 @@ #include #include #include "./codegen_c.h" +#include "../pass/ir_util.h" #include "../arithmetic/compute_expr.h" namespace tvm { @@ -544,15 +545,6 @@ void CodeGenC::PrintVecBinaryOp( } } -inline bool TryGetRamp1Base(Expr index, int lanes, Expr *base) { - const Ramp* r = index.as(); - if (!r) return false; - if (!is_one(r->stride)) return false; - CHECK_EQ(r->lanes, lanes); - *base = r->base; - return true; -} - void CodeGenC::VisitExpr_(const Load* op, std::ostream& os) { // NOLINT(*) int lanes = op->type.lanes(); // delcare type. @@ -563,7 +555,7 @@ void CodeGenC::VisitExpr_(const Load* op, std::ostream& os) { // NOLINT(*) CHECK(is_one(op->predicate)) << "predicated load is not supported"; Expr base; - if (TryGetRamp1Base(op->index, op->type.lanes(), &base)) { + if (GetRamp1Base(op->index, op->type.lanes(), &base)) { std::string ref = GetVecLoad(op->type, op->buffer_var.get(), base); os << ref; } else { @@ -617,7 +609,7 @@ void CodeGenC::VisitStmt_(const Store* op) { CHECK(is_one(op->predicate)) << "Predicated store is not supported"; Expr base; - if (TryGetRamp1Base(op->index, t.lanes(), &base)) { + if (GetRamp1Base(op->index, t.lanes(), &base)) { std::string value = this->PrintExpr(op->value); this->PrintVecStore(op->buffer_var.get(), t, base, value); } else { diff --git a/src/codegen/intrin_rule_cuda.cc b/src/codegen/intrin_rule_cuda.cc index 9abb99d7c7c5..1d199fe5af28 100644 --- a/src/codegen/intrin_rule_cuda.cc +++ b/src/codegen/intrin_rule_cuda.cc @@ -49,6 +49,12 @@ struct CUDAPopcount { } }; +struct CUDAShuffle { + std::string operator()(Type t, std::string name) const { + return "__shfl"; + } +}; + TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp") .set_body(DispatchExtern); @@ -67,6 +73,10 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.pow") TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.popcount") .set_body(DispatchExtern); +TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.tvm_warp_shuffle") +.set_body(DispatchExtern); + + } // namespace intrin } // namespace codegen } // namespace tvm diff --git a/src/codegen/intrin_rule_opencl.cc b/src/codegen/intrin_rule_opencl.cc index 924abcade63f..b8b2412215d1 100644 --- a/src/codegen/intrin_rule_opencl.cc +++ b/src/codegen/intrin_rule_opencl.cc @@ -27,6 +27,17 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.opencl.pow") TVM_REGISTER_GLOBAL("tvm.intrin.rule.opencl.popcount") .set_body(DispatchExtern); +// There is no warp shuffle instruction in standard OpenCL +// When shuffle is used, we assume it is intel's shuffle extension +struct IntelShuffle { + std::string operator()(Type t, std::string name) const { + return "intel_sub_group_shuffle"; + } +}; + +TVM_REGISTER_GLOBAL("tvm.intrin.rule.opencl.tvm_warp_shuffle") +.set_body(DispatchExtern); + } // namespace intrin } // namespace codegen } // namespace tvm diff --git a/src/pass/ir_util.h b/src/pass/ir_util.h index 96a41b120e46..f871133fb74f 100644 --- a/src/pass/ir_util.h +++ b/src/pass/ir_util.h 
@@ -161,6 +161,23 @@ inline int GetTempAllocaAlignment(Type type, int32_t const_size) { } return align; } + +/*! + * \brief Pattern match index to Ramp with stride=1 + * This is a common pattern in continuous memory load. + * \param index The index formula + * \param lanes number of lanes in the ramp + * \param base The result base. + * \return true if pattern match success and store the base to base. + */ +inline bool GetRamp1Base(Expr index, int lanes, Expr *base) { + const Ramp* r = index.as(); + if (!r) return false; + if (!is_one(r->stride)) return false; + CHECK_EQ(r->lanes, lanes); + *base = r->base; + return true; +} } // namespace ir } // namespace tvm #endif // TVM_PASS_IR_UTIL_H_ diff --git a/src/pass/lower_warp_memory.cc b/src/pass/lower_warp_memory.cc new file mode 100644 index 000000000000..dbd073c0b14c --- /dev/null +++ b/src/pass/lower_warp_memory.cc @@ -0,0 +1,317 @@ +/*! + * Copyright (c) 2018 by Contributors + * + * Lower warp memory to use local memory + * and shuffle intrinsics. + * + * \file lower_warp_memory.cc + */ +// Thanks to Andrew Adams and Vinod Grover for +// explaining the concept of warp shuffle. +#include +#include +#include +#include +#include +#include "./ir_util.h" +#include "../arithmetic/compute_expr.h" +#include "../runtime/thread_storage_scope.h" + +namespace tvm { +namespace ir { + +// Rewrite Rule +// +// There is no special warp memory in most GPUs. +// Instead, we can stripe the data into threads +// and store the data into local memory. +// +// This requires us to do the following rewriting: +// - Rewrite allocation to use local memory. +// - Rewrite store of warp memory to local store. +// - Rewrite load of waro memory to local plus a shuffle. +// +// Define a generic shuffle instrinsic warp_shuffle(data, warp_index). 
+// We can use the following rewriting rule +// +// Before rewrite, +// +// alloc warp warp_mem[n * warp_size * m] +// store warp_mem[m * warp_index + (warp_size * m) * y + x] +// load warp_mem[m * z + (warp_size * m) * y + x] +// subject to x \in [0, m), y \in [0, n) +// +// After rewrite: +// +// alloc local local_mem[n * m] +// store warp_mem[m * y + x] +// warp_shuffle(load warp_mem[m * y + x], z) +// subject to (m * y + x) is invariant to warp_index + +// Algorithm +// +// To implement this rewrite rule, we can do the follow step: +// For each warp memory alloc +// - Use linear pattern detector on load index to find m +// - Deduce n given warp_size and alloc size +// - Now that we have m, n, warp_size, we can proceed with the rewrite + +// Visitor to find m in pattern +// store warp_mem[m * warp_index + (warp_size * m) * y + x] +class WarpStoreCoeffFinder : private IRVisitor { + public: + WarpStoreCoeffFinder(const Variable* buffer, + Var warp_index) + : buffer_(buffer), warp_index_(warp_index) { + } + // find the warp co-efficient in the statement given the warp size + int Find(const Stmt& stmt) { + this->Visit(stmt); + return warp_coeff_; + } + + private: + /// Visitor implementation + void Visit_(const Store *op) final { + if (op->buffer_var.get() == buffer_) { + if (op->value.type().lanes() == 1) { + UpdatePattern(op->index); + } else { + Expr base; + CHECK(GetRamp1Base(op->index, op->value.type().lanes(), &base)) + << "LowerWarpMemory failed due to store index=" << op->index + << ", can only handle continuous store"; + UpdatePattern(base); + } + } else { + IRVisitor::Visit_(op); + } + } + + void UpdatePattern(const Expr& index) { + Array m = + arith::DetectLinearEquation(index, {warp_index_}); + CHECK_EQ(m.size(), 2U) + << "LowerWarpMemory failed due to store index=" << index; + int coeff; + CHECK(arith::GetConstInt(ir::Simplify(m[0]), &coeff) && coeff > 0) + << "LowerWarpMemory failed due to store index=" << index + << ", require positive constant coefficient on warp index"; + if (warp_coeff_ != 0) { + CHECK_EQ(warp_coeff_, coeff) + << "LowerWarpMemory failed due to two different store coefficient to warp index"; + } else { + warp_coeff_ = coeff; + } + } + + // The buffer variable + const Variable* buffer_; + // the warp index + Var warp_index_; + // the coefficient + int warp_coeff_{0}; +}; + + +// Visitor to find the warp index +class WarpIndexFinder : private IRVisitor { + public: + explicit WarpIndexFinder(int warp_size) + : warp_size_(warp_size) { + } + // find the warp co-efficient in the statement given the warp size + IterVar Find(const Stmt& stmt) { + this->Visit(stmt); + CHECK(warp_index_.defined()) + << "Cannot find warp index(threadIdx.x) within the scope of warp memory"; + return warp_index_; + } + + private: + void Visit(const NodeRef &node) final { + if (warp_index_.defined()) return; + IRVisitor::Visit(node); + } + + /// Visitor implementation + void Visit_(const AttrStmt *op) final { + if (op->attr_key == attr::thread_extent) { + IterVar iv(op->node.node_); + if (iv->thread_tag == "threadIdx.x") { + int value; + CHECK(arith::GetConstInt(op->value, &value) && + value == warp_size_) + << "Expect threadIdx.x 's size to be equal to warp size(" + << warp_size_ << ")" << " to enable warp memory" + << " but get " << op->value << " instead"; + warp_index_ = iv; + } + } + IRVisitor::Visit_(op); + } + // warp size + int warp_size_{0}; + // the warp index + IterVar warp_index_{nullptr}; +}; +// Mutator to change the read pattern +class WarpAccessRewriter : protected 
IRMutator { + public: + explicit WarpAccessRewriter(int warp_size) + : warp_size_(warp_size) {} + // Rewrite the allocate statement which transforms + // warp memory to local memory. + Stmt Rewrite(const Allocate* op, const Stmt& stmt) { + buffer_ = op->buffer_var.get(); + int alloc_size = op->constant_allocation_size(); + CHECK_GT(alloc_size, 0) + << "warp memory only support constant alloc size"; + alloc_size *= op->type.lanes(); + warp_index_ = WarpIndexFinder(warp_size_).Find(op->body)->var; + warp_coeff_ = WarpStoreCoeffFinder( + buffer_, warp_index_).Find(op->body); + CHECK_EQ(alloc_size % (warp_size_ * warp_coeff_), 0) + << "Warp memory must be multiple of warp size"; + warp_group_ = alloc_size / (warp_size_ * warp_coeff_); + return Allocate::make( + op->buffer_var, + op->type, + {make_const(Int(32), alloc_size / warp_size_)}, + op->condition, + this->Mutate(op->body)); + } + + protected: + Expr Mutate_(const Variable* op, const Expr& expr) { + CHECK(op != buffer_) + << "Cannot access address of warp memory directly"; + return IRMutator::Mutate_(op, expr); + } + + Stmt Mutate_(const Store* op, const Stmt& stmt) { + if (op->buffer_var.get() == buffer_) { + Expr local_index, group; + std::tie(local_index, group) = SplitIndexByGroup(op->index); + return Store::make(op->buffer_var, op->value, local_index, op->predicate); + } else { + return IRMutator::Mutate_(op, stmt); + } + } + + Expr Mutate_(const Load* op, const Expr& expr) { + if (op->buffer_var.get() == buffer_) { + Expr local_index, group; + std::tie(local_index, group) = SplitIndexByGroup(op->index); + // invariance: local index must do not contain warp id + CHECK(!ExprUseVar(local_index, {warp_index_.get()})) + << "LowerWarpMemory failed to rewrite load to shuffle for index " + << op->index << " local_index=" << local_index; + Expr load_value = Load::make( + op->type, op->buffer_var, local_index, op->predicate); + return Call::make(load_value.type(), + intrinsic::tvm_warp_shuffle, + {load_value, group}, + Call::Intrinsic); + } else { + return IRMutator::Mutate_(op, expr); + } + } + // Split the index to the two component + // + // local index is the index in the local + // source index is the corresponding source index + // in this access pattern. + std::pair SplitIndexByGroup(const Expr& index) { + if (index.type().lanes() != 1) { + Expr base, local_index, group; + CHECK(GetRamp1Base(index, index.type().lanes(), &base)); + std::tie(local_index, group) = SplitIndexByGroup(base); + local_index = + Ramp::make(local_index, make_const(local_index.type(), 1), index.type().lanes()); + return std::make_pair(local_index, group); + } + Expr m = make_const(index.type(), warp_coeff_); + Range rng = Range::make_by_min_extent( + make_zero(index.type()), make_const(index.type(), warp_size_)); + Map vrange({{warp_index_, rng}}); + + // simple case, warp index is on the highest. 
+ if (warp_group_ == 1) { + Expr x = Simplify(index % m, vrange); + Expr z = Simplify(index / m, vrange); + return std::make_pair(x, z); + } else { + Expr x = Simplify(index % m, vrange); + Expr y = index / make_const(index.type(), warp_coeff_ * warp_size_); + y = y * m + x; + Expr z = index % make_const(index.type(), warp_coeff_ * warp_size_) / m; + return std::make_pair(Simplify(y, vrange), Simplify(z, vrange)); + } + } + + private: + // the warp size + int warp_size_{0}; + // The buffer variable + const Variable* buffer_; + // Warp index + Var warp_index_; + // the coefficient m + int warp_coeff_{0}; + // the coefficient n + int warp_group_{0}; +}; + +// Mutator to change the read pattern +class WarpMemoryRewriter : private IRMutator { + public: + explicit WarpMemoryRewriter(int warp_size) + : warp_size_(warp_size) { + } + + Stmt Rewrite(Stmt stmt) { + if (warp_size_ == 1) return stmt; + return this->Mutate(stmt); + } + + private: + Stmt Mutate_(const Allocate* op, const Stmt& stmt) { + if (warp_buffer_.count(op->buffer_var.get())) { + WarpAccessRewriter rewriter(warp_size_); + return rewriter.Rewrite(op, stmt); + } else { + return IRMutator::Mutate_(op, stmt); + } + } + + Stmt Mutate_(const AttrStmt* op, const Stmt& stmt) { + using runtime::StorageScope; + if (op->attr_key == attr::storage_scope) { + const Variable* buf = op->node.as(); + StorageScope scope = StorageScope::make(op->value.as()->value); + if (scope.rank == runtime::StorageRank::kWarp) { + warp_buffer_.insert(buf); + Stmt ret = IRMutator::Mutate_(op, stmt); + op = ret.as(); + return AttrStmt::make( + op->node, op->attr_key, StringImm::make("local"), op->body); + } + } + return IRMutator::Mutate_(op, stmt); + } + + int warp_size_{0}; + std::unordered_set warp_buffer_; +}; + +LoweredFunc +LowerWarpMemory(LoweredFunc f, int warp_size) { + CHECK_EQ(f->func_type, kDeviceFunc); + auto n = std::make_shared(*f.operator->()); + n->body = WarpMemoryRewriter(warp_size).Rewrite(n->body); + return LoweredFunc(n); +} + +} // namespace ir +} // namespace tvm diff --git a/src/schedule/bound.cc b/src/schedule/bound.cc index 908b579ec9a4..7929969a8502 100644 --- a/src/schedule/bound.cc +++ b/src/schedule/bound.cc @@ -42,7 +42,16 @@ bool NeedRelax(const IterVar& iv, if (tag.length() == 0 || tag == "pipeline") { return !found_attach; } - return static_cast(scope.rank) <= ThreadScope::make(tag).rank; + ThreadScope ts = ThreadScope::make(tag); + + // When there is warp memory + // threadIdx.x must be set to be warp index. 
+ if (scope.rank == StorageRank::kWarp && + ts.rank == 1 && + ts.dim_index == 0) { + return true; + } + return static_cast(scope.rank) <= ts.rank; } // infer storage scope, if not given diff --git a/tests/python/integration/test_ewise.py b/tests/python/integration/test_ewise.py index 414b1ff008fe..ee880ed1d9fb 100644 --- a/tests/python/integration/test_ewise.py +++ b/tests/python/integration/test_ewise.py @@ -1,4 +1,5 @@ import tvm +from tvm.contrib import nvcc import numpy as np import time @@ -155,7 +156,46 @@ def check_device(device): run("uint64") +def try_warp_memory(): + """skip this in default test because it require higher arch""" + m = 128 + A = tvm.placeholder((m,), name='A') + B = tvm.compute((m,), lambda i: A[i] + 3, name='B') + warp_size = 32 + s = tvm.create_schedule(B.op) + AA = s.cache_read(A, "warp", [B]) + xo, xi = s[B].split(B.op.axis[0], warp_size * 2) + xi0, xi1 = s[B].split(xi, factor=warp_size) + tx = tvm.thread_axis("threadIdx.x") + s[B].bind(xi1, tx) + s[B].bind(xo, tvm.thread_axis("blockIdx.x")) + s[AA].compute_at(s[B], xo) + xo, xi = s[AA].split(s[AA].op.axis[0], warp_size) + s[AA].bind(xi, tx) + + @tvm.register_func + def tvm_callback_cuda_compile(code): + ptx = nvcc.compile_cuda(code, target="ptx") + return ptx + + # one line to build the function. + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("skip because %s is not enabled.." % device) + return + f = tvm.build(s, [A, B], device) + a = tvm.nd.array((np.random.uniform(size=m) * 256).astype(A.dtype), ctx) + b = tvm.nd.array(np.zeros(m, dtype=B.dtype), ctx) + f(a, b) + np.testing.assert_allclose( + b.asnumpy(), a.asnumpy() + 3, rtol=1e-6) + + check_device("cuda") + + if __name__ == "__main__": + try_warp_memory() test_add() test_log_pow_llvm() test_exp() diff --git a/tests/python/unittest/test_pass_lower_warp_memory.py b/tests/python/unittest/test_pass_lower_warp_memory.py new file mode 100644 index 000000000000..9793b21371bd --- /dev/null +++ b/tests/python/unittest/test_pass_lower_warp_memory.py @@ -0,0 +1,27 @@ +import tvm + +def test_lower_warp_mem(): + m = 128 + A = tvm.placeholder((m,), name='A') + B = tvm.compute((m,), lambda i: A[i] + 3, name='B') + + s = tvm.create_schedule(B.op) + AA = s.cache_read(A, "warp", [B]) + xo, xi = s[B].split(B.op.axis[0], 32) + xi0, xi1 = s[B].split(xi, factor=16) + tx = tvm.thread_axis("threadIdx.x") + s[B].bind(xi1, tx) + s[B].bind(xo, tvm.thread_axis("blockIdx.x")) + s[AA].compute_at(s[B], xo) + xo, xi = s[AA].split(s[AA].op.axis[0], 16) + s[AA].bind(xi, tx) + + f = tvm.lower(s, [A, B]) + fhost, fdevice = tvm.ir_pass.SplitHostDevice(f) + fdevice = tvm.ir_pass.LowerWarpMemory(fdevice, 16) + assert(fdevice.body.body.value.value == "local") + assert(fdevice.body.body.body.extents[0].value == 2) + + +if __name__ == "__main__": + test_lower_warp_mem() diff --git a/tests/python/unittest/test_schedule_bound_inference.py b/tests/python/unittest/test_schedule_bound_inference.py index 3601833de08d..30be3783bbb3 100644 --- a/tests/python/unittest/test_schedule_bound_inference.py +++ b/tests/python/unittest/test_schedule_bound_inference.py @@ -53,6 +53,29 @@ def test_bound3(): assert(bounds[A1.op.axis[0]].extent.value==32) assert(bounds[A1.op.axis[1]].extent.value==16) + +def test_bound_warp(): + m = tvm.var('m') + l = tvm.var('l') + A = tvm.placeholder((m, l), name='A') + A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1') + A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') + + s = tvm.create_schedule(A2.op) + 
s[A1].set_scope("warp") + xo, xi = s[A2].split(A2.op.axis[0], 32) + xi0, xi1 = s[A2].split(xi, factor=16) + tx = tvm.thread_axis("threadIdx.x") + s[A2].bind(xi1, tx) + s[A2].bind(xi0, tvm.thread_axis("threadIdx.y")) + y = s[A2].op.axis[1] + s[A1].compute_at(s[A2], y) + xo, xi = s[A1].split(s[A1].op.axis[0], factor=16) + s[A1].bind(xi, tx) + bounds = tvm.schedule.InferBound(s) + assert isinstance(bounds, tvm.container.Map) + assert(bounds[A1.op.axis[0]].extent.value==16) + def test_bound_scan(): m = tvm.var("m") n = tvm.var("n") @@ -249,3 +272,4 @@ def test_gemm_bound(): test_bound_conv1d() test_bound2() test_gemm_bound() + test_bound_warp() From f3b6a0bc2c001771b9036dd780c7edb45eb1cc76 Mon Sep 17 00:00:00 2001 From: Pariksheet Pinjari Date: Tue, 27 Mar 2018 22:24:49 +0530 Subject: [PATCH 224/948] Update softmax.h (#1057) --- topi/include/topi/nn/softmax.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/topi/include/topi/nn/softmax.h b/topi/include/topi/nn/softmax.h index f542f0eb0256..d17f93046e72 100644 --- a/topi/include/topi/nn/softmax.h +++ b/topi/include/topi/nn/softmax.h @@ -83,7 +83,7 @@ inline Tensor softmax(const Tensor &x, }); return tvm::compute(input_shape, [&](const Array &indices) { return _normalize(max_elem, expsum, indices); - }); + }, name, tag); } /*! @@ -116,7 +116,7 @@ inline Tensor log_softmax(const Tensor& x, return tvm::compute( x->shape, [&](Var i, Var j) { return x(i, j) - max_elem(i) - tvm::log(expsum(i)); - }); + }, name, tag); } } // namespace nn From 7ea69dbf399bf38d114ab4516501c34920f4e925 Mon Sep 17 00:00:00 2001 From: Ding <37059654+dingobye@users.noreply.github.com> Date: Wed, 28 Mar 2018 03:57:21 +1100 Subject: [PATCH 225/948] [TOPI] Overload operators of Tensor when TOPI is imported (#1029) --- python/tvm/__init__.py | 1 + python/tvm/expr.py | 16 +- python/tvm/generic.py | 81 ++++++ .../unittest/test_lang_tensor_overload_op.py | 249 ++++++++++++++++++ topi/python/topi/__init__.py | 1 + topi/python/topi/generic_op_impl.py | 123 +++++++++ 6 files changed, 464 insertions(+), 7 deletions(-) create mode 100644 python/tvm/generic.py create mode 100644 tests/python/unittest/test_lang_tensor_overload_op.py create mode 100644 topi/python/topi/generic_op_impl.py diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index 1119dd3fcfc0..e4068cca3b9f 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -15,6 +15,7 @@ from . import node from . import ir_builder from . import target +from . import generic from . import ndarray as nd from .ndarray import context, cpu, gpu, opencl, cl, vulkan, metal, mtl diff --git a/python/tvm/expr.py b/python/tvm/expr.py index 750e78909c7a..5ba2928306e8 100644 --- a/python/tvm/expr.py +++ b/python/tvm/expr.py @@ -18,32 +18,34 @@ from __future__ import absolute_import as _abs from ._ffi.node import NodeBase, NodeGeneric, register_node from . import make as _make +from . import generic as _generic from . 
import _api_internal + class ExprOp(object): def __add__(self, other): - return _make.Add(self, other) + return _generic.add(self, other) def __radd__(self, other): return self.__add__(other) def __sub__(self, other): - return _make.Sub(self, other) + return _generic.subtract(self, other) def __rsub__(self, other): - return _make.Sub(other, self) + return _generic.subtract(other, self) def __mul__(self, other): - return _make.Mul(self, other) + return _generic.multiply(self, other) def __rmul__(self, other): - return _make.Mul(other, self) + return _generic.multiply(other, self) def __div__(self, other): - return _make.Div(self, other) + return _generic.divide(self, other) def __rdiv__(self, other): - return _make.Div(other, self) + return _generic.divide(other, self) def __truediv__(self, other): return self.__div__(other) diff --git a/python/tvm/generic.py b/python/tvm/generic.py new file mode 100644 index 000000000000..7e70d66e432a --- /dev/null +++ b/python/tvm/generic.py @@ -0,0 +1,81 @@ +"""Generic opertors in TVM. +We follow the numpy naming convention for this interface +(e.g., tvm.generic.multitply ~ numpy.multiply). +The default implementation is used by tvm.ExprOp. +""" +# pylint: disable=unused-argument +from . import make as _make + +#Operator precedence used when overloading. +__op_priority__ = 0 + +def add(lhs, rhs): + """Generic add operator. + + Parameters + ---------- + lhs : object + The left operand. + rhs : object + The right operand. + + Returns + ------- + op : tvm.Expr + The result Expr of add operaton. + """ + return _make.Add(lhs, rhs) + + +def subtract(lhs, rhs): + """Generic subtract operator. + + Parameters + ---------- + lhs : object + The left operand. + rhs : object + The right operand. + + Returns + ------- + op : tvm.Expr + The result Expr of subtract operaton. + """ + return _make.Sub(lhs, rhs) + + +def multiply(lhs, rhs): + """Generic multiply operator. + + Parameters + ---------- + lhs : object + The left operand. + rhs : object + The right operand. + + Returns + ------- + op : tvm.Expr + The result Expr of multiply operaton. + """ + return _make.Mul(lhs, rhs) + + +def divide(lhs, rhs): + """Generic divide operator. + + Parameters + ---------- + lhs : object + The left operand. + rhs : object + The right operand. + + Returns + ------- + op : tvm.Expr + The result Expr of divide operaton. 
+ """ + return _make.Div(lhs, rhs) diff --git a/tests/python/unittest/test_lang_tensor_overload_op.py b/tests/python/unittest/test_lang_tensor_overload_op.py new file mode 100644 index 000000000000..c1d860b89b61 --- /dev/null +++ b/tests/python/unittest/test_lang_tensor_overload_op.py @@ -0,0 +1,249 @@ +import numpy as np +import tvm +import topi +import topi.testing +from topi.util import get_const_tuple + + +def test_operator_type_and_tags(): + k = 1 + n = tvm.var('n') + A = tvm.placeholder((), name='A') + B = tvm.placeholder((10, 5), name='B') + B1 = B[0] + B2 = B[0,0] + + assert isinstance(k + n, tvm.expr.Expr) + assert isinstance(n + n, tvm.expr.Expr) + assert isinstance(k + A, tvm.expr.Expr) + assert isinstance(A + k, tvm.expr.Expr) + assert isinstance(n + A, tvm.expr.Expr) + assert isinstance(A + n, tvm.expr.Expr) + assert isinstance(A + A, tvm.expr.Expr) + + assert isinstance(k + B, tvm.tensor.Tensor) + assert isinstance(B + k, tvm.tensor.Tensor) + assert isinstance(n + B, tvm.tensor.Tensor) + assert isinstance(B + n, tvm.tensor.Tensor) + assert isinstance(A + B, tvm.tensor.Tensor) + assert isinstance(B + A, tvm.tensor.Tensor) + assert isinstance(B + B, tvm.tensor.Tensor) + + assert (k + B).op.tag == topi.tag.ELEMWISE + assert (B + k).op.tag == topi.tag.ELEMWISE + assert (n + B).op.tag == topi.tag.ELEMWISE + assert (B + n).op.tag == topi.tag.ELEMWISE + assert (A + B).op.tag == topi.tag.ELEMWISE + assert (B + A).op.tag == topi.tag.ELEMWISE + assert (B + B).op.tag == topi.tag.BROADCAST + + assert isinstance(k + B2, tvm.expr.Expr) + assert isinstance(B2 + k, tvm.expr.Expr) + assert isinstance(n + B2, tvm.expr.Expr) + assert isinstance(B2 + n, tvm.expr.Expr) + assert isinstance(B2 + B2, tvm.expr.Expr) + assert isinstance(B2 + A, tvm.expr.Expr) + assert isinstance(A + B2, tvm.expr.Expr) + assert isinstance(B2 + B, tvm.tensor.Tensor) + assert isinstance(B + B2, tvm.tensor.Tensor) + + +def test_combination(): + k = 3 + n = 5 + m = 10 + x = tvm.var('x') + A = tvm.placeholder((n, m), name='A') + B = tvm.placeholder((n, m), name='B') + C = tvm.placeholder((n, m), name='C') + D = k + A - B * C / x + s = tvm.create_schedule(D.op) + foo = tvm.build(s, [x, A, B, C, D], "llvm") + ctx = tvm.cpu(0) + x = 2 + a = tvm.nd.array(np.random.uniform(size=(n, m)).astype(A.dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=(n, m)).astype(B.dtype), ctx) + c = tvm.nd.array(np.random.uniform(size=(n, m)).astype(C.dtype), ctx) + d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx) + foo(x, a, b, c, d) + np.testing.assert_allclose(d.asnumpy(), k + a.asnumpy() - b.asnumpy() * c.asnumpy() / x) + + +def verify_tensor_scalar_bop(shape, typ="add"): + """Verify non-constant Tensor and scalar binary operations.""" + sh = [tvm.var('n%d' % i) for i in range(0, len(shape))] + k = tvm.var('k') + A = tvm.placeholder(sh, name='A') + if typ == "add": + B = A + k + elif typ == "sub": + B = A - k + elif typ == "mul": + B = A * k + elif typ == "div": + B = A / k + else: + raise NotImplementedError() + + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_elemwise(B) + + k_ = 2 + foo = tvm.build(s, [A, B, k] + sh, device, name="tensor_scalar_" + typ) + a_npy = np.random.uniform(size=shape).astype(A.dtype) + if typ == "add": + b_npy = a_npy + k_ + elif typ == "sub": + b_npy = a_npy - k_ + elif typ == "mul": + b_npy = a_npy * k_ + elif typ 
== "div": + b_npy = a_npy / k_ + else: + raise NotImplementedError() + + a_nd = tvm.nd.array(a_npy, ctx) + b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), ctx) + foo(a_nd, b_nd, k_, *shape) + np.testing.assert_allclose(b_nd.asnumpy(), b_npy, rtol=1e-5) + + for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan']: + check_device(device) + + +def verify_broadcast_bop(lhs_shape, rhs_shape, typ="add"): + A = tvm.placeholder(shape=lhs_shape, name="A") + B = tvm.placeholder(shape=rhs_shape, name="B") + if typ == "add": + C = A + B + elif typ == "sub": + C = A - B + elif typ == "mul": + C = A * B + elif typ == "div": + C = A / B + else: + raise NotImplementedError() + + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_broadcast(C) + + foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + typ) + lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype) + rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype) + if typ == "add": + out_npy = lhs_npy + rhs_npy + elif typ == "sub": + out_npy = lhs_npy - rhs_npy + elif typ == "mul": + out_npy = lhs_npy * rhs_npy + elif typ == "div": + rhs_npy = np.abs(rhs_npy) + 0.001 + out_npy = lhs_npy / rhs_npy + else: + raise NotImplementedError() + + lhs_nd = tvm.nd.array(lhs_npy, ctx) + rhs_nd = tvm.nd.array(rhs_npy, ctx) + out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx) + for _ in range(1): + foo(lhs_nd, rhs_nd, out_nd) + np.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4) + + for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan']: + check_device(device) + + +def verify_conv2d_scalar_bop(batch, in_size, in_channel, num_filter, kernel, stride, padding, typ="add"): + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + + k = 10.0 + with tvm.target.create(device): + A = tvm.placeholder((batch, in_channel, in_size, in_size), name='A') + W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') + B = topi.nn.conv2d(A, W, stride, padding) + if typ == "add": + C = B + k + elif typ == "sub": + C = B - k + elif typ == "mul": + C = B * k + elif typ == "div": + C = B / k + else: + raise NotImplementedError() + s = topi.generic.schedule_conv2d_nchw([C]) + + foo = tvm.build(s, [A, W, B, C], device, name="conv2d_scalar_" + typ) + + a_npy = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) + w_npy = np.random.uniform(size=get_const_tuple(W.shape)).astype(W.dtype) + b_npy = topi.testing.conv2d_nchw_python(a_npy, w_npy, stride, padding) + c_npy = np.random.uniform(size=get_const_tuple(B.shape)).astype(B.dtype) + if typ == "add": + c_npy = b_npy + k + elif typ == "sub": + c_npy = b_npy - k + elif typ == "mul": + c_npy = b_npy * k + elif typ == "div": + c_npy = b_npy / k + else: + raise NotImplementedError() + + a_nd = tvm.nd.array(a_npy, ctx) + w_nd = tvm.nd.array(w_npy, ctx) + b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), ctx) + c_nd = tvm.nd.array(np.empty(c_npy.shape).astype(C.dtype), ctx) + foo(a_nd, w_nd, b_nd, c_nd) + np.testing.assert_allclose(c_nd.asnumpy(), c_npy, rtol=1E-4, atol=1E-4) + + for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan']: + check_device(device) + + +def test_tensor_scalar_bop(): + 
verify_tensor_scalar_bop((1,), typ="add") + verify_tensor_scalar_bop((3, 5), typ="sub") + verify_tensor_scalar_bop((1, 3, 5), typ="mul") + verify_tensor_scalar_bop((2, 3, 1, 32), typ="div") + + +def test_broadcast_bop(): + verify_broadcast_bop((2, 3), (), typ="add") + verify_broadcast_bop((5, 2, 3), (1,), typ="add") + verify_broadcast_bop((1, 32), (64, 32), typ="sub") + verify_broadcast_bop((5, 64, 128), (2, 5, 64, 1), typ="mul") + verify_broadcast_bop((2, 3, 1, 32), (64, 32), typ="div") + + +def test_conv2d_scalar_bop(): + verify_conv2d_scalar_bop(1, 16, 4, 4, 3, 1, 1, typ="add") + verify_conv2d_scalar_bop(1, 32, 2, 1, 3, 1, 1, typ="sub") + verify_conv2d_scalar_bop(1, 32, 1, 1, 3, 1, 1, typ="mul") + verify_conv2d_scalar_bop(1, 16, 2, 1, 3, 1, 1, typ="div") + + +if __name__ == "__main__": + test_operator_type_and_tags() + test_combination() + test_tensor_scalar_bop() + test_broadcast_bop() + test_conv2d_scalar_bop() \ No newline at end of file diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py index 9666b88b80eb..f82a40f6b42e 100644 --- a/topi/python/topi/__init__.py +++ b/topi/python/topi/__init__.py @@ -17,6 +17,7 @@ from .math import * from .tensor import * +from .generic_op_impl import * from .reduction import * from .transform import * from .broadcast import * diff --git a/topi/python/topi/generic_op_impl.py b/topi/python/topi/generic_op_impl.py new file mode 100644 index 000000000000..16a79c095783 --- /dev/null +++ b/topi/python/topi/generic_op_impl.py @@ -0,0 +1,123 @@ +"""Implementation of generic operators in the presence of Tensor""" +# pylint: disable=invalid-name, too-many-arguments +from __future__ import absolute_import as _abs +import tvm +from . import broadcast as _broadcast +from . import tag + + +def _make_bop(elementwise_bop, broadcast_bop, orig_bop): + """Make a specific overloaded binary operator of Tensor when applicable; + apply the original operator if it is not supposed to be overloaded. + + Consider the following scenario: + OP : + | - | * | / + R0 : int | float | Expr | TensorSlice | Tensor (rank zero) + R1 : Tensor (positive rank) + + In terms of (LHS OP RHS), we apply the following overloading rules: + (1) We use broadcast_OP(LHS, RHS), when both LHS and RHS are R1. + (2) We perform element-wise operation of Tensor and scalar, + when one of LHS and RHS is R1 and another is R0. + (3) We do not overload OP (i.e. stick to orig_bop) otherwise. + + Parameters + ---------- + elementwise_bop : operator function + Operator for element-wise tensor-scalar operation, for rule (2). + + broadcast_bop : operator function + Operator for broadcast tensor-tensor operation, for rule (1). + + orig_bop: operator function + Operator before overloading, for rule (3). + + Returns + ------- + ret : operator function + The overloaded operator function if applicable or orig_bop otherwise. + """ + + name = orig_bop.__name__ + + def _tensor_bop_impl(lhs, rhs): + """Overloaded {op} operator. + + If both operands are non-zero-rank Tensors, it performs + tensor-tensor {op} operation, and broadcasts inputs when necessary. + + If one operand is non-zero-rank Tensor, while the other operand is + scalar like type (e.g., numeric types, Expr, or TensorSlice), + it performs tensor-scalar {op} operation on an element-wise basis. + + Otherwise, it performs default generic.{op} operation, as defined + in tvm.generic module. + + Parameters + ---------- + lhs : object + Left operand. + rhs : object + Right operand. 
+ + Returns + ------- + ret : tvm.Tensor (if at least one operand is non-zero-rank Tensor) + tvm.Expr (otherwise) + The result of {op} operation. + """ + + def _get_rank(x): + """Get the rank of a value. + If x is Tensor, then return its rank; + if x is scalar_like (i.e., numeric types, Expr, or TensorSlice), return 0; + otherwise, return -1. + """ + if isinstance(x, tvm.tensor.Tensor): + return len(x.shape) + elif isinstance(x, (int, float, tvm.expr.Expr, tvm.tensor.TensorSlice)): + return 0 + return -1 + + rl = _get_rank(lhs) + rr = _get_rank(rhs) + if rl == -1 or rr == -1 or (rl == 0 and rr == 0): + return orig_bop(lhs, rhs) + elif rl > 0 and rr > 0: + return broadcast_bop(lhs, rhs) + elif rl == 0: + f = lambda *i: elementwise_bop(lhs, rhs(*i)) + with tvm.tag_scope(tag=tag.ELEMWISE): + return tvm.compute(rhs.shape, f, "tensor_" + name) + elif rr == 0: + f = lambda *i: elementwise_bop(lhs(*i), rhs) + with tvm.tag_scope(tag=tag.ELEMWISE): + return tvm.compute(lhs.shape, f, "tensor_" + name) + else: + raise AssertionError("Cannot reach this line.") + + _tensor_bop_impl.__doc__ = _tensor_bop_impl.__doc__.format(op=name) + return _tensor_bop_impl + + +def _bind_generic_ops(): + """Bind generic operators for Tensor.""" + # Check __op_priority__ to make sure the binding happens only once. + __op_priority__ = 1 + if __op_priority__ > tvm.generic.__op_priority__: + tvm.generic.__op_priority__ = __op_priority__ + tvm.generic.add = _make_bop(lambda x, y: x + y, + _broadcast.broadcast_add, + tvm.generic.add) + tvm.generic.subtract = _make_bop(lambda x, y: x - y, + _broadcast.broadcast_sub, + tvm.generic.subtract) + tvm.generic.multiply = _make_bop(lambda x, y: x * y, + _broadcast.broadcast_mul, + tvm.generic.multiply) + tvm.generic.divide = _make_bop(lambda x, y: x / y, + _broadcast.broadcast_div, + tvm.generic.divide) + + +_bind_generic_ops() From 100fd8bd88472424bb9cc178607e96f7d3913c8d Mon Sep 17 00:00:00 2001 From: nhynes Date: Tue, 27 Mar 2018 09:58:54 -0700 Subject: [PATCH 226/948] Only warn when unable to find a graph input (#1052) --- src/runtime/graph/graph_runtime.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 892967b310e2..ce1c51ba58a6 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -88,7 +88,7 @@ class GraphRuntime : public ModuleNode { return static_cast(i); } } - LOG(FATAL) << "cannot find " << name << " among input"; + LOG(WARNING) << "Warning: cannot find \"" << name << "\" among input"; return -1; } /*! 
@@ -459,7 +459,8 @@ void GraphRuntime::LoadParams(dmlc::Stream* strm) { CHECK(size == names.size()) << "Invalid parameters file format"; for (size_t i = 0; i < size; ++i) { - uint32_t in_idx = GetInputIndex(names[i]); + int in_idx = GetInputIndex(names[i]); + CHECK_GE(in_idx, 0) << "Found param for non-existent input: " << names[i]; uint32_t eid = this->entry_id(input_nodes_[in_idx], 0); CHECK_LT(eid, data_entry_.size()); LoadDLTensor(strm, &data_entry_[eid]); @@ -585,7 +586,8 @@ PackedFunc GraphRuntime::GetFunction( if (name == "set_input") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { if (args[0].type_code() == kStr) { - this->SetInput(this->GetInputIndex(args[0]), args[1]); + int in_idx = this->GetInputIndex(args[0]); + if (in_idx >= 0) this->SetInput(in_idx, args[1]); } else { this->SetInput(args[0], args[1]); } @@ -597,7 +599,9 @@ PackedFunc GraphRuntime::GetFunction( } else if (name == "get_input") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { if (args[0].type_code() == kStr) { - this->GetInput(this->GetInputIndex(args[0]), args[1]); + int in_idx = this->GetInputIndex(args[0]); + CHECK_GE(in_idx, 0); + this->GetInput(in_idx, args[1]); } else { this->GetInput(args[0], args[1]); } From abe4cf969595600271bf4e8825b5e0e2ef502702 Mon Sep 17 00:00:00 2001 From: Pariksheet Pinjari Date: Tue, 27 Mar 2018 23:49:52 +0530 Subject: [PATCH 227/948] [TOPI] PReLU Support (#1008) --- topi/include/topi/nn.h | 35 +++++++++++++++++++++++++ topi/python/topi/nn/elemwise.py | 34 ++++++++++++++++++++++++ topi/src/topi.cc | 5 ++++ topi/tests/python/test_topi_relu.py | 25 ++++++++++++++++++ topi/tests/python_cpp/test_topi_relu.py | 25 ++++++++++++++++++ 5 files changed, 124 insertions(+) diff --git a/topi/include/topi/nn.h b/topi/include/topi/nn.h index df4b73bdd6f3..07e12252a357 100644 --- a/topi/include/topi/nn.h +++ b/topi/include/topi/nn.h @@ -10,6 +10,7 @@ #include #include "topi/tags.h" +#include "topi/detail/constant_utils.h" #include "tvm/ir.h" #include "tvm/ir_pass.h" #include "tvm/tvm.h" @@ -83,6 +84,40 @@ inline tvm::Tensor leaky_relu(const tvm::Tensor& t, tag); } +/*! + * \brief Creates an operation that performs a parametric rectified linear unit + * + * \param x The input data tensor + * \param slope The channel-wise slope tensor + * \param axis The axis where the channel data needs to be applied + * \param name The name of the operation + * \param tag The tag to mark the operation + * + * \return A Tensor whose op member is the relu operation + */ +template +inline tvm::Tensor prelu(const tvm::Tensor &x, + const tvm::Tensor &slope, + const int axis = 1, + std::string name = "tensor", + std::string tag = kBroadcast) { + CHECK_EQ(4, x->shape.size()); + CHECK((size_t)axis < x->shape.size()) << + "Wrong axis (" << axis << ")value. "; + CHECK(topi::detail::GetConstInt(slope->shape[0]) == + topi::detail::GetConstInt(x->shape[axis])) + << "Wrong slope shape received."; + + return tvm::compute(x->shape, + [&](const tvm::Array &indices) { + return tvm::select(x(indices) > 0, + x(indices), + x(indices) * slope(indices[axis])); + }, + name, + tag); +} + /*! * \brief Creates an operation that performs padding * diff --git a/topi/python/topi/nn/elemwise.py b/topi/python/topi/nn/elemwise.py index 856652da02e8..b4132b2f1b7a 100644 --- a/topi/python/topi/nn/elemwise.py +++ b/topi/python/topi/nn/elemwise.py @@ -2,6 +2,7 @@ from __future__ import absolute_import as _abs import tvm from .. 
import tag +from ..util import get_const_int @tvm.tag_scope(tag=tag.ELEMWISE) def relu(x): @@ -42,3 +43,36 @@ def _compute(*indices): calpha = tvm.const(alpha, value.dtype) return tvm.select(value > 0, value, value * calpha) return tvm.compute(x.shape, _compute) + +@tvm.tag_scope(tag=tag.BROADCAST) +def prelu(x, slope, axis=1): + """ PReLU. + It accepts two arguments: an input ``x`` and a weight array ``W`` + and computes the output as :math:`PReLU(x) y = x > 0 ? x : W * x`, + where :math:`*` is an elementwise multiplication for each sample in the + batch. + Arguments: + x : tvm.Tensor + Input argument. + + slope : tvm.Tensor + Channelised slope tensor for prelu + + axis : int + The axis where the channel data needs to be applied + + Returns: + y : tvm.Tensor + The result. + + Links: + [http://arxiv.org/pdf/1502.01852v1.pdf] + """ + + assert len(x.shape) == 4 and len(slope.shape) == 1 + assert axis < len(x.shape) + assert get_const_int(slope.shape[0]) == get_const_int(x.shape[axis]) + + def _compute_channelwise(*indices): + return tvm.select(x(*indices) > 0, x(*indices), x(*indices) * slope(indices[axis])) + return tvm.compute(x.shape, _compute_channelwise) diff --git a/topi/src/topi.cc b/topi/src/topi.cc index 2b0191cd72d7..a31a7809c667 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -190,6 +190,11 @@ TVM_REGISTER_GLOBAL("topi.nn.leaky_relu") *rv = leaky_relu(args[0]); }); +TVM_REGISTER_GLOBAL("topi.nn.prelu") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = prelu(args[0], args[1]); + }); + TVM_REGISTER_GLOBAL("topi.nn.pad") .set_body([](TVMArgs args, TVMRetValue *rv) { *rv = pad(args[0], args[1], args[2], args[3]); diff --git a/topi/tests/python/test_topi_relu.py b/topi/tests/python/test_topi_relu.py index 994ed491411e..2a70789cf7b0 100644 --- a/topi/tests/python/test_topi_relu.py +++ b/topi/tests/python/test_topi_relu.py @@ -46,13 +46,38 @@ def verify_leaky_relu(m, alpha): np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) +def verify_prelu(x, w): + X = tvm.placeholder((x), name='X') + W = tvm.placeholder((w), name='W') + x_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(X.shape)).astype(X.dtype) + w_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(W.shape)).astype(W.dtype) + + def _prelu_numpy(x, W): + return (x < 0) * (x *W.reshape(3, 1, 1)) + (x>=0) * x + + B = topi.nn.prelu(X, W) + s = tvm.create_schedule([B.op]) + + ctx = tvm.cpu(0) + x_tvm = tvm.nd.array(x_np, ctx) + w_tvm = tvm.nd.array(w_np, ctx) + + b = tvm.nd.array(np.zeros(get_const_tuple(X.shape), dtype=B.dtype), ctx) + foo = tvm.build(s, [X, W, B], "llvm", name="prelu") + foo(x_tvm, w_tvm, b) + out_np = _prelu_numpy(x_np, w_np) + np.testing.assert_allclose(b.asnumpy(), out_np, rtol=1e-5) + def test_relu(): verify_relu(10, 128) def test_leaky_relu(): verify_leaky_relu(100, 0.1) +def test_prelu(): + verify_prelu((1, 3, 2, 2), (3,)) if __name__ == "__main__": test_relu() test_leaky_relu() + test_prelu() diff --git a/topi/tests/python_cpp/test_topi_relu.py b/topi/tests/python_cpp/test_topi_relu.py index f21426635121..6b80c20dd63c 100644 --- a/topi/tests/python_cpp/test_topi_relu.py +++ b/topi/tests/python_cpp/test_topi_relu.py @@ -50,6 +50,28 @@ def verify_leaky_relu(m, alpha): foo(a, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) +def verify_prelu(x, w): + X = tvm.placeholder((x), name='X') + W = tvm.placeholder((w), name='W') + x_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(X.shape)).astype(X.dtype) + w_np = np.random.uniform(low=-1.0, 
high=1.0, size=get_const_tuple(W.shape)).astype(W.dtype) + def _prelu_numpy(x, W): + return (x < 0) * (x *W.reshape(3, 1, 1)) + (x>=0) * x + + out_np = _prelu_numpy(x_np, w_np) + B = topi.cpp.nn.prelu(X, W) + device = "llvm" + target = topi.cpp.TEST_create_target(device) + s = topi.cpp.generic.schedule_injective(target, [B]) + + ctx = tvm.cpu(0) + x_tvm = tvm.nd.array(x_np, ctx) + w_tvm = tvm.nd.array(w_np, ctx) + + b = tvm.nd.array(np.zeros(get_const_tuple(X.shape), dtype=B.dtype), ctx) + foo = tvm.build(s, [X, W, B], "llvm", name="prelu") + foo(x_tvm, w_tvm, b) + np.testing.assert_allclose(b.asnumpy(), out_np, rtol=1e-5) def test_relu(): for dtype in ['float32', 'float64', 'int32', 'int16', 'int8', 'int64']: @@ -58,7 +80,10 @@ def test_relu(): def test_leaky_relu(): verify_leaky_relu(100, 0.1) +def test_prelu(): + verify_prelu((1, 3, 2, 2), (3,)) if __name__ == "__main__": test_relu() test_leaky_relu() + test_prelu() From 1c49f7cbc8f895412298b4049127f33a5972ce78 Mon Sep 17 00:00:00 2001 From: eqy Date: Tue, 27 Mar 2018 18:01:01 -0700 Subject: [PATCH 228/948] fix Android build w/ threading_backend (#1059) --- apps/android_rpc/app/src/main/jni/tvm_runtime.h | 2 +- src/runtime/threading_backend.cc | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h index fc384a8fcd72..0bb1ff553947 100644 --- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h +++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h @@ -21,7 +21,7 @@ #include "../src/runtime/rpc/rpc_module.cc" #include "../src/runtime/rpc/rpc_socket_impl.cc" #include "../src/runtime/thread_pool.cc" - +#include "../src/runtime/threading_backend.cc" #include "../src/runtime/graph/graph_runtime.cc" #ifdef TVM_OPENCL_RUNTIME diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc index 11df12837860..413054750682 100644 --- a/src/runtime/threading_backend.cc +++ b/src/runtime/threading_backend.cc @@ -81,8 +81,13 @@ class ThreadGroup::Impl { cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(0, &cpuset); +#if defined(__ANDROID__) + sched_setaffinity(pthread_self(), + sizeof(cpu_set_t), &cpuset); +#else pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); +#endif } #endif } From 85b903aee5f2d888318683659330c24a37472d37 Mon Sep 17 00:00:00 2001 From: cjjia <33363657+jiacunjiang1215@users.noreply.github.com> Date: Thu, 29 Mar 2018 02:12:09 +0800 Subject: [PATCH 229/948] fix warning (#1041) --- src/runtime/threading_backend.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc index 413054750682..aa15c229cfca 100644 --- a/src/runtime/threading_backend.cc +++ b/src/runtime/threading_backend.cc @@ -28,7 +28,7 @@ class ThreadGroup::Impl { } const char *val = getenv("TVM_BIND_THREADS"); if (val == nullptr || atoi(val) == 1) { - if (num_workers_ <= std::thread::hardware_concurrency()) { + if (static_cast(num_workers_) <= std::thread::hardware_concurrency()) { SetAffinity(exclude_worker0); } else { LOG(WARNING) From 1680a49f36b0d19077850ec713a06e8ac68cfbda Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 28 Mar 2018 11:24:25 -0700 Subject: [PATCH 230/948] Add Community Page (#1063) --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a25246245c66..74213e8041cd 100644 --- a/README.md +++ b/README.md @@ -10,16 +10,17 @@ [Operator Inventory](topi) | [FAQ](docs/faq.md) | 
[Contributors](CONTRIBUTORS.md) | +[Community](http://tvmlang.org/community.html) | [Release Notes](NEWS.md) TVM is a Tensor intermediate representation(IR) stack for deep learning systems. It is designed to close the gap between the productivity-focused deep learning frameworks, and the performance- and efficiency-focused hardware backends. TVM works with deep learning frameworks to provide end to end compilation to different backends. -Checkout our [announcement](http://tvmlang.org/2017/08/17/tvm-release-announcement.html) for more details. +Checkout the [tvm stack homepage](http://tvmlang.org/) for more information. License ------- -© Contributors, 2017. Licensed under an [Apache-2.0](https://github.com/dmlc/tvm/blob/master/LICENSE) license. +© Contributors Licensed under an [Apache-2.0](https://github.com/dmlc/tvm/blob/master/LICENSE) license. Contribute to TVM ----------------- From 7f92ba15b32e4e0d969e9b87af0fd60660bfeab2 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 28 Mar 2018 14:30:07 -0700 Subject: [PATCH 231/948] delete init part when keeping trivial loop (#1031) --- include/tvm/operation.h | 12 +++++------ include/tvm/schedule_pass.h | 7 +++++-- src/api/api_schedule.cc | 2 +- src/codegen/build_module.cc | 2 +- src/op/compute_op.cc | 25 +++++++++++++--------- src/op/compute_op.h | 12 +++++------ src/op/cross_thread_reduction.cc | 4 ++-- src/op/extern_op.cc | 2 +- src/op/op_util.cc | 6 +++--- src/op/op_util.h | 4 ++-- src/op/placeholder_op.cc | 2 +- src/op/scan_op.cc | 4 ++-- src/op/tensorize.cc | 4 ++-- src/schedule/schedule_ops.cc | 36 +++++++++++++++++--------------- 14 files changed, 66 insertions(+), 56 deletions(-) diff --git a/include/tvm/operation.h b/include/tvm/operation.h index 9b950c3d544f..3c819ff02ab8 100644 --- a/include/tvm/operation.h +++ b/include/tvm/operation.h @@ -117,13 +117,13 @@ class OperationNode : public FunctionBaseNode { * \brief Build the statement that provide the output tensors. * \param stage The schedule stage of the op. * \param dom_map The domain map of all iteration domains. - * \param del_trivial_loop Whether eliminate trivial loop with extent of 1 + * \param debug_keep_trivial_loop Whether keep trivial loops with extent of 1 * \return A statement that add production and wraps consumer. 
*/ virtual Stmt BuildProvide( const Stage& stage, const std::unordered_map& dom_map, - bool del_trivial_loop) const = 0; + bool debug_keep_trivial_loop) const = 0; static constexpr const char* _type_key = "Operation"; @@ -163,7 +163,7 @@ class PlaceholderOpNode : public OperationNode { Stmt BuildProvide( const Stage& stage, const std::unordered_map& dom_map, - bool del_trivial_loop) const final; + bool debug_keep_trivial_loop) const final; void VisitAttrs(AttrVisitor* v) final { v->Visit("name", &name); @@ -215,7 +215,7 @@ class ComputeOpNode : public OperationNode { Stmt BuildProvide( const Stage& stage, const std::unordered_map& dom_map, - bool del_trivial_loop) const final; + bool debug_keep_trivial_loop) const final; void VisitAttrs(AttrVisitor* v) final { v->Visit("name", &name); @@ -287,7 +287,7 @@ class ScanOpNode : public OperationNode { Stmt BuildProvide( const Stage& stage, const std::unordered_map& dom_map, - bool del_trivial_loop) const final; + bool debug_keep_trivial_loop) const final; void VisitAttrs(AttrVisitor* v) final { v->Visit("name", &name); @@ -351,7 +351,7 @@ class ExternOpNode : public OperationNode { Stmt BuildProvide( const Stage& stage, const std::unordered_map& dom_map, - bool del_trivial_loop) const final; + bool debug_keep_trivial_loop) const final; void VisitAttrs(AttrVisitor* v) final { v->Visit("name", &name); diff --git a/include/tvm/schedule_pass.h b/include/tvm/schedule_pass.h index 011c7510ced9..3f0334facc2b 100644 --- a/include/tvm/schedule_pass.h +++ b/include/tvm/schedule_pass.h @@ -29,10 +29,13 @@ Map InferBound(const Schedule& sch); * * \param s The schedule to be realized * \param dom_map The domain of each iter vars. - * \param del_trivial_loop Whether delete trivial loops with extent of 1 + * \param debug_keep_trivial_loop Whether keep trivial loops with extent of 1 during lowering. + * This is a debug feature for dataflow/axis analysis. + * Note: If this is true, The lowered IR may be incorrect, + * because we will also delete the init part of reduction * \return the result Stmt */ -Stmt ScheduleOps(Schedule s, Map dom_map, bool del_trivial_loop); +Stmt ScheduleOps(Schedule s, Map dom_map, bool debug_keep_trivial_loop); /*! * \brief To automatically inline the element-wise operations. 
diff --git a/src/api/api_schedule.cc b/src/api/api_schedule.cc index b1a6729ec662..14191d79ec02 100644 --- a/src/api/api_schedule.cc +++ b/src/api/api_schedule.cc @@ -27,7 +27,7 @@ TVM_REGISTER_API("schedule.AutoInlineInjective") TVM_REGISTER_API("schedule.ScheduleOps") .set_body([](TVMArgs args, TVMRetValue* ret) { if (args.size() == 2) - *ret = ScheduleOps(args[0], args[1], true); + *ret = ScheduleOps(args[0], args[1], false); else *ret = ScheduleOps(args[0], args[1], args[2]); }); diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index 43bc3e32aad6..210f3661372a 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -349,7 +349,7 @@ Stmt BuildStmt(Schedule sch, // Phase 0 auto bounds = schedule::InferBound(sch); - auto stmt = schedule::ScheduleOps(sch, bounds, true); + auto stmt = schedule::ScheduleOps(sch, bounds, false); stmt = ir::InjectPrefetch(stmt); // Phase 1 diff --git a/src/op/compute_op.cc b/src/op/compute_op.cc index f3f8335c195c..0dadf4ab3dd5 100644 --- a/src/op/compute_op.cc +++ b/src/op/compute_op.cc @@ -296,9 +296,9 @@ Stmt MakeProvide(const ComputeOpNode* op, Stmt MakeComputeStmt(const ComputeOpNode* self, const Stage& stage, const std::unordered_map& dom_map, - bool del_trivial_loop) { + bool debug_keep_trivial_loop) { // grab the nest structure - ComputeLoopNest n = ComputeLoopNest::make(self, stage, dom_map, del_trivial_loop); + ComputeLoopNest n = ComputeLoopNest::make(self, stage, dom_map, debug_keep_trivial_loop); // Normal loop structure n.init_nest.emplace_back(op::MakeIfNest(n.init_predicates)); n.main_nest.emplace_back(op::MakeIfNest(n.main_predicates)); @@ -319,7 +319,11 @@ Stmt MakeComputeStmt(const ComputeOpNode* self, n.main_nest.begin() + n.num_common_loop + 1, n.main_nest.end()); provide = op::Substitute(provide, n.main_vmap); provide = MergeNest(reduce, provide); - return MergeNest(common, Block::make(init, provide)); + if (debug_keep_trivial_loop) { + return MergeNest(common, provide); + } else { + return MergeNest(common, Block::make(init, provide)); + } } else { std::vector provides; for (size_t i = 0; i < self->body.size(); ++i) { @@ -379,16 +383,16 @@ ComputeType DetectComputeType(const ComputeOpNode* self, Stmt ComputeOpNode::BuildProvide( const Stage& stage, const std::unordered_map& dom_map, - bool del_trivial_loop) const { + bool debug_keep_trivial_loop) const { CHECK_EQ(stage->op.operator->(), this); ComputeType ctype = DetectComputeType(this, stage); if (ctype == ComputeType::kCrossThreadReduction) { // specially handle cross thread reduction. 
- return MakeCrossThreadReduction(this, stage, dom_map, del_trivial_loop); + return MakeCrossThreadReduction(this, stage, dom_map, debug_keep_trivial_loop); } else if (ctype == ComputeType::kTensorize) { - return MakeTensorize(this, stage, dom_map, del_trivial_loop); + return MakeTensorize(this, stage, dom_map, debug_keep_trivial_loop); } else { - return MakeComputeStmt(this, stage, dom_map, del_trivial_loop); + return MakeComputeStmt(this, stage, dom_map, debug_keep_trivial_loop); } } @@ -396,12 +400,13 @@ ComputeLoopNest ComputeLoopNest::make( const ComputeOpNode* self, const Stage& stage, const std::unordered_map& dom_map, - bool del_trivial_loop) { + bool debug_keep_trivial_loop) { CHECK_EQ(stage->op.operator->(), self); ComputeLoopNest ret; // make main loop nest ret.main_nest = op::MakeLoopNest( - stage, dom_map, 0, false, std::unordered_set(), &ret.main_vmap, del_trivial_loop); + stage, dom_map, 0, false, std::unordered_set(), &ret.main_vmap, + debug_keep_trivial_loop); ret.main_predicates = schedule::MakeBoundCheck( stage, dom_map, ret.main_vmap, false, std::unordered_set()); @@ -443,7 +448,7 @@ ComputeLoopNest ComputeLoopNest::make( } ret.init_nest = op::MakeLoopNest( stage, dom_map, begin_loop, true, - skip_iter, &(ret.init_vmap), del_trivial_loop); + skip_iter, &(ret.init_vmap), debug_keep_trivial_loop); ret.init_predicates = schedule::MakeBoundCheck( stage, dom_map, ret.init_vmap, true, skip_iter); for (auto& e : ret.init_predicates) { diff --git a/src/op/compute_op.h b/src/op/compute_op.h index 2164feee6988..996764c6cdc1 100644 --- a/src/op/compute_op.h +++ b/src/op/compute_op.h @@ -37,14 +37,14 @@ struct ComputeLoopNest { * \param self The pointer to compute op. * \param stage The scxhedule stage. * \param dom_map The domain map. - * \param del_trivial_loop Whether eliminate trivial loops with extent of 1 + * \param debug_keep_trivial_loop Whether keep trivial loops with extent of 1 * \return The constructed loop nest */ static ComputeLoopNest make( const ComputeOpNode* self, const Stage& stage, const std::unordered_map& dom_map, - bool del_trivial_loop); + bool debug_keep_trivial_loop); }; /*! @@ -52,27 +52,27 @@ struct ComputeLoopNest { * \param self The pointer to ComputeOpNode * \param stage The schedule stage. * \param dom_map The domain map. - * \param del_trivial_loop Wheter eliminate trivial loops with extent of 1 + * \param debug_keep_trivial_loop Whether keep trivial loops with extent of 1 * \return The created statement. */ Stmt MakeCrossThreadReduction( const ComputeOpNode* self, const Stage& stage, const std::unordered_map& dom_map, - bool del_trivial_loop); + bool debug_keep_trivial_loop); /*! * \brief Build body of compute for tensorization. * \param self The pointer to ComputeOpNode * \param stage The schedule stage. * \param dom_map The domain map. - * \param del_trivial_loop Wheter eliminate trivial loops with extent of 1 + * \param debug_keep_trivial_loop Whether keep trivial loops with extent of 1 * \return The created statement. 
*/ Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage, const std::unordered_map& dom_map, - bool del_trivial_loop); + bool debug_keep_trivial_loop); } // namespace tvm #endif // TVM_OP_COMPUTE_OP_H_ diff --git a/src/op/cross_thread_reduction.cc b/src/op/cross_thread_reduction.cc index e32b3dcd4407..eb320388860a 100644 --- a/src/op/cross_thread_reduction.cc +++ b/src/op/cross_thread_reduction.cc @@ -14,14 +14,14 @@ Stmt MakeCrossThreadReduction( const ComputeOpNode* self, const Stage& stage, const std::unordered_map& dom_map, - bool del_trivial_loop) { + bool debug_keep_trivial_loop) { Array args; for (IterVar iv : self->axis) { args.push_back(iv->var); } std::unordered_map value_map; auto nest = op::MakeLoopNest( - stage, dom_map, 0, false, std::unordered_set(), &value_map, del_trivial_loop); + stage, dom_map, 0, false, std::unordered_set(), &value_map, debug_keep_trivial_loop); auto conds = schedule::MakeBoundCheck( stage, dom_map, value_map, false, std::unordered_set()); diff --git a/src/op/extern_op.cc b/src/op/extern_op.cc index df3a32d50fe7..8c372966cd09 100644 --- a/src/op/extern_op.cc +++ b/src/op/extern_op.cc @@ -129,7 +129,7 @@ Stmt ExternOpNode::BuildRealize( Stmt ExternOpNode::BuildProvide( const Stage& stage, const std::unordered_map& dom_map, - bool del_trivial_loop) const { + bool debug_keep_trivial_loop) const { CHECK_EQ(stage->op.operator->(), this); Stmt ret = AttrStmt::make(make_zero(Int(32)), attr::extern_scope, 0, this->body); auto f_push_bind = [&ret](Buffer buffer, Tensor tensor) { diff --git a/src/op/op_util.cc b/src/op/op_util.cc index ef7af85bf079..daa489593298 100644 --- a/src/op/op_util.cc +++ b/src/op/op_util.cc @@ -24,7 +24,7 @@ MakeLoopNest(const Stage& stage, bool new_loop_var, const std::unordered_set& skip_iter, std::unordered_map* p_value_map, - bool del_trivial_loop) { + bool debug_keep_trivial_loop) { auto leaf_iter_vars = stage->leaf_iter_vars; Stmt no_op = Evaluate::make(0); // create the loop nest @@ -76,7 +76,7 @@ MakeLoopNest(const Stage& stage, AttrStmt::make(iv, ir::attr::pragma_scope, p, no_op)); } } - if (del_trivial_loop && is_one(dom->extent)) { + if (!debug_keep_trivial_loop && is_one(dom->extent)) { nest[i + 1].emplace_back( LetStmt::make(var, dom->min, no_op)); value_map[iv] = dom->min; @@ -131,7 +131,7 @@ MakeLoopNest(const Stage& stage, // annotate the extent of the IterVar nest[i + 1].emplace_back( AttrStmt::make(bind_iv, ir::attr::thread_extent, dom->extent, no_op)); - if (del_trivial_loop && is_one(dom->extent)) { + if (!debug_keep_trivial_loop && is_one(dom->extent)) { value_map[iv] = dom->min; } else { value_map[iv] = var; diff --git a/src/op/op_util.h b/src/op/op_util.h index 9b8f7dc629bd..558e8d4e7324 100644 --- a/src/op/op_util.h +++ b/src/op/op_util.h @@ -29,7 +29,7 @@ using ir::MergeNest; * \param new_loop_var Whether create new loop variable. * \param skip_iter Whether skip certain iteration. * \param p_value_map The result value of each IterVar. - * \param del_trivial_loop Whether eliminate trivial loops with extent of 1 + * \param debug_keep_trivial_loop Whether keep trivial loops with extent of 1 */ std::vector > MakeLoopNest(const Stage& stage, @@ -38,7 +38,7 @@ MakeLoopNest(const Stage& stage, bool new_loop_var, const std::unordered_set& skip_iter, std::unordered_map* p_value_map, - bool del_trivial_loop); + bool debug_keep_trivial_loop); /*! * \brief Create a nest of if checking the predicates. 
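To make the `MakeLoopNest` change above concrete: with the flag at its default (false), an extent-1 loop is not emitted at all; the iteration variable is simply bound to the loop minimum via a `LetStmt`. A small sketch that shows this through the ordinary lowering path (standard TVM Python API of this period; nothing below is introduced by the patch):

```python
import tvm

# The outer axis has extent 1, so with the default setting no loop is generated
# for it; the printed IR should contain only the extent-16 inner loop.
A = tvm.placeholder((1, 16), name="A")
B = tvm.compute((1, 16), lambda i, j: A[i, j] + 1.0, name="B")
s = tvm.create_schedule(B.op)
print(tvm.lower(s, [A, B], simple_mode=True))
```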
diff --git a/src/op/placeholder_op.cc b/src/op/placeholder_op.cc index 27c1fa9c5001..a2cd0eb2d81f 100644 --- a/src/op/placeholder_op.cc +++ b/src/op/placeholder_op.cc @@ -79,7 +79,7 @@ Stmt PlaceholderOpNode::BuildRealize( Stmt PlaceholderOpNode::BuildProvide( const Stage& stage, const std::unordered_map& dom_map, - bool del_trivial_loop) const { + bool debug_keep_trivial_loop) const { return Stmt(); } } // namespace tvm diff --git a/src/op/scan_op.cc b/src/op/scan_op.cc index 5c61eae0f183..a3ccd70fa763 100644 --- a/src/op/scan_op.cc +++ b/src/op/scan_op.cc @@ -253,7 +253,7 @@ Stmt ScanOpNode::BuildRealize( Stmt ScanOpNode::BuildProvide( const Stage& stage, const std::unordered_map& dom_map, - bool del_trivial_loop) const { + bool debug_keep_trivial_loop) const { CHECK_EQ(stage->op.operator->(), this); Stmt provide = AttrStmt::make( stage->op, attr::scan_update_scope, this->scan_axis->var, @@ -271,7 +271,7 @@ Stmt ScanOpNode::BuildProvide( std::unordered_map vmap; std::unordered_set empty; auto nest = op::MakeLoopNest( - stage, dom_map, 0, false, empty, &vmap, del_trivial_loop); + stage, dom_map, 0, false, empty, &vmap, debug_keep_trivial_loop); nest[begin_scan].push_back(init); nest.push_back( op::MakeIfNest( diff --git a/src/op/tensorize.cc b/src/op/tensorize.cc index 1f03ec9c0ebb..148ad0f90fe7 100644 --- a/src/op/tensorize.cc +++ b/src/op/tensorize.cc @@ -370,14 +370,14 @@ Stmt TransformUpdate(const Stage& stage, Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage, const std::unordered_map& dom_map, - bool del_trivial_loop) { + bool debug_keep_trivial_loop) { std::unordered_map out_dom; std::unordered_map > in_region; size_t tloc = InferTensorizeRegion(self, stage, dom_map, &out_dom, &in_region); TensorIntrin intrin = stage->iter_var_attrs.at( stage->leaf_iter_vars[tloc])->tensor_intrin; CHECK(intrin.defined()); - ComputeLoopNest n = ComputeLoopNest::make(self, stage, dom_map, del_trivial_loop); + ComputeLoopNest n = ComputeLoopNest::make(self, stage, dom_map, debug_keep_trivial_loop); VerifyTensorizeLoopNest(self, stage, n, tloc); VerifyTensorizeBody(self, stage, out_dom, in_region, intrin); // Start bind data. 
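The `MakeComputeStmt` change earlier in this patch is what the warning in `schedule_pass.h` refers to: when `debug_keep_trivial_loop` is true, the block that initializes the reduction buffer is no longer emitted. A hedged sketch of how one might observe the difference on a simple sum reduction, using the same assumed test-style bindings as above:

```python
import tvm

n = 16
A = tvm.placeholder((n, n), name="A")
k = tvm.reduce_axis((0, n), name="k")
B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B")

s = tvm.create_schedule(B.op).normalize()
bounds = tvm.schedule.InferBound(s)

with_init = tvm.schedule.ScheduleOps(s, bounds)           # init block (B[i] = 0) + update loop
without_init = tvm.schedule.ScheduleOps(s, bounds, True)  # debug mode: init block is dropped
print(without_init)
```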
diff --git a/src/schedule/schedule_ops.cc b/src/schedule/schedule_ops.cc index b9b02050a556..6fd2496aeabe 100644 --- a/src/schedule/schedule_ops.cc +++ b/src/schedule/schedule_ops.cc @@ -23,8 +23,8 @@ using namespace ir; Stmt MakePipeline(const Stage& s, const std::unordered_map& dom_map, Stmt consumer, - bool del_trivial_loop) { - Stmt producer = s->op->BuildProvide(s, dom_map, del_trivial_loop); + bool debug_keep_trivial_loop) { + Stmt producer = s->op->BuildProvide(s, dom_map, debug_keep_trivial_loop); if (producer.defined()) { producer = ProducerConsumer::make(s->op, true, producer); } @@ -58,9 +58,9 @@ class InjectAttach : public IRMutator { InjectAttach(const Stage& stage, const Stage& attach_spec, const std::unordered_map& dom_map, - bool del_trivial_loop) + bool debug_keep_trivial_loop) : stage_(stage), attach_spec_(attach_spec), dom_map_(dom_map), - del_trivial_loop_(del_trivial_loop) {} + debug_keep_trivial_loop_(debug_keep_trivial_loop) {} Stmt Mutate(Stmt stmt) final { CHECK(stmt.defined()); @@ -76,7 +76,7 @@ class InjectAttach : public IRMutator { found_attach = true; stmt = AttrStmt::make( op->node, op->attr_key, op->value, - MakePipeline(stage_, dom_map_, op->body, del_trivial_loop_)); + MakePipeline(stage_, dom_map_, op->body, debug_keep_trivial_loop_)); } } return stmt; @@ -91,8 +91,9 @@ class InjectAttach : public IRMutator { const Stage& attach_spec_; // domain map const std::unordered_map& dom_map_; - // whether delete trivial loops with extent of 1 - bool del_trivial_loop_; + // Whether keep trivial loops with extent of 1 during lowering. + // This is a debug feature for dataflow/axis analysis + bool debug_keep_trivial_loop_; }; // inject the operator's realization on the stmt. @@ -102,9 +103,9 @@ class InjectScanStep : public IRMutator { const Operation& scan_op, const std::unordered_map& dom_map, bool is_init, - bool del_trivial_loop) + bool debug_keep_trivial_loop) : stage_(stage), scan_op_(scan_op), - dom_map_(dom_map), is_init_(is_init), del_trivial_loop_(del_trivial_loop) {} + dom_map_(dom_map), is_init_(is_init), debug_keep_trivial_loop_(debug_keep_trivial_loop) {} Stmt Mutate(Stmt stmt) final { CHECK(stmt.defined()); @@ -118,7 +119,7 @@ class InjectScanStep : public IRMutator { found_attach = true; stmt = AttrStmt::make( op->node, op->attr_key, op->value, - MakePipeline(stage_, dom_map_, op->body, del_trivial_loop_)); + MakePipeline(stage_, dom_map_, op->body, debug_keep_trivial_loop_)); } } return stmt; @@ -135,8 +136,9 @@ class InjectScanStep : public IRMutator { const std::unordered_map& dom_map_; // whether it is init. bool is_init_; - // whether delete trivial loops with extent of 1 - bool del_trivial_loop_; + // Whether keep trivial loops with extent of 1 during lowering. 
+ // This is a debug feature for dataflow/axis analysis + bool debug_keep_trivial_loop_; }; // Postprocessing of schedule op @@ -337,7 +339,7 @@ class SchedulePostProc : public IRMutator { }; Stmt ScheduleOps( - Schedule sch, Map dom_map_, bool del_trivial_loop) { + Schedule sch, Map dom_map_, bool debug_keep_trivial_loop) { Stmt body = Stmt(); std::unordered_map dom_map = as_unordered_map(dom_map_); // scan init and scan updates @@ -372,14 +374,14 @@ Stmt ScheduleOps( if (scan_init.count(s->op)) { CHECK(body.defined()); - InjectScanStep mu(s, scan_init.at(s->op), dom_map, true, del_trivial_loop); + InjectScanStep mu(s, scan_init.at(s->op), dom_map, true, debug_keep_trivial_loop); body = mu.Mutate(body); CHECK(mu.found_attach) << "did not find attachment point for scan.init"; } else if (attach_spec->attach_type == kScanUpdate) { // Handle scan update CHECK(body.defined()); - InjectScanStep mu(s, attach_spec->attach_stage->op, dom_map, false, del_trivial_loop); + InjectScanStep mu(s, attach_spec->attach_stage->op, dom_map, false, debug_keep_trivial_loop); body = mu.Mutate(body); CHECK(mu.found_attach) << "did not find attachment point for scan.update"; @@ -387,11 +389,11 @@ Stmt ScheduleOps( // do nothing } else if (attach_spec->attach_type == kGroupRoot) { CHECK(!s->group.defined()); - body = MakePipeline(s, dom_map, body, del_trivial_loop); + body = MakePipeline(s, dom_map, body, debug_keep_trivial_loop); } else { CHECK_EQ(attach_spec->attach_type, kScope); CHECK(body.defined()); - InjectAttach mutator(s, attach_spec, dom_map, del_trivial_loop); + InjectAttach mutator(s, attach_spec, dom_map, debug_keep_trivial_loop); body = mutator.Mutate(body); CHECK(mutator.found_attach) << "did not find attachment point for " << s << " in " From 525cb3bc6025c47b8b56fb3c73c3b82b88bc0a50 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 28 Mar 2018 16:35:17 -0700 Subject: [PATCH 232/948] [DOCS] Add logo and tracker to docs (#1064) --- docs/_static/css/tvm_theme.css | 10 ++++++++++ docs/_static/img/README | 2 ++ docs/_static/img/tvm-logo-small.png | Bin 0 -> 6683 bytes docs/conf.py | 8 ++++++++ 4 files changed, 20 insertions(+) create mode 100644 docs/_static/img/README create mode 100644 docs/_static/img/tvm-logo-small.png diff --git a/docs/_static/css/tvm_theme.css b/docs/_static/css/tvm_theme.css index 5e0838abf6cb..274589887b3f 100644 --- a/docs/_static/css/tvm_theme.css +++ b/docs/_static/css/tvm_theme.css @@ -9,3 +9,13 @@ nav .hidden-section { display: inherit; } + +.wy-side-nav-search { + background-color: #fff; + color: #333; +} + +.version{ + color: #404040 !important; +} + diff --git a/docs/_static/img/README b/docs/_static/img/README new file mode 100644 index 000000000000..414328cc729d --- /dev/null +++ b/docs/_static/img/README @@ -0,0 +1,2 @@ +The logo file in this repo is an exception due to the need of sphinx. +By default we avoid to put large binary blobs into this repo. 
\ No newline at end of file diff --git a/docs/_static/img/tvm-logo-small.png b/docs/_static/img/tvm-logo-small.png new file mode 100644 index 0000000000000000000000000000000000000000..c3519fece55b24bddca3dcf6d825fcfdef5576b1 GIT binary patch literal 6683 zcmZ{J1yodDzxE6r0z-p@LyFYU(lLOvbV+x^2t!GCm(&PKcS(sz4xJ(m0!m4TltaVC z_x*qGz29Byp0)Pbdq4a6J-^szopmBc3+)(37l0C4pH1#bfYo;(1+t~mf8iU0s8VeeZs z#UB=MVTy(x002Je-v$I^y{7^IfHMv{25i65N~g9Zf|~WH+MS-uZV~Ugoh8p$H(=+;PUWwg zkpE~un2P^JMP=PwVD2^^9uNAGyyE|2{(sni`}sFt-QB_FA=1C`{D=QPvH#82uy=#I zJygxz!Ai*$ZsY#o>>q2sf4u$QjDNJmA%7G9ABFgLmH&l4lvxr_9P*znlf*-t4}Q2p zC~KAEWORIh2S$Erx--7ZTb^e8Y|)wwA#5+KG-F{6$t~#G@4zj{FNsyBUX20ujU0O20XktO^YxhvZDHsnaWN=czo%!o z2_jz0@jFg{(%1-b=!TV$SC(AnoIjt;EE$KZ&!HhY(zHmm6yB|0{cI9c$?_rr$ED~j zPyT)~?{&bdSDkP*H8DH~m)N@gnT0Z<$;G`c+q}7CrlZD0`jp&$H7}_)jljlA)|8jT z=#vc!Xv_{?I}JyO5khMzM$h+JTH+548`?^24)}qV0h|<4-(S-LH#wry*a_~DE*1!? zpT}k&JFPAS=~O80-Gni+%epeTO)7q_LwhygzMS+@Bc~FpzVt;3ow)~LLVy>D+#;qD zfPmK_<-}eT5NKlGu|`P$OJ*j%0~5QgOFQgOGjQP<27;(7*kt^dRItTUy9UG6FEy}a z6CAXNt_?AFLeC+cuVg0AJBK8v?`mqeku-?4V`1VU(TFw}9%kN#ojP~o?z>8Mrfp;+Eb;_s-sS&2v8p=(2||zY7+KZQjXHZF$waH$>PDMCOv$G4ulct zv<~>&(mO?)E|7Y<1ez8Dgmm zVuTi^7v`zUo9LJ_UdX>MY;pW%Z%SO{Do`WK+Gf(~HFS7*I9V4JT<~xYhas;p*6Ah8$nvV)BQ? z2nSG}Hdf-8-mt~e@S-4p3@4(PT^kI>qGFRZ2wN?4Yg+XmT$%K(s}IVEl@tw^%@)+a ziQ{1aC~%My1PjAff-^+RS>MK6VCOxRa%K`@b1B9gc_Ayih|po@nA&;#?4?|`cgm%l z=*=*1#Mqb!;c6H_G=_YdEJ#JJ!a(n!_&w4d=Kn`=q)WgCNROVjGdK+u$EYHr-eo-1 z=B(AH`$%&qJT=bcf0eUZ+%WX(dT;aVh{7miYN7WGu2eWtC0}s9VaM$>k<`K zQ5Aa>&`hM&&+v%gk9XA#_fb-@Jk);RN^8WDSL8)A9|sAd%wxYO=TGp}c$?>*WvTz_ z(r>{WkGqHt$aSqhVt()Dx?*wnbNJ$*1tx=|LdaCo%oZ?d%6#*L%B;E7tQC9xCXs)e ze~n)--^@E_{JkW@x^@T8!NzE-pAjil+l@0aU9mnc!`Au%Q?x{>pu{w1asU$s$^EK0A zJs0A1^25>&@Uc0)c`8Fh1TQ3?WXcD3C`_P%jCp17&&GdqWTXcGWw)o(cfG!QaXzAv z!3ejY-n?v)gzrfEOB?9L8_Dkrw0+%IsCKkJ5Uwj}LhbrGK(CY((7&b-?f*p&dL#V-85)_vg%m>l2~i`dSY&|wZZ%5 zVsYO&=Zvht2#Z&*uewP1^RqOrwwHS?>pIb__KfF(KGhYkn-y=|7X{2#neq{+ z4f^|xLOGoegq<{<1`_gmbW1Ggi2MOh2Qd`gvv~J7g@-f6Uam$ZRMVZ$L7dE{@*O_T zQ`P9;L~127#j_LC)^H2-PNR2@V`@F(el)0IleAG1Q*6cjQu^6T1Tz;fR5f?ECE0Z5lko2szI?r4D;6e0bxtnaH*h|_$HjKt_S03yU zJMK)<2;(tKZ7Dm0O-F->T)~<)Cq6+6!>`51iHVh-J5u)jYaYP$i%@MOdMy&p9`1_R z9XfF5&disgqEu(zgmwhZ)HWVytxD)6~ zG)+_(+YZk7WoZD41(EEcr)r2DpRbnEa;ZD}F;^nHx7t3!*{{5!)%W!tM)I2$Mu9*c zQ=7KzpB!Z=|6s;UbA;P%F?KPx7_V=a< zzqfg`R?vw#A1L&J8~OM{HT6O}D0Bn|UmVR*OYD^jfb}nL|)0ssgQzf<%p+eJn}B^mZyYz-oz7klS^L6 zVB}d)jbV`bzoY!=wEsjz`w!Pab{_Ry9-D0zYnWE1P>u1Ww)B42JgfVJWvyA-HvIU9 zvLWT_F=f07!koGP4V0eZen6CyL8oN=5mvplQbbul@u6)Mvli1gG zzHZQjADKJq_x+o>VK#6x079Ye-;w;S^&R|UPF49AVMzOiS~VS{%bpvvM{Qa}gPqTt z5_yM~-CWIfr#+cucWS$E;=hjZbsU1^gVPmXGm*49*k}VTAho45|9{`R!Mx&Xl`h2sj zRAtYruTg4jYkGLxy(+vEwr6g{E6S5?(Dt2H@mj~3Zpo&trs~GrH>G}I^0>{9L9Zq+ z6&Md7E^bb{)YBP_Yj-vS&{nHLrXOM<>nxYzy^Q$Z^rD>tk|RgxK`9g@0(YIkiv>+a zG*_9qfqB?RSCeRqf#ZDIBekQWM*`9C9r>S3Z2hoJzzQPkv$Lgj21j>UrSh0r8 z{W8CK8MPqbf^xP}*kb5xudAcn7`2%?+i4OB5ViIZ-fZw5WNZ=P$+t3uPpLVruMa#) zyG9+LhYYD8p8O$0>tY~kzVNSiY5j3gT4F<&eRO4GpVceUR|?YXz*si=Vj{Q5K*8pI zM=?M0Eg0C6tQTEE8F#<;_!D)^sXVup1ZZaxo@z(G?X7O)aqHid+aB01cuYs$tc8|T zUGPHc*Ld^my9&dG9JhM?CBVRrBb3T#`pGv*R4!qyIVKL z!Fa}N$x!VS&@QKugxy%Ki_sE7a6!8kV@?{DAbvwJepOMG-b{HU1O+f7WEr#rl*+b0 zpzA5dlHP;>cwz_2-g~EmJm~vlV^bv-e@qzW@8zux&^MS^BQ^IESb553#?=LMFtAUD zgi)3Alz$`(A~Ire?H4Ez{a?)2Vgx3RR3WaehNnBTMq9rl$n%)^$MGUG$w%T%LRoHg zvhI;du;(9ZDeqGdDf9k>YFWRufEL&7@IJaKY;In^vG?7{P0^ZpFP~tq8i?49?Il!K zI`ixYjtC_6eTj)1(lqC4>j1ib_g*+9`Wc?wD)E~YS%5ZmMUh`2QdS3IGa5HGuw=HJ zQswYsywk}oDXE466V?li5N=iPS}U% zBV#*&qtWqm2j-JC$#G{9>1^?6*_IUw97(J_f9z2w5aT`Cu7+#=DBA1F4XWLBd*?cI 
z;(ksgbkarkLQII%z&V8H)J$)PCNC)YJJrJ1cd?S1X!}fR$G{E^Lm|>Pf53(O7LjYS*RSpcW73xr=uIKscFea3y$&d zi{kw$TWjlwJ6iFPIHyt1{8A|5={q+nQ7>HwVN!JvVBuX)AGb1gckE}nLE4p*;--;~ zd+I4~byoD0!$6(rC!=VxwonfBvYIa-+0`Dc?e&Rc4)Y7@ZA!clS!i(ZBRAz&sq8VT zQ%+#TnptVZlb^XOS z?@EL;h_3|9-rt>>jjO^N5?-h@uyjY0A}ac7ZpX3EVx)txgN;PQ z_;a`mQT#o2Fn>mEZ5nNIM{5#{U8K5-Ln*wNO!@rwGw-`|2IP8OfXxj9p~V#$Zoy+| z9Mf*GQ{o~KforKqmS9{ibDl%2?dqed94n$ssi&f z7%(p<+(_XZrq3?z>+i|+BULil=9)=ua?T?Qc50mRP6?nBax2S(C)(9>{7l0U(@Jo{}(kp=;MRjLqTpr|wUaDHIYk)S(+Wky?=dMz|>cK~q{I<~#uAg}|E z%_E@PJX>gxB|FgeTsbL2uH5IXj-zp*Gt}??x^V)Sf^o!%VJ_;bn{(mIE_4PU{C{Hl>83g8fXMb}jwnB2qQxa2COmhvBBAP`#Wo2)L>3Wjz{EfnMU^B)ewcG7gSez4`8Usm? za=kkSD5l_@4kaE|s_|D(wu?f8f*s+2u9c;p-4+&tki68%7Y^7#QZTu&2y8h6m9_(( zTQTYtP5RHQ+;?_MS6w1|m=BLa(0hQ96dwHtDf$JTgr%$aHXFdD)g2d}^va~{9m8=L zsfwasMvryi*(iND(tgEU5EcMu;uoI=}Dgw0l$Y@&HZ5c zp>X%A=xDH2$4TUhB~M(3oXmop6VO9=gsvlNs>igui%<58)KCjr24r?0ueWw9e?#B* z#5lkIN9ZE2csE4LH9W{2eKud%@`MoUbFI9Kz45GRrSH!q{e?Kwr-`>d>5R}y9-VpH zm5WcLUnlvl�<59rn*Z!a?KV1NWH%x5g(ha~Hr9*CqXmc)-TX?`1S5-|I#>FHz2UTjSP<0k9z-aT{%9xjA&SE(q? z_CW2hPNd#54>}4DWJr1LHWbwsu)kTC>KB->$hSOX!;eMvD_G|BQdK)9{cd{m#RVBF zie2CmWVz(KM1_>Qd0%SOX&6w6(i?08?wi*>9=1Zmv@PO0+A(?eqi^V!uEhBu#S6e{PS96|UhV3=5c!MjagkcR{}8Bd+> z$`X6Yc9X{9+Y%G|W)nln8w;A644UTJFHaES-D7#F)RPsAoO;2C=nv1hdcw4K^!Wm5 zl`Pq)ri94akoOZ(@tDYYFlAB*nUas&+fr}#l$9(|v%^EZpcPPt{u>OJIEBwqVj$dp z=Z!zx$YdC^4*8WfMhxcRQ1nd?=)FAjW*gnVu*x|Y`@Jr-qS~}R`h2s43%(_4(AYTu zu-h&f>YAw%1~a~!e|FT(3E9+>X5eJimfd6oRY>d$U+$5VlW`}!V%)Q9>fE!=`ABGC z&Na#&T5{1!4tjMP0OihBWg(GY!fL9yi;zMg9a&Su3qy4iRa`W+blbPaU$G@*s8TEa zw8g_jcB5wFF@h!1$~W+FpxHfh;E*fbrO$x`*{5a8#d`{V(&i*^N$|t~ zP{jc3igm_0>oKZhU0mxGp~04BqkXnz)D3&mgFEPNJe0G)9f9WRarX^tvEMMN0C)41 zpHIQ_$^{=YzB`0p Date: Thu, 29 Mar 2018 22:09:38 +0530 Subject: [PATCH 233/948] [WIP] Linux/Android native deploy (#980) --- Makefile | 4 + apps/android_deploy/.gitignore | 9 + apps/android_deploy/README.md | 119 ++++ apps/android_deploy/app/.gitignore | 1 + apps/android_deploy/app/build.gradle | 56 ++ .../android_deploy/app/download-models.gradle | 64 ++ .../app/src/main/AndroidManifest.xml | 37 + .../dmlc/tvm/android/demo/MainActivity.java | 633 ++++++++++++++++++ .../app/src/main/jni/Android.mk | 42 ++ .../app/src/main/jni/Application.mk | 16 + apps/android_deploy/app/src/main/jni/build.sh | 9 + .../app/src/main/jni/make/config.mk | 26 + .../app/src/main/jni/tvm_runtime.h | 26 + .../app/src/main/res/layout/activity_main.xml | 27 + .../app/src/main/res/layout/content_main.xml | 46 ++ .../app/src/main/res/values/colors.xml | 6 + .../app/src/main/res/values/strings.xml | 3 + .../app/src/main/res/values/styles.xml | 17 + .../app/src/main/res/xml/provider_paths.xml | 4 + apps/android_deploy/build.gradle | 29 + apps/android_deploy/dev_tools/gen_keystore.sh | 3 + apps/android_deploy/dev_tools/sign_apk.sh | 7 + apps/android_deploy/settings.gradle | 1 + docs/how_to/deploy_android.md | 40 ++ 24 files changed, 1225 insertions(+) create mode 100644 apps/android_deploy/.gitignore create mode 100644 apps/android_deploy/README.md create mode 100644 apps/android_deploy/app/.gitignore create mode 100644 apps/android_deploy/app/build.gradle create mode 100644 apps/android_deploy/app/download-models.gradle create mode 100644 apps/android_deploy/app/src/main/AndroidManifest.xml create mode 100644 apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java create mode 100644 apps/android_deploy/app/src/main/jni/Android.mk create mode 100644 apps/android_deploy/app/src/main/jni/Application.mk create mode 100644 
apps/android_deploy/app/src/main/jni/build.sh
 create mode 100644 apps/android_deploy/app/src/main/jni/make/config.mk
 create mode 100644 apps/android_deploy/app/src/main/jni/tvm_runtime.h
 create mode 100644 apps/android_deploy/app/src/main/res/layout/activity_main.xml
 create mode 100644 apps/android_deploy/app/src/main/res/layout/content_main.xml
 create mode 100644 apps/android_deploy/app/src/main/res/values/colors.xml
 create mode 100644 apps/android_deploy/app/src/main/res/values/strings.xml
 create mode 100644 apps/android_deploy/app/src/main/res/values/styles.xml
 create mode 100644 apps/android_deploy/app/src/main/res/xml/provider_paths.xml
 create mode 100644 apps/android_deploy/build.gradle
 create mode 100644 apps/android_deploy/dev_tools/gen_keystore.sh
 create mode 100644 apps/android_deploy/dev_tools/sign_apk.sh
 create mode 100644 apps/android_deploy/settings.gradle
 create mode 100644 docs/how_to/deploy_android.md
diff --git a/Makefile b/Makefile
index 9e15d60e8c9b..1732ed5d9623 100644
--- a/Makefile
+++ b/Makefile
@@ -128,6 +128,10 @@ ifeq ($(USE_OPENCL), 1)
 	LDFLAGS += -lOpenCL
 endif
 RUNTIME_DEP += $(OPENCL_OBJ)
+ifdef OPENCL_PATH
+	CFLAGS += -I$(OPENCL_PATH)/include
+	LDFLAGS += -L$(OPENCL_PATH)/lib
+endif
 else
 	CFLAGS += -DTVM_OPENCL_RUNTIME=0
 endif
diff --git a/apps/android_deploy/.gitignore b/apps/android_deploy/.gitignore
new file mode 100644
index 000000000000..39fb081a42a8
--- /dev/null
+++ b/apps/android_deploy/.gitignore
@@ -0,0 +1,9 @@
+*.iml
+.gradle
+/local.properties
+/.idea/workspace.xml
+/.idea/libraries
+.DS_Store
+/build
+/captures
+.externalNativeBuild
diff --git a/apps/android_deploy/README.md b/apps/android_deploy/README.md
new file mode 100644
index 000000000000..9620a2f1efde
--- /dev/null
+++ b/apps/android_deploy/README.md
@@ -0,0 +1,119 @@
+# Android TVM Demo
+
+This folder contains an Android demo app that shows how to deploy a model with the TVM runtime API on an Android phone.
+
+You will need [JDK](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html), [Android SDK](https://developer.android.com/studio/index.html), [Android NDK](https://developer.android.com/ndk) and an Android device to use this.
+
+## Build and Installation
+
+### Build APK
+
+We use [Gradle](https://gradle.org) to build. Please follow [the installation instructions](https://gradle.org/install) for your operating system.
+
+Before you build the Android application, please refer to the [TVM4J Installation Guide](https://github.com/dmlc/tvm/blob/master/jvm/README.md) and install tvm4j-core to your local maven repository. You can find the tvm4j dependency declaration in `app/build.gradle`. Modify it if necessary.
+
+```
+dependencies {
+    compile fileTree(dir: 'libs', include: ['*.jar'])
+    androidTestCompile('com.android.support.test.espresso:espresso-core:2.2.2', {
+        exclude group: 'com.android.support', module: 'support-annotations'
+    })
+    compile 'com.android.support:appcompat-v7:26.0.1'
+    compile 'com.android.support.constraint:constraint-layout:1.0.2'
+    compile 'com.android.support:design:26.0.1'
+    compile 'ml.dmlc.tvm:tvm4j-core:0.0.1-SNAPSHOT'
+    testCompile 'junit:junit:4.12'
+}
+```
+
+By default the application is built with the CPU flavor of the TVM runtime; follow the instructions below to set it up.
+In `app/src/main/jni/make` you will find the JNI Makefile config `config.mk`; copy it to `app/src/main/jni` and modify it.
+
+```bash
+cd apps/android_deploy/app/src/main/jni
+cp make/config.mk .
+```
+
+Here is an example `config.mk`:
+
+```makefile
+APP_ABI = arm64-v8a
+
+APP_PLATFORM = android-17
+
+# whether enable OpenCL during compile
+USE_OPENCL = 0
+```
+
+Now use Gradle to compile the JNI code, resolve Java dependencies and build the Android application together with tvm4j. Run the following script to generate the APK file.
+
+```bash
+export ANDROID_HOME=[Path to your Android SDK, e.g., ~/Android/sdk]
+cd apps/android_deploy
+gradle clean build
+```
+
+In `app/build/outputs/apk` you will find `app-release-unsigned.apk`. Use `dev_tools/gen_keystore.sh` to generate a signing key and `dev_tools/sign_apk.sh` to produce the signed APK file `app/build/outputs/apk/tvmdemo-release.apk`.
+
+Upload `tvmdemo-release.apk` to your Android device and install it.
+
+### Build with OpenCL
+
+The application does not link against the OpenCL library unless you configure it to. Modify the JNI Makefile config in `app/src/main/jni` with the proper OpenCL configuration for your target.
+
+Here is an example `config.mk`:
+
+```makefile
+APP_ABI = arm64-v8a
+
+APP_PLATFORM = android-17
+
+# whether enable OpenCL during compile
+USE_OPENCL = 1
+
+# the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc
+ADD_C_INCLUDES = /opt/adrenosdk-osx/Development/Inc
+
+# the additional link libs you want to add, e.g., ANDROID_LIB_PATH/libOpenCL.so
+ADD_LDLIBS = libOpenCL.so
+```
+
+Note that you should specify the correct GPU development headers for your Android device. Run `adb shell dumpsys | grep GLES` to find out which GPU your Android device uses. It is very likely that the library (libOpenCL.so) is already present on the device. For instance, I found it under `/system/vendor/lib64`. You can do `adb pull /system/vendor/lib64/libOpenCL.so ./` to get the file to your desktop.
+
+After you set up `config.mk`, follow the instructions in [Build APK](#buildapk) to build the Android package with the OpenCL flavor.
+
+## Cross Compile and Run on Android Devices
+
+### Architecture and Android Standalone Toolchain
+
+To cross-compile a shared library (.so) for your Android device, you need to know the target triple for the device. (Refer to [Cross-compilation using Clang](https://clang.llvm.org/docs/CrossCompilation.html) for more information.) Run `adb shell cat /proc/cpuinfo` to list the device's CPU information.
+
+Now use the NDK to generate a standalone toolchain for your device. For my test device, I used the following command.
+
+```bash
+cd /opt/android-ndk/build/tools/
+./make-standalone-toolchain.sh --platform=android-24 --use-llvm --arch=arm64 --install-dir=/opt/android-toolchain-arm64
+```
+
+If everything goes well, you will find the compiler tools in `/opt/android-toolchain-arm64/bin`. For example, `bin/aarch64-linux-android-g++` can be used to compile C++ source code and create shared libraries for arm64 Android devices.
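For readers who want to see how the `deploy_lib.so` / `deploy_graph.json` / `deploy_param.params` files mentioned in the next section can be produced, below is a rough Python-side sketch. It assumes the NNVM/TVM Python APIs of this period, a toy network standing in for the real darknet extraction model, and that the standalone toolchain generated above lives at the path shown; treat it as an illustration rather than the supported flow, which is described in `docs/how_to/deploy_android.md` added by this patch.

```python
import os
import nnvm.symbol as sym
import nnvm.compiler
from tvm.contrib import ndk

# Toy stand-in network; a real deployment would import e.g. the darknet
# extraction model through an NNVM frontend instead.
data = sym.Variable("data")
net = sym.conv2d(data=data, channels=8, kernel_size=(3, 3), padding=(1, 1), name="conv1")
net = sym.flatten(net)
net = sym.dense(net, units=10, name="fc1")

shape_dict = {"data": (1, 3, 224, 224)}
target = "llvm -target=aarch64-linux-android"  # arm64 Android CPU target

graph, lib, params = nnvm.compiler.build(net, target, shape_dict)

# Point TVM at the standalone-toolchain compiler generated above (assumed path).
os.environ["TVM_NDK_CC"] = "/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++"
lib.export_library("deploy_lib.so", ndk.create_shared)

with open("deploy_graph.json", "w") as f:
    f.write(graph.json())
with open("deploy_param.params", "wb") as f:
    f.write(nnvm.compiler.save_param_dict(params))
```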
+
+### Place compiled model on Android application assets folder
+
+Follow the instructions [here](https://github.com/dmlc/tvm/blob/master/docs/how_to/deploy_android.md#build-model-for-android-target) to build a compiled model for the Android target.
+
+Copy the compiled model files deploy_lib.so, deploy_graph.json and deploy_param.params to apps/android_deploy/app/src/main/assets/ and set the TVM flavor in [MainActivity.java](https://github.com/dmlc/tvm/blob/master/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java#L81):
+
+`CPU Version flavor`
+```
+    private static final boolean EXE_GPU = false;
+```
+
+`OpenCL Version flavor`
+```
+    private static final boolean EXE_GPU = true;
+```
+
+
+Install the compiled Android application on your phone and try the image classifier demo, which uses the extraction model.
+
+You can define your own TVM operators and deploy them via this demo application on your Android device to find the most optimized TVM schedule.
diff --git a/apps/android_deploy/app/.gitignore b/apps/android_deploy/app/.gitignore
new file mode 100644
index 000000000000..796b96d1c402
--- /dev/null
+++ b/apps/android_deploy/app/.gitignore
@@ -0,0 +1 @@
+/build
diff --git a/apps/android_deploy/app/build.gradle b/apps/android_deploy/app/build.gradle
new file mode 100644
index 000000000000..6790308a9ec4
--- /dev/null
+++ b/apps/android_deploy/app/build.gradle
@@ -0,0 +1,56 @@
+// import DownloadModels task
+project.ext.ASSET_DIR = projectDir.toString() + '/src/main/assets'
+project.ext.TMP_DIR = project.buildDir.toString() + '/downloads'
+
+// Download default models (compiled version of the darknet framework extraction model);
+// if you wish to use your own models then place them in the "assets" directory
+// and comment out this line.
+apply from: "download-models.gradle"
+
+apply plugin: 'com.android.application'
+
+task buildJni(type: Exec, description: 'Build JNI libs') {
+    commandLine 'sh', 'src/main/jni/build.sh'
+}
+
+tasks.withType(JavaCompile) {
+    compileTask -> compileTask.dependsOn buildJni
+}
+
+android {
+    compileSdkVersion 26
+    buildToolsVersion "26.0.1"
+    defaultConfig {
+        applicationId "ml.dmlc.tvm.android.demo"
+        minSdkVersion 17
+        targetSdkVersion 26
+        versionCode 1
+        versionName "1.0"
+        testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner"
+    }
+    buildTypes {
+        release {
+            minifyEnabled false
+            proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
+        }
+    }
+    sourceSets {
+        main {
+            jni.srcDirs = []
+            jniLibs.srcDirs = ['src/main/libs']
+            assets.srcDirs = [project.ext.ASSET_DIR]
+        }
+    }
+}
+
+dependencies {
+    compile fileTree(dir: 'libs', include: ['*.jar'])
+    androidTestCompile('com.android.support.test.espresso:espresso-core:2.2.2', {
+        exclude group: 'com.android.support', module: 'support-annotations'
+    })
+    compile 'com.android.support:appcompat-v7:26.0.1'
+    compile 'com.android.support.constraint:constraint-layout:1.0.2'
+    compile 'com.android.support:design:26.0.1'
+    compile 'ml.dmlc.tvm:tvm4j-core:0.0.1-SNAPSHOT'
+    testCompile 'junit:junit:4.12'
+}
diff --git a/apps/android_deploy/app/download-models.gradle b/apps/android_deploy/app/download-models.gradle
new file mode 100644
index 000000000000..5b0509fbca2b
--- /dev/null
+++ b/apps/android_deploy/app/download-models.gradle
@@ -0,0 +1,64 @@
+/*
+ * download-models.gradle
+ *    Downloads model files from ${MODEL_URL} into application's asset folder
+ * Input:
+ *    project.ext.TMP_DIR: absolute path to hold downloaded zip files
+ *    project.ext.ASSET_DIR: absolute path to save
unzipped model files + * Output: + * 3 model files will be downloaded into given folder of ext.ASSET_DIR + */ +// hard coded model files +def models = ['extraction.zip'] + +// Root URL for model archives +def MODEL_URL = 'https://github.com/PariksheetPinjari909/TVM_models/blob/master/extraction_model' +buildscript { + repositories { + jcenter() + } + dependencies { + classpath 'de.undercouch:gradle-download-task:3.2.0' + } +} + +import de.undercouch.gradle.tasks.download.Download +task downloadFile(type: Download){ + for (f in models) { + src "${MODEL_URL}/" + f + "?raw=true" + dest new File(project.ext.TMP_DIR + "/" + f) + } + overwrite true +} + +task extractModels(type: Copy) { + def needDownload = false + for (f in models) { + def localFile = f.split("/")[-1] + if (!(new File(project.ext.TMP_DIR + '/' + localFile)).exists()) { + needDownload = true + } + } + + if (needDownload) { + dependsOn downloadFile + } + + for (f in models) { + def localFile = f.split("/")[-1] + from zipTree(project.ext.TMP_DIR + '/' + localFile) + } + + into file(project.ext.ASSET_DIR) + fileMode 0644 + exclude '**/LICENSE' +} + +tasks.whenTaskAdded { task -> + if (task.name == 'assembleDebug') { + task.dependsOn 'extractModels' + } + if (task.name == 'assembleRelease') { + task.dependsOn 'extractModels' + } +} + diff --git a/apps/android_deploy/app/src/main/AndroidManifest.xml b/apps/android_deploy/app/src/main/AndroidManifest.xml new file mode 100644 index 000000000000..bac82ee90faa --- /dev/null +++ b/apps/android_deploy/app/src/main/AndroidManifest.xml @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java b/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java new file mode 100644 index 000000000000..f3cdefe1c2ff --- /dev/null +++ b/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java @@ -0,0 +1,633 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package ml.dmlc.tvm.android.demo; + +import android.Manifest; +import android.content.Intent; +import android.content.pm.PackageManager; +import android.content.res.AssetManager; +import android.app.AlertDialog; +import android.app.ProgressDialog; +import android.content.DialogInterface; +import android.graphics.Bitmap; +import android.graphics.BitmapFactory; +import android.graphics.Canvas; +import android.graphics.Matrix; +import android.net.Uri; +import android.os.AsyncTask; +import android.os.Build; +import android.os.Bundle; +import android.os.Environment; +import android.os.SystemClock; +import android.provider.MediaStore; +import android.support.v4.content.FileProvider; +import android.support.v7.app.AppCompatActivity; +import android.support.v7.widget.Toolbar; +import android.util.Log; +import android.view.View; +import android.widget.ImageView; +import android.widget.TextView; +import android.widget.Toast; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.InputStream; +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Vector; + +import ml.dmlc.tvm.Function; +import ml.dmlc.tvm.Module; +import ml.dmlc.tvm.NDArray; +import ml.dmlc.tvm.TVMContext; +import ml.dmlc.tvm.TVMValue; +import ml.dmlc.tvm.TVMType; + +public class MainActivity extends AppCompatActivity { + private static final String TAG = MainActivity.class.getSimpleName(); + + private static final int PERMISSIONS_REQUEST = 100; + private static final int PICTURE_FROM_GALLERY = 101; + private static final int PICTURE_FROM_CAMERA = 102; + private static final int IMAGE_PREVIEW_WIDTH = 960; + private static final int IMAGE_PREVIEW_HEIGHT = 720; + + // TVM constants + private static final int OUTPUT_INDEX = 0; + private static final int IMG_CHANNEL = 3; + private static final String INPUT_NAME = "data"; + + // Configuration values for extraction model. Note that the graph, lib and params is not + // included with TVM and must be manually placed in the assets/ directory by the user. + // Graphs and models downloaded from https://github.com/pjreddie/darknet/blob/ may be + // converted e.g. via define_and_compile_model.py. 
+ private static final boolean EXE_GPU = false; + private static final int MODEL_INPUT_SIZE = 224; + private static final String MODEL_CL_LIB_FILE = "file:///android_asset/deploy_lib_opencl.so"; + private static final String MODEL_CPU_LIB_FILE = "file:///android_asset/deploy_lib_cpu.so"; + private static final String MODEL_GRAPH_FILE = "file:///android_asset/deploy_graph.json"; + private static final String MODEL_PARAM_FILE = "file:///android_asset/deploy_param.params"; + private static final String MODEL_LABEL_FILE = "file:///android_asset/imagenet.shortnames.list"; + + private Uri mCameraImageUri; + private ImageView mImageView; + private TextView mResultView; + private AssetManager assetManager; + private Module graphRuntimeModule; + private Vector labels = new Vector(); + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + setContentView(R.layout.activity_main); + Toolbar toolbar = findViewById(R.id.toolbar); + setSupportActionBar(toolbar); + assetManager = getAssets(); + + mImageView = (ImageView) findViewById(R.id.imageView); + mResultView = (TextView) findViewById(R.id.resultTextView); + findViewById(R.id.btnPickImage).setOnClickListener(new View.OnClickListener() { + @Override + public void onClick(View v) { + showPictureDialog(); + } + }); + + if (hasPermission()) { + // instantiate tvm runtime and setup environment on background after application begin + new LoadModleAsyncTask().execute(); + } else { + requestPermission(); + } + } + + /* + Load precompiled model on TVM graph runtime and init the system. + */ + private class LoadModleAsyncTask extends AsyncTask { + ProgressDialog dialog = new ProgressDialog(MainActivity.this); + + @Override + protected Integer doInBackground(Void... args) { + + // load synset name + String lableFilename = MODEL_LABEL_FILE.split("file:///android_asset/")[1]; + Log.i(TAG, "Reading synset name from: " + lableFilename); + try { + String labelsContent = new String(getBytesFromFile(assetManager, lableFilename)); + for (String line : labelsContent.split("\\r?\\n")) { + labels.add(line); + } + } catch (IOException e) { + Log.e(TAG, "Problem reading synset name file!" + e); + return -1;//failure + } + + // load json graph + String modelGraph = null; + String graphFilename = MODEL_GRAPH_FILE.split("file:///android_asset/")[1]; + Log.i(TAG, "Reading json graph from: " + graphFilename); + try { + modelGraph = new String(getBytesFromFile(assetManager, graphFilename)); + } catch (IOException e) { + Log.e(TAG, "Problem reading json graph file!" + e); + return -1;//failure + } + + // upload tvm compiled function on application cache folder + String libCacheFilePath = null; + String libFilename = EXE_GPU ? MODEL_CL_LIB_FILE.split("file:///android_asset/")[1] : + MODEL_CPU_LIB_FILE.split("file:///android_asset/")[1]; + Log.i(TAG, "Uploading compiled function to cache folder"); + try { + libCacheFilePath = getTempLibFilePath(libFilename); + byte[] modelLibByte = getBytesFromFile(assetManager, libFilename); + FileOutputStream fos = new FileOutputStream(libCacheFilePath); + fos.write(modelLibByte); + fos.close(); + } catch (IOException e) { + Log.e(TAG, "Problem uploading compiled function!" + e); + return -1;//failure + } + + // load parameters + byte[] modelParams = null; + String paramFilename = MODEL_PARAM_FILE.split("file:///android_asset/")[1]; + try { + modelParams = getBytesFromFile(assetManager, paramFilename); + } catch (IOException e) { + Log.e(TAG, "Problem reading params file!" 
+ e); + return -1;//failure + } + + // create java tvm context + TVMContext tvmCtx = EXE_GPU ? TVMContext.opencl() : TVMContext.cpu(); + + // tvm module for compiled functions + Module modelLib = Module.load(libCacheFilePath); + + // get global function module for graph runtime + Function runtimeCreFun = Function.getFunction("tvm.graph_runtime.create"); + TVMValue runtimeCreFunRes = runtimeCreFun.pushArg(modelGraph) + .pushArg(modelLib) + .pushArg(tvmCtx.deviceType) + .pushArg(tvmCtx.deviceId) + .invoke(); + graphRuntimeModule = runtimeCreFunRes.asModule(); + + // get the function from the module(load parameters) + Function loadParamFunc = graphRuntimeModule.getFunction("load_params"); + loadParamFunc.pushArg(modelParams).invoke(); + + // release tvm local variables + modelLib.release(); + loadParamFunc.release(); + runtimeCreFun.release(); + + return 0;//success + } + + @Override + protected void onPreExecute() { + dialog.setCancelable(false); + dialog.setMessage("Loading Model..."); + dialog.show(); + super.onPreExecute(); + } + + @Override + protected void onPostExecute(Integer status) { + if (dialog != null && dialog.isShowing()) { + dialog.dismiss(); + } + if (status != 0) { + showDialog("Error", "Fail to initialized model, check compiled model"); + } + } + } + + /* + Execute prediction for processed decode input bitmap image content on TVM graph runtime. + */ + private class ModelRunAsyncTask extends AsyncTask { + ProgressDialog dialog = new ProgressDialog(MainActivity.this); + + @Override + protected Integer doInBackground(Bitmap... bitmaps) { + if (null != graphRuntimeModule) { + int count = bitmaps.length; + for (int i = 0 ; i < count ; i++) { + long processingTimeMs = SystemClock.uptimeMillis(); + Log.i(TAG, "Decode JPEG image content"); + + // extract the jpeg content + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + bitmaps[i].compress(Bitmap.CompressFormat.JPEG,100,stream); + byte[] byteArray = stream.toByteArray(); + Bitmap imageBitmap = BitmapFactory.decodeByteArray(byteArray, 0, byteArray.length); + + // crop input image at centre to model input size + // commecial deploy note:: instead of cropying image do resize + // image to model input size so we never lost the image content + Bitmap cropImageBitmap = Bitmap.createBitmap(MODEL_INPUT_SIZE, MODEL_INPUT_SIZE, Bitmap.Config.ARGB_8888); + Matrix frameToCropTransform = getTransformationMatrix(imageBitmap.getWidth(), imageBitmap.getHeight(), + MODEL_INPUT_SIZE, MODEL_INPUT_SIZE, 0, true); + Canvas canvas = new Canvas(cropImageBitmap); + canvas.drawBitmap(imageBitmap, frameToCropTransform, null); + + // image pixel int values + int[] pixelValues = new int[MODEL_INPUT_SIZE * MODEL_INPUT_SIZE]; + // image RGB float values + float[] imgRgbValues = new float[MODEL_INPUT_SIZE * MODEL_INPUT_SIZE * IMG_CHANNEL]; + // image RGB transpose float values + float[] imgRgbTranValues = new float[MODEL_INPUT_SIZE * MODEL_INPUT_SIZE * IMG_CHANNEL]; + + // pre-process the image data from 0-255 int to normalized float based on the + // provided parameters. + cropImageBitmap.getPixels(pixelValues, 0, MODEL_INPUT_SIZE, 0, 0, MODEL_INPUT_SIZE, MODEL_INPUT_SIZE); + for (int j = 0; j < pixelValues.length; ++j) { + imgRgbValues[j * 3 + 0] = ((pixelValues[j] >> 16) & 0xFF)/255.0f; + imgRgbValues[j * 3 + 1] = ((pixelValues[j] >> 8) & 0xFF)/255.0f; + imgRgbValues[j * 3 + 2] = (pixelValues[j] & 0xFF)/255.0f; + } + + // pre-process the image rgb data transpose based on the provided parameters. 
+ for (int k = 0; k < IMG_CHANNEL; ++k) { + for (int l = 0; l < MODEL_INPUT_SIZE; ++l) { + for (int m = 0; m < MODEL_INPUT_SIZE; ++m) { + int dst_index = m + MODEL_INPUT_SIZE*l + MODEL_INPUT_SIZE*MODEL_INPUT_SIZE*k; + int src_index = k + IMG_CHANNEL*m + IMG_CHANNEL*MODEL_INPUT_SIZE*l; + imgRgbTranValues[dst_index] = imgRgbValues[src_index]; + } + } + } + + // get the function from the module(set input data) + Log.i(TAG, "set input data"); + NDArray inputNdArray = NDArray.empty(new long[]{1, IMG_CHANNEL, MODEL_INPUT_SIZE, MODEL_INPUT_SIZE}, new TVMType("float32"));; + inputNdArray.copyFrom(imgRgbTranValues); + Function setInputFunc = graphRuntimeModule.getFunction("set_input"); + setInputFunc.pushArg(INPUT_NAME).pushArg(inputNdArray).invoke(); + // release tvm local variables + inputNdArray.release(); + setInputFunc.release(); + + // get the function from the module(run it) + Log.i(TAG, "run function on target"); + Function runFunc = graphRuntimeModule.getFunction("run"); + runFunc.invoke(); + // release tvm local variables + runFunc.release(); + + // get the function from the module(get output data) + Log.i(TAG, "get output data"); + NDArray outputNdArray = NDArray.empty(new long[]{1000}, new TVMType("float32")); + Function getOutputFunc = graphRuntimeModule.getFunction("get_output"); + getOutputFunc.pushArg(OUTPUT_INDEX).pushArg(outputNdArray).invoke(); + float[] output = outputNdArray.asFloatArray(); + // release tvm local variables + outputNdArray.release(); + getOutputFunc.release(); + + // display the result from extracted output data + if (null != output) { + int maxPosition = -1; + float maxValue = 0; + for (int j = 0; j < output.length; ++j) { + if (output[j] > maxValue) { + maxValue = output[j]; + maxPosition = j; + } + } + processingTimeMs = SystemClock.uptimeMillis() - processingTimeMs; + String label = "Prediction Result : "; + label += labels.size() > maxPosition ? labels.get(maxPosition) : "unknown"; + label += "\nPrediction Time : " + processingTimeMs + "ms"; + mResultView.setText(label); + } + Log.i(TAG, "prediction finished"); + } + return 0; + } + return -1; + } + + @Override + protected void onPreExecute() { + dialog.setCancelable(false); + dialog.setMessage("Prediction running on image..."); + dialog.show(); + super.onPreExecute(); + } + + @Override + protected void onPostExecute(Integer status) { + if (dialog != null && dialog.isShowing()) { + dialog.dismiss(); + } + if (status != 0) { + showDialog("Error", "Fail to predict image, GraphRuntime exception"); + } + } + } + + @Override + protected void onDestroy() { + // release tvm local variables + if (null != graphRuntimeModule) + graphRuntimeModule.release(); + super.onDestroy(); + } + + /** + * Read file from assets and return byte array. + * + * @param assets The asset manager to be used to load assets. + * @param fileName The filepath of read file. 
+ * @return byte[] file content + * @throws IOException + */ + private byte[] getBytesFromFile(AssetManager assets, String fileName) throws IOException { + InputStream is = assets.open(fileName); + int length = is.available(); + byte[] bytes = new byte[length]; + // Read in the bytes + int offset = 0; + int numRead = 0; + try { + while (offset < bytes.length + && (numRead = is.read(bytes, offset, bytes.length - offset)) >= 0) { + offset += numRead; + } + } finally { + is.close(); + } + // Ensure all the bytes have been read in + if (offset < bytes.length) { + throw new IOException("Could not completely read file " + fileName); + } + return bytes; + } + + /** + * Dialog show pick option for select image from Gallery or Camera. + */ + private void showPictureDialog(){ + AlertDialog.Builder pictureDialog = new AlertDialog.Builder(this); + pictureDialog.setTitle("Select Action"); + String[] pictureDialogItems = { + "Select photo from gallery", + "Capture photo from camera" }; + pictureDialog.setItems(pictureDialogItems, + new DialogInterface.OnClickListener() { + @Override + public void onClick(DialogInterface dialog, int which) { + switch (which) { + case 0: + choosePhotoFromGallery(); + break; + case 1: + takePhotoFromCamera(); + break; + } + } + }); + pictureDialog.show(); + } + + /** + * Request to pick image from Gallery. + */ + public void choosePhotoFromGallery() { + Intent galleryIntent = new Intent(Intent.ACTION_PICK, + android.provider.MediaStore.Images.Media.EXTERNAL_CONTENT_URI); + + startActivityForResult(galleryIntent, PICTURE_FROM_GALLERY); + } + + /** + * Request to capture image from Camera. + */ + private void takePhotoFromCamera() { + Intent intent = new Intent(android.provider.MediaStore.ACTION_IMAGE_CAPTURE); + + if (Build.VERSION.SDK_INT < Build.VERSION_CODES.N) { + mCameraImageUri = Uri.fromFile(createImageFile()); + } else { + File file = new File(createImageFile().getPath()); + mCameraImageUri = FileProvider.getUriForFile(getApplicationContext(), getApplicationContext().getPackageName() + ".provider", file); + } + + intent.putExtra(MediaStore.EXTRA_OUTPUT, mCameraImageUri); + startActivityForResult(intent, PICTURE_FROM_CAMERA); + } + + @Override + public void onActivityResult(int requestCode, int resultCode, Intent data) { + super.onActivityResult(requestCode, resultCode, data); + if (resultCode == this.RESULT_CANCELED) { + return; + } + Uri contentURI = null; + if (requestCode == PICTURE_FROM_GALLERY) { + if (data != null) { + contentURI = data.getData(); + } + } else if (requestCode == PICTURE_FROM_CAMERA) { + contentURI = mCameraImageUri; + } + if (null != contentURI) { + try { + Bitmap bitmap = MediaStore.Images.Media.getBitmap(this.getContentResolver(), contentURI); + Bitmap scaled = Bitmap.createScaledBitmap(bitmap, IMAGE_PREVIEW_HEIGHT, IMAGE_PREVIEW_WIDTH, true); + mImageView.setImageBitmap(scaled); + new ModelRunAsyncTask().execute(scaled); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + + /** + * Get application cache path where to place compiled functions. + * + * @param fileName library file name. 
+ * @return String application cache folder path + * @throws IOException + */ + private final String getTempLibFilePath(String fileName) throws IOException { + File tempDir = File.createTempFile("tvm4j_demo_", ""); + if (!tempDir.delete() || !tempDir.mkdir()) { + throw new IOException("Couldn't create directory " + tempDir.getAbsolutePath()); + } + return (tempDir + File.separator + fileName); + } + + /** + * Create image file under storage where camera application save captured image. + * + * @return File image file under sdcard where camera can save image + */ + private File createImageFile() { + // Create an image file name + String timeStamp = new SimpleDateFormat("yyyyMMdd_HHmmss").format(new Date()); + String imageFileName = "JPEG_" + timeStamp + "_"; + File storageDir = Environment.getExternalStoragePublicDirectory( + Environment.DIRECTORY_PICTURES); + try { + File image = File.createTempFile( + imageFileName, // prefix + ".jpg", // suffix + storageDir // directory + ); + return image; + } catch (IOException e) { + e.printStackTrace(); + } + return null; + } + + /** + * Show dialog to user. + * + * @param title dialog display title + * @param msg dialog display message + */ + private void showDialog(String title, String msg) { + AlertDialog.Builder builder = new AlertDialog.Builder(this); + builder.setTitle(title); + builder.setMessage(msg); + builder.setCancelable(true); + builder.setNeutralButton(android.R.string.ok, + new DialogInterface.OnClickListener() { + public void onClick(DialogInterface dialog, int id) { + dialog.cancel(); + finish(); + } + }); + builder.create().show(); + } + + @Override + public void onRequestPermissionsResult (final int requestCode, final String[] permissions, final int[] grantResults){ + if (requestCode == PERMISSIONS_REQUEST) { + if (grantResults.length > 0 + && grantResults[0] == PackageManager.PERMISSION_GRANTED + && grantResults[1] == PackageManager.PERMISSION_GRANTED) { + // instantiate tvm runtime and setup environment on background after application begin + new LoadModleAsyncTask().execute(); + } else { + requestPermission(); + } + } + } + + /** + * Whether application has required mandatory permissions to run. + */ + private boolean hasPermission() { + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) { + return checkSelfPermission(Manifest.permission.CAMERA) == PackageManager.PERMISSION_GRANTED && + checkSelfPermission(Manifest.permission.WRITE_EXTERNAL_STORAGE) == PackageManager.PERMISSION_GRANTED; + } else { + return true; + } + } + + /** + * Request required mandatory permission for application to run. + */ + private void requestPermission() { + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) { + if (shouldShowRequestPermissionRationale(Manifest.permission.CAMERA) || + shouldShowRequestPermissionRationale(Manifest.permission.WRITE_EXTERNAL_STORAGE)) { + Toast.makeText(this, + "Camera AND storage permission are required for this demo", Toast.LENGTH_LONG).show(); + } + requestPermissions(new String[] {Manifest.permission.CAMERA, Manifest.permission.WRITE_EXTERNAL_STORAGE}, PERMISSIONS_REQUEST); + } + } + + /** + * Returns a transformation matrix from one reference frame into another. + * Handles cropping (if maintaining aspect ratio is desired) and rotation. + * + * @param srcWidth Width of source frame. + * @param srcHeight Height of source frame. + * @param dstWidth Width of destination frame. + * @param dstHeight Height of destination frame. + * @param applyRotation Amount of rotation to apply from one frame to another. 
+ * Must be a multiple of 90. + * @param maintainAspectRatio If true, will ensure that scaling in x and y remains constant, + * cropping the image if necessary. + * @return The transformation fulfilling the desired requirements. + */ + public static Matrix getTransformationMatrix( + final int srcWidth, + final int srcHeight, + final int dstWidth, + final int dstHeight, + final int applyRotation, + final boolean maintainAspectRatio) { + final Matrix matrix = new Matrix(); + + if (applyRotation != 0) { + if (applyRotation % 90 != 0) { + Log.w(TAG, "Rotation of %d % 90 != 0 " + applyRotation); + } + + // Translate so center of image is at origin. + matrix.postTranslate(-srcWidth / 2.0f, -srcHeight / 2.0f); + + // Rotate around origin. + matrix.postRotate(applyRotation); + } + + // Account for the already applied rotation, if any, and then determine how + // much scaling is needed for each axis. + final boolean transpose = (Math.abs(applyRotation) + 90) % 180 == 0; + + final int inWidth = transpose ? srcHeight : srcWidth; + final int inHeight = transpose ? srcWidth : srcHeight; + + // Apply scaling if necessary. + if (inWidth != dstWidth || inHeight != dstHeight) { + final float scaleFactorX = dstWidth / (float) inWidth; + final float scaleFactorY = dstHeight / (float) inHeight; + + if (maintainAspectRatio) { + // Scale by minimum factor so that dst is filled completely while + // maintaining the aspect ratio. Some image may fall off the edge. + final float scaleFactor = Math.max(scaleFactorX, scaleFactorY); + matrix.postScale(scaleFactor, scaleFactor); + } else { + // Scale exactly to fill dst from src. + matrix.postScale(scaleFactorX, scaleFactorY); + } + } + + if (applyRotation != 0) { + // Translate back from origin centered reference to destination frame. + matrix.postTranslate(dstWidth / 2.0f, dstHeight / 2.0f); + } + + return matrix; + } +} \ No newline at end of file diff --git a/apps/android_deploy/app/src/main/jni/Android.mk b/apps/android_deploy/app/src/main/jni/Android.mk new file mode 100644 index 000000000000..a99517f90332 --- /dev/null +++ b/apps/android_deploy/app/src/main/jni/Android.mk @@ -0,0 +1,42 @@ +LOCAL_PATH := $(call my-dir) +MY_PATH := $(LOCAL_PATH) + +include $(CLEAR_VARS) + +LOCAL_PATH := $(MY_PATH) +ROOT_PATH := $(MY_PATH)/../../../../../.. 
diff --git a/apps/android_deploy/app/src/main/jni/Android.mk b/apps/android_deploy/app/src/main/jni/Android.mk
new file mode 100644
index 000000000000..a99517f90332
--- /dev/null
+++ b/apps/android_deploy/app/src/main/jni/Android.mk
@@ -0,0 +1,42 @@
+LOCAL_PATH := $(call my-dir)
+MY_PATH := $(LOCAL_PATH)
+
+include $(CLEAR_VARS)
+
+LOCAL_PATH := $(MY_PATH)
+ROOT_PATH := $(MY_PATH)/../../../../../..
+
+ifndef config
+  ifneq ("$(wildcard ./config.mk)","")
+    config ?= config.mk
+  else
+    config ?= make/config.mk
+  endif
+endif
+
+include $(config)
+
+LOCAL_SRC_FILES := ml_dmlc_tvm_native_c_api.cc
+LOCAL_LDFLAGS := -L$(SYSROOT)/usr/lib/ -llog
+
+LOCAL_C_INCLUDES := $(ROOT_PATH)/include \
+                    $(ROOT_PATH)/dlpack/include \
+                    $(ROOT_PATH)/dmlc-core/include \
+                    $(ROOT_PATH)/HalideIR/src \
+                    $(ROOT_PATH)/topi/include
+
+LOCAL_MODULE = tvm4j_runtime_packed
+
+LOCAL_CPP_FEATURES += exceptions
+LOCAL_LDLIBS += -latomic
+LOCAL_ARM_MODE := arm
+
+ifdef ADD_C_INCLUDES
+  LOCAL_C_INCLUDES += $(ADD_C_INCLUDES)
+endif
+
+ifdef ADD_LDLIBS
+  LOCAL_LDLIBS += $(ADD_LDLIBS)
+endif
+
+include $(BUILD_SHARED_LIBRARY)
diff --git a/apps/android_deploy/app/src/main/jni/Application.mk b/apps/android_deploy/app/src/main/jni/Application.mk
new file mode 100644
index 000000000000..01cad9b783a7
--- /dev/null
+++ b/apps/android_deploy/app/src/main/jni/Application.mk
@@ -0,0 +1,16 @@
+ifndef config
+  ifneq ("$(wildcard ./config.mk)","")
+    config ?= config.mk
+  else
+    config ?= make/config.mk
+  endif
+endif
+
+include $(config)
+
+APP_STL := gnustl_static
+
+APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++11 -Oz -frtti
+ifeq ($(USE_OPENCL), 1)
+  APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
+endif
diff --git a/apps/android_deploy/app/src/main/jni/build.sh b/apps/android_deploy/app/src/main/jni/build.sh
new file mode 100644
index 000000000000..1ca38ae5bd12
--- /dev/null
+++ b/apps/android_deploy/app/src/main/jni/build.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+PATH="$PATH:/usr/local/bin"
+CURR_DIR=$(cd `dirname $0`; pwd)
+ROOT_DIR="$CURR_DIR/../../../../../.."
+javah -o $CURR_DIR/ml_dmlc_tvm_native_c_api.h -cp "$ROOT_DIR/jvm/core/target/*" ml.dmlc.tvm.LibInfo || exit -1
+cp -f $ROOT_DIR/jvm/native/src/main/native/ml_dmlc_tvm_native_c_api.cc $CURR_DIR/ || exit -1
+cp -f $ROOT_DIR/jvm/native/src/main/native/jni_helper_func.h $CURR_DIR/ || exit -1
+rm -rf $CURR_DIR/../libs
+ndk-build --directory=$CURR_DIR
diff --git a/apps/android_deploy/app/src/main/jni/make/config.mk b/apps/android_deploy/app/src/main/jni/make/config.mk
new file mode 100644
index 000000000000..8d6f5a56dd5b
--- /dev/null
+++ b/apps/android_deploy/app/src/main/jni/make/config.mk
@@ -0,0 +1,26 @@
+#-------------------------------------------------------------------------------
+# Template configuration for compiling
+#
+# If you want to change the configuration, please use the following
+# steps. Assume you are in the root directory. First copy this
+# file so that any local changes will be ignored by git
+#
+# cp make/config.mk .
+#
+# Next modify the corresponding entries, and then compile by
+#
+# ./build.sh
+#
+#-------------------------------------------------------------------------------
+APP_ABI = all
+
+APP_PLATFORM = android-17
+
+# whether to enable OpenCL during compilation
+USE_OPENCL = 0
+
+# the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc
+ADD_C_INCLUDES =
+
+# the additional link libs you want to add, e.g., ANDROID_LIB_PATH/libOpenCL.so
+ADD_LDLIBS =
diff --git a/apps/android_deploy/app/src/main/jni/tvm_runtime.h b/apps/android_deploy/app/src/main/jni/tvm_runtime.h
new file mode 100644
index 000000000000..69b1f5c83f1f
--- /dev/null
+++ b/apps/android_deploy/app/src/main/jni/tvm_runtime.h
@@ -0,0 +1,26 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file tvm_runtime.h
+ * \brief Pack all tvm runtime source files
+ */
+#include
+#include
+
+#include "../src/runtime/c_runtime_api.cc"
+#include "../src/runtime/cpu_device_api.cc"
+#include "../src/runtime/workspace_pool.cc"
+#include "../src/runtime/module_util.cc"
+#include "../src/runtime/system_lib_module.cc"
+#include "../src/runtime/module.cc"
+#include "../src/runtime/registry.cc"
+#include "../src/runtime/file_util.cc"
+#include "../src/runtime/dso_module.cc"
+#include "../src/runtime/thread_pool.cc"
+#include "../src/runtime/threading_backend.cc"
+
+#include "../src/runtime/graph/graph_runtime.cc"
+
+#ifdef TVM_OPENCL_RUNTIME
+#include "../src/runtime/opencl/opencl_device_api.cc"
+#include "../src/runtime/opencl/opencl_module.cc"
+#endif
diff --git a/apps/android_deploy/app/src/main/res/layout/activity_main.xml b/apps/android_deploy/app/src/main/res/layout/activity_main.xml
new file mode 100644
index 000000000000..b16a5c2548a6
--- /dev/null
+++ b/apps/android_deploy/app/src/main/res/layout/activity_main.xml
@@ -0,0 +1,27 @@
diff --git a/apps/android_deploy/app/src/main/res/layout/content_main.xml b/apps/android_deploy/app/src/main/res/layout/content_main.xml
new file mode 100644
index 000000000000..34de93843645
--- /dev/null
+++ b/apps/android_deploy/app/src/main/res/layout/content_main.xml
@@ -0,0 +1,46 @@