From bf3bdb782816c809b27fd31865b7e7ffbc9e7859 Mon Sep 17 00:00:00 2001 From: Thomas Viehmann Date: Thu, 25 Jun 2020 18:59:12 +0200 Subject: [PATCH] Two small fixes to AMDGPU codegen for LLVM 10+ and ROCm 3.5+ (#5920) - For LLVM 10+ we need to avoid calling Align with 0, or else we get a crash. - For ROCm 3.5+ we need to use code object 3 (the default in LLVM 9+) but for ROCm < 3.5 we want code object 2. - As we want to separate codegen from the API, we need to add a device API query for the version. But everyone else now wants one, too. (Besides ROCm, I only filled it in for CUDA for now.) - I'm throwing in an addition of kMaxRegistersPerBlock for ROCm. This was introduced for CUDA in #5898. --- include/tvm/runtime/device_api.h | 3 ++- src/runtime/cuda/cuda_device_api.cc | 4 ++++ src/runtime/metal/metal_device_api.mm | 2 ++ src/runtime/opencl/opencl_device_api.cc | 2 ++ src/runtime/rocm/rocm_common.h | 1 + src/runtime/rocm/rocm_device_api.cc | 8 ++++++- src/runtime/vulkan/vulkan.cc | 2 ++ src/target/llvm/codegen_amdgpu.cc | 29 +++++++++++++++++++++---- 8 files changed, 45 insertions(+), 6 deletions(-) diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index 3cf5566f3231..c6a2ce3d28d0 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -45,7 +45,8 @@ enum DeviceAttrKind : int { kMultiProcessorCount = 7, kMaxThreadDimensions = 8, kMaxRegistersPerBlock = 9, - kGcnArch = 10 + kGcnArch = 10, + kApiVersion = 11 }; /*! 
\brief Number of bytes each allocation must align to */ diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index ccd8e91e0c5d..14444c92f620 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -98,6 +98,10 @@ class CUDADeviceAPI final : public DeviceAPI { } case kGcnArch: return; + case kApiVersion: { + *rv = CUDA_VERSION; + return; + } } *rv = value; } diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm index a64f35ced2c2..f2a2930810e5 100644 --- a/src/runtime/metal/metal_device_api.mm +++ b/src/runtime/metal/metal_device_api.mm @@ -69,6 +69,8 @@ return; case kGcnArch: return; + case kApiVersion: + return; } } diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 72d03fb6a4fc..5753c1d0f76b 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -111,6 +111,8 @@ void OpenCLWorkspace::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* return; case kGcnArch: return; + case kApiVersion: + return; } } diff --git a/src/runtime/rocm/rocm_common.h b/src/runtime/rocm/rocm_common.h index 2e637f5496bb..6ed9bccb1ab7 100644 --- a/src/runtime/rocm/rocm_common.h +++ b/src/runtime/rocm/rocm_common.h @@ -25,6 +25,7 @@ #define TVM_RUNTIME_ROCM_ROCM_COMMON_H_ #include +#include #include #include diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc index e3dbef5ff42a..e1a14c7dcf1c 100644 --- a/src/runtime/rocm/rocm_device_api.cc +++ b/src/runtime/rocm/rocm_device_api.cc @@ -103,13 +103,19 @@ class ROCMDeviceAPI final : public DeviceAPI { return; } case kMaxRegistersPerBlock: - return; + ROCM_CALL( + hipDeviceGetAttribute(&value, hipDeviceAttributeMaxRegistersPerBlock, ctx.device_id)); + break; case kGcnArch: { hipDeviceProp_t prop; ROCM_CALL(hipGetDeviceProperties(&prop, ctx.device_id)); *rv = prop.gcnArch; return; } + case kApiVersion: { 
+ *rv = HIP_VERSION; + return; + } } *rv = value; } diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index ade4ddca9376..9e730b7fd8b1 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -417,6 +417,8 @@ void VulkanDeviceAPI::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* return; case kGcnArch: return; + case kApiVersion: + return; } } diff --git a/src/target/llvm/codegen_amdgpu.cc b/src/target/llvm/codegen_amdgpu.cc index 8e6b3a2ff22c..93c94cfa4389 100644 --- a/src/target/llvm/codegen_amdgpu.cc +++ b/src/target/llvm/codegen_amdgpu.cc @@ -108,11 +108,13 @@ class CodeGenAMDGPU : public CodeGenLLVM { llvm::GlobalVariable* global = new llvm::GlobalVariable( *module_, type, false, llvm::GlobalValue::PrivateLinkage, 0, ".shared", nullptr, llvm::GlobalValue::NotThreadLocal, shared_address_space); + if (global->getAlignment() < static_cast(info.alignment)) { #if TVM_LLVM_VERSION >= 100 - global->setAlignment(llvm::Align(info.alignment)); + global->setAlignment(llvm::Align(info.alignment)); #else - global->setAlignment(info.alignment); + global->setAlignment(info.alignment); #endif + } buf = global; } @@ -212,6 +214,20 @@ inline int DetectROCMComputeVersion(const std::string& target) { return 900; } +inline int DetectROCMApiVersion() { + TVMContext tvm_ctx; + tvm_ctx.device_type = kDLROCM; + tvm_ctx.device_id = 0; + tvm::runtime::DeviceAPI* api = tvm::runtime::DeviceAPI::Get(tvm_ctx, true); + if (api != nullptr) { + TVMRetValue val; + api->GetAttr(tvm_ctx, tvm::runtime::kApiVersion, &val); + return val.operator int(); + } + LOG(WARNING) << "Cannot detect ROCm version, assume >= 3.5"; + return 305; +} + runtime::Module BuildAMDGPU(IRModule mod, std::string target) { #if TVM_LLVM_VERSION < 90 LOG(FATAL) << "AMDGPU backend requires at least LLVM 9"; @@ -221,8 +237,13 @@ runtime::Module BuildAMDGPU(IRModule mod, std::string target) { InitializeLLVM(); CHECK(target.length() >= 4 && target.substr(0, 4) == "rocm"); 
std::ostringstream config; - config << "-mtriple=amdgcn-amd-amdhsa-hcc -mcpu=gfx" << DetectROCMComputeVersion(target) - << " -mattr=-code-object-v3 " << target.substr(4, target.length() - 4); + config << "-mtriple=amdgcn-amd-amdhsa-hcc -mcpu=gfx" << DetectROCMComputeVersion(target); + if (DetectROCMApiVersion() < 305) { + // before ROCm 3.5 we needed code object v2, starting + // with 3.5 we need v3 (this argument disables v3) + config << " -mattr=-code-object-v3 "; + } + config << target.substr(4, target.length() - 4); std::unique_ptr tm = GetLLVMTargetMachine(config.str()); std::unique_ptr ctx(new llvm::LLVMContext()); // careful: cg will hold a naked pointer reference to ctx, so it should