diff --git a/backends/gpu/include/tfrt/gpu/wrapper/hip_forwards.h b/backends/gpu/include/tfrt/gpu/wrapper/hip_forwards.h index dc3e71ff986..b400636a090 100644 --- a/backends/gpu/include/tfrt/gpu/wrapper/hip_forwards.h +++ b/backends/gpu/include/tfrt/gpu/wrapper/hip_forwards.h @@ -59,6 +59,8 @@ using ncclComm_t = struct ncclComm *; // Forward declaration of hipFFT types. using hipfftHandle = struct hipfftHandle_t *; +// Forward declaration of hiprtcProgram +using hiprtcProgram = struct _hiprtcProgram *; // Enums for corresponding #defines in the hipFFT headers. enum hipfftDirection_t : int { HIPFFT_FORWARD = -1, diff --git a/backends/gpu/include/tfrt/gpu/wrapper/hip_stub.h b/backends/gpu/include/tfrt/gpu/wrapper/hip_stub.h index 3908eed5205..646aa532827 100644 --- a/backends/gpu/include/tfrt/gpu/wrapper/hip_stub.h +++ b/backends/gpu/include/tfrt/gpu/wrapper/hip_stub.h @@ -52,6 +52,29 @@ extern "C" { const char* hipGetErrorName(hipError_t hip_error); const char* hipGetErrorString(hipError_t hip_error); +const char *hiprtcGetErrorString(hiprtcResult result); +hiprtcResult hiprtcVersion(int* major, int* minor); +hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog, const char* name_expression); +hiprtcResult hiprtcCompileProgram( + hiprtcProgram prog, + int numOptions, + const char** options); +hiprtcResult hiprtcCreateProgram( + hiprtcProgram* prog, + const char* src, + const char* name, + int numberHeaders, + char** headers, + const char** includeNames); +hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog); +hiprtcResult hiprtcGetLoweredName( + hiprtcProgram prog, + const char* name_expression, + const char** lowered_name); +hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log); +hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog, size_t* logSizeRet); +hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code); +hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet); // Enums for corresponding #defines in the HIP headers. enum hipDeviceFlags_t { diff --git a/backends/gpu/include/tfrt/gpu/wrapper/hip_wrapper.h b/backends/gpu/include/tfrt/gpu/wrapper/hip_wrapper.h index 98a3f0bc0fd..92feac7d949 100644 --- a/backends/gpu/include/tfrt/gpu/wrapper/hip_wrapper.h +++ b/backends/gpu/include/tfrt/gpu/wrapper/hip_wrapper.h @@ -29,6 +29,7 @@ namespace gpu { namespace wrapper { raw_ostream& Print(raw_ostream& os, hipError_t error); +raw_ostream& Print(raw_ostream& os, hiprtcResult result); namespace internal { template <> @@ -162,6 +163,8 @@ llvm::Error HipMemsetD32Async(CurrentContext current, Pointer dst, llvm::Expected HipModuleLoadData(CurrentContext current, const void* image); +llvm::Expected HipRTCModuleLoadData(CurrentContext current, + const void* image); llvm::Expected HipModuleLoadDataEx( CurrentContext current, const void* image, llvm::ArrayRef options, llvm::ArrayRef option_values); diff --git a/backends/gpu/lib/kernels/driver_kernels.cc b/backends/gpu/lib/kernels/driver_kernels.cc index 6feb245ceff..8a5d40c204e 100644 --- a/backends/gpu/lib/kernels/driver_kernels.cc +++ b/backends/gpu/lib/kernels/driver_kernels.cc @@ -320,7 +320,6 @@ static Expected GpuModuleLoad(Argument context, MakeStringError("GPU JIT error log: ", error_log)); } #endif - return GpuModule(context.ValueRef(), std::move(*module)); } diff --git a/backends/gpu/lib/wrapper/driver_wrapper.cc b/backends/gpu/lib/wrapper/driver_wrapper.cc index ef4574980da..7e11316d95a 100644 --- a/backends/gpu/lib/wrapper/driver_wrapper.cc +++ b/backends/gpu/lib/wrapper/driver_wrapper.cc @@ -852,7 +852,7 @@ llvm::Expected ModuleLoadData(CurrentContext current, case Platform::CUDA: return CuModuleLoadData(current, image); case Platform::ROCm: - return HipModuleLoadData(current, image); + return HipRTCModuleLoadData(current, image); default: return InvalidPlatform(platform); } diff --git a/backends/gpu/lib/wrapper/hip_stub.cc b/backends/gpu/lib/wrapper/hip_stub.cc index 68a650d3bdf..539738d9339 100644 --- a/backends/gpu/lib/wrapper/hip_stub.cc +++ b/backends/gpu/lib/wrapper/hip_stub.cc @@ -61,3 +61,92 @@ const char *hipGetErrorString(hipError_t hip_error) { if (!func_ptr) return "FAILED_TO_LOAD_FUNCTION_SYMBOL"; return func_ptr(hip_error); } + +const char *hiprtcGetErrorString(hiprtcResult result) { + static auto func_ptr = + GetFunctionPointer("hiprtcGetErrorString", hiprtcGetErrorString); + if (!func_ptr) return "FAILED_TO_LOAD_FUNCTION_SYMBOL"; + return func_ptr(result); +} + +hiprtcResult hiprtcVersion(int* major, int* minor){ + static auto func_ptr = + GetFunctionPointer("hiprtcVersion", hiprtcVersion); + if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR; + return func_ptr(major, minor); +} + +hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog, const char* name_expression){ + static auto func_ptr = + GetFunctionPointer("hiprtcAddNameExpression", hiprtcAddNameExpression); + if (!func_ptr) return HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID; + return func_ptr(prog, name_expression); +} + +hiprtcResult hiprtcCompileProgram( + hiprtcProgram prog, + int numOptions, + const char** options){ + static auto func_ptr = + GetFunctionPointer("hiprtcCompileProgram", hiprtcCompileProgram); + if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR; + return func_ptr(prog, numOptions, options); +} + +hiprtcResult hiprtcCreateProgram( + hiprtcProgram* prog, + const char* src, + const char* name, + int numberHeaders, + char** headers, + const char** includeNames){ + static auto func_ptr = + GetFunctionPointer("hiprtcCreateProgram", hiprtcCreateProgram); + if (!func_ptr) return HIPRTC_ERROR_PROGRAM_CREATION_FAILURE; + return func_ptr(prog, src, name, numberHeaders, headers, includeNames); +} + +hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog){ + static auto func_ptr = + GetFunctionPointer("hiprtcDestroyProgram", hiprtcDestroyProgram); + if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR; + return func_ptr(prog); +} + +hiprtcResult hiprtcGetLoweredName( + hiprtcProgram prog, + const char* name_expression, + const char** lowered_name){ + static auto func_ptr = + GetFunctionPointer("hiprtcGetLoweredName", hiprtcGetLoweredName); + if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR; + return func_ptr(prog, name_expression, lowered_name); +} + +hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log){ + static auto func_ptr = + GetFunctionPointer("hiprtcGetProgramLog", hiprtcGetProgramLog); + if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR; + return func_ptr(prog, log); +} + +hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog, size_t* logSizeRet){ + static auto func_ptr = + GetFunctionPointer("hiprtcGetProgramLogSize", hiprtcGetProgramLogSize); + if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR; + return func_ptr(prog, logSizeRet); +} + +hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code){ + static auto func_ptr = + GetFunctionPointer("hiprtcGetCode", hiprtcGetCode); + if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR; + return func_ptr(prog, code); +} + +hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet){ + static auto func_ptr = + GetFunctionPointer("hiprtcGetCodeSize", hiprtcGetCodeSize); + if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR; + return func_ptr(prog, codeSizeRet); +} diff --git a/backends/gpu/lib/wrapper/hip_wrapper.cc b/backends/gpu/lib/wrapper/hip_wrapper.cc index 052c06f1f75..cf37c4ef03e 100644 --- a/backends/gpu/lib/wrapper/hip_wrapper.cc +++ b/backends/gpu/lib/wrapper/hip_wrapper.cc @@ -37,6 +37,12 @@ llvm::raw_ostream& Print(llvm::raw_ostream& os, hipError_t error) { return os; } +llvm::raw_ostream& Print(llvm::raw_ostream& os, hiprtcResult result) { + const char* msg = hiprtcGetErrorString(result); + if (msg != nullptr) os << "hiprtc Error: (" << msg << ")"; + return os; +} + // Convert wrapper types to HIP types. static hipDevice_t ToRocm(Device device) { return device.id(Platform::ROCm); } @@ -540,6 +546,45 @@ llvm::Expected HipModuleLoadData(CurrentContext current, return OwningModule(module); } +llvm::Expected HipRTCModuleLoadData(CurrentContext current, + const void* image) { + CheckHipContext(current); + hiprtcProgram prog; + //auto img = reinterpret_cast(const_cast(image)); + auto kernel = static_cast(image); + std::string kname(kernel); + kname += ".cu"; + RETURN_IF_ERROR(hiprtcCreateProgram(&prog, + kernel, + kname.c_str(), + 0, + nullptr, + nullptr + )); + hiprtcResult compileResult = hiprtcCompileProgram(prog, 0, nullptr); + if (compileResult != HIPRTC_SUCCESS) { + size_t logSize; + hiprtcGetProgramLogSize(prog, &logSize); + if (logSize) { + std::string log(logSize, '\0'); + hiprtcGetProgramLog(prog, &log[0]); + MakeStringError(log.c_str()); + } + } + + size_t code_size; + RETURN_IF_ERROR(hiprtcGetCodeSize(prog, &code_size)); + std::vector code(code_size); + RETURN_IF_ERROR(hiprtcGetCode(prog, code.data())); + RETURN_IF_ERROR(hiprtcDestroyProgram(&prog)); + + hipModule_t module; + RETURN_IF_ERROR(hipModuleLoadData(&module, code.data())); + + NotifyResourceCreated(ResourceType::kModule, module); + return OwningModule(module); +} + llvm::Expected HipModuleLoadDataEx( CurrentContext current, const void* image, llvm::ArrayRef options, llvm::ArrayRef option_values) { diff --git a/backends/gpu/mlir_tests/rocm/blas.mlir b/backends/gpu/mlir_tests/rocm/blas.mlir new file mode 100644 index 00000000000..fb2969165a6 --- /dev/null +++ b/backends/gpu/mlir_tests/rocm/blas.mlir @@ -0,0 +1,193 @@ +// Copyright 2020 The TensorFlow Runtime Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// RUN: bef_executor_lite %s.bef | FileCheck %s + +// CHECK-LABEL: --- Running 'blas_axpy' +func @blas_axpy() { + %ch1 = tfrt.new.chain + %ordinal = tfrt.constant.i32 0 + %device = tfrt_gpu.device.get ROCm, %ordinal + %context = tfrt_gpu.context.create %device + %allocator = tfrt_gpu.allocator.create %context + %stream = tfrt_gpu.stream.create %context + %blas = tfrt_gpu.blas.create %context + + %buffer_length = tfrt.constant.i32 4 // [2, 2] = 4 floats + %buffer_size_bytes = tfrt.constant.i64 16 // [2, 2] * 4 bytes floats = 16 bytes + + %host_tensor = tfrt_dht.create_uninitialized_tensor.f32.2 [2 : i64, 2 : i64] + %host_buffer, %ch2 = tfrt_dht.get_buffer %host_tensor, %ch1 + + %ch3 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch2 [1.0 : f32, 2.0 : f32, 3.0 : f32, 4.0 : f32] + %gpu_buffer_0 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch3 + %ch4 = tfrt_gpu.mem.copy %gpu_buffer_0, %host_buffer, %stream, %ch3 : !tfrt_gpu.buffer, !ht.host_buffer + + %ch5 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch4 [2.0 : f32, 3.0 : f32, 4.0 : f32, 5.0 : f32] + %gpu_buffer_1 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch5 + %ch6 = tfrt_gpu.mem.copy %gpu_buffer_1, %host_buffer, %stream, %ch5 : !tfrt_gpu.buffer, !ht.host_buffer + + %stride = tfrt.constant.i32 1 + %alpha = tfrt.constant.f32 1.0 + %ch7 = tfrt_gpu.blas.axpy %blas, %stream, %buffer_length, %alpha, rocblas_datatype_f32_r, + %gpu_buffer_0, rocblas_datatype_f32_r, %stride, %gpu_buffer_1, rocblas_datatype_f32_r, %stride, + rocblas_datatype_f32_r, %ch6 + + %ch8 = tfrt_gpu.mem.copy %host_buffer, %gpu_buffer_1, %stream, %ch7 : !ht.host_buffer, !tfrt_gpu.buffer + %ch9 = tfrt_gpu.stream.synchronize %stream, %ch8 + // CHECK: DenseHostTensor dtype = f32, shape = [2, 2] + // CHECK-SAME: values = [3.000000e+00, 5.000000e+00, 7.000000e+00, 9.000000e+00] + %ch10 = tfrt_dht.print_tensor %host_tensor, %ch9 + + tfrt.return +} + +// CHECK-LABEL: --- Running 'blas_gemm' +func @blas_gemm() { + %ch1 = tfrt.new.chain + %ordinal = tfrt.constant.i32 0 + %device = tfrt_gpu.device.get ROCm, %ordinal + %context = tfrt_gpu.context.create %device + %allocator = tfrt_gpu.allocator.create %context + %stream = tfrt_gpu.stream.create %context + %blas = tfrt_gpu.blas.create %context + + %buffer_length = tfrt.constant.i32 4 // [2, 2] = 4 floats + %buffer_size_bytes = tfrt.constant.i64 16 // [2, 2] * 4 bytes floats = 16 bytes + + %host_tensor = tfrt_dht.create_uninitialized_tensor.f32.2 [2 : i64, 2 : i64] + %host_buffer, %ch2 = tfrt_dht.get_buffer %host_tensor, %ch1 + + %ch3 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch2 [1.0 : f32, 2.0 : f32, 3.0 : f32, 4.0 : f32] + %gpu_buffer_0 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch3 + %ch4 = tfrt_gpu.mem.copy %gpu_buffer_0, %host_buffer, %stream, %ch3 : !tfrt_gpu.buffer, !ht.host_buffer + + %ch5 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch4 [2.0 : f32, 3.0 : f32, 4.0 : f32, 5.0 : f32] + %gpu_buffer_1 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch5 + %ch6 = tfrt_gpu.mem.copy %gpu_buffer_1, %host_buffer, %stream, %ch5 : !tfrt_gpu.buffer, !ht.host_buffer + + %ch7 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch6 [0.0 : f32, 0.0 : f32, 0.0 : f32, 0.0 : f32] + %gpu_buffer_2 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch7 + %ch8 = tfrt_gpu.mem.copy %gpu_buffer_2, %host_buffer, %stream, %ch7 : !tfrt_gpu.buffer, !ht.host_buffer + + %dim = tfrt.constant.i32 2 + %alpha = tfrt.constant.f32 1.0 + %beta = tfrt.constant.f32 1.0 + %algo = tfrt_gpu.blas.gemm.algo rocblas_gemm_algo_standard + %ch9 = tfrt_gpu.blas.gemm %blas, %stream, + rocblas_operation_none, rocblas_operation_none, %dim, %dim, %dim, + %alpha, %gpu_buffer_0, rocblas_datatype_f32_r, %dim, + %gpu_buffer_1, rocblas_datatype_f32_r, %dim, %beta, + %gpu_buffer_2, rocblas_datatype_f32_r, %dim, + rocblas_datatype_f32_r, %algo, %ch8 + + %ch10 = tfrt_gpu.mem.copy %host_buffer, %gpu_buffer_2, %stream, %ch9 : !ht.host_buffer, !tfrt_gpu.buffer + %ch11 = tfrt_gpu.stream.synchronize %stream, %ch10 + // CHECK: DenseHostTensor dtype = f32, shape = [2, 2] + // CHECK-SAME: values = [1.100000e+01, 1.600000e+01, 1.900000e+01, 2.800000e+01] + %ch12 = tfrt_dht.print_tensor %host_tensor, %ch11 + + tfrt.return +} + +// CHECK-LABEL: --- Running 'blas_gemm_batched' +func @blas_gemm_batched() { + %ch1 = tfrt.new.chain + %ordinal = tfrt.constant.i32 0 + %device = tfrt_gpu.device.get ROCm, %ordinal + %context = tfrt_gpu.context.create %device + %allocator = tfrt_gpu.allocator.create %context + %stream = tfrt_gpu.stream.create %context + %blas = tfrt_gpu.blas.create %context + + %buffer_length = tfrt.constant.i32 4 // [2, 2] = 4 floats + %buffer_size_bytes = tfrt.constant.i64 16 // [2, 2] * 4 bytes floats = 16 bytes + + %host_tensor = tfrt_dht.create_uninitialized_tensor.f32.2 [2 : i64, 2 : i64] + %host_buffer, %ch2 = tfrt_dht.get_buffer %host_tensor, %ch1 + + %ch3 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch2 [1.0 : f32, 2.0 : f32, 3.0 : f32, 4.0 : f32] + %gpu_buffer_0 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch3 + %ch4 = tfrt_gpu.mem.copy %gpu_buffer_0, %host_buffer, %stream, %ch3 : !tfrt_gpu.buffer, !ht.host_buffer + + %ch5 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch4 [2.0 : f32, 3.0 : f32, 4.0 : f32, 5.0 : f32] + %gpu_buffer_1 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch5 + %ch6 = tfrt_gpu.mem.copy %gpu_buffer_1, %host_buffer, %stream, %ch5 : !tfrt_gpu.buffer, !ht.host_buffer + + %ch7 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch6 [0.0 : f32, 0.0 : f32, 0.0 : f32, 0.0 : f32] + %gpu_buffer_2 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch7 + %ch8 = tfrt_gpu.mem.copy %gpu_buffer_2, %host_buffer, %stream, %ch7 : !tfrt_gpu.buffer, !ht.host_buffer + + %dim = tfrt.constant.i32 2 + %type = tfrt.constant.i32 0 + %algo = tfrt_gpu.blas.gemm.algo rocblas_gemm_algo_standard + %alpha = tfrt.constant.f32 1.0 + %beta = tfrt.constant.f32 1.0 + %batch_count = tfrt.constant.i32 1 + %stride = tfrt.constant.i64 1 + %ch9 = tfrt_gpu.blas.gemm.batch %blas, %stream, + rocblas_operation_none, rocblas_operation_none, %dim, %dim, %dim, + %alpha, %gpu_buffer_0, rocblas_datatype_f32_r, %dim, %stride, + %gpu_buffer_1, rocblas_datatype_f32_r, %dim, %stride, %beta, + %gpu_buffer_2, rocblas_datatype_f32_r, %dim, %stride, %batch_count, + rocblas_datatype_f32_r, %algo, %ch8 + + %ch10 = tfrt_gpu.mem.copy %host_buffer, %gpu_buffer_2, %stream, %ch9 : !ht.host_buffer, !tfrt_gpu.buffer + %ch11 = tfrt_gpu.stream.synchronize %stream, %ch10 + // CHECK: DenseHostTensor dtype = f32, shape = [2, 2] + // CHECK-SAME: values = [1.100000e+01, 1.600000e+01, 1.900000e+01, 2.800000e+01] + %ch12 = tfrt_dht.print_tensor %host_tensor, %ch11 + + tfrt.return +} + +// CHECK-LABEL: --- Running 'blas_trsm_batched' +func @blas_trsm_batched() { + %ch0 = tfrt.new.chain + %ordinal = tfrt.constant.i32 0 + %device = tfrt_gpu.device.get ROCm, %ordinal + %context = tfrt_gpu.context.create %device + %allocator = tfrt_gpu.allocator.create %context + %stream = tfrt_gpu.stream.create %context + %blas = tfrt_gpu.blas.create %context + + %buffer_size_bytes = tfrt.constant.i64 16 // [2, 2] * 4 bytes floats = 16 bytes + + %host_tensor = tfrt_dht.create_uninitialized_tensor.f32.2 [2 : i64, 2 : i64] + %host_buffer, %ch1 = tfrt_dht.get_buffer %host_tensor, %ch0 + + %ch2 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch1 [1.0 : f32, 2.0 : f32, 0.0 : f32, 1.0 : f32] + %gpu_buffer_0 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch2 + %ch3 = tfrt_gpu.mem.copy %gpu_buffer_0, %host_buffer, %stream, %ch2 : !tfrt_gpu.buffer, !ht.host_buffer + + %ch4 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch3 [1.0 : f32, 4.0 : f32, 0.0 : f32, 0.0 : f32] + %gpu_buffer_1 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch4 + %ch5 = tfrt_gpu.mem.copy %gpu_buffer_1, %host_buffer, %stream, %ch4 : !tfrt_gpu.buffer, !ht.host_buffer + + %dim = tfrt.constant.i32 2 + %alpha = tfrt.constant.f32 1.0 + %batch_count = tfrt.constant.i32 1 + %ch6 = tfrt_gpu.blas.trsm.batch %blas, %stream, rocblas_side_left, + rocblas_fill_lower, rocblas_operation_none, rocblas_diagonal_unit, %dim, %dim, + rocblas_datatype_f32_r, %alpha, %gpu_buffer_0, %dim, %gpu_buffer_1, %dim, %batch_count, + %ch5 + + %ch7 = tfrt_gpu.mem.copy %host_buffer, %gpu_buffer_1, %stream, %ch6 : !ht.host_buffer, !tfrt_gpu.buffer + %ch8 = tfrt_gpu.stream.synchronize %stream, %ch7 + // CHECK: DenseHostTensor dtype = f32, shape = [2, 2] + // CHECK-SAME: values = [1.000000e+00, 2.000000e+00, 0.000000e+00, 0.000000e+00] + %ch9 = tfrt_dht.print_tensor %host_tensor, %ch8 + + tfrt.return +} diff --git a/backends/gpu/mlir_tests/rocm/module.mlir b/backends/gpu/mlir_tests/rocm/module.mlir new file mode 100644 index 00000000000..df2d3be1332 --- /dev/null +++ b/backends/gpu/mlir_tests/rocm/module.mlir @@ -0,0 +1,94 @@ +// Copyright 2020 The TensorFlow Runtime Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// RUN: bef_executor_lite %s.bef | FileCheck %s + +// CHECK-LABEL: --- Running 'function_test' +func @function_test() { + %ordinal = tfrt.constant.i32 0 + %device = tfrt_gpu.device.get ROCm, %ordinal + %context = tfrt_gpu.context.create %device + + // PTX for empty kernel. + // Typically module loading should be done at initialization time. + %module = tfrt_gpu.module.load %context { + data = "extern \"C\" __global__ void Kernel() { return; }\00" + } + + %func = tfrt_gpu.module.get_function %module { name = "Kernel" } + + tfrt.return +} + +func @global_test() { + %ordinal = tfrt.constant.i32 0 + %device = tfrt_gpu.device.get ROCm, %ordinal + %context = tfrt_gpu.context.create %device + + // PTX for a module with a global symbol. + %module = tfrt_gpu.module.load %context { + data = "__device__ unsigned int Global[128];\00" + } + + %global = tfrt_gpu.module.get_global %module { name = "Global" } + + tfrt.return +} + +// CHECK-LABEL: --- Running 'module_bad_data_test' +func @module_bad_data_test() { + %ch2 = tfrt.new.chain + %ordinal = tfrt.constant.i32 0 + %device = tfrt_gpu.device.get ROCm, %ordinal + %context = tfrt_gpu.context.create %device + + // expected-error @+1 {{hipErrorInvalidValue}} + %func = tfrt_gpu.module.load %context { + data = "invalid image\00" + } + + tfrt.return +} + +// CHECK-LABEL: --- Running 'function_bad_name_test' +func @function_bad_name_test() { + %ch2 = tfrt.new.chain + %ordinal = tfrt.constant.i32 0 + %device = tfrt_gpu.device.get ROCm, %ordinal + %context = tfrt_gpu.context.create %device + + %module = tfrt_gpu.module.load %context { + data = "extern \"C\" __global__ void Kernel() { return; }\00" + } + + // expected-error @+1 {{hipErrorNotFound}} + %func = tfrt_gpu.module.get_function %module { name = "Foo\00" } + + tfrt.return +} + +// CHECK-LABEL: --- Running 'module_not_null_terminated_test' +func @module_not_null_terminated_test() { + %ch2 = tfrt.new.chain + %ordinal = tfrt.constant.i32 0 + %device = tfrt_gpu.device.get ROCm, %ordinal + %context = tfrt_gpu.context.create %device + + // expected-error @+1 {{data attribute must be null-terminated}} + %module = tfrt_gpu.module.load %context { + data = "not null-terminated" + } + + tfrt.return +} diff --git a/backends/gpu/tools/stub_codegen/generate.sh b/backends/gpu/tools/stub_codegen/generate.sh index f9430698bbd..e710ef9e6e6 100755 --- a/backends/gpu/tools/stub_codegen/generate.sh +++ b/backends/gpu/tools/stub_codegen/generate.sh @@ -21,7 +21,7 @@ set -eux # Build the tools and generate the HIP header. -bazel build --nocheck_visibility \ +bazel build --nocheck_visibility --config=gcc\ //backends/gpu/tools/stub_codegen:header_codegen \ //backends/gpu/tools/stub_codegen:impl_codegen @@ -34,3 +34,10 @@ for API in "hip" "rocblas" "rocsolver" "miopen" "hipfft"; do ./bazel-bin/backends/gpu/tools/stub_codegen/impl_codegen \ $(dirname $0)/$API.json | clang-format > $(printf $SRC_PATH $API) done + +# Hiprtc is currently rolled up in hip shared library. +# It is subject to change in future releases. +./bazel-bin/backends/gpu/tools/stub_codegen/header_codegen \ + $(dirname $0)/hiprtc.json | clang-format >> third_party/hip/hip_stub.h.inc +./bazel-bin/backends/gpu/tools/stub_codegen/impl_codegen \ + $(dirname $0)/hiprtc.json | clang-format >> third_party/hip/hip_stub.cc.inc diff --git a/backends/gpu/tools/stub_codegen/hip.json b/backends/gpu/tools/stub_codegen/hip.json index 677af1ccf36..b1c11b86202 100644 --- a/backends/gpu/tools/stub_codegen/hip.json +++ b/backends/gpu/tools/stub_codegen/hip.json @@ -13,6 +13,7 @@ "hipDeviceAttribute_t", "hipJitOption", "hipLimit_t", + "hipDataType", "hipMemoryType", "hipMemcpyKind", "hipFunction_attribute", diff --git a/backends/gpu/tools/stub_codegen/hipfft.json b/backends/gpu/tools/stub_codegen/hipfft.json index 36ab5cd5875..41b6c054af9 100644 --- a/backends/gpu/tools/stub_codegen/hipfft.json +++ b/backends/gpu/tools/stub_codegen/hipfft.json @@ -24,5 +24,5 @@ "hipfftSetStream", "hipfftSetWorkArea", "hipfftSetAutoAllocation" - ], + ] } diff --git a/backends/gpu/tools/stub_codegen/hiprtc.json b/backends/gpu/tools/stub_codegen/hiprtc.json new file mode 100644 index 00000000000..3470f990588 --- /dev/null +++ b/backends/gpu/tools/stub_codegen/hiprtc.json @@ -0,0 +1,16 @@ +{ + "header":"/opt/rocm-5.0.0/include/hip/hiprtc.h", + "extra_args":[ + "-D__HIP_PLATFORM_AMD__", + "-I.", + "-I/opt/rocm-5.0.0/include/", + "-Ithird_party/llvm/llvm-project/clang/lib/Headers", + "-Ibazel-genfiles", + "-ferror-limit=0" + ], + "enums":[ + "hiprtcResult" + ], + "functions":[ + ] +} diff --git a/backends/gpu/tools/stub_codegen/miopen.json b/backends/gpu/tools/stub_codegen/miopen.json index 956189c9272..63324e14f40 100644 --- a/backends/gpu/tools/stub_codegen/miopen.json +++ b/backends/gpu/tools/stub_codegen/miopen.json @@ -191,5 +191,5 @@ "miopenSetDropoutDescriptor", "miopenDropoutForward", "miopenDropoutBackward" - ], + ] } diff --git a/backends/gpu/tools/stub_codegen/rocblas.json b/backends/gpu/tools/stub_codegen/rocblas.json index 0ebe21fd122..828f938d9b9 100644 --- a/backends/gpu/tools/stub_codegen/rocblas.json +++ b/backends/gpu/tools/stub_codegen/rocblas.json @@ -29,5 +29,5 @@ "rocblas_dtrsm_batched", "rocblas_ctrsm_batched", "rocblas_ztrsm_batched" - ], + ] } diff --git a/backends/gpu/tools/stub_codegen/rocsolver.json b/backends/gpu/tools/stub_codegen/rocsolver.json index fe4fc6564a2..392a10acd9b 100644 --- a/backends/gpu/tools/stub_codegen/rocsolver.json +++ b/backends/gpu/tools/stub_codegen/rocsolver.json @@ -9,5 +9,5 @@ "rocsolver_dpotrf", "rocsolver_cpotrf", "rocsolver_zpotrf" - ], + ] } diff --git a/third_party/hip/hip_stub.h.inc b/third_party/hip/hip_stub.h.inc index 5e63877c708..119e3dd319b 100644 --- a/third_party/hip/hip_stub.h.inc +++ b/third_party/hip/hip_stub.h.inc @@ -453,3 +453,26 @@ hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, const void* f, size_t dynSharedMemPerBlk, int blockSizeLimit); + +enum hipDataType { + HIP_R_16F = 2, + HIP_R_32F = 0, + HIP_R_64F = 1, + HIP_C_16F = 6, + HIP_C_32F = 4, + HIP_C_64F = 5, +}; +enum hiprtcResult { + HIPRTC_SUCCESS = 0, + HIPRTC_ERROR_OUT_OF_MEMORY = 1, + HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2, + HIPRTC_ERROR_INVALID_INPUT = 3, + HIPRTC_ERROR_INVALID_PROGRAM = 4, + HIPRTC_ERROR_INVALID_OPTION = 5, + HIPRTC_ERROR_COMPILATION = 6, + HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7, + HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8, + HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9, + HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10, + HIPRTC_ERROR_INTERNAL_ERROR = 11, +}; diff --git a/third_party/hip/hipfft_stub.cc.inc b/third_party/hip/hipfft_stub.cc.inc index ffe4cebd3eb..f0296ba7cd8 100644 --- a/third_party/hip/hipfft_stub.cc.inc +++ b/third_party/hip/hipfft_stub.cc.inc @@ -1,3 +1,8 @@ +HIPFFT_EXPORT hipfftResult hipfftCreate(hipfftHandle* plan) { + return DynamicCall("hipfftCreate", + plan); +} + HIPFFT_EXPORT hipfftResult hipfftMakePlanMany64( hipfftHandle plan, int rank, long long int* n, long long int* inembed, long long int istride, long long int idist, long long int* onembed, @@ -13,17 +18,19 @@ HIPFFT_EXPORT hipfftResult hipfftGetSize(hipfftHandle plan, size_t* workSize) { plan, workSize); } +HIPFFT_EXPORT hipfftResult hipfftSetAutoAllocation(hipfftHandle plan, + int autoAllocate) { + return DynamicCall("hipfftSetAutoAllocation", plan, + autoAllocate); +} + HIPFFT_EXPORT hipfftResult hipfftSetWorkArea(hipfftHandle plan, void* workArea) { return DynamicCall( "hipfftSetWorkArea", plan, workArea); } -HIPFFT_EXPORT hipfftResult hipfftSetAutoAllocation(hipfftHandle plan, int enable) { - return DynamicCall( - "hipfftSetAutoAllocation", plan, enable); -} - HIPFFT_EXPORT hipfftResult hipfftExecC2C(hipfftHandle plan, hipfftComplex* idata, hipfftComplex* odata, int direction) { @@ -72,11 +79,6 @@ HIPFFT_EXPORT hipfftResult hipfftSetStream(hipfftHandle plan, "hipfftSetStream", plan, stream); } -HIPFFT_EXPORT hipfftResult hipfftCreate(hipfftHandle* plan) { - return DynamicCall("hipfftCreate", - plan); -} - HIPFFT_EXPORT hipfftResult hipfftDestroy(hipfftHandle plan) { return DynamicCall("hipfftDestroy", plan); diff --git a/third_party/hip/hipfft_stub.h.inc b/third_party/hip/hipfft_stub.h.inc index fee4121a479..cc70b112986 100644 --- a/third_party/hip/hipfft_stub.h.inc +++ b/third_party/hip/hipfft_stub.h.inc @@ -32,17 +32,20 @@ typedef enum hipfftLibraryPropertyType_t { HIPFFT_PATCH_LEVEL, } hipfftLibraryPropertyType; +HIPFFT_EXPORT hipfftResult hipfftCreate(hipfftHandle* plan); + HIPFFT_EXPORT hipfftResult hipfftMakePlanMany64( hipfftHandle plan, int rank, long long int* n, long long int* inembed, long long int istride, long long int idist, long long int* onembed, long long int ostride, long long int odist, hipfftType type, - long long int batch, size_t* work_size); + long long int batch, size_t* workSize); HIPFFT_EXPORT hipfftResult hipfftGetSize(hipfftHandle plan, size_t* workSize); -HIPFFT_EXPORT hipfftResult hipfftSetWorkArea(hipfftHandle plan, void* workArea); +HIPFFT_EXPORT hipfftResult hipfftSetAutoAllocation(hipfftHandle plan, + int autoAllocate); -HIPFFT_EXPORT hipfftResult hipfftSetAutoAllocation(hipfftHandle plan, int enable); +HIPFFT_EXPORT hipfftResult hipfftSetWorkArea(hipfftHandle plan, void* workArea); HIPFFT_EXPORT hipfftResult hipfftExecC2C(hipfftHandle plan, hipfftComplex* idata, @@ -71,8 +74,6 @@ HIPFFT_EXPORT hipfftResult hipfftExecZ2D(hipfftHandle plan, HIPFFT_EXPORT hipfftResult hipfftSetStream(hipfftHandle plan, hipStream_t stream); -HIPFFT_EXPORT hipfftResult hipfftCreate(hipfftHandle* plan); - HIPFFT_EXPORT hipfftResult hipfftDestroy(hipfftHandle plan); HIPFFT_EXPORT hipfftResult hipfftGetProperty(hipfftLibraryPropertyType type,