From 610f85b4ee01ffc8ce8bf6013a745a76e9b46d05 Mon Sep 17 00:00:00 2001 From: Andrii Pavliuk <82804725+apavliuk-altran@users.noreply.github.com> Date: Thu, 23 Feb 2023 08:16:20 +0200 Subject: [PATCH] [NVIDIA] CMAKE_CUDA_ARCHITECTURES support and new architectures (#591) * [NVIDIA] Add CMAKE_CUDA_ARCHITECTURES support and new architectures * [NVIDIA] Fix BF16 build errors --- modules/nvidia_plugin/CMakeLists.txt | 19 +++++++++++++++++++ modules/nvidia_plugin/README.md | 6 +++++- modules/nvidia_plugin/src/CMakeLists.txt | 6 +----- modules/nvidia_plugin/src/cuda/math.cuh | 15 +++++++++++++++ modules/nvidia_plugin/src/kernels/power.cu | 12 +++--------- .../nvidia_plugin/tests/unit/CMakeLists.txt | 8 +++----- 6 files changed, 46 insertions(+), 20 deletions(-) diff --git a/modules/nvidia_plugin/CMakeLists.txt b/modules/nvidia_plugin/CMakeLists.txt index b93caf4607f159..de031b0fd493b5 100644 --- a/modules/nvidia_plugin/CMakeLists.txt +++ b/modules/nvidia_plugin/CMakeLists.txt @@ -5,6 +5,12 @@ cmake_minimum_required(VERSION 3.13) project(InferenceEngineNVIDIAGpuPlugin CXX CUDA) +# Initialize CMAKE_CUDA_ARCHITECTURES when CMAKE_CUDA_COMPILER_ID is NVIDIA. +# Raise an error if CUDA_ARCHITECTURES is empty. 
+if(POLICY CMP0104) + cmake_policy(SET CMP0104 NEW) +endif() + set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_STANDARD_REQUIRED TRUE) @@ -91,6 +97,19 @@ add_library(CUDA::cutensor SHARED IMPORTED) set_target_properties(CUDA::cutensor PROPERTIES IMPORTED_LOCATION "${CUTENSOR_PATH}") set_target_properties(CUDA::cutensor PROPERTIES IMPORTED_IMPLIB "${CUTENSOR_PATH}") +if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + if(${CUDAToolkit_VERSION_MAJOR} LESS_EQUAL 10) + set(CMAKE_CUDA_ARCHITECTURES "30;35;50;60;72") + elseif(${CUDAToolkit_VERSION_MAJOR} EQUAL 11 AND ${CUDAToolkit_VERSION_MINOR} LESS_EQUAL 7) + set(CMAKE_CUDA_ARCHITECTURES "35;50;60-virtual;61;70-virtual;75;86") + elseif(${CUDAToolkit_VERSION_MAJOR} EQUAL 11 AND ${CUDAToolkit_VERSION_MINOR} EQUAL 8) + set(CMAKE_CUDA_ARCHITECTURES "35;50;60-virtual;61;70-virtual;75;86;89-virtual;90-virtual") + else() + set(CMAKE_CUDA_ARCHITECTURES "50;60-virtual;61;70-virtual;75;86;89-virtual;90-virtual") + endif() +endif() +message("-- [nvidia_gpu] CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}") + include(FetchContent) # In this example we are picking a specific tag. diff --git a/modules/nvidia_plugin/README.md b/modules/nvidia_plugin/README.md index e39efe1d361f5f..87a0c24db7c953 100644 --- a/modules/nvidia_plugin/README.md +++ b/modules/nvidia_plugin/README.md @@ -173,9 +173,13 @@ Parameter name | Parameter values | Default | Description `NVIDIA_THROUGHPUT_STREAMS` | `NVIDIA_THROUGHPUT_AUTO`, or non negative integer values | 1 | Specifies number of CPU "execution" streams for the throughput mode. Upper bound for the number of inference requests that can be executed simultaneously. 
`NVIDIA_OPERATION_BENCHMARK` | `NVIDIA_YES`, `NVIDIA_NO` | `NVIDIA_NO` | Specifies if operation level benchmark should be run for increasing performance of network -During compilation of the openvino_nvidia_gpu_plugin, user could specify two options: +During compilation of the openvino_nvidia_gpu_plugin, user could specify the following options: 1) `-DCUDA_KERNEL_PRINT_LOG=ON` enables print logs from kernels (WARNING, be careful with this options, could print to many logs) 2) `-DENABLE_CUDNN_BACKEND_API` enables cuDNN backend support that could increase performance of convolutions by 20% +3) `-DCMAKE_CUDA_ARCHITECTURES=` e.g. `-DCMAKE_CUDA_ARCHITECTURES=75`, ([CMake documentation](https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html)). This option overrides the default architectures (CUDA Compute Capabilities) listed in `openvino_contrib/modules/nvidia_plugin/CMakeLists.txt`. This option allows building the plugin for a specific architecture or architecture set. Building for fewer architectures can significantly decrease the size of `libopenvino_nvidia_gpu_plugin.so`. To find out the compute capability of NVIDIA devices in your system, you may use the following command: +```bash +nvidia-smi --query-gpu=compute_cap --format=csv +``` ## Supported Layers and Limitations The plugin supports IRv10 and higher. The list of supported layers and its limitations are defined in [cuda_opset.md](docs/cuda_opset.md). 
diff --git a/modules/nvidia_plugin/src/CMakeLists.txt b/modules/nvidia_plugin/src/CMakeLists.txt index 72680297aedb70..b3286f9c2abd4a 100644 --- a/modules/nvidia_plugin/src/CMakeLists.txt +++ b/modules/nvidia_plugin/src/CMakeLists.txt @@ -37,11 +37,7 @@ ie_add_plugin(NAME ${TARGET_NAME} # Enable support of CC for the plugin ie_mark_target_as_cc(${TARGET_NAME}) -if (${CUDAToolkit_VERSION_MAJOR} LESS_EQUAL "10") - set_property(TARGET ${OBJ_NAME} PROPERTY CUDA_ARCHITECTURES 30 35 50 60 72) -else() - set_property(TARGET ${OBJ_NAME} PROPERTY CUDA_ARCHITECTURES 35 50 60 72) -endif() +set_property(TARGET ${OBJ_NAME} PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}) ieTargetLinkWholeArchive(${TARGET_NAME} ${OBJ_NAME}) diff --git a/modules/nvidia_plugin/src/cuda/math.cuh b/modules/nvidia_plugin/src/cuda/math.cuh index 5e4f562d3f6bb9..6eea9fdeeded3d 100644 --- a/modules/nvidia_plugin/src/cuda/math.cuh +++ b/modules/nvidia_plugin/src/cuda/math.cuh @@ -171,6 +171,11 @@ inline __device__ __nv_bfloat16 round(__nv_bfloat16 x) { return ::round(static_cast(x)); } +template <> +inline __device__ __nv_bfloat16 pow<__nv_bfloat16>(__nv_bfloat16 x, __nv_bfloat16 y) { + return powf(static_cast(x), static_cast(y)); +} + #if defined(CUDA_HAS_BF16_MATH) inline __device__ __nv_bfloat16 floor(__nv_bfloat16 x) { return ::hfloor(x); } @@ -181,6 +186,11 @@ inline __device__ __nv_bfloat16 exp<__nv_bfloat16>(__nv_bfloat16 x) { return ::hexp(x); } +template <> +inline __device__ __nv_bfloat16 sqrt<__nv_bfloat16>(__nv_bfloat16 x) { + return ::hsqrt(x); +} + template <> inline __device__ __nv_bfloat16 abs<__nv_bfloat16>(__nv_bfloat16 x) { return ::__habs(x); @@ -207,6 +217,11 @@ inline __device__ __nv_bfloat16 exp<__nv_bfloat16>(__nv_bfloat16 x) { return exp(static_cast(x)); } +template <> +inline __device__ __nv_bfloat16 sqrt<__nv_bfloat16>(__nv_bfloat16 x) { + return ::sqrt(static_cast(x)); +} + template <> inline __device__ __nv_bfloat16 abs<__nv_bfloat16>(__nv_bfloat16 x) { return 
abs(static_cast(x)); diff --git a/modules/nvidia_plugin/src/kernels/power.cu b/modules/nvidia_plugin/src/kernels/power.cu index 335386415b5623..37ff26308745a6 100644 --- a/modules/nvidia_plugin/src/kernels/power.cu +++ b/modules/nvidia_plugin/src/kernels/power.cu @@ -2,7 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "convert.cuh" +#include + #include "power.hpp" namespace ov { @@ -11,14 +12,7 @@ namespace kernel { template struct PowerOpImpl { - __device__ static inline T op(T in0, T in1) { return pow(in0, in1); } -}; - -template <> -struct PowerOpImpl<__half> { - __device__ static inline __half op(__half in0, __half in1) { - return cast<__half>(powf(cast(in0), cast(in1))); - } + __device__ static inline T op(T in0, T in1) { return CUDA::math::pow(in0, in1); } }; Power::Power(Type_t element_type, size_t out_num_elements, size_t max_threads_per_block) diff --git a/modules/nvidia_plugin/tests/unit/CMakeLists.txt b/modules/nvidia_plugin/tests/unit/CMakeLists.txt index b8ac3b67d10cde..32d0013d298244 100644 --- a/modules/nvidia_plugin/tests/unit/CMakeLists.txt +++ b/modules/nvidia_plugin/tests/unit/CMakeLists.txt @@ -42,11 +42,9 @@ addIeTargetTest( CUDA ) set_source_files_properties(*.cu *.cuh PROPERTIES LANGUAGE CUDA) -if (${CUDAToolkit_VERSION_MAJOR} LESS_EQUAL "10") - set_property(TARGET ${TARGET_NAME} PROPERTY CUDA_ARCHITECTURES 30 35 50 60 72) -else() - set_property(TARGET ${TARGET_NAME} PROPERTY CUDA_ARCHITECTURES 35 50 60 72) -endif() + +set_property(TARGET ${TARGET_NAME} PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}) + # unitTestUtils brings in USE_STATIC_IE element in INTERFACE_COMPILE_DEFINITIONS, which breaks # INFERENCE_ENGINE_API_CLASS() and friends since we really link to ie dlls. # there's no easy way to override it(something like "remove from definitions" property on client)