From 610f85b4ee01ffc8ce8bf6013a745a76e9b46d05 Mon Sep 17 00:00:00 2001 From: Andrii Pavliuk <82804725+apavliuk-altran@users.noreply.github.com> Date: Thu, 23 Feb 2023 08:16:20 +0200 Subject: [PATCH] [NVIDIA] CMAKE_CUDA_ARCHITECTURES support and new architectures (#591) * [NVIDIA] Add CMAKE_CUDA_ARCHITECTURES support and new architectures * [NVIDIA] Fix BF16 build errors --- modules/nvidia_plugin/CMakeLists.txt | 19 +++++++++++++++++++ modules/nvidia_plugin/README.md | 6 +++++- modules/nvidia_plugin/src/CMakeLists.txt | 6 +----- modules/nvidia_plugin/src/cuda/math.cuh | 15 +++++++++++++++ modules/nvidia_plugin/src/kernels/power.cu | 12 +++--------- .../nvidia_plugin/tests/unit/CMakeLists.txt | 8 +++----- 6 files changed, 46 insertions(+), 20 deletions(-) diff --git a/modules/nvidia_plugin/CMakeLists.txt b/modules/nvidia_plugin/CMakeLists.txt index b93caf4607f159..de031b0fd493b5 100644 --- a/modules/nvidia_plugin/CMakeLists.txt +++ b/modules/nvidia_plugin/CMakeLists.txt @@ -5,6 +5,12 @@ cmake_minimum_required(VERSION 3.13) project(InferenceEngineNVIDIAGpuPlugin CXX CUDA) +# Initialize CMAKE_CUDA_ARCHITECTURES when CMAKE_CUDA_COMPILER_ID is NVIDIA. +# Raise an error if CUDA_ARCHITECTURES is empty. 
+if(POLICY CMP0104) + cmake_policy(SET CMP0104 NEW) +endif() + set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_STANDARD_REQUIRED TRUE) @@ -91,6 +97,19 @@ add_library(CUDA::cutensor SHARED IMPORTED) set_target_properties(CUDA::cutensor PROPERTIES IMPORTED_LOCATION "${CUTENSOR_PATH}") set_target_properties(CUDA::cutensor PROPERTIES IMPORTED_IMPLIB "${CUTENSOR_PATH}") +if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + if(${CUDAToolkit_VERSION_MAJOR} LESS_EQUAL 10) + set(CMAKE_CUDA_ARCHITECTURES "30;35;50;60;72") + elseif(${CUDAToolkit_VERSION_MAJOR} EQUAL 11 AND ${CUDAToolkit_VERSION_MINOR} LESS_EQUAL 7) + set(CMAKE_CUDA_ARCHITECTURES "35;50;60-virtual;61;70-virtual;75;86") + elseif(${CUDAToolkit_VERSION_MAJOR} EQUAL 11 AND ${CUDAToolkit_VERSION_MINOR} EQUAL 8) + set(CMAKE_CUDA_ARCHITECTURES "35;50;60-virtual;61;70-virtual;75;86;89-virtual;90-virtual") + else() + set(CMAKE_CUDA_ARCHITECTURES "50;60-virtual;61;70-virtual;75;86;89-virtual;90-virtual") + endif() +endif() +message("-- [nvidia_gpu] CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}") + include(FetchContent) # In this example we are picking a specific tag. diff --git a/modules/nvidia_plugin/README.md b/modules/nvidia_plugin/README.md index e39efe1d361f5f..87a0c24db7c953 100644 --- a/modules/nvidia_plugin/README.md +++ b/modules/nvidia_plugin/README.md @@ -173,9 +173,13 @@ Parameter name | Parameter values | Default | Description `NVIDIA_THROUGHPUT_STREAMS` | `NVIDIA_THROUGHPUT_AUTO`, or non negative integer values | 1 | Specifies number of CPU "execution" streams for the throughput mode. Upper bound for the number of inference requests that can be executed simultaneously. 
`NVIDIA_OPERATION_BENCHMARK` | `NVIDIA_YES`, `NVIDIA_NO` | `NVIDIA_NO` | Specifies if operation level benchmark should be run for increasing performance of network -During compilation of the openvino_nvidia_gpu_plugin, user could specify two options: +During compilation of the openvino_nvidia_gpu_plugin, user could specify the following options: 1) `-DCUDA_KERNEL_PRINT_LOG=ON` enables print logs from kernels (WARNING, be careful with this options, could print to many logs) 2) `-DENABLE_CUDNN_BACKEND_API` enables cuDNN backend support that could increase performance of convolutions by 20% +3) `-DCMAKE_CUDA_ARCHITECTURES=` e.g. `-DCMAKE_CUDA_ARCHITECTURES=75`, ([CMake documentation](https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html)). This option overrides the default architectures (CUDA Compute Capabilities) listed in `openvino_contrib/modules/nvidia_plugin/CMakeLists.txt`. This option allows building the plugin for a specific architecture or architecture set. Building for fewer architectures can significantly decrease the size of `libopenvino_nvidia_gpu_plugin.so`. To find out the compute capability of NVIDIA devices in your system, you may use the following command: +```bash +nvidia-smi --query-gpu=compute_cap --format=csv +``` ## Supported Layers and Limitations The plugin supports IRv10 and higher. The list of supported layers and its limitations are defined in [cuda_opset.md](docs/cuda_opset.md). 
diff --git a/modules/nvidia_plugin/src/CMakeLists.txt b/modules/nvidia_plugin/src/CMakeLists.txt index 72680297aedb70..b3286f9c2abd4a 100644 --- a/modules/nvidia_plugin/src/CMakeLists.txt +++ b/modules/nvidia_plugin/src/CMakeLists.txt @@ -37,11 +37,7 @@ ie_add_plugin(NAME ${TARGET_NAME} # Enable support of CC for the plugin ie_mark_target_as_cc(${TARGET_NAME}) -if (${CUDAToolkit_VERSION_MAJOR} LESS_EQUAL "10") - set_property(TARGET ${OBJ_NAME} PROPERTY CUDA_ARCHITECTURES 30 35 50 60 72) -else() - set_property(TARGET ${OBJ_NAME} PROPERTY CUDA_ARCHITECTURES 35 50 60 72) -endif() +set_property(TARGET ${OBJ_NAME} PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}) ieTargetLinkWholeArchive(${TARGET_NAME} ${OBJ_NAME}) diff --git a/modules/nvidia_plugin/src/cuda/math.cuh b/modules/nvidia_plugin/src/cuda/math.cuh index 5e4f562d3f6bb9..6eea9fdeeded3d 100644 --- a/modules/nvidia_plugin/src/cuda/math.cuh +++ b/modules/nvidia_plugin/src/cuda/math.cuh @@ -171,6 +171,11 @@ inline __device__ __nv_bfloat16 round(__nv_bfloat16 x) { return ::round(static_cast(x)); } +template <> +inline __device__ __nv_bfloat16 pow<__nv_bfloat16>(__nv_bfloat16 x, __nv_bfloat16 y) { + return powf(static_cast(x), static_cast(y)); +} + #if defined(CUDA_HAS_BF16_MATH) inline __device__ __nv_bfloat16 floor(__nv_bfloat16 x) { return ::hfloor(x); } @@ -181,6 +186,11 @@ inline __device__ __nv_bfloat16 exp<__nv_bfloat16>(__nv_bfloat16 x) { return ::hexp(x); } +template <> +inline __device__ __nv_bfloat16 sqrt<__nv_bfloat16>(__nv_bfloat16 x) { + return ::hsqrt(x); +} + template <> inline __device__ __nv_bfloat16 abs<__nv_bfloat16>(__nv_bfloat16 x) { return ::__habs(x); @@ -207,6 +217,11 @@ inline __device__ __nv_bfloat16 exp<__nv_bfloat16>(__nv_bfloat16 x) { return exp(static_cast(x)); } +template <> +inline __device__ __nv_bfloat16 sqrt<__nv_bfloat16>(__nv_bfloat16 x) { + return ::sqrt(static_cast(x)); +} + template <> inline __device__ __nv_bfloat16 abs<__nv_bfloat16>(__nv_bfloat16 x) { return 
abs(static_cast(x)); diff --git a/modules/nvidia_plugin/src/kernels/power.cu b/modules/nvidia_plugin/src/kernels/power.cu index 335386415b5623..37ff26308745a6 100644 --- a/modules/nvidia_plugin/src/kernels/power.cu +++ b/modules/nvidia_plugin/src/kernels/power.cu @@ -2,7 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "convert.cuh" +#include + #include "power.hpp" namespace ov { @@ -11,14 +12,7 @@ namespace kernel { template struct PowerOpImpl { - __device__ static inline T op(T in0, T in1) { return pow(in0, in1); } -}; - -template <> -struct PowerOpImpl<__half> { - __device__ static inline __half op(__half in0, __half in1) { - return cast<__half>(powf(cast(in0), cast(in1))); - } + __device__ static inline T op(T in0, T in1) { return CUDA::math::pow(in0, in1); } }; Power::Power(Type_t element_type, size_t out_num_elements, size_t max_threads_per_block) diff --git a/modules/nvidia_plugin/tests/unit/CMakeLists.txt b/modules/nvidia_plugin/tests/unit/CMakeLists.txt index b8ac3b67d10cde..32d0013d298244 100644 --- a/modules/nvidia_plugin/tests/unit/CMakeLists.txt +++ b/modules/nvidia_plugin/tests/unit/CMakeLists.txt @@ -42,11 +42,9 @@ addIeTargetTest( CUDA ) set_source_files_properties(*.cu *.cuh PROPERTIES LANGUAGE CUDA) -if (${CUDAToolkit_VERSION_MAJOR} LESS_EQUAL "10") - set_property(TARGET ${TARGET_NAME} PROPERTY CUDA_ARCHITECTURES 30 35 50 60 72) -else() - set_property(TARGET ${TARGET_NAME} PROPERTY CUDA_ARCHITECTURES 35 50 60 72) -endif() + +set_property(TARGET ${TARGET_NAME} PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}) + # unitTestUtils brings in USE_STATIC_IE element in INTERFACE_COMPILE_DEFINITIONS, which breaks # INFERENCE_ENGINE_API_CLASS() and friends since we really link to ie dlls. # there's no easy way to override it(something like "remove from definitions" property on client)