[NVIDIA] CMAKE_CUDA_ARCHITECTURES support and new architectures (openvinotoolkit#591)

* [NVIDIA] Add CMAKE_CUDA_ARCHITECTURES support and new architectures

* [NVIDIA] Fix BF16 build errors
apavliuk55 authored Feb 23, 2023
1 parent aa4429c commit 610f85b
Showing 6 changed files with 46 additions and 20 deletions.
19 changes: 19 additions & 0 deletions modules/nvidia_plugin/CMakeLists.txt
@@ -5,6 +5,12 @@
cmake_minimum_required(VERSION 3.13)
project(InferenceEngineNVIDIAGpuPlugin CXX CUDA)

# Initialize CMAKE_CUDA_ARCHITECTURES when CMAKE_CUDA_COMPILER_ID is NVIDIA.
# Raise an error if CUDA_ARCHITECTURES is empty.
if(POLICY CMP0104)
cmake_policy(SET CMP0104 NEW)
endif()

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
@@ -91,6 +97,19 @@ add_library(CUDA::cutensor SHARED IMPORTED)
set_target_properties(CUDA::cutensor PROPERTIES IMPORTED_LOCATION "${CUTENSOR_PATH}")
set_target_properties(CUDA::cutensor PROPERTIES IMPORTED_IMPLIB "${CUTENSOR_PATH}")

if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
if(${CUDAToolkit_VERSION_MAJOR} LESS_EQUAL 10)
set(CMAKE_CUDA_ARCHITECTURES "30;35;50;60;72")
elseif(${CUDAToolkit_VERSION_MAJOR} EQUAL 11 AND ${CUDAToolkit_VERSION_MINOR} LESS_EQUAL 7)
set(CMAKE_CUDA_ARCHITECTURES "35;50;60-virtual;61;70-virtual;75;86")
elseif(${CUDAToolkit_VERSION_MAJOR} EQUAL 11 AND ${CUDAToolkit_VERSION_MINOR} EQUAL 8)
set(CMAKE_CUDA_ARCHITECTURES "35;50;60-virtual;61;70-virtual;75;86;89-virtual;90-virtual")
else()
set(CMAKE_CUDA_ARCHITECTURES "50;60-virtual;61;70-virtual;75;86;89-virtual;90-virtual")
endif()
endif()
message("-- [nvidia_gpu] CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}")

include(FetchContent)

# In this example we are picking a specific tag.
6 changes: 5 additions & 1 deletion modules/nvidia_plugin/README.md
@@ -173,9 +173,13 @@ Parameter name | Parameter values | Default | Description
`NVIDIA_THROUGHPUT_STREAMS` | `NVIDIA_THROUGHPUT_AUTO`, or non-negative integer values | 1 | Specifies the number of CPU "execution" streams for the throughput mode. Upper bound for the number of inference requests that can be executed simultaneously.
`NVIDIA_OPERATION_BENCHMARK` | `NVIDIA_YES`, `NVIDIA_NO` | `NVIDIA_NO` | Specifies whether an operation-level benchmark should be run to increase network performance

During compilation of the openvino_nvidia_gpu_plugin, user could specify two options:
During compilation of the openvino_nvidia_gpu_plugin, the user can specify the following options:
1) `-DCUDA_KERNEL_PRINT_LOG=ON` enables printing logs from kernels (WARNING: be careful with this option, it could print too many logs)
2) `-DENABLE_CUDNN_BACKEND_API` enables cuDNN backend support, which could increase the performance of convolutions by 20%
3) `-DCMAKE_CUDA_ARCHITECTURES=<arch_set>`, e.g. `-DCMAKE_CUDA_ARCHITECTURES=75` ([CMake documentation](https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html)), overrides the default architectures (CUDA Compute Capabilities) listed in `openvino_contrib/modules/nvidia_plugin/CMakeLists.txt`. This option allows building the plugin for a specific architecture or architecture set. Building for fewer architectures can significantly decrease the size of `libopenvino_nvidia_gpu_plugin.so`. To find out the compute capability of the NVIDIA devices in your system, you may use the following command:
```bash
nvidia-smi --query-gpu=compute_cap --format=csv
```

## Supported Layers and Limitations
The plugin supports IRv10 and higher. The list of supported layers and their limitations is defined in [cuda_opset.md](docs/cuda_opset.md).
6 changes: 1 addition & 5 deletions modules/nvidia_plugin/src/CMakeLists.txt
@@ -37,11 +37,7 @@ ie_add_plugin(NAME ${TARGET_NAME}
# Enable support of CC for the plugin
ie_mark_target_as_cc(${TARGET_NAME})

if (${CUDAToolkit_VERSION_MAJOR} LESS_EQUAL "10")
set_property(TARGET ${OBJ_NAME} PROPERTY CUDA_ARCHITECTURES 30 35 50 60 72)
else()
set_property(TARGET ${OBJ_NAME} PROPERTY CUDA_ARCHITECTURES 35 50 60 72)
endif()
set_property(TARGET ${OBJ_NAME} PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES})

ieTargetLinkWholeArchive(${TARGET_NAME} ${OBJ_NAME})

15 changes: 15 additions & 0 deletions modules/nvidia_plugin/src/cuda/math.cuh
@@ -171,6 +171,11 @@ inline __device__ __nv_bfloat16 round(__nv_bfloat16 x) {
return ::round(static_cast<float>(x));
}

template <>
inline __device__ __nv_bfloat16 pow<__nv_bfloat16>(__nv_bfloat16 x, __nv_bfloat16 y) {
return powf(static_cast<float>(x), static_cast<float>(y));
}

#if defined(CUDA_HAS_BF16_MATH)
inline __device__ __nv_bfloat16 floor(__nv_bfloat16 x) { return ::hfloor(x); }

@@ -181,6 +186,11 @@ inline __device__ __nv_bfloat16 exp<__nv_bfloat16>(__nv_bfloat16 x) {
return ::hexp(x);
}

template <>
inline __device__ __nv_bfloat16 sqrt<__nv_bfloat16>(__nv_bfloat16 x) {
return ::hsqrt(x);
}

template <>
inline __device__ __nv_bfloat16 abs<__nv_bfloat16>(__nv_bfloat16 x) {
return ::__habs(x);
@@ -207,6 +217,11 @@ inline __device__ __nv_bfloat16 exp<__nv_bfloat16>(__nv_bfloat16 x) {
return exp<float>(static_cast<float>(x));
}

template <>
inline __device__ __nv_bfloat16 sqrt<__nv_bfloat16>(__nv_bfloat16 x) {
return ::sqrt(static_cast<float>(x));
}

template <>
inline __device__ __nv_bfloat16 abs<__nv_bfloat16>(__nv_bfloat16 x) {
return abs<float>(static_cast<float>(x));
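For context, a minimal sketch (not part of the commit) of how the new `__nv_bfloat16` specializations are used from device code. The `<cuda/math.cuh>` include path and the `CUDA::math` namespace are taken from `power.cu` below; the kernel itself is hypothetical.

```cpp
#include <cuda_bf16.h>

#include <cuda/math.cuh>

// Element-wise out[i] = sqrt(pow(x[i], y[i])) on bfloat16 data.
// pow<__nv_bfloat16> always falls back to powf on float-promoted values;
// sqrt<__nv_bfloat16> maps to ::hsqrt when CUDA_HAS_BF16_MATH is defined,
// otherwise to ::sqrt on a float-promoted value.
__global__ void bf16_pow_sqrt(const __nv_bfloat16* x,
                              const __nv_bfloat16* y,
                              __nv_bfloat16* out,
                              size_t n) {
    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = CUDA::math::sqrt(CUDA::math::pow(x[i], y[i]));
    }
}
```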
12 changes: 3 additions & 9 deletions modules/nvidia_plugin/src/kernels/power.cu
@@ -2,7 +2,8 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "convert.cuh"
#include <cuda/math.cuh>

#include "power.hpp"

namespace ov {
@@ -11,14 +12,7 @@ namespace kernel {

template <typename T>
struct PowerOpImpl {
__device__ static inline T op(T in0, T in1) { return pow(in0, in1); }
};

template <>
struct PowerOpImpl<__half> {
__device__ static inline __half op(__half in0, __half in1) {
return cast<__half>(powf(cast<float>(in0), cast<float>(in1)));
}
__device__ static inline T op(T in0, T in1) { return CUDA::math::pow(in0, in1); }
};

Power::Power(Type_t element_type, size_t out_num_elements, size_t max_threads_per_block)
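The `PowerOpImpl` change above removes the hand-written `__half` branch and routes every element type through `CUDA::math::pow`. Below is a minimal sketch (not part of the commit) of the same functor-plus-kernel pattern; names other than `CUDA::math::pow` are hypothetical, and `__half` coverage is assumed from the removal of the old specialization.

```cpp
#include <cuda_fp16.h>

#include <cuda/math.cuh>

// Generic binary element-wise kernel parameterized by an operation functor.
template <typename T, typename OpImpl>
__global__ void elementwise_binary(const T* in0, const T* in1, T* out, size_t n) {
    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = OpImpl::op(in0[i], in1[i]);
    }
}

// One generic implementation: CUDA::math::pow resolves to the per-type
// specialization (e.g. the powf-based __nv_bfloat16 one added in math.cuh).
template <typename T>
struct PowerOp {
    __device__ static inline T op(T a, T b) { return CUDA::math::pow(a, b); }
};

// Illustrative launch for half-precision data (grid/block sizes are placeholders):
// elementwise_binary<__half, PowerOp<__half>><<<num_blocks, block_size>>>(a, b, c, n);
```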
8 changes: 3 additions & 5 deletions modules/nvidia_plugin/tests/unit/CMakeLists.txt
@@ -42,11 +42,9 @@ addIeTargetTest(
CUDA
)
set_source_files_properties(*.cu *.cuh PROPERTIES LANGUAGE CUDA)
if (${CUDAToolkit_VERSION_MAJOR} LESS_EQUAL "10")
set_property(TARGET ${TARGET_NAME} PROPERTY CUDA_ARCHITECTURES 30 35 50 60 72)
else()
set_property(TARGET ${TARGET_NAME} PROPERTY CUDA_ARCHITECTURES 35 50 60 72)
endif()

set_property(TARGET ${TARGET_NAME} PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES})

# unitTestUtils brings in USE_STATIC_IE element in INTERFACE_COMPILE_DEFINITIONS, which breaks
# INFERENCE_ENGINE_API_CLASS() and friends since we really link to ie dlls.
# there's no easy way to override it (something like "remove from definitions" property on client)
