Statically link all CUDA toolkit libraries (rapidsai#4881)

This PR ensures that cuML statically links all of the CUDA toolkit libraries (not just `cudart`) when a user enables `CUDA_STATIC_RUNTIME`.

Authors:
- Paul Taylor (https://github.com/trxcllnt)

Approvers:
- Dante Gama Dessavre (https://github.com/dantegd)

URL: rapidsai#4881
jakirkham · Sep 8, 2022 · 00a1351
1 parent 03cf2c7
commit 00a1351
Show file tree

Hide file tree

Showing 6 changed files with 74 additions and 53 deletions.
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -68,6 +68,7 @@ option(ENABLE_CUMLPRIMS_MG "Enable algorithms that use libcumlprims_mg" ON)
 option(NVTX "Enable nvtx markers" OFF)
 option(SINGLEGPU "Disable all mnmg components and comms libraries" OFF)
 option(USE_CCACHE "Cache build artifacts with ccache" OFF)
+option(CUDA_STATIC_RUNTIME "Statically link the CUDA toolkit runtime and libraries" OFF)
 option(CUML_USE_RAFT_STATIC "Build and statically link the RAFT libraries" OFF)
 option(CUML_USE_FAISS_STATIC "Build and statically link the FAISS library for nearest neighbors search on GPU" OFF)
 option(CUML_USE_TREELITE_STATIC "Build and statically link the treelite library" OFF)
@@ -91,6 +92,7 @@ message(VERBOSE "CUML_CPP: Enabling lineinfo in nvcc: ${CUDA_ENABLE_LINE_INFO}")
 message(VERBOSE "CUML_CPP: Enabling nvtx markers: ${NVTX}")
 message(VERBOSE "CUML_CPP: Disabling all mnmg components and comms libraries: ${SINGLEGPU}")
 message(VERBOSE "CUML_CPP: Cache build artifacts with ccache: ${USE_CCACHE}")
+message(VERBOSE "CUML_CPP: Statically link the CUDA toolkit runtime and libraries: ${CUDA_STATIC_RUNTIME}")
 message(VERBOSE "CUML_CPP: Build and statically link RAFT libraries: ${CUML_USE_RAFT_STATIC}")
 message(VERBOSE "CUML_CPP: Build and statically link FAISS library: ${CUML_USE_FAISS_STATIC}")
 message(VERBOSE "CUML_CPP: Build and statically link Treelite library: ${CUML_USE_TREELITE_STATIC}")
@@ -129,13 +131,29 @@ endif()
 ##############################################################################
 # - compiler options ---------------------------------------------------------
 
+set(_ctk_static_suffix "")
+if(CUDA_STATIC_RUNTIME)
+  # If we're statically linking CTK cuBLAS,
+  # we also want to statically link BLAS
+  set(BLA_STATIC ON)
+  set(_ctk_static_suffix "_static")
+  set(_ctk_static_suffix_cufft "_static_nocallback")
+  # Control legacy FindCUDA.cmake behavior too
+  # Remove this after we push it into rapids-cmake:
+  # https://github.com/rapidsai/rapids-cmake/pull/259
+  set(CUDA_USE_STATIC_CUDA_RUNTIME ON)
+endif()
+
 if (NOT DISABLE_OPENMP)
   find_package(OpenMP)
   if(OpenMP_FOUND)
     message(STATUS "CUML_CPP: OpenMP found in ${OPENMP_INCLUDE_DIRS}")
   endif()
 endif()
 
+# CUDA runtime
+rapids_cuda_init_runtime(USE_STATIC ${CUDA_STATIC_RUNTIME})
+
 # * find CUDAToolkit package
 # * determine GPU architectures
 # * enable the CMake CUDA language
@@ -522,7 +540,7 @@ if(BUILD_CUML_CPP_LIBRARY)
       $<$<BOOL:${CUML_USE_RAFT_NN}>:raft::nn>
       $<$<BOOL:${CUML_USE_RAFT_DIST}>:raft::distance>
     PRIVATE
-      $<$<BOOL:${LINK_CUFFT}>:CUDA::cufft>
+      $<$<BOOL:${LINK_CUFFT}>:CUDA::cufft${_ctk_static_suffix_cufft}>
       ${TREELITE_LIBS}
       $<$<BOOL:${treeshap_algo}>:GPUTreeShap::GPUTreeShap>
       ${OpenMP_CXX_LIB_NAMES}

diff --git a/cpp/examples/symreg/CMakeLists_standalone.txt b/cpp/examples/symreg/CMakeLists_standalone.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -30,4 +30,4 @@ add_executable(symreg_example symreg_example.cpp)
 set_target_properties(symreg_example PROPERTIES LINKER_LANGUAGE "CUDA")
 
 # Link cuml and cudart
-target_link_libraries(symreg_example cuml::cuml++ CUDA::cudart)
+target_link_libraries(symreg_example cuml::cuml++)
diff --git a/cpp/src/umap/knn_graph/algo.cuh b/cpp/src/umap/knn_graph/algo.cuh
@@ -48,13 +48,13 @@ void launcher(const raft::handle_t& handle,
 
 // Instantiation for dense inputs, int64_t indices
 template <>
-void launcher(const raft::handle_t& handle,
-              const ML::manifold_dense_inputs_t<float>& inputsA,
-              const ML::manifold_dense_inputs_t<float>& inputsB,
-              ML::knn_graph<int64_t, float>& out,
-              int n_neighbors,
-              const ML::UMAPParams* params,
-              cudaStream_t stream)
+inline void launcher(const raft::handle_t& handle,
+                     const ML::manifold_dense_inputs_t<float>& inputsA,
+                     const ML::manifold_dense_inputs_t<float>& inputsB,
+                     ML::knn_graph<int64_t, float>& out,
+                     int n_neighbors,
+                     const ML::UMAPParams* params,
+                     cudaStream_t stream)
 {
   std::vector<float*> ptrs(1);
   std::vector<int> sizes(1);
@@ -76,25 +76,25 @@ void launcher(const raft::handle_t& handle,
 
 // Instantiation for dense inputs, int indices
 template <>
-void launcher(const raft::handle_t& handle,
-              const ML::manifold_dense_inputs_t<float>& inputsA,
-              const ML::manifold_dense_inputs_t<float>& inputsB,
-              ML::knn_graph<int, float>& out,
-              int n_neighbors,
-              const ML::UMAPParams* params,
-              cudaStream_t stream)
+inline void launcher(const raft::handle_t& handle,
+                     const ML::manifold_dense_inputs_t<float>& inputsA,
+                     const ML::manifold_dense_inputs_t<float>& inputsB,
+                     ML::knn_graph<int, float>& out,
+                     int n_neighbors,
+                     const ML::UMAPParams* params,
+                     cudaStream_t stream)
 {
   throw raft::exception("Dense KNN doesn't yet support 32-bit integer indices");
 }
 
 template <>
-void launcher(const raft::handle_t& handle,
-              const ML::manifold_sparse_inputs_t<int, float>& inputsA,
-              const ML::manifold_sparse_inputs_t<int, float>& inputsB,
-              ML::knn_graph<int, float>& out,
-              int n_neighbors,
-              const ML::UMAPParams* params,
-              cudaStream_t stream)
+inline void launcher(const raft::handle_t& handle,
+                     const ML::manifold_sparse_inputs_t<int, float>& inputsA,
+                     const ML::manifold_sparse_inputs_t<int, float>& inputsB,
+                     ML::knn_graph<int, float>& out,
+                     int n_neighbors,
+                     const ML::UMAPParams* params,
+                     cudaStream_t stream)
 {
   raft::sparse::selection::brute_force_knn(inputsA.indptr,
                                            inputsA.indices,
@@ -119,39 +119,39 @@ void launcher(const raft::handle_t& handle,
 }
 
 template <>
-void launcher(const raft::handle_t& handle,
-              const ML::manifold_sparse_inputs_t<int64_t, float>& inputsA,
-              const ML::manifold_sparse_inputs_t<int64_t, float>& inputsB,
-              ML::knn_graph<int64_t, float>& out,
-              int n_neighbors,
-              const ML::UMAPParams* params,
-              cudaStream_t stream)
+inline void launcher(const raft::handle_t& handle,
+                     const ML::manifold_sparse_inputs_t<int64_t, float>& inputsA,
+                     const ML::manifold_sparse_inputs_t<int64_t, float>& inputsB,
+                     ML::knn_graph<int64_t, float>& out,
+                     int n_neighbors,
+                     const ML::UMAPParams* params,
+                     cudaStream_t stream)
 {
   throw raft::exception("Sparse KNN doesn't support 64-bit integer indices");
 }
 
 template <>
-void launcher(const raft::handle_t& handle,
-              const ML::manifold_precomputed_knn_inputs_t<int64_t, float>& inputsA,
-              const ML::manifold_precomputed_knn_inputs_t<int64_t, float>& inputsB,
-              ML::knn_graph<int64_t, float>& out,
-              int n_neighbors,
-              const ML::UMAPParams* params,
-              cudaStream_t stream)
+inline void launcher(const raft::handle_t& handle,
+                     const ML::manifold_precomputed_knn_inputs_t<int64_t, float>& inputsA,
+                     const ML::manifold_precomputed_knn_inputs_t<int64_t, float>& inputsB,
+                     ML::knn_graph<int64_t, float>& out,
+                     int n_neighbors,
+                     const ML::UMAPParams* params,
+                     cudaStream_t stream)
 {
   out.knn_indices = inputsA.knn_graph.knn_indices;
   out.knn_dists   = inputsA.knn_graph.knn_dists;
 }
 
 // Instantiation for precomputed inputs, int indices
 template <>
-void launcher(const raft::handle_t& handle,
-              const ML::manifold_precomputed_knn_inputs_t<int, float>& inputsA,
-              const ML::manifold_precomputed_knn_inputs_t<int, float>& inputsB,
-              ML::knn_graph<int, float>& out,
-              int n_neighbors,
-              const ML::UMAPParams* params,
-              cudaStream_t stream)
+inline void launcher(const raft::handle_t& handle,
+                     const ML::manifold_precomputed_knn_inputs_t<int, float>& inputsA,
+                     const ML::manifold_precomputed_knn_inputs_t<int, float>& inputsB,
+                     ML::knn_graph<int, float>& out,
+                     int n_neighbors,
+                     const ML::UMAPParams* params,
+                     cudaStream_t stream)
 {
   out.knn_indices = inputsA.knn_graph.knn_indices;
   out.knn_dists   = inputsA.knn_graph.knn_dists;

diff --git a/cpp/src/umap/optimize.cuh b/cpp/src/umap/optimize.cuh
@@ -169,7 +169,7 @@ void optimize_params(T* input,
   } while (tol_grads < 2 && num_iters < max_epochs);
 }
 
-void find_params_ab(UMAPParams* params, cudaStream_t stream)
+inline void find_params_ab(UMAPParams* params, cudaStream_t stream)
 {
   float spread   = params->spread;
   float min_dist = params->min_dist;

diff --git a/cpp/src/umap/runner.cuh b/cpp/src/umap/runner.cuh
@@ -86,7 +86,10 @@ __global__ void init_transform(int* indices,
  * a and b, which are based on min_dist and spread
  * parameters.
  */
-void find_ab(UMAPParams* params, cudaStream_t stream) { Optimize::find_params_ab(params, stream); }
+inline void find_ab(UMAPParams* params, cudaStream_t stream)
+{
+  Optimize::find_params_ab(params, stream);
+}
 
 template <typename value_idx, typename value_t, typename umap_inputs, int TPB_X>
 void _get_graph(const raft::handle_t& handle,

diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
@@ -36,12 +36,12 @@ function(ConfigureTest)
   PRIVATE
     ${CUML_CPP_TARGET}
     $<$<BOOL:BUILD_CUML_C_LIBRARY>:${CUML_C_TARGET}>
-    CUDA::cublas
-    CUDA::curand
-    CUDA::cusolver
-    CUDA::cudart
-    CUDA::cusparse
-    $<$<BOOL:${LINK_CUFFT}>:CUDA::cufft>
+    CUDA::cublas${_ctk_static_suffix}
+    CUDA::curand${_ctk_static_suffix}
+    CUDA::cusolver${_ctk_static_suffix}
+    CUDA::cudart${_ctk_static_suffix}
+    CUDA::cusparse${_ctk_static_suffix}
+    $<$<BOOL:${LINK_CUFFT}>:CUDA::cufft${_ctk_static_suffix_cufft}>
     rmm::rmm
     raft::raft
     $<$<BOOL:${CUML_USE_RAFT_NN}>:raft::nn>