From 867c84da3acdffe3d39d8720fea5e5e859f8d1be Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Mon, 27 Nov 2023 14:39:48 +0000
Subject: [PATCH 01/38] opensubdiv: drop the cudatoolkit.run file, and respect
 cudaFlags

(cherry picked from commit e5b174bedb0af5f8da2be151f241b6d0174f6bfb)
---
 .../libraries/opensubdiv/default.nix          | 30 +++++++++++++------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/pkgs/development/libraries/opensubdiv/default.nix b/pkgs/development/libraries/opensubdiv/default.nix
index b0ff4b528864b..b946d8ab718ae 100644
--- a/pkgs/development/libraries/opensubdiv/default.nix
+++ b/pkgs/development/libraries/opensubdiv/default.nix
@@ -1,9 +1,7 @@
 { config, lib, stdenv, fetchFromGitHub, cmake, pkg-config, xorg, libGLU
 , libGL, glew, ocl-icd, python3
-, cudaSupport ? config.cudaSupport, cudatoolkit
-  # For visibility mostly. The whole approach to cuda architectures and capabilities
-  # will be reworked soon.
-, cudaArch ? "compute_37"
+, cudaSupport ? config.cudaSupport
+, cudaPackages
 , openclSupport ? !cudaSupport
 , darwin
 }:
@@ -21,7 +19,11 @@ stdenv.mkDerivation rec {
 
   outputs = [ "out" "dev" ];
 
-  nativeBuildInputs = [ cmake pkg-config ];
+  nativeBuildInputs = [
+    cmake
+    pkg-config
+    cudaPackages.cuda_nvcc
+  ];
   buildInputs =
     [ libGLU libGL python3
       # FIXME: these are not actually needed, but the configure script wants them.
@@ -30,21 +32,31 @@ stdenv.mkDerivation rec {
     ]
     ++ lib.optional (openclSupport && !stdenv.isDarwin) ocl-icd
     ++ lib.optionals stdenv.isDarwin (with darwin.apple_sdk.frameworks; [OpenCL Cocoa CoreVideo IOKit AppKit AGL ])
-    ++ lib.optional cudaSupport cudatoolkit;
+    ++ lib.optional cudaSupport [
+      cudaPackages.cuda_cudart
+    ];
+
+  # It's important to set OSD_CUDA_NVCC_FLAGS,
+  # because otherwise OSD might piggyback unwanted architectures:
+  # https://github.com/PixarAnimationStudios/OpenSubdiv/blob/7d0ab5530feef693ac0a920585b5c663b80773b3/CMakeLists.txt#L602
+  preConfigure = lib.optionalString cudaSupport ''
+    cmakeFlagsArray+=(
+      -DOSD_CUDA_NVCC_FLAGS="${lib.concatStringsSep " " cudaPackages.cudaFlags.gencode}"
+    )
+  '';
 
   cmakeFlags =
     [ "-DNO_TUTORIALS=1"
       "-DNO_REGRESSION=1"
       "-DNO_EXAMPLES=1"
       "-DNO_METAL=1" # don’t have metal in apple sdk
+      (lib.cmakeBool "NO_OPENCL" (!openclSupport))
+      (lib.cmakeBool "NO_CUDA" (!cudaSupport))
     ] ++ lib.optionals (!stdenv.isDarwin) [
       "-DGLEW_INCLUDE_DIR=${glew.dev}/include"
       "-DGLEW_LIBRARY=${glew.dev}/lib"
     ] ++ lib.optionals cudaSupport [
-      "-DOSD_CUDA_NVCC_FLAGS=--gpu-architecture=${cudaArch}"
-      "-DCUDA_HOST_COMPILER=${cudatoolkit.cc}/bin/cc"
     ] ++ lib.optionals (!openclSupport) [
-      "-DNO_OPENCL=1"
     ];
 
   preBuild = let maxBuildCores = 16; in lib.optionalString cudaSupport ''

From 2d234b0c2c4f4343175158fc9ff404f1b0f4faaa Mon Sep 17 00:00:00 2001
From: Dmitry Kalinkin <dmitry.kalinkin@gmail.com>
Date: Fri, 8 Dec 2023 12:55:49 -0500
Subject: [PATCH 02/38] opensubdiv: use cuda_nvcc only if cudaSupport

(cherry picked from commit 3b2e32b5117e3da6fa3de1c8b67f9bf6660095b7)
---
 pkgs/development/libraries/opensubdiv/default.nix | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pkgs/development/libraries/opensubdiv/default.nix b/pkgs/development/libraries/opensubdiv/default.nix
index b946d8ab718ae..9c485949a5c09 100644
--- a/pkgs/development/libraries/opensubdiv/default.nix
+++ b/pkgs/development/libraries/opensubdiv/default.nix
@@ -22,6 +22,7 @@ stdenv.mkDerivation rec {
   nativeBuildInputs = [
     cmake
     pkg-config
+  ] ++ lib.optional cudaSupport [
     cudaPackages.cuda_nvcc
   ];
   buildInputs =

From c1ab852e6b37feba36a7fbd7ddae216a52f334c2 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Sat, 9 Dec 2023 00:53:08 +0000
Subject: [PATCH 03/38] suitesparse: migrate to redist cuda

also fix the incompatible gcc error by adding cuda_nvcc to
nativeBuildInputs (cudaPackages.cudatoolkit in buildInputs is
insufficient because that doesn't propagate the hook)

(cherry picked from commit 7c97d5f5c441cce6ad717a85676eec7c7ff1b27e)
---
 .../science/math/suitesparse/default.nix      | 21 +++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/pkgs/development/libraries/science/math/suitesparse/default.nix b/pkgs/development/libraries/science/math/suitesparse/default.nix
index dd2eb9478f7f6..abc2ff9a37012 100644
--- a/pkgs/development/libraries/science/math/suitesparse/default.nix
+++ b/pkgs/development/libraries/science/math/suitesparse/default.nix
@@ -8,7 +8,7 @@
 , mpfr
 , config
 , enableCuda ? config.cudaSupport
-, cudatoolkit
+, cudaPackages
 }:
 
 stdenv.mkDerivation rec {
@@ -25,7 +25,11 @@ stdenv.mkDerivation rec {
   };
 
   nativeBuildInputs = [
-  ] ++ lib.optional stdenv.isDarwin fixDarwinDylibNames;
+  ] ++ lib.optionals stdenv.isDarwin [
+    fixDarwinDylibNames
+  ] ++ lib.optionals enableCuda [
+    cudaPackages.cuda_nvcc
+  ];
 
   # Use compatible indexing for lapack and blas used
   buildInputs = assert (blas.isILP64 == lapack.isILP64); [
@@ -34,7 +38,12 @@ stdenv.mkDerivation rec {
     gfortran.cc.lib
     gmp
     mpfr
-  ] ++ lib.optional enableCuda cudatoolkit;
+  ] ++ lib.optionals enableCuda [
+    cudaPackages.cuda_cudart.dev
+    cudaPackages.cuda_cudart.lib
+    cudaPackages.libcublas.dev
+    cudaPackages.libcublas.lib
+  ];
 
   preConfigure = ''
     # Mongoose and GraphBLAS are packaged separately
@@ -49,9 +58,9 @@ stdenv.mkDerivation rec {
   ] ++ lib.optionals blas.isILP64 [
     "CFLAGS=-DBLAS64"
   ] ++ lib.optionals enableCuda [
-    "CUDA_PATH=${cudatoolkit}"
-    "CUDART_LIB=${cudatoolkit.lib}/lib/libcudart.so"
-    "CUBLAS_LIB=${cudatoolkit}/lib/libcublas.so"
+    "CUDA_PATH=${cudaPackages.cuda_nvcc}"
+    "CUDART_LIB=${cudaPackages.cuda_cudart.lib}/lib/libcudart.so"
+    "CUBLAS_LIB=${cudaPackages.libcublas.lib}/lib/libcublas.so"
   ] ++ lib.optionals stdenv.isDarwin [
     # Unless these are set, the build will attempt to use `Accelerate` on darwin, see:
     # https://github.com/DrTimothyAldenDavis/SuiteSparse/blob/v5.13.0/SuiteSparse_config/SuiteSparse_config.mk#L368

From 0551e854d94e6c466d46e084757ed1c068b78511 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Fri, 24 Nov 2023 13:47:03 +0000
Subject: [PATCH 04/38] cudaPackagesGoogle: init, a package-set for jax and tf

(cherry picked from commit 5bda2ec626a4107f8adc3a7e58c16c0594c40d2c)
---
 pkgs/development/python-modules/jaxlib/bin.nix   |  8 ++++----
 .../python-modules/jaxlib/default.nix            |  4 ++--
 .../python-modules/tensorflow/bin.nix            |  6 +++---
 .../python-modules/tensorflow/default.nix        | 16 ++++++++--------
 pkgs/top-level/all-packages.nix                  |  4 ++++
 pkgs/top-level/python-packages.nix               |  1 -
 6 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/pkgs/development/python-modules/jaxlib/bin.nix b/pkgs/development/python-modules/jaxlib/bin.nix
index d80cbc2a60183..e35b4759bd64f 100644
--- a/pkgs/development/python-modules/jaxlib/bin.nix
+++ b/pkgs/development/python-modules/jaxlib/bin.nix
@@ -29,11 +29,11 @@
 , stdenv
   # Options:
 , cudaSupport ? config.cudaSupport
-, cudaPackages ? {}
+, cudaPackagesGoogle
 }:
 
 let
-  inherit (cudaPackages) cudatoolkit cudnn;
+  inherit (cudaPackagesGoogle) cudatoolkit cudnn;
 
   version = "0.4.20";
 
@@ -210,8 +210,8 @@ buildPythonPackage {
     maintainers = with maintainers; [ samuela ];
     platforms = [ "aarch64-darwin" "x86_64-linux" "x86_64-darwin" ];
     broken =
-      !(cudaSupport -> (cudaPackages ? cudatoolkit) && lib.versionAtLeast cudatoolkit.version "11.1")
-      || !(cudaSupport -> (cudaPackages ? cudnn) && lib.versionAtLeast cudnn.version "8.2")
+      !(cudaSupport -> (cudaPackagesGoogle ? cudatoolkit) && lib.versionAtLeast cudatoolkit.version "11.1")
+      || !(cudaSupport -> (cudaPackagesGoogle ? cudnn) && lib.versionAtLeast cudnn.version "8.2")
       || !(cudaSupport -> stdenv.isLinux);
   };
 }
diff --git a/pkgs/development/python-modules/jaxlib/default.nix b/pkgs/development/python-modules/jaxlib/default.nix
index c70ab0ac2b327..a04d6973ca4be 100644
--- a/pkgs/development/python-modules/jaxlib/default.nix
+++ b/pkgs/development/python-modules/jaxlib/default.nix
@@ -44,14 +44,14 @@
 , config
   # CUDA flags:
 , cudaSupport ? config.cudaSupport
-, cudaPackages ? {}
+, cudaPackagesGoogle
 
   # MKL:
 , mklSupport ? true
 }:
 
 let
-  inherit (cudaPackages) backendStdenv cudatoolkit cudaFlags cudnn nccl;
+  inherit (cudaPackagesGoogle) backendStdenv cudatoolkit cudaFlags cudnn nccl;
 
   pname = "jaxlib";
   version = "0.4.20";
diff --git a/pkgs/development/python-modules/tensorflow/bin.nix b/pkgs/development/python-modules/tensorflow/bin.nix
index dae6816a906c3..ac5bb7edf1a54 100644
--- a/pkgs/development/python-modules/tensorflow/bin.nix
+++ b/pkgs/development/python-modules/tensorflow/bin.nix
@@ -22,7 +22,7 @@
 , tensorboard
 , config
 , cudaSupport ? config.cudaSupport
-, cudaPackages ? {}
+, cudaPackagesGoogle
 , zlib
 , python
 , keras-applications
@@ -43,7 +43,7 @@ assert ! (stdenv.isDarwin && cudaSupport);
 
 let
   packages = import ./binary-hashes.nix;
-  inherit (cudaPackages) cudatoolkit cudnn;
+  inherit (cudaPackagesGoogle) cudatoolkit cudnn;
 in buildPythonPackage {
   pname = "tensorflow" + lib.optionalString cudaSupport "-gpu";
   inherit (packages) version;
@@ -198,7 +198,7 @@ in buildPythonPackage {
   ];
 
   passthru = {
-    inherit cudaPackages;
+    cudaPackages = cudaPackagesGoogle;
   };
 
   meta = with lib; {
diff --git a/pkgs/development/python-modules/tensorflow/default.nix b/pkgs/development/python-modules/tensorflow/default.nix
index c8e292e316744..be8b26f3d0e99 100644
--- a/pkgs/development/python-modules/tensorflow/default.nix
+++ b/pkgs/development/python-modules/tensorflow/default.nix
@@ -19,8 +19,8 @@
 # https://groups.google.com/a/tensorflow.org/forum/#!topic/developers/iRCt5m4qUz0
 , config
 , cudaSupport ? config.cudaSupport
-, cudaPackages ? { }
-, cudaCapabilities ? cudaPackages.cudaFlags.cudaCapabilities
+, cudaPackagesGoogle
+, cudaCapabilities ? cudaPackagesGoogle.cudaFlags.cudaCapabilities
 , mklSupport ? false, mkl
 , tensorboardSupport ? true
 # XLA without CUDA is broken
@@ -50,15 +50,15 @@ let
   # __ZN4llvm11SmallPtrSetIPKNS_10AllocaInstELj8EED1Ev in any of the
   # translation units, so the build fails at link time
   stdenv =
-    if cudaSupport then cudaPackages.backendStdenv
+    if cudaSupport then cudaPackagesGoogle.backendStdenv
     else if originalStdenv.isDarwin then llvmPackages_11.stdenv
     else originalStdenv;
-  inherit (cudaPackages) cudatoolkit nccl;
+  inherit (cudaPackagesGoogle) cudatoolkit nccl;
   # use compatible cuDNN (https://www.tensorflow.org/install/source#gpu)
   # cudaPackages.cudnn led to this:
   # https://github.com/tensorflow/tensorflow/issues/60398
   cudnnAttribute = "cudnn_8_6";
-  cudnn = cudaPackages.${cudnnAttribute};
+  cudnn = cudaPackagesGoogle.${cudnnAttribute};
   gentoo-patches = fetchzip {
     url = "https://dev.gentoo.org/~perfinion/patches/tensorflow-patches-2.12.0.tar.bz2";
     hash = "sha256-SCRX/5/zML7LmKEPJkcM5Tebez9vv/gmE4xhT/jyqWs=";
@@ -486,8 +486,8 @@ let
       broken =
         stdenv.isDarwin
         || !(xlaSupport -> cudaSupport)
-        || !(cudaSupport -> builtins.hasAttr cudnnAttribute cudaPackages)
-        || !(cudaSupport -> cudaPackages ? cudatoolkit);
+        || !(cudaSupport -> builtins.hasAttr cudnnAttribute cudaPackagesGoogle)
+        || !(cudaSupport -> cudaPackagesGoogle ? cudatoolkit);
     } // lib.optionalAttrs stdenv.isDarwin {
       timeout = 86400; # 24 hours
       maxSilent = 14400; # 4h, double the default of 7200s
@@ -590,7 +590,7 @@ in buildPythonPackage {
   # Regression test for #77626 removed because not more `tensorflow.contrib`.
 
   passthru = {
-    inherit cudaPackages;
+    cudaPackages = cudaPackagesGoogle;
     deps = bazel-build.deps;
     libtensorflow = bazel-build.out;
   };
diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix
index d908c937e50f3..4aef78c94d437 100644
--- a/pkgs/top-level/all-packages.nix
+++ b/pkgs/top-level/all-packages.nix
@@ -7324,6 +7324,10 @@ with pkgs;
   cudaPackages_12_2 = callPackage ./cuda-packages.nix { cudaVersion = "12.2"; };
   cudaPackages_12 = cudaPackages_12_0;
 
+  # Use the older cudaPackages for tensorflow and jax, as determined by cudnn
+  # compatibility: https://www.tensorflow.org/install/source#gpu
+  cudaPackagesGoogle = cudaPackages_11;
+
   # TODO: try upgrading once there is a cuDNN release supporting CUDA 12. No
   # such cuDNN release as of 2023-01-10.
   cudaPackages = recurseIntoAttrs cudaPackages_11;
diff --git a/pkgs/top-level/python-packages.nix b/pkgs/top-level/python-packages.nix
index 2cc25bf403520..5e59a1fb3942e 100644
--- a/pkgs/top-level/python-packages.nix
+++ b/pkgs/top-level/python-packages.nix
@@ -13828,7 +13828,6 @@ self: super: with self; {
   callPackage ../development/python-modules/tensorflow {
     inherit (pkgs.darwin) cctools;
     inherit (pkgs.config) cudaSupport;
-    inherit (self.tensorflow-bin) cudaPackages;
     inherit (pkgs.darwin.apple_sdk.frameworks) Foundation Security;
     flatbuffers-core = pkgs.flatbuffers;
     flatbuffers-python = self.flatbuffers;

From b774fda5b01d4e25c81e91f1df9447db4356c404 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Fri, 24 Nov 2023 22:46:28 +0000
Subject: [PATCH 05/38] tensorrt: dont break eval for unrelated packages

(cherry picked from commit 3ee37e4356ef08d0a4b872e76031983288e40a80)
---
 .../science/math/tensorrt/extension.nix       | 28 +++++++++++++++----
 .../science/math/tensorrt/generic.nix         | 15 ++++++----
 pkgs/top-level/python-packages.nix            |  2 +-
 3 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/pkgs/development/libraries/science/math/tensorrt/extension.nix b/pkgs/development/libraries/science/math/tensorrt/extension.nix
index c6596dbaacde1..5ffa3910f1e5c 100644
--- a/pkgs/development/libraries/science/math/tensorrt/extension.nix
+++ b/pkgs/development/libraries/science/math/tensorrt/extension.nix
@@ -17,16 +17,32 @@ final: prev: let
     isSupported = fileData: elem cudaVersion fileData.supportedCudaVersions;
     # Return the first file that is supported. In practice there should only ever be one anyway.
     supportedFile = files: findFirst isSupported null files;
-    # Supported versions with versions as keys and file as value
-    supportedVersions = filterAttrs (version: file: file !=null ) (mapAttrs (version: files: supportedFile files) tensorRTVersions);
+
     # Compute versioned attribute name to be used in this package set
     computeName = version: "tensorrt_${toUnderscore version}";
+
+    # Supported versions with versions as keys and file as value
+    supportedVersions = lib.recursiveUpdate
+      {
+        tensorrt = {
+          enable = false;
+          fileVersionCuda = null;
+          fileVersionCudnn = null;
+          fullVersion = "0.0.0";
+          sha256 = null;
+          tarball = null;
+          supportedCudaVersions = [ ];
+        };
+      }
+      (mapAttrs' (version: attrs: nameValuePair (computeName version) attrs)
+        (filterAttrs (version: file: file != null) (mapAttrs (version: files: supportedFile files) tensorRTVersions)));
+
     # Add all supported builds as attributes
-    allBuilds = mapAttrs' (version: file: nameValuePair (computeName version) (buildTensorRTPackage (removeAttrs file ["fileVersionCuda"]))) supportedVersions;
+    allBuilds = mapAttrs (name: file: buildTensorRTPackage (removeAttrs file ["fileVersionCuda"])) supportedVersions;
+
     # Set the default attributes, e.g. tensorrt = tensorrt_8_4;
-    defaultBuild = { "tensorrt" = if allBuilds ? ${computeName tensorRTDefaultVersion}
-      then allBuilds.${computeName tensorRTDefaultVersion}
-      else throw "tensorrt-${tensorRTDefaultVersion} does not support your cuda version ${cudaVersion}"; };
+    defaultName = computeName tensorRTDefaultVersion;
+    defaultBuild = lib.optionalAttrs (allBuilds ? ${defaultName}) { tensorrt = allBuilds.${computeName tensorRTDefaultVersion}; };
   in {
     inherit buildTensorRTPackage;
   } // allBuilds // defaultBuild;
diff --git a/pkgs/development/libraries/science/math/tensorrt/generic.nix b/pkgs/development/libraries/science/math/tensorrt/generic.nix
index 165c6f356da89..2bcdd8e588cf0 100644
--- a/pkgs/development/libraries/science/math/tensorrt/generic.nix
+++ b/pkgs/development/libraries/science/math/tensorrt/generic.nix
@@ -8,20 +8,22 @@
 , cudnn
 }:
 
-{ fullVersion
+{ enable ? true
+, fullVersion
 , fileVersionCudnn ? null
 , tarball
 , sha256
 , supportedCudaVersions ? [ ]
 }:
 
-assert fileVersionCudnn == null || lib.assertMsg (lib.strings.versionAtLeast cudnn.version fileVersionCudnn)
+assert !enable || fileVersionCudnn == null || lib.assertMsg (lib.strings.versionAtLeast cudnn.version fileVersionCudnn)
   "This version of TensorRT requires at least cuDNN ${fileVersionCudnn} (current version is ${cudnn.version})";
 
 backendStdenv.mkDerivation rec {
   pname = "cudatoolkit-${cudatoolkit.majorVersion}-tensorrt";
   version = fullVersion;
-  src = requireFile rec {
+  src = if !enable then null else
+  requireFile rec {
     name = tarball;
     inherit sha256;
     message = ''
@@ -38,13 +40,13 @@ backendStdenv.mkDerivation rec {
 
   outputs = [ "out" "dev" ];
 
-  nativeBuildInputs = [
+  nativeBuildInputs = lib.optionals enable [
     autoPatchelfHook
     autoAddOpenGLRunpathHook
   ];
 
   # Used by autoPatchelfHook
-  buildInputs = [
+  buildInputs = lib.optionals enable [
     backendStdenv.cc.cc.lib # libstdc++
     cudatoolkit
     cudnn
@@ -75,6 +77,7 @@ backendStdenv.mkDerivation rec {
     '';
 
   passthru.stdenv = backendStdenv;
+  passthru.enable = enable;
 
   meta = with lib; {
     # Check that the cudatoolkit version satisfies our min/max constraints (both
@@ -82,7 +85,7 @@ backendStdenv.mkDerivation rec {
     # official version constraints (as recorded in default.nix). In some cases
     # you _may_ be able to smudge version constraints, just know that you're
     # embarking into unknown and unsupported territory when doing so.
-    broken = !(elem cudaVersion supportedCudaVersions);
+    broken = !enable || !(elem cudaVersion supportedCudaVersions);
     description = "TensorRT: a high-performance deep learning interface";
     homepage = "https://developer.nvidia.com/tensorrt";
     license = licenses.unfree;
diff --git a/pkgs/top-level/python-packages.nix b/pkgs/top-level/python-packages.nix
index 5e59a1fb3942e..ef6bc8dfacba6 100644
--- a/pkgs/top-level/python-packages.nix
+++ b/pkgs/top-level/python-packages.nix
@@ -13860,7 +13860,7 @@ self: super: with self; {
 
   tensorly = callPackage ../development/python-modules/tensorly { };
 
-  tensorrt = callPackage ../development/python-modules/tensorrt { };
+  tensorrt = callPackage ../development/python-modules/tensorrt { cudaPackages = pkgs.cudaPackages_11; };
 
   tensorstore = callPackage ../development/python-modules/tensorstore { };
 

From 32b15d38aa669d8362d746a699b90846f295b712 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Mon, 27 Nov 2023 14:46:41 +0000
Subject: [PATCH 06/38] blender: drop cudatoolkit.runfile

(cherry picked from commit 238138417344ba277e80ed451ce5f80f47dd86f2)
---
 pkgs/applications/misc/blender/default.nix | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/pkgs/applications/misc/blender/default.nix b/pkgs/applications/misc/blender/default.nix
index 24ea7287160b7..bdfd867f55675 100644
--- a/pkgs/applications/misc/blender/default.nix
+++ b/pkgs/applications/misc/blender/default.nix
@@ -52,7 +52,10 @@ stdenv.mkDerivation (finalAttrs: rec {
   nativeBuildInputs =
     [ cmake makeWrapper python310Packages.wrapPython llvmPackages.llvm.dev
     ]
-    ++ lib.optionals cudaSupport [ addOpenGLRunpath ]
+    ++ lib.optionals cudaSupport [
+      addOpenGLRunpath
+      cudaPackages.cuda_nvcc
+    ]
     ++ lib.optionals waylandSupport [ pkg-config ];
   buildInputs =
     [ boost ffmpeg gettext glew ilmbase
@@ -87,7 +90,7 @@ stdenv.mkDerivation (finalAttrs: rec {
       llvmPackages.openmp SDL Cocoa CoreGraphics ForceFeedback OpenAL OpenGL
     ])
     ++ lib.optional jackaudioSupport libjack2
-    ++ lib.optional cudaSupport cudaPackages.cudatoolkit
+    ++ lib.optionals cudaSupport [ cudaPackages.cuda_cudart ]
     ++ lib.optional colladaSupport opencollada
     ++ lib.optional spaceNavSupport libspnav;
   pythonPath = with python310Packages; [ numpy requests zstandard ];

From 65f38d5fdfcd99aad7536b59901c7768d3e8a997 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Mon, 27 Nov 2023 15:19:17 +0000
Subject: [PATCH 07/38] catboost: downgrade to cudaPackages_11 because of
 unsupported architectures (compute_35)

(cherry picked from commit 5c2a368f87f0f3b0585255aaa5a2d1547da4c29c)
---
 pkgs/top-level/all-packages.nix | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix
index 4aef78c94d437..cd0a3eb17de16 100644
--- a/pkgs/top-level/all-packages.nix
+++ b/pkgs/top-level/all-packages.nix
@@ -20806,6 +20806,9 @@ with pkgs;
     # catboost requires clang 12+ for build
     # after bumping the default version of llvm, check for compatibility with the cuda backend and pin it.
     inherit (llvmPackages_12) stdenv;
+
+    # https://github.com/catboost/catboost/issues/2540
+    cudaPackages = cudaPackages_11;
   };
 
   ndn-cxx = callPackage ../development/libraries/ndn-cxx { };

From 1c3e40cd302af5622c3323e60496944e02545d11 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Mon, 27 Nov 2023 15:42:46 +0000
Subject: [PATCH 08/38] ctranslate2: fix the cuda 12 build

(cherry picked from commit 361d7da37f7bb945a026cb15dcb5ec548bf87e95)
---
 pkgs/development/libraries/ctranslate2/default.nix | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pkgs/development/libraries/ctranslate2/default.nix b/pkgs/development/libraries/ctranslate2/default.nix
index 722672d3a46b6..fa812432bd7ce 100644
--- a/pkgs/development/libraries/ctranslate2/default.nix
+++ b/pkgs/development/libraries/ctranslate2/default.nix
@@ -57,6 +57,7 @@ stdenv.mkDerivation rec {
   buildInputs = lib.optionals withMkl [
     mkl
   ] ++ lib.optionals withCUDA [
+    cudaPackages.cuda_cccl # <nv/target> required by the fp16 headers in cudart
     cudaPackages.cuda_cudart
     cudaPackages.libcublas
     cudaPackages.libcurand

From 7532a0663fde1c5a9065e16af1c76f2f06b08566 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Mon, 27 Nov 2023 17:35:08 +0000
Subject: [PATCH 09/38] cudaPackages.cuda_nvcc: fix (getExe cuda_nvcc)

(cherry picked from commit 6c63202052bb23151ca77b613357c790c5542e42)
---
 pkgs/development/compilers/cudatoolkit/redist/overrides.nix | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pkgs/development/compilers/cudatoolkit/redist/overrides.nix b/pkgs/development/compilers/cudatoolkit/redist/overrides.nix
index a0ac0b0fcb1fb..5cc560fa0fd22 100644
--- a/pkgs/development/compilers/cudatoolkit/redist/overrides.nix
+++ b/pkgs/development/compilers/cudatoolkit/redist/overrides.nix
@@ -51,7 +51,7 @@ in
     ]
   );
 
-  cuda_nvcc = prev.cuda_nvcc.overrideAttrs (_: {
+  cuda_nvcc = prev.cuda_nvcc.overrideAttrs (oldAttrs: {
     # Required by cmake's enable_language(CUDA) to build a test program
     # When implementing cross-compilation support: this is
     # final.pkgs.targetPackages.cudaPackages.cuda_cudart
@@ -82,6 +82,10 @@ in
     depsTargetTargetPropagated = [
       final.setupCudaHook
     ];
+
+    meta = (oldAttrs.meta or { }) // {
+      mainProgram = "nvcc";
+    };
   });
 
   cuda_nvprof = prev.cuda_nvprof.overrideAttrs (oldAttrs: {

From ea94f71f50c2311ce9b9dd41ce4e12792b95834c Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Mon, 27 Nov 2023 17:36:26 +0000
Subject: [PATCH 10/38] cudaPackages_12.cutensor: init and fix

(cherry picked from commit 0dc161b2f806aa3d9dd7bee4f3dbb5289bc98eeb)
---
 .../science/math/cutensor/generic.nix         | 29 ++++++++++++++-----
 pkgs/top-level/cuda-packages.nix              | 15 +++++++++-
 2 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/pkgs/development/libraries/science/math/cutensor/generic.nix b/pkgs/development/libraries/science/math/cutensor/generic.nix
index c957fcdd99d4e..02fe13851620b 100644
--- a/pkgs/development/libraries/science/math/cutensor/generic.nix
+++ b/pkgs/development/libraries/science/math/cutensor/generic.nix
@@ -1,7 +1,11 @@
 { stdenv
 , lib
 , libPath
+, cuda_cudart
+, cudaMajorVersion
+, cuda_nvcc
 , cudatoolkit
+, libcublas
 , fetchurl
 , autoPatchelfHook
 , addOpenGLRunpath
@@ -17,7 +21,7 @@ let
 in
 
 stdenv.mkDerivation {
-  pname = "cudatoolkit-${cudatoolkit.majorVersion}-cutensor";
+  pname = "cutensor-cu${cudaMajorVersion}";
   inherit version;
 
   src = fetchurl {
@@ -32,20 +36,27 @@ stdenv.mkDerivation {
   nativeBuildInputs = [
     autoPatchelfHook
     addOpenGLRunpath
+    cuda_nvcc
   ];
 
   buildInputs = [
     stdenv.cc.cc.lib
-  ];
-
-  propagatedBuildInputs = [
-    cudatoolkit
+    cuda_cudart
+    libcublas
   ];
 
   # Set RUNPATH so that libcuda in /run/opengl-driver(-32)/lib can be found.
   # See the explanation in addOpenGLRunpath.
   installPhase = ''
     mkdir -p "$out" "$dev"
+
+    if [[ ! -d "${libPath}" ]] ; then
+      echo "Cutensor: ${libPath} does not exist, only found:" >&2
+      find "$(dirname ${libPath})"/ -maxdepth 1 >&2
+      echo "This cutensor release might not support your cudatoolkit version" >&2
+      exit 1
+    fi
+
     mv include "$dev"
     mv ${libPath} "$out/lib"
 
@@ -58,7 +69,7 @@ stdenv.mkDerivation {
   '';
 
   passthru = {
-    inherit cudatoolkit;
+    cudatoolkit = lib.warn "cutensor.passthru: cudaPackages.cudatoolkit is deprecated" cudatoolkit;
     majorVersion = lib.versions.major version;
   };
 
@@ -66,7 +77,11 @@ stdenv.mkDerivation {
     description = "cuTENSOR: A High-Performance CUDA Library For Tensor Primitives";
     homepage = "https://developer.nvidia.com/cutensor";
     sourceProvenance = with sourceTypes; [ binaryNativeCode ];
-    license = licenses.unfree;
+    license = licenses.unfreeRedistributable // {
+      shortName = "cuTENSOR EULA";
+      name = "cuTENSOR SUPPLEMENT TO SOFTWARE LICENSE AGREEMENT FOR NVIDIA SOFTWARE DEVELOPMENT KITS";
+      url = "https://docs.nvidia.com/cuda/cutensor/license.html";
+    };
     platforms = [ "x86_64-linux" ];
     maintainers = with maintainers; [ obsidian-systems-maintenance ];
   };
diff --git a/pkgs/top-level/cuda-packages.nix b/pkgs/top-level/cuda-packages.nix
index a2f49a98ccd53..3912422785bc4 100644
--- a/pkgs/top-level/cuda-packages.nix
+++ b/pkgs/top-level/cuda-packages.nix
@@ -24,6 +24,7 @@ let
 
     buildCuTensorPackage = final.callPackage ../development/libraries/science/math/cutensor/generic.nix;
 
+    # FIXME: Include non-x86_64 platforms
     cuTensorVersions = {
       "1.2.2.5" = {
         hash = "sha256-lU7iK4DWuC/U3s1Ct/rq2Gr3w4F2U7RYYgpmF05bibY=";
@@ -31,12 +32,24 @@ let
       "1.5.0.3" = {
         hash = "sha256-T96+lPC6OTOkIs/z3QWg73oYVSyidN0SVkBWmT9VRx0=";
       };
+      "2.0.0.7" = {
+        hash = "sha256-32M4rtGOW2rgxJUhBT0WBtKkHhh9f17M+RgK9rvE72g=";
+      };
     };
 
     inherit (final) cudaMajorMinorVersion cudaMajorVersion;
 
+    cudaToCutensor = {
+      "10" = "1.2.25";
+      "11" = "1.5.0.3";
+      "12" = "2.0.0.7";
+    };
+
+    versionNewer = lib.flip lib.versionOlder;
+    latestVersion = (builtins.head (lib.sort versionNewer (builtins.attrNames cuTensorVersions)));
+
     cutensor = buildCuTensorPackage rec {
-      version = if cudaMajorMinorVersion == "10.1" then "1.2.2.5" else "1.5.0.3";
+      version = cudaToCutensor.${cudaMajorVersion} or latestVersion;
       inherit (cuTensorVersions.${version}) hash;
       # This can go into generic.nix
       libPath = "lib/${if cudaMajorVersion == "10" then cudaMajorMinorVersion else cudaMajorVersion}";

From 10a73aa0c74b3add81eb0cf6d847788bdb105711 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Mon, 27 Nov 2023 17:36:57 +0000
Subject: [PATCH 11/38] python3Packages.cupy: fix (use older cutensor)

(cherry picked from commit ee108108fcbe21999ecc1f36ac730c15376dc824)
---
 .../python-modules/cupy/default.nix           | 46 +++++++++++++++----
 pkgs/top-level/python-packages.nix            |  3 +-
 2 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/pkgs/development/python-modules/cupy/default.nix b/pkgs/development/python-modules/cupy/default.nix
index e5de149fca14a..71defbb99b985 100644
--- a/pkgs/development/python-modules/cupy/default.nix
+++ b/pkgs/development/python-modules/cupy/default.nix
@@ -11,11 +11,34 @@
 , cudaPackages
 , addOpenGLRunpath
 , pythonOlder
+, symlinkJoin
 }:
 
 let
-  inherit (cudaPackages) cudatoolkit cudnn cutensor nccl;
-in buildPythonPackage rec {
+  inherit (cudaPackages) cudnn cutensor nccl;
+  cudatoolkit-joined = symlinkJoin {
+    name = "cudatoolkit-joined-${cudaPackages.cudaVersion}";
+    paths = with cudaPackages; [
+      cuda_cccl # <nv/target>
+      cuda_cccl.dev
+      cuda_cudart
+      cuda_nvcc.dev # <crt/host_defines.h>
+      cuda_nvprof
+      cuda_nvrtc
+      cuda_nvtx
+      cuda_profiler_api
+      libcublas
+      libcufft
+      libcurand
+      libcusolver
+      libcusparse
+
+      # Missing:
+      # cusparselt
+    ];
+  };
+in
+buildPythonPackage rec {
   pname = "cupy";
   version = "12.2.0";
 
@@ -32,27 +55,32 @@ in buildPythonPackage rec {
   # very short builds and a few extremely long ones, so setting both ends up
   # working nicely in practice.
   preConfigure = ''
-    export CUDA_PATH=${cudatoolkit}
     export CUPY_NUM_BUILD_JOBS="$NIX_BUILD_CORES"
     export CUPY_NUM_NVCC_THREADS="$NIX_BUILD_CORES"
   '';
 
   nativeBuildInputs = [
+    setuptools
+    wheel
     addOpenGLRunpath
     cython
+    cudaPackages.cuda_nvcc
   ];
 
-  LDFLAGS = "-L${cudatoolkit}/lib/stubs";
-
-  propagatedBuildInputs = [
-    cudatoolkit
+  buildInputs = [
+    cudatoolkit-joined
     cudnn
     cutensor
     nccl
+  ];
+
+  NVCC = "${lib.getExe cudaPackages.cuda_nvcc}"; # FIXME: splicing/buildPackages
+  CUDA_PATH = "${cudatoolkit-joined}";
+  LDFLAGS = "-L${cudaPackages.cuda_cudart}/lib/stubs";
+
+  propagatedBuildInputs = [
     fastrlock
     numpy
-    setuptools
-    wheel
   ];
 
   nativeCheckInputs = [
diff --git a/pkgs/top-level/python-packages.nix b/pkgs/top-level/python-packages.nix
index ef6bc8dfacba6..326f59a8a08a1 100644
--- a/pkgs/top-level/python-packages.nix
+++ b/pkgs/top-level/python-packages.nix
@@ -2447,7 +2447,8 @@ self: super: with self; {
 
   cufflinks = callPackage ../development/python-modules/cufflinks { };
 
-  cupy = callPackage ../development/python-modules/cupy { };
+  # cupy 12.2.0 possibly incompatible with cutensor 2.0 that comes with cudaPackages_12
+  cupy = callPackage ../development/python-modules/cupy { cudaPackages = pkgs.cudaPackages_11; };
 
   curio = callPackage ../development/python-modules/curio { };
 

From 525df0e1acd0c2d7b042f4c53a268c0933d6967c Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Mon, 27 Nov 2023 17:50:09 +0000
Subject: [PATCH 12/38] ucx: fix the cudaPackages_12 variant; drop the
 cudatoolkit runfile dependency

(cherry picked from commit 58819d631edc8ffa5658ef025e1b1c04903cc43f)
---
 pkgs/development/libraries/ucx/default.nix | 32 ++++++++++++++--------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/pkgs/development/libraries/ucx/default.nix b/pkgs/development/libraries/ucx/default.nix
index 627cac56bb737..3b923d8efdd2a 100644
--- a/pkgs/development/libraries/ucx/default.nix
+++ b/pkgs/development/libraries/ucx/default.nix
@@ -2,18 +2,12 @@
 , rdma-core, libbfd, libiberty, perl, zlib, symlinkJoin, pkg-config
 , config
 , enableCuda ? config.cudaSupport
-, cudatoolkit
+, cudaPackages
 , enableRocm ? config.rocmSupport
 , rocmPackages
 }:
 
 let
-  # Needed for configure to find all libraries
-  cudatoolkit' = symlinkJoin {
-    inherit (cudatoolkit) name meta;
-    paths = [ cudatoolkit cudatoolkit.lib ];
-  };
-
   rocmList = with rocmPackages; [ rocm-core rocm-runtime rocm-device-libs clr ];
 
   rocm = symlinkJoin {
@@ -35,7 +29,15 @@ stdenv.mkDerivation rec {
 
   outputs = [ "out" "doc" "dev" ];
 
-  nativeBuildInputs = [ autoreconfHook doxygen pkg-config ];
+  nativeBuildInputs = [
+    autoreconfHook
+    doxygen
+    pkg-config
+  ]
+  ++ lib.optionals enableCuda [
+    cudaPackages.cuda_nvcc
+    cudaPackages.autoAddOpenGLRunpathHook
+  ];
 
   buildInputs = [
     libbfd
@@ -44,8 +46,16 @@ stdenv.mkDerivation rec {
     perl
     rdma-core
     zlib
-  ] ++ lib.optional enableCuda cudatoolkit
-  ++ lib.optionals enableRocm rocmList;
+  ] ++ lib.optionals enableCuda [
+    cudaPackages.cuda_cudart
+    cudaPackages.cuda_nvml_dev
+
+  ] ++ lib.optionals enableRocm rocmList;
+
+  LDFLAGS = lib.optionals enableCuda [
+    # Fake libnvidia-ml.so (the real one is deployed impurely)
+    "-L${cudaPackages.cuda_nvml_dev}/lib/stubs"
+  ];
 
   configureFlags = [
     "--with-rdmacm=${lib.getDev rdma-core}"
@@ -53,7 +63,7 @@ stdenv.mkDerivation rec {
     "--with-rc"
     "--with-dm"
     "--with-verbs=${lib.getDev rdma-core}"
-  ] ++ lib.optional enableCuda "--with-cuda=${cudatoolkit'}"
+  ] ++ lib.optionals enableCuda [ "--with-cuda=${cudaPackages.cuda_cudart}" ]
   ++ lib.optional enableRocm "--with-rocm=${rocm}";
 
   postInstall = ''

From 9bf80c5f21c9999811ab4306b9bb1fff713a6ab5 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Tue, 28 Nov 2023 00:12:07 +0000
Subject: [PATCH 13/38] gromacs: drop cudatoolkit.run

(cherry picked from commit 31f1b517cdea429dc7a6c212ca650ea184c75a93)
---
 .../molecular-dynamics/gromacs/default.nix    | 34 ++++++++++++++++---
 pkgs/development/libraries/hwloc/default.nix  |  5 +--
 pkgs/top-level/all-packages.nix               |  1 -
 3 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/pkgs/applications/science/molecular-dynamics/gromacs/default.nix b/pkgs/applications/science/molecular-dynamics/gromacs/default.nix
index 2ca47d812bbfe..429376b72d912 100644
--- a/pkgs/applications/science/molecular-dynamics/gromacs/default.nix
+++ b/pkgs/applications/science/molecular-dynamics/gromacs/default.nix
@@ -1,4 +1,14 @@
-{ lib, stdenv, fetchurl, cmake, hwloc, fftw, perl, blas, lapack, mpi, cudatoolkit
+{ lib
+, stdenv
+, fetchurl
+, cmake
+, hwloc
+, fftw
+, perl
+, blas
+, lapack
+, mpi
+, cudaPackages
 , singlePrec ? true
 , config
 , enableMpi ? false
@@ -7,6 +17,8 @@
 }:
 
 let
+  inherit (cudaPackages.cudaFlags) cudaCapabilities dropDot;
+
   # Select reasonable defaults for all major platforms
   # The possible values are defined in CMakeLists.txt:
   # AUTO None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256
@@ -31,7 +43,9 @@ in stdenv.mkDerivation rec {
 
   outputs = [ "out" "dev" "man" ];
 
-  nativeBuildInputs = [ cmake ];
+  nativeBuildInputs =
+    [ cmake ]
+    ++ lib.optionals enableCuda [ cudaPackages.cuda_nvcc ];
 
   buildInputs = [
     fftw
@@ -40,13 +54,17 @@ in stdenv.mkDerivation rec {
     blas
     lapack
   ] ++ lib.optional enableMpi mpi
-    ++ lib.optional enableCuda cudatoolkit
-  ;
+  ++ lib.optionals enableCuda [
+    cudaPackages.cuda_cudart
+    cudaPackages.libcufft
+    cudaPackages.cuda_profiler_api
+  ];
 
   propagatedBuildInputs = lib.optional enableMpi mpi;
   propagatedUserEnvPkgs = lib.optional enableMpi mpi;
 
   cmakeFlags = [
+    (lib.cmakeBool "GMX_HWLOC" true)
     "-DGMX_SIMD:STRING=${SIMD cpuAcceleration}"
     "-DGMX_OPENMP:BOOL=TRUE"
     "-DBUILD_SHARED_LIBS=ON"
@@ -66,7 +84,13 @@ in stdenv.mkDerivation rec {
      else [
        "-DGMX_MPI:BOOL=FALSE"
      ]
-  ) ++ lib.optional enableCuda "-DGMX_GPU=CUDA";
+  ) ++ lib.optionals enableCuda [
+    "-DGMX_GPU=CUDA"
+    (lib.cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (builtins.concatStringsSep ";" (map dropDot cudaCapabilities)))
+
+    # Gromacs seems to ignore and override the normal variables, so we add this ad hoc:
+    (lib.cmakeFeature "GMX_CUDA_TARGET_COMPUTE" (builtins.concatStringsSep ";" (map dropDot cudaCapabilities)))
+  ];
 
   postInstall = ''
     moveToOutput share/cmake $dev
diff --git a/pkgs/development/libraries/hwloc/default.nix b/pkgs/development/libraries/hwloc/default.nix
index 67048167d6bfa..626d0b7cca949 100644
--- a/pkgs/development/libraries/hwloc/default.nix
+++ b/pkgs/development/libraries/hwloc/default.nix
@@ -22,12 +22,13 @@ stdenv.mkDerivation rec {
   ];
 
   # XXX: libX11 is not directly needed, but needed as a propagated dep of Cairo.
-  nativeBuildInputs = [ pkg-config ];
+  nativeBuildInputs = [ pkg-config ]
+  ++ lib.optionals enableCuda [ cudaPackages.cuda_nvcc ];
 
   buildInputs = [ expat ncurses ]
     ++ lib.optionals x11Support [ cairo libX11 ]
     ++ lib.optionals stdenv.isLinux [ numactl ]
-    ++ lib.optional enableCuda cudaPackages.cudatoolkit;
+    ++ lib.optionals enableCuda [ cudaPackages.cuda_cudart ];
 
   # Since `libpci' appears in `hwloc.pc', it must be propagated.
   propagatedBuildInputs = lib.optional stdenv.isLinux pciutils;
diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix
index cd0a3eb17de16..3236dd164d357 100644
--- a/pkgs/top-level/all-packages.nix
+++ b/pkgs/top-level/all-packages.nix
@@ -39486,7 +39486,6 @@ with pkgs;
     singlePrec = true;
     enableMpi = true;
     enableCuda = true;
-    cudatoolkit = cudatoolkit_11;
     fftw = fftwSinglePrec;
   });
 

From f89fab85e9ce60071c2468ca6ec242dd1d3c1201 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Tue, 28 Nov 2023 09:46:47 +0000
Subject: [PATCH 14/38] openvino: opencvConfig.cmake attempts to
 find_package(CUDA)

(cherry picked from commit 0c4b1fcfba531c8f33b416583053df2582a6843e)
---
 pkgs/development/libraries/opencv/4.x.nix       | 2 ++
 pkgs/development/libraries/openvino/default.nix | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/pkgs/development/libraries/opencv/4.x.nix b/pkgs/development/libraries/opencv/4.x.nix
index 06360449c1ba9..4c1b13d1309e0 100644
--- a/pkgs/development/libraries/opencv/4.x.nix
+++ b/pkgs/development/libraries/opencv/4.x.nix
@@ -476,6 +476,8 @@ effectiveStdenv.mkDerivation {
   '';
 
   passthru = {
+    cudaSupport = enableCuda;
+
     tests = {
       inherit (gst_all_1) gst-plugins-bad;
     }
diff --git a/pkgs/development/libraries/openvino/default.nix b/pkgs/development/libraries/openvino/default.nix
index b3809f0953641..5761f9e7bb645 100644
--- a/pkgs/development/libraries/openvino/default.nix
+++ b/pkgs/development/libraries/openvino/default.nix
@@ -3,6 +3,7 @@
 , fetchFromGitHub
 , fetchurl
 , substituteAll
+, cudaSupport ? opencv.cudaSupport or false
 
 # build
 , addOpenGLRunpath
@@ -21,6 +22,7 @@
 , protobuf
 , pugixml
 , tbb
+, cudaPackages
 }:
 
 let
@@ -68,6 +70,8 @@ stdenv.mkDerivation rec {
       setuptools
     ]))
     shellcheck
+  ] ++ lib.optionals cudaSupport [
+    cudaPackages.cuda_nvcc
   ];
 
   patches = [
@@ -133,6 +137,8 @@ stdenv.mkDerivation rec {
     protobuf
     pugixml
     tbb
+  ] ++ lib.optionals cudaSupport [
+    cudaPackages.cuda_cudart
   ];
 
   enableParallelBuilding = true;

From 8e6c3caa12f01296ce676df00a95cf0d85969c65 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Tue, 28 Nov 2023 10:32:22 +0000
Subject: [PATCH 15/38] nvidia-thrust: rm as deprecated

The GitHub repo has been archived, the new thing is
[cccl](https://github.com/nvidia/cccl)

(cherry picked from commit 9cc210a7839f4e2aeb13bcadc96981daf8bc6501)
---
 .../libraries/nvidia-thrust/default.nix       | 102 ------------------
 .../libraries/science/math/faiss/default.nix  |   7 --
 pkgs/top-level/aliases.nix                    |   1 +
 pkgs/top-level/all-packages.nix               |  11 --
 4 files changed, 1 insertion(+), 120 deletions(-)
 delete mode 100644 pkgs/development/libraries/nvidia-thrust/default.nix

diff --git a/pkgs/development/libraries/nvidia-thrust/default.nix b/pkgs/development/libraries/nvidia-thrust/default.nix
deleted file mode 100644
index f68b57f193b79..0000000000000
--- a/pkgs/development/libraries/nvidia-thrust/default.nix
+++ /dev/null
@@ -1,102 +0,0 @@
-{ lib
-, config
-, fetchFromGitHub
-, stdenv
-, cmake
-, pkg-config
-, cudaPackages ? { }
-, symlinkJoin
-, tbb
-, hostSystem ? "CPP"
-, deviceSystem ? if config.cudaSupport then "CUDA" else "OMP"
-}:
-
-# Policy for device_vector<T>
-assert builtins.elem deviceSystem [
-  "CPP" # Serial on CPU
-  "OMP" # Parallel with OpenMP
-  "TBB" # Parallel with Intel TBB
-  "CUDA" # Parallel on GPU
-];
-
-# Policy for host_vector<T>
-# Always lives on CPU, but execution can be made parallel
-assert builtins.elem hostSystem [ "CPP" "OMP" "TBB" ];
-
-let
-  pname = "nvidia-thrust";
-  version = "1.16.0";
-
-  inherit (cudaPackages) backendStdenv cudaFlags;
-  cudaCapabilities = map cudaFlags.dropDot cudaFlags.cudaCapabilities;
-
-  tbbSupport = builtins.elem "TBB" [ deviceSystem hostSystem ];
-  cudaSupport = deviceSystem == "CUDA";
-
-  # TODO: Would like to use this:
-  cudaJoined = symlinkJoin {
-    name = "cuda-packages-unsplit";
-    paths = with cudaPackages; [
-      cuda_nvcc
-      cuda_nvrtc # symbols: cudaLaunchDevice, &c; notice postBuild
-      cuda_cudart # cuda_runtime.h
-      libcublas
-    ];
-    postBuild = ''
-      ln -s $out/lib $out/lib64
-    '';
-  };
-in
-stdenv.mkDerivation {
-  inherit pname version;
-
-  src = fetchFromGitHub {
-    owner = "NVIDIA";
-    repo = "thrust";
-    rev = version;
-    fetchSubmodules = true;
-    hash = "sha256-/EyznxWKuHuvHNjq+SQg27IaRbtkjXR2zlo2YgCWmUQ=";
-  };
-
-  # NVIDIA's "compiler hacks" seem like work-arounds for legacy toolchains and
-  # cause us errors such as:
-  # > Thrust's test harness uses CMAKE_CXX_COMPILER for the CUDA host compiler.
-  # > Refusing to overwrite specified CMAKE_CUDA_HOST_COMPILER
-  # So we un-fix cmake after them:
-  postPatch = ''
-    echo > cmake/ThrustCompilerHacks.cmake
-  '';
-
-  buildInputs = lib.optionals tbbSupport [ tbb ];
-
-  nativeBuildInputs = [
-    cmake
-    pkg-config
-  ] ++ lib.optionals cudaSupport [
-    # Goes in native build inputs because thrust looks for headers
-    # in a path relative to nvcc...
-    cudaJoined
-  ];
-
-  cmakeFlags = [
-    "-DTHRUST_INCLUDE_CUB_CMAKE=${if cudaSupport then "ON" else "OFF"}"
-    "-DTHRUST_DEVICE_SYSTEM=${deviceSystem}"
-    "-DTHRUST_HOST_SYSTEM=${hostSystem}"
-    "-DTHRUST_AUTO_DETECT_COMPUTE_ARCHS=OFF"
-    "-DTHRUST_DISABLE_ARCH_BY_DEFAULT=ON"
-  ] ++ lib.optionals cudaFlags.enableForwardCompat [
-    "-DTHRUST_ENABLE_COMPUTE_FUTURE=ON"
-  ] ++ map (sm: "THRUST_ENABLE_COMPUTE_${sm}") cudaCapabilities;
-
-  passthru = {
-    inherit cudaSupport cudaPackages cudaJoined;
-  };
-
-  meta = with lib; {
-    description = "A high-level C++ parallel algorithms library that builds on top of CUDA, TBB, OpenMP, etc";
-    homepage = "https://github.com/NVIDIA/thrust";
-    license = licenses.asl20;
-    platforms = platforms.unix;
-    maintainers = with maintainers; [ SomeoneSerge ];
-  };
-}
diff --git a/pkgs/development/libraries/science/math/faiss/default.nix b/pkgs/development/libraries/science/math/faiss/default.nix
index 21e6cbf858cd5..25ac539e05f28 100644
--- a/pkgs/development/libraries/science/math/faiss/default.nix
+++ b/pkgs/development/libraries/science/math/faiss/default.nix
@@ -6,8 +6,6 @@
 , cmake
 , cudaPackages ? { }
 , cudaSupport ? config.cudaSupport
-, nvidia-thrust
-, useThrustSourceBuild ? true
 , pythonSupport ? true
 , pythonPackages
 , llvmPackages
@@ -27,8 +25,6 @@
 , runCommand
 }@inputs:
 
-assert cudaSupport -> nvidia-thrust.cudaSupport;
-
 let
   pname = "faiss";
   version = "1.7.4";
@@ -44,9 +40,6 @@ let
       cuda_cudart # cuda_runtime.h
       libcublas
       libcurand
-    ] ++ lib.optionals useThrustSourceBuild [
-      nvidia-thrust
-    ] ++ lib.optionals (!useThrustSourceBuild) [
       cuda_cccl
     ] ++ lib.optionals (cudaPackages ? cuda_profiler_api) [
       cuda_profiler_api # cuda_profiler_api.h
diff --git a/pkgs/top-level/aliases.nix b/pkgs/top-level/aliases.nix
index 5d1ae513ce4ca..340850200686b 100644
--- a/pkgs/top-level/aliases.nix
+++ b/pkgs/top-level/aliases.nix
@@ -644,6 +644,7 @@ mapAliases ({
   noto-fonts-cjk = noto-fonts-cjk-sans; # Added 2021-12-16
   noto-fonts-emoji = noto-fonts-color-emoji; # Added 2023-09-09
   noto-fonts-extra = noto-fonts; # Added 2023-04-08
+  nvidia-thrust = throw "nvidia-thrust has been removed because the project was deprecated; use cudaPackages.cuda_cccl";
 
   ### O ###
 
diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix
index 3236dd164d357..aff9967296248 100644
--- a/pkgs/top-level/all-packages.nix
+++ b/pkgs/top-level/all-packages.nix
@@ -11282,16 +11282,6 @@ with pkgs;
 
   nvfetcher = haskell.lib.compose.justStaticExecutables haskellPackages.nvfetcher;
 
-  nvidia-thrust = callPackage ../development/libraries/nvidia-thrust { };
-
-  nvidia-thrust-intel = callPackage ../development/libraries/nvidia-thrust {
-    hostSystem = "TBB";
-    deviceSystem = if config.cudaSupport then "CUDA" else "TBB";
-  };
-
-  nvidia-thrust-cuda = callPackage ../development/libraries/nvidia-thrust {
-    deviceSystem = "CUDA";
-  };
 
   miller = callPackage ../tools/text/miller { };
 
@@ -40025,7 +40015,6 @@ with pkgs;
 
   faissWithCuda = faiss.override {
     cudaSupport = true;
-    nvidia-thrust = nvidia-thrust-cuda;
   };
 
   fityk = callPackage ../applications/science/misc/fityk { };

From e1ec7475d451e05315d6d4bf6564d58fd22781ee Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Sun, 3 Dec 2023 21:03:57 +0000
Subject: [PATCH 16/38] ucc: drop the cudatoolkit runfile

(cherry picked from commit 3e37f3c9836da7cf6dd2c3d66a442cd8c74113b3)
---
 pkgs/development/libraries/ucc/default.nix | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/pkgs/development/libraries/ucc/default.nix b/pkgs/development/libraries/ucc/default.nix
index a92c6bea37d70..a6e7e7710a97b 100644
--- a/pkgs/development/libraries/ucc/default.nix
+++ b/pkgs/development/libraries/ucc/default.nix
@@ -1,7 +1,7 @@
 { stdenv, lib, fetchFromGitHub, libtool, automake, autoconf, ucx
 , config
 , enableCuda ? config.cudaSupport
-, cudatoolkit
+, cudaPackages
 , enableAvx ? stdenv.hostPlatform.avxSupport
 , enableSse41 ? stdenv.hostPlatform.sse4_1Support
 , enableSse42 ? stdenv.hostPlatform.sse4_2Support
@@ -30,19 +30,23 @@ stdenv.mkDerivation rec {
     done
   '';
 
+  nativeBuildInputs = [ libtool automake autoconf ]
+    ++ lib.optionals enableCuda [ cudaPackages.cuda_nvcc ];
+  buildInputs = [ ucx ]
+    ++ lib.optionals enableCuda [
+      cudaPackages.cuda_cccl
+      cudaPackages.cuda_cudart
+    ];
+
+
   preConfigure = ''
     ./autogen.sh
   '';
-
-  nativeBuildInputs = [ libtool automake autoconf ];
-  buildInputs = [ ucx ]
-    ++ lib.optional enableCuda cudatoolkit;
-
   configureFlags = [ ]
    ++ lib.optional enableSse41 "--with-sse41"
    ++ lib.optional enableSse42 "--with-sse42"
    ++ lib.optional enableAvx "--with-avx"
-   ++ lib.optional enableCuda "--with-cuda=${cudatoolkit}";
+   ++ lib.optional enableCuda "--with-cuda=${cudaPackages.cuda_cudart}";
 
   postInstall = ''
     find $out/lib/ -name "*.la" -exec rm -f \{} \;

From 6cc7ef30c376164e1a8425d148917bbcd89c30a3 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Sun, 3 Dec 2023 21:04:11 +0000
Subject: [PATCH 17/38] ucc: respect cudaFlags

(cherry picked from commit 0f047c2372f2ebfaf67fad4aaacc131f5d879583)
---
 pkgs/development/libraries/ucc/default.nix | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pkgs/development/libraries/ucc/default.nix b/pkgs/development/libraries/ucc/default.nix
index a6e7e7710a97b..68f358b3d3deb 100644
--- a/pkgs/development/libraries/ucc/default.nix
+++ b/pkgs/development/libraries/ucc/default.nix
@@ -41,6 +41,8 @@ stdenv.mkDerivation rec {
 
   preConfigure = ''
     ./autogen.sh
+  '' + lib.optionalString enableCuda ''
+    configureFlagsArray+=( "--with-nvcc-gencode=${builtins.concatStringsSep " " cudaPackages.cudaFlags.gencode}" )
   '';
   configureFlags = [ ]
    ++ lib.optional enableSse41 "--with-sse41"

From f49ae138ef8eb3b91ed0c11da9e0bad4f468fe22 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Sun, 3 Dec 2023 21:06:03 +0000
Subject: [PATCH 18/38] openmpi: drop the cudatoolkit runfile

(cherry picked from commit 4c6d2b81cf4dd94398a6fc5b3727c685b439f4b3)
---
 pkgs/development/libraries/openmpi/default.nix | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/pkgs/development/libraries/openmpi/default.nix b/pkgs/development/libraries/openmpi/default.nix
index 1c4955e2c51a4..a8bd8acacd595 100644
--- a/pkgs/development/libraries/openmpi/default.nix
+++ b/pkgs/development/libraries/openmpi/default.nix
@@ -3,7 +3,7 @@
 , libpsm2, libfabric, pmix, ucx, ucc
 , config
 # Enable CUDA support
-, cudaSupport ? config.cudaSupport, cudatoolkit
+, cudaSupport ? config.cudaSupport, cudaPackages
 
 # Enable the Sun Grid Engine bindings
 , enableSGE ? false
@@ -18,12 +18,7 @@
 , fortranSupport ? true
 }:
 
-let
-  cudatoolkit_joined = symlinkJoin {
-    name = "${cudatoolkit.name}-unsplit";
-    paths = [ cudatoolkit.out cudatoolkit.lib ];
-  };
-in stdenv.mkDerivation rec {
+stdenv.mkDerivation rec {
   pname = "openmpi";
   version = "4.1.6";
 
@@ -47,12 +42,13 @@ in stdenv.mkDerivation rec {
 
   buildInputs = [ zlib ]
     ++ lib.optionals stdenv.isLinux [ libnl numactl pmix ucx ucc ]
-    ++ lib.optionals cudaSupport [ cudatoolkit ]
+    ++ lib.optionals cudaSupport [ cudaPackages.cuda_cudart ]
     ++ [ libevent hwloc ]
     ++ lib.optional (stdenv.isLinux || stdenv.isFreeBSD) rdma-core
     ++ lib.optionals fabricSupport [ libpsm2 libfabric ];
 
   nativeBuildInputs = [ perl ]
+    ++ lib.optionals cudaSupport [ cudaPackages.cuda_nvcc ]
     ++ lib.optionals fortranSupport [ gfortran ];
 
   configureFlags = lib.optional (!cudaSupport) "--disable-mca-dso"
@@ -67,7 +63,7 @@ in stdenv.mkDerivation rec {
     # TODO: add UCX support, which is recommended to use with cuda for the most robust OpenMPI build
     # https://github.com/openucx/ucx
     # https://www.open-mpi.org/faq/?category=buildcuda
-    ++ lib.optionals cudaSupport [ "--with-cuda=${cudatoolkit_joined}" "--enable-dlopen" ]
+    ++ lib.optionals cudaSupport [ "--with-cuda=${cudaPackages.cuda_cudart}" "--enable-dlopen" ]
     ++ lib.optionals fabricSupport [ "--with-psm2=${lib.getDev libpsm2}" "--with-libfabric=${lib.getDev libfabric}" ]
     ;
 
@@ -98,7 +94,8 @@ in stdenv.mkDerivation rec {
   doCheck = true;
 
   passthru = {
-    inherit cudaSupport cudatoolkit;
+    inherit cudaSupport;
+    cudatoolkit = cudaPackages.cudatoolkit; # For backward compatibility only
   };
 
   meta = with lib; {

From 923ead9f7543985449ae7ee1354bc54a58498dff Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Mon, 4 Dec 2023 11:54:16 +0000
Subject: [PATCH 19/38] python311Packages.torchWithCuda: drop
 cuda_cudart.static at runtime

(cherry picked from commit 9d729f260d023569a20c4339776e6b442a9f588c)
---
 pkgs/development/python-modules/torch/default.nix | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix
index 5523a87b6b5e8..a00e15f0518b8 100644
--- a/pkgs/development/python-modules/torch/default.nix
+++ b/pkgs/development/python-modules/torch/default.nix
@@ -334,7 +334,8 @@ in buildPythonPackage rec {
   buildInputs = [ blas blas.provider ]
     ++ lib.optionals cudaSupport (with cudaPackages; [
       cuda_cccl.dev # <thrust/*>
-      cuda_cudart # cuda_runtime.h and libraries
+      cuda_cudart.dev # cuda_runtime.h and libraries
+      cuda_cudart.lib
       cuda_cupti.dev # For kineto
       cuda_cupti.lib # For kineto
       cuda_nvcc.dev # crt/host_config.h; even though we include this in nativeBuildinputs, it's needed here too

From b93a4e259b5c1e5330e1403dda647330f27d7692 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Mon, 4 Dec 2023 16:29:18 +0000
Subject: [PATCH 20/38] cudaPackages.setupCudaHook: fix cudart flags

(cherry picked from commit a7891f2adaf409a95b2c0d8e373270993951e69d)
---
 .../compilers/cudatoolkit/extension.nix          | 13 +++++++++++++
 .../compilers/cudatoolkit/redist/overrides.nix   | 16 ----------------
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/pkgs/development/compilers/cudatoolkit/extension.nix b/pkgs/development/compilers/cudatoolkit/extension.nix
index 93800a0dbc6b1..be482e2cc7992 100644
--- a/pkgs/development/compilers/cudatoolkit/extension.nix
+++ b/pkgs/development/compilers/cudatoolkit/extension.nix
@@ -54,11 +54,24 @@ final: prev: let
         {
           name = "setup-cuda-hook";
 
+          # Point NVCC at a compatible compiler
           substitutions.ccRoot = "${backendStdenv.cc}";
 
           # Required in addition to ccRoot as otherwise bin/gcc is looked up
           # when building CMakeCUDACompilerId.cu
           substitutions.ccFullPath = "${backendStdenv.cc}/bin/${backendStdenv.cc.targetPrefix}c++";
+
+          # Required by cmake's enable_language(CUDA) to build a test program
+          # When implementing cross-compilation support: this is
+          # final.pkgs.targetPackages.cudaPackages.cuda_cudart
+          # Given the multiple-outputs each CUDA redist has, we can specify the exact components we
+          # need from the package. CMake requires:
+          # - the cuda_runtime.h header, which is in the dev output
+          # - the dynamic library, which is in the lib output
+          # - the static library, which is in the static output
+          substitutions.cudartInclude = "${final.cuda_cudart.dev}";
+          substitutions.cudartLib = "${final.cuda_cudart.lib}";
+          substitutions.cudartStatic = "${final.cuda_cudart.static}";
         }
         ./hooks/setup-cuda-hook.sh)
     { });
diff --git a/pkgs/development/compilers/cudatoolkit/redist/overrides.nix b/pkgs/development/compilers/cudatoolkit/redist/overrides.nix
index 5cc560fa0fd22..16b03b93b1db6 100644
--- a/pkgs/development/compilers/cudatoolkit/redist/overrides.nix
+++ b/pkgs/development/compilers/cudatoolkit/redist/overrides.nix
@@ -52,22 +52,6 @@ in
   );
 
   cuda_nvcc = prev.cuda_nvcc.overrideAttrs (oldAttrs: {
-    # Required by cmake's enable_language(CUDA) to build a test program
-    # When implementing cross-compilation support: this is
-    # final.pkgs.targetPackages.cudaPackages.cuda_cudart
-    env = {
-      # Given the multiple-outputs each CUDA redist has, we can specify the exact components we
-      # need from the package. CMake requires:
-      # - the cuda_runtime.h header, which is in the dev output
-      # - the dynamic library, which is in the lib output
-      # - the static library, which is in the static output
-      cudartInclude = "${final.cuda_cudart.dev}";
-      cudartLib = "${final.cuda_cudart.lib}";
-      cudartStatic = "${final.cuda_cudart.static}";
-    };
-
-    # Point NVCC at a compatible compiler
-
     # Desiredata: whenever a package (e.g. magma) adds cuda_nvcc to
     # nativeBuildInputs (offsets `(-1, 0)`), magma should also source the
     # setupCudaHook, i.e. we want it the hook to be propagated into the

From c5e5690513f4740adc520300e102856eb6e3c3a7 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Mon, 4 Dec 2023 18:56:55 +0000
Subject: [PATCH 21/38] cudaPackages.setupCudaHook: rewrite cudartFlags, remove
 infinite recursion in cudatoolkit

We don't need to add the extra nvcc flags to locate cudart when using
cudatoolkit because it comes in the merged layout and nvcc doesn't have
any trouble locating dependencies in the same prefix

(cherry picked from commit 182e6b41d08d37c8eb817212b88b37b671abfb22)
---
 pkgs/development/compilers/cudatoolkit/extension.nix   | 10 +++++++---
 .../compilers/cudatoolkit/hooks/setup-cuda-hook.sh     |  5 +++--
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/pkgs/development/compilers/cudatoolkit/extension.nix b/pkgs/development/compilers/cudatoolkit/extension.nix
index be482e2cc7992..d75d288f5577e 100644
--- a/pkgs/development/compilers/cudatoolkit/extension.nix
+++ b/pkgs/development/compilers/cudatoolkit/extension.nix
@@ -69,9 +69,13 @@ final: prev: let
           # - the cuda_runtime.h header, which is in the dev output
           # - the dynamic library, which is in the lib output
           # - the static library, which is in the static output
-          substitutions.cudartInclude = "${final.cuda_cudart.dev}";
-          substitutions.cudartLib = "${final.cuda_cudart.lib}";
-          substitutions.cudartStatic = "${final.cuda_cudart.static}";
+          substitutions.cudartFlags = let cudart = final.cuda_cudart; in
+            builtins.concatStringsSep " " (final.lib.optionals (final ? cuda_cudart) ([
+              "-I${final.lib.getDev cudart}/include"
+              "-L${final.lib.getLib cudart}/lib"
+            ] ++ final.lib.optionals (builtins.elem "static" cudart.outputs) [
+              "-L${cudart.static}/lib"
+            ]));
         }
         ./hooks/setup-cuda-hook.sh)
     { });
diff --git a/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh b/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh
index 5ea57594211c4..0272e7938b9aa 100644
--- a/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh
+++ b/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh
@@ -55,8 +55,9 @@ setupCUDAToolkitCompilers() {
 
     # CMake's enable_language(CUDA) runs a compiler test and it doesn't account for
     # CUDAToolkit_ROOT. We have to help it locate libcudart
-    if [[ -z "${nvccDontPrependCudartFlags-}" ]] ; then
-        export NVCC_APPEND_FLAGS+=" -L@cudartLib@/lib -L@cudartStatic@/lib -I@cudartInclude@/include"
+    local cudartFlags="@cudartFlags@"
+    if [[ -z "${nvccDontPrependCudartFlags-}" ]] && [[ -n "${cudartFlags:-}" ]] ; then
+        export NVCC_APPEND_FLAGS+=" $cudartFlags"
     fi
 }
 

From 94623d5a96bc9b02e776cfa66dfb92f4e806d1f4 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Mon, 4 Dec 2023 20:36:52 +0000
Subject: [PATCH 22/38] cudaPackages_11_3.saxpy: fallback to the cudatoolkit
 runfile

(cherry picked from commit e084a6c648fecd473ff6190a5acac3369b292530)
---
 .../compilers/cudatoolkit/saxpy/default.nix   | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/pkgs/development/compilers/cudatoolkit/saxpy/default.nix b/pkgs/development/compilers/cudatoolkit/saxpy/default.nix
index f347b43d1d11c..2da6da29004dc 100644
--- a/pkgs/development/compilers/cudatoolkit/saxpy/default.nix
+++ b/pkgs/development/compilers/cudatoolkit/saxpy/default.nix
@@ -1,12 +1,13 @@
 { autoAddOpenGLRunpathHook
 , backendStdenv
 , cmake
-, cuda_cccl
-, cuda_cudart
+, cuda_cccl ? null
+, cuda_cudart ? null
 , cudaFlags
-, cuda_nvcc
+, cuda_nvcc ? null
+, cudatoolkit ? null
 , lib
-, libcublas
+, libcublas ? null
 , setupCudaHook
 , stdenv
 }:
@@ -17,23 +18,24 @@ backendStdenv.mkDerivation {
 
   src = ./.;
 
-  buildInputs = [
+  buildInputs = lib.optionals (cuda_cudart != null) [
     libcublas
     cuda_cudart
     cuda_cccl
+  ] ++ lib.optionals (cuda_cudart == null) [
+    cudatoolkit
   ];
   nativeBuildInputs = [
     cmake
 
-    # NOTE: this needs to be pkgs.buildPackages.cudaPackages_XX_Y.cuda_nvcc for
-    # cross-compilation to work. This should work automatically once we move to
-    # spliced scopes. Delete this comment once that happens
-    cuda_nvcc
-
     # Alternatively, we could remove the propagated hook from cuda_nvcc and add
     # directly:
     # setupCudaHook
     autoAddOpenGLRunpathHook
+  ] ++ lib.optionals (cuda_nvcc != null) [
+    cuda_nvcc
+  ] ++ lib.optionals (cuda_nvcc == null) [
+    cudatoolkit
   ];
 
   cmakeFlags = [

From daafff2e5bfd927f183623fc918358bc13b0abd4 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Mon, 4 Dec 2023 20:40:34 +0000
Subject: [PATCH 23/38] cudaPackages.cuda_nvcc: fix hook's offsets (-1, -1) ->
 (-1, 0)

Cf. explanations in https://github.com/NixOS/nixpkgs/pull/271078

(cherry picked from commit d031523a012688079e3bef68abaab2f8c5d099af)
---
 .../compilers/cudatoolkit/redist/overrides.nix      | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/pkgs/development/compilers/cudatoolkit/redist/overrides.nix b/pkgs/development/compilers/cudatoolkit/redist/overrides.nix
index 16b03b93b1db6..71e70e8d7b704 100644
--- a/pkgs/development/compilers/cudatoolkit/redist/overrides.nix
+++ b/pkgs/development/compilers/cudatoolkit/redist/overrides.nix
@@ -52,18 +52,7 @@ in
   );
 
   cuda_nvcc = prev.cuda_nvcc.overrideAttrs (oldAttrs: {
-    # Desiredata: whenever a package (e.g. magma) adds cuda_nvcc to
-    # nativeBuildInputs (offsets `(-1, 0)`), magma should also source the
-    # setupCudaHook, i.e. we want it the hook to be propagated into the
-    # same nativeBuildInputs.
-    #
-    # Logically, cuda_nvcc should include the hook in depsHostHostPropagated,
-    # so that the final offsets for the propagated hook would be `(-1, 0) +
-    # (0, 0) = (-1, 0)`.
-    #
-    # In practice, TargetTarget appears to work:
-    # https://gist.github.com/fd80ff142cd25e64603618a3700e7f82
-    depsTargetTargetPropagated = [
+    propagatedBuildInputs = [
       final.setupCudaHook
     ];
 

From a557b5004aa3360807ef2e5fd1ae3854a10ca8be Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Mon, 4 Dec 2023 20:47:11 +0000
Subject: [PATCH 24/38] cudaPackages.setupCudaHook: source only from
 nativeBuildInputs

(cherry picked from commit 37ec2cb6b173c7d5b78ec704db533454f9bc84ba)
---
 .../development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh b/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh
index 0272e7938b9aa..0fa8883081c50 100644
--- a/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh
+++ b/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh
@@ -1,5 +1,8 @@
 # shellcheck shell=bash
 
+# Only run the hook from nativeBuildInputs
+(( "$hostOffset" == -1 && "$targetOffset" == 0)) || return 0
+
 echo Sourcing setup-cuda-hook >&2
 
 extendCUDAToolkit_ROOT() {

From a2bd1af51781e994605915d66f6be55cf1e0d190 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Tue, 5 Dec 2023 19:14:05 +0000
Subject: [PATCH 25/38] tiny-cuda-nn: cuda_cccl required with the newer cuda

(cherry picked from commit b9635cfa4d659ca652ab4c400988d6c8b1e453c0)
---
 pkgs/development/libraries/science/math/tiny-cuda-nn/default.nix | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pkgs/development/libraries/science/math/tiny-cuda-nn/default.nix b/pkgs/development/libraries/science/math/tiny-cuda-nn/default.nix
index d046c6864539d..b613b112b2a85 100644
--- a/pkgs/development/libraries/science/math/tiny-cuda-nn/default.nix
+++ b/pkgs/development/libraries/science/math/tiny-cuda-nn/default.nix
@@ -15,6 +15,7 @@
 
   cuda-common-redist = with cudaPackages; [
     cuda_cudart # cuda_runtime.h
+    cuda_cccl.dev # <nv/target>
     libcublas # cublas_v2.h
     libcusolver # cusolverDn.h
     libcusparse # cusparse.h

From ee3152687e30e227fe0d33e4ce3f45f8a88be309 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Tue, 5 Dec 2023 19:14:42 +0000
Subject: [PATCH 26/38] tiny-cuda-nn: prune runtime closure

(cherry picked from commit 18a2e518cdcef0764cfd7a96c39fdc0e7874e9fd)
---
 .../libraries/science/math/tiny-cuda-nn/default.nix  | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/pkgs/development/libraries/science/math/tiny-cuda-nn/default.nix b/pkgs/development/libraries/science/math/tiny-cuda-nn/default.nix
index b613b112b2a85..2036c4c86253b 100644
--- a/pkgs/development/libraries/science/math/tiny-cuda-nn/default.nix
+++ b/pkgs/development/libraries/science/math/tiny-cuda-nn/default.nix
@@ -14,11 +14,15 @@
   inherit (cudaPackages) backendStdenv cudaFlags;
 
   cuda-common-redist = with cudaPackages; [
-    cuda_cudart # cuda_runtime.h
+    cuda_cudart.dev # cuda_runtime.h
+    cuda_cudart.lib
     cuda_cccl.dev # <nv/target>
-    libcublas # cublas_v2.h
-    libcusolver # cusolverDn.h
-    libcusparse # cusparse.h
+    libcublas.dev # cublas_v2.h
+    libcublas.lib
+    libcusolver.dev # cusolverDn.h
+    libcusolver.lib
+    libcusparse.dev # cusparse.h
+    libcusparse.lib
   ];
 
   cuda-native-redist = symlinkJoin {

From 545055a600dc7537568e31d9668bc4e0faef06e2 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Thu, 30 Nov 2023 00:33:01 +0000
Subject: [PATCH 27/38] cudaPackages.setupCudaHook: propagate buildInputs and
 self

This is useful for the cuda variants of packages like opencv and pytorch,
whose xxxxConfig.cmake files do find_package(CUDAToolkit REQUIRED)
regardless of whether they actually use it. With the propagated hook,
we no longer have to manually add cuda dependencies into torch/opencvs
reverse dependencies

cudaPackages.cuda_nvcc: fix setupCudaHook propagation

(cherry picked from commit be9c779deba0e898802dd341a1ba9c04c4e9abe8)
---
 .../compilers/cudatoolkit/extension.nix       |  20 +---
 .../hooks/mark-for-cudatoolkit-root-hook.sh   |   8 +-
 .../cudatoolkit/hooks/nvcc-setup-hook.sh      |   5 -
 .../cudatoolkit/hooks/setup-cuda-hook.sh      | 101 +++++++++++++++---
 4 files changed, 94 insertions(+), 40 deletions(-)
 delete mode 100644 pkgs/development/compilers/cudatoolkit/hooks/nvcc-setup-hook.sh

diff --git a/pkgs/development/compilers/cudatoolkit/extension.nix b/pkgs/development/compilers/cudatoolkit/extension.nix
index d75d288f5577e..016675fa07015 100644
--- a/pkgs/development/compilers/cudatoolkit/extension.nix
+++ b/pkgs/development/compilers/cudatoolkit/extension.nix
@@ -47,35 +47,21 @@ final: prev: let
         ./hooks/mark-for-cudatoolkit-root-hook.sh)
     { });
 
-  # Normally propagated by cuda_nvcc or cudatoolkit through their depsHostHostPropagated
+  # Currently propagated by cuda_nvcc or cudatoolkit, rather than used directly
   setupCudaHook = (final.callPackage
     ({ makeSetupHook, backendStdenv }:
       makeSetupHook
         {
           name = "setup-cuda-hook";
 
+          substitutions.setupCudaHook = placeholder "out";
+
           # Point NVCC at a compatible compiler
           substitutions.ccRoot = "${backendStdenv.cc}";
 
           # Required in addition to ccRoot as otherwise bin/gcc is looked up
           # when building CMakeCUDACompilerId.cu
           substitutions.ccFullPath = "${backendStdenv.cc}/bin/${backendStdenv.cc.targetPrefix}c++";
-
-          # Required by cmake's enable_language(CUDA) to build a test program
-          # When implementing cross-compilation support: this is
-          # final.pkgs.targetPackages.cudaPackages.cuda_cudart
-          # Given the multiple-outputs each CUDA redist has, we can specify the exact components we
-          # need from the package. CMake requires:
-          # - the cuda_runtime.h header, which is in the dev output
-          # - the dynamic library, which is in the lib output
-          # - the static library, which is in the static output
-          substitutions.cudartFlags = let cudart = final.cuda_cudart; in
-            builtins.concatStringsSep " " (final.lib.optionals (final ? cuda_cudart) ([
-              "-I${final.lib.getDev cudart}/include"
-              "-L${final.lib.getLib cudart}/lib"
-            ] ++ final.lib.optionals (builtins.elem "static" cudart.outputs) [
-              "-L${cudart.static}/lib"
-            ]));
         }
         ./hooks/setup-cuda-hook.sh)
     { });
diff --git a/pkgs/development/compilers/cudatoolkit/hooks/mark-for-cudatoolkit-root-hook.sh b/pkgs/development/compilers/cudatoolkit/hooks/mark-for-cudatoolkit-root-hook.sh
index 5c18760a3a2b0..ba04c2e0806af 100644
--- a/pkgs/development/compilers/cudatoolkit/hooks/mark-for-cudatoolkit-root-hook.sh
+++ b/pkgs/development/compilers/cudatoolkit/hooks/mark-for-cudatoolkit-root-hook.sh
@@ -1,8 +1,14 @@
 # shellcheck shell=bash
 
+# Should we mimick cc-wrapper's "hygiene"?
+[[ -z ${strictDeps-} ]] || (( "$hostOffset" < 0 )) || return 0
+
+echo "Sourcing mark-for-cudatoolkit-root-hook" >&2
+
 markForCUDAToolkit_ROOT() {
     mkdir -p "${prefix}/nix-support"
-    touch "${prefix}/nix-support/include-in-cudatoolkit-root"
+    [[ -f "${prefix}/nix-support/include-in-cudatoolkit-root" ]] && return
+    echo "$pname-$output" > "${prefix}/nix-support/include-in-cudatoolkit-root"
 }
 
 fixupOutputHooks+=(markForCUDAToolkit_ROOT)
diff --git a/pkgs/development/compilers/cudatoolkit/hooks/nvcc-setup-hook.sh b/pkgs/development/compilers/cudatoolkit/hooks/nvcc-setup-hook.sh
deleted file mode 100644
index e75a84a9550e7..0000000000000
--- a/pkgs/development/compilers/cudatoolkit/hooks/nvcc-setup-hook.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-# shellcheck shell=bash
-
-# CMake's enable_language(CUDA) runs a compiler test and it doesn't account for
-# CUDAToolkit_ROOT. We have to help it locate libcudart
-export NVCC_APPEND_FLAGS+=" -L@cudartLib@/lib -L@cudartStatic@/lib -I@cudartInclude@/include"
diff --git a/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh b/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh
index 0fa8883081c50..7b7b3bdde80e3 100644
--- a/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh
+++ b/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh
@@ -3,19 +3,57 @@
 # Only run the hook from nativeBuildInputs
 (( "$hostOffset" == -1 && "$targetOffset" == 0)) || return 0
 
-echo Sourcing setup-cuda-hook >&2
+guard=Sourcing
+reason=
 
-extendCUDAToolkit_ROOT() {
-    if [[ -f "$1/nix-support/include-in-cudatoolkit-root" ]] ; then
-        addToSearchPathWithCustomDelimiter ";" CUDAToolkit_ROOT "$1"
+[[ -n ${cudaSetupHookOnce-} ]] && guard=Skipping && reason=" because the hook has been propagated more than once"
 
-        if [[ -d "$1/include" ]] ; then
-            addToSearchPathWithCustomDelimiter ";" CUDAToolkit_INCLUDE_DIR "$1/include"
-        fi
-    fi
+if (( "${NIX_DEBUG:-0}" >= 1 )) ; then
+    echo "$guard hostOffset=$hostOffset targetOffset=$targetOffset setupCudaHook$reason" >&2
+else
+    echo "$guard setup-cuda-hook$reason" >&2
+fi
+
+[[ "$guard" = Sourcing ]] || return 0
+
+declare -g cudaSetupHookOnce=1
+declare -Ag cudaHostPathsSeen=()
+declare -Ag cudaOutputToPath=()
+
+extendcudaHostPathsSeen() {
+    (( "${NIX_DEBUG:-0}" >= 1 )) && echo "extendcudaHostPathsSeen $1" >&2
+
+    local markerPath="$1/nix-support/include-in-cudatoolkit-root"
+    [[ ! -f "${markerPath}" ]] && return
+    [[ -v cudaHostPathsSeen[$1] ]] && return
+
+    cudaHostPathsSeen["$1"]=1
+
+    # E.g. cuda_cudart-lib
+    local cudaOutputName
+    read -r cudaOutputName < "$markerPath"
+
+    [[ -z "$cudaOutputName" ]] && return
+
+    local oldPath="${cudaOutputToPath[$cudaOutputName]-}"
+    [[ -n "$oldPath" ]] && echo "extendcudaHostPathsSeen: warning: overwriting $cudaOutputName from $oldPath to $1" >&2
+    cudaOutputToPath["$cudaOutputName"]="$1"
 }
+addEnvHooks "$targetOffset" extendcudaHostPathsSeen
+
+setupCUDAToolkit_ROOT() {
+    (( "${NIX_DEBUG:-0}" >= 1 )) && echo "setupCUDAToolkit_ROOT: cudaHostPathsSeen=${!cudaHostPathsSeen[*]}" >&2
 
-addEnvHooks "$targetOffset" extendCUDAToolkit_ROOT
+    for path in "${!cudaHostPathsSeen[@]}" ; do
+        addToSearchPathWithCustomDelimiter ";" CUDAToolkit_ROOT "$path"
+        if [[ -d "$path/include" ]] ; then
+            addToSearchPathWithCustomDelimiter ";" CUDAToolkit_INCLUDE_DIR "$path/include"
+        fi
+    done
+
+    export cmakeFlags+=" -DCUDAToolkit_INCLUDE_DIR=$CUDAToolkit_INCLUDE_DIR -DCUDAToolkit_ROOT=$CUDAToolkit_ROOT"
+}
+preConfigureHooks+=(setupCUDAToolkit_ROOT)
 
 setupCUDAToolkitCompilers() {
     echo Executing setupCUDAToolkitCompilers >&2
@@ -58,15 +96,44 @@ setupCUDAToolkitCompilers() {
 
     # CMake's enable_language(CUDA) runs a compiler test and it doesn't account for
     # CUDAToolkit_ROOT. We have to help it locate libcudart
-    local cudartFlags="@cudartFlags@"
-    if [[ -z "${nvccDontPrependCudartFlags-}" ]] && [[ -n "${cudartFlags:-}" ]] ; then
-        export NVCC_APPEND_FLAGS+=" $cudartFlags"
+    if [[ -z "${nvccDontPrependCudartFlags-}" ]] ; then
+        if [[ ! -v cudaOutputToPath["cuda_cudart-out"] ]] ; then
+            echo "setupCUDAToolkitCompilers: missing cudaPackages.cuda_cudart. This may become an an error in the future" >&2
+            # exit 1
+        fi
+        for pkg in "${!cudaOutputToPath[@]}" ; do
+            [[ ! "$pkg" = cuda_cudart* ]] && continue
+
+            local path="${cudaOutputToPath[$pkg]}"
+            if [[ -d "$path/include" ]] ; then
+                export NVCC_PREPEND_FLAGS+=" -I$path/include"
+            fi
+            if [[ -d "$path/lib" ]] ; then
+                export NVCC_PREPEND_FLAGS+=" -L$path/lib"
+            fi
+        done
     fi
 }
+preConfigureHooks+=(setupCUDAToolkitCompilers)
 
-setupCMakeCUDAToolkit_ROOT() {
-    export cmakeFlags+=" -DCUDAToolkit_INCLUDE_DIR=$CUDAToolkit_INCLUDE_DIR -DCUDAToolkit_ROOT=$CUDAToolkit_ROOT"
-}
+propagateCudaLibraries() {
+    (( "${NIX_DEBUG:-0}" >= 1 )) && echo "propagateCudaLibraries: cudaPropagateToOutput=$cudaPropagateToOutput cudaHostPathsSeen=${!cudaHostPathsSeen[*]}" >&2
 
-postHooks+=(setupCUDAToolkitCompilers)
-preConfigureHooks+=(setupCMakeCUDAToolkit_ROOT)
+    [[ -z "${cudaPropagateToOutput-}" ]] && return
+
+    mkdir -p "${!cudaPropagateToOutput}/nix-support"
+    # One'd expect this should be propagated-bulid-build-deps, but that doesn't seem to work
+    echo "@setupCudaHook@" >> "${!cudaPropagateToOutput}/nix-support/propagated-native-build-inputs"
+
+    local propagatedBuildInputs=( "${!cudaHostPathsSeen[@]}" )
+    for output in $(getAllOutputNames) ; do
+        if [[ ! "$output" = "$cudaPropagateToOutput" ]] ; then
+            propagatedBuildInputs+=( "${!output}" )
+        fi
+        break
+    done
+
+    # One'd expect this should be propagated-host-host-deps, but that doesn't seem to work
+    printWords "${propagatedBuildInputs[@]}" >> "${!cudaPropagateToOutput}/nix-support/propagated-build-inputs"
+}
+postFixupHooks+=(propagateCudaLibraries)

From 5a66afcd8effccb43b4d0ae695c1528dcfce7fce Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Thu, 30 Nov 2023 00:44:50 +0000
Subject: [PATCH 28/38] opencv4: expose cxxdev, propagating optional cuda deps

(cherry picked from commit ada3991349beb5880e3994f25c65a0cf68941b83)
---
 pkgs/development/libraries/opencv/4.x.nix | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/pkgs/development/libraries/opencv/4.x.nix b/pkgs/development/libraries/opencv/4.x.nix
index 4c1b13d1309e0..8cfb169a0bace 100644
--- a/pkgs/development/libraries/opencv/4.x.nix
+++ b/pkgs/development/libraries/opencv/4.x.nix
@@ -247,8 +247,10 @@ effectiveStdenv.mkDerivation {
 
   outputs = [
     "out"
+    "cxxdev"
     "package_tests"
   ];
+  cudaPropagateToOutput = "cxxdev";
 
   postUnpack = lib.optionalString buildContrib ''
     cp --no-preserve=mode -r "${contribSrc}/modules" "$NIX_BUILD_TOP/source/opencv_contrib"
@@ -328,7 +330,7 @@ effectiveStdenv.mkDerivation {
       bzip2 AVFoundation Cocoa VideoDecodeAcceleration CoreMedia MediaToolbox Accelerate
     ]
     ++ lib.optionals enableDocs [ doxygen graphviz-nox ]
-    ++ lib.optionals enableCuda  (with cudaPackages; [
+    ++ lib.optionals enableCuda (with cudaPackages; [
       cuda_cudart
       cuda_cccl # <thrust/*>
       libnpp # npp.h
@@ -338,7 +340,7 @@ effectiveStdenv.mkDerivation {
       cudnn # cudnn.h
     ] ++ lib.optionals enableCufft [
       libcufft # cufft.h
-  ]);
+    ]);
 
   propagatedBuildInputs = lib.optional enablePython pythonPackages.numpy
     ++ lib.optionals enableCuda [ nvidia-optical-flow-sdk ];
@@ -458,6 +460,7 @@ effectiveStdenv.mkDerivation {
   postInstall = ''
     sed -i "s|{exec_prefix}/$out|{exec_prefix}|;s|{prefix}/$out|{prefix}|" \
       "$out/lib/pkgconfig/opencv4.pc"
+    mkdir $cxxdev
   ''
   # install python distribution information, so other packages can `import opencv`
   + lib.optionalString enablePython ''

From 7be294a3c7f6a638d17078e316673f45eb8db170 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Thu, 30 Nov 2023 00:59:10 +0000
Subject: [PATCH 29/38] opencv4: propagate optical flow sdk same as cuda

(cherry picked from commit 45698380295187b35f3872542b71efc2223f8201)
---
 .../libraries/nvidia-optical-flow-sdk/default.nix            | 5 +++++
 pkgs/development/libraries/opencv/4.x.nix                    | 4 ++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/pkgs/development/libraries/nvidia-optical-flow-sdk/default.nix b/pkgs/development/libraries/nvidia-optical-flow-sdk/default.nix
index a82fa9068c66c..2914d059cfaff 100644
--- a/pkgs/development/libraries/nvidia-optical-flow-sdk/default.nix
+++ b/pkgs/development/libraries/nvidia-optical-flow-sdk/default.nix
@@ -18,6 +18,11 @@ stdenv.mkDerivation {
     cp -R * $out/include
   '';
 
+  postFixup = ''
+    mkdir -p $out/nix-support
+    echo $pname >> "$out/nix-support/include-in-cudatoolkit-root"
+  '';
+
   meta = with lib; {
     description = "Nvidia optical flow headers for computing the relative motion of pixels between images";
     homepage = "https://developer.nvidia.com/opticalflow-sdk";
diff --git a/pkgs/development/libraries/opencv/4.x.nix b/pkgs/development/libraries/opencv/4.x.nix
index 8cfb169a0bace..d7693a3077a82 100644
--- a/pkgs/development/libraries/opencv/4.x.nix
+++ b/pkgs/development/libraries/opencv/4.x.nix
@@ -334,6 +334,7 @@ effectiveStdenv.mkDerivation {
       cuda_cudart
       cuda_cccl # <thrust/*>
       libnpp # npp.h
+      nvidia-optical-flow-sdk
     ] ++ lib.optionals enableCublas [
       libcublas # cublas_v2.h
     ] ++ lib.optionals enableCudnn [
@@ -342,8 +343,7 @@ effectiveStdenv.mkDerivation {
       libcufft # cufft.h
     ]);
 
-  propagatedBuildInputs = lib.optional enablePython pythonPackages.numpy
-    ++ lib.optionals enableCuda [ nvidia-optical-flow-sdk ];
+  propagatedBuildInputs = lib.optionals enablePython [ pythonPackages.numpy ];
 
   nativeBuildInputs = [ cmake pkg-config unzip ]
   ++ lib.optionals enablePython [

From 1ab4e3a2164bdb9e1289cd9e6b4d7684c195e76d Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Sun, 3 Dec 2023 02:47:44 +0000
Subject: [PATCH 30/38] opencv4: discard build-time cuda deps

(cherry picked from commit 55af9329429a30ce81f7ad01da95406e1d62f785)
---
 pkgs/development/libraries/opencv/4.x.nix | 24 +++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/pkgs/development/libraries/opencv/4.x.nix b/pkgs/development/libraries/opencv/4.x.nix
index d7693a3077a82..023e56940b75c 100644
--- a/pkgs/development/libraries/opencv/4.x.nix
+++ b/pkgs/development/libraries/opencv/4.x.nix
@@ -331,16 +331,28 @@ effectiveStdenv.mkDerivation {
     ]
     ++ lib.optionals enableDocs [ doxygen graphviz-nox ]
     ++ lib.optionals enableCuda (with cudaPackages; [
-      cuda_cudart
-      cuda_cccl # <thrust/*>
-      libnpp # npp.h
+      cuda_cudart.lib
+      cuda_cudart.dev
+      cuda_cccl.dev # <thrust/*>
+      libnpp.dev # npp.h
+      libnpp.lib
+      libnpp.static
       nvidia-optical-flow-sdk
     ] ++ lib.optionals enableCublas [
-      libcublas # cublas_v2.h
+      # May start using the default $out instead once
+      # https://github.com/NixOS/nixpkgs/issues/271792
+      # has been addressed
+      libcublas.static
+      libcublas.lib
+      libcublas.dev # cublas_v2.h
     ] ++ lib.optionals enableCudnn [
-      cudnn # cudnn.h
+      cudnn.dev # cudnn.h
+      cudnn.lib
+      cudnn.static
     ] ++ lib.optionals enableCufft [
-      libcufft # cufft.h
+      libcufft.dev # cufft.h
+      libcufft.lib
+      libcufft.static
     ]);
 
   propagatedBuildInputs = lib.optionals enablePython [ pythonPackages.numpy ];

From ad2fd258c68308239c40fecca0e269f4fac599b0 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Thu, 30 Nov 2023 01:00:05 +0000
Subject: [PATCH 31/38] torch: add the cxxdev output for cmake consumers

(cherry picked from commit 71c248ec1309381136bf74339d453a58b400b2a9)
---
 .../python-modules/torch/default.nix          | 16 ++++++++--
 .../python-modules/torchaudio/default.nix     | 12 +------
 .../python-modules/torchvision/default.nix    | 32 +++++--------------
 3 files changed, 22 insertions(+), 38 deletions(-)

diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix
index a00e15f0518b8..dbfa72b884a9e 100644
--- a/pkgs/development/python-modules/torch/default.nix
+++ b/pkgs/development/python-modules/torch/default.nix
@@ -133,7 +133,9 @@ in buildPythonPackage rec {
     "out" # output standard python package
     "dev" # output libtorch headers
     "lib" # output libtorch libraries
+    "cxxdev" # propagated deps for the cmake consumers of torch
   ];
+  cudaPropagateToOutput = "cxxdev";
 
   src = fetchFromGitHub {
     owner = "pytorch";
@@ -368,7 +370,10 @@ in buildPythonPackage rec {
     ++ lib.optionals rocmSupport [ rocmPackages.llvm.openmp ]
     ++ lib.optionals (cudaSupport || rocmSupport) [ effectiveMagma ]
     ++ lib.optionals stdenv.isLinux [ numactl ]
-    ++ lib.optionals stdenv.isDarwin [ Accelerate CoreServices libobjc ];
+    ++ lib.optionals stdenv.isDarwin [ Accelerate CoreServices libobjc ]
+    ++ lib.optionals tritonSupport [ openai-triton ]
+    ++ lib.optionals MPISupport [ mpi ]
+    ++ lib.optionals rocmSupport [ rocmtoolkit_joined ];
 
   propagatedBuildInputs = [
     cffi
@@ -388,8 +393,10 @@ in buildPythonPackage rec {
 
     # torch/csrc requires `pybind11` at runtime
     pybind11
+  ] ++ lib.optionals tritonSupport [ openai-triton ];
+
+  propagatedCxxBuildInputs = [
   ]
-  ++ lib.optionals tritonSupport [ openai-triton ]
   ++ lib.optionals MPISupport [ mpi ]
   ++ lib.optionals rocmSupport [ rocmtoolkit_joined ];
 
@@ -450,7 +457,10 @@ in buildPythonPackage rec {
       --replace "/build/source/torch/include" "$dev/include"
   '';
 
-  postFixup = lib.optionalString stdenv.isDarwin ''
+  postFixup = ''
+    mkdir -p "$cxxdev/nix-support"
+    printWords "''${propagatedCxxBuildInputs[@]}" >> "$cxxdev/nix-support/propagated-build-inputs"
+  '' + lib.optionalString stdenv.isDarwin ''
     for f in $(ls $lib/lib/*.dylib); do
         install_name_tool -id $lib/lib/$(basename $f) $f || true
     done
diff --git a/pkgs/development/python-modules/torchaudio/default.nix b/pkgs/development/python-modules/torchaudio/default.nix
index 207d2a6fade27..4d689d0b39064 100644
--- a/pkgs/development/python-modules/torchaudio/default.nix
+++ b/pkgs/development/python-modules/torchaudio/default.nix
@@ -44,17 +44,7 @@ buildPythonPackage rec {
   ];
   buildInputs = [
     pybind11
-  ] ++ lib.optionals cudaSupport [
-    cudaPackages.libcurand.dev
-    cudaPackages.libcurand.lib
-    cudaPackages.cuda_cudart # cuda_runtime.h and libraries
-    cudaPackages.cuda_cccl.dev # <thrust/*>
-    cudaPackages.cuda_nvtx.dev
-    cudaPackages.cuda_nvtx.lib # -llibNVToolsExt
-    cudaPackages.libcublas.dev
-    cudaPackages.libcublas.lib
-    cudaPackages.libcufft.dev
-    cudaPackages.libcufft.lib
+    torch.cxxdev
   ];
   propagatedBuildInputs = [
     torch
diff --git a/pkgs/development/python-modules/torchvision/default.nix b/pkgs/development/python-modules/torchvision/default.nix
index 401e415e2812a..46a933835f0cf 100644
--- a/pkgs/development/python-modules/torchvision/default.nix
+++ b/pkgs/development/python-modules/torchvision/default.nix
@@ -17,28 +17,6 @@ let
   inherit (torch) cudaCapabilities cudaPackages cudaSupport;
   inherit (cudaPackages) backendStdenv cudaVersion;
 
-  # NOTE: torchvision doesn't use cudnn; torch does!
-  #   For this reason it is not included.
-  cuda-common-redist = with cudaPackages; [
-    cuda_cccl # <thrust/*>
-    libcublas # cublas_v2.h
-    libcusolver # cusolverDn.h
-    libcusparse # cusparse.h
-  ];
-
-  cuda-native-redist = symlinkJoin {
-    name = "cuda-native-redist-${cudaVersion}";
-    paths = with cudaPackages; [
-      cuda_cudart # cuda_runtime.h
-      cuda_nvcc
-    ] ++ cuda-common-redist;
-  };
-
-  cuda-redist = symlinkJoin {
-    name = "cuda-redist-${cudaVersion}";
-    paths = cuda-common-redist;
-  };
-
   pname = "torchvision";
   version = "0.15.2";
 in
@@ -52,9 +30,15 @@ buildPythonPackage {
     hash = "sha256-KNbOgd6PCINZqZ24c/Ev+ODux3ik5iUlzem9uUfQArM=";
   };
 
-  nativeBuildInputs = [ libpng ninja which ] ++ lib.optionals cudaSupport [ cuda-native-redist ];
+  nativeBuildInputs = [
+    libpng
+    ninja
+    which
+  ] ++ lib.optionals cudaSupport [
+    cudaPackages.cuda_nvcc
+  ];
 
-  buildInputs = [ libjpeg_turbo libpng ] ++ lib.optionals cudaSupport [ cuda-redist ];
+  buildInputs = [ libjpeg_turbo libpng torch.cxxdev ];
 
   propagatedBuildInputs = [ numpy pillow torch scipy ];
 

From 50064e53acd3a224e388f0f1d4d4079ebd8e5747 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Thu, 30 Nov 2023 00:45:37 +0000
Subject: [PATCH 32/38] openvino: use opencv4.cxxdev in case cuda is enabled

(cherry picked from commit 3ececb9efafd80058525571d77d881767de6f5b8)
---
 pkgs/development/libraries/openvino/default.nix | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pkgs/development/libraries/openvino/default.nix b/pkgs/development/libraries/openvino/default.nix
index 5761f9e7bb645..6ff2be8ddbd6e 100644
--- a/pkgs/development/libraries/openvino/default.nix
+++ b/pkgs/development/libraries/openvino/default.nix
@@ -122,6 +122,7 @@ stdenv.mkDerivation rec {
     "-DENABLE_CPPLINT:BOOL=OFF"
     "-DBUILD_TESTING:BOOL=OFF"
     "-DENABLE_SAMPLES:BOOL=OFF"
+    (lib.cmakeBool "CMAKE_VERBOSE_MAKEFILE" true)
   ];
 
   env.NIX_CFLAGS_COMPILE = lib.optionalString stdenv.isAarch64 "-Wno-narrowing";
@@ -133,7 +134,7 @@ stdenv.mkDerivation rec {
   buildInputs = [
     libusb1
     libxml2
-    opencv
+    opencv.cxxdev
     protobuf
     pugixml
     tbb

From 484b846cdd78beaa67d6378e579a4c3dc39628c9 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Thu, 30 Nov 2023 00:59:33 +0000
Subject: [PATCH 33/38] cctag: unbreak the cuda variant

(cherry picked from commit 44611c4a6d16b0eeb1488e9557b6a11e45193a46)
---
 pkgs/development/libraries/cctag/default.nix | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkgs/development/libraries/cctag/default.nix b/pkgs/development/libraries/cctag/default.nix
index 2c1a5f9ae7863..238821b6af914 100644
--- a/pkgs/development/libraries/cctag/default.nix
+++ b/pkgs/development/libraries/cctag/default.nix
@@ -49,7 +49,7 @@ stdenv.mkDerivation rec {
   buildInputs = [
     boost179
     eigen
-    opencv
+    opencv.cxxdev
   ];
 
   # Tests are broken on Darwin (linking issue)

From f68149e7973fd676a5d05a4e5fe93ca074d6f768 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Tue, 5 Dec 2023 20:03:13 +0000
Subject: [PATCH 34/38] python311Packages.torch: enable_language(CUDA) wants to
 -lcudart_static?

(cherry picked from commit 2df7ccfa1498f5038b15acd50bc9277ad768dcbf)
---
 pkgs/development/python-modules/torch/default.nix | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix
index dbfa72b884a9e..b930f08aec73d 100644
--- a/pkgs/development/python-modules/torch/default.nix
+++ b/pkgs/development/python-modules/torch/default.nix
@@ -338,6 +338,7 @@ in buildPythonPackage rec {
       cuda_cccl.dev # <thrust/*>
       cuda_cudart.dev # cuda_runtime.h and libraries
       cuda_cudart.lib
+      cuda_cudart.static
       cuda_cupti.dev # For kineto
       cuda_cupti.lib # For kineto
       cuda_nvcc.dev # crt/host_config.h; even though we include this in nativeBuildinputs, it's needed here too

From 75cac5d64d5281175459dd8b76f1b701e2975014 Mon Sep 17 00:00:00 2001
From: Martin Weinelt <hexa@darmstadt.ccc.de>
Date: Fri, 8 Dec 2023 03:23:22 +0100
Subject: [PATCH 35/38] openvino: fix build by providing ocl-icd for
 libOpenCL.so.1

(cherry picked from commit 807a4c7b82731359b9171b951a39b76d91951e7b)
---
 pkgs/development/libraries/openvino/default.nix | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pkgs/development/libraries/openvino/default.nix b/pkgs/development/libraries/openvino/default.nix
index 6ff2be8ddbd6e..26fac012948d4 100644
--- a/pkgs/development/libraries/openvino/default.nix
+++ b/pkgs/development/libraries/openvino/default.nix
@@ -18,6 +18,7 @@
 # runtime
 , libusb1
 , libxml2
+, ocl-icd
 , opencv
 , protobuf
 , pugixml
@@ -134,6 +135,7 @@ stdenv.mkDerivation rec {
   buildInputs = [
     libusb1
     libxml2
+    ocl-icd
     opencv.cxxdev
     protobuf
     pugixml

From 270e8cb169f4a6c43c253d388ac65e3b99e07588 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Sun, 10 Dec 2023 01:38:43 +0000
Subject: [PATCH 36/38] cudaPackages.setupCudaHook: disable the guard for 23.11

---
 .../compilers/cudatoolkit/hooks/setup-cuda-hook.sh            | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh b/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh
index 7b7b3bdde80e3..4f1009adfc02e 100644
--- a/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh
+++ b/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh
@@ -1,7 +1,7 @@
 # shellcheck shell=bash
 
-# Only run the hook from nativeBuildInputs
-(( "$hostOffset" == -1 && "$targetOffset" == 0)) || return 0
+# Starting with 24.05: only run the hook from nativeBuildInputs
+# (( "$hostOffset" == -1 && "$targetOffset" == 0)) || return 0
 
 guard=Sourcing
 reason=

From d8ed2baa7c48a75470e2093e6dfdf497ffa31d4c Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Sat, 9 Dec 2023 00:50:05 +0000
Subject: [PATCH 37/38] cudaPackages.cudatoolkit: propagate the hook to
 nativeBuildInputs correctly

(cherry picked from commit 810599277409a7d564c159983f3bb4a51aed1ea3)
---
 pkgs/development/compilers/cudatoolkit/common.nix | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkgs/development/compilers/cudatoolkit/common.nix b/pkgs/development/compilers/cudatoolkit/common.nix
index 681549fa62dbe..0725fd56faf62 100644
--- a/pkgs/development/compilers/cudatoolkit/common.nix
+++ b/pkgs/development/compilers/cudatoolkit/common.nix
@@ -88,7 +88,7 @@ backendStdenv.mkDerivation rec {
   ] ++ lib.optionals (lib.versionAtLeast version "11.8") [
     qt6Packages.wrapQtAppsHook
   ];
-  depsTargetTargetPropagated = [
+  propagatedBuildInputs = [
     setupCudaHook
   ];
   buildInputs = lib.optionals (lib.versionOlder version "11") [

From e5d569fa2e15759fbe5fd7188843dc22b20e6989 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Fri, 15 Dec 2023 01:10:37 +0000
Subject: [PATCH 38/38] nvidia-optical-flow-sdk: refactor: propagation via
 setupCudaHook

(cherry picked from commit a33ae59eeb935515194f8edabbabe0df767fa8ba)
---
 .../libraries/nvidia-optical-flow-sdk/default.nix    | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/pkgs/development/libraries/nvidia-optical-flow-sdk/default.nix b/pkgs/development/libraries/nvidia-optical-flow-sdk/default.nix
index 2914d059cfaff..813821bfb71c2 100644
--- a/pkgs/development/libraries/nvidia-optical-flow-sdk/default.nix
+++ b/pkgs/development/libraries/nvidia-optical-flow-sdk/default.nix
@@ -1,4 +1,4 @@
-{ lib, stdenv, fetchFromGitHub }:
+{ lib, stdenv, fetchFromGitHub, cudaPackages }:
 
 stdenv.mkDerivation {
   pname = "nvidia-optical-flow-sdk";
@@ -18,10 +18,12 @@ stdenv.mkDerivation {
     cp -R * $out/include
   '';
 
-  postFixup = ''
-    mkdir -p $out/nix-support
-    echo $pname >> "$out/nix-support/include-in-cudatoolkit-root"
-  '';
+  # Makes setupCudaHook propagate nvidia-optical-flow-sdk together with cuda
+  # packages. Currently used by opencv4.cxxdev, hopefully can be removed in the
+  # future
+  nativeBuildInputs = [
+    cudaPackages.markForCudatoolkitRootHook
+  ];
 
   meta = with lib; {
     description = "Nvidia optical flow headers for computing the relative motion of pixels between images";