From 41bdf53f65684b54abaa3098a5fe3acf568cdf2a Mon Sep 17 00:00:00 2001
From: Babak Sarashki
Date: Wed, 3 Mar 2021 12:15:52 +0000
Subject: [PATCH] integ: gpu-operator helm charts

This commit adds the nvidia gpu-operator helm charts as a use case
for the custom container runtime feature.

To load nvidia-gpu-operator on StarlingX:

system service-parameter-add platform container_runtime \
custom_container_runtime=\
nvidia:/usr/local/nvidia/toolkit/nvidia-container-runtime

And define a RuntimeClass for nvidia gpu pods:

kind: RuntimeClass
apiVersion: node.k8s.io/v1beta1
metadata:
  name: nvidia
handler: nvidia

The above directs containerd to use nvidia-container-runtime when
creating pods that specify the nvidia runtimeClass; the
nvidia-container-runtime itself is installed by the operator onto a
hostMount.

Story: 2008434
Task: 41978

Signed-off-by: Babak Sarashki
Change-Id: I999804d4697349bc0966d0a6e653d7bce15e18fc
---
 centos_pkg_dirs                               |   1 +
 centos_tarball-dl.lst                         |   1 +
 gpu/gpu-operator/centos/build_srpm.data       |   8 +
 gpu/gpu-operator/centos/gpu-operator.spec     |  45 ++
 ...p-configmap-with-assets-for-volumemo.patch | 137 ++++
 ...-support-on-starlingx-cloud-platform.patch | 590 ++++++++++++++++++
 6 files changed, 782 insertions(+)
 create mode 100644 gpu/gpu-operator/centos/build_srpm.data
 create mode 100644 gpu/gpu-operator/centos/gpu-operator.spec
 create mode 100644 gpu/gpu-operator/files/deployments-setup-configmap-with-assets-for-volumemo.patch
 create mode 100644 gpu/gpu-operator/files/enablement-support-on-starlingx-cloud-platform.patch

diff --git a/centos_pkg_dirs b/centos_pkg_dirs
index 3c1e8365..41a920ac 100644
--- a/centos_pkg_dirs
+++ b/centos_pkg_dirs
@@ -85,3 +85,4 @@ python/python-webencodings
 python/python-daemon
 base/inih
 base/pf-bb-config
+gpu/gpu-operator
diff --git a/centos_tarball-dl.lst b/centos_tarball-dl.lst
index 6c0fda73..3924a01c 100644
--- a/centos_tarball-dl.lst
+++ b/centos_tarball-dl.lst
@@ -71,3 +71,4 @@ xxHash-1f40c6511fa8dd9d2e337ca8c9bc18b3e87663c9.tar.gz#xxHash#https://api.github
 zstd-f4340f46b2387bc8de7d5320c0b83bb1499933ad.tar.gz#zstd#https://api.github.com/repos/facebook/zstd/tarball/f4340f46b2387bc8de7d5320c0b83bb1499933ad#https##
 inih-b1dbff4b0bd1e1f40d237e21011f6dee0ec2fa69.tar.gz#inih-44#https://github.com/benhoyt/inih/tarball/b1dbff4b0bd1e1f40d237e21011f6dee0ec2fa69#https##
 pf-bb-config-791b4f38d15377d4fbb3c9799a652acbc405b088.tar.gz#pf-bb-config-20.11#https://github.com/intel/pf-bb-config/tarball/791b4f38d15377d4fbb3c9799a652acbc405b088#https##
+gpu-operator-1.6.0.tar.gz#gpu-operator-1.6.0#https://github.com/NVIDIA/gpu-operator/archive/1.6.0.tar.gz##https##
diff --git a/gpu/gpu-operator/centos/build_srpm.data b/gpu/gpu-operator/centos/build_srpm.data
new file mode 100644
index 00000000..927c712d
--- /dev/null
+++ b/gpu/gpu-operator/centos/build_srpm.data
@@ -0,0 +1,8 @@
+VERSION=1.6.0
+TAR_NAME=gpu-operator
+TAR="$TAR_NAME-$VERSION.tar.gz"
+COPY_LIST=" \
+    $PKG_BASE/files/* \
+    $STX_BASE/downloads/$TAR"
+
+TIS_PATCH_VER=PKG_GITREVCOUNT
diff --git a/gpu/gpu-operator/centos/gpu-operator.spec b/gpu/gpu-operator/centos/gpu-operator.spec
new file mode 100644
index 00000000..1db9fc95
--- /dev/null
+++ b/gpu/gpu-operator/centos/gpu-operator.spec
@@ -0,0 +1,45 @@
+# Build variables
+%global app_folder /usr/local/share/applications/helm
+
+Summary: StarlingX nvidia gpu-operator helm chart
+Name: gpu-operator
+Version: 1.6.0
+Release: 0%{?_tis_dist}.%{tis_patch_ver}
+License: Apache-2.0
+Group: base
+Packager: Wind River
+URL: https://github.com/NVIDIA/gpu-operator/tree/gh-pages
+
+Source0: 
%{name}-%{version}.tar.gz + +BuildArch: noarch + +Patch01: deployments-setup-configmap-with-assets-for-volumemo.patch +Patch02: enablement-support-on-starlingx-cloud-platform.patch + +BuildRequires: helm + +%define debug_package %{nil} +%description +StarlingX port of NVIDIA gpu-operator + +%prep +%setup + +%patch01 -p1 +%patch02 -p1 + +%build +cp -r assets deployments/gpu-operator/assets + +helm lint deployments/gpu-operator +mkdir build_results +helm package --version %{version} --app-version %{version} -d build_results deployments/gpu-operator + +%install +install -d -m 755 ${RPM_BUILD_ROOT}%{helm_folder} +install -p -D -m 755 build_results/%{name}-%{version}.tgz ${RPM_BUILD_ROOT}%{helm_folder} + +%files +%defattr(-,root,root,-) +%{helm_folder} diff --git a/gpu/gpu-operator/files/deployments-setup-configmap-with-assets-for-volumemo.patch b/gpu/gpu-operator/files/deployments-setup-configmap-with-assets-for-volumemo.patch new file mode 100644 index 00000000..6a7129fb --- /dev/null +++ b/gpu/gpu-operator/files/deployments-setup-configmap-with-assets-for-volumemo.patch @@ -0,0 +1,137 @@ +From de6068e56987960b7f3227dd4747e64b169742df Mon Sep 17 00:00:00 2001 +From: Babak Sarashki +Date: Sat, 6 Mar 2021 00:22:40 +0000 +Subject: [PATCH] deployments: setup configmap with assets for volumemounts + +This feature allows inclusion of assets/ in the helm chart and their +export to the gpu-operator pod through configmap volumeMounts. + +Signed-off-by: Babak Sarashki +--- + .../gpu-operator/templates/operator.yaml | 45 +++++++++++++++++++ + .../templates/operator_configmap.yaml | 36 +++++++++++++++ + deployments/gpu-operator/values.yaml | 2 + + 3 files changed, 83 insertions(+) + create mode 100644 deployments/gpu-operator/templates/operator_configmap.yaml + +diff --git a/deployments/gpu-operator/templates/operator.yaml b/deployments/gpu-operator/templates/operator.yaml +index 50983b20..90aa3874 100644 +--- a/deployments/gpu-operator/templates/operator.yaml ++++ b/deployments/gpu-operator/templates/operator.yaml +@@ -50,6 +50,45 @@ spec: + - name: host-os-release + mountPath: "/host-etc/os-release" + readOnly: true ++ ++ {{- if eq .Values.operator.include_assets "include_assets" }} ++ {{- range $path, $_ := .Files.Glob "assets/gpu-feature-discovery/*" }} ++ - name: assets ++ mountPath: {{ printf "/opt/gpu-operator/gpu-feature-discovery/%s" (base $path) }} ++ subPath: {{ printf "gfd_%s" (base $path) }} ++ {{- end }} ++ ++ {{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }} ++ - name: assets ++ mountPath: {{ printf "/opt/gpu-operator/state-container-toolkit/%s" (base $path) }} ++ subPath: {{ printf "state_container_toolkit_%s" (base $path) }} ++ {{- end }} ++ ++ {{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }} ++ - name: assets ++ mountPath: {{ printf "/opt/gpu-operator/state-device-plugin/%s" (base $path) }} ++ subPath: {{ printf "state_device_%s" (base $path) }} ++ {{- end }} ++ ++ {{- range $path, $_ := .Files.Glob "assets/state-device-plugin-validation/*" }} ++ - name: assets ++ mountPath: {{ printf "/opt/gpu-operator/state-device-plugin-validation/%s" (base $path) }} ++ subPath: {{ printf "state_device_validation_%s" (base $path) }} ++ {{- end }} ++ ++ {{- range $path, $_ := .Files.Glob "assets/state-driver/*" }} ++ - name: assets ++ mountPath: {{ printf "/opt/gpu-operator/state-driver/%s" (base $path) }} ++ subPath: {{ printf "state_driver_%s" (base $path) }} ++ {{- end }} ++ ++ {{- range $path, $_ := .Files.Glob "assets/state-monitoring/*" }} ++ - name: 
assets
++        mountPath: {{ printf "/opt/gpu-operator/state-monitoring/%s" (base $path) }}
++        subPath: {{ printf "state_monitor_%s" (base $path) }}
++      {{- end }}
++      {{- end }}
++
+         readinessProbe:
+           exec:
+             command: ["stat", "/tmp/operator-sdk-ready"]
+@@ -63,6 +102,12 @@ spec:
+         - name: host-os-release
+           hostPath:
+             path: "/etc/os-release"
++      {{- if eq .Values.operator.include_assets "include_assets" }}
++        - name: assets
++          configMap:
++            name: operator-configmap
++      {{- end }}
++
+       {{- with .Values.operator.nodeSelector }}
+       nodeSelector:
+         {{- toYaml . | nindent 8 }}
+diff --git a/deployments/gpu-operator/templates/operator_configmap.yaml b/deployments/gpu-operator/templates/operator_configmap.yaml
+new file mode 100644
+index 00000000..61f366e8
+--- /dev/null
++++ b/deployments/gpu-operator/templates/operator_configmap.yaml
+@@ -0,0 +1,36 @@
++{{- if eq .Values.operator.include_assets "include_assets" }}
++apiVersion: v1
++kind: ConfigMap
++metadata:
++  name: operator-configmap
++data:
++{{- range $path, $_ := .Files.Glob "assets/gpu-feature-discovery/*" }}
++{{ printf "gfd_%s" (base $path) | indent 2 }}: |-
++{{ $.Files.Get $path | indent 4 }}
++{{- end }}
++
++{{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }}
++{{ printf "state_container_toolkit_%s" (base $path) | indent 2 }}: |-
++{{ $.Files.Get $path | indent 4 }}
++{{- end }}
++
++{{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }}
++{{ printf "state_device_%s" (base $path) | indent 2 }}: |-
++{{ $.Files.Get $path | indent 4 }}
++{{- end }}
++
++{{- range $path, $_ := .Files.Glob "assets/state-device-plugin-validation/*" }}
++{{ printf "state_device_validation_%s" (base $path) | indent 2 }}: |-
++{{ $.Files.Get $path | indent 4 }}
++{{- end }}
++
++{{- range $path, $_ := .Files.Glob "assets/state-driver/*" }}
++{{ printf "state_driver_%s" (base $path) | indent 2 }}: |-
++{{ $.Files.Get $path | indent 4 }}
++{{- end }}
++
++{{- range $path, $_ := .Files.Glob "assets/state-monitoring/*" }}
++{{ printf "state_monitor_%s" (base $path) | indent 2 }}: |-
++{{ $.Files.Get $path | indent 4 }}
++{{- end }}
++{{- end }}
+diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml
+index 00d94195..8b43c59f 100644
+--- a/deployments/gpu-operator/values.yaml
++++ b/deployments/gpu-operator/values.yaml
+@@ -39,6 +39,8 @@ operator:
+       values: [""]
+   logging:
+     timeEncoding: epoch
++  # Set to "include_assets" to include assets/gpu-operator with the helm chart
++  include_assets: ""
+ 
+ driver:
+   repository: nvcr.io/nvidia
+-- 
+2.17.1
+
diff --git a/gpu/gpu-operator/files/enablement-support-on-starlingx-cloud-platform.patch b/gpu/gpu-operator/files/enablement-support-on-starlingx-cloud-platform.patch
new file mode 100644
index 00000000..7608a2dd
--- /dev/null
+++ b/gpu/gpu-operator/files/enablement-support-on-starlingx-cloud-platform.patch
@@ -0,0 +1,590 @@
+From eeb01daae7a39db2717198e03d2aa1e73c7130d8 Mon Sep 17 00:00:00 2001
+From: Babak Sarashki
+Date: Sun, 7 Mar 2021 17:19:08 +0000
+Subject: [PATCH] enablement: support on starlingx cloud platform
+
+StarlingX is a cloud infrastructure software stack for the edge.
+It has an immutable file system and system configuration; for
+instance, changes made by the gpu-operator to the containerd
+configuration will be overridden and must be avoided. The
+default_runtime must therefore remain docker.
+
+This commit enables the gpu-operator on StarlingX (starlingx.io).
+The changes to the gpu-operator include bundling modified assets
+and a modified version of the nvidia-driver script with the helm
+charts.
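+
+For example, the runtimeclass referenced by the modified assets can
+be created as follows (a sketch; both the name and the handler must
+be nvidia to match the bundled assets):
+
+$ cat <<EOF | kubectl apply -f -
+kind: RuntimeClass
+apiVersion: node.k8s.io/v1beta1
+metadata:
+  name: nvidia
+handler: nvidia
+EOF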
+
+The modifications to the assets include setting the runtimeClassName
+on the gpu-operator pods that require nvidia-container-runtime, and
+host-mounting the kernel headers and build directory. The changes to
+the nvidia-driver script account for pre-installed kernel packages.
+
+To load the operator on starlingx, define a runtimeclass with name
+and handler set to nvidia, as shown above; thereafter:
+
+$ source /etc/platform/openrc
+[...(keystone_admin)]$ system service-parameter-add \
+    platform container_runtime \
+    custom_container_runtime=nvidia:/path/to/nvidia-container-runtime
+
+[...(keystone_admin)]$ system host-lock 1; system host-unlock 1
+
+Signed-off-by: Babak Sarashki
+---
+ .../gpu-feature-discovery/0500_daemonset.yaml |   1 +
+ .../cuda-vector-add.yaml                      |   1 +
+ .../0400_device_plugin.yml                    |   1 +
+ assets/state-driver/0400_configmap.yaml       | 327 +++++++++++++++++-
+ assets/state-driver/0500_daemonset.yaml       |  39 ++-
+ assets/state-monitoring/0900_daemonset.yaml   |   1 +
+ .../gpu-operator/templates/operator.yaml      |  12 +-
+ deployments/gpu-operator/values.yaml          |   8 +-
+ 8 files changed, 379 insertions(+), 11 deletions(-)
+
+diff --git a/assets/gpu-feature-discovery/0500_daemonset.yaml b/assets/gpu-feature-discovery/0500_daemonset.yaml
+index 9785dc93..1589e710 100644
+--- a/assets/gpu-feature-discovery/0500_daemonset.yaml
++++ b/assets/gpu-feature-discovery/0500_daemonset.yaml
+@@ -18,6 +18,7 @@ spec:
+         app.kubernetes.io/part-of: nvidia-gpu
+     spec:
+       serviceAccount: nvidia-gpu-feature-discovery
++      runtimeClassName: nvidia
+       containers:
+       - image: "FILLED BY THE OPERATOR"
+         name: gpu-feature-discovery
+diff --git a/assets/state-device-plugin-validation/cuda-vector-add.yaml b/assets/state-device-plugin-validation/cuda-vector-add.yaml
+index cfb547ad..8269adeb 100644
+--- a/assets/state-device-plugin-validation/cuda-vector-add.yaml
++++ b/assets/state-device-plugin-validation/cuda-vector-add.yaml
+@@ -12,6 +12,7 @@ spec:
+     effect: NoSchedule
+   readOnlyRootFilesystem: true
+   restartPolicy: OnFailure
++  runtimeClassName: nvidia
+   initContainers:
+   - name: device-plugin-validation-init
+     image: "FILLED BY THE OPERATOR"
+diff --git a/assets/state-device-plugin/0400_device_plugin.yml b/assets/state-device-plugin/0400_device_plugin.yml
+index a5cf7fae..84e9c534 100644
+--- a/assets/state-device-plugin/0400_device_plugin.yml
++++ b/assets/state-device-plugin/0400_device_plugin.yml
+@@ -30,6 +30,7 @@ spec:
+           operator: Exists
+           effect: NoSchedule
+       serviceAccount: nvidia-device-plugin
++      runtimeClassName: nvidia
+       initContainers:
+       - name: toolkit-validation
+         image: "FILLED BY THE OPERATOR"
+diff --git a/assets/state-driver/0400_configmap.yaml b/assets/state-driver/0400_configmap.yaml
+index 48e9f51e..561adc9f 100644
+--- a/assets/state-driver/0400_configmap.yaml
++++ b/assets/state-driver/0400_configmap.yaml
+@@ -4,7 +4,7 @@ metadata:
+   name: nvidia-driver
+   namespace: gpu-operator-resources
+ data:
+-  oci-nvidia-hook-json: | 
++  oci-nvidia-hook-json: |
+     {
+         "version": "1.0.0",
+         "hook": {
+@@ -20,3 +20,328 @@ data:
+         },
+         "stages": ["prestart"]
+     }
++  nvidia-driver-build-script: |
++    #! /bin/bash
++    # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
++    # Copyright (c) 2021 Wind River Systems, Inc. SPDX-License-Identifier:
++    # Apache-2.0.
++    # This script is from: https://gitlab.com/nvidia/container-images/driver.
++    # It is modified and included under a configmap for platforms that require
++    # pre-installed packages. Such platforms have the option to modify the
++    # entrypoint in 0500_daemonset.yaml, or the nvidia-driver script here, for
++    # further customizations.
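++    #
++    # Flow overview: the driver container invokes this script as
++    # "nvidia-driver init". init unloads any previously loaded NVIDIA
++    # modules, builds or reuses a precompiled driver package for the
++    # running kernel, installs and loads the modules, and bind-mounts
++    # the driver rootfs under /run/nvidia/driver. Kernel updates
++    # re-enter the script via "nvidia-driver update" from the postinst
++    # hook written below.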
++
++    set -eu
++
++    RUN_DIR=/run/nvidia
++    PID_FILE=${RUN_DIR}/${0##*/}.pid
++    DRIVER_VERSION=${DRIVER_VERSION:?"Missing driver version"}
++    KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
++    KERNEL_VERSION="$(uname -r)"
++
++    # Default to 0; 1 is experimental and not supported
++    export IGNORE_PREEMPT_RT_PRESENCE=0
++
++    # Check if the kernel version requires a new precompiled driver package.
++    _kernel_requires_package() {
++        local proc_mount_arg=""
++
++        echo "Checking NVIDIA driver packages..."
++        cd /usr/src/nvidia-${DRIVER_VERSION}/kernel
++
++        # When the kernel version is the latest on the host, this check fails
++        # and leads to recompilation, even when precompiled modules exist.
++        #if [ "${KERNEL_VERSION}" != "$(uname -r)" ]; then
++        #Not needed with pre-installed readonly headers, devel and modules
++        #proc_mount_arg="--proc-mount-point /lib/modules/${KERNEL_VERSION}/proc"
++        #fi
++        for pkg_name in $(ls -d -1 precompiled/** 2> /dev/null); do
++            is_match=$(../mkprecompiled --match ${pkg_name} ${proc_mount_arg})
++            if [ "${is_match}" == "kernel interface matches." ]; then
++                echo "Found NVIDIA driver package ${pkg_name##*/}"
++                return 1
++            fi
++        done
++        return 0
++    }
++
++    # Compile the kernel modules, optionally sign them, and generate a
++    # precompiled package for use by the nvidia-installer.
++    _create_driver_package() (
++        local pkg_name="nvidia-modules-${KERNEL_VERSION%%-*}${PACKAGE_TAG:+-${PACKAGE_TAG}}"
++        local nvidia_sign_args=""
++        local nvidia_modeset_sign_args=""
++        local nvidia_uvm_sign_args=""
++
++        trap "make -s -j 4 SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT
++
++        echo "Compiling NVIDIA driver kernel modules..."
++        cd /usr/src/nvidia-${DRIVER_VERSION}/kernel
++
++        export IGNORE_CC_MISMATCH=1
++        make -s -j 4 SYSSRC=/lib/modules/${KERNEL_VERSION}/build nv-linux.o nv-modeset-linux.o > /dev/null
++
++        echo "Relinking NVIDIA driver kernel modules..."
++        rm -f nvidia.ko nvidia-modeset.ko
++        ld -d -r -o nvidia.ko ./nv-linux.o ./nvidia/nv-kernel.o_binary
++        ld -d -r -o nvidia-modeset.ko ./nv-modeset-linux.o ./nvidia-modeset/nv-modeset-kernel.o_binary
++
++        if [ -n "${PRIVATE_KEY}" ]; then
++            echo "Signing NVIDIA driver kernel modules..."
++            donkey get ${PRIVATE_KEY} sh -c "PATH=${PATH}:/usr/src/kernels/$(uname -r)/scripts && \
++                sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia.ko nvidia.ko.sign && \
++                sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia-modeset.ko nvidia-modeset.ko.sign && \
++                sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia-uvm.ko"
++            nvidia_sign_args="--linked-module nvidia.ko --signed-module nvidia.ko.sign"
++            nvidia_modeset_sign_args="--linked-module nvidia-modeset.ko --signed-module nvidia-modeset.ko.sign"
++            nvidia_uvm_sign_args="--signed"
++        fi
++
++        echo "Building NVIDIA driver package ${pkg_name}..."
++        ../mkprecompiled --pack ${pkg_name} --description ${KERNEL_VERSION} \
++                         --driver-version ${DRIVER_VERSION} \
++                         --kernel-interface nv-linux.o \
++                         --linked-module-name nvidia.ko \
++                         --core-object-name nvidia/nv-kernel.o_binary \
++                         ${nvidia_sign_args} \
++                         --target-directory . \
++                         --kernel-interface nv-modeset-linux.o \
++                         --linked-module-name nvidia-modeset.ko \
++                         --core-object-name nvidia-modeset/nv-modeset-kernel.o_binary \
++                         ${nvidia_modeset_sign_args} \
++                         --target-directory . \
++                         --kernel-module nvidia-uvm.ko \
++                         ${nvidia_uvm_sign_args} \
++                         --target-directory .
++        mkdir -p precompiled
++        mv ${pkg_name} precompiled
++    )
++
++    # Load the kernel modules and start persistenced.
++    _load_driver() {
++        echo "Loading IPMI kernel module..."
++        modprobe ipmi_msghandler
++
++        echo "Loading NVIDIA driver kernel modules..."
++        modprobe -a nvidia nvidia-uvm nvidia-modeset
++
++        echo "Starting NVIDIA persistence daemon..."
++        nvidia-persistenced --persistence-mode
++    }
++
++    # Stop persistenced and unload the kernel modules if they are currently loaded.
++    _unload_driver() {
++        local rmmod_args=()
++        local nvidia_deps=0
++        local nvidia_refs=0
++        local nvidia_uvm_refs=0
++        local nvidia_modeset_refs=0
++
++        echo "Stopping NVIDIA persistence daemon..."
++        if [ -f /var/run/nvidia-persistenced/nvidia-persistenced.pid ]; then
++            local pid=$(< /var/run/nvidia-persistenced/nvidia-persistenced.pid)
++
++            kill -SIGTERM "${pid}"
++            for i in $(seq 1 10); do
++                kill -0 "${pid}" 2> /dev/null || break
++                sleep 0.1
++            done
++            if [ $i -eq 10 ]; then
++                echo "Could not stop NVIDIA persistence daemon" >&2
++                return 1
++            fi
++        fi
++
++        echo "Unloading NVIDIA driver kernel modules..."
++        if [ -f /sys/module/nvidia_modeset/refcnt ]; then
++            nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt)
++            rmmod_args+=("nvidia-modeset")
++            ((++nvidia_deps))
++        fi
++        if [ -f /sys/module/nvidia_uvm/refcnt ]; then
++            nvidia_uvm_refs=$(< /sys/module/nvidia_uvm/refcnt)
++            rmmod_args+=("nvidia-uvm")
++            ((++nvidia_deps))
++        fi
++        if [ -f /sys/module/nvidia/refcnt ]; then
++            nvidia_refs=$(< /sys/module/nvidia/refcnt)
++            rmmod_args+=("nvidia")
++        fi
++        if [ ${nvidia_refs} -gt ${nvidia_deps} ] || [ ${nvidia_uvm_refs} -gt 0 ] || [ ${nvidia_modeset_refs} -gt 0 ]; then
++            echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2
++            return 1
++        fi
++
++        if [ ${#rmmod_args[@]} -gt 0 ]; then
++            rmmod ${rmmod_args[@]}
++        fi
++        return 0
++    }
++
++    # Link and install the kernel modules from a precompiled package using the nvidia-installer.
++    _install_driver() {
++        local install_args=()
++
++        echo "Installing NVIDIA driver kernel modules..."
++        cd /usr/src/nvidia-${DRIVER_VERSION}
++        rm -rf /lib/modules/${KERNEL_VERSION}/video
++
++        if [ "${ACCEPT_LICENSE}" = "yes" ]; then
++            install_args+=("--accept-license")
++        fi
++        nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check ${install_args[@]+"${install_args[@]}"}
++        # May need to add --no-cc-version-check for RHEL, otherwise it
++        # complains about cc missing in the path.
++        # /proc/version and /lib/modules/KERNEL_VERSION/proc differ; by
++        # default the installer looks at /proc, hence the proc-mount-point
++        # option in the commented command below.
++        # TODO: remove the -a flag; it's not needed. In the new driver
++        # version, license acceptance is implicit.
++        #nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"}
++    }
++
++    # Mount the driver rootfs into the run directory with the exception of sysfs.
++    _mount_rootfs() {
++        echo "Mounting NVIDIA driver rootfs..."
++        mount --make-runbindable /sys
++        mount --make-private /sys
++        mkdir -p ${RUN_DIR}/driver
++        mount --rbind / ${RUN_DIR}/driver
++    }
++
++    # Unmount the driver rootfs from the run directory.
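++    # The unmount is lazy (-l) and recursive (-R), since running containers
++    # may still hold references to the bind-mounted driver rootfs.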
++ _unmount_rootfs() { ++ echo "Unmounting NVIDIA driver rootfs..." ++ if findmnt -r -o TARGET | grep "${RUN_DIR}/driver" > /dev/null; then ++ umount -l -R ${RUN_DIR}/driver ++ fi ++ } ++ ++ # Write a kernel postinst.d script to automatically precompile packages on kernel update (similar to DKMS). ++ _write_kernel_update_hook() { ++ if [ ! -d ${KERNEL_UPDATE_HOOK%/*} ]; then ++ return ++ fi ++ ++ echo "Writing kernel update hook..." ++ cat > ${KERNEL_UPDATE_HOOK} <<'EOF' ++ #!/bin/bash ++ ++ set -eu ++ trap 'echo "ERROR: Failed to update the NVIDIA driver" >&2; exit 0' ERR ++ ++ NVIDIA_DRIVER_PID=$(< /run/nvidia/nvidia-driver.pid) ++ ++ export "$(grep -z DRIVER_VERSION /proc/${NVIDIA_DRIVER_PID}/environ)" ++ nsenter -t "${NVIDIA_DRIVER_PID}" -m -- nvidia-driver update --kernel "$1" ++ EOF ++ chmod +x ${KERNEL_UPDATE_HOOK} ++ } ++ ++ _shutdown() { ++ if _unload_driver; then ++ _unmount_rootfs ++ rm -f ${PID_FILE} ${KERNEL_UPDATE_HOOK} ++ return 0 ++ fi ++ return 1 ++ } ++ ++ init() { ++ echo -e "\n========== NVIDIA Software Installer ==========\n" ++ echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n" ++ ++ exec 3> ${PID_FILE} ++ if ! flock -n 3; then ++ echo "An instance of the NVIDIA driver is already running, aborting" ++ exit 1 ++ fi ++ echo $$ >&3 ++ ++ trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM ++ trap "_shutdown" EXIT ++ ++ _unload_driver || exit 1 ++ _unmount_rootfs ++ ++ if _kernel_requires_package; then ++ _create_driver_package ++ fi ++ ++ _install_driver ++ _load_driver ++ _mount_rootfs ++ _write_kernel_update_hook ++ ++ echo "Done, now waiting for signal" ++ sleep infinity & ++ trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM ++ trap - EXIT ++ while true; do wait $! || continue; done ++ exit 0 ++ } ++ ++ update() { ++ exec 3>&2 ++ if exec 2> /dev/null 4< ${PID_FILE}; then ++ if ! 
flock -n 4 && read pid <&4 && kill -0 "${pid}"; then ++ exec > >(tee -a "/proc/${pid}/fd/1") ++ exec 2> >(tee -a "/proc/${pid}/fd/2" >&3) ++ else ++ exec 2>&3 ++ fi ++ exec 4>&- ++ fi ++ exec 3>&- ++ ++ echo -e "\n========== NVIDIA Software Updater ==========\n" ++ echo -e "Starting update of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n" ++ ++ trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM ++ ++ if _kernel_requires_package; then ++ _create_driver_package ++ fi ++ ++ echo "Done" ++ exit 0 ++ } ++ ++ usage() { ++ cat >&2 < ++ cat /usr/local/bin/nvidia-driver.22 > /usr/local/bin/nvidia-driver && ++ chmod 755 /usr/local/bin/nvidia-driver && ++ mkdir -p /usr/src/kernels && ++ tar -C /usr/src/host-kernels/ -c $(uname -r) -f - | tar -C /usr/src/kernels/ -xf - && ++ rm -rf /lib/modules/ && mkdir -p /lib/modules/ && ++ tar -C /lib/host-modules/ -c $(uname -r) -f - | tar -C /lib/modules/ -xf - && ++ ln -rfs /usr/lib64/libelf.so.1 /usr/lib/libelf.so && ++ /usr/local/bin/nvidia-driver init + securityContext: + privileged: true + seLinuxOptions: +@@ -44,10 +55,23 @@ spec: + mountPropagation: Bidirectional + - name: config + mountPath: /etc/containers/oci/hooks.d ++ subPath: oci-nvidia-hook-json ++ - name: config ++ mountPath: /usr/local/bin/nvidia-driver.22 ++ subPath: nvidia-driver-build-script + - name: var-log + mountPath: /var/log + - name: dev-log + mountPath: /dev/log ++ - name: host-modules ++ mountPath: /lib/host-modules ++ readOnly: true ++ - name: host-include ++ mountPath: /usr/include ++ readOnly: true ++ - name: host-kernel-devel ++ mountPath: /usr/src/host-kernels ++ readOnly: true + volumes: + - name: run-nvidia + hostPath: +@@ -58,11 +82,22 @@ spec: + - name: dev-log + hostPath: + path: /dev/log ++ - name: host-modules ++ hostPath: ++ path: /lib/modules ++ - name: host-kernel-devel ++ hostPath: ++ path: /usr/src/kernels/ ++ - name: host-include ++ hostPath: ++ path: /usr/include + - name: config + configMap: + name: nvidia-driver + items: + - key: oci-nvidia-hook-json + path: oci-nvidia-hook.json ++ - key: nvidia-driver-build-script ++ path: nvidia-driver-build-script + nodeSelector: + nvidia.com/gpu.present: "true" +diff --git a/assets/state-monitoring/0900_daemonset.yaml b/assets/state-monitoring/0900_daemonset.yaml +index 38c4d63a..aebb4297 100644 +--- a/assets/state-monitoring/0900_daemonset.yaml ++++ b/assets/state-monitoring/0900_daemonset.yaml +@@ -31,6 +31,7 @@ spec: + effect: NoSchedule + serviceAccount: nvidia-dcgm-exporter + serviceAccountName: nvidia-dcgm-exporter ++ runtimeClassName: nvidia + initContainers: + - name: toolkit-validation + image: "FILLED BY THE OPERATOR" +diff --git a/deployments/gpu-operator/templates/operator.yaml b/deployments/gpu-operator/templates/operator.yaml +index 439b78ba..90aa3874 100644 +--- a/deployments/gpu-operator/templates/operator.yaml ++++ b/deployments/gpu-operator/templates/operator.yaml +@@ -57,38 +57,38 @@ spec: + mountPath: {{ printf "/opt/gpu-operator/gpu-feature-discovery/%s" (base $path) }} + subPath: {{ printf "gfd_%s" (base $path) }} + {{- end }} +- ++ + {{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }} + - name: assets + mountPath: {{ printf "/opt/gpu-operator/state-container-toolkit/%s" (base $path) }} + subPath: {{ printf "state_container_toolkit_%s" (base $path) }} + {{- end }} +- ++ + {{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }} + - name: assets + mountPath: {{ printf "/opt/gpu-operator/state-device-plugin/%s" (base $path) }} + 
subPath: {{ printf "state_device_%s" (base $path) }} + {{- end }} +- ++ + {{- range $path, $_ := .Files.Glob "assets/state-device-plugin-validation/*" }} + - name: assets + mountPath: {{ printf "/opt/gpu-operator/state-device-plugin-validation/%s" (base $path) }} + subPath: {{ printf "state_device_validation_%s" (base $path) }} + {{- end }} +- ++ + {{- range $path, $_ := .Files.Glob "assets/state-driver/*" }} + - name: assets + mountPath: {{ printf "/opt/gpu-operator/state-driver/%s" (base $path) }} + subPath: {{ printf "state_driver_%s" (base $path) }} + {{- end }} +- ++ + {{- range $path, $_ := .Files.Glob "assets/state-monitoring/*" }} + - name: assets + mountPath: {{ printf "/opt/gpu-operator/state-monitoring/%s" (base $path) }} + subPath: {{ printf "state_monitor_%s" (base $path) }} + {{- end }} + {{- end }} +- ++ + readinessProbe: + exec: + command: ["stat", "/tmp/operator-sdk-ready"] +diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml +index 8b43c59f..17662729 100644 +--- a/deployments/gpu-operator/values.yaml ++++ b/deployments/gpu-operator/values.yaml +@@ -15,6 +15,10 @@ operator: + #version: 1.5.2 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] ++ # We cannot default to containerd because the operator modifies containerd ++ # configuration by adding itself to it, either as the default runtime or a ++ # runtimeclass, and restarts the service thereafter. ++ # defaultRuntime: containerd + defaultRuntime: docker + validator: + image: cuda-sample +@@ -40,7 +44,7 @@ operator: + logging: + timeEncoding: epoch + # Set to "include_assets" to include assets/gpu-operator with the helm chart +- include_assets: "" ++ include_assets: "include_assets" + + driver: + repository: nvcr.io/nvidia +@@ -73,7 +77,7 @@ driver: + toolkit: + repository: nvcr.io/nvidia/k8s + image: container-toolkit +- version: 1.4.5-ubuntu18.04 ++ version: 1.4.5-ubi8 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] +-- +2.17.1 +
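
Deployment note: after both patches are applied and the host is
unlocked, the chart can be installed and verified. A minimal sketch,
assuming %{helm_folder} resolves to /usr/local/share/applications/helm
(per app_folder in the spec above) and using an illustrative release
name:

$ helm install gpu-operator \
    /usr/local/share/applications/helm/gpu-operator-1.6.0.tgz
$ kubectl get runtimeclass nvidia
$ kubectl get pods -n gpu-operator-resources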