Skip to content

Commit

Permalink
Fix driver installation on Rocky Linux 9
Browse files Browse the repository at this point in the history
  • Loading branch information
LujieDuan committed May 15, 2024
1 parent 5fad654 commit 4abb0a6
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 53 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
set -e
source /etc/os-release
VERSION_ID=${VERSION_ID%%.*}
MAJOR_VERSION_ID=${VERSION_ID%%.*}

verify_driver() {
# Verify NVIDIA driver:
Expand All @@ -11,14 +11,39 @@ verify_driver() {
nvidia-smi
}

#######################################
# Register the Rocky Linux "vault" AppStream repository with yum.
# Ref: https://wiki.rockylinux.org/rocky/repo/#vault
# The regular Rocky Linux 9 repo only has the latest kernel-devel package,
# and the vault repo contains all previous versions of the kernel-devel
# package; need to get a version that matches the kernel to build the driver.
# Globals:   VERSION_ID (read) - release version used in the vault URL
# Outputs:   writes /etc/yum.repos.d/appstream-vault.repo (root:root, 0644)
# Returns:   0 on success; non-zero if VERSION_ID is empty, the temp file
#            cannot be created, or the install step fails
#######################################
setup_rocky_vault_repo() {
  local repo_path=/etc/yum.repos.d/appstream-vault.repo
  local staged_repo
  local status=0

  # An empty VERSION_ID would silently produce a baseurl that cannot resolve.
  if [[ -z "${VERSION_ID:-}" ]]; then
    echo "setup_rocky_vault_repo: VERSION_ID is not set" >&2
    return 1
  fi

  # Stage the file in a private temp file instead of a fixed name in the
  # current directory, which may be unwritable or already hold that name.
  staged_repo=$(mktemp) || return

  # \$releasever and \$basearch are escaped so yum expands them, not the shell.
  cat << EOF > "$staged_repo"
[appstream-vault]
name=Rocky Linux \$releasever - AppStream - Vault
baseurl=http://dl.rockylinux.org/vault/rocky/$VERSION_ID/AppStream/\$basearch/os/
gpgcheck=1
enabled=1
countme=1
metadata_expire=6h
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
EOF

  # One privileged step that copies the file and sets owner, group and mode.
  sudo install -o root -g root -m 0644 -- "$staged_repo" "$repo_path" || status=$?
  rm -f -- "$staged_repo"
  return "$status"
}

install_driver_from_runfile() {
# Ref: https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html#runfile
# This method requires the matching kernel-devel package to be installed, and
# the package may be absent from the repo and cause this method to fail
# Remove existing installation before using the runfile
remove_driver_package
if [ ${ID} == rocky ] && [ "$MAJOR_VERSION_ID" = 9 ]; then
setup_rocky_vault_repo
fi
sudo yum install -y kernel-devel-$(uname -r) pciutils gcc make wget yum-utils
local DRIVER_VERSION=535.129.03
local DRIVER_VERSION=550.54.15
echo "Installing NVIDIA Data Center driver $DRIVER_VERSION"
curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
Expand All @@ -28,7 +53,7 @@ install_driver_from_runfile() {
# Add NVIDIA's CUDA yum repository for this RHEL-compatible release.
# Installs yum-utils (presumably for yum-config-manager), registers the
# cuda-rhel<N>.repo file from developer.download.nvidia.com, then runs
# `yum clean all`.
# NOTE(review): this text comes from a diff view without +/- markers; the two
# --add-repo lines look like the superseded ($VERSION_ID) and replacement
# ($MAJOR_VERSION_ID) versions of one line - confirm against the real file.
setup_repo() {
sudo yum install -y yum-utils
sudo yum-config-manager \
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$VERSION_ID/x86_64/cuda-rhel$VERSION_ID.repo
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$MAJOR_VERSION_ID/x86_64/cuda-rhel$MAJOR_VERSION_ID.repo
sudo yum clean all
}

Expand All @@ -51,7 +76,7 @@ install_dcgm() {
try_install() {
# Export all functions for the bash subprocess
eval "$(declare -F | sed 's/ -f / -fx /')"
export VERSION_ID
export MAJOR_VERSION_ID VERSION_ID ID
for install_method in "$@"; do
echo "Installing NVIDIA driver with $install_method..."
# Can't use a subshell because of https://lists.gnu.org/archive/html/bug-bash/2012-12/msg00094.html
Expand Down Expand Up @@ -81,17 +106,12 @@ handle_rhel7() {

handle_common() {
install_driver_package() {
#TODO: b/332690428 - Remove this temporary fix for Rocky Linux 9 rocky-linux-9-v20240313 with kernel 5.14.0-362.18.1.el9_3.0.1.x86_64
case $(uname -r) in
5.14.0-362.18.1.el9_3.0.1.x86_64)
sudo ln -s /lib/modules/5.14.0-362.18.1.el9_3.0.1.x86_64 /lib/modules/5.14.0-362.18.1.el9_3.x86_64
sudo yum -y module install nvidia-driver:545
;;
*)
# Ref: https://developer.nvidia.com/cuda-12-2-2-download-archive?target_os=Linux&target_arch=x86_64&Distribution=RHEL&target_version=8&target_type=rpm_network
sudo yum -y module install nvidia-driver:latest
;;
esac
#TODO: b/332690428 - Remove this temporary fix for Rocky Linux 9 rocky-linux-9-v20240313 with kernel 5.14.0-362.24.1.el9_3.0.1.x86_64
if [ $(uname -r) == 5.14.0-362.24.1.el9_3.0.1.x86_64 ]; then
sudo ln -s /lib/modules/5.14.0-362.24.1.el9_3.0.1.x86_64 /lib/modules/5.14.0-362.24.1.el9_3.x86_64
fi
# Ref: https://developer.nvidia.com/cuda-12-2-2-download-archive?target_os=Linux&target_arch=x86_64&Distribution=RHEL&target_version=8&target_type=rpm_network
sudo yum -y module install nvidia-driver:latest
}

remove_driver_package() {
Expand All @@ -100,7 +120,7 @@ handle_common() {
}
}

case "$VERSION_ID" in
case "$MAJOR_VERSION_ID" in
7) handle_rhel7;;
*) handle_common;;
esac
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
set -e
source /etc/os-release
VERSION_ID=${VERSION_ID%%.*}
MAJOR_VERSION_ID=${VERSION_ID%%.*}

verify_driver() {
# Verify NVIDIA driver:
Expand All @@ -11,53 +11,60 @@ verify_driver() {
nvidia-smi
}

#######################################
# Register the Rocky Linux "vault" AppStream repository with yum.
# Ref: https://wiki.rockylinux.org/rocky/repo/#vault
# The regular Rocky Linux 9 repo only has the latest kernel-devel package,
# and the vault repo contains all previous versions of the kernel-devel
# package; need to get a version that matches the kernel to build the driver.
# Globals:   VERSION_ID (read) - release version used in the vault URL
# Outputs:   writes /etc/yum.repos.d/appstream-vault.repo (root:root, 0644)
# Returns:   0 on success; non-zero if VERSION_ID is empty, the temp file
#            cannot be created, or the install step fails
#######################################
setup_rocky_vault_repo() {
  local repo_path=/etc/yum.repos.d/appstream-vault.repo
  local staged_repo
  local status=0

  # An empty VERSION_ID would silently produce a baseurl that cannot resolve.
  if [[ -z "${VERSION_ID:-}" ]]; then
    echo "setup_rocky_vault_repo: VERSION_ID is not set" >&2
    return 1
  fi

  # Stage the file in a private temp file instead of a fixed name in the
  # current directory, which may be unwritable or already hold that name.
  staged_repo=$(mktemp) || return

  # \$releasever and \$basearch are escaped so yum expands them, not the shell.
  cat << EOF > "$staged_repo"
[appstream-vault]
name=Rocky Linux \$releasever - AppStream - Vault
baseurl=http://dl.rockylinux.org/vault/rocky/$VERSION_ID/AppStream/\$basearch/os/
gpgcheck=1
enabled=1
countme=1
metadata_expire=6h
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
EOF

  # One privileged step that copies the file and sets owner, group and mode.
  sudo install -o root -g root -m 0644 -- "$staged_repo" "$repo_path" || status=$?
  rm -f -- "$staged_repo"
  return "$status"
}

# Install the NVIDIA driver and CUDA from NVIDIA's .run installers.
# Steps visible below: remove any packaged CUDA/driver install, add the Rocky
# vault repo when ID is rocky and MAJOR_VERSION_ID is 9, install the
# kernel-devel package matching the running kernel plus build tools, download
# and run the installer(s), then call verify_driver.
# NOTE(review): this text comes from a diff view without +/- markers, so
# superseded and replacement lines appear side by side (two CUDA_VERSION and
# CUDA_BUNDLED_DRIVER_VERSION assignments, two cuda_*.run invocations with and
# without --toolkit) - confirm which set is current against the real file.
install_cuda_from_runfile() {
# Ref: https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html#runfile
# This method requires the matching kernel-devel package to be installed, and
# the package may be absent from the repo and cause this method to fail
# Remove existing installation before using the runfile
remove_cuda_package
remove_driver_package
if [ ${ID} == rocky ] && [ "$MAJOR_VERSION_ID" = 9 ]; then
setup_rocky_vault_repo
fi
sudo yum install -y kernel-devel-$(uname -r) pciutils gcc make wget yum-utils

# Installing latest version of NVIDIA CUDA and driver
# Data Center/Tesla drivers and CUDA are released on different schedules;
# normally we install the matching versions of driver and CUDA
# ($DRIVER_VERSION == $CUDA_BUNDLED_DRIVER_VERSION); due to https://github.com/NVIDIA/open-gpu-kernel-modules/issues/550
# we install a newer version of the driver
local DRIVER_VERSION=535.129.03
local CUDA_VERSION=12.2.2
local CUDA_BUNDLED_DRIVER_VERSION=535.104.05
echo "Installing NVIDIA Data Center driver $DRIVER_VERSION"
curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
# Install the CUDA toolkit only, so that the CUDA toolkit uses the Data Center driver installed in the previous step
# See https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/ for CUDA and driver compatibility
local CUDA_VERSION=12.4.1
local CUDA_BUNDLED_DRIVER_VERSION=550.54.15
echo "Installing CUDA Toolkit $CUDA_VERSION from CUDA installer with bundled driver $CUDA_BUNDLED_DRIVER_VERSION"
# The installer file name embeds both the CUDA version and its bundled driver version.
curl -fSsl -O https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION/local_installers/cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run
sudo sh cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run --toolkit --silent
sudo sh cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run --silent
verify_driver
}

# Add NVIDIA's CUDA yum repository for this RHEL-compatible release.
# Installs yum-utils (presumably for yum-config-manager), registers the
# cuda-rhel<N>.repo file from developer.download.nvidia.com, then runs
# `yum clean all`.
# NOTE(review): this text comes from a diff view without +/- markers; the two
# --add-repo lines look like the superseded ($VERSION_ID) and replacement
# ($MAJOR_VERSION_ID) versions of one line - confirm against the real file.
setup_repo() {
sudo yum install -y yum-utils
sudo yum-config-manager \
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$VERSION_ID/x86_64/cuda-rhel$VERSION_ID.repo
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$MAJOR_VERSION_ID/x86_64/cuda-rhel$MAJOR_VERSION_ID.repo
sudo yum clean all
}

# Install the NVIDIA driver and CUDA through yum.
# Calls setup_repo and install_driver_package, installs the cuda package,
# then calls verify_driver.
# NOTE(review): this text comes from a diff view without +/- markers; the
# kernel-specific `case` (which pins cuda-12-3 for one kernel) and the
# unconditional `sudo yum -y install cuda` after it look like the superseded
# and replacement forms of the same step - confirm against the real file.
install_cuda_from_package_manager() {
setup_repo
install_driver_package
#TODO: b/332690428 - Remove this temporary fix for Rocky Linux 9 rocky-linux-9-v20240313 with kernel 5.14.0-362.18.1.el9_3.0.1.x86_64
case $(uname -r) in
5.14.0-362.18.1.el9_3.0.1.x86_64)
sudo yum -y install cuda-12-3
;;
*)
sudo yum -y install cuda
;;
esac
sudo yum -y install cuda
verify_driver
}

Expand All @@ -71,7 +78,7 @@ remove_cuda_package() {
try_install() {
# Export all functions for the bash subprocess
eval "$(declare -F | sed 's/ -f / -fx /')"
export VERSION_ID
export MAJOR_VERSION_ID VERSION_ID ID
for install_method in "$@"; do
echo "Installing NVIDIA driver and CUDA with $install_method..."
# Can't use a subshell because of https://lists.gnu.org/archive/html/bug-bash/2012-12/msg00094.html
Expand Down Expand Up @@ -101,17 +108,12 @@ handle_rhel7() {

handle_common() {
install_driver_package() {
#TODO: b/332690428 - Remove this temporary fix for Rocky Linux 9 rocky-linux-9-v20240313 with kernel 5.14.0-362.18.1.el9_3.0.1.x86_64
case $(uname -r) in
5.14.0-362.18.1.el9_3.0.1.x86_64)
sudo ln -s /lib/modules/5.14.0-362.18.1.el9_3.0.1.x86_64 /lib/modules/5.14.0-362.18.1.el9_3.x86_64
sudo yum -y module install nvidia-driver:545
;;
*)
# Ref: https://developer.nvidia.com/cuda-12-2-2-download-archive?target_os=Linux&target_arch=x86_64&Distribution=RHEL&target_version=8&target_type=rpm_network
sudo yum -y module install nvidia-driver:latest
;;
esac
#TODO: b/332690428 - Remove this temporary fix for Rocky Linux 9 rocky-linux-9-v20240313 with kernel 5.14.0-362.24.1.el9_3.0.1.x86_64
if [ $(uname -r) == 5.14.0-362.24.1.el9_3.0.1.x86_64 ]; then
sudo ln -s /lib/modules/5.14.0-362.24.1.el9_3.0.1.x86_64 /lib/modules/5.14.0-362.24.1.el9_3.x86_64
fi
# Ref: https://developer.nvidia.com/cuda-12-2-2-download-archive?target_os=Linux&target_arch=x86_64&Distribution=RHEL&target_version=8&target_type=rpm_network
sudo yum -y module install nvidia-driver:latest
}

remove_driver_package() {
Expand All @@ -121,7 +123,7 @@ handle_common() {

}

case "$VERSION_ID" in
case "$MAJOR_VERSION_ID" in
7) handle_rhel7;;
*) handle_common;;
esac
Expand Down

0 comments on commit 4abb0a6

Please sign in to comment.