From c777549586157fb0fbf978e7efd089e2bdf6af10 Mon Sep 17 00:00:00 2001 From: Wyatt Hicken Date: Fri, 1 Nov 2024 13:27:51 -0600 Subject: [PATCH 1/2] Bug fix - Fix repo formatting errors for Nvidia AL2023 repo, install Nvidia ctk --- .../provisioners/install-nvidia-driver.sh | 42 ++++++++++++++----- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/templates/al2023/provisioners/install-nvidia-driver.sh b/templates/al2023/provisioners/install-nvidia-driver.sh index 9d9d8c8df..f36aa7d81 100755 --- a/templates/al2023/provisioners/install-nvidia-driver.sh +++ b/templates/al2023/provisioners/install-nvidia-driver.sh @@ -20,6 +20,26 @@ function is-isolated-partition() { return 0 } +function rpm_install() { + local RPMS=($@) + echo "pulling and installing rpms: (${RPMS[@]}) from s3 bucket: (${BINARY_BUCKET_NAME}) in region: (${BINARY_BUCKET_REGION})" + for RPM in ${RPMS[@]}; do + aws s3 cp --region ${BINARY_BUCKET_REGION} s3://${BINARY_BUCKET_NAME}/rpms/${RPM} ${WORKING_DIR}/${RPM} + sudo dnf localinstall -y ${WORKING_DIR}/${RPM} + done +} + +function install-nvidia-container-toolkit(){ + # The order of these RPMs is important, as they have dependencies on each other + RPMS=("libnvidia-container1-1.16.2-1.x86_64.rpm" "nvidia-container-toolkit-base-1.16.2-1.x86_64.rpm" "libnvidia-container-tools-1.16.2-1.x86_64.rpm" "nvidia-container-toolkit-1.16.2-1.x86_64.rpm") + for RPM in ${RPMS[@]}; do + echo "pulling and installing rpms: (${RPM}) from s3 bucket: (${BINARY_BUCKET_NAME}) in region: (${BINARY_BUCKET_REGION})" + aws s3 cp --region ${BINARY_BUCKET_REGION} s3://${BINARY_BUCKET_NAME}/rpms/${RPM} ${WORKING_DIR}/${RPM} + echo "installing rpm: ${WORKING_DIR}/${RPM}" + sudo rpm -ivh ${WORKING_DIR}/${RPM} + done +} + echo "Installing NVIDIA ${NVIDIA_DRIVER_MAJOR_VERSION} drivers..." ################################################################################ @@ -27,15 +47,10 @@ echo "Installing NVIDIA ${NVIDIA_DRIVER_MAJOR_VERSION} drivers..." ################################################################################ # Determine the domain based on the region if is-isolated-partition; then - echo '[amzn2023-nvidia] - name=Amazon Linux 2023 Nvidia repository - mirrorlist=https://al2023-repos-$awsregion-de612dc2.s3.$awsregion.$awsdomain/nvidia/mirrors/$releasever/$basearch/mirror.list - priority=20 - enabled=1 - repo_gpgcheck=0 - type=rpm - gpgcheck=0 - gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-amazon-linux-2023' | sudo tee /etc/yum.repos.d/amzn2023-nvidia.repo + aws s3 cp --region ${BINARY_BUCKET_REGION} s3://${BINARY_BUCKET_NAME}/amzn2023-nvidia.repo ${WORKING_DIR}/amzn2023-nvidia.repo + + sudo dnf config-manager --add-repo ${WORKING_DIR}/amzn2023-nvidia.repo + rpm_install "opencl-filesystem-1.0-5.el7.noarch.rpm" "ocl-icd-2.2.12-1.el7.x86_64.rpm" else if [[ $AWS_REGION == cn-* ]]; then @@ -102,7 +117,14 @@ sudo systemctl enable set-nvidia-clocks.service ################################################################################ ### Install other dependencies ################################################# ################################################################################ -sudo dnf -y install nvidia-fabric-manager nvidia-container-toolkit +sudo dnf -y install nvidia-fabric-manager + +# NVIDIA Container toolkit needs to be locally installed for isolated partitions +if is-isolated-partition; then + install-nvidia-container-toolkit +else + sudo dnf -y install nvidia-container-toolkit +fi sudo systemctl enable nvidia-fabricmanager sudo systemctl enable nvidia-persistenced From de06d9eba22fc06f77c9a885404e22be0dc529a5 Mon Sep 17 00:00:00 2001 From: Wyatt Hicken Date: Fri, 1 Nov 2024 15:32:12 -0600 Subject: [PATCH 2/2] Fix lint errors --- templates/al2023/provisioners/install-nvidia-driver.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/templates/al2023/provisioners/install-nvidia-driver.sh b/templates/al2023/provisioners/install-nvidia-driver.sh index f36aa7d81..a27c20103 100755 --- a/templates/al2023/provisioners/install-nvidia-driver.sh +++ b/templates/al2023/provisioners/install-nvidia-driver.sh @@ -22,17 +22,17 @@ function is-isolated-partition() { function rpm_install() { local RPMS=($@) - echo "pulling and installing rpms: (${RPMS[@]}) from s3 bucket: (${BINARY_BUCKET_NAME}) in region: (${BINARY_BUCKET_REGION})" - for RPM in ${RPMS[@]}; do + echo "Pulling and installing local rpms from s3 bucket" + for RPM in "${RPMS[@]}"; do aws s3 cp --region ${BINARY_BUCKET_REGION} s3://${BINARY_BUCKET_NAME}/rpms/${RPM} ${WORKING_DIR}/${RPM} sudo dnf localinstall -y ${WORKING_DIR}/${RPM} done } -function install-nvidia-container-toolkit(){ +function install-nvidia-container-toolkit() { # The order of these RPMs is important, as they have dependencies on each other RPMS=("libnvidia-container1-1.16.2-1.x86_64.rpm" "nvidia-container-toolkit-base-1.16.2-1.x86_64.rpm" "libnvidia-container-tools-1.16.2-1.x86_64.rpm" "nvidia-container-toolkit-1.16.2-1.x86_64.rpm") - for RPM in ${RPMS[@]}; do + for RPM in "${RPMS[@]}"; do echo "pulling and installing rpms: (${RPM}) from s3 bucket: (${BINARY_BUCKET_NAME}) in region: (${BINARY_BUCKET_REGION})" aws s3 cp --region ${BINARY_BUCKET_REGION} s3://${BINARY_BUCKET_NAME}/rpms/${RPM} ${WORKING_DIR}/${RPM} echo "installing rpm: ${WORKING_DIR}/${RPM}"