diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20+.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20.rb
similarity index 95%
rename from cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20+.rb
rename to cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20.rb
index 1043206fc..9e71464f9 100644
--- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20+.rb
+++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20.rb
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and limitations under the License.
 
 provides :nvidia_driver, platform: 'ubuntu' do |node|
-  node['platform_version'].to_i >= 20
+  node['platform_version'].to_i == 20
 end
 
 use 'partial/_nvidia_driver_common.rb'
diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu22.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu22.rb
new file mode 100644
index 000000000..3a410b120
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu22.rb
@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+
+# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+provides :nvidia_driver, platform: 'ubuntu' do |node|
+  node['platform_version'].to_i == 22
+end
+
+use 'partial/_nvidia_driver_common.rb'
+
+def rebuild_initramfs?
+  true
+end
+
+def compiler_path
+  gcc_major_version = get_gcc_major_version_used_by_kernel
+
+  # If the gcc version used to compile the kernel cannot be detected,
+  # empty string is returned, meaning that the NVIDIA driver will be compiled
+  # using the system default compiler.
+  return "" if gcc_major_version.nil?
+
+  "CC=/usr/bin/gcc-#{gcc_major_version}"
+end
diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb
index 947e68f49..5cfa7ea90 100644
--- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb
+++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb
@@ -73,14 +73,13 @@
   end
 
   # Install driver
-  # TODO remove --no-cc-version-check when we can update ubuntu 22 images
   bash 'nvidia.run advanced' do
     user 'root'
     group 'root'
     cwd '/tmp'
     code <<-NVIDIA
     set -e
-    #{compiler_path} ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=#{nvidia_kernel_module}
+    #{compiler_path} ./nvidia.run --silent --dkms --disable-nouveau -m=#{nvidia_kernel_module}
     rm -f /tmp/nvidia.run
     NVIDIA
     creates '/usr/bin/nvidia-smi'
diff --git a/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb b/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb
index ce3a27532..34c6d3c8d 100644
--- a/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb
+++ b/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb
@@ -106,3 +106,21 @@ def wait_sync_file(path)
     timeout 5
   end
 end
+
+def get_gcc_major_version_used_by_kernel
+  # Detects the gcc major version used to compile the kernel, e.g. 12.
+  # The version is read from the 8th whitespace-separated field of /proc/version.
+  # If the version cannot be detected, nil is returned.
+  begin
+    gcc_major_string = shell_out!("awk '{print $8}' /proc/version | tr -d ',' | cut -d '.' -f 1").stdout.strip
+    # Integer() raises on empty or non-numeric output, whereas String#to_i would silently
+    # return 0 and callers would build a bogus compiler path such as /usr/bin/gcc-0.
+    gcc_major_version = Integer(gcc_major_string, 10)
+    raise ArgumentError, "unexpected gcc major version: #{gcc_major_string}" unless gcc_major_version.positive?
+  rescue => error
+    Chef::Log.error("Cannot detect gcc version used to compile the kernel: #{error}")
+    return nil
+  end
+  Chef::Log.info("Detected version of gcc used to compile the kernel is: #{gcc_major_version}")
+  gcc_major_version
+end