diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3623a2afc..5ac5cb7bf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -25,6 +25,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
   - Open MPI: `openmpi40-aws-4.1.7-1` and `openmpi50-aws-5.0.5`
 - Auto-restart slurmctld on failure.
 - Upgrade mysql-community-client to version 8.0.39.
+- On Ubuntu 22.04, install the NVIDIA driver with the same compiler version used to compile the kernel.
 
 **BUG FIXES**
 - Fix retrieval of regions when managing volumes to correctly handle local zones.
diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20+.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20.rb
similarity index 95%
rename from cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20+.rb
rename to cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20.rb
index 1043206fc..9e71464f9 100644
--- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20+.rb
+++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20.rb
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and limitations under the License.
 
 provides :nvidia_driver, platform: 'ubuntu' do |node|
-  node['platform_version'].to_i >= 20
+  node['platform_version'].to_i == 20
 end
 
 use 'partial/_nvidia_driver_common.rb'
diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu22.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu22.rb
new file mode 100644
index 000000000..b075c75f3
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu22.rb
@@ -0,0 +1,46 @@
+# frozen_string_literal: true
+
+# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+provides :nvidia_driver, platform: 'ubuntu' do |node|
+  node['platform_version'].to_i == 22
+end
+
+use 'partial/_nvidia_driver_common.rb'
+
+def rebuild_initramfs?
+  true
+end
+
+def set_compiler?
+  true
+end
+
+def compiler_version
+  'gcc'
+end
+
+def extra_packages
+  %w()
+end
+
+def compiler_path
+  gcc_major_version = gcc_major_version_used_by_kernel
+
+  # If the gcc version used to compile the kernel cannot be detected (nil or empty string),
+  # an empty string is returned, meaning that the NVIDIA driver will be compiled
+  # using the system default compiler instead of a broken "CC=/usr/bin/gcc-" path.
+  return "" if gcc_major_version.to_s.empty?
+
+  "CC=/usr/bin/gcc-#{gcc_major_version}"
+end
diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb
index 741fddbcc..2c77cdda3 100644
--- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb
+++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb
@@ -169,11 +169,13 @@ def self.setup(chef_run, nvidia_driver_version: nil)
           cached(:nvidia_driver_version) { 'nvidia_driver_version' }
         end
         cached(:nvidia_driver_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_driver/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" }
+        cached(:kernel_compiler_version) { "KERNEL_COMPILER_VERSION" }
         cached(:chef_run) do
           stubs_for_resource('nvidia_driver') do |res|
             allow(res).to receive(:nvidia_driver_enabled?).and_return(true)
             allow(res).to receive(:nvidia_arch).and_return(nvidia_arch)
             allow(res).to receive(:nvidia_kernel_module).and_return(kernel_module)
+            allow(res).to receive(:gcc_major_version_used_by_kernel).and_return(kernel_compiler_version)
           end
 
           stub_command("lsinitramfs /boot/initrd.img-$(uname -r) | grep nouveau").and_return(true)
@@ -245,6 +247,33 @@ def self.setup(chef_run, nvidia_driver_version: nil)
               .with_code(%r{CC=/usr/bin/gcc10-gcc ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=#{kernel_module}})
               .with_code(%r{rm -f /tmp/nvidia.run})
           end
+        elsif platform == 'ubuntu' && version == '22.04'
+          it 'installs gcc' do
+            is_expected.to install_package('gcc').with_retries(10).with_retry_delay(5)
+          end
+
+          it 'creates dkms/nvidia.conf' do
+            compiler_path = "CC=/usr/bin/gcc-#{kernel_compiler_version}"
+            is_expected.to create_template('/etc/dkms/nvidia.conf').with(
+              source: 'nvidia/amazon/dkms/nvidia.conf.erb',
+              cookbook: 'aws-parallelcluster-platform',
+              owner: 'root',
+              group: 'root',
+              mode: '0644',
+              variables: { compiler_path: compiler_path }
+            )
+          end
+          it 'installs nvidia driver' do
+            compiler_path = "CC=/usr/bin/gcc-#{kernel_compiler_version}"
+            is_expected.to run_bash('nvidia.run advanced')
+              .with(
+                user: 'root',
+                group: 'root',
+                cwd: '/tmp',
+                creates: '/usr/bin/nvidia-smi'
+              )
+              .with_code(%r{#{compiler_path} ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=#{kernel_module}})
+          end
         else
           it "doesn't install gcc10" do
             is_expected.not_to install_package('gcc10')
diff --git a/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb b/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb
index ce3a27532..d00f0a32f 100644
--- a/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb
+++ b/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb
@@ -105,4 +105,4 @@ def wait_sync_file(path)
     retry_delay 10
     timeout 5
   end
-end
+end
\ No newline at end of file
diff --git a/cookbooks/aws-parallelcluster-shared/libraries/ubuntu/helpers.rb b/cookbooks/aws-parallelcluster-shared/libraries/ubuntu/helpers.rb
new file mode 100644
index 000000000..3a476f4c9
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-shared/libraries/ubuntu/helpers.rb
@@ -0,0 +1,13 @@
+def gcc_major_version_used_by_kernel
+  # Detects the gcc major version used to compile the kernel, e.g. "12", by looking for "gcc-<N>" in /proc/version.
+  # If the version cannot be detected (command failure or no "gcc-<N>" match), an empty string is returned.
+  # (Tested only on Ubuntu)
+  begin
+    gcc_major_version = Mixlib::ShellOut.new("cat /proc/version | grep -Eo 'gcc-[0-9]+' | cut -d '-' -f 2").run_command.stdout.strip
+  rescue => error
+    Chef::Log.error("Cannot detect gcc version used to compile the kernel: #{error}")
+    return ""
+  end
+  Chef::Log.info("Detected version of gcc used to compile the kernel is: #{gcc_major_version}")
+  gcc_major_version
+end
diff --git a/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb b/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb
index 849b19009..ff6af2da7 100644
--- a/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb
+++ b/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb
@@ -1,6 +1,9 @@
 require 'chefspec'
 require 'chefspec/berkshelf'
 
+# Chef::Mixin::ShellOut is required to mock shellout
+include Chef::Mixin::ShellOut
+
 RSpec.configure do |c|
   c.before(:each) do
     allow(File).to receive(:exist?).and_call_original
diff --git a/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/ubuntu/helpers_spec.rb b/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/ubuntu/helpers_spec.rb
new file mode 100644
index 000000000..e6b766451
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/ubuntu/helpers_spec.rb
@@ -0,0 +1,33 @@
+require_relative '../../../../libraries/ubuntu/helpers'
+require 'spec_helper'
+
+describe 'gcc_major_version_used_by_kernel' do
+  let(:cmd) { "cat /proc/version | grep -Eo 'gcc-[0-9]+' | cut -d '-' -f 2" }
+  let(:shellout) { double(run_command: nil) }
+  let(:shellout_execution) { double(error!: nil, stdout: '', stderr: '', exitstatus: 0, live_stream: '') }
+
+  context 'when gcc version can be detected' do
+    before do
+      allow(Mixlib::ShellOut).to receive(:new).with(cmd, any_args).and_return(shellout)
+      allow(shellout).to receive(:run_command).and_return(shellout_execution)
+      allow(shellout_execution).to receive(:stdout).and_return("1")
+    end
+
+    it 'returns the correct gcc major version' do
+      result = gcc_major_version_used_by_kernel
+      expect(result).to eq("1")
+    end
+  end
+
+  context 'when gcc version cannot be detected' do
+    before do
+      allow(Mixlib::ShellOut).to receive(:new).with(cmd, any_args).and_return(shellout)
+      allow(shellout).to receive(:run_command).and_raise(Mixlib::ShellOut::ShellCommandFailed)
+    end
+
+    it 'returns an empty string' do
+      result = gcc_major_version_used_by_kernel
+      expect(result).to eq("")
+    end
+  end
+end