From f438831d174353dbefcaaa3f48858287633cba23 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Wed, 11 Dec 2024 13:40:58 +0100 Subject: [PATCH] [nvidia] On Ubuntu22, install the NVIDIA driver using the gcc version used to compile the kernel. This is required because the NVIDIA driver must be compiled with the same gcc version that was used to compile the kernel. If this is not the case, the NVIDIA driver installation fails a compiler version check. On newer versions of Ubuntu 22.04 (kernel 6.8+), the kernel is compiled with gcc-12, whereas gcc-11 is installed as the default version by build-essential, making this change necessary. Signed-off-by: Giacomo Marciani --- CHANGELOG.md | 1 + ...ubuntu20+.rb => nvidia_driver_ubuntu20.rb} | 2 +- .../nvidia_driver/nvidia_driver_ubuntu22.rb | 46 +++++++++++++++++++ .../spec/unit/resources/nvidia_driver_spec.rb | 29 ++++++++++++ .../libraries/ubuntu/helpers.rb | 26 +++++++++++ .../spec/spec_helper.rb | 3 ++ .../unit/libraries/ubuntu/helpers_spec.rb | 33 +++++++++++++ 7 files changed, 139 insertions(+), 1 deletion(-) rename cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/{nvidia_driver_ubuntu20+.rb => nvidia_driver_ubuntu20.rb} (95%) create mode 100644 cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu22.rb create mode 100644 cookbooks/aws-parallelcluster-shared/libraries/ubuntu/helpers.rb create mode 100644 cookbooks/aws-parallelcluster-shared/spec/unit/libraries/ubuntu/helpers_spec.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index 3623a2afc..5ac5cb7bf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Open MPI: `openmpi40-aws-4.1.7-1` and `openmpi50-aws-5.0.5` - Auto-restart slurmctld on failure. - Upgrade mysql-community-client to version 8.0.39. +- On Ubuntu 22.04, install the Nvidia driver with the same compiler version used to compile the kernel. 
**BUG FIXES** - Fix retrieval of regions when managing volumes to correctly handle local zones. diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20+.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20.rb similarity index 95% rename from cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20+.rb rename to cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20.rb index 1043206fc..9e71464f9 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20.rb @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and limitations under the License. provides :nvidia_driver, platform: 'ubuntu' do |node| - node['platform_version'].to_i >= 20 + node['platform_version'].to_i == 20 end use 'partial/_nvidia_driver_common.rb' diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu22.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu22.rb new file mode 100644 index 000000000..b075c75f3 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu22.rb @@ -0,0 +1,46 @@ +# frozen_string_literal: true + +# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +provides :nvidia_driver, platform: 'ubuntu' do |node| + node['platform_version'].to_i == 22 +end + +use 'partial/_nvidia_driver_common.rb' + +def rebuild_initramfs? + true +end + +def set_compiler? + true +end + +def compiler_version + 'gcc' +end + +def extra_packages + %w() +end + +def compiler_path + gcc_major_version = gcc_major_version_used_by_kernel + + # If the gcc version used to compile the kernel cannot be detected (nil or empty result), + # an empty string is returned, meaning that the NVIDIA driver will be compiled + # using the system default compiler. + return "" if gcc_major_version.to_s.empty? + + "CC=/usr/bin/gcc-#{gcc_major_version}" +end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb index 741fddbcc..2c77cdda3 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb @@ -169,11 +169,13 @@ def self.setup(chef_run, nvidia_driver_version: nil) cached(:nvidia_driver_version) { 'nvidia_driver_version' } end cached(:nvidia_driver_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_driver/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" } + cached(:kernel_compiler_version) { "KERNEL_COMPILER_VERSION" } cached(:chef_run) do stubs_for_resource('nvidia_driver') do |res| allow(res).to receive(:nvidia_driver_enabled?).and_return(true) allow(res).to receive(:nvidia_arch).and_return(nvidia_arch) allow(res).to receive(:nvidia_kernel_module).and_return(kernel_module) + allow(res).to receive(:gcc_major_version_used_by_kernel).and_return(kernel_compiler_version) end stub_command("lsinitramfs /boot/initrd.img-$(uname -r) | grep nouveau").and_return(true) @@ -245,6 +247,33 @@ def self.setup(chef_run, nvidia_driver_version: nil) .with_code(%r{CC=/usr/bin/gcc10-gcc ./nvidia.run --silent --dkms --disable-nouveau 
--no-cc-version-check -m=#{kernel_module}}) .with_code(%r{rm -f /tmp/nvidia.run}) end + elsif platform == 'ubuntu' && version == '22.04' + it 'installs gcc' do + is_expected.to install_package('gcc').with_retries(10).with_retry_delay(5) + end + + it 'creates dkms/nvidia.conf' do + compiler_path = "CC=/usr/bin/gcc-#{kernel_compiler_version}" + is_expected.to create_template('/etc/dkms/nvidia.conf').with( + source: 'nvidia/amazon/dkms/nvidia.conf.erb', + cookbook: 'aws-parallelcluster-platform', + owner: 'root', + group: 'root', + mode: '0644', + variables: { compiler_path: compiler_path } + ) + end + it 'installs nvidia driver' do + compiler_path = "CC=/usr/bin/gcc-#{kernel_compiler_version}" + is_expected.to run_bash('nvidia.run advanced') + .with( + user: 'root', + group: 'root', + cwd: '/tmp', + creates: '/usr/bin/nvidia-smi' + ) + .with_code(%r{#{compiler_path} ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=#{kernel_module}}) + end else it "doesn't install gcc10" do is_expected.not_to install_package('gcc10') diff --git a/cookbooks/aws-parallelcluster-shared/libraries/ubuntu/helpers.rb b/cookbooks/aws-parallelcluster-shared/libraries/ubuntu/helpers.rb new file mode 100644 index 000000000..17c5a150a --- /dev/null +++ b/cookbooks/aws-parallelcluster-shared/libraries/ubuntu/helpers.rb @@ -0,0 +1,26 @@ +# frozen_string_literal: true + +# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+ +def gcc_major_version_used_by_kernel + # Detects the gcc major version used to compile the kernel by parsing /proc/version; returned as a string, e.g. "12". + # If the version cannot be detected (no `gcc-<N>` token found, or the command fails), an empty string is returned. + # (Tested only on Ubuntu) + begin + gcc_major_version = shell_out("cat /proc/version | grep -Eo 'gcc-[0-9]+' | cut -d '-' -f 2").stdout.strip + rescue => error + Chef::Log.error("Cannot detect gcc version used to compile the kernel: #{error}") + return "" + end + Chef::Log.info("Detected version of gcc used to compile the kernel is: #{gcc_major_version}") + gcc_major_version +end diff --git a/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb b/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb index 849b19009..ff6af2da7 100644 --- a/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb +++ b/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb @@ -1,6 +1,9 @@ require 'chefspec' require 'chefspec/berkshelf' +# Chef::Mixin::ShellOut is required to mock shellout +include Chef::Mixin::ShellOut + RSpec.configure do |c| c.before(:each) do allow(File).to receive(:exist?).and_call_original diff --git a/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/ubuntu/helpers_spec.rb b/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/ubuntu/helpers_spec.rb new file mode 100644 index 000000000..e6b766451 --- /dev/null +++ b/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/ubuntu/helpers_spec.rb @@ -0,0 +1,33 @@ +require_relative '../../../../libraries/ubuntu/helpers' +require 'spec_helper' + +describe 'gcc_major_version_used_by_kernel' do + let(:cmd) { "cat /proc/version | grep -Eo 'gcc-[0-9]+' | cut -d '-' -f 2" } + let(:shellout) { double(run_command: nil) } + let(:shellout_execution) { double(error!: nil, stdout: '', stderr: '', exitstatus: 0, live_stream: '') } + + context 'when gcc version can be detected' do + before do + allow(Mixlib::ShellOut).to receive(:new).with(cmd, any_args).and_return(shellout) + allow(shellout).to 
receive(:run_command).and_return(shellout_execution) + allow(shellout_execution).to receive(:stdout).and_return("1") + end + + it 'returns the correct gcc major version' do + result = gcc_major_version_used_by_kernel + expect(result).to eq("1") + end + end + + context 'when gcc version cannot be detected' do + before do + allow(Mixlib::ShellOut).to receive(:new).with(cmd, any_args).and_return(shellout) + allow(shellout).to receive(:run_command).and_raise(Mixlib::ShellOut::ShellCommandFailed) + end + + it 'returns an empty string' do + result = gcc_major_version_used_by_kernel + expect(result).to eq("") + end + end +end