From 48f8bc860784ec70e613949cb759b809a778c265 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Wed, 11 Dec 2024 13:40:58 +0100 Subject: [PATCH] [nvidia] On Ubuntu22, install the NVIDIA driver using the gcc version used to compile the kernel. This is required because the NVIDIA driver must be compiled with the same gcc version used to compile the kernel. If this is not the case, the NVIDIA driver installation would fail a compiler version check. On newer versions of Ubuntu 22.04 (kernel 6.8+), the kernel is compiled with gcc-12, whereas gcc-11 is installed as the default version by build-essential, making this change necessary. Signed-off-by: Giacomo Marciani --- ...ubuntu20+.rb => nvidia_driver_ubuntu20.rb} | 2 +- .../nvidia_driver/nvidia_driver_ubuntu22.rb | 34 +++++++++++++++++++ .../spec/unit/resources/nvidia_driver_spec.rb | 14 ++++++++ .../libraries/helpers.rb | 13 +++++++ .../spec/spec_helper.rb | 3 ++ .../spec/unit/libraries/helpers_spec.rb | 33 ++++++++++++++++++ 6 files changed, 98 insertions(+), 1 deletion(-) rename cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/{nvidia_driver_ubuntu20+.rb => nvidia_driver_ubuntu20.rb} (95%) create mode 100644 cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu22.rb create mode 100644 cookbooks/aws-parallelcluster-shared/spec/unit/libraries/helpers_spec.rb diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20+.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20.rb similarity index 95% rename from cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20+.rb rename to cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20.rb index 1043206fc..9e71464f9 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20.rb 
@@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and limitations under the License. provides :nvidia_driver, platform: 'ubuntu' do |node| - node['platform_version'].to_i >= 20 + node['platform_version'].to_i == 20 end use 'partial/_nvidia_driver_common.rb' diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu22.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu22.rb new file mode 100644 index 000000000..42533aee6 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu22.rb @@ -0,0 +1,34 @@ +# frozen_string_literal: true + +# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +provides :nvidia_driver, platform: 'ubuntu' do |node| + node['platform_version'].to_i == 22 +end + +use 'partial/_nvidia_driver_common.rb' + +def rebuild_initramfs? + true +end + +def compiler_path + gcc_major_version = gcc_major_version_used_by_kernel + + # If the gcc version used to compile the kernel cannot be detected (nil or empty), + # an empty string is returned, meaning that the NVIDIA driver will be compiled + # using the system default compiler. + return "" if gcc_major_version.to_s.empty? 
+ + "CC=/usr/bin/gcc-#{gcc_major_version}" +end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb index 741fddbcc..286385b30 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb @@ -169,11 +169,13 @@ def self.setup(chef_run, nvidia_driver_version: nil) cached(:nvidia_driver_version) { 'nvidia_driver_version' } end cached(:nvidia_driver_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_driver/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" } + cached(:kernel_compiler_version) { "KERNEL_COMPILER_VERSION" } cached(:chef_run) do stubs_for_resource('nvidia_driver') do |res| allow(res).to receive(:nvidia_driver_enabled?).and_return(true) allow(res).to receive(:nvidia_arch).and_return(nvidia_arch) allow(res).to receive(:nvidia_kernel_module).and_return(kernel_module) + allow(res).to receive(:gcc_major_version_used_by_kernel).and_return(kernel_compiler_version) end stub_command("lsinitramfs /boot/initrd.img-$(uname -r) | grep nouveau").and_return(true) @@ -245,6 +247,18 @@ def self.setup(chef_run, nvidia_driver_version: nil) .with_code(%r{CC=/usr/bin/gcc10-gcc ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=#{kernel_module}}) .with_code(%r{rm -f /tmp/nvidia.run}) end + elsif platform == 'ubuntu' && version == '22.04' + it 'installs nvidia driver' do + compiler_path = "CC=/usr/bin/gcc-#{kernel_compiler_version}" + is_expected.to run_bash('nvidia.run advanced') + .with( + user: 'root', + group: 'root', + cwd: '/tmp', + creates: '/usr/bin/nvidia-smi' + ) + .with_code(%r{#{compiler_path} ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=#{kernel_module}}) + end else it "doesn't install gcc10" do is_expected.not_to install_package('gcc10') diff --git 
a/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb b/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb index ce3a27532..0661facf2 100644 --- a/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb +++ b/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb @@ -106,3 +106,16 @@ def wait_sync_file(path) timeout 5 end end + +def gcc_major_version_used_by_kernel + # Detects the gcc major version used to compile the kernel, e.g. 12, by parsing the 8th field of /proc/version. + # If the shell command raises an error, the error is logged and an empty string is returned. + begin + gcc_major_version = shell_out("awk '{print $8}' /proc/version | tr -d ',' | cut -d '.' -f 1").stdout.strip + rescue => error + Chef::Log.error("Cannot detect gcc version used to compile the kernel: #{error}") + return "" + end + Chef::Log.info("Detected version of gcc used to compile the kernel is: #{gcc_major_version}") + gcc_major_version +end diff --git a/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb b/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb index 849b19009..ff6af2da7 100644 --- a/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb +++ b/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb @@ -1,6 +1,9 @@ require 'chefspec' require 'chefspec/berkshelf' +# Chef::Mixin::ShellOut is required to mock shellout +include Chef::Mixin::ShellOut + RSpec.configure do |c| c.before(:each) do allow(File).to receive(:exist?).and_call_original diff --git a/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/helpers_spec.rb b/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/helpers_spec.rb new file mode 100644 index 000000000..58aaff461 --- /dev/null +++ b/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/helpers_spec.rb @@ -0,0 +1,33 @@ +require_relative '../../../libraries/helpers' +require 'spec_helper' + +describe 'gcc_major_version_used_by_kernel' do + + let(:cmd) { "awk '{print $8}' /proc/version | tr -d ',' | cut -d '.' 
-f 1" } + let(:shellout) { double(run_command: nil, error!: nil, stdout: '', stderr: '', exitstatus: 0, live_stream: '') } + + context 'when gcc version can be detected' do + + before do + allow(Mixlib::ShellOut).to receive(:new).with(cmd, any_args).and_return(shellout) + allow(shellout).to receive(:stdout).and_return("1") + end + + it 'returns the correct gcc major version' do + result = gcc_major_version_used_by_kernel + expect(result).to eq("1") + end + end + + context 'when gcc version cannot be detected' do + + before do + allow(Mixlib::ShellOut).to receive(:new).with(cmd, any_args).and_raise(Mixlib::ShellOut::ShellCommandFailed) + end + + it 'returns an empty string' do + result = gcc_major_version_used_by_kernel + expect(result).to eq("") + end + end +end