From 4b647e4a37e303759c0d99745ef2d9a8f808f240 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Fri, 24 Nov 2023 08:58:39 +0100 Subject: [PATCH 1/3] [Dependencies] Upgrade NVIDIA driver to version 535.129.03 and CUDA Toolkit to version 12.2.2. Signed-off-by: Giacomo Marciani --- CHANGELOG.md | 2 ++ cookbooks/aws-parallelcluster-platform/attributes/platform.rb | 2 +- .../aws-parallelcluster-platform/recipes/install/cuda.rb | 4 ++-- .../spec/unit/recipes/cuda_spec.rb | 4 ++-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 077c2c6278..e9bc1e7b5a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,8 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Do not wait for static nodes in maintenance to signal CFN that the head node initialization is complete. - Upgrade `aws-cfn-bootstrap` to version 2.0-28. - Upgrade Python to 3.9.17. +- Upgrade NVIDIA driver to version 535.129.03. +- Upgrade CUDA Toolkit to version 12.2.2. - Use OpenRM as NVIDIA kernel module for Linux instead of NVIDIA closed source module. - Upgrade EFA installer to `1.29.0`. 
- Efa-driver: `efa-2.6.0-1` diff --git a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb index d79a571aac..5d41c496ac 100644 --- a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb +++ b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb @@ -11,7 +11,7 @@ # NVidia default['cluster']['nvidia']['enabled'] = 'no' -default['cluster']['nvidia']['driver_version'] = '535.54.03' +default['cluster']['nvidia']['driver_version'] = '535.129.03' default['cluster']['nvidia']['dcgm_version'] = '3.2.6' # DCV diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb index a1d8ffa5ad..65c366b649 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb @@ -20,9 +20,9 @@ # Cuda installer from https://developer.nvidia.com/cuda-toolkit-archive # Cuda installer naming: cuda_11.8.0_520.61.05_linux cuda_version = '12.2' -cuda_patch = '0' +cuda_patch = '2' cuda_complete_version = "#{cuda_version}.#{cuda_patch}" -cuda_version_suffix = '535.54.03' +cuda_version_suffix = '535.104.05' cuda_arch = arm_instance? ? 
'linux_sbsa' : 'linux' cuda_url = "https://developer.download.nvidia.com/compute/cuda/#{cuda_complete_version}/local_installers/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" cuda_samples_version = '12.2' diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb index 7505e655de..9796698d4d 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb @@ -2,9 +2,9 @@ describe 'aws-parallelcluster-platform::cuda' do cached(:cuda_version) { '12.2' } - cached(:cuda_patch) { '0' } + cached(:cuda_patch) { '2' } cached(:cuda_complete_version) { "#{cuda_version}.#{cuda_patch}" } - cached(:cuda_version_suffix) { '535.54.03' } + cached(:cuda_version_suffix) { '535.104.05' } context 'when nvidia not enabled' do cached(:chef_run) do From 4031b80212d6527294a53fee16ba2e166b94356e Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Wed, 22 Nov 2023 19:37:10 +0100 Subject: [PATCH 2/3] [Docs] Fix entry in changelog for 3.8.0 about OpenRM Signed-off-by: Giacomo Marciani --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9bc1e7b5a..c073e710f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,7 +37,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Upgrade Python to 3.9.17. - Upgrade NVIDIA driver to version 535.129.03. - Upgrade CUDA Toolkit to version 12.2.2. -- Use OpenRM as NVIDIA kernel module for Linux instead of NVIDIA closed source module. +- Use Open Source NVIDIA GPU drivers (OpenRM) as NVIDIA kernel module for Linux instead of NVIDIA closed source module. - Upgrade EFA installer to `1.29.0`. 
- Efa-driver: `efa-2.6.0-1` - Efa-config: `efa-config-1.15-1` From 5de32ac7754c3ca227bac4b4f980ae6562a47dc3 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Thu, 23 Nov 2023 10:08:00 +0100 Subject: [PATCH 3/3] [Dependencies] Upgrade GDRCopy to version 2.4. Signed-off-by: Giacomo Marciani --- CHANGELOG.md | 1 + .../resources/gdrcopy/gdrcopy_amazon2.rb | 2 +- .../resources/gdrcopy/gdrcopy_ubuntu20+.rb | 2 +- .../resources/gdrcopy/partial/_gdrcopy_common.rb | 4 ++-- .../resources/gdrcopy/partial/_gdrcopy_common_rhel.rb | 6 +++--- .../spec/unit/resources/gdrcopy_spec.rb | 10 +++++----- 6 files changed, 13 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c073e710f8..dbf3d1c73d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Libfabric-aws: `libfabric-aws-1.19.0-1` - Rdma-core: `rdma-core-46.0-1` - Open MPI: `openmpi40-aws-4.1.6-1` +- Upgrade GDRCopy to version 2.4. **BUG FIXES** - Fix inconsistent scaling configuration after cluster update rollback when modifying the list of instance types declared in the Compute Resources. diff --git a/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/gdrcopy_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/gdrcopy_amazon2.rb index 2d1917be95..793d8bf0a1 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/gdrcopy_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/gdrcopy_amazon2.rb @@ -22,7 +22,7 @@ def gdrcopy_enabled? 
end def gdrcopy_platform - 'unknown_distro' + 'amzn-2' end def gdrcopy_arch diff --git a/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/gdrcopy_ubuntu20+.rb b/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/gdrcopy_ubuntu20+.rb index e6f1c06b28..16c45cf6a5 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/gdrcopy_ubuntu20+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/gdrcopy_ubuntu20+.rb @@ -31,7 +31,7 @@ def installation_code CUDA=/usr/local/cuda ./build-deb-packages.sh dpkg -i gdrdrv-dkms_#{gdrcopy_version_extended}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb dpkg -i libgdrapi_#{gdrcopy_version_extended}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb - dpkg -i gdrcopy-tests_#{gdrcopy_version_extended}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb + dpkg -i gdrcopy-tests_#{gdrcopy_version_extended}_#{gdrcopy_arch}.#{gdrcopy_platform}+cuda*.deb dpkg -i gdrcopy_#{gdrcopy_version_extended}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb COMMAND end diff --git a/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common.rb b/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common.rb index 2800eca43c..b0d787374d 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common.rb @@ -12,8 +12,8 @@ # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. # See the License for the specific language governing permissions and limitations under the License. 
-property :gdrcopy_version, String, default: '2.3' -property :gdrcopy_checksum, String, default: 'b85d15901889aa42de6c4a9233792af40dd94543e82abe0439e544c87fd79475' +property :gdrcopy_version, String, default: '2.4' +property :gdrcopy_checksum, String, default: '39e74d505ca16160567f109cc23478580d157da897f134989df1d563e55f7a5b' unified_mode true default_action :setup diff --git a/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common_rhel.rb b/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common_rhel.rb index f3d3d051a9..824df60cfc 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common_rhel.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common_rhel.rb @@ -23,8 +23,8 @@ def gdrcopy_build_dependencies def installation_code <<~COMMAND CUDA=/usr/local/cuda ./build-rpm-packages.sh - rpm -q gdrcopy-kmod-#{gdrcopy_version_extended}dkms || rpm -i gdrcopy-kmod-#{gdrcopy_version_extended}dkms.noarch#{gdrcopy_platform}.rpm - rpm -q gdrcopy-#{gdrcopy_version_extended}.#{gdrcopy_arch} || rpm -i gdrcopy-#{gdrcopy_version_extended}.#{gdrcopy_arch}#{gdrcopy_platform}.rpm - rpm -q gdrcopy-devel-#{gdrcopy_version_extended}.noarch || rpm -i gdrcopy-devel-#{gdrcopy_version_extended}.noarch#{gdrcopy_platform}.rpm + rpm -q gdrcopy-kmod-#{gdrcopy_version_extended}dkms || rpm -Uvh gdrcopy-kmod-#{gdrcopy_version_extended}dkms.#{gdrcopy_platform}.noarch.rpm + rpm -q gdrcopy-#{gdrcopy_version_extended}.#{gdrcopy_arch} || rpm -Uvh gdrcopy-#{gdrcopy_version_extended}.#{gdrcopy_platform}.#{gdrcopy_arch}.rpm + rpm -q gdrcopy-devel-#{gdrcopy_version_extended}.noarch || rpm -Uvh gdrcopy-devel-#{gdrcopy_version_extended}.#{gdrcopy_platform}.noarch.rpm COMMAND end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/gdrcopy_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/gdrcopy_spec.rb index c4088c7440..c7dadb750f 100644 --- 
a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/gdrcopy_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/gdrcopy_spec.rb @@ -156,7 +156,7 @@ def self.configure(chef_run) cached(:gdrcopy_arch) { 'gdrcopy_arch' } cached(:gdrcopy_platform) do platforms = { - 'amazon2' => 'unknown_distro', + 'amazon2' => 'amzn-2', 'centos7' => '.el8', 'rhel8' => '.el7', 'ubuntu20.04' => 'Ubuntu20_04', @@ -214,13 +214,13 @@ def self.configure(chef_run) expect(installation_code).to match(%r{CUDA=/usr/local/cuda ./build-deb-packages.sh}) expect(installation_code).to match(/dpkg -i gdrdrv-dkms_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}.deb/) expect(installation_code).to match(/dpkg -i libgdrapi_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}.deb/) - expect(installation_code).to match(/dpkg -i gdrcopy-tests_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}.deb/) + expect(installation_code).to match(/dpkg -i gdrcopy-tests_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}\+cuda\*.deb/) expect(installation_code).to match(/dpkg -i gdrcopy_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}.deb/) else expect(installation_code).to match(%r{CUDA=/usr/local/cuda ./build-rpm-packages.sh}) - expect(installation_code).to match(/rpm -q gdrcopy-kmod-#{gdrcopy_version}-1dkms || rpm -i gdrcopy-kmod-#{gdrcopy_version}-1dkms.noarch#{gdrcopy_platform}.rpm/) - expect(installation_code).to match(/rpm -q gdrcopy-#{gdrcopy_version}-1.#{gdrcopy_arch} || rpm -i gdrcopy-#{gdrcopy_version}-1.#{gdrcopy_arch}#{gdrcopy_platform}.rpm/) - expect(installation_code).to match(/rpm -q gdrcopy-devel-#{gdrcopy_version}-1.noarch || rpm -i gdrcopy-devel-#{gdrcopy_version}-1.noarch#{gdrcopy_platform}.rpm/) + expect(installation_code).to match(/rpm -q gdrcopy-kmod-#{gdrcopy_version}-1dkms || rpm -Uvh gdrcopy-kmod-#{gdrcopy_version}-1dkms.#{gdrcopy_platform}.noarch.rpm/) + expect(installation_code).to match(/rpm -q 
gdrcopy-#{gdrcopy_version}-1.#{gdrcopy_arch} || rpm -Uvh gdrcopy-#{gdrcopy_version}-1.#{gdrcopy_platform}.#{gdrcopy_arch}.rpm/) + expect(installation_code).to match(/rpm -q gdrcopy-devel-#{gdrcopy_version}-1.noarch || rpm -Uvh gdrcopy-devel-#{gdrcopy_version}-1.#{gdrcopy_platform}.noarch.rpm/) end end