From 9bd475386ae5b3ce10fcb4054cd9d20e6fed0bd4 Mon Sep 17 00:00:00 2001 From: Hanwen Date: Fri, 6 Dec 2024 07:31:17 -0800 Subject: [PATCH] Upgrade Nvidia driver to 550.127.08 Signed-off-by: Hanwen --- CHANGELOG.md | 1 + cookbooks/aws-parallelcluster-platform/attributes/platform.rb | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ca74953e1..6096716fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Libfabric-aws: `libfabric-aws-1.22.0-1` - Rdma-core: `rdma-core-54.0-1` - Open MPI: `openmpi40-aws-4.1.7-1` and `openmpi50-aws-5.0.5` +- Upgrade NVIDIA driver to version 550.127.08 (from 550.90.07). This addresses [a known issue from NVIDIA](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-550-90-07/index.html#known-issues). - Auto-restart slurmctld on failure. **BUG FIXES** diff --git a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb index d146a442c..14639a219 100644 --- a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb +++ b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb @@ -16,7 +16,7 @@ # NVidia default['cluster']['nvidia']['enabled'] = 'no' -default['cluster']['nvidia']['driver_version'] = '550.90.07' +default['cluster']['nvidia']['driver_version'] = '550.127.08' default['cluster']['nvidia']['dcgm_version'] = '3.3.6' # DCV