From 135e9b525bb8c10094cc9645f5683f5ed328d6c9 Mon Sep 17 00:00:00 2001 From: Helena Greebe Date: Fri, 12 Jan 2024 11:52:37 -0500 Subject: [PATCH] Add a constant for supported NVIDIA OpenRM Architecture and the unsupported instance types --- CHANGELOG.md | 5 +++-- cli/src/pcluster/constants.py | 1 + cli/src/pcluster/validators/ec2_validators.py | 11 +++++++---- cli/tests/pcluster/validators/test_ec2_validators.py | 5 ++++- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5449d29f8f..a9012d27cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,8 @@ CHANGELOG - Add support for Python 3.11, 3.12 in pcluster CLI and aws-parallelcluster-batch-cli. - Upgrade Python to version 3.12 and NodeJS to version 18 in ParallelCluster Lambda Layer. - Build network interfaces using network card index from `NetworkCardIndex` list of EC2 DescribeInstances response, - instead of looping over `MaximumNetworkCards` range. + instead of looping over `MaximumNetworkCards` range. +- Fail cluster creation when using instance types p3, g3, p2 and g2 because their GPU architecture is not compatible with Open Source NVIDIA Drivers (OpenRM) introduced as part of 3.8.0 release. 3.8.0 ------ @@ -47,7 +48,7 @@ CHANGELOG - Upgrade NVIDIA driver to version 535.129.03. - Upgrade CUDA Toolkit to version 12.2.2. - Use Open Source NVIDIA GPU drivers (OpenRM) as NVIDIA kernel module for Linux instead of NVIDIA closed source module. - - This change removes support for p3, p2, g3, and g2 instances with gpu architecture not supported by OpenRM. + - This change removes support for p3, g3, p2 and g2 instances with GPU architecture not supported by OpenRM. The open source driver only works on platforms that have the GSP (GPU System Processor). - Remove support of `all_or_nothing_batch` configuration parameter in the Slurm resume program, in favor of the new `Scheduling/ScalingStrategy` cluster configuration. 
- Changed cluster alarms naming convention to '[cluster-name]-[component-name]-[metric]'. - Change default EBS volume types in ADC regions from `gp2` to `gp3`, for both the root and additional volumes. diff --git a/cli/src/pcluster/constants.py b/cli/src/pcluster/constants.py index 3105c2ac5b..21b7d55fd2 100644 --- a/cli/src/pcluster/constants.py +++ b/cli/src/pcluster/constants.py @@ -31,6 +31,7 @@ DELETION_POLICIES_WITH_SNAPSHOT = DELETION_POLICIES + ["Snapshot"] SUPPORTED_ARCHITECTURES = ["x86_64", "arm64"] SUPPORTED_OSES_FOR_ARCHITECTURE = {"x86_64": SUPPORTED_OSES, "arm64": SUPPORTED_OSES} +NVIDIA_OPENRM_UNSUPPORTED_INSTANCE_TYPES = ["p3", "p3dn", "p2", "g3", "g3s", "g2"] SLURM = "slurm" AWSBATCH = "awsbatch" diff --git a/cli/src/pcluster/validators/ec2_validators.py b/cli/src/pcluster/validators/ec2_validators.py index 867ed4bd08..d5eb6b8a9c 100644 --- a/cli/src/pcluster/validators/ec2_validators.py +++ b/cli/src/pcluster/validators/ec2_validators.py @@ -18,6 +18,7 @@ from pcluster.aws.aws_resources import CapacityReservationInfo from pcluster.aws.common import AWSClientError from pcluster.config.common import CapacityType +from pcluster.constants import NVIDIA_OPENRM_UNSUPPORTED_INSTANCE_TYPES from pcluster.utils import get_resource_name_from_resource_arn from pcluster.validators.common import FailureLevel, Validator @@ -146,15 +147,17 @@ def _validate(self, instance_type: str, image: str): ), FailureLevel.ERROR, ) - unsupported = ["p3", "p2", "g3", "g2"] + if ( image_info and "AWS ParallelCluster AMI" in image_info.description - and instance_type.split(".")[0] in unsupported + and instance_type.split(".")[0] in NVIDIA_OPENRM_UNSUPPORTED_INSTANCE_TYPES ): self._add_failure( - f"The instance type '{instance_type}' is not supported by OpenRM drivers. " - f"A custom AMI must be used.", + f"The instance type '{instance_type}' is not supported by NVIDIA OpenRM drivers. " + f"OpenRM can only be used on any Turing or later GPU architectures. 
" + f"Please consider using a different instance type or building a custom AMI " + f"with closed source NVIDIA drivers.", FailureLevel.ERROR, ) diff --git a/cli/tests/pcluster/validators/test_ec2_validators.py b/cli/tests/pcluster/validators/test_ec2_validators.py index cfd00eb23c..834d100dca 100644 --- a/cli/tests/pcluster/validators/test_ec2_validators.py +++ b/cli/tests/pcluster/validators/test_ec2_validators.py @@ -322,7 +322,10 @@ def test_instance_type_memory_info_validator(mocker, instance_type, instance_typ ( "p3.2xlarge", "ami-0185634c5a8a37250", - "The instance type 'p3.2xlarge' is not supported by OpenRM drivers. A custom AMI must be used.", + "The instance type 'p3.2xlarge' is not supported by NVIDIA OpenRM drivers. " + "Only Ampere, Turing, and Hopper architectures are supported. " + "Please consider using a different instance type or building a " + "custom AMI with closed source NVIDIA drivers.", { "ImageId": "ami-0185634c5a8a37250", "Architecture": "x86_64",