From 3f90d5c89418797013b1ba721be599a7971bcd41 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Mon, 18 Sep 2023 15:09:29 +0200 Subject: [PATCH 1/6] Reset failure for nodes assigned to instances Reset the failure for nodes that were launched successfully, i.e. for which it was possible to assign an instance. This covers the node-sharing (oversubscribe) case, where nodes that failed in one job call are actually launched (and assigned to instances) in a later iteration of the job loop. Signed-off-by: Luca Carrogu --- src/slurm_plugin/instance_manager.py | 8 ++++ tests/slurm_plugin/test_instance_manager.py | 47 +++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/src/slurm_plugin/instance_manager.py b/src/slurm_plugin/instance_manager.py index 660c990c4..580a7380b 100644 --- a/src/slurm_plugin/instance_manager.py +++ b/src/slurm_plugin/instance_manager.py @@ -907,6 +907,12 @@ def _add_instances_for_nodes( update_node_address=update_node_address, ) + def _reset_failed_nodes(self, nodeset): + """Remove nodeset from failed nodes dict.""" + if nodeset: + for error_code in self.failed_nodes: + self.failed_nodes[error_code] = self.failed_nodes.get(error_code, set()).difference(nodeset) + def best_effort_node_assignment( self, assign_node_batch_size, @@ -935,6 +941,7 @@ def best_effort_node_assignment( print_with_count(successful_launched_nodes), ) self._update_dict(self.nodes_assigned_to_instances, nodes_resume_mapping) + self._reset_failed_nodes(set(nodes_resume_list)) if len(successful_launched_nodes) < len(nodes_resume_list): # set limited capacity on the failed to launch nodes self._update_failed_nodes(set(failed_launch_nodes), "LimitedInstanceCapacity", override=False) @@ -968,6 +975,7 @@ def all_or_nothing_node_assignment( print_with_count(nodes_resume_list), ) self._update_dict(self.nodes_assigned_to_instances, nodes_resume_mapping) + self._reset_failed_nodes(set(nodes_resume_list)) except InstanceToNodeAssignmentError: # Failed to assign EC2 instances to nodes # EC2 Instances already assigned, are going to be terminated by diff --git a/tests/slurm_plugin/test_instance_manager.py b/tests/slurm_plugin/test_instance_manager.py index 88ef67800..9b78ca3d9 100644 --- a/tests/slurm_plugin/test_instance_manager.py +++ b/tests/slurm_plugin/test_instance_manager.py @@ -4269,6 +4269,53 @@ def test_scaling_for_jobs_multi_node( assert_that(instance_manager.unused_launched_instances).is_equal_to(expected_unused_launched_instances) + @pytest.mark.parametrize( + "nodeset, mock_failed_nodes, expected_failed_nodes", + [ + ( + {}, + {}, + {}, + ), + ( + {}, + { + "Exception": {"queue2-dy-c5xlarge-1", "queue1-st-c5xlarge-2", "queue2-st-c5xlarge-1"}, + "some_error_code": {"queue1-st-c52xlarge-1"}, + }, + { + "Exception": {"queue2-dy-c5xlarge-1", "queue1-st-c5xlarge-2", "queue2-st-c5xlarge-1"}, + "some_error_code": {"queue1-st-c52xlarge-1"}, + }, + ), + ( + {"queue1-st-c5xlarge-2"}, + { + "Exception": {"queue2-dy-c5xlarge-1", "queue1-st-c5xlarge-2", "queue2-st-c5xlarge-1"}, + "some_error_code": {"queue1-st-c52xlarge-1"}, + }, + { + "Exception": {"queue2-dy-c5xlarge-1", "queue2-st-c5xlarge-1"}, + "some_error_code": {"queue1-st-c52xlarge-1"}, + }, + ), + ( + {"queue2-dy-c5xlarge-1"}, + { + "Exception": {"queue2-dy-c5xlarge-1", "queue1-st-c5xlarge-2", "queue2-st-c5xlarge-1"}, + "some_error_code": {"queue2-dy-c5xlarge-1"}, + }, + { + "Exception": {"queue1-st-c5xlarge-2", "queue2-st-c5xlarge-1"}, + "some_error_code": set(), + }, + ), + ], + ) + def test_reset_failed_nodes(self, instance_manager, nodeset,
mock_failed_nodes, expected_failed_nodes): + instance_manager.failed_nodes = mock_failed_nodes + instance_manager._reset_failed_nodes(nodeset) + assert_that(instance_manager.failed_nodes).is_equal_to(expected_failed_nodes) class TestNodeListScalingInstanceManager: @pytest.fixture From af16907b2d35c74c52a463370013046bc2ec96a0 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Mon, 18 Sep 2023 15:14:39 +0200 Subject: [PATCH 2/6] Add job-level scaling for node sharing Add job-level scaling for the node sharing case. Before entering the job loop, perform the same optimizations done for the exclusive job case: * scale best-effort for all single node jobs * scale all for all multi node jobs Manual tests performed on running cluster given the following submission command: ``` sbatch --wrap "sleep 10000" -N 4 --constraint="[(c5.4xlarge)*3&(p4d.24xlarge)*1]" -p q4; sbatch --wrap "sleep 10000" -N 4 --constraint="[(c5.4xlarge)*3&(p4d.24xlarge)*1]" -p q4; sbatch --wrap "sleep 10000" -N 3 --constraint="[(c5.4xlarge)*3]" -p q4 ``` where there is capacity for c5.4xlarge but not for p4d.24xlarge the two scaling strategies were tested: all_or_nothing_batch = true expected nodes running at the end of the resume call: (x3) q4-dy-c4-1-* resume log: ``` 2023-09-19 10:56:32,549 - [slurm_plugin.resume:main] - INFO - ResumeProgram startup. 2023-09-19 10:56:32,550 - [slurm_plugin.resume:_get_config] - INFO - Reading /etc/parallelcluster/slurm_plugin/parallelcluster_slurm_resume.conf 2023-09-19 10:56:32,551 - [slurm_plugin.resume:main] - INFO - ResumeProgram config: SlurmResumeConfig(region='us-east-1', cluster_name='bootstrap', dynamodb_table='parallelcluster-slurm-bootstrap', hosted_zone='Z09815256PBUS3QRIMRV', dns_domain='bootstrap.pcluster.', use_private_hostname=False, head_node_private_ip='192.168.24.99', head_node_hostname='ip-192-168-24-99.ec2.internal', launch_max_batch_size=500, assign_node_max_batch_size=500, terminate_max_batch_size=1000, update_node_address=True, all_or_nothing_batch=True, job_level_scaling=True, temp_jls_for_node_sharing=False, fleet_config={'q1': {'c1': {'Api': 'create-fleet', 'CapacityType': 'on-demand', 'AllocationStrategy': 'lowest-price', 'Instances': [{'InstanceType': 'c5.xlarge'}], 'Networking': {'SubnetIds': ['subnet-0b48ed99988e56110']}}}, 'q2': {'c2': {'Api': 'create-fleet', 'CapacityType': 'on-demand', 'AllocationStrategy': 'lowest-price', 'Instances': [{'InstanceType': 'c5.2xlarge'}], 'Networking': {'SubnetIds': ['subnet-0b48ed99988e56110']}}}, 'q3': {'c3': {'Api': 'create-fleet', 'CapacityType': 'on-demand', 'AllocationStrategy': 'lowest-price', 'Instances': [{'InstanceType': 'c5.4xlarge'}], 'Networking': {'SubnetIds': ['subnet-0b48ed99988e56110']}}}, 'q4': {'c4-1': {'Api': 'create-fleet', 'CapacityType': 'on-demand', 'AllocationStrategy': 'lowest-price', 'Instances': [{'InstanceType': 'c5.4xlarge'}], 'Networking': {'SubnetIds': ['subnet-0b48ed99988e56110']}}, 'c4-2': {'Api': 'create-fleet', 'CapacityType': 'on-demand', 'AllocationStrategy': 'lowest-price', 'Instances': [{'InstanceType': 'p4d.24xlarge'}], 'Networking': {'SubnetIds': ['subnet-0b48ed99988e56110']}}}}, run_instances_overrides={}, create_fleet_overrides={}, clustermgtd_timeout=300, clustermgtd_heartbeat_file_path='/opt/slurm/etc/pcluster/.slurm_plugin/clustermgtd_heartbeat', _boto3_retry=1, _boto3_config={'retries': {'max_attempts': 1, 'mode': 'standard'}}, boto3_config=, 
logging_config='/opt/parallelcluster/pyenv/versions/3.9.16/envs/node_virtualenv/lib/python3.9/site-packages/slurm_plugin/logging/parallelcluster_resume_logging.conf', head_node_instance_id='i-0145afe796a5e375a') 2023-09-19 10:56:32,551 - [slurm_plugin.resume:_get_slurm_resume] - INFO - Slurm Resume File content: {'jobs': [{'extra': None, 'job_id': 252, 'features': '[(c5.4xlarge)*3&(p4d.24xlarge)*1]', 'nodes_alloc': 'q4-dy-c4-1-[1-3],q4-dy-c4-2-1', 'nodes_resume': 'q4-dy-c4-1-[1-3],q4-dy-c4-2-1', 'oversubscribe': 'OK', 'partition': 'q4', 'reservation': None}, {'extra': None, 'job_id': 253, 'features': '[(c5.4xlarge)*3&(p4d.24xlarge)*1]', 'nodes_alloc': 'q4-dy-c4-1-[1-3],q4-dy-c4-2-1', 'nodes_resume': 'q4-dy-c4-1-[1-3],q4-dy-c4-2-1', 'oversubscribe': 'OK', 'partition': 'q4', 'reservation': None}, {'extra': None, 'job_id': 254, 'features': '[(c5.4xlarge)*3]', 'nodes_alloc': 'q4-dy-c4-1-[1-3]', 'nodes_resume': 'q4-dy-c4-1-[1-3]', 'oversubscribe': 'OK', 'partition': 'q4', 'reservation': None}], 'all_nodes_resume': 'q4-dy-c4-1-[1-3],q4-dy-c4-2-1'} 2023-09-19 10:56:32,555 - [slurm_plugin.common:is_clustermgtd_heartbeat_valid] - INFO - Latest heartbeat from clustermgtd: 2023-09-19 10:56:00.366160+00:00 2023-09-19 10:56:32,556 - [slurm_plugin.resume:_resume] - INFO - Launching EC2 instances for the following Slurm nodes: q4-dy-c4-1-[1-3],q4-dy-c4-2-1 2023-09-19 10:56:32,609 - [slurm_plugin.resume:_resume] - INFO - Current state of Slurm nodes to resume: [('q4-dy-c4-1-1', 'MIXED+CLOUD+NOT_RESPONDING+POWERING_UP'), ('q4-dy-c4-1-2', 'MIXED+CLOUD+NOT_RESPONDING+POWERING_UP'), ('q4-dy-c4-1-3', 'MIXED+CLOUD+NOT_RESPONDING+POWERING_UP'), ('q4-dy-c4-2-1', 'MIXED+CLOUD+NOT_RESPONDING+POWERING_UP')] 2023-09-19 10:56:32,634 - [botocore.credentials:load] - INFO - Found credentials from IAM Role: bootstrap-RoleHeadNode-NKATKTSA4IIU 2023-09-19 10:56:32,675 - [slurm_plugin.instance_manager:_launch_instances] - INFO - Launching all-or-nothing instances for nodes (x3) ['q4-dy-c4-1-1', 'q4-dy-c4-1-2', 'q4-dy-c4-1-3'] 2023-09-19 10:56:32,676 - [slurm_plugin.fleet_manager:create_fleet] - INFO - Launching instances with create_fleet API. Parameters: {'LaunchTemplateConfigs': [{'LaunchTemplateSpecification': {'LaunchTemplateName': 'bootstrap-q4-c4-1', 'Version': '$Latest'}, 'Overrides': [{'InstanceType': 'c5.4xlarge', 'SubnetId': 'subnet-0b48ed99988e56110'}]}], 'TargetCapacitySpecification': {'TotalTargetCapacity': 3, 'DefaultTargetCapacityType': 'on-demand'}, 'Type': 'instant', 'OnDemandOptions': {'AllocationStrategy': 'lowest-price', 'SingleInstanceType': True, 'SingleAvailabilityZone': True, 'MinTargetCapacity': 3, 'CapacityReservationOptions': {'UsageStrategy': 'use-capacity-reservations-first'}}} 2023-09-19 10:56:35,637 - [slurm_plugin.fleet_manager:launch_ec2_instances] - INFO - Launched the following instances (x3) ['i-01fa6f17d69b9f86a', 'i-032040429aa3571b1', 'i-0553c576b546f1d1d'] 2023-09-19 10:56:35,638 - [slurm_plugin.instance_manager:_launch_instances] - INFO - Launching all-or-nothing instances for nodes (x1) ['q4-dy-c4-2-1'] 2023-09-19 10:56:35,638 - [slurm_plugin.fleet_manager:create_fleet] - INFO - Launching instances with create_fleet API. 
Parameters: {'LaunchTemplateConfigs': [{'LaunchTemplateSpecification': {'LaunchTemplateName': 'bootstrap-q4-c4-2', 'Version': '$Latest'}, 'Overrides': [{'InstanceType': 'p4d.24xlarge', 'SubnetId': 'subnet-0b48ed99988e56110'}]}], 'TargetCapacitySpecification': {'TotalTargetCapacity': 1, 'DefaultTargetCapacityType': 'on-demand'}, 'Type': 'instant', 'OnDemandOptions': {'AllocationStrategy': 'lowest-price', 'SingleInstanceType': True, 'SingleAvailabilityZone': True, 'MinTargetCapacity': 1, 'CapacityReservationOptions': {'UsageStrategy': 'use-capacity-reservations-first'}}} 2023-09-19 10:56:36,709 - [slurm_plugin.fleet_manager:_launch_instances] - ERROR - Error in CreateFleet request (19e5fdb0-c13d-4634-8c12-81678a5ddb1a): InsufficientInstanceCapacity - We currently do not have sufficient p4d.24xlarge capacity in the Availability Zone you requested (us-east-1d). Our system will be working on provisioning additional capacity. You can currently get p4d.24xlarge capacity by not specifying an Availability Zone in your request or choosing us-east-1a, us-east-1b. 2023-09-19 10:56:36,810 - [slurm_plugin.instance_manager:_scaling_for_jobs] - INFO - JobID 252 - The nodes_resume list from Slurm Resume File is (x4) ['q4-dy-c4-1-1', 'q4-dy-c4-1-2', 'q4-dy-c4-1-3', 'q4-dy-c4-2-1'] 2023-09-19 10:56:36,810 - [slurm_plugin.instance_manager:_resize_slurm_node_list] - INFO - JobID 252 - Booking already launched instances for nodes (x3) ['q4-dy-c4-1-1', 'q4-dy-c4-1-2', 'q4-dy-c4-1-3']: 2023-09-19 10:56:36,810 - [slurm_plugin.instance_manager:_launch_instances] - INFO - JobID 252 - Launching all-or-nothing instances for nodes (x1) ['q4-dy-c4-2-1'] 2023-09-19 10:56:36,810 - [slurm_plugin.fleet_manager:create_fleet] - INFO - JobID 252 - Launching instances with create_fleet API. Parameters: {'LaunchTemplateConfigs': [{'LaunchTemplateSpecification': {'LaunchTemplateName': 'bootstrap-q4-c4-2', 'Version': '$Latest'}, 'Overrides': [{'InstanceType': 'p4d.24xlarge', 'SubnetId': 'subnet-0b48ed99988e56110'}]}], 'TargetCapacitySpecification': {'TotalTargetCapacity': 1, 'DefaultTargetCapacityType': 'on-demand'}, 'Type': 'instant', 'OnDemandOptions': {'AllocationStrategy': 'lowest-price', 'SingleInstanceType': True, 'SingleAvailabilityZone': True, 'MinTargetCapacity': 1, 'CapacityReservationOptions': {'UsageStrategy': 'use-capacity-reservations-first'}}} 2023-09-19 10:56:37,857 - [slurm_plugin.fleet_manager:_launch_instances] - ERROR - JobID 252 - Error in CreateFleet request (542a63d6-8e0e-41eb-ad47-ebefe7e49450): InsufficientInstanceCapacity - We currently do not have sufficient p4d.24xlarge capacity in the Availability Zone you requested (us-east-1d). Our system will be working on provisioning additional capacity. You can currently get p4d.24xlarge capacity by not specifying an Availability Zone in your request or choosing us-east-1a, us-east-1b. 
2023-09-19 10:56:37,957 - [slurm_plugin.instance_manager:all_or_nothing_node_assignment] - INFO - JobID 252 - Releasing launched and booked instances (x3) ["('q4', 'c4-1', EC2Instance(id='i-01fa6f17d69b9f86a', private_ip='192.168.109.104', hostname='ip-192-168-109-104', launch_time=datetime.datetime(2023, 9, 19, 10, 56, 34, tzinfo=tzlocal()), slurm_node=None))", "('q4', 'c4-1', EC2Instance(id='i-032040429aa3571b1', private_ip='192.168.104.153', hostname='ip-192-168-104-153', launch_time=datetime.datetime(2023, 9, 19, 10, 56, 34, tzinfo=tzlocal()), slurm_node=None))", "('q4', 'c4-1', EC2Instance(id='i-0553c576b546f1d1d', private_ip='192.168.110.129', hostname='ip-192-168-110-129', launch_time=datetime.datetime(2023, 9, 19, 10, 56, 34, tzinfo=tzlocal()), slurm_node=None))"] 2023-09-19 10:56:37,957 - [slurm_plugin.instance_manager:_scaling_for_jobs] - INFO - JobID 253 - The nodes_resume list from Slurm Resume File is (x4) ['q4-dy-c4-1-1', 'q4-dy-c4-1-2', 'q4-dy-c4-1-3', 'q4-dy-c4-2-1'] 2023-09-19 10:56:37,958 - [slurm_plugin.instance_manager:_resize_slurm_node_list] - INFO - JobID 253 - Booking already launched instances for nodes (x3) ['q4-dy-c4-1-1', 'q4-dy-c4-1-2', 'q4-dy-c4-1-3']: 2023-09-19 10:56:37,958 - [slurm_plugin.instance_manager:_launch_instances] - INFO - JobID 253 - Launching all-or-nothing instances for nodes (x1) ['q4-dy-c4-2-1'] 2023-09-19 10:56:37,958 - [slurm_plugin.fleet_manager:create_fleet] - INFO - JobID 253 - Launching instances with create_fleet API. Parameters: {'LaunchTemplateConfigs': [{'LaunchTemplateSpecification': {'LaunchTemplateName': 'bootstrap-q4-c4-2', 'Version': '$Latest'}, 'Overrides': [{'InstanceType': 'p4d.24xlarge', 'SubnetId': 'subnet-0b48ed99988e56110'}]}], 'TargetCapacitySpecification': {'TotalTargetCapacity': 1, 'DefaultTargetCapacityType': 'on-demand'}, 'Type': 'instant', 'OnDemandOptions': {'AllocationStrategy': 'lowest-price', 'SingleInstanceType': True, 'SingleAvailabilityZone': True, 'MinTargetCapacity': 1, 'CapacityReservationOptions': {'UsageStrategy': 'use-capacity-reservations-first'}}} 2023-09-19 10:56:38,992 - [slurm_plugin.fleet_manager:_launch_instances] - ERROR - JobID 253 - Error in CreateFleet request (7654c2d2-e5fe-4d5f-a8bc-7404045f3618): InsufficientInstanceCapacity - We currently do not have sufficient p4d.24xlarge capacity in the Availability Zone you requested (us-east-1d). Our system will be working on provisioning additional capacity. You can currently get p4d.24xlarge capacity by not specifying an Availability Zone in your request or choosing us-east-1a, us-east-1b. 
2023-09-19 10:56:39,093 - [slurm_plugin.instance_manager:all_or_nothing_node_assignment] - INFO - JobID 253 - Releasing launched and booked instances (x3) ["('q4', 'c4-1', EC2Instance(id='i-01fa6f17d69b9f86a', private_ip='192.168.109.104', hostname='ip-192-168-109-104', launch_time=datetime.datetime(2023, 9, 19, 10, 56, 34, tzinfo=tzlocal()), slurm_node=None))", "('q4', 'c4-1', EC2Instance(id='i-032040429aa3571b1', private_ip='192.168.104.153', hostname='ip-192-168-104-153', launch_time=datetime.datetime(2023, 9, 19, 10, 56, 34, tzinfo=tzlocal()), slurm_node=None))", "('q4', 'c4-1', EC2Instance(id='i-0553c576b546f1d1d', private_ip='192.168.110.129', hostname='ip-192-168-110-129', launch_time=datetime.datetime(2023, 9, 19, 10, 56, 34, tzinfo=tzlocal()), slurm_node=None))"] 2023-09-19 10:56:39,093 - [slurm_plugin.instance_manager:_scaling_for_jobs] - INFO - JobID 254 - The nodes_resume list from Slurm Resume File is (x3) ['q4-dy-c4-1-1', 'q4-dy-c4-1-2', 'q4-dy-c4-1-3'] 2023-09-19 10:56:39,093 - [slurm_plugin.instance_manager:_resize_slurm_node_list] - INFO - JobID 254 - Booking already launched instances for nodes (x3) ['q4-dy-c4-1-1', 'q4-dy-c4-1-2', 'q4-dy-c4-1-3']: 2023-09-19 10:56:39,111 - [slurm_plugin.instance_manager:_update_slurm_node_addrs] - INFO - JobID 254 - Nodes are now configured with instances (x3) ["('q4-dy-c4-1-1', EC2Instance(id='i-01fa6f17d69b9f86a', private_ip='192.168.109.104', hostname='ip-192-168-109-104', launch_time=datetime.datetime(2023, 9, 19, 10, 56, 34, tzinfo=tzlocal()), slurm_node=None))", "('q4-dy-c4-1-2', EC2Instance(id='i-032040429aa3571b1', private_ip='192.168.104.153', hostname='ip-192-168-104-153', launch_time=datetime.datetime(2023, 9, 19, 10, 56, 34, tzinfo=tzlocal()), slurm_node=None))", "('q4-dy-c4-1-3', EC2Instance(id='i-0553c576b546f1d1d', private_ip='192.168.110.129', hostname='ip-192-168-110-129', launch_time=datetime.datetime(2023, 9, 19, 10, 56, 34, tzinfo=tzlocal()), slurm_node=None))"] 2023-09-19 10:56:39,111 - [slurm_plugin.instance_manager:_store_assigned_hostnames] - INFO - JobID 254 - Saving assigned hostnames in DynamoDB 2023-09-19 10:56:39,146 - [slurm_plugin.instance_manager:_store_assigned_hostnames] - INFO - JobID 254 - Database update: COMPLETED 2023-09-19 10:56:39,146 - [slurm_plugin.instance_manager:_update_dns_hostnames] - INFO - JobID 254 - Updating DNS records for Z09815256PBUS3QRIMRV - bootstrap.pcluster. 
2023-09-19 10:56:39,420 - [slurm_plugin.instance_manager:_update_dns_hostnames] - INFO - JobID 254 - DNS records update: COMPLETED 2023-09-19 10:56:39,421 - [slurm_plugin.instance_manager:all_or_nothing_node_assignment] - INFO - JobID 254 - Successful launched all instances for nodes (x3) ['q4-dy-c4-1-1', 'q4-dy-c4-1-2', 'q4-dy-c4-1-3'] 2023-09-19 10:56:39,422 - [slurm_plugin.resume:_resume] - INFO - Successfully launched nodes (x3) ['q4-dy-c4-1-1', 'q4-dy-c4-1-2', 'q4-dy-c4-1-3'] 2023-09-19 10:56:39,422 - [slurm_plugin.resume:_resume] - ERROR - Failed to launch following nodes, setting nodes to DOWN: (x1) ['q4-dy-c4-2-1'] 2023-09-19 10:56:39,422 - [slurm_plugin.resume:_handle_failed_nodes] - INFO - Setting following failed nodes into DOWN state (x1) ['q4-dy-c4-2-1'] with reason: (Code:InsufficientInstanceCapacity)Failure when resuming nodes 2023-09-19 10:56:39,439 - [slurm_plugin.resume:_handle_failed_nodes] - INFO - Setting following failed nodes into DOWN state (x0) [] with reason: (Code:LimitedInstanceCapacity)Failure when resuming nodes 2023-09-19 10:56:39,442 - [slurm_plugin.resume:main] - INFO - ResumeProgram finished. ``` all_or_nothing_batch = false expected nodes running at the end of the resume call: (x3) q4-dy-c4-1-* resume log: ``` 2023-09-19 12:30:03,047 - [slurm_plugin.resume:main] - INFO - ResumeProgram startup. 2023-09-19 12:30:03,048 - [slurm_plugin.resume:_get_config] - INFO - Reading /etc/parallelcluster/slurm_plugin/parallelcluster_slurm_resume.conf 2023-09-19 12:30:03,049 - [slurm_plugin.resume:main] - INFO - ResumeProgram config: SlurmResumeConfig(region='us-east-1', cluster_name='bootstrap', dynamodb_table='parallelcluster-slurm-bootstrap', hosted_zone='Z09815256PBUS3QRIMRV', dns_domain='bootstrap.pcluster.', use_private_hostname=False, head_node_private_ip='192.168.24.99', head_node_hostname='ip-192-168-24-99.ec2.internal', launch_max_batch_size=500, assign_node_max_batch_size=500, terminate_max_batch_size=1000, update_node_address=True, all_or_nothing_batch=False, job_level_scaling=True, temp_jls_for_node_sharing=False, fleet_config={'q1': {'c1': {'Api': 'create-fleet', 'CapacityType': 'on-demand', 'AllocationStrategy': 'lowest-price', 'Instances': [{'InstanceType': 'c5.xlarge'}], 'Networking': {'SubnetIds': ['subnet-0b48ed99988e56110']}}}, 'q2': {'c2': {'Api': 'create-fleet', 'CapacityType': 'on-demand', 'AllocationStrategy': 'lowest-price', 'Instances': [{'InstanceType': 'c5.2xlarge'}], 'Networking': {'SubnetIds': ['subnet-0b48ed99988e56110']}}}, 'q3': {'c3': {'Api': 'create-fleet', 'CapacityType': 'on-demand', 'AllocationStrategy': 'lowest-price', 'Instances': [{'InstanceType': 'c5.4xlarge'}], 'Networking': {'SubnetIds': ['subnet-0b48ed99988e56110']}}}, 'q4': {'c4-1': {'Api': 'create-fleet', 'CapacityType': 'on-demand', 'AllocationStrategy': 'lowest-price', 'Instances': [{'InstanceType': 'c5.4xlarge'}], 'Networking': {'SubnetIds': ['subnet-0b48ed99988e56110']}}, 'c4-2': {'Api': 'create-fleet', 'CapacityType': 'on-demand', 'AllocationStrategy': 'lowest-price', 'Instances': [{'InstanceType': 'p4d.24xlarge'}], 'Networking': {'SubnetIds': ['subnet-0b48ed99988e56110']}}}}, run_instances_overrides={}, create_fleet_overrides={}, clustermgtd_timeout=300, clustermgtd_heartbeat_file_path='/opt/slurm/etc/pcluster/.slurm_plugin/clustermgtd_heartbeat', _boto3_retry=1, _boto3_config={'retries': {'max_attempts': 1, 'mode': 'standard'}}, boto3_config=, 
logging_config='/opt/parallelcluster/pyenv/versions/3.9.16/envs/node_virtualenv/lib/python3.9/site-packages/slurm_plugin/logging/parallelcluster_resume_logging.conf', head_node_instance_id='i-0145afe796a5e375a') 2023-09-19 12:30:03,049 - [slurm_plugin.resume:_get_slurm_resume] - INFO - Slurm Resume File content: {'jobs': [{'extra': None, 'job_id': 260, 'features': '[(c5.4xlarge)*3&(p4d.24xlarge)*1]', 'nodes_alloc': 'q4-dy-c4-1-[1-3],q4-dy-c4-2-1', 'nodes_resume': 'q4-dy-c4-1-[1-3],q4-dy-c4-2-1', 'oversubscribe': 'OK', 'partition': 'q4', 'reservation': None}, {'extra': None, 'job_id': 261, 'features': '[(c5.4xlarge)*3&(p4d.24xlarge)*1]', 'nodes_alloc': 'q4-dy-c4-1-[1-3],q4-dy-c4-2-1', 'nodes_resume': 'q4-dy-c4-1-[1-3],q4-dy-c4-2-1', 'oversubscribe': 'OK', 'partition': 'q4', 'reservation': None}, {'extra': None, 'job_id': 262, 'features': '[(c5.4xlarge)*3]', 'nodes_alloc': 'q4-dy-c4-1-[1-3]', 'nodes_resume': 'q4-dy-c4-1-[1-3]', 'oversubscribe': 'OK', 'partition': 'q4', 'reservation': None}], 'all_nodes_resume': 'q4-dy-c4-1-[1-3],q4-dy-c4-2-1'} 2023-09-19 12:30:03,054 - [slurm_plugin.common:is_clustermgtd_heartbeat_valid] - INFO - Latest heartbeat from clustermgtd: 2023-09-19 12:29:03.613945+00:00 2023-09-19 12:30:03,054 - [slurm_plugin.resume:_resume] - INFO - Launching EC2 instances for the following Slurm nodes: q4-dy-c4-1-[1-3],q4-dy-c4-2-1 2023-09-19 12:30:03,109 - [slurm_plugin.resume:_resume] - INFO - Current state of Slurm nodes to resume: [('q4-dy-c4-1-1', 'MIXED+CLOUD+NOT_RESPONDING+POWERING_UP'), ('q4-dy-c4-1-2', 'MIXED+CLOUD+NOT_RESPONDING+POWERING_UP'), ('q4-dy-c4-1-3', 'MIXED+CLOUD+NOT_RESPONDING+POWERING_UP'), ('q4-dy-c4-2-1', 'MIXED+CLOUD+NOT_RESPONDING+POWERING_UP')] 2023-09-19 12:30:03,135 - [botocore.credentials:load] - INFO - Found credentials from IAM Role: bootstrap-RoleHeadNode-NKATKTSA4IIU 2023-09-19 12:30:03,176 - [slurm_plugin.instance_manager:_launch_instances] - INFO - Launching best-effort instances for nodes (x3) ['q4-dy-c4-1-1', 'q4-dy-c4-1-2', 'q4-dy-c4-1-3'] 2023-09-19 12:30:03,176 - [slurm_plugin.fleet_manager:create_fleet] - INFO - Launching instances with create_fleet API. Parameters: {'LaunchTemplateConfigs': [{'LaunchTemplateSpecification': {'LaunchTemplateName': 'bootstrap-q4-c4-1', 'Version': '$Latest'}, 'Overrides': [{'InstanceType': 'c5.4xlarge', 'SubnetId': 'subnet-0b48ed99988e56110'}]}], 'TargetCapacitySpecification': {'TotalTargetCapacity': 3, 'DefaultTargetCapacityType': 'on-demand'}, 'Type': 'instant', 'OnDemandOptions': {'AllocationStrategy': 'lowest-price', 'SingleInstanceType': True, 'SingleAvailabilityZone': True, 'MinTargetCapacity': 1, 'CapacityReservationOptions': {'UsageStrategy': 'use-capacity-reservations-first'}}} 2023-09-19 12:30:06,736 - [slurm_plugin.fleet_manager:launch_ec2_instances] - INFO - Launched the following instances (x3) ['i-083f7c31d25b7430a', 'i-061dc215a811fe1ed', 'i-0a4d69c19b6ad8322'] 2023-09-19 12:30:06,737 - [slurm_plugin.instance_manager:_launch_instances] - INFO - Launching best-effort instances for nodes (x1) ['q4-dy-c4-2-1'] 2023-09-19 12:30:06,737 - [slurm_plugin.fleet_manager:create_fleet] - INFO - Launching instances with create_fleet API. 
Parameters: {'LaunchTemplateConfigs': [{'LaunchTemplateSpecification': {'LaunchTemplateName': 'bootstrap-q4-c4-2', 'Version': '$Latest'}, 'Overrides': [{'InstanceType': 'p4d.24xlarge', 'SubnetId': 'subnet-0b48ed99988e56110'}]}], 'TargetCapacitySpecification': {'TotalTargetCapacity': 1, 'DefaultTargetCapacityType': 'on-demand'}, 'Type': 'instant', 'OnDemandOptions': {'AllocationStrategy': 'lowest-price', 'SingleInstanceType': True, 'SingleAvailabilityZone': True, 'MinTargetCapacity': 1, 'CapacityReservationOptions': {'UsageStrategy': 'use-capacity-reservations-first'}}} 2023-09-19 12:30:07,799 - [slurm_plugin.fleet_manager:_launch_instances] - ERROR - Error in CreateFleet request (b0c51c67-eed1-4b15-8872-4e390327aca7): InsufficientInstanceCapacity - We currently do not have sufficient p4d.24xlarge capacity in the Availability Zone you requested (us-east-1d). Our system will be working on provisioning additional capacity. You can currently get p4d.24xlarge capacity by not specifying an Availability Zone in your request or choosing us-east-1a, us-east-1b. 2023-09-19 12:30:07,900 - [slurm_plugin.instance_manager:_scaling_for_jobs] - INFO - JobID 260 - The nodes_resume list from Slurm Resume File is (x4) ['q4-dy-c4-1-1', 'q4-dy-c4-1-2', 'q4-dy-c4-1-3', 'q4-dy-c4-2-1'] 2023-09-19 12:30:07,900 - [slurm_plugin.instance_manager:_resize_slurm_node_list] - INFO - JobID 260 - Booking already launched instances for nodes (x3) ['q4-dy-c4-1-1', 'q4-dy-c4-1-2', 'q4-dy-c4-1-3']: 2023-09-19 12:30:07,900 - [slurm_plugin.instance_manager:_launch_instances] - INFO - JobID 260 - Launching best-effort instances for nodes (x1) ['q4-dy-c4-2-1'] 2023-09-19 12:30:07,901 - [slurm_plugin.fleet_manager:create_fleet] - INFO - JobID 260 - Launching instances with create_fleet API. Parameters: {'LaunchTemplateConfigs': [{'LaunchTemplateSpecification': {'LaunchTemplateName': 'bootstrap-q4-c4-2', 'Version': '$Latest'}, 'Overrides': [{'InstanceType': 'p4d.24xlarge', 'SubnetId': 'subnet-0b48ed99988e56110'}]}], 'TargetCapacitySpecification': {'TotalTargetCapacity': 1, 'DefaultTargetCapacityType': 'on-demand'}, 'Type': 'instant', 'OnDemandOptions': {'AllocationStrategy': 'lowest-price', 'SingleInstanceType': True, 'SingleAvailabilityZone': True, 'MinTargetCapacity': 1, 'CapacityReservationOptions': {'UsageStrategy': 'use-capacity-reservations-first'}}} 2023-09-19 12:30:08,949 - [slurm_plugin.fleet_manager:_launch_instances] - ERROR - JobID 260 - Error in CreateFleet request (09653663-7ccc-45ac-9366-3fdc0299e86b): InsufficientInstanceCapacity - We currently do not have sufficient p4d.24xlarge capacity in the Availability Zone you requested (us-east-1d). Our system will be working on provisioning additional capacity. You can currently get p4d.24xlarge capacity by not specifying an Availability Zone in your request or choosing us-east-1a, us-east-1b. 
2023-09-19 12:30:09,067 - [slurm_plugin.instance_manager:_update_slurm_node_addrs] - INFO - JobID 260 - Nodes are now configured with instances (x3) ["('q4-dy-c4-1-1', EC2Instance(id='i-083f7c31d25b7430a', private_ip='192.168.111.219', hostname='ip-192-168-111-219', launch_time=datetime.datetime(2023, 9, 19, 12, 30, 5, tzinfo=tzlocal()), slurm_node=None))", "('q4-dy-c4-1-2', EC2Instance(id='i-061dc215a811fe1ed', private_ip='192.168.104.231', hostname='ip-192-168-104-231', launch_time=datetime.datetime(2023, 9, 19, 12, 30, 5, tzinfo=tzlocal()), slurm_node=None))", "('q4-dy-c4-1-3', EC2Instance(id='i-0a4d69c19b6ad8322', private_ip='192.168.109.180', hostname='ip-192-168-109-180', launch_time=datetime.datetime(2023, 9, 19, 12, 30, 5, tzinfo=tzlocal()), slurm_node=None))"] 2023-09-19 12:30:09,067 - [slurm_plugin.instance_manager:_store_assigned_hostnames] - INFO - JobID 260 - Saving assigned hostnames in DynamoDB 2023-09-19 12:30:09,106 - [slurm_plugin.instance_manager:_store_assigned_hostnames] - INFO - JobID 260 - Database update: COMPLETED 2023-09-19 12:30:09,106 - [slurm_plugin.instance_manager:_update_dns_hostnames] - INFO - JobID 260 - Updating DNS records for Z09815256PBUS3QRIMRV - bootstrap.pcluster. 2023-09-19 12:30:09,331 - [slurm_plugin.instance_manager:_update_dns_hostnames] - INFO - JobID 260 - DNS records update: COMPLETED 2023-09-19 12:30:09,332 - [slurm_plugin.instance_manager:best_effort_node_assignment] - INFO - JobID 260 - Successful launched partial instances for nodes (x3) ['q4-dy-c4-1-1', 'q4-dy-c4-1-2', 'q4-dy-c4-1-3'] 2023-09-19 12:30:09,332 - [slurm_plugin.instance_manager:_scaling_for_jobs] - INFO - JobID 261 - The nodes_resume list from Slurm Resume File is (x4) ['q4-dy-c4-1-1', 'q4-dy-c4-1-2', 'q4-dy-c4-1-3', 'q4-dy-c4-2-1'] 2023-09-19 12:30:09,332 - [slurm_plugin.instance_manager:_parse_nodes_resume_list] - INFO - JobID 261 - Discarding NodeName already assigned to running instance: q4-dy-c4-1-1 2023-09-19 12:30:09,332 - [slurm_plugin.instance_manager:_parse_nodes_resume_list] - INFO - JobID 261 - Discarding NodeName already assigned to running instance: q4-dy-c4-1-2 2023-09-19 12:30:09,332 - [slurm_plugin.instance_manager:_parse_nodes_resume_list] - INFO - JobID 261 - Discarding NodeName already assigned to running instance: q4-dy-c4-1-3 2023-09-19 12:30:09,332 - [slurm_plugin.instance_manager:_parse_nodes_resume_list] - INFO - JobID 261 - Discarding NodeName already assigned to running instance: q4-dy-c4-2-1 2023-09-19 12:30:09,333 - [slurm_plugin.instance_manager:_scaling_for_jobs] - INFO - JobID 262 - The nodes_resume list from Slurm Resume File is (x3) ['q4-dy-c4-1-1', 'q4-dy-c4-1-2', 'q4-dy-c4-1-3'] 2023-09-19 12:30:09,333 - [slurm_plugin.instance_manager:_parse_nodes_resume_list] - INFO - JobID 262 - Discarding NodeName already assigned to running instance: q4-dy-c4-1-1 2023-09-19 12:30:09,333 - [slurm_plugin.instance_manager:_parse_nodes_resume_list] - INFO - JobID 262 - Discarding NodeName already assigned to running instance: q4-dy-c4-1-2 2023-09-19 12:30:09,333 - [slurm_plugin.instance_manager:_parse_nodes_resume_list] - INFO - JobID 262 - Discarding NodeName already assigned to running instance: q4-dy-c4-1-3 2023-09-19 12:30:09,333 - [slurm_plugin.resume:_resume] - INFO - Successfully launched nodes (x3) ['q4-dy-c4-1-1', 'q4-dy-c4-1-2', 'q4-dy-c4-1-3'] 2023-09-19 12:30:09,333 - [slurm_plugin.resume:_resume] - ERROR - Failed to launch following nodes, setting nodes to DOWN: (x1) ['q4-dy-c4-2-1'] 2023-09-19 12:30:09,333 - 
[slurm_plugin.resume:_handle_failed_nodes] - INFO - Setting following failed nodes into DOWN state (x1) ['q4-dy-c4-2-1'] with reason: (Code:InsufficientInstanceCapacity)Failure when resuming nodes 2023-09-19 12:30:09,369 - [slurm_plugin.resume:main] - INFO - ResumeProgram finished. ``` Signed-off-by: Luca Carrogu --- src/slurm_plugin/instance_manager.py | 31 ++--- tests/slurm_plugin/test_instance_manager.py | 121 ++++++++++++-------- 2 files changed, 84 insertions(+), 68 deletions(-) diff --git a/src/slurm_plugin/instance_manager.py b/src/slurm_plugin/instance_manager.py index 580a7380b..e8704c8c4 100644 --- a/src/slurm_plugin/instance_manager.py +++ b/src/slurm_plugin/instance_manager.py @@ -634,11 +634,12 @@ def _scaling_for_jobs_single_node( all_or_nothing_batch=all_or_nothing_batch, ) else: - # Batch all single node no oversubscribe jobs in a single best-effort EC2 launch request + # Batch all single node jobs in a single best-effort EC2 launch request # This to reduce scaling time and save launch API calls - single_nodes_no_oversubscribe = [job.nodes_resume[0] for job in job_list] + # Remove duplicated node entries (possible in oversubscribe case) + single_nodes = list(dict.fromkeys([job.nodes_resume[0] for job in job_list])) self._add_instances_for_nodes( - node_list=single_nodes_no_oversubscribe, + node_list=single_nodes, launch_batch_size=launch_batch_size, update_node_address=update_node_address, all_or_nothing_batch=False, @@ -660,7 +661,8 @@ def _add_instances_for_resume_file( self._clear_unused_launched_instances() self._scaling_for_jobs_single_node( - job_list=slurm_resume_data.jobs_single_node_no_oversubscribe, + job_list=slurm_resume_data.jobs_single_node_no_oversubscribe + + slurm_resume_data.jobs_single_node_oversubscribe, launch_batch_size=launch_batch_size, assign_node_batch_size=assign_node_batch_size, update_node_address=update_node_address, @@ -668,28 +670,15 @@ def _add_instances_for_resume_file( ) self._scaling_for_jobs_multi_node( - job_list=slurm_resume_data.jobs_multi_node_no_oversubscribe, - node_list=slurm_resume_data.multi_node_no_oversubscribe, + job_list=slurm_resume_data.jobs_multi_node_no_oversubscribe + + slurm_resume_data.jobs_multi_node_oversubscribe, + node_list=slurm_resume_data.multi_node_no_oversubscribe + slurm_resume_data.multi_node_oversubscribe, launch_batch_size=launch_batch_size, assign_node_batch_size=assign_node_batch_size, update_node_address=update_node_address, all_or_nothing_batch=all_or_nothing_batch, ) - if not self.temp_jls_for_node_sharing: - # node scaling for oversubscribe nodes - node_list = list( - dict.fromkeys(slurm_resume_data.single_node_oversubscribe + slurm_resume_data.multi_node_oversubscribe) - ) - if node_list: - self._add_instances_for_nodes( - node_list=node_list, - launch_batch_size=launch_batch_size, - assign_node_batch_size=assign_node_batch_size, - update_node_address=update_node_address, - all_or_nothing_batch=all_or_nothing_batch, - ) - def _scaling_for_jobs_multi_node( self, job_list, @@ -941,7 +930,7 @@ def best_effort_node_assignment( print_with_count(successful_launched_nodes), ) self._update_dict(self.nodes_assigned_to_instances, nodes_resume_mapping) - self._reset_failed_nodes(set(nodes_resume_list)) + self._reset_failed_nodes(set(successful_launched_nodes)) if len(successful_launched_nodes) < len(nodes_resume_list): # set limited capacity on the failed to launch nodes self._update_failed_nodes(set(failed_launch_nodes), "LimitedInstanceCapacity", override=False) diff --git 
a/tests/slurm_plugin/test_instance_manager.py b/tests/slurm_plugin/test_instance_manager.py index 9b78ca3d9..aacc6f68b 100644 --- a/tests/slurm_plugin/test_instance_manager.py +++ b/tests/slurm_plugin/test_instance_manager.py @@ -1498,7 +1498,9 @@ def test_add_instances( "assign_node_batch_size", "update_node_address", "all_or_nothing_batch", - "expected_nodes_oversubscribe", + "expected_jobs_multi_node_oversubscribe", + "expected_multi_node_oversubscribe", + "expected_jobs_single_node_oversubscribe", "expected_jobs_multi_node_no_oversubscribe", "expected_multi_node_no_oversubscribe", "expected_jobs_single_node_no_oversubscribe", @@ -1557,12 +1559,27 @@ def test_add_instances( 30, True, False, + [ + SlurmResumeJob( + job_id=140814, + nodes_alloc="queue1-st-c5xlarge-[1-4]", + nodes_resume="queue1-st-c5xlarge-[1-3]", + oversubscribe="YES", + ), + SlurmResumeJob( + job_id=140818, + nodes_alloc="queue1-st-c5xlarge-[1-3], queue4-st-c5xlarge-11", + nodes_resume="queue1-st-c5xlarge-[1-3], queue4-st-c5xlarge-11", + oversubscribe="OK", + ), + ], [ "queue1-st-c5xlarge-1", "queue1-st-c5xlarge-2", "queue1-st-c5xlarge-3", "queue4-st-c5xlarge-11", ], + [], [ SlurmResumeJob( job_id=140815, @@ -1608,6 +1625,14 @@ def test_add_instances( 25, False, False, + [ + SlurmResumeJob( + job_id=140814, + nodes_alloc="queue1-st-c5xlarge-[1-4]", + nodes_resume="queue1-st-c5xlarge-[1-3]", + oversubscribe="FORCE", + ), + ], [ "queue1-st-c5xlarge-1", "queue1-st-c5xlarge-2", @@ -1616,6 +1641,7 @@ def test_add_instances( [], [], [], + [], ), ( { @@ -1637,6 +1663,8 @@ def test_add_instances( [], [], [], + [], + [], [ SlurmResumeJob( job_id=140814, @@ -1672,7 +1700,6 @@ def test_add_instances( }, [ "queue1-st-c5xlarge-1", - "queue1-st-c5xlarge-2", "queue2-st-c5xlarge-1", "queue2-st-c5xlarge-2", "queue3-st-c5xlarge-1", @@ -1681,8 +1708,15 @@ def test_add_instances( 28, True, False, + [], + [], [ - "queue3-st-c5xlarge-1", + SlurmResumeJob( + job_id=140816, + nodes_alloc="queue3-st-c5xlarge-1", + nodes_resume="queue3-st-c5xlarge-1", + oversubscribe="YES", + ), ], [ SlurmResumeJob( @@ -1712,7 +1746,9 @@ def test_add_instances_for_resume_file( assign_node_batch_size, update_node_address, all_or_nothing_batch, - expected_nodes_oversubscribe, + expected_jobs_multi_node_oversubscribe, + expected_multi_node_oversubscribe, + expected_jobs_single_node_oversubscribe, expected_jobs_multi_node_no_oversubscribe, expected_multi_node_no_oversubscribe, expected_jobs_single_node_no_oversubscribe, @@ -1734,33 +1770,23 @@ def test_add_instances_for_resume_file( ) instance_manager._scaling_for_jobs_single_node.assert_any_call( - job_list=expected_jobs_single_node_no_oversubscribe, + job_list=expected_jobs_single_node_no_oversubscribe + expected_jobs_single_node_oversubscribe, launch_batch_size=launch_batch_size, assign_node_batch_size=assign_node_batch_size, update_node_address=update_node_address, all_or_nothing_batch=all_or_nothing_batch, ) instance_manager._scaling_for_jobs_multi_node.assert_any_call( - job_list=expected_jobs_multi_node_no_oversubscribe, - node_list=expected_multi_node_no_oversubscribe, + job_list=expected_jobs_multi_node_no_oversubscribe + expected_jobs_multi_node_oversubscribe, + node_list=expected_multi_node_no_oversubscribe + expected_multi_node_oversubscribe, launch_batch_size=launch_batch_size, assign_node_batch_size=assign_node_batch_size, update_node_address=update_node_address, all_or_nothing_batch=all_or_nothing_batch, ) - if expected_nodes_oversubscribe: - instance_manager._add_instances_for_nodes.assert_any_call( 
- node_list=expected_nodes_oversubscribe, - launch_batch_size=launch_batch_size, - assign_node_batch_size=assign_node_batch_size, - update_node_address=update_node_address, - all_or_nothing_batch=all_or_nothing_batch, - ) assert_that(instance_manager.unused_launched_instances).is_empty() assert_that(instance_manager._scaling_for_jobs_single_node.call_count).is_equal_to(1) assert_that(instance_manager._scaling_for_jobs_multi_node.call_count).is_equal_to(1) - if expected_nodes_oversubscribe: - assert_that(instance_manager._add_instances_for_nodes.call_count).is_equal_to(1) @pytest.mark.parametrize( "slurm_resume, node_list, expected_single_node_oversubscribe, expected_multi_node_oversubscribe, " @@ -4273,42 +4299,42 @@ def test_scaling_for_jobs_multi_node( "nodeset, mock_failed_nodes, expected_failed_nodes", [ ( - {}, - {}, - {}, + {}, + {}, + {}, ), ( - {}, - { - "Exception": {"queue2-dy-c5xlarge-1", "queue1-st-c5xlarge-2", "queue2-st-c5xlarge-1"}, - "some_error_code": {"queue1-st-c52xlarge-1"}, - }, - { - "Exception": {"queue2-dy-c5xlarge-1", "queue1-st-c5xlarge-2", "queue2-st-c5xlarge-1"}, - "some_error_code": {"queue1-st-c52xlarge-1"}, - }, + {}, + { + "Exception": {"queue2-dy-c5xlarge-1", "queue1-st-c5xlarge-2", "queue2-st-c5xlarge-1"}, + "some_error_code": {"queue1-st-c52xlarge-1"}, + }, + { + "Exception": {"queue2-dy-c5xlarge-1", "queue1-st-c5xlarge-2", "queue2-st-c5xlarge-1"}, + "some_error_code": {"queue1-st-c52xlarge-1"}, + }, ), ( - {"queue1-st-c5xlarge-2"}, - { - "Exception": {"queue2-dy-c5xlarge-1", "queue1-st-c5xlarge-2", "queue2-st-c5xlarge-1"}, - "some_error_code": {"queue1-st-c52xlarge-1"}, - }, - { - "Exception": {"queue2-dy-c5xlarge-1", "queue2-st-c5xlarge-1"}, - "some_error_code": {"queue1-st-c52xlarge-1"}, - }, + {"queue1-st-c5xlarge-2"}, + { + "Exception": {"queue2-dy-c5xlarge-1", "queue1-st-c5xlarge-2", "queue2-st-c5xlarge-1"}, + "some_error_code": {"queue1-st-c52xlarge-1"}, + }, + { + "Exception": {"queue2-dy-c5xlarge-1", "queue2-st-c5xlarge-1"}, + "some_error_code": {"queue1-st-c52xlarge-1"}, + }, ), ( - {"queue2-dy-c5xlarge-1"}, - { - "Exception": {"queue2-dy-c5xlarge-1", "queue1-st-c5xlarge-2", "queue2-st-c5xlarge-1"}, - "some_error_code": {"queue2-dy-c5xlarge-1"}, - }, - { - "Exception": {"queue1-st-c5xlarge-2", "queue2-st-c5xlarge-1"}, - "some_error_code": set(), - }, + {"queue2-dy-c5xlarge-1"}, + { + "Exception": {"queue2-dy-c5xlarge-1", "queue1-st-c5xlarge-2", "queue2-st-c5xlarge-1"}, + "some_error_code": {"queue2-dy-c5xlarge-1"}, + }, + { + "Exception": {"queue1-st-c5xlarge-2", "queue2-st-c5xlarge-1"}, + "some_error_code": set(), + }, ), ], ) @@ -4317,6 +4343,7 @@ def test_reset_failed_nodes(self, instance_manager, nodeset, mock_failed_nodes, instance_manager._reset_failed_nodes(nodeset) assert_that(instance_manager.failed_nodes).is_equal_to(expected_failed_nodes) + class TestNodeListScalingInstanceManager: @pytest.fixture def instance_manager(self, mocker): From 6b34fd425aaba62f8569fe4fa1cd0ea60d15e66f Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Tue, 19 Sep 2023 13:04:28 +0200 Subject: [PATCH 3/6] Avoid setting nodes into DOWN if no nodes are passed as input Avoid setting nodes into DOWN, and hence avoid calling Slurm scontrol update, if the node list is empty. The avoided log line is ``` 2023-09-19 10:56:39,439 - [slurm_plugin.resume:_handle_failed_nodes] - INFO - Setting following failed nodes into DOWN state (x0) [] with reason: (Code:LimitedInstanceCapacity)Failure when resuming nodes ``` Signed-off-by: Luca Carrogu --- src/slurm_plugin/resume.py | 25
+++++++-------- tests/slurm_plugin/test_resume.py | 51 ++++++++++++++++++++++++++++++- 2 files changed, 63 insertions(+), 13 deletions(-) diff --git a/src/slurm_plugin/resume.py b/src/slurm_plugin/resume.py index e340b6683..6c90225a6 100644 --- a/src/slurm_plugin/resume.py +++ b/src/slurm_plugin/resume.py @@ -157,18 +157,19 @@ def _handle_failed_nodes(node_list, reason="Failure when resuming nodes"): To save time, should explicitly set nodes to DOWN in ResumeProgram so clustermgtd can maintain failed nodes. Clustermgtd will be responsible for running full DOWN -> POWER_DOWN process. """ - try: - log.info( - "Setting following failed nodes into DOWN state %s with reason: %s", print_with_count(node_list), reason - ) - set_nodes_down(node_list, reason=reason) - except Exception as e: - log.error( - "Failed to place nodes %s into DOWN for reason %s with exception: %s", - print_with_count(node_list), - reason, - e, - ) + if node_list: + try: + log.info( + "Setting following failed nodes into DOWN state %s with reason: %s", print_with_count(node_list), reason + ) + set_nodes_down(node_list, reason=reason) + except Exception as e: + log.error( + "Failed to place nodes %s into DOWN for reason %s with exception: %s", + print_with_count(node_list), + reason, + e, + ) def _resume(arg_nodes, resume_config, slurm_resume): diff --git a/tests/slurm_plugin/test_resume.py b/tests/slurm_plugin/test_resume.py index 9a6d32d54..1c7c58cd0 100644 --- a/tests/slurm_plugin/test_resume.py +++ b/tests/slurm_plugin/test_resume.py @@ -21,7 +21,7 @@ import slurm_plugin from assertpy import assert_that from slurm_plugin.fleet_manager import EC2Instance -from slurm_plugin.resume import SlurmResumeConfig, _get_slurm_resume, _resume +from slurm_plugin.resume import SlurmResumeConfig, _get_slurm_resume, _handle_failed_nodes, _resume from tests.common import FLEET_CONFIG, LAUNCH_OVERRIDES, client_error @@ -916,3 +916,52 @@ def test_get_slurm_resume(config_file, expected_slurm_resume, test_datadir, capl assert_that(caplog.records).is_length(1) assert_that(caplog.records[0].levelname).is_equal_to("INFO") assert_that(caplog.records[0].message).contains("Slurm Resume File content") + + +@pytest.mark.parametrize( + "node_list, reason, expected_set_nodes_down_call, expected_exception", + [ + ([], "no_reason", None, None), + ( + ["queue1-dy-c5xlarge-2"], + "InsufficientInstanceCapacity", + [ + call( + ["queue1-dy-c5xlarge-2"], + reason="InsufficientInstanceCapacity", + ) + ], + None, + ), + ( + ["queue1-dy-c5xlarge-3"], + "InsufficientInstanceCapacity", + [ + call( + ["queue1-dy-c5xlarge-3"], + reason="InsufficientInstanceCapacity", + ) + ], + Exception(), + ), + ], +) +def test_handle_failed_nodes(mocker, caplog, node_list, reason, expected_set_nodes_down_call, expected_exception): + # patch internal functions + set_nodes_down = mocker.patch("slurm_plugin.resume.set_nodes_down", side_effect=expected_exception) + caplog.set_level(logging.INFO) + + _handle_failed_nodes(node_list, reason) + if not node_list: + set_nodes_down.assert_not_called() + else: + set_nodes_down.assert_has_calls(expected_set_nodes_down_call) + + if isinstance(expected_exception, Exception): + assert_that(caplog.records).is_length(2) + assert_that(caplog.records[1].levelname).is_equal_to("ERROR") + assert_that(caplog.records[1].message).contains("Failed to place nodes") + else: + assert_that(caplog.records).is_length(1) + assert_that(caplog.records[0].levelname).is_equal_to("INFO") + assert_that(caplog.records[0].message).contains("Setting following failed 
nodes into DOWN state") From 2107880e871248399c1c2c86442f9a5b85fe749e Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Tue, 19 Sep 2023 21:09:09 +0200 Subject: [PATCH 4/6] Remove unused comment Signed-off-by: Luca Carrogu --- tests/slurm_plugin/test_resume.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/slurm_plugin/test_resume.py b/tests/slurm_plugin/test_resume.py index 1c7c58cd0..f03faad09 100644 --- a/tests/slurm_plugin/test_resume.py +++ b/tests/slurm_plugin/test_resume.py @@ -444,13 +444,7 @@ def test_resume_config(config_file, expected_attributes, test_datadir, mocker): client_error("InsufficientInstanceCapacity"), ], {"InsufficientInstanceCapacity": {"queue1-st-c5xlarge-2"}}, - [ - # call( - # ["queue1-dy-c5xlarge-1", "queue1-dy-c5xlarge-2", "queue1-st-c5xlarge-1"], - # nodeaddrs=["ip.1.0.0.1", "ip.1.0.0.2", "ip.1.0.0.3"], - # nodehostnames=None, - # ) - ], + [], {}, True, True, From 7e70729cb3147a208547184ac42b5c7acb4519fe Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Sat, 23 Sep 2023 19:24:56 +0200 Subject: [PATCH 5/6] Remove temporary resume setting Remove temporary resume setting used during development of the node-sharing job-level scaling feature Signed-off-by: Luca Carrogu --- src/slurm_plugin/instance_manager.py | 4 ---- src/slurm_plugin/resume.py | 5 ----- tests/slurm_plugin/test_resume.py | 1 - 3 files changed, 10 deletions(-) diff --git a/src/slurm_plugin/instance_manager.py b/src/slurm_plugin/instance_manager.py index e8704c8c4..42b33446e 100644 --- a/src/slurm_plugin/instance_manager.py +++ b/src/slurm_plugin/instance_manager.py @@ -79,7 +79,6 @@ def get_manager( run_instances_overrides: dict = None, create_fleet_overrides: dict = None, job_level_scaling: bool = False, - temp_jls_for_node_sharing: bool = False, ): if job_level_scaling: return JobLevelScalingInstanceManager( @@ -95,7 +94,6 @@ def get_manager( fleet_config=fleet_config, run_instances_overrides=run_instances_overrides, create_fleet_overrides=create_fleet_overrides, - temp_jls_for_node_sharing=temp_jls_for_node_sharing, ) else: return NodeListScalingInstanceManager( @@ -493,7 +491,6 @@ def __init__( fleet_config: Dict[str, any] = None, run_instances_overrides: dict = None, create_fleet_overrides: dict = None, - temp_jls_for_node_sharing: bool = False, ): super().__init__( region=region, @@ -510,7 +507,6 @@ def __init__( create_fleet_overrides=create_fleet_overrides, ) self.unused_launched_instances = {} - self.temp_jls_for_node_sharing = temp_jls_for_node_sharing def _clear_unused_launched_instances(self): """Clear and reset unused launched instances list.""" diff --git a/src/slurm_plugin/resume.py b/src/slurm_plugin/resume.py index 6c90225a6..8f77a76be 100644 --- a/src/slurm_plugin/resume.py +++ b/src/slurm_plugin/resume.py @@ -47,7 +47,6 @@ class SlurmResumeConfig: "fleet_config_file": "/etc/parallelcluster/slurm_plugin/fleet-config.json", "all_or_nothing_batch": True, "job_level_scaling": True, - "temp_jls_for_node_sharing": False, } def __init__(self, config_file_path): @@ -96,9 +95,6 @@ def _get_config(self, config_file_path): self.job_level_scaling = config.getboolean( "slurm_resume", "job_level_scaling", fallback=self.DEFAULTS.get("job_level_scaling") ) - self.temp_jls_for_node_sharing = config.getboolean( - "slurm_resume", "temp_jls_for_node_sharing", fallback=self.DEFAULTS.get("temp_jls_for_node_sharing") - ) fleet_config_file = config.get( "slurm_resume", "fleet_config_file", fallback=self.DEFAULTS.get("fleet_config_file") ) @@ -209,7 +205,6 @@ def 
_resume(arg_nodes, resume_config, slurm_resume): run_instances_overrides=resume_config.run_instances_overrides, create_fleet_overrides=resume_config.create_fleet_overrides, job_level_scaling=resume_config.job_level_scaling, - temp_jls_for_node_sharing=resume_config.temp_jls_for_node_sharing, ) instance_manager.add_instances( slurm_resume=slurm_resume, diff --git a/tests/slurm_plugin/test_resume.py b/tests/slurm_plugin/test_resume.py index f03faad09..4004fba71 100644 --- a/tests/slurm_plugin/test_resume.py +++ b/tests/slurm_plugin/test_resume.py @@ -798,7 +798,6 @@ def test_resume_launch( job_level_scaling=job_level_scaling, assign_node_max_batch_size=500, terminate_max_batch_size=1000, - temp_jls_for_node_sharing=False, ) mocker.patch("slurm_plugin.resume.is_clustermgtd_heartbeat_valid", autospec=True, return_value=is_heartbeat_valid) mock_handle_failed_nodes = mocker.patch("slurm_plugin.resume._handle_failed_nodes", autospec=True) From 3a6a47025446a1ecde8a296162b478c23b665a22 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Tue, 26 Sep 2023 09:54:05 +0200 Subject: [PATCH 6/6] Fix missing parameter for _add_instances_for_nodes Fix missing parameter assign_node_batch_size for _add_instances_for_nodes Signed-off-by: Luca Carrogu --- src/slurm_plugin/instance_manager.py | 1 + tests/slurm_plugin/test_instance_manager.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/slurm_plugin/instance_manager.py b/src/slurm_plugin/instance_manager.py index 42b33446e..1a99df3bd 100644 --- a/src/slurm_plugin/instance_manager.py +++ b/src/slurm_plugin/instance_manager.py @@ -637,6 +637,7 @@ def _scaling_for_jobs_single_node( self._add_instances_for_nodes( node_list=single_nodes, launch_batch_size=launch_batch_size, + assign_node_batch_size=assign_node_batch_size, update_node_address=update_node_address, all_or_nothing_batch=False, ) diff --git a/tests/slurm_plugin/test_instance_manager.py b/tests/slurm_plugin/test_instance_manager.py index aacc6f68b..c24a84b7e 100644 --- a/tests/slurm_plugin/test_instance_manager.py +++ b/tests/slurm_plugin/test_instance_manager.py @@ -3376,6 +3376,7 @@ def test_scaling_for_jobs_single_node( instance_manager._add_instances_for_nodes.assert_called_once_with( node_list=expected_single_nodes_no_oversubscribe, launch_batch_size=launch_batch_size, + assign_node_batch_size=assign_node_batch_size, update_node_address=update_node_address, all_or_nothing_batch=False, )