[develop] Job Level Scaling for Node Sharing #564
Changes from all commits
3f90d5c
af16907
6b34fd4
2107880
7e70729
3a6a470
@@ -79,7 +79,6 @@ def get_manager(
     run_instances_overrides: dict = None,
     create_fleet_overrides: dict = None,
     job_level_scaling: bool = False,
-    temp_jls_for_node_sharing: bool = False,
 ):
     if job_level_scaling:
         return JobLevelScalingInstanceManager(
@@ -95,7 +94,6 @@ def get_manager(
             fleet_config=fleet_config,
             run_instances_overrides=run_instances_overrides,
             create_fleet_overrides=create_fleet_overrides,
-            temp_jls_for_node_sharing=temp_jls_for_node_sharing,
         )
     else:
         return NodeListScalingInstanceManager(
@@ -493,7 +491,6 @@ def __init__(
         fleet_config: Dict[str, any] = None,
         run_instances_overrides: dict = None,
         create_fleet_overrides: dict = None,
-        temp_jls_for_node_sharing: bool = False,
     ):
         super().__init__(
             region=region,
@@ -510,7 +507,6 @@ def __init__(
             create_fleet_overrides=create_fleet_overrides,
         )
         self.unused_launched_instances = {}
-        self.temp_jls_for_node_sharing = temp_jls_for_node_sharing

     def _clear_unused_launched_instances(self):
         """Clear and reset unused launched instances list."""
@@ -634,12 +630,14 @@ def _scaling_for_jobs_single_node(
                 all_or_nothing_batch=all_or_nothing_batch,
             )
         else:
-            # Batch all single node no oversubscribe jobs in a single best-effort EC2 launch request
+            # Batch all single node jobs in a single best-effort EC2 launch request
             # This to reduce scaling time and save launch API calls
-            single_nodes_no_oversubscribe = [job.nodes_resume[0] for job in job_list]
+            # Remove duplicated node entries (possible in oversubscribe case)
+            single_nodes = list(dict.fromkeys([job.nodes_resume[0] for job in job_list]))
             self._add_instances_for_nodes(
-                node_list=single_nodes_no_oversubscribe,
+                node_list=single_nodes,
                 launch_batch_size=launch_batch_size,
                 assign_node_batch_size=assign_node_batch_size,
                 update_node_address=update_node_address,
                 all_or_nothing_batch=False,
             )
@@ -660,36 +658,24 @@ def _add_instances_for_resume_file(
         self._clear_unused_launched_instances()

         self._scaling_for_jobs_single_node(
-            job_list=slurm_resume_data.jobs_single_node_no_oversubscribe,
+            job_list=slurm_resume_data.jobs_single_node_no_oversubscribe
+            + slurm_resume_data.jobs_single_node_oversubscribe,
             launch_batch_size=launch_batch_size,
             assign_node_batch_size=assign_node_batch_size,
             update_node_address=update_node_address,
             all_or_nothing_batch=all_or_nothing_batch,
         )

         self._scaling_for_jobs_multi_node(
-            job_list=slurm_resume_data.jobs_multi_node_no_oversubscribe,
-            node_list=slurm_resume_data.multi_node_no_oversubscribe,
+            job_list=slurm_resume_data.jobs_multi_node_no_oversubscribe
+            + slurm_resume_data.jobs_multi_node_oversubscribe,
+            node_list=slurm_resume_data.multi_node_no_oversubscribe + slurm_resume_data.multi_node_oversubscribe,
             launch_batch_size=launch_batch_size,
             assign_node_batch_size=assign_node_batch_size,
             update_node_address=update_node_address,
             all_or_nothing_batch=all_or_nothing_batch,
         )

-        if not self.temp_jls_for_node_sharing:
-            # node scaling for oversubscribe nodes
-            node_list = list(
-                dict.fromkeys(slurm_resume_data.single_node_oversubscribe + slurm_resume_data.multi_node_oversubscribe)
-            )
-            if node_list:
-                self._add_instances_for_nodes(
-                    node_list=node_list,
-                    launch_batch_size=launch_batch_size,
-                    assign_node_batch_size=assign_node_batch_size,
-                    update_node_address=update_node_address,
-                    all_or_nothing_batch=all_or_nothing_batch,
-                )

     def _scaling_for_jobs_multi_node(
         self,
         job_list,

Comment (on the updated `job_list` argument of `_scaling_for_jobs_single_node`): At this stage, can SlurmResumeData contain a property ...
Reply: absolutely, I think we can drop the distinction between "oversubscribe" and "no oversubscribe", now that we are able to manage both types. I'm considering this for the next PR.

Comment (on the updated `job_list` argument of `_scaling_for_jobs_multi_node`): Same here, having a ...
Reply: yes, see other comment.
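As a rough sketch of what the reviewer is suggesting for a follow-up PR, SlurmResumeData could expose merged accessors so the call sites above no longer concatenate the two lists. The property names `jobs_single_node` and `jobs_multi_node` below are hypothetical and not part of this change.

# Hypothetical sketch only (not part of this PR): merged accessors on SlurmResumeData
# would let callers drop the "+" concatenation at the call sites above.
from dataclasses import dataclass, field
from typing import Any, List


@dataclass
class SlurmResumeDataSketch:
    jobs_single_node_no_oversubscribe: List[Any] = field(default_factory=list)
    jobs_single_node_oversubscribe: List[Any] = field(default_factory=list)
    jobs_multi_node_no_oversubscribe: List[Any] = field(default_factory=list)
    jobs_multi_node_oversubscribe: List[Any] = field(default_factory=list)

    @property
    def jobs_single_node(self) -> List[Any]:
        # Callers such as _scaling_for_jobs_single_node could consume this directly.
        return self.jobs_single_node_no_oversubscribe + self.jobs_single_node_oversubscribe

    @property
    def jobs_multi_node(self) -> List[Any]:
        return self.jobs_multi_node_no_oversubscribe + self.jobs_multi_node_oversubscribe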
@@ -907,6 +893,12 @@ def _add_instances_for_nodes(
             update_node_address=update_node_address,
         )

+    def _reset_failed_nodes(self, nodeset):
+        """Remove nodeset from failed nodes dict."""
+        if nodeset:
+            for error_code in self.failed_nodes:
+                self.failed_nodes[error_code] = self.failed_nodes.get(error_code, set()).difference(nodeset)
+
     def best_effort_node_assignment(
         self,
         assign_node_batch_size,
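For context, here is a minimal standalone sketch of what `_reset_failed_nodes` does to the `failed_nodes` bookkeeping. The surrounding class is stripped down to a bare dict and the node names are hypothetical; only the dictionary handling mirrors the diff.

# Minimal sketch, not the actual class; node names are made up for illustration.
failed_nodes = {
    "LimitedInstanceCapacity": {"queue1-dy-c5-1", "queue1-dy-c5-2"},
    "InsufficientInstanceCapacity": {"queue1-dy-c5-3"},
}

def reset_failed_nodes(failed_nodes, nodeset):
    """Remove nodeset from every error bucket, as _reset_failed_nodes does."""
    if nodeset:
        for error_code in failed_nodes:
            failed_nodes[error_code] = failed_nodes.get(error_code, set()).difference(nodeset)

# Nodes that were eventually launched are cleared from all error buckets; note that
# the error keys themselves remain, possibly mapping to an empty set of nodes.
reset_failed_nodes(failed_nodes, {"queue1-dy-c5-1", "queue1-dy-c5-3"})
# failed_nodes == {"LimitedInstanceCapacity": {"queue1-dy-c5-2"}, "InsufficientInstanceCapacity": set()}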
@@ -935,6 +927,7 @@ def best_effort_node_assignment(
             print_with_count(successful_launched_nodes),
         )
         self._update_dict(self.nodes_assigned_to_instances, nodes_resume_mapping)
+        self._reset_failed_nodes(set(successful_launched_nodes))
         if len(successful_launched_nodes) < len(nodes_resume_list):
             # set limited capacity on the failed to launch nodes
             self._update_failed_nodes(set(failed_launch_nodes), "LimitedInstanceCapacity", override=False)
@@ -968,6 +961,7 @@ def all_or_nothing_node_assignment(
                 print_with_count(nodes_resume_list),
             )
             self._update_dict(self.nodes_assigned_to_instances, nodes_resume_mapping)
+            self._reset_failed_nodes(set(nodes_resume_list))
         except InstanceToNodeAssignmentError:
             # Failed to assign EC2 instances to nodes
             # EC2 Instances already assigned, are going to be terminated by

Comment (on lines 963 to 964): NIT: It seems that both ...
Reply: not really, they are different. Let's sync on this.
@@ -47,7 +47,6 @@ class SlurmResumeConfig:
         "fleet_config_file": "/etc/parallelcluster/slurm_plugin/fleet-config.json",
         "all_or_nothing_batch": True,
         "job_level_scaling": True,
-        "temp_jls_for_node_sharing": False,
     }

     def __init__(self, config_file_path):
@@ -96,9 +95,6 @@ def _get_config(self, config_file_path):
         self.job_level_scaling = config.getboolean(
             "slurm_resume", "job_level_scaling", fallback=self.DEFAULTS.get("job_level_scaling")
         )
-        self.temp_jls_for_node_sharing = config.getboolean(
-            "slurm_resume", "temp_jls_for_node_sharing", fallback=self.DEFAULTS.get("temp_jls_for_node_sharing")
-        )
         fleet_config_file = config.get(
             "slurm_resume", "fleet_config_file", fallback=self.DEFAULTS.get("fleet_config_file")
         )
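For reference, a minimal sketch of how a boolean flag like `job_level_scaling` resolves against the DEFAULTS fallback in this pattern; the config content below is hypothetical and only mirrors the `config.getboolean` call kept in the hunk above.

import configparser

DEFAULTS = {"job_level_scaling": True}

config = configparser.ConfigParser()
config.read_string("[slurm_resume]\n")  # hypothetical: the flag is not set in the file

# Mirrors the pattern in _get_config: a missing option falls back to DEFAULTS.
job_level_scaling = config.getboolean(
    "slurm_resume", "job_level_scaling", fallback=DEFAULTS.get("job_level_scaling")
)
print(job_level_scaling)  # True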
@@ -157,18 +153,19 @@ def _handle_failed_nodes(node_list, reason="Failure when resuming nodes"):
     To save time, should explicitly set nodes to DOWN in ResumeProgram so clustermgtd can maintain failed nodes.
     Clustermgtd will be responsible for running full DOWN -> POWER_DOWN process.
     """
-    try:
-        log.info(
-            "Setting following failed nodes into DOWN state %s with reason: %s", print_with_count(node_list), reason
-        )
-        set_nodes_down(node_list, reason=reason)
-    except Exception as e:
-        log.error(
-            "Failed to place nodes %s into DOWN for reason %s with exception: %s",
-            print_with_count(node_list),
-            reason,
-            e,
-        )
+    if node_list:
+        try:
+            log.info(
+                "Setting following failed nodes into DOWN state %s with reason: %s", print_with_count(node_list), reason
+            )
+            set_nodes_down(node_list, reason=reason)
+        except Exception as e:
+            log.error(
+                "Failed to place nodes %s into DOWN for reason %s with exception: %s",
+                print_with_count(node_list),
+                reason,
+                e,
+            )


 def _resume(arg_nodes, resume_config, slurm_resume):

Comment (on the new `if node_list:` guard): Out of curiosity, which code path is resulting in us calling the ...
Reply: when we have reset the node failures with _reset_failed_nodes, it could happen that the error key is still there but there are no more nodes associated to that error.
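The reply above is the motivation for the new guard: after `_reset_failed_nodes`, an error key can remain in `failed_nodes` with an empty node set, so `_handle_failed_nodes` can end up being called with an empty list. A hedged sketch of that situation follows; the loop over `failed_nodes` is paraphrased for illustration, not copied from the repository.

# Sketch of the situation described in the reply; the real caller in ResumeProgram
# is paraphrased here, and the print stands in for the log/set_nodes_down calls.
failed_nodes = {"LimitedInstanceCapacity": set()}  # nodes cleared by _reset_failed_nodes

def handle_failed_nodes(node_list, reason="Failure when resuming nodes"):
    if node_list:  # guard added by this PR: skip the whole block for an empty list
        print(f"Setting {node_list} DOWN with reason: {reason}")

for error_code, nodes in failed_nodes.items():
    # Without the guard, this iteration would log and try to set DOWN an empty node list.
    handle_failed_nodes(list(nodes), reason=error_code)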
@@ -208,7 +205,6 @@ def _resume(arg_nodes, resume_config, slurm_resume):
         run_instances_overrides=resume_config.run_instances_overrides,
         create_fleet_overrides=resume_config.create_fleet_overrides,
         job_level_scaling=resume_config.job_level_scaling,
-        temp_jls_for_node_sharing=resume_config.temp_jls_for_node_sharing,
     )
     instance_manager.add_instances(
         slurm_resume=slurm_resume,
Comment (on `single_nodes = list(dict.fromkeys(...))` in `_scaling_for_jobs_single_node`): Any reason to use `list(dict.fromkeys(...))` instead of `list(set(...))`? We're expecting only a single node, right?
Reply: I wanted to preserve the order.
Reply: I think we'll have only a single node. `dict.fromkeys` is fine, I just wanted to understand if there was another reason other than the order.
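A quick illustration of the order-preserving point discussed above; the node names are hypothetical.

# dict.fromkeys de-duplicates while preserving first-seen order (insertion order is
# guaranteed for dicts since Python 3.7); set() also de-duplicates but gives no
# ordering guarantee.
nodes = ["queue1-st-c5-1", "queue1-st-c5-2", "queue1-st-c5-1"]

print(list(dict.fromkeys(nodes)))  # ['queue1-st-c5-1', 'queue1-st-c5-2']  (order kept)
print(list(set(nodes)))            # same elements, arbitrary order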