Skip to content

Commit

Permalink
Refactor logic in _is_node_in_replacement_valid() to account for `node.instance` being `None`
Browse files Browse the repository at this point in the history
  • Loading branch information
dreambeyondorange committed Mar 1, 2024
1 parent b5c8bb5 commit 58913aa
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 10 deletions.
18 changes: 9 additions & 9 deletions src/slurm_plugin/clustermgtd.py
Original file line number Diff line number Diff line change
Expand Up @@ -1207,17 +1207,17 @@ def _is_node_in_replacement_valid(self, node: SlurmNode, check_node_is_valid):
If check_node_is_valid=True, check whether a node is in replacement,
If check_node_is_valid=False, check whether a node is replacement timeout.
"""
if (
node.is_backing_instance_valid(
self._config.ec2_instance_missing_max_count,
self._nodes_without_backing_instance_count_map,
log_warn_if_unhealthy=True,
)
and node.name in self._static_nodes_in_replacement
):
time_is_expired = time_is_up(
log.debug(f"Checking if node is in replacement {node}")
if node.is_backing_instance_valid(
self._config.ec2_instance_missing_max_count,
self._nodes_without_backing_instance_count_map,
log_warn_if_unhealthy=True,
) and node.name in self._static_nodes_in_replacement:
# Set `time_is_expired` to `False` if `node.instance` is `None` since we don't have a launch time yet
time_is_expired = False if not node.instance else time_is_up(
node.instance.launch_time, self._current_time, grace_time=self._config.node_replacement_timeout
)
log.debug(f"Node {node} is in replacement and timer expired? {time_is_expired}, instance? {node.instance}")
return not time_is_expired if check_node_is_valid else time_is_expired
return False

Expand Down
2 changes: 1 addition & 1 deletion src/slurm_plugin/slurm_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ def is_backing_instance_valid(
if log_warn_if_unhealthy:
logger.warning(
f"Incrementing missing EC2 instance count for node {self.name} to "
f"{nodes_without_backing_instance_count_map[self.name]}."
f"{nodes_without_backing_instance_count_map[self.name].count}."
)
else:
# Remove the slurm node from the map since the instance is healthy
Expand Down

0 comments on commit 58913aa

Please sign in to comment.