Skip to content

Commit

Permalink
Patch ec2_missing_backing_instance_limit
Browse files Browse the repository at this point in the history
Signed-off-by: Francesco Giordano <[email protected]>
  • Loading branch information
francesco-giordano committed Jan 29, 2024
1 parent 786ef08 commit e55734a
Showing 1 changed file with 35 additions and 8 deletions.
43 changes: 35 additions & 8 deletions src/slurm_plugin/slurm_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@

CONFIG_FILE_DIR = "/etc/parallelcluster/slurm_plugin"

ec2_missing_backing_instance_limit = 2
node_failure_count = {}

class PartitionStatus(Enum):
UP = "UP"
Expand Down Expand Up @@ -124,6 +126,7 @@ def __init__(self, name, nodeaddr, nodehostname, state, partitions=None, reason=
self.is_failing_health_check = False
self.error_code = self._parse_error_code()
self.queue_name, self._node_type, self.compute_resource_name = parse_nodename(name)
self.ec2_backing_instance = None

def is_nodeaddr_set(self):
"""Check if nodeaddr(private ip) for the node is set."""
Expand Down Expand Up @@ -263,16 +266,40 @@ def is_powering_down_with_nodeaddr(self):

def is_backing_instance_valid(self, log_warn_if_unhealthy=True):
"""Check if a slurm node's addr is set, it points to a valid instance in EC2."""
if self.ec2_backing_instance is not None:
return self.ec2_backing_instance

logger.debug("node_failure_count: " + str(node_failure_count))
self.ec2_backing_instance = True
if self.is_nodeaddr_set():
if not self.instance:
if log_warn_if_unhealthy:
logger.warning(
"Node state check: no corresponding instance in EC2 for node %s, node state: %s",
self,
self.state_string,
)
return False
return True
cycle_missing = node_failure_count.get(self.name, 0)
if cycle_missing >= ec2_missing_backing_instance_limit:
if log_warn_if_unhealthy:
logger.warning(
"Node state check: no corresponding instance in EC2 for node %s, node state: %s",
self,
self.state_string
)
node_failure_count.pop(self.name, None)
self.ec2_backing_instance = False
else:
# If the limit has not been reach increase the count and is backing instance still valid
if log_warn_if_unhealthy:
logger.warning(
"Node state check: no corresponding instance in EC2 for node %s, node state: %s."
" Retry left: %s",
self,
self.state_string,
str(ec2_missing_backing_instance_limit - cycle_missing)
)
node_failure_count[self.name] = cycle_missing + 1
else:
node_failure_count.pop(self.name, None)
else:
node_failure_count.pop(self.name, None)

return self.ec2_backing_instance

@abstractmethod
def needs_reset_when_inactive(self):
Expand Down

0 comments on commit e55734a

Please sign in to comment.