Skip to content

Commit

Permalink
Merge pull request #2856 from mr0re1/remove_allow_power_down
Browse files Browse the repository at this point in the history
SlurmGCP6. Fix nodes stuck in `down*` state.
  • Loading branch information
mr0re1 authored Aug 2, 2024
2 parents 61abdc7 + 0571bb2 commit df6d788
Showing 1 changed file with 1 addition and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -167,19 +167,6 @@ def _find_tpu_node_status(nodename, state):

return NodeStatus.unchanged


def allow_power_down(state):
    """Decide whether a node in the given state may be powered down.

    Reads ``SuspendExcStates`` from the output of ``scontrol show config``
    and refuses power-down when the node's base state or any of its flags
    is listed there.

    Args:
        state: node state object exposing ``base`` (compared against
            strings such as ``"DOWN"`` by callers) and ``flags`` (assumed
            to be an iterable of flag-name strings -- TODO confirm).

    Returns:
        True when power-down is allowed, False otherwise. Also returns
        True (after logging a warning) when ``SuspendExcStates`` cannot be
        found in the config output.
    """
    config = run(f"{lkp.scontrol} show config").stdout.rstrip()
    # The character class includes "," so that a comma-separated list of
    # states is captured in full rather than being cut at the first entry.
    m = re.search(r"SuspendExcStates\s+=\s+(?P<states>[\w\(\),]+)", config)
    if not m:
        log.warning("SuspendExcStates not found in Slurm config")
        return True
    excluded = set(m.group("states").split(","))
    if "(null)" in excluded:
        # NOTE(review): an unset SuspendExcStates blocks power-down here,
        # as in the original code -- confirm this is the intended policy.
        return False
    # Compare the configured exclusions against the node's own states.
    # The base state is added as a single element; unioning with the bare
    # string would add its individual characters instead.
    node_states = set(state.flags) | {state.base}
    return not (excluded & node_states)


def find_node_status(nodename):
"""Determine node/instance status that requires action"""
state = lkp.slurm_node(nodename)
Expand Down Expand Up @@ -207,7 +194,7 @@ def find_node_status(nodename):
return NodeStatus.unbacked
if state.base != "DOWN" and not power_flags:
return NodeStatus.unbacked
if state.base == "DOWN" and not power_flags and allow_power_down(state):
if state.base == "DOWN" and not power_flags:
return NodeStatus.power_down
if "POWERED_DOWN" in state.flags and lkp.is_static_node(nodename):
return NodeStatus.resume
Expand Down

0 comments on commit df6d788

Please sign in to comment.