Skip to content

Commit

Permalink
Avoid to set nodes into DOWN if no nodes are passed as input
Browse files Browse the repository at this point in the history
Avoid to set nodes into DOWN, hence avoid calling Slurm scontrol update, if node list is empty
Avoided log line is
```
2023-09-19 10:56:39,439 - [slurm_plugin.resume:_handle_failed_nodes] - INFO - Setting following failed nodes into DOWN state (x0) [] with reason: (Code:LimitedInstanceCapacity)Failure when resuming nodes
```

Signed-off-by: Luca Carrogu <[email protected]>
  • Loading branch information
lukeseawalker committed Sep 19, 2023
1 parent af16907 commit 92dc440
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 13 deletions.
25 changes: 13 additions & 12 deletions src/slurm_plugin/resume.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,18 +157,19 @@ def _handle_failed_nodes(node_list, reason="Failure when resuming nodes"):
To save time, should explicitly set nodes to DOWN in ResumeProgram so clustermgtd can maintain failed nodes.
Clustermgtd will be responsible for running full DOWN -> POWER_DOWN process.
"""
try:
log.info(
"Setting following failed nodes into DOWN state %s with reason: %s", print_with_count(node_list), reason
)
set_nodes_down(node_list, reason=reason)
except Exception as e:
log.error(
"Failed to place nodes %s into DOWN for reason %s with exception: %s",
print_with_count(node_list),
reason,
e,
)
if node_list:
try:
log.info(
"Setting following failed nodes into DOWN state %s with reason: %s", print_with_count(node_list), reason
)
set_nodes_down(node_list, reason=reason)
except Exception as e:
log.error(
"Failed to place nodes %s into DOWN for reason %s with exception: %s",
print_with_count(node_list),
reason,
e,
)


def _resume(arg_nodes, resume_config, slurm_resume):
Expand Down
16 changes: 15 additions & 1 deletion tests/slurm_plugin/test_resume.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import slurm_plugin
from assertpy import assert_that
from slurm_plugin.fleet_manager import EC2Instance
from slurm_plugin.resume import SlurmResumeConfig, _get_slurm_resume, _resume
from slurm_plugin.resume import SlurmResumeConfig, _get_slurm_resume, _resume, _handle_failed_nodes

from tests.common import FLEET_CONFIG, LAUNCH_OVERRIDES, client_error

Expand Down Expand Up @@ -916,3 +916,17 @@ def test_get_slurm_resume(config_file, expected_slurm_resume, test_datadir, capl
assert_that(caplog.records).is_length(1)
assert_that(caplog.records[0].levelname).is_equal_to("INFO")
assert_that(caplog.records[0].message).contains("Slurm Resume File content")


@pytest.mark.parametrize(
"node_list, reason, expected_set_nodes_down_call, expected_exception",
[
([], "no_reason", None, None),
],
)
def test_handle_failed_nodes(mocker, node_list, reason, expected_set_nodes_down_call, expected_exception):
set_nodes_down = mocker.patch("slurm_plugin.resume.set_nodes_down")
_handle_failed_nodes(node_list, reason)
if not expected_set_nodes_down_call:
set_nodes_down.assert_not_called()
# TODO complete test coverage

0 comments on commit 92dc440

Please sign in to comment.