Skip to content

Commit

Permalink
testsuite: cover sched reload with partial alloc
Browse files Browse the repository at this point in the history
Problem: there is no test coverage for reloading the scheduler
with partially allocated jobs.

Add a test that runs if the scheduler was able to send the hello
request with the partial-ok flag.
  • Loading branch information
garlick committed Dec 17, 2024
1 parent a49ba81 commit c027728
Showing 1 changed file with 50 additions and 0 deletions.
50 changes: 50 additions & 0 deletions t/t1026-rv1-partial-release.t
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,24 @@ hk_wait_for_running () {
sleep 0.1
done
}
# Usage: hk_wait_for_allocated_nnodes count
#
# Poll `flux housekeeping list` until the total number of nodes still
# allocated to housekeeping equals COUNT.
#
# Arguments: $1 - expected total of the {allocated.nnodes} values
# Returns:   0 once the total matches, 1 after 300 polls (about 30s)
# Globals:   count (written; used as the poll counter)
hk_wait_for_allocated_nnodes () {
	count=0
	# Sum the {allocated.nnodes} values instead of counting output lines.
	# A line count reports how many entries are listed (presumably one per
	# job in housekeeping), so a single job still holding all of its nodes
	# would already look like "1" and end the wait too early.
	# "total + 0" makes awk print 0 when the list is empty.
	while test "$(flux housekeeping list -no {allocated.nnodes} |
		awk '{ total += $1 } END { print total + 0 }')" -ne "$1"; do
		count=$((count+1))
		test $count -eq 300 && return 1 # max 300 * 0.1s sleep = 30s
		sleep 0.1
	done
}

# Print the free core count ({ncores}) from `flux resource list -s free`,
# with FLUX_RESOURCE_LIST_RPC set to sched.resource-status for that one
# command only.
fluxion_free_cores() {
	(
		# The subshell keeps the export from leaking into the caller.
		export FLUX_RESOURCE_LIST_RPC=sched.resource-status
		flux resource list -s free -no "{ncores}"
	)
}
# Print the free node count ({nnodes}) from `flux resource list -s free`,
# with FLUX_RESOURCE_LIST_RPC set to sched.resource-status for that one
# command only.
fluxion_free_nnodes() {
	(
		# The subshell keeps the export from leaking into the caller.
		export FLUX_RESOURCE_LIST_RPC=sched.resource-status
		flux resource list -s free -no "{nnodes}"
	)
}


test_expect_success 'load fluxion modules' '
Expand All @@ -44,6 +57,12 @@ test_expect_success 'load fluxion modules' '
flux resource list &&
FLUX_RESOURCE_LIST_RPC=sched.resource-status flux resource list
'

# Check job manager hello debug message for +partial-ok flag.
# Tests tagged HAVE_PARTIAL_OK are skipped when the message is absent.
# -q: only the exit status matters; do not copy matching log lines into
#     this script's own output, which runs outside any test body here.
# -F: match "+partial-ok" as a fixed string, not a regular expression.
if flux dmesg | grep -qF -e "+partial-ok"; then
	test_set_prereq HAVE_PARTIAL_OK
fi

test_expect_success 'run a normal job, resources are free' '
flux run -vvv -xN4 /bin/true &&
test_debug "echo free=\$(fluxion_free_cores)" &&
Expand Down Expand Up @@ -103,6 +122,37 @@ test_expect_success 'no errors from fluxion' '
flux dmesg -H >log2.out &&
test_must_fail grep "free RPC failed to remove all resources" log.out
'
# Load a [job-manager.housekeeping] configuration via `flux config load`.
# The configured command runs `sh -c` with a script that sleeps forever
# (`sleep inf`) when `flux getattr rank` prints 0 and exits 0 otherwise,
# so housekeeping on rank 0 never finishes on its own ("sticky").
# The \$ keeps the heredoc from expanding $(flux getattr rank) when the
# config is loaded; the substitution is left for the configured sh -c.
# NOTE(review): release-after = "0s" presumably lets finished nodes be
# released immediately while rank 0 is still busy; confirm against the
# housekeeping configuration documentation.
# Only runs when the HAVE_PARTIAL_OK prerequisite is set.
test_expect_success HAVE_PARTIAL_OK 'reconfigure housekeeping with sticky rank 0' '
flux config load <<-EOF
[job-manager.housekeeping]
command = [
"sh",
"-c",
"test \$(flux getattr rank) -eq 0 && sleep inf; exit 0"
]
release-after = "0s"
EOF
'
# Run a 4-node job to completion, then block until
# hk_wait_for_allocated_nnodes reports success for a count of 1.
# Only runs when the HAVE_PARTIAL_OK prerequisite is set.
test_expect_success HAVE_PARTIAL_OK 'run a job on 4 nodes and wait for 3/4 housekeeping' '
	flux run -N4 true && hk_wait_for_allocated_nnodes 1
'
# The scheduler must report exactly 3 free nodes at this point.
# Only runs when the HAVE_PARTIAL_OK prerequisite is set.
test_expect_success HAVE_PARTIAL_OK 'fluxion shows housekeeping node allocated' '
	[ $(fluxion_free_nnodes) -eq 3 ]
'
# Remove the two fluxion modules, qmanager first and then resource,
# stopping at the first removal that fails.
# Only runs when the HAVE_PARTIAL_OK prerequisite is set.
test_expect_success HAVE_PARTIAL_OK 'remove fluxion modules' '
	for mod in sched-fluxion-qmanager sched-fluxion-resource; do
		flux module remove $mod || return 1
	done
'
# Load the fluxion modules again (resource first, then qmanager) and print
# the resource listing both from the default RPC and from
# sched.resource-status.
# The title says "reload" so that this test can be told apart in the test
# report from the 'load fluxion modules' test that runs earlier in the file.
# Only runs when the HAVE_PARTIAL_OK prerequisite is set.
test_expect_success HAVE_PARTIAL_OK 'reload fluxion modules' '
	flux module load sched-fluxion-resource &&
	flux module load sched-fluxion-qmanager &&
	flux resource list &&
	FLUX_RESOURCE_LIST_RPC=sched.resource-status flux resource list
'
# The scheduler must again report exactly 3 free nodes, i.e. the same
# value that was checked before the fluxion modules were removed.
# Only runs when the HAVE_PARTIAL_OK prerequisite is set.
test_expect_success HAVE_PARTIAL_OK 'fluxion still shows housekeeping node allocated' '
	[ $(fluxion_free_nnodes) -eq 3 ]
'
test_expect_success 'unload fluxion modules' '
flux module remove sched-fluxion-qmanager &&
flux module remove sched-fluxion-resource &&
Expand Down

0 comments on commit c027728

Please sign in to comment.