Skip to content

Commit

Permalink
testsuite: cover sched reload with partial alloc
Browse files Browse the repository at this point in the history
Problem: there is no test coverage for reloading the scheduler
with partially allocated jobs.

Add a test that runs if the scheduler was able to send the hello
request with the partial-ok flag.

Use the convenience scripts from sharness.d to load/reload the
scheduler modules, and load qmanager with synchronization so that
tests are not racing with the hello handshake that happens after
module loading completes.
  • Loading branch information
garlick committed Dec 17, 2024
1 parent e84acdf commit 9125350
Showing 1 changed file with 51 additions and 4 deletions.
55 changes: 51 additions & 4 deletions t/t1026-rv1-partial-release.t
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,39 @@ hk_wait_for_running () {
sleep 0.1
done
}
# Usage: hk_wait_for_allocated_nnodes count
#
# Poll `flux housekeeping list` until the total number of nodes it reports
# as still allocated equals COUNT.
#
# Arguments: $1 - expected total of the {allocated.nnodes} column
# Returns:   0 once the total matches, 1 if it never matches within
#            300 polls (about 30s at 0.1s per poll)
#
# The values printed for {allocated.nnodes} are summed rather than counting
# output lines: a single housekeeping entry that still holds several nodes
# prints exactly one line, so a line count would match "1" before any node
# has actually been released.  Empty output (nothing listed) sums to 0.
#
# Plain globals are used instead of `local` so the helper stays usable
# under a strict POSIX /bin/sh.
hk_wait_for_allocated_nnodes () {
    count=0
    while :; do
        hk_nnodes=0
        # Word splitting of the command output is intentional here: each
        # word is one entry's allocated node count.
        for hk_n in $(flux housekeeping list -no "{allocated.nnodes}"); do
            hk_nnodes=$((hk_nnodes + hk_n))
        done
        test "$hk_nnodes" -eq "$1" && return 0
        count=$((count + 1))
        test "$count" -eq 300 && return 1 # max 300 * 0.1s sleep = 30s
        sleep 0.1
    done
}

# Print the {ncores} field of `flux resource list -s free`, with
# FLUX_RESOURCE_LIST_RPC set to sched.resource-status for that one query.
# The exit status is that of the flux command.
fluxion_free_cores() {
    (
        # The subshell keeps the override from leaking into the caller.
        export FLUX_RESOURCE_LIST_RPC=sched.resource-status
        flux resource list -s free -no {ncores}
    )
}
# Print the {nnodes} field of `flux resource list -s free`, with
# FLUX_RESOURCE_LIST_RPC set to sched.resource-status for that one query.
# The exit status is that of the flux command.
fluxion_free_nnodes() {
    (
        # The subshell keeps the override from leaking into the caller.
        export FLUX_RESOURCE_LIST_RPC=sched.resource-status
        flux resource list -s free -no {nnodes}
    )
}


# Force-remove sched-simple, bring up the fluxion resource and qmanager
# modules, then print the resource listing twice: once as-is and once with
# FLUX_RESOURCE_LIST_RPC=sched.resource-status.
# NOTE(review): the body contains both explicit `flux module load` commands
# and the load_resource / load_qmanager_sync helpers.  The commit summary
# reports 4 deletions, so the `flux module load` pair is presumably the
# removed side of the diff rendered without -/+ markers -- confirm against
# the repository before running this text as a script.
test_expect_success 'load fluxion modules' '
flux module remove -f sched-simple &&
flux module load sched-fluxion-resource &&
flux module load sched-fluxion-qmanager &&
load_resource &&
load_qmanager_sync &&
flux resource list &&
FLUX_RESOURCE_LIST_RPC=sched.resource-status flux resource list
'

# Check job manager hello debug message for +partial-ok flag
# If any line of `flux dmesg` output contains "+partial-ok", set the
# HAVE_PARTIAL_OK prerequisite; tests that name this prerequisite are
# otherwise skipped.  Matching lines are also echoed by grep.
if flux dmesg | grep +partial-ok; then
test_set_prereq HAVE_PARTIAL_OK
fi

test_expect_success 'run a normal job, resources are free' '
flux run -vvv -xN4 /bin/true &&
test_debug "echo free=\$(fluxion_free_cores)" &&
Expand Down Expand Up @@ -103,9 +122,37 @@ test_expect_success 'no errors from fluxion' '
flux dmesg -H >log2.out &&
test_must_fail grep "free RPC failed to remove all resources" log.out
'
# Load a new [job-manager.housekeeping] config whose command runs
# `sleep inf` when `flux getattr rank` reports 0 and exits 0 otherwise, so
# housekeeping never finishes on rank 0.  release-after is set to "0s".
# NOTE(review): the here-document uses <<-EOF, which strips leading tabs
# only; this rendering has lost the original indentation.
test_expect_success HAVE_PARTIAL_OK 'reconfigure housekeeping with sticky rank 0' '
flux config load <<-EOF
[job-manager.housekeeping]
command = [
"sh",
"-c",
"test \$(flux getattr rank) -eq 0 && sleep inf; exit 0"
]
release-after = "0s"
EOF
'
# Run `true` across 4 nodes, then block until
# hk_wait_for_allocated_nnodes 1 succeeds (it gives up after about 30s).
# Per the test title, the intent is that housekeeping has finished on 3 of
# the 4 nodes and one node is still held.
test_expect_success HAVE_PARTIAL_OK 'run a job on 4 nodes and wait for 3/4 housekeeping' '
flux run -N4 true &&
hk_wait_for_allocated_nnodes 1
'
# Expect fluxion_free_nnodes to report exactly 3 free nodes, i.e. one of
# the 4 nodes used by the previous job is not yet free.
test_expect_success HAVE_PARTIAL_OK 'fluxion shows housekeeping node allocated' '
test $(fluxion_free_nnodes) -eq 3
'
# Remove qmanager, reload the resource module, and load qmanager again via
# load_qmanager_sync, then print the resource listing with and without
# FLUX_RESOURCE_LIST_RPC=sched.resource-status.
# NOTE(review): remove_qmanager, reload_resource and load_qmanager_sync are
# defined outside this view (the commit message attributes them to
# sharness.d); their exact behavior is not visible here.
test_expect_success HAVE_PARTIAL_OK 'reload fluxion modules' '
remove_qmanager &&
reload_resource &&
load_qmanager_sync &&
flux resource list &&
FLUX_RESOURCE_LIST_RPC=sched.resource-status flux resource list
'
# After the scheduler reload, expect fluxion_free_nnodes to still report
# exactly 3 free nodes: the partially released allocation must survive
# the reload.
test_expect_success HAVE_PARTIAL_OK 'fluxion still shows housekeeping node allocated' '
test $(fluxion_free_nnodes) -eq 3
'
# Tear down the fluxion modules and restore sched-simple, then finish the
# test script with test_done.
# NOTE(review): the body contains both explicit `flux module remove`
# commands and the remove_qmanager / remove_resource helpers.  The commit
# summary reports 4 deletions, so the `flux module remove` pair is
# presumably the removed side of the diff rendered without -/+ markers --
# confirm against the repository before running this text as a script.
test_expect_success 'unload fluxion modules' '
flux module remove sched-fluxion-qmanager &&
flux module remove sched-fluxion-resource &&
remove_qmanager &&
remove_resource &&
flux module load sched-simple
'
test_done

0 comments on commit 9125350

Please sign in to comment.