Skip to content

Commit

Permalink
testsuite: cover sched reload with partial alloc
Browse files Browse the repository at this point in the history
Problem: there is no test coverage for reloading the scheduler
with partially allocated jobs.

Add a test that runs if the scheduler was able to send the hello
request with the partial-ok flag.

Use the convenience scripts from sharness.d to load/reload the
scheduler modules, and load qmanager with synchronization so that
tests are not racing with the hello handshake that happens after
module loading completes.
  • Loading branch information
garlick committed Dec 18, 2024
1 parent e84acdf commit 07d200e
Showing 1 changed file with 97 additions and 15 deletions.
112 changes: 97 additions & 15 deletions t/t1026-rv1-partial-release.t
Original file line number Diff line number Diff line change
Expand Up @@ -30,24 +30,44 @@ hk_wait_for_running () {
sleep 0.1
done
}

# Usage: hk_wait_for_allocated_nnodes count
#
# Poll `flux housekeeping list` every 0.1s until its {allocated.nnodes}
# output equals COUNT.  Returns 0 once they match, or 1 if they still
# differ after 300 polls (roughly 30s).  A query that prints nothing or a
# non-number counts as "not yet" and is retried.
hk_wait_for_allocated_nnodes () {
	count=0
	# test(1) exits >1 on a non-integer operand; "until" keeps polling in
	# that case instead of ending the loop with a success status.
	until test "$(flux housekeeping list -no {allocated.nnodes})" -eq "$1" 2>/dev/null; do
		count=$(($count+1))
		test $count -eq 300 && return 1 # max 300 * 0.1s sleep = 30s
		sleep 0.1
	done
}
# Usage: fluxion_free ncores|nnodes
#
# Print the named field of the "free" resource set.  The query is issued
# by `flux resource list` with FLUX_RESOURCE_LIST_RPC set to
# sched.resource-status for that one command.
fluxion_free () {
	FLUX_RESOURCE_LIST_RPC=sched.resource-status \
		flux resource list -s free -no "{$1}"
}
# Usage: fluxion_allocated ncores|nnodes
#
# Print the named field of the "allocated" resource set.  The query is
# issued by `flux resource list` with FLUX_RESOURCE_LIST_RPC set to
# sched.resource-status for that one command.  Exactly one value is
# printed so callers can compare it with `test ... -eq`.
fluxion_allocated () {
	FLUX_RESOURCE_LIST_RPC=sched.resource-status \
		flux resource list -s allocated -no "{$1}"
}


# Replace sched-simple with the fluxion modules, loading each one exactly
# once through the load_resource / load_qmanager_sync helpers.  The
# synchronized qmanager load keeps later tests from racing with the hello
# handshake that follows module loading.  Both resource views are then
# printed for the test log.
test_expect_success 'load fluxion modules' '
	flux module remove -f sched-simple &&
	load_resource match-format=rv1_nosched &&
	load_qmanager_sync &&
	flux resource list &&
	FLUX_RESOURCE_LIST_RPC=sched.resource-status flux resource list
'

# Set the HAVE_PARTIAL_OK prereq when the broker log (flux dmesg) contains
# the job manager hello debug message carrying the +partial-ok flag.
# Tests tagged HAVE_PARTIAL_OK are skipped otherwise.
if flux dmesg | grep +partial-ok
then
	test_set_prereq HAVE_PARTIAL_OK
fi

# After a 4-node exclusive job completes, the free core count reported by
# fluxion_free must equal TOTAL_NCORES again.
test_expect_success 'run a normal job, resources are free' '
	flux run -vvv -xN4 /bin/true &&
	test_debug "echo free=\$(fluxion_free ncores)" &&
	test $(fluxion_free ncores) -eq $TOTAL_NCORES
'
test_expect_success 'run 4 single node jobs, resources are free' '
flux submit -v --cc=1-4 -xN1 --wait /bin/true &&
Expand All @@ -56,17 +76,17 @@ test_expect_success 'run 4 single node jobs, resources are free' '
'
# Submit 16 one-core jobs, wait for all of them, then require that the
# free core count reported by fluxion_free equals TOTAL_NCORES.
test_expect_success 'run 16 single core jobs, resources are free' '
	flux submit -v --cc=1-16 -n1 --wait /bin/true &&
	test_debug "echo free=\$(fluxion_free ncores)" &&
	test $(fluxion_free ncores) -eq $TOTAL_NCORES
'
# Clear the broker log buffer before the next test runs.
# NOTE(review): presumably so later log checks only see new messages; confirm.
test_expect_success 'clear dmesg buffer' '
	flux dmesg -C
'
# Run a 7-task job (each task prints its broker rank), dump the job's R
# in debug mode, then require that the free core count reported by
# fluxion_free equals TOTAL_NCORES.
test_expect_success 'run a job with unequal core distribution, resources are free' '
	flux run -vvv -n7 -l flux getattr rank &&
	test_debug "flux job info $(flux job last) R | jq" &&
	test_debug "echo free=\$(fluxion_free ncores)" &&
	test $(fluxion_free ncores) -eq $TOTAL_NCORES
'
test_expect_success 'attempt to ensure dmesg buffer synchronized' '
flux logger test-sentinel &&
Expand Down Expand Up @@ -96,16 +116,78 @@ test_expect_success 'attempt to ensure dmesg buffer synchronized' '
dmesg_wait test-sentinel
'
# The free core count reported by fluxion_free must equal TOTAL_NCORES.
test_expect_success 'all resources free' '
	test_debug "echo free=\$(fluxion_free ncores)" &&
	test $(fluxion_free ncores) -eq $TOTAL_NCORES
'
# Capture the broker log to log2.out and require that it does not contain
# the "free RPC failed to remove all resources" message.  The grep reads
# the file written by this test rather than an older capture.
test_expect_success 'no errors from fluxion' '
	flux dmesg -H >log2.out &&
	test_must_fail grep "free RPC failed to remove all resources" log2.out
'
# Load a [job-manager.housekeeping] config whose command runs "sleep inf"
# when `flux getattr rank` reports 0 and exits 0 otherwise, with
# release-after set to "0s".  The backslash before $( keeps the command
# substitution from being expanded while the here-document is read, so it
# reaches the config as literal text.
# NOTE(review): the here-document uses <<-EOF, which only strips leading
# tabs; do not indent its lines or the EOF terminator with spaces.
# NOTE(review): presumably this leaves the rank 0 node held in
# housekeeping until it is killed; confirm against housekeeping docs.
test_expect_success HAVE_PARTIAL_OK 'reconfigure housekeeping with sticky node' '
flux config load <<-EOF
[job-manager.housekeeping]
command = [
"sh",
"-c",
"test \$(flux getattr rank) -eq 0 && sleep inf; exit 0"
]
release-after = "0s"
EOF
'
# Run a 4-node job, then wait until hk_wait_for_running sees a count of 1
# and `flux housekeeping list` reports 1 allocated node.
test_expect_success HAVE_PARTIAL_OK 'run a job and wait for node to get stuck' '
	flux run -N4 true &&
	hk_wait_for_running 1 &&
	hk_wait_for_allocated_nnodes 1
'
# The allocated node count reported by fluxion_allocated must be 1.
test_expect_success HAVE_PARTIAL_OK 'fluxion shows 1 node allocated' '
	test "$(fluxion_allocated nnodes)" -eq 1
'
# Reload the scheduler while a node is still allocated: remove qmanager,
# reload the resource module with match-format=rv1_nosched, load qmanager
# again with synchronization, then print both resource views for the log.
test_expect_success HAVE_PARTIAL_OK 'reload fluxion modules' '
	remove_qmanager &&
	reload_resource match-format=rv1_nosched &&
	load_qmanager_sync &&
	flux resource list &&
	FLUX_RESOURCE_LIST_RPC=sched.resource-status flux resource list
'
# After the reload, fluxion_allocated must still report 1 allocated node.
test_expect_success HAVE_PARTIAL_OK 'fluxion still shows 1 node allocated' '
	test "$(fluxion_allocated nnodes)" -eq 1
'
# Kill all housekeeping tasks with `flux housekeeping kill --all`.
test_expect_success HAVE_PARTIAL_OK 'kill housekeeping' '
	flux housekeeping kill --all
'
# Wait until hk_wait_for_running sees a count of 0, then require that
# fluxion_allocated reports 0 allocated nodes.
test_expect_success HAVE_PARTIAL_OK 'fluxion shows 0 nodes allocated' '
	hk_wait_for_running 0 &&
	test "$(fluxion_allocated nnodes)" -eq 0
'
# Switch the resource module to match-format=rv1: remove qmanager, reload
# the resource module, load qmanager again with synchronization, then
# print both resource views for the log.
test_expect_success HAVE_PARTIAL_OK 'reload fluxion modules with match-format=rv1' '
	remove_qmanager &&
	reload_resource match-format=rv1 &&
	load_qmanager_sync &&
	flux resource list &&
	FLUX_RESOURCE_LIST_RPC=sched.resource-status flux resource list
'
# Run a 4-node job, then wait until hk_wait_for_running sees a count of 1
# and `flux housekeeping list` reports 1 allocated node.  Every step is
# joined with && so a failing `flux run` fails the test.
test_expect_success HAVE_PARTIAL_OK 'run a job and wait for node to get stuck' '
	flux run -N4 true &&
	hk_wait_for_running 1 &&
	hk_wait_for_allocated_nnodes 1
'
# The allocated node count reported by fluxion_allocated must be 1.
test_expect_success HAVE_PARTIAL_OK 'fluxion shows 1 nodes allocated' '
	test "$(fluxion_allocated nnodes)" -eq 1
'
# Reload the scheduler with match-format=rv1 while a node is still
# allocated: remove qmanager, reload the resource module, load qmanager
# again with synchronization, then print both resource views for the log.
test_expect_success HAVE_PARTIAL_OK 'reload fluxion modules with match-format=rv1' '
	remove_qmanager &&
	reload_resource match-format=rv1 &&
	load_qmanager_sync &&
	flux resource list &&
	FLUX_RESOURCE_LIST_RPC=sched.resource-status flux resource list
'
# After the reload, fluxion_allocated must still report 1 allocated node.
test_expect_success HAVE_PARTIAL_OK 'fluxion still shows 1 node allocated' '
	test "$(fluxion_allocated nnodes)" -eq 1
'

# Remove the fluxion modules exactly once each through the remove_qmanager
# and remove_resource helpers, then load sched-simple again.
test_expect_success 'unload fluxion modules' '
	remove_qmanager &&
	remove_resource &&
	flux module load sched-simple
'
test_done

0 comments on commit 07d200e

Please sign in to comment.