-
Notifications
You must be signed in to change notification settings - Fork 41
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1307 from grondo/issue#1301
readers: fix rv1exec cancel with multiple entries in `R_lite` array
- Loading branch information
Showing 6 changed files with 229 additions and 5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#!/bin/bash -e | ||
|
||
# Usage: idset_from_count COUNT
# Print an idset string covering ids 0 .. COUNT-1: "0" when COUNT is 1,
# otherwise "0-<COUNT-1>".
# Returns non-zero (with a message on stderr) if COUNT is not a positive
# decimal integer, instead of emitting a malformed idset such as "0--1".
idset_from_count() {
    case "$1" in
        '' | *[!0-9]*)
            echo "idset_from_count: invalid count: '$1'" >&2
            return 1
            ;;
    esac
    if test "$1" -lt 1; then
        echo "idset_from_count: count must be >= 1: '$1'" >&2
        return 1
    fi
    if test "$1" -eq 1; then
        echo "0"
    else
        echo "0-$(($1 - 1))"
    fi
}
|
||
# Usage: set_fake_resources CORES
# Encode a resource set with `flux R encode`, using a rank idset derived
# from the instance "size" attribute and a core idset derived from CORES,
# then store the result under the KVS key resource.R.
# The encoded value is also echoed to stderr for the test log.
set_fake_resources() {
    # Declare first, assign after: `local x=$(cmd)` would mask a failing cmd.
    local cores ranklist corelist size R
    cores=${1}
    size=$(flux getattr size)
    ranklist=$(idset_from_count "${size}")
    corelist=$(idset_from_count "${cores}")
    R=$(flux R encode -r"${ranklist}" -c"${corelist}")
    printf 'Setting fake resource.R=%s\n' "$R" >&2
    flux kvs put resource.R="$R"
}
|
||
# Rank of the broker executing this script; consulted by modload().
RANK=$(flux getattr rank)

# Usage: modload {all|<rank>} modname [args ...]
# Run `flux module load modname [args ...]` when the first argument is
# "all" or is numerically equal to $RANK; otherwise do nothing.
modload() {
    local where=$1; shift
    if test "$where" = "all" || test "$where" -eq "$RANK"; then
        # "$@" keeps each module argument as its own word.
        flux module load "$@"
    fi
}
|
||
|
||
# Load modules in order. "modload all" runs on every rank; "modload 0"
# runs only on rank 0.
modload all content
modload 0 content-sqlite
modload all kvs
modload all kvs-watch

modload 0 job-manager

modload all job-ingest
modload all job-info
modload 0 job-list
modload all barrier
modload 0 heartbeat

if test "$RANK" -eq 0; then
    # Set fake resources for testing
    set_fake_resources "${TEST_UNDER_FLUX_CORES_PER_RANK:-2}"
fi
modload all resource noverify

if [ "${TEST_UNDER_FLUX_NO_JOB_EXEC}" != "y" ]
then
    modload 0 job-exec
fi

# mirror sched-simple default of limited=8
# (used when TEST_UNDER_FLUX_SCHED_SIMPLE_MODE is unset or empty)
mode=${TEST_UNDER_FLUX_SCHED_SIMPLE_MODE:-limited=8}

modload 0 sched-simple mode=${mode}
#--setbit 0x2 enables creation of reason_pending field
if [ "$RANK" -eq 0 ]
then
    flux module debug --setbit 0x2 sched-simple
fi

# On rank 0 only, pass these commands to `flux admin cleanup-push`.
# The here-document is kept at column 0 so the terminator is recognized
# whether or not leading tabs survive.
test "$RANK" -ne 0 || flux admin cleanup-push <<-EOT
flux queue stop --all --nocheckpoint
flux cancel --all --states RUN
flux queue idle
EOT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
#!/bin/bash -e | ||
|
||
# Rank of the broker executing this script; consulted by modrm().
RANK=$(flux getattr rank)

# Usage: modrm {all|<rank>} modname
# Run `flux module remove -f modname` when the first argument is "all" or
# is numerically equal to $RANK; otherwise do nothing.
modrm() {
    local where=$1; shift
    if test "$where" = "all" || test "$where" -eq "$RANK"; then
        # "$@" keeps each argument as its own word.
        flux module remove -f "$@"
    fi
}
|
||
# The load script gates job-exec on TEST_UNDER_FLUX_NO_JOB_EXEC, so honor
# that name here. The older TEST_UNDER_FLUX_NO_EXEC spelling is still
# accepted so existing callers keep working.
if [ "${TEST_UNDER_FLUX_NO_JOB_EXEC}" != "y" ] &&
   [ "${TEST_UNDER_FLUX_NO_EXEC}" != "y" ]
then
    modrm 0 job-exec
fi
modrm 0 heartbeat
modrm 0 sched-simple
modrm all resource
modrm 0 job-list
modrm all job-info
modrm 0 job-manager
modrm all barrier
modrm all kvs-watch
modrm all job-ingest

modrm all kvs

# Flush content after kvs is removed and before the content modules go.
flux content flush

modrm 0 content-sqlite
modrm all content
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
#!/bin/sh | ||
test_description='\
Test fluxion handling of partial release driven by housekeeping'

# Quote the path so a test directory containing spaces still sources.
. "$(dirname "$0")/sharness.sh"

export TEST_UNDER_FLUX_CORES_PER_RANK=4
test_under_flux 4 job

# Expected total core count; presumably 4 brokers x 4 cores per rank
# (see the two settings above) -- confirm against test_under_flux.
TOTAL_NCORES=16
|
||
# Usage dmesg_wait PATTERN
# Wait up to 10s for PATTERN to appear in dmesg output
#
# Polls `flux dmesg -H` every 0.1s. Returns 0 as soon as grep matches
# PATTERN, or 1 after 100 unsuccessful attempts.
# Note: uses the global variable "count" as its attempt counter.
dmesg_wait() {
	count=0
	# "--" keeps a PATTERN that starts with "-" from being read as an option.
	while ! flux dmesg -H | grep -- "$1" >/dev/null 2>&1; do
		count=$((count+1))
		test "$count" -eq 100 && return 1 # max 100 * 0.1 sleep = 10s
		sleep 0.1
	done
}
|
||
# Usage: hk_wait_for_running count
# Poll every 0.1s until `flux housekeeping list -no {id}` prints exactly
# "count" lines. Returns 0 on match, or 1 after 300 unsuccessful attempts.
# Note: uses the global variable "count" as its attempt counter.
hk_wait_for_running () {
	count=0
	# The command substitution is deliberately unquoted so that any padding
	# whitespace some wc implementations emit is dropped before the compare.
	while test $(flux housekeeping list -no {id} | wc -l) -ne "$1"; do
		count=$((count+1))
		test "$count" -eq 300 && return 1 # max 300 * 0.1s sleep = 30s
		sleep 0.1
	done
}
|
||
# Usage: fluxion_free_cores
# Print the {ncores} field of `flux resource list -s free`, run with
# FLUX_RESOURCE_LIST_RPC=sched.resource-status set in its environment.
fluxion_free_cores() {
	FLUX_RESOURCE_LIST_RPC=sched.resource-status \
		flux resource list -s free -no {ncores}
}
|
||
|
||
# Swap sched-simple for the fluxion modules, then check that all cores are
# reported free after each of several job shapes.
test_expect_success 'load fluxion modules' '
	flux module remove -f sched-simple &&
	flux module load sched-fluxion-resource &&
	flux module load sched-fluxion-qmanager &&
	flux resource list &&
	FLUX_RESOURCE_LIST_RPC=sched.resource-status flux resource list
'
test_expect_success 'run a normal job, resources are free' '
	flux run -vvv -xN4 /bin/true &&
	test_debug "echo free=\$(fluxion_free_cores)" &&
	test $(fluxion_free_cores) -eq $TOTAL_NCORES
'
test_expect_success 'run 4 single node jobs, resources are free' '
	flux submit -v --cc=1-4 -xN1 --wait /bin/true &&
	flux resource list -s free -no "total={ncores}" &&
	test $(flux resource list -s free -no {ncores}) -eq $TOTAL_NCORES
'
test_expect_success 'run 16 single core jobs, resources are free' '
	flux submit -v --cc=1-16 -n1 --wait /bin/true &&
	test_debug "echo free=\$(fluxion_free_cores)" &&
	test $(fluxion_free_cores) -eq $TOTAL_NCORES
'
test_expect_success 'clear dmesg buffer' '
	flux dmesg -C
'
test_expect_success 'run a job with unequal core distribution, resources are free' '
	flux run -vvv -n7 -l flux getattr rank &&
	test_debug "flux job info $(flux job last) R | jq" &&
	test_debug "echo free=\$(fluxion_free_cores)" &&
	test $(fluxion_free_cores) -eq $TOTAL_NCORES
'
# Log a sentinel and wait for it, so the dmesg snapshot taken next has
# caught up to this point.
test_expect_success 'attempt to ensure dmesg buffer synchronized' '
	flux logger test-sentinel &&
	dmesg_wait test-sentinel
'
test_expect_success 'no fluxion errors logged' '
	flux dmesg -H >log.out &&
	test_debug "cat log.out" &&
	test_must_fail grep "free RPC failed to remove all resources" log.out
'
test_expect_success 'clear dmesg buffer' '
	flux dmesg -C
'
# The here-document body and terminator are kept at column 0 so that
# <<-EOF terminates whether or not leading tabs survive.
test_expect_success 'enable housekeeping with immediate partial release' '
	flux config load <<-EOF
[job-manager.housekeeping]
command = ["sleep", "0"]
release-after = "0s"
EOF
'
test_expect_success 'run a job across all nodes, wait for housekeeping' '
	flux run -N4 -n4 /bin/true &&
	hk_wait_for_running 0
'
test_expect_success 'attempt to ensure dmesg buffer synchronized' '
	flux logger test-sentinel &&
	dmesg_wait test-sentinel
'
test_expect_success 'all resources free' '
	test_debug "echo free=\$(fluxion_free_cores)" &&
	test $(fluxion_free_cores) -eq $TOTAL_NCORES
'
# Capture the log to log2.out and search that same file. The original
# grepped log.out, a snapshot taken earlier in the script, so errors
# logged after housekeeping was enabled were never examined.
test_expect_success 'no errors from fluxion' '
	flux dmesg -H >log2.out &&
	test_must_fail grep "free RPC failed to remove all resources" log2.out
'
# Remove the fluxion modules and load sched-simple again, reversing the
# swap made by the "load fluxion modules" test.
test_expect_success 'unload fluxion modules' '
	flux module remove sched-fluxion-qmanager &&
	flux module remove sched-fluxion-resource &&
	flux module load sched-simple
'
test_done