From 2b4b6ff90506cbc1ec878a25beabc82c2ddda93b Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Fri, 4 Oct 2024 01:25:48 +0000 Subject: [PATCH 1/3] resource/readers: fix rv1exec based partial cancel Problem: The rv1exec reader partial cancel support doesn't work when there are multiple entries in the execution R_lite array. This is because resource_reader_rv1exec_t::partial_cancel_internal() doesn't accumulate ranks as it loops over the R_lite array. This results in the nuisance log message remove: Final .free RPC failed to remove all resources for jobid... for every job that doesn't have the same core or gpu ids allocated for every rank. Accumulate ranks while looping over entries in R_lite instead of throwing away all but the last ranks idset. Fixes #1301 --- resource/readers/resource_reader_rv1exec.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/resource/readers/resource_reader_rv1exec.cpp b/resource/readers/resource_reader_rv1exec.cpp index d630d239a..bd9242347 100644 --- a/resource/readers/resource_reader_rv1exec.cpp +++ b/resource/readers/resource_reader_rv1exec.cpp @@ -943,6 +943,7 @@ int resource_reader_rv1exec_t::partial_cancel_internal (resource_graph_t &g, json_t *entry = nullptr; const char *ranks = nullptr; struct idset *r_ids = nullptr; + size_t len; // Implementing cancellation of rank subgraph // will require further parsing of nodelist, @@ -956,23 +957,24 @@ int resource_reader_rv1exec_t::partial_cancel_internal (resource_graph_t &g, errno = EINVAL; goto error; } + if (!(r_ids = idset_create (0, IDSET_FLAG_AUTOGROW))) + goto error; json_array_foreach (rlite, index, entry) { - if (json_unpack (entry, "{s:s}", "rank", &ranks) < 0) { + if (json_unpack (entry, "{s:s%}", "rank", &ranks, &len) < 0) { errno = EINVAL; goto error; } + if (idset_decode_add (r_ids, ranks, len, NULL) < 0) + goto error; } - if (!(r_ids = idset_decode (ranks))) - goto error; rank = idset_first (r_ids); while (rank != IDSET_INVALID_ID) { mod_data.ranks_removed.insert (rank); rank = idset_next (r_ids, rank); } - idset_destroy (r_ids); rc = 0; - error: + idset_destroy (r_ids); return rc; } From d1928cbc05cd740a263266e8ba6545722600e1f4 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Fri, 4 Oct 2024 20:17:14 +0000 Subject: [PATCH 2/3] testsuite: support "job" personality for test_under_flux Problem: The test_under_flux sharness helper doesn't support the "job" personality due to missing rc{1,3}-job files and also an environment variable necessary to find them. This makes it so tests can't use TEST_UNDER_FLUX_CORES_PER_RANK to easily set up fake resources for testing. Add the missing rc*-job files in t/rc/ and export FLUX_SOURCE_DIR in sched-sharness.sh so test_under_flux can find them. 
--- t/rc/rc1-job | 74 ++++++++++++++++++++++++++++++++++ t/rc/rc3-job | 32 +++++++++++++++ t/sharness.d/sched-sharness.sh | 4 ++ 3 files changed, 110 insertions(+) create mode 100755 t/rc/rc1-job create mode 100755 t/rc/rc3-job diff --git a/t/rc/rc1-job b/t/rc/rc1-job new file mode 100755 index 000000000..1a02897c8 --- /dev/null +++ b/t/rc/rc1-job @@ -0,0 +1,74 @@ +#!/bin/bash -e + +idset_from_count() { + if test $1 -eq 1; then + echo "0" + else + echo "0-$(($1 - 1))" + fi +} + +set_fake_resources() { + cores=${1} + ranklist=$(idset_from_count $(flux getattr size)) + corelist=$(idset_from_count ${cores}) + R=$(flux R encode -r${ranklist} -c${corelist}) + echo Setting fake resource.R="$R" >&2 + flux kvs put resource.R="$R" +} + +RANK=$(flux getattr rank) + +# Usage: modload {all|} modname [args ...] +modload() { + local where=$1; shift + if test "$where" = "all" || test $where -eq $RANK; then + flux module load $* + fi +} + + +modload all content +modload 0 content-sqlite +modload all kvs +modload all kvs-watch + +modload 0 job-manager + +modload all job-ingest +modload all job-info +modload 0 job-list +modload all barrier +modload 0 heartbeat + +if test $RANK -eq 0; then + # Set fake resources for testing + set_fake_resources ${TEST_UNDER_FLUX_CORES_PER_RANK:-2} +fi +modload all resource noverify + +if [ "${TEST_UNDER_FLUX_NO_JOB_EXEC}" != "y" ] +then + modload 0 job-exec +fi + +# mirror sched-simple default of limited=8 +if [ "${TEST_UNDER_FLUX_SCHED_SIMPLE_MODE}x" != "x" ] +then + mode=${TEST_UNDER_FLUX_SCHED_SIMPLE_MODE} +else + mode="limited=8" +fi + +modload 0 sched-simple mode=${mode} +#--setbit 0x2 enables creation of reason_pending field +if [ $RANK -eq 0 ] +then + flux module debug --setbit 0x2 sched-simple +fi + +test $RANK -ne 0 || flux admin cleanup-push <<-EOT + flux queue stop --all --nocheckpoint + flux cancel --all --states RUN + flux queue idle +EOT diff --git a/t/rc/rc3-job b/t/rc/rc3-job new file mode 100755 index 000000000..533489858 --- /dev/null +++ b/t/rc/rc3-job @@ -0,0 +1,32 @@ +#!/bin/bash -e + +RANK=$(flux getattr rank) + +# Usage: modrm {all|} modname +modrm() { + local where=$1; shift + if test "$where" = "all" || test $where -eq $RANK; then + flux module remove -f $* + fi +} + +if [ "${TEST_UNDER_FLUX_NO_EXEC}" != "y" ] +then + modrm 0 job-exec +fi +modrm 0 heartbeat +modrm 0 sched-simple +modrm all resource +modrm 0 job-list +modrm all job-info +modrm 0 job-manager +modrm all barrier +modrm all kvs-watch +modrm all job-ingest + +modrm all kvs + +flux content flush + +modrm 0 content-sqlite +modrm all content diff --git a/t/sharness.d/sched-sharness.sh b/t/sharness.d/sched-sharness.sh index 9dba81fc1..48b6961db 100644 --- a/t/sharness.d/sched-sharness.sh +++ b/t/sharness.d/sched-sharness.sh @@ -22,6 +22,10 @@ else export PYTHONPATH="${SHARNESS_TEST_SRCDIR}/../src/python${PYTHONPATH:+:${PYTHONPATH}}" fi +# Setup FLUX_SOURCE_DIR for use in flux-sharness.sh +FLUX_SOURCE_DIR="$(cd ${SHARNESS_TEST_SRCDIR}/.. && pwd)" +export FLUX_SOURCE_DIR + ## Set up environment using flux(1) in PATH flux --help >/dev/null 2>&1 || error "Failed to find flux in PATH" eval $(flux env) From e6dad53d5aa34249a7cf46de6f7181c5bf27eacf Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Fri, 4 Oct 2024 20:30:56 +0000 Subject: [PATCH 3/3] testsuite: add real-world test for partial release Problem: The Fluxion testsuite does not contain a test that does end-to-end testing of resource release handling, including partial release as triggered by the Flux housekeeping service. 
Add a new test t1026-rv1-partial-release.t which implements a first cut of this testing. Add to the test specific cases that trigger issue #1301. --- t/CMakeLists.txt | 1 + t/t1026-rv1-partial-release.t | 111 ++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100755 t/t1026-rv1-partial-release.t diff --git a/t/CMakeLists.txt b/t/CMakeLists.txt index 9b0d5fa52..30d3a5369 100644 --- a/t/CMakeLists.txt +++ b/t/CMakeLists.txt @@ -28,6 +28,7 @@ set(ALL_TESTS t1023-multiqueue-constraints.t t1024-alloc-check.t t1025-rv1-reload.t + t1026-rv1-partial-release.t t3000-jobspec.t t3001-resource-basic.t t3002-resource-prefix.t diff --git a/t/t1026-rv1-partial-release.t b/t/t1026-rv1-partial-release.t new file mode 100755 index 000000000..da315d633 --- /dev/null +++ b/t/t1026-rv1-partial-release.t @@ -0,0 +1,111 @@ +#!/bin/sh +test_description='\ +Test fluxion handling of partial release driven by housekeeping' + +. $(dirname $0)/sharness.sh + +export TEST_UNDER_FLUX_CORES_PER_RANK=4 +test_under_flux 4 job + +TOTAL_NCORES=16 + +# Usage dmesg_wait PATTERN +# Wait up to 10s for PATTERN to appear in dmesg output +# +dmesg_wait() { + count=0 + while ! flux dmesg -H | grep "$1" >/dev/null 2>&1; do + count=$((count+1)) + test $count -eq 100 && return 1 # max 100 * 0.1 sleep = 10s + sleep 0.1 + done +} + +# Usage: hk_wait_for_running count +hk_wait_for_running () { + count=0 + while test $(flux housekeeping list -no {id} | wc -l) -ne $1; do + count=$(($count+1)); + test $count -eq 300 && return 1 # max 300 * 0.1s sleep = 30s + sleep 0.1 + done +} + +fluxion_free_cores() { + FLUX_RESOURCE_LIST_RPC=sched.resource-status \ + flux resource list -s free -no {ncores} +} + + +test_expect_success 'load fluxion modules' ' + flux module remove -f sched-simple && + flux module load sched-fluxion-resource && + flux module load sched-fluxion-qmanager && + flux resource list && + FLUX_RESOURCE_LIST_RPC=sched.resource-status flux resource list +' +test_expect_success 'run a normal job, resources are free' ' + flux run -vvv -xN4 /bin/true && + test_debug "echo free=\$(fluxion_free_cores)" && + test $(fluxion_free_cores) -eq $TOTAL_NCORES +' +test_expect_success 'run 4 single node jobs, resources are free' ' + flux submit -v --cc=1-4 -xN1 --wait /bin/true && + flux resource list -s free -no "total={ncores}" && + test $(flux resource list -s free -no {ncores}) -eq $TOTAL_NCORES +' +test_expect_success 'run 16 single core jobs, resources are free' ' + flux submit -v --cc=1-16 -n1 --wait /bin/true && + test_debug "echo free=\$(fluxion_free_cores)" && + test $(fluxion_free_cores) -eq $TOTAL_NCORES +' +test_expect_success 'clear dmesg buffer' ' + flux dmesg -C +' +test_expect_success 'run a job with unequal core distribution, resources are free' ' + flux run -vvv -n7 -l flux getattr rank && + test_debug "flux job info $(flux job last) R | jq" && + test_debug "echo free=\$(fluxion_free_cores)" && + test $(fluxion_free_cores) -eq $TOTAL_NCORES +' +test_expect_success 'attempt to ensure dmesg buffer synchronized' ' + flux logger test-sentinel && + dmesg_wait test-sentinel +' +test_expect_success 'no fluxion errors logged' ' + flux dmesg -H >log.out && + test_debug "cat log.out" && + test_must_fail grep "free RPC failed to remove all resources" log.out +' +test_expect_success 'clear dmesg buffer' ' + flux dmesg -C +' +test_expect_success 'enable housekeeping with immediate partial release' ' + flux config load <<-EOF + [job-manager.housekeeping] + command = ["sleep", "0"] + release-after = "0s" + EOF +' 
+test_expect_success 'run a job across all nodes, wait for housekeeping' '
+	flux run -N4 -n4 /bin/true &&
+	hk_wait_for_running 0
+'
+test_expect_success 'attempt to ensure dmesg buffer synchronized' '
+	flux logger test-sentinel &&
+	dmesg_wait test-sentinel
+'
+test_expect_success 'all resources free' '
+	test_debug "echo free=\$(fluxion_free_cores)" &&
+	test $(fluxion_free_cores) -eq $TOTAL_NCORES
+'
+test_expect_success 'no errors from fluxion' '
+	flux dmesg -H >log2.out &&
+	test_must_fail grep "free RPC failed to remove all resources" log2.out
+'
+test_expect_success 'unload fluxion modules' '
+	flux module remove sched-fluxion-qmanager &&
+	flux module remove sched-fluxion-resource &&
+	flux module load sched-simple
+'
+test_done
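
For reference, the rank-accumulation pattern introduced in patch 1 can be exercised on its own. The sketch below is illustrative only and not part of the series: it assumes flux-core's libidset (<flux/idset.h>) and jansson are available, and the R_lite fragment is an invented two-entry example in which the rank sets carry different core ids, the shape that previously caused all but the last "rank" idset to be discarded.

/*
 * Minimal sketch of the accumulation pattern from patch 1 (illustrative,
 * not part of the series): merge the "rank" idset of every R_lite entry
 * into one idset rather than keeping only the last entry's ranks.
 * Build roughly as: cc sketch.c $(pkg-config --cflags --libs flux-idset jansson)
 */
#include <stdio.h>
#include <stdlib.h>
#include <jansson.h>
#include <flux/idset.h>

int main (void)
{
    /* Invented R_lite with per-rank core ids that differ between entries */
    const char *R_lite =
        "[{\"rank\":\"0-1\",\"children\":{\"core\":\"0-3\"}},"
        " {\"rank\":\"2-3\",\"children\":{\"core\":\"0-1\"}}]";
    json_t *rlite = json_loads (R_lite, 0, NULL);
    struct idset *r_ids = idset_create (0, IDSET_FLAG_AUTOGROW);
    json_t *entry;
    size_t index;
    char *s;

    if (!rlite || !r_ids)
        return 1;
    json_array_foreach (rlite, index, entry) {
        const char *ranks;
        size_t len;
        /* "s%" unpacks the string together with its length */
        if (json_unpack (entry, "{s:s%}", "rank", &ranks, &len) < 0)
            return 1;
        /* accumulate into r_ids instead of overwriting it */
        if (idset_decode_add (r_ids, ranks, len, NULL) < 0)
            return 1;
    }
    s = idset_encode (r_ids, IDSET_FLAG_RANGE);
    printf ("ranks to remove: %s\n", s ? s : "(error)");  /* 0-3 here */
    free (s);
    idset_destroy (r_ids);
    json_decref (rlite);
    return 0;
}

Using idset_decode_add() against an IDSET_FLAG_AUTOGROW idset merges each entry in place, which avoids decoding every R_lite entry into a temporary idset and taking a union by hand.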