From 2b4b6ff90506cbc1ec878a25beabc82c2ddda93b Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Fri, 4 Oct 2024 01:25:48 +0000 Subject: [PATCH 1/3] resource/readers: fix rv1exec based partial cancel Problem: The rv1exec reader partial cancel support doesn't work when there are multiple entries in the execution R_lite array. This is because resource_reader_rv1exec_t::partial_cancel_internal() doesn't accumulate ranks as it loops over the R_lite array. This results in the nuisance log message remove: Final .free RPC failed to remove all resources for jobid... for every job that doesn't have the same core or gpu ids allocated for every rank. Accumulate ranks while looping over entries in R_lite instead of throwing away all but the last ranks idset. Fixes #1301 --- resource/readers/resource_reader_rv1exec.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/resource/readers/resource_reader_rv1exec.cpp b/resource/readers/resource_reader_rv1exec.cpp index d630d239a..bd9242347 100644 --- a/resource/readers/resource_reader_rv1exec.cpp +++ b/resource/readers/resource_reader_rv1exec.cpp @@ -943,6 +943,7 @@ int resource_reader_rv1exec_t::partial_cancel_internal (resource_graph_t &g, json_t *entry = nullptr; const char *ranks = nullptr; struct idset *r_ids = nullptr; + size_t len; // Implementing cancellation of rank subgraph // will require further parsing of nodelist, @@ -956,23 +957,24 @@ int resource_reader_rv1exec_t::partial_cancel_internal (resource_graph_t &g, errno = EINVAL; goto error; } + if (!(r_ids = idset_create (0, IDSET_FLAG_AUTOGROW))) + goto error; json_array_foreach (rlite, index, entry) { - if (json_unpack (entry, "{s:s}", "rank", &ranks) < 0) { + if (json_unpack (entry, "{s:s%}", "rank", &ranks, &len) < 0) { errno = EINVAL; goto error; } + if (idset_decode_add (r_ids, ranks, len, NULL) < 0) + goto error; } - if (!(r_ids = idset_decode (ranks))) - goto error; rank = idset_first (r_ids); while (rank != IDSET_INVALID_ID) { mod_data.ranks_removed.insert (rank); rank = idset_next (r_ids, rank); } - idset_destroy (r_ids); rc = 0; - error: + idset_destroy (r_ids); return rc; } From d1928cbc05cd740a263266e8ba6545722600e1f4 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Fri, 4 Oct 2024 20:17:14 +0000 Subject: [PATCH 2/3] testsuite: support "job" personality for test_under_flux Problem: The test_under_flux sharness helper doesn't support the "job" personality due to missing rc{1,3}-job files and also an environment variable necessary to find them. This makes it so tests can't use TEST_UNDER_FLUX_CORES_PER_RANK to easily set up fake resources for testing. Add the missing rc*-job files in t/rc/ and export FLUX_SOURCE_DIR in sched-sharness.sh so test_under_flux can find them. 
--- t/rc/rc1-job | 74 ++++++++++++++++++++++++++++++++++ t/rc/rc3-job | 32 +++++++++++++++ t/sharness.d/sched-sharness.sh | 4 ++ 3 files changed, 110 insertions(+) create mode 100755 t/rc/rc1-job create mode 100755 t/rc/rc3-job diff --git a/t/rc/rc1-job b/t/rc/rc1-job new file mode 100755 index 000000000..1a02897c8 --- /dev/null +++ b/t/rc/rc1-job @@ -0,0 +1,74 @@ +#!/bin/bash -e + +idset_from_count() { + if test $1 -eq 1; then + echo "0" + else + echo "0-$(($1 - 1))" + fi +} + +set_fake_resources() { + cores=${1} + ranklist=$(idset_from_count $(flux getattr size)) + corelist=$(idset_from_count ${cores}) + R=$(flux R encode -r${ranklist} -c${corelist}) + echo Setting fake resource.R="$R" >&2 + flux kvs put resource.R="$R" +} + +RANK=$(flux getattr rank) + +# Usage: modload {all|} modname [args ...] +modload() { + local where=$1; shift + if test "$where" = "all" || test $where -eq $RANK; then + flux module load $* + fi +} + + +modload all content +modload 0 content-sqlite +modload all kvs +modload all kvs-watch + +modload 0 job-manager + +modload all job-ingest +modload all job-info +modload 0 job-list +modload all barrier +modload 0 heartbeat + +if test $RANK -eq 0; then + # Set fake resources for testing + set_fake_resources ${TEST_UNDER_FLUX_CORES_PER_RANK:-2} +fi +modload all resource noverify + +if [ "${TEST_UNDER_FLUX_NO_JOB_EXEC}" != "y" ] +then + modload 0 job-exec +fi + +# mirror sched-simple default of limited=8 +if [ "${TEST_UNDER_FLUX_SCHED_SIMPLE_MODE}x" != "x" ] +then + mode=${TEST_UNDER_FLUX_SCHED_SIMPLE_MODE} +else + mode="limited=8" +fi + +modload 0 sched-simple mode=${mode} +#--setbit 0x2 enables creation of reason_pending field +if [ $RANK -eq 0 ] +then + flux module debug --setbit 0x2 sched-simple +fi + +test $RANK -ne 0 || flux admin cleanup-push <<-EOT + flux queue stop --all --nocheckpoint + flux cancel --all --states RUN + flux queue idle +EOT diff --git a/t/rc/rc3-job b/t/rc/rc3-job new file mode 100755 index 000000000..533489858 --- /dev/null +++ b/t/rc/rc3-job @@ -0,0 +1,32 @@ +#!/bin/bash -e + +RANK=$(flux getattr rank) + +# Usage: modrm {all|} modname +modrm() { + local where=$1; shift + if test "$where" = "all" || test $where -eq $RANK; then + flux module remove -f $* + fi +} + +if [ "${TEST_UNDER_FLUX_NO_EXEC}" != "y" ] +then + modrm 0 job-exec +fi +modrm 0 heartbeat +modrm 0 sched-simple +modrm all resource +modrm 0 job-list +modrm all job-info +modrm 0 job-manager +modrm all barrier +modrm all kvs-watch +modrm all job-ingest + +modrm all kvs + +flux content flush + +modrm 0 content-sqlite +modrm all content diff --git a/t/sharness.d/sched-sharness.sh b/t/sharness.d/sched-sharness.sh index 9dba81fc1..48b6961db 100644 --- a/t/sharness.d/sched-sharness.sh +++ b/t/sharness.d/sched-sharness.sh @@ -22,6 +22,10 @@ else export PYTHONPATH="${SHARNESS_TEST_SRCDIR}/../src/python${PYTHONPATH:+:${PYTHONPATH}}" fi +# Setup FLUX_SOURCE_DIR for use in flux-sharness.sh +FLUX_SOURCE_DIR="$(cd ${SHARNESS_TEST_SRCDIR}/.. && pwd)" +export FLUX_SOURCE_DIR + ## Set up environment using flux(1) in PATH flux --help >/dev/null 2>&1 || error "Failed to find flux in PATH" eval $(flux env) From e6dad53d5aa34249a7cf46de6f7181c5bf27eacf Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Fri, 4 Oct 2024 20:30:56 +0000 Subject: [PATCH 3/3] testsuite: add real-world test for partial release Problem: The Fluxion testsuite does not contain a test that does end-to-end testing of resource release handling, including partial release as triggered by the Flux housekeeping service. 
Add a new test t1026-rv1-partial-release.t which implements a first cut of this testing. Add to the test specific cases that trigger issue #1301. --- t/CMakeLists.txt | 1 + t/t1026-rv1-partial-release.t | 111 ++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100755 t/t1026-rv1-partial-release.t diff --git a/t/CMakeLists.txt b/t/CMakeLists.txt index 9b0d5fa52..30d3a5369 100644 --- a/t/CMakeLists.txt +++ b/t/CMakeLists.txt @@ -28,6 +28,7 @@ set(ALL_TESTS t1023-multiqueue-constraints.t t1024-alloc-check.t t1025-rv1-reload.t + t1026-rv1-partial-release.t t3000-jobspec.t t3001-resource-basic.t t3002-resource-prefix.t diff --git a/t/t1026-rv1-partial-release.t b/t/t1026-rv1-partial-release.t new file mode 100755 index 000000000..da315d633 --- /dev/null +++ b/t/t1026-rv1-partial-release.t @@ -0,0 +1,111 @@ +#!/bin/sh +test_description='\ +Test fluxion handling of partial release driven by housekeeping' + +. $(dirname $0)/sharness.sh + +export TEST_UNDER_FLUX_CORES_PER_RANK=4 +test_under_flux 4 job + +TOTAL_NCORES=16 + +# Usage dmesg_wait PATTERN +# Wait up to 10s for PATTERN to appear in dmesg output +# +dmesg_wait() { + count=0 + while ! flux dmesg -H | grep "$1" >/dev/null 2>&1; do + count=$((count+1)) + test $count -eq 100 && return 1 # max 100 * 0.1 sleep = 10s + sleep 0.1 + done +} + +# Usage: hk_wait_for_running count +hk_wait_for_running () { + count=0 + while test $(flux housekeeping list -no {id} | wc -l) -ne $1; do + count=$(($count+1)); + test $count -eq 300 && return 1 # max 300 * 0.1s sleep = 30s + sleep 0.1 + done +} + +fluxion_free_cores() { + FLUX_RESOURCE_LIST_RPC=sched.resource-status \ + flux resource list -s free -no {ncores} +} + + +test_expect_success 'load fluxion modules' ' + flux module remove -f sched-simple && + flux module load sched-fluxion-resource && + flux module load sched-fluxion-qmanager && + flux resource list && + FLUX_RESOURCE_LIST_RPC=sched.resource-status flux resource list +' +test_expect_success 'run a normal job, resources are free' ' + flux run -vvv -xN4 /bin/true && + test_debug "echo free=\$(fluxion_free_cores)" && + test $(fluxion_free_cores) -eq $TOTAL_NCORES +' +test_expect_success 'run 4 single node jobs, resources are free' ' + flux submit -v --cc=1-4 -xN1 --wait /bin/true && + flux resource list -s free -no "total={ncores}" && + test $(flux resource list -s free -no {ncores}) -eq $TOTAL_NCORES +' +test_expect_success 'run 16 single core jobs, resources are free' ' + flux submit -v --cc=1-16 -n1 --wait /bin/true && + test_debug "echo free=\$(fluxion_free_cores)" && + test $(fluxion_free_cores) -eq $TOTAL_NCORES +' +test_expect_success 'clear dmesg buffer' ' + flux dmesg -C +' +test_expect_success 'run a job with unequal core distribution, resources are free' ' + flux run -vvv -n7 -l flux getattr rank && + test_debug "flux job info $(flux job last) R | jq" && + test_debug "echo free=\$(fluxion_free_cores)" && + test $(fluxion_free_cores) -eq $TOTAL_NCORES +' +test_expect_success 'attempt to ensure dmesg buffer synchronized' ' + flux logger test-sentinel && + dmesg_wait test-sentinel +' +test_expect_success 'no fluxion errors logged' ' + flux dmesg -H >log.out && + test_debug "cat log.out" && + test_must_fail grep "free RPC failed to remove all resources" log.out +' +test_expect_success 'clear dmesg buffer' ' + flux dmesg -C +' +test_expect_success 'enable housekeeping with immediate partial release' ' + flux config load <<-EOF + [job-manager.housekeeping] + command = ["sleep", "0"] + release-after = "0s" + EOF +' 
+test_expect_success 'run a job across all nodes, wait for housekeeping' '
+	flux run -N4 -n4 /bin/true &&
+	hk_wait_for_running 0
+'
+test_expect_success 'attempt to ensure dmesg buffer synchronized' '
+	flux logger test-sentinel &&
+	dmesg_wait test-sentinel
+'
+test_expect_success 'all resources free' '
+	test_debug "echo free=\$(fluxion_free_cores)" &&
+	test $(fluxion_free_cores) -eq $TOTAL_NCORES
+'
+test_expect_success 'no errors from fluxion' '
+	flux dmesg -H >log2.out &&
+	test_must_fail grep "free RPC failed to remove all resources" log2.out
+'
+test_expect_success 'unload fluxion modules' '
+	flux module remove sched-fluxion-qmanager &&
+	flux module remove sched-fluxion-resource &&
+	flux module load sched-simple
+'
+test_done
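
For reference, the rank-accumulation pattern introduced in patch 1 can be exercised on its own. The sketch below is illustrative only and not part of the series: it assumes flux-core's libidset (<flux/idset.h>) and jansson are available, and the R_lite fragment is an invented two-entry example in which the rank sets carry different core ids, the shape that previously caused all but the last "rank" idset to be discarded.

/*
 * Minimal sketch of the accumulation pattern from patch 1 (illustrative,
 * not part of the series): merge the "rank" idset of every R_lite entry
 * into one idset rather than keeping only the last entry's ranks.
 * Build roughly as: cc sketch.c $(pkg-config --cflags --libs flux-idset jansson)
 */
#include <stdio.h>
#include <stdlib.h>
#include <jansson.h>
#include <flux/idset.h>

int main (void)
{
    /* Invented R_lite with per-rank core ids that differ between entries */
    const char *R_lite =
        "[{\"rank\":\"0-1\",\"children\":{\"core\":\"0-3\"}},"
        " {\"rank\":\"2-3\",\"children\":{\"core\":\"0-1\"}}]";
    json_t *rlite = json_loads (R_lite, 0, NULL);
    struct idset *r_ids = idset_create (0, IDSET_FLAG_AUTOGROW);
    json_t *entry;
    size_t index;
    char *s;

    if (!rlite || !r_ids)
        return 1;
    json_array_foreach (rlite, index, entry) {
        const char *ranks;
        size_t len;
        /* "s%" unpacks the string together with its length */
        if (json_unpack (entry, "{s:s%}", "rank", &ranks, &len) < 0)
            return 1;
        /* accumulate into r_ids instead of overwriting it */
        if (idset_decode_add (r_ids, ranks, len, NULL) < 0)
            return 1;
    }
    s = idset_encode (r_ids, IDSET_FLAG_RANGE);
    printf ("ranks to remove: %s\n", s ? s : "(error)");  /* 0-3 here */
    free (s);
    idset_destroy (r_ids);
    json_decref (rlite);
    return 0;
}

Using idset_decode_add() against an IDSET_FLAG_AUTOGROW idset merges each entry in place, which avoids decoding every R_lite entry into a temporary idset and taking a union by hand.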