From 26cded4e59ec5051faec1ada8522717e6b5de33f Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Fri, 15 Nov 2024 06:37:12 -0800 Subject: [PATCH 1/6] job-manager: support partial-ok in hello request Problem: RFC 27 allows the scheduler to send a partial-ok flag in the hello request, and then receive partially allocated jobs in hello responses. If the hello request includes this flag, pass it on to housekeeping. For each partially released housekeeping job, include the 'free' idset in the response per RFC 27. --- src/modules/job-manager/alloc.c | 11 ++-- src/modules/job-manager/housekeeping.c | 73 +++++++++++++++++++------- src/modules/job-manager/housekeeping.h | 4 +- 3 files changed, 64 insertions(+), 24 deletions(-) diff --git a/src/modules/job-manager/alloc.c b/src/modules/job-manager/alloc.c index d5e9e68716e2..5e24a1d1e7aa 100644 --- a/src/modules/job-manager/alloc.c +++ b/src/modules/job-manager/alloc.c @@ -309,17 +309,22 @@ static void hello_cb (flux_t *h, { struct job_manager *ctx = arg; struct job *job; + int partial_ok = 0; /* N.B. no "state" is set in struct alloc after a hello msg, so do * not set ctx->alloc->sched_sender in here. Do so only in the * ready callback */ - if (flux_request_decode (msg, NULL, NULL) < 0) + if (flux_request_unpack (msg, NULL, "{s?b}", "partial-ok", &partial_ok) < 0 + && flux_request_decode (msg, NULL, NULL) < 0) goto error; if (!flux_msg_is_streaming (msg)) { errno = EPROTO; goto error; } - flux_log (h, LOG_DEBUG, "scheduler: hello"); + flux_log (h, + LOG_DEBUG, + "scheduler: hello%s", + partial_ok ? " +partial-ok" : ""); job = zhashx_first (ctx->active_jobs); while (job) { if (job->has_resources && !job->alloc_bypass) { @@ -334,7 +339,7 @@ static void hello_cb (flux_t *h, } job = zhashx_next (ctx->active_jobs); } - if (housekeeping_hello_respond (ctx->housekeeping, msg) < 0) + if (housekeeping_hello_respond (ctx->housekeeping, msg, partial_ok) < 0) goto error; if (flux_respond_error (h, msg, ENODATA, NULL) < 0) flux_log_error (h, "%s: flux_respond_error", __FUNCTION__); diff --git a/src/modules/job-manager/housekeeping.c b/src/modules/job-manager/housekeeping.c index 50e16f21a717..5c2224bc0337 100644 --- a/src/modules/job-manager/housekeeping.c +++ b/src/modules/job-manager/housekeeping.c @@ -78,6 +78,7 @@ #include #include #include +#include #ifdef HAVE_ARGZ_ADD #include #else @@ -110,11 +111,11 @@ struct allocation { flux_jobid_t id; struct rlist *rl; // R, diminished each time a subset is released struct idset *pending; // ranks in need of housekeeping + struct idset *free; // ranks that have been released to the scheduler struct housekeeping *hk; flux_watcher_t *timer; bool timer_armed; bool timer_expired; - int free_count; // number of releases double t_start; struct bulk_exec *bulk_exec; void *list_handle; @@ -142,6 +143,7 @@ static void allocation_destroy (struct allocation *a) int saved_errno = errno; rlist_destroy (a->rl); idset_destroy (a->pending); + idset_destroy (a->free); flux_watcher_destroy (a->timer); bulk_exec_destroy (a->bulk_exec); free (a); @@ -182,6 +184,7 @@ static struct allocation *allocation_create (struct housekeeping *hk, a->t_start = flux_reactor_now (flux_get_reactor (hk->ctx->h)); if (!(a->rl = rlist_from_json (R, NULL)) || !(a->pending = rlist_ranks (a->rl)) + || !(a->free = idset_create (idset_universe_size (a->pending), 0)) || !(a->timer = flux_timer_watcher_create (r, 0, 0., @@ -242,7 +245,8 @@ static void allocation_release (struct allocation *a) || !(rl = rlist_copy_ranks (a->rl, ranks)) || !(R = rlist_to_R (rl)) || alloc_send_free_request (ctx->alloc, R, a->id, final) < 0 - || rlist_remove_ranks (a->rl, ranks) < 0) { + || rlist_remove_ranks (a->rl, ranks) < 0 + || idset_add (a->free, ranks) < 0) { char *s = idset_encode (ranks, IDSET_FLAG_RANGE); flux_log (ctx->h, LOG_ERR, @@ -251,8 +255,6 @@ static void allocation_release (struct allocation *a) s ? s : "NULL"); free (s); } - else - a->free_count++; json_decref (R); rlist_destroy (rl); idset_destroy (ranks); @@ -465,36 +467,61 @@ int housekeeping_start (struct housekeeping *hk, return alloc_send_free_request (hk->ctx->alloc, R, id, true); } +static int set_idset_string (json_t *obj, const char *key, struct idset *ids) +{ + char *s; + json_t *o = NULL; + + if (!(s = idset_encode (ids, IDSET_FLAG_RANGE)) + || !(o = json_string (s)) + || json_object_set_new (obj, key, o) < 0) { + json_decref (o); + free (s); + return -1; + } + free (s); + return 0; +} + static int housekeeping_hello_respond_one (struct housekeeping *hk, const flux_msg_t *msg, struct allocation *a, + bool partial_ok, flux_error_t *error) { struct job *job; + json_t *payload = NULL; - if (a->free_count > 0) { - errprintf (error, "partial release is not supported by RFC 27 hello"); - goto error; + if (!idset_empty (a->free) && !partial_ok) { + errprintf (error, + "scheduler does not support restart with partially" + " released resources"); + return -1; } if (!(job = zhashx_lookup (hk->ctx->inactive_jobs, &a->id)) && !(job = zhashx_lookup (hk->ctx->active_jobs, &a->id))) { errprintf (error, "the job could not be looked up during RFC 27 hello"); - goto error; + return -1; } - if (flux_respond_pack (hk->ctx->h, - msg, - "{s:I s:I s:I s:f}", - "id", job->id, - "priority", job->priority, - "userid", (json_int_t)job->userid, - "t_submit", job->t_submit) < 0) { - errprintf (error, - "the RFC 27 hello response could not be sent: %s", - strerror (errno)); + if (!(payload = json_pack ("{s:I s:I s:I s:f}", + "id", job->id, + "priority", job->priority, + "userid", (json_int_t)job->userid, + "t_submit", job->t_submit))) goto error; + if (!idset_empty (a->free)) { + if (set_idset_string (payload, "free", a->free) < 0) + goto error; } + if (flux_respond_pack (hk->ctx->h, msg, "O", payload) < 0) + goto error; + json_decref (payload); return 0; error: + errprintf (error, + "failed to send scheduler HELLO handshake: %s", + strerror (errno)); + json_decref (payload); return -1; } @@ -513,14 +540,20 @@ static void kill_continuation (flux_future_t *f, void *arg) * allocations. Send remaining housekeeping tasks a SIGTERM, log an error, * and delete the allocation. */ -int housekeeping_hello_respond (struct housekeeping *hk, const flux_msg_t *msg) +int housekeeping_hello_respond (struct housekeeping *hk, + const flux_msg_t *msg, + bool partial_ok) { struct allocation *a; flux_error_t error; a = zlistx_first (hk->allocations); while (a) { - if (housekeeping_hello_respond_one (hk, msg, a, &error) < 0) { + if (housekeeping_hello_respond_one (hk, + msg, + a, + partial_ok, + &error) < 0) { char *ranks; char *hosts = NULL; flux_future_t *f; diff --git a/src/modules/job-manager/housekeeping.h b/src/modules/job-manager/housekeeping.h index a468d3967632..e38b3a24461d 100644 --- a/src/modules/job-manager/housekeeping.h +++ b/src/modules/job-manager/housekeeping.h @@ -31,7 +31,9 @@ int housekeeping_start (struct housekeeping *hk, * It should inform the scheduler about resources that are still allocated, * but no longer directly held by jobs. */ -int housekeeping_hello_respond (struct housekeeping *hk, const flux_msg_t *msg); +int housekeeping_hello_respond (struct housekeeping *hk, + const flux_msg_t *msg, + bool partial_ok); json_t *housekeeping_get_stats (struct housekeeping *hk); From b3d908a78b00af4e870811c83ec5274220bc2717 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Fri, 15 Nov 2024 06:42:26 -0800 Subject: [PATCH 2/6] libschedutil: add SCHEDUTIL_HELLO_PARTIAL_OK flag Problem: libschedutil provides no way for the scheduler to indicate that the partial-ok flag should be set in the hello request. Add the SCHEDUTIL_HELLO_PARTIAL_OK flag which is passed to schedutil_create(). --- src/common/libschedutil/hello.c | 14 +++++++++----- src/common/libschedutil/init.h | 1 + 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/common/libschedutil/hello.c b/src/common/libschedutil/hello.c index ca6460a4afa2..026c03d839b9 100644 --- a/src/common/libschedutil/hello.c +++ b/src/common/libschedutil/hello.c @@ -78,16 +78,20 @@ int schedutil_hello (schedutil_t *util) { flux_future_t *f; int rc = -1; + int partial_ok = 0; if (!util || !util->ops->hello) { errno = EINVAL; return -1; } - if (!(f = flux_rpc (util->h, - "job-manager.sched-hello", - NULL, - FLUX_NODEID_ANY, - FLUX_RPC_STREAMING))) + if ((util->flags & SCHEDUTIL_HELLO_PARTIAL_OK)) + partial_ok = 1; + if (!(f = flux_rpc_pack (util->h, + "job-manager.sched-hello", + FLUX_NODEID_ANY, + FLUX_RPC_STREAMING, + "{s:b}", + "partial-ok", partial_ok))) return -1; while (1) { const flux_msg_t *msg; diff --git a/src/common/libschedutil/init.h b/src/common/libschedutil/init.h index a0903a26c5e5..d7e6b4148459 100644 --- a/src/common/libschedutil/init.h +++ b/src/common/libschedutil/init.h @@ -23,6 +23,7 @@ typedef struct schedutil_ctx schedutil_t; enum schedutil_flags { SCHEDUTIL_FREE_NOLOOKUP = 1, // now the default so this flag is ignored + SCHEDUTIL_HELLO_PARTIAL_OK = 2, }; /* Create a handle for the schedutil convenience library. From 92cd4be1146b45389d49e3c1a980b86d0da6218f Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Tue, 19 Nov 2024 08:50:24 -0800 Subject: [PATCH 3/6] libschedutil: support hello 'free' key Problem: when processing hello responses, all schedulers now need to process R - free for partial releases. As a convenience, change the libschedutil hello callback to subtract the free idset from the R it fetched from the KVS. Note that the scheduling key, if present, remains the full object which is opaque to flux-core. --- src/common/Makefile.am | 3 ++- src/common/libschedutil/hello.c | 34 ++++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/common/Makefile.am b/src/common/Makefile.am index dcabbe2ef396..9fe7827818fa 100644 --- a/src/common/Makefile.am +++ b/src/common/Makefile.am @@ -138,7 +138,8 @@ libflux_idset_la_LDFLAGS = \ libflux_schedutil_la_SOURCES = libflux_schedutil_la_LIBADD = \ $(builddir)/libschedutil/libschedutil.la \ - $(builddir)/libczmqcontainers/libczmqcontainers.la \ + $(builddir)/librlist/librlist.la \ + libflux-internal.la \ libflux-core.la \ $(JANSSON_LIBS) libflux_schedutil_la_LDFLAGS = \ diff --git a/src/common/libschedutil/hello.c b/src/common/libschedutil/hello.c index 026c03d839b9..9ae24bf05bbf 100644 --- a/src/common/libschedutil/hello.c +++ b/src/common/libschedutil/hello.c @@ -15,6 +15,9 @@ #include #include "src/common/libjob/idf58.h" +#include "src/common/librlist/rlist.h" +#include "src/common/libutil/errno_safe.h" + #include "schedutil_private.h" #include "init.h" #include "hello.h" @@ -39,6 +42,27 @@ static void raise_exception (flux_t *h, flux_jobid_t id, const char *note) flux_future_destroy (f); } +static const char *create_partial_R (const flux_msg_t *msg, + const char *R_orig, + const char *free_ranks) +{ + struct idset *ids; + struct rlist *rl = NULL; + char *R_new = NULL; + + if (!(ids = idset_decode (free_ranks)) + || !(rl = rlist_from_R (R_orig)) + || rlist_remove_ranks (rl, ids) < 0 + || !(R_new = rlist_encode (rl)) + || flux_msg_aux_set (msg, NULL, R_new, (flux_free_f)free) < 0) { + ERRNO_SAFE_WRAP (free, R_new); + R_new = NULL; + } + rlist_destroy (rl); + idset_destroy (ids); + return R_new; +} + static int schedutil_hello_job (schedutil_t *util, const flux_msg_t *msg) { @@ -46,8 +70,12 @@ static int schedutil_hello_job (schedutil_t *util, flux_future_t *f = NULL; const char *R; flux_jobid_t id; + const char *free_ranks = NULL; - if (flux_msg_unpack (msg, "{s:I}", "id", &id) < 0) + if (flux_msg_unpack (msg, + "{s:I s?s}", + "id", &id, + "free", &free_ranks) < 0) goto error; if (flux_job_kvs_key (key, sizeof (key), id, "R") < 0) { errno = EPROTO; @@ -57,6 +85,10 @@ static int schedutil_hello_job (schedutil_t *util, goto error; if (flux_kvs_lookup_get (f, &R) < 0) goto error; + if (free_ranks) { + if (!(R = create_partial_R (msg, R, free_ranks))) + goto error; + } if (util->ops->hello (util->h, msg, R, From 3d2170f3cd8028863b289a5c0e0850d54d393262 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Sat, 16 Nov 2024 06:37:08 -0800 Subject: [PATCH 4/6] sched-simple: support partial hello responses Problem: sched-simple does not support partial hello responses. Set the SCHEDUTIL_HELLO_PARTIAL_OK flag. Add a 'test-hello-nopartial' module option to get the old behavior. Set test-hello-nopartial in the current test of partial housekeeping release. --- src/modules/sched-simple/sched.c | 17 +++++++++++++---- t/t2226-housekeeping.t | 7 +++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/modules/sched-simple/sched.c b/src/modules/sched-simple/sched.c index e6dcd84da714..9b61dd1f1dc9 100644 --- a/src/modules/sched-simple/sched.c +++ b/src/modules/sched-simple/sched.c @@ -184,6 +184,8 @@ static struct simple_sched * simple_sched_create (void) * concurrency being excessively large. */ ss->alloc_limit = 8; + + ss->schedutil_flags = SCHEDUTIL_HELLO_PARTIAL_OK; return ss; } @@ -544,24 +546,28 @@ static int hello_cb (flux_t *h, unsigned int priority; uint32_t userid; double t_submit; + const char *free_ranks = NULL; if (flux_msg_unpack (msg, - "{s:I s:i s:i s:f}", + "{s:I s:i s:i s:f s?s}", "id", &id, "priority", &priority, "userid", &userid, - "t_submit", &t_submit) < 0) { + "t_submit", &t_submit, + "free", &free_ranks) < 0) { flux_log_error (h, "hello: invalid hello payload"); return -1; } flux_log (h, LOG_DEBUG, - "hello: id=%s priority=%u userid=%u t_submit=%0.1f", + "hello: id=%s priority=%u userid=%u t_submit=%0.1f %s%s", idf58 (id), priority, (unsigned int)userid, - t_submit); + t_submit, + free_ranks ? "free=" : "", + free_ranks ? free_ranks : ""); alloc = rlist_from_R (R); if (!alloc) { @@ -958,6 +964,9 @@ static int process_args (flux_t *h, struct simple_sched *ss, else if (streq (argv[i], "test-free-nolookup")) { ss->schedutil_flags |= SCHEDUTIL_FREE_NOLOOKUP; } + else if (streq (argv[i], "test-hello-nopartial")) { + ss->schedutil_flags &= ~SCHEDUTIL_HELLO_PARTIAL_OK; + } else { flux_log_error (h, "Unknown module option: '%s'", argv[i]); errno = EINVAL; diff --git a/t/t2226-housekeeping.t b/t/t2226-housekeeping.t index ec9181659d9b..b13e9ddfcdb1 100755 --- a/t/t2226-housekeeping.t +++ b/t/t2226-housekeeping.t @@ -332,8 +332,7 @@ test_expect_success 'flux resource list shows 0 nodes allocated' ' test $(flux resource list -s allocated -no {nnodes}) -eq 0 ' # The following tests exercise recovery from RFC 27 hello protocol -# with partial release. Once partial release is added to RFC 27, these -# tests should be removed or changed. +# with partial release. test_expect_success 'configure housekeeping with immediate release' ' flux config load <<-EOT [job-manager.housekeeping] @@ -347,9 +346,9 @@ test_expect_success 'run job that uses 4 nodes to trigger housekeeping' ' test_expect_success 'housekeeping is running for 1 job' ' wait_for_running 1 ' -test_expect_success 'reload scheduler' ' +test_expect_success 'reload scheduler without partial hello capability' ' flux dmesg -C && - flux module reload -f sched-simple && + flux module reload -f sched-simple test-hello-nopartial && flux dmesg -H ' test_expect_success 'wait for housekeeping to finish' ' From daefc7c955094188de9cafda228564247aa4384f Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Sat, 16 Nov 2024 10:46:59 -0800 Subject: [PATCH 5/6] testsuite: cover hello with partial allocation Problem: there is no coverage of reloading the scheduler with partially released jobs in housekeeping. Add a test. --- t/t2226-housekeeping.t | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/t/t2226-housekeeping.t b/t/t2226-housekeeping.t index b13e9ddfcdb1..3745dafa0d4f 100755 --- a/t/t2226-housekeeping.t +++ b/t/t2226-housekeeping.t @@ -27,6 +27,11 @@ kill_ranks () { flux housekeeping kill --targets=$1 --signal=$2 } +# Usage: straggler_count +straggler_count () { + flux housekeeping list -no {nnodes} +} + # Note: the hand off of resources to housekeeping occurs just before the job # becomes inactive, therefore it is safe to assume that housekeeping has run # for the job if it is enclosed between successful 'wait_for_running 0' calls. @@ -42,6 +47,16 @@ wait_for_running () { done } +# Usage: wait_for_straggler_count count +wait_for_straggler_count () { + count=0 + while test $(straggler_count) -gt $1; do + count=$(($count+1)); + test $count -eq 300 && return 1 # max 300 * 0.1s sleep = 30s + sleep 0.1 + done +} + test_expect_success 'flux-housekeeping utility exists' ' flux housekeeping list --help && flux housekeeping kill --help @@ -343,8 +358,9 @@ test_expect_success 'configure housekeeping with immediate release' ' test_expect_success 'run job that uses 4 nodes to trigger housekeeping' ' flux run -N4 true ' -test_expect_success 'housekeeping is running for 1 job' ' - wait_for_running 1 +test_expect_success 'housekeeping completed except for one straggler' ' + wait_for_running 1 && + wait_for_straggler_count 1 ' test_expect_success 'reload scheduler without partial hello capability' ' flux dmesg -C && @@ -357,4 +373,26 @@ test_expect_success 'wait for housekeeping to finish' ' test_expect_success 'housekeeping jobs were terminated due to sched reload' ' flux dmesg | grep "housekeeping:.*will be terminated" ' +test_expect_success 'no node are allocated' ' + test $(flux resource list -s allocated -no {nnodes}) -eq 0 && + test $(FLUX_RESOURCE_LIST_RPC=sched.resource-status \ + flux resource list -s allocated -no {nnodes}) -eq 0 +' +test_expect_success 'run job that uses 4 nodes to trigger housekeeping' ' + flux run -N4 true +' +test_expect_success 'housekeeping completed except for one straggler' ' + wait_for_running 1 && + wait_for_straggler_count 1 +' +test_expect_success 'reload scheduler WITH partial hello capability' ' + flux dmesg -C && + flux module reload -f sched-simple && + flux dmesg -H +' +test_expect_success 'one node is allocated' ' + test $(flux resource list -s allocated -no {nnodes}) -eq 1 && + test $(FLUX_RESOURCE_LIST_RPC=sched.resource-status \ + flux resource list -s allocated -no {nnodes}) -eq 1 +' test_done From f580637d745baa95bac61c51648163f2288bfff9 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Tue, 19 Nov 2024 09:44:29 -0800 Subject: [PATCH 6/6] sched-simple: improve error log message Problem: when the hello protocol cannot process a job, it logs the name of the wrong rlist function. Make the log message a little more high level. --- src/modules/sched-simple/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/modules/sched-simple/sched.c b/src/modules/sched-simple/sched.c index 9b61dd1f1dc9..04d7bd9c1192 100644 --- a/src/modules/sched-simple/sched.c +++ b/src/modules/sched-simple/sched.c @@ -576,7 +576,7 @@ static int hello_cb (flux_t *h, } s = rlist_dumps (alloc); if ((rc = rlist_set_allocated (ss->rlist, alloc)) < 0) - flux_log_error (h, "hello: rlist_remove (%s)", s); + flux_log_error (h, "hello: alloc %s", s); else flux_log (h, LOG_DEBUG, "hello: alloc %s", s); free (s);