From 257377216d79e25ac245fd38b170553ad3630064 Mon Sep 17 00:00:00 2001
From: Jim Garlick
Date: Fri, 27 Jan 2023 10:33:20 -0800
Subject: [PATCH] libschedutil: handle hello failure gracefully

Problem: if scheduler cannot reallocate resources to a running
job, the scheduler interface is torn down, requiring sys admin
intervention.  This was seen in conjunction with
flux-framework/flux-sched#992.

It's not really necessary for this to be fatal to the instance.
Raise a fatal exception on the job and let it be cleaned up in
the usual way.
---
 src/common/libschedutil/hello.c | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/common/libschedutil/hello.c b/src/common/libschedutil/hello.c
index 49001d31b557..3c3ebfa046ca 100644
--- a/src/common/libschedutil/hello.c
+++ b/src/common/libschedutil/hello.c
@@ -18,6 +18,34 @@
 #include "init.h"
 #include "hello.h"
 
+
+/* Raise a "scheduler" exception with severity 0 (fatal) on job 'id',
+ * using 'note' as the exception note, then wait for the response.
+ * A failure to raise the exception is logged and otherwise ignored.
+ */
+static void raise_exception (flux_t *h, flux_jobid_t id, const char *note)
+{
+    flux_future_t *f;
+
+    flux_log (h,
+              LOG_INFO,
+              "raising fatal exception on running job id=%ju",
+              (uintmax_t)id);
+
+    if (!(f = flux_job_raise (h, id, "scheduler", 0, note))
+        || flux_future_get (f, NULL) < 0) {
+        /* Use flux_log() here: flux_log_error() appends strerror (errno)
+         * on its own, which would repeat the error text in the message.
+         */
+        flux_log (h,
+                  LOG_ERR,
+                  "error raising fatal exception on %ju: %s",
+                  (uintmax_t)id,
+                  future_strerror (f, errno));
+    }
+    flux_future_destroy (f);
+}
+
 static int schedutil_hello_job (schedutil_t *util,
                                 const flux_msg_t *msg)
 {
@@ -40,7 +68,7 @@ static int schedutil_hello_job (schedutil_t *util,
                           msg,
                           R,
                           util->cb_arg) < 0)
-        goto error;
+        raise_exception (util->h, id, "failed to reallocate R for running job");
     flux_future_destroy (f);
     return 0;
 error: