Skip to content

Commit

Permalink
job-manager: include R in sched.free request
Browse files Browse the repository at this point in the history
Problem: R has to be looked up from the KVS in the sched.free
request handler, but now that the job manager caches R, this
is an unnecessary extra step.

Add R to the sched.free request payload.

Note that the `R.scheduling` key is not included.  The current design of
Fluxion in which `R.scheduling` may contain a voluminous JGF object made
caching this part of R impractical.

Change libschedutil so that
- the sched.free message handler never looks up R in the kvs
- the free callback always sets its `R` argument to NULL
- the SCHEDUTIL_FREE_NOLOOKUP flag is a no-op

Update sched-simple's free callback to unpack R from the message
instead of decoding the `R` arugment.

Note that Fluxion sets SCHEDUTIL_FREE_NOLOOKUP so it already expects
the free callback's R argument to be NULL.  Although this change increases
the size of sched.free payloads with data that Fluxion currently does not
use, the ranks in R will be required by Fluxion in the future to identify
resource subsets for partial release (flux-framework/flux-sched#1151).

This change should be accompanied by an update to RFC 27.

Update sched-simple unit test.

Fixes flux-framework#5775
  • Loading branch information
garlick committed Mar 11, 2024
1 parent 6bbe29c commit be41aba
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 75 deletions.
2 changes: 1 addition & 1 deletion src/common/libschedutil/init.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ extern "C" {
typedef struct schedutil_ctx schedutil_t;

enum schedutil_flags {
SCHEDUTIL_FREE_NOLOOKUP = 1, // ops->free() will be called with R=NULL
SCHEDUTIL_FREE_NOLOOKUP = 1, // now the default so this flag is ignored
};

/* Create a handle for the schedutil convenience library.
Expand Down
59 changes: 1 addition & 58 deletions src/common/libschedutil/ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,73 +45,16 @@ static void cancel_cb (flux_t *h,
util->ops->cancel (h, msg, util->cb_arg);
}

static void free_continuation (flux_future_t *f, void *arg)
{
schedutil_t *util = arg;
const flux_msg_t *msg = flux_future_aux_get (f, "schedutil::msg");
flux_t *h = util->h;
const char *R;

if (flux_kvs_lookup_get (f, &R) < 0) {
flux_log_error (h, "sched.free lookup R");
goto error;
}
if (schedutil_remove_outstanding_future (util, f) < 0)
flux_log_error (h, "sched.free unable to remove outstanding future");
util->ops->free (h, msg, R, util->cb_arg);
flux_future_destroy (f);
return;
error:
flux_log_error (h, "sched.free");
if (flux_respond_error (h, msg, errno, NULL) < 0)
flux_log_error (h, "sched.free respond_error");
flux_future_destroy (f);
}

static void free_cb (flux_t *h,
flux_msg_handler_t *mh,
const flux_msg_t *msg,
void *arg)
{
schedutil_t *util = arg;
flux_jobid_t id;
flux_future_t *f;
char key[64];

assert (util);

if (util->flags & SCHEDUTIL_FREE_NOLOOKUP) {
util->ops->free (h, msg, NULL, util->cb_arg);
return;
}

if (flux_request_unpack (msg, NULL, "{s:I}", "id", &id) < 0)
goto error;
if (flux_job_kvs_key (key, sizeof (key), id, "R") < 0) {
errno = EPROTO;
goto error;
}
if (!(f = flux_kvs_lookup (h, NULL, 0, key)))
goto error;
if (flux_future_aux_set (f,
"schedutil::msg",
(void *)flux_msg_incref (msg),
(flux_free_f)flux_msg_decref) < 0) {
flux_msg_decref (msg);
goto error_future;
}
if (flux_future_then (f, -1, free_continuation, util) < 0)
goto error_future;
if (schedutil_add_outstanding_future (util, f) < 0)
flux_log_error (h, "sched.free unable to add outstanding future");

return;
error_future:
flux_future_destroy (f);
error:
flux_log_error (h, "sched.free");
if (flux_respond_error (h, msg, errno, NULL) < 0)
flux_log_error (h, "sched.free respond_error");
util->ops->free (h, msg, NULL, util->cb_arg);
}

static void prioritize_cb (flux_t *h,
Expand Down
12 changes: 4 additions & 8 deletions src/common/libschedutil/ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,12 @@ struct schedutil_ops {
const flux_msg_t *msg,
void *arg);

/* Callback for a free request. R is looked up as a convenience.
* 'msg' and 'R' are only valid for the duration of this call.
/* Callback for a free request.
* Currently 'R' is always NULL and may be unpacked as a JSON object
* under the "R" key in 'msg' if needed.
* 'msg' is only valid for the duration of this call.
* You should either respond to the request immediately (see
* free.h), or cache this information for later response.
*
* If R is unneeded, it is recommended that the
* SCHEDUTIL_FREE_NOLOOKUP flag be set in schedutil_create(). By
* setting this flag, R will not be looked up, saving a KVS lookup
* on every invocation of this callback. R will be set to NULL
* instead.
*/
void (*free)(flux_t *h,
const flux_msg_t *msg,
Expand Down
5 changes: 4 additions & 1 deletion src/modules/job-manager/alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,10 @@ int free_request (struct alloc *alloc, struct job *job)

if (!(msg = flux_request_encode ("sched.free", NULL)))
return -1;
if (flux_msg_pack (msg, "{s:I}", "id", job->id) < 0)
if (flux_msg_pack (msg,
"{s:I s:O}",
"id", job->id,
"R", job->R_redacted) < 0)
goto error;
if (flux_send (alloc->ctx->h, msg, 0) < 0)
goto error;
Expand Down
16 changes: 10 additions & 6 deletions src/modules/sched-simple/sched.c
Original file line number Diff line number Diff line change
Expand Up @@ -336,13 +336,16 @@ static void check_cb (flux_reactor_t *r, flux_watcher_t *w,
}
}

static int try_free (flux_t *h, struct simple_sched *ss, const char *R)
static int try_free (flux_t *h, struct simple_sched *ss, json_t *R)
{
int rc = -1;
char *r = NULL;
struct rlist *alloc = rlist_from_R (R);
json_error_t error;
struct rlist *alloc = rlist_from_json (R, &error);
if (!alloc) {
flux_log_error (h, "free: unable to parse R=%s", R);
char *s = json_dumps (R, JSON_COMPACT);
flux_log_error (h, "free: unable to parse R=%s: %s", s, error.text);
ERRNO_SAFE_WRAP (free, s);
return -1;
}
r = rlist_dumps (alloc);
Expand All @@ -355,12 +358,13 @@ static int try_free (flux_t *h, struct simple_sched *ss, const char *R)
return rc;
}

void free_cb (flux_t *h, const flux_msg_t *msg, const char *R, void *arg)
void free_cb (flux_t *h, const flux_msg_t *msg, const char *R_str, void *arg)
{
struct simple_sched *ss = arg;
json_t *R;

if (!R) {
flux_log (h, LOG_ERR, "free: R is NULL");
if (flux_request_unpack (msg, NULL, "{s:o}", "R", &R) < 0) {
flux_log (h, LOG_ERR, "free: error unpacking sched.free request");
if (flux_respond_error (h, msg, EINVAL, NULL) < 0)
flux_log_error (h, "free_cb: flux_respond_error");
return;
Expand Down
5 changes: 4 additions & 1 deletion t/t2300-sched-simple.t
Original file line number Diff line number Diff line change
Expand Up @@ -242,11 +242,14 @@ test_expect_success 'sched-simple: cancel jobs' '
test_expect_success 'sched-simple: reload sched-simple to cover free flags' '
flux module reload sched-simple test-free-nolookup
'
# That SCHEDUTIL_FREE_NOLOOKUP is now a no-op but since flux-sched-0.33.0
# still uses it, ensure that free still works when it is used
test_expect_success 'sched-simple: submit job and cancel it' '
flux dmesg --clear &&
flux job submit basic.json >job19.id &&
flux job wait-event --timeout=5.0 $(cat job19.id) alloc &&
flux cancel $(cat job19.id) &&
$dmesg_grep -t 10 "free: R is NULL"
$dmesg_grep -t 10 "free: rank0/core0"
'
test_expect_success 'sched-simple: remove sched-simple and cancel jobs' '
flux module remove sched-simple &&
Expand Down

0 comments on commit be41aba

Please sign in to comment.