Skip to content

Commit

Permalink
job-manager: add housekeeping subsystem
Browse files Browse the repository at this point in the history
Problem: jobs get stuck in CLEANUP state while long epilog
scripts run, causing sadness and idling resources.

Introduce a new type of epilog script called "housekeeping" that is
ostensibly job independent.  Instead of freeing resources directly
to the scheduler, jobs free resources to housekeeping, post their free
event, and may reach INACTIVE state.  Meanwhile, housekeeping can run
a script on the allocated resources and return the resources to the
scheduler when complete. The resources are still allocated to the job
as far as the scheduler is concerned while housekeeping runs.  However,
since the job has transitioned to INACTIVE, the flux-accounting plugin
will decrement the running job count for the user and stop billing
the user for the resources.  The 'flux resource list' utility shows the
resources as allocated.

By default, resources are released all at once to the scheduler, as before.
However, if configured, resources can be freed to the scheduler immediately
as they complete housekeeping on each execution target, or a timer can be
started on completion of the first target, and when the timer expires, all
the targets that have completed thus far are freed in one go. Following that,
resources are freed to the scheduler immediately as they complete.

This works with sched-simple without changes, with the exception that the
hello protocol does not currently support partial release so, as noted in
the code, housekeeping and a new job could overlap when the scheduler is
reloaded on a live system.  Some RFC 27 work is needed to resolve this.

The Fluxion scheduler does not currently support partial release
(flux-framework/flux-sched#1151).  But as discussed over there, the
combination of receiving an R fragment and a jobid in the free request
should be sufficient to get that working.
  • Loading branch information
garlick committed Mar 21, 2024
1 parent 92b4396 commit fae7684
Show file tree
Hide file tree
Showing 9 changed files with 644 additions and 27 deletions.
1 change: 1 addition & 0 deletions src/modules/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ job_manager_la_SOURCES =
job_manager_la_LIBADD = \
$(builddir)/job-manager/libjob-manager.la \
$(top_builddir)/src/common/libjob/libjob.la \
$(top_builddir)/src/common/libsubprocess/libsubprocess.la \
$(top_builddir)/src/common/libflux-internal.la \
$(top_builddir)/src/common/libflux-core.la \
$(top_builddir)/src/common/libflux-optparse.la \
Expand Down
3 changes: 3 additions & 0 deletions src/modules/job-manager/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ libjob_manager_la_SOURCES = \
kill.c \
alloc.h \
alloc.c \
housekeeping.h \
housekeeping.c \
start.h \
start.c \
list.h \
Expand Down Expand Up @@ -125,6 +127,7 @@ TESTS = \
test_ldadd = \
libjob-manager.la \
$(top_builddir)/src/common/libtap/libtap.la \
$(top_builddir)/src/common/libsubprocess/libsubprocess.la \
$(top_builddir)/src/common/librlist/librlist.la \
$(top_builddir)/src/common/libjob/libjob.la \
$(top_builddir)/src/common/libflux-core.la \
Expand Down
37 changes: 15 additions & 22 deletions src/modules/job-manager/alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include "annotate.h"
#include "raise.h"
#include "queue.h"
#include "housekeeping.h"

struct res_acct {
struct rlist *allocated;
Expand Down Expand Up @@ -209,19 +210,18 @@ static void interface_teardown (struct alloc *alloc, char *s, int errnum)
}
}

/* Send sched.free request for job.
* Update flags.
/* Send sched.free request.
*/
int free_request (struct alloc *alloc, struct job *job)
int free_request (struct alloc *alloc, json_t *R, flux_jobid_t id)
{
flux_msg_t *msg;

if (!(msg = flux_request_encode ("sched.free", NULL)))
return -1;
if (flux_msg_pack (msg,
"{s:I s:O}",
"id", job->id,
"R", job->R_redacted) < 0)
"id", id,
"R", R) < 0)
goto error;
if (flux_send (alloc->ctx->h, msg, 0) < 0)
goto error;
Expand Down Expand Up @@ -490,6 +490,8 @@ static void hello_cb (flux_t *h,
}
job = zhashx_next (ctx->active_jobs);
}
if (housekeeping_hello_respond (ctx->housekeeping, msg) < 0)
goto error;
if (flux_respond_error (h, msg, ENODATA, NULL) < 0)
flux_log_error (h, "%s: flux_respond_error", __FUNCTION__);
return;
Expand Down Expand Up @@ -653,32 +655,23 @@ static void check_cb (flux_reactor_t *r,
NULL);
}

/* called from event_job_action() FLUX_JOB_STATE_CLEANUP */
int alloc_send_free_request (struct alloc *alloc, struct job *job)
int alloc_send_free_request (struct alloc *alloc, json_t *R, flux_jobid_t id)
{
flux_error_t error;

assert (job->state == FLUX_JOB_STATE_CLEANUP);
if (alloc->ready) {
if (free_request (alloc, job) < 0)
if (free_request (alloc, R, id) < 0)
return -1;
if ((job->flags & FLUX_JOB_DEBUG))
(void)event_job_post_pack (alloc->ctx->event,
job,
"debug.free-request",
0,
NULL);
}
/* event_job_action() posts the "free" event after calling this function,
* so despite a no-op above if the scheduler isn't loaded, we account for
* the resources anyway. Since "free" clears job->has_resources, the job
* will not be presented in hello responses to the scheduler at reload.
}
/* Account for resource release even if the scheduler was offline above.
* When it comes back online, these resources will not be presented as
* allocated via the hello protocol, so they are effectively free.
*/
if (acct_free (alloc, job->R_redacted, &error) < 0) {
if (acct_free (alloc, R, &error) < 0) {
flux_log (alloc->ctx->h,
LOG_ERR,
"%s: %s",
idf58 (job->id),
idf58 (id),
error.text);
}
return 0;
Expand Down
6 changes: 3 additions & 3 deletions src/modules/job-manager/alloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#ifndef _FLUX_JOB_MANAGER_ALLOC_H
#define _FLUX_JOB_MANAGER_ALLOC_H

#include <jansson.h>
#include <flux/core.h>

#include "job.h"
Expand Down Expand Up @@ -42,10 +43,9 @@ int alloc_queue_count (struct alloc *alloc);
*/
int alloc_pending_count (struct alloc *alloc);

/* Call from CLEANUP state to release resources.
* This function is a no-op if job->free_pending is set.
/* Release resources back to the scheduler.
*/
int alloc_send_free_request (struct alloc *alloc, struct job *job);
int alloc_send_free_request (struct alloc *alloc, json_t *R, flux_jobid_t id);

/* List pending jobs
*/
Expand Down
7 changes: 5 additions & 2 deletions src/modules/job-manager/event.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
#include "ccan/str/str.h"

#include "alloc.h"
#include "housekeeping.h"
#include "start.h"
#include "drain.h"
#include "journal.h"
Expand Down Expand Up @@ -320,15 +321,17 @@ int event_job_action (struct event *event, struct job *job)
/* N.B. start_pending indicates that the start request is still
* expecting responses. The final response is the 'release'
* response with final=true. Thus once the flag is clear,
* it is safe to release all resources to the scheduler.
* it is safe for the job to release its resources to housekeeping.
*/
if (job->has_resources
&& !job_event_is_queued (job, "epilog-start")
&& !job->perilog_active
&& !job->alloc_bypass
&& !job->start_pending
&& !job->free_posted) {
if (alloc_send_free_request (ctx->alloc, job) < 0)
if (housekeeping_start (ctx->housekeeping,
job->R_redacted,
job->id) < 0)
return -1;
if (event_job_post_pack (ctx->event, job, "free", 0, NULL) < 0)
return -1;
Expand Down
Loading

0 comments on commit fae7684

Please sign in to comment.