Skip to content

Commit

Permalink
job-manager: add housekeeping subsystem
Browse files Browse the repository at this point in the history
Problem: jobs get stuck in CLEANUP state while long epilog
scripts run, causing sadness and idling resources.

Introduce a new type of epilog script called "housekeeping" that is
ostensibly job independent.  Instead of freeing resources directly
to the scheduler, jobs free resources to housekeeping, post their free
event, and may reach INACTIVE state.  Meanwhile, housekeeping can run
a script on the allocated resources and return the resources to the
scheduler when complete. The resources are still allocated to the job
as far as the scheduler is concerned while housekeeping runs.  However,
since the job has transitioned to INACTIVE, the flux-accounting plugin
will decrement the running job count for the user and stop billing
the user for the resources.  The 'flux resource list' utility shows the
resources as allocated.

By default, resources are released all at once to the scheduler, as before.
However, if configured, resources can be freed to the scheduler immediately
as they complete housekeeping on each execution target, or a timer can be
started on completion of the first target, and when the timer expires, all
the targets that have completed thus far are freed in one go. Following that,
resources are freed to the scheduler immediately as they complete.

This works with sched-simple without changes, with the exception that the
hello protocol does not currently support partial release, so, as noted in
the code, housekeeping and a new job could overlap when the scheduler is
reloaded on a live system.  Some RFC 27 work is needed to resolve this.

The Fluxion scheduler does not currently support partial release
(flux-framework/flux-sched#1151).  But as discussed over there, the
combination of receiving an R fragment and a jobid in the free request
should be sufficient to get that working.
  • Loading branch information
garlick committed May 8, 2024
1 parent ac763d3 commit 9134f1e
Show file tree
Hide file tree
Showing 9 changed files with 632 additions and 17 deletions.
1 change: 1 addition & 0 deletions src/modules/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ job_manager_la_SOURCES =
job_manager_la_LIBADD = \
$(builddir)/job-manager/libjob-manager.la \
$(top_builddir)/src/common/libjob/libjob.la \
$(top_builddir)/src/common/libsubprocess/libsubprocess.la \
$(top_builddir)/src/common/libflux-internal.la \
$(top_builddir)/src/common/libflux-core.la \
$(top_builddir)/src/common/libflux-optparse.la \
Expand Down
3 changes: 3 additions & 0 deletions src/modules/job-manager/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ libjob_manager_la_SOURCES = \
kill.c \
alloc.h \
alloc.c \
housekeeping.h \
housekeeping.c \
start.h \
start.c \
list.h \
Expand Down Expand Up @@ -126,6 +128,7 @@ TESTS = \
test_ldadd = \
libjob-manager.la \
$(top_builddir)/src/common/libtap/libtap.la \
$(top_builddir)/src/common/libsubprocess/libsubprocess.la \
$(top_builddir)/src/common/librlist/librlist.la \
$(top_builddir)/src/common/libjob/libjob.la \
$(top_builddir)/src/common/libflux-core.la \
Expand Down
18 changes: 6 additions & 12 deletions src/modules/job-manager/alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include "annotate.h"
#include "raise.h"
#include "queue.h"
#include "housekeeping.h"

struct alloc {
struct job_manager *ctx;
Expand Down Expand Up @@ -105,8 +106,7 @@ static void interface_teardown (struct alloc *alloc, char *s, int errnum)
}
}

/* Send sched.free request for job.
* Update flags.
/* Send sched.free request.
*/
int free_request (struct alloc *alloc, flux_jobid_t id, json_t *R)
{
Expand Down Expand Up @@ -352,6 +352,8 @@ static void hello_cb (flux_t *h,
}
job = zhashx_next (ctx->active_jobs);
}
if (housekeeping_hello_respond (ctx->housekeeping, msg) < 0)
goto error;
if (flux_respond_error (h, msg, ENODATA, NULL) < 0)
flux_log_error (h, "%s: flux_respond_error", __FUNCTION__);
return;
Expand Down Expand Up @@ -515,19 +517,11 @@ static void check_cb (flux_reactor_t *r,
NULL);
}

/* called from event_job_action() FLUX_JOB_STATE_CLEANUP */
int alloc_send_free_request (struct alloc *alloc, struct job *job)
int alloc_send_free_request (struct alloc *alloc, json_t *R, flux_jobid_t id)
{
assert (job->state == FLUX_JOB_STATE_CLEANUP);
if (alloc->ready) {
if (free_request (alloc, job->id, job->R_redacted) < 0)
if (free_request (alloc, id, R) < 0)
return -1;
if ((job->flags & FLUX_JOB_DEBUG))
(void)event_job_post_pack (alloc->ctx->event,
job,
"debug.free-request",
0,
NULL);
}
return 0;
}
Expand Down
6 changes: 3 additions & 3 deletions src/modules/job-manager/alloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#ifndef _FLUX_JOB_MANAGER_ALLOC_H
#define _FLUX_JOB_MANAGER_ALLOC_H

#include <jansson.h>
#include <flux/core.h>

#include "job.h"
Expand Down Expand Up @@ -47,10 +48,9 @@ int alloc_queue_count (struct alloc *alloc);
*/
int alloc_pending_count (struct alloc *alloc);

/* Call from CLEANUP state to release resources.
* This function is a no-op if job->free_pending is set.
/* Release resources back to the scheduler.
*/
int alloc_send_free_request (struct alloc *alloc, struct job *job);
int alloc_send_free_request (struct alloc *alloc, json_t *R, flux_jobid_t id);

/* List pending jobs
*/
Expand Down
7 changes: 5 additions & 2 deletions src/modules/job-manager/event.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
#include "ccan/str/str.h"

#include "alloc.h"
#include "housekeeping.h"
#include "start.h"
#include "drain.h"
#include "journal.h"
Expand Down Expand Up @@ -320,15 +321,17 @@ int event_job_action (struct event *event, struct job *job)
/* N.B. start_pending indicates that the start request is still
* expecting responses. The final response is the 'release'
* response with final=true. Thus once the flag is clear,
* it is safe to release all resources to the scheduler.
* it is safe for the job to release its resources to housekeeping.
*/
if (job->has_resources
&& !job_event_is_queued (job, "epilog-start")
&& !job->perilog_active
&& !job->alloc_bypass
&& !job->start_pending
&& !job->free_posted) {
if (alloc_send_free_request (ctx->alloc, job) < 0)
if (housekeeping_start (ctx->housekeeping,
job->R_redacted,
job->id) < 0)
return -1;
if (event_job_post_pack (ctx->event, job, "free", 0, NULL) < 0)
return -1;
Expand Down
Loading

0 comments on commit 9134f1e

Please sign in to comment.