Skip to content

Commit

Permalink
job-manager: add housekeeping subsystem
Browse files Browse the repository at this point in the history
Problem: jobs get stuck in CLEANUP state while long epilog
scripts run, causing sadness and idling resources.

Introduce a new type of epilog script called "housekeeping" that runs
after the job.  Instead of freeing resources directly to the scheduler,
jobs free resources to housekeeping, post their free event, and may reach
INACTIVE state.  Meanwhile, housekeeping can run a script on the allocated
resources and return the resources to the scheduler when complete.  The
resources are still allocated to the job as far as the scheduler is
concerned while housekeeping runs.  However, since the job has transitioned
to INACTIVE, the flux-accounting plugin will decrement the running job
count for the user and stop billing the user for the resources.
The 'flux resource list' utility shows the resources as allocated.

By default, resources are released to the scheduler only after all ranks
complete housekeeping, as before.  However, if configured, resources can
be freed to the scheduler immediately as they complete housekeeping on
each execution target, or a timer can be started on completion of the
first target, and when the timer expires, all the targets that have
completed thus far are freed in one go. Following that, resources are
freed to the scheduler immediately as they complete.

This works with sched-simple without changes, with the exception that the
hello protocol does not currently support partial release so, as noted in
the code, housekeeping and a new job could overlap when the scheduler is
reloaded on a live system.  Some RFC 27 work is needed to resolve this.

The Fluxion scheduler does not currently support partial release
(flux-framework/flux-sched#1151).  But as discussed over there, the
combination of receiving an R fragment and a jobid in the free request
should be sufficient to get that working.
  • Loading branch information
garlick committed Jul 1, 2024
1 parent 46f8685 commit e73ccfc
Show file tree
Hide file tree
Showing 9 changed files with 944 additions and 25 deletions.
1 change: 1 addition & 0 deletions src/modules/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@ job_manager_la_SOURCES =
job_manager_la_LIBADD = \
$(builddir)/job-manager/libjob-manager.la \
$(top_builddir)/src/common/libjob/libjob.la \
$(top_builddir)/src/common/libsubprocess/libsubprocess.la \
$(top_builddir)/src/common/libflux-internal.la \
$(top_builddir)/src/common/libflux-core.la \
$(top_builddir)/src/common/libflux-optparse.la \
Expand Down
3 changes: 3 additions & 0 deletions src/modules/job-manager/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ libjob_manager_la_SOURCES = \
kill.c \
alloc.h \
alloc.c \
housekeeping.h \
housekeeping.c \
start.h \
start.c \
list.h \
Expand Down Expand Up @@ -126,6 +128,7 @@ TESTS = \
test_ldadd = \
libjob-manager.la \
$(top_builddir)/src/common/libtap/libtap.la \
$(top_builddir)/src/common/libsubprocess/libsubprocess.la \
$(top_builddir)/src/common/librlist/librlist.la \
$(top_builddir)/src/common/libjob/libjob.la \
$(top_builddir)/src/common/libflux-core.la \
Expand Down
24 changes: 10 additions & 14 deletions src/modules/job-manager/alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "annotate.h"
#include "raise.h"
#include "queue.h"
#include "housekeeping.h"

struct alloc {
struct job_manager *ctx;
Expand Down Expand Up @@ -99,8 +100,7 @@ static void interface_teardown (struct alloc *alloc, char *s, int errnum)
}
}

/* Send sched.free request for job.
* Update flags.
/* Send sched.free request.
*/
int free_request (struct alloc *alloc, flux_jobid_t id, json_t *R)
{
Expand Down Expand Up @@ -343,6 +343,8 @@ static void hello_cb (flux_t *h,
}
job = zhashx_next (ctx->active_jobs);
}
if (housekeeping_hello_respond (ctx->housekeeping, msg) < 0)
goto error;
if (flux_respond_error (h, msg, ENODATA, NULL) < 0)
flux_log_error (h, "%s: flux_respond_error", __FUNCTION__);
return;
Expand Down Expand Up @@ -504,20 +506,11 @@ static void check_cb (flux_reactor_t *r,
NULL);
}

/* called from event_job_action() FLUX_JOB_STATE_CLEANUP */
int alloc_send_free_request (struct alloc *alloc, struct job *job)
int alloc_send_free_request (struct alloc *alloc, json_t *R, flux_jobid_t id)
{
if (job->state != FLUX_JOB_STATE_CLEANUP)
return -1;
if (alloc->scheduler_is_online) {
if (free_request (alloc, job->id, job->R_redacted) < 0)
if (free_request (alloc, id, R) < 0)
return -1;
if ((job->flags & FLUX_JOB_DEBUG))
(void)event_job_post_pack (alloc->ctx->event,
job,
"debug.free-request",
0,
NULL);
}
return 0;
}
Expand Down Expand Up @@ -691,7 +684,8 @@ static void resource_status_cb (flux_t *h,
}
job = zhashx_first (alloc->ctx->active_jobs);
while (job) {
if (job->has_resources && job->R_redacted && !job->alloc_bypass) {
if ((job->has_resources && !job->free_posted)
&& job->R_redacted && !job->alloc_bypass) {
struct rlist *rl2;
json_error_t jerror;

Expand All @@ -711,6 +705,8 @@ static void resource_status_cb (flux_t *h,
}
job = zhashx_next (alloc->ctx->active_jobs);
}
if (housekeeping_stat_append (ctx->housekeeping, rl, &error) < 0)
goto error;
if (!(R = rlist_to_R (rl))) {
errprintf (&error, "error converting rlist to JSON");
goto error;
Expand Down
6 changes: 3 additions & 3 deletions src/modules/job-manager/alloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#ifndef _FLUX_JOB_MANAGER_ALLOC_H
#define _FLUX_JOB_MANAGER_ALLOC_H

#include <jansson.h>
#include <flux/core.h>

#include "job.h"
Expand Down Expand Up @@ -47,10 +48,9 @@ int alloc_queue_count (struct alloc *alloc);
*/
int alloc_pending_count (struct alloc *alloc);

/* Call from CLEANUP state to release resources.
* This function is a no-op if job->free_pending is set.
/* Release resources back to the scheduler.
*/
int alloc_send_free_request (struct alloc *alloc, struct job *job);
int alloc_send_free_request (struct alloc *alloc, json_t *R, flux_jobid_t id);

/* List pending jobs
*/
Expand Down
12 changes: 7 additions & 5 deletions src/modules/job-manager/event.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
#include "ccan/str/str.h"

#include "alloc.h"
#include "housekeeping.h"
#include "start.h"
#include "drain.h"
#include "journal.h"
Expand Down Expand Up @@ -320,17 +321,18 @@ int event_job_action (struct event *event, struct job *job)
/* N.B. start_pending indicates that the start request is still
* expecting responses. The final response is the 'release'
* response with final=true. Thus once the flag is clear,
* it is safe to release all resources to the scheduler.
* it is safe for the job to release its resources to housekeeping.
*/
if (job->has_resources
&& !job_event_is_queued (job, "epilog-start")
&& !job->perilog_active
&& !job->start_pending
&& !job->free_posted) {
if (!job->alloc_bypass) {
if (alloc_send_free_request (ctx->alloc, job) < 0)
return -1;
}
if (housekeeping_start (ctx->housekeeping,
job->R_redacted,
job->id,
job->userid) < 0)
return -1;
if (event_job_post_pack (ctx->event, job, "free", 0, NULL) < 0)
return -1;
job->free_posted = 1;
Expand Down
Loading

0 comments on commit e73ccfc

Please sign in to comment.