Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

run epilog even if job prolog fails or is canceled #6249

Merged
merged 5 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions doc/man5/flux-config-job-manager.rst
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,11 @@ prolog
while the prolog is active terminates the prolog. The default is true.

epilog
(optional) Table of configuration for a job-manager epilog. If configured,
the epilog is started at the job ``finish`` event, i.e. after all user
processes and job shells have terminated. The ``[job-manager.epilog]``
table supports the following keys:
(optional) Table of configuration for a job-manager epilog. If
configured, the epilog is started at the job ``finish`` event,
i.e. after all user processes and job shells have terminated, or after
prolog failure (in which case there will not be a job ``finish`` event).
The ``[job-manager.epilog]`` table supports the following keys:

command
(optional) An array of strings specifying the command to run. If
Expand Down
103 changes: 98 additions & 5 deletions src/modules/job-manager/plugins/perilog.c
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,8 @@
struct perilog_proc {
flux_plugin_t *p;
flux_jobid_t id;
uint32_t userid;
json_t *R;
bool prolog;
bool cancel_on_exception;
bool canceled;
Expand All @@ -115,6 +117,13 @@
char *failed_ranks;
};

/* Forward declaration: procdesc_run() is called from perilog_proc_finish()
 * to start an epilog after a failed prolog, but is defined further down
 * in this file.
 */
static struct perilog_proc *procdesc_run (flux_t *h,
flux_plugin_t *p,
struct perilog_procdesc *pd,
flux_jobid_t id,
uint32_t userid,
json_t *R);

static void timeout_cb (flux_reactor_t *r,
flux_watcher_t *w,
int revents,
Expand Down Expand Up @@ -251,13 +260,15 @@

static struct perilog_proc * perilog_proc_create (flux_plugin_t *p,
flux_jobid_t id,
uint32_t userid,
bool prolog)
{
struct perilog_proc *proc = calloc (1, sizeof (*proc));
if (proc == NULL)
return NULL;
proc->p = p;
proc->id = id;
proc->userid = id;
proc->prolog = prolog;
if (zhashx_insert (perilog_config.processes, &proc->id, proc) < 0) {
free (proc);
Expand All @@ -273,6 +284,7 @@
int saved_errno = errno;
idset_destroy (proc->ranks);
free (proc->failed_ranks);
json_decref (proc->R);
bulk_exec_destroy (proc->bulk_exec);
flux_future_destroy (proc->kill_f);
flux_future_destroy (proc->drain_f);
Expand Down Expand Up @@ -477,6 +489,88 @@
return f;
}

/* Return true if the prolog/epilog described by proc did not complete
 * successfully: it was canceled, it timed out, or bulk_exec_rc() reports
 * a value greater than zero for its bulk-exec handle.
 */
static bool perilog_proc_failed (struct perilog_proc *proc)
{
    return proc->canceled
        || proc->timedout
        || bulk_exec_rc (proc->bulk_exec) > 0;
}

/* Finish a completed prolog or epilog: post its finish event and delete
 * its perilog_proc via perilog_proc_delete().
 *
 * If proc is a prolog that failed (see perilog_proc_failed()) and an
 * epilog is configured, the epilog is started from here, since in that
 * case there will be no job finish event to trigger it.
 *
 * proc must not be used by the caller after this function returns.
 */
static void perilog_proc_finish (struct perilog_proc *proc)
{
    flux_t *h = flux_jobtap_get_flux (proc->p);
    flux_plugin_t *p = NULL;
    uint32_t userid = 0;
    flux_jobid_t id = 0;
    json_t *R = NULL;
    struct perilog_procdesc *pd = NULL;
    bool run_epilog = false;

    /* If a prolog was completing, and it failed in some way, then there
     * will be no finish event to trigger the epilog. However, an epilog
     * should still be run in case it is required to clean up or revert
     * something done by the prolog. So do that here.
     */
    if (proc->prolog
        && perilog_proc_failed (proc)
        && (pd = perilog_config.epilog)) {
        /* epilog process can't be started until prolog perilog_proc is
         * deleted, so capture necessary info here and set a boolean to
         * create the epilog before leaving this function.
         *
         * Hold our own reference on R: the prolog's reference is dropped
         * when proc is deleted below, before procdesc_run() uses R.
         */
        run_epilog = true;
        p = proc->p;
        id = proc->id;
        userid = proc->userid;
        R = json_incref (proc->R);

        /* The epilog-start event must be posted before the prolog-finish
         * event to avoid the job potentially going straight to INACTIVE
         * after the prolog-finish event is posted below
         */
        if (flux_jobtap_event_post_pack (p,
                                         id,
                                         "epilog-start",
                                         "{s:s}",
                                         "description",
                                         "job-manager.epilog") < 0) {
            flux_log_error (h,
                            "%s: failed to post epilog-start on prolog-finish",
                            idf58 (id));
            run_epilog = false;
        }
    }
    emit_finish_event (proc, proc->bulk_exec);
    perilog_proc_delete (proc);
    /* proc must not be dereferenced past this point; use the values
     * captured above (id, p, userid, R) instead.
     */

    if (run_epilog) {
        struct perilog_proc *epilog;

        if (!(epilog = procdesc_run (h, p, pd, id, userid, R))
            || flux_jobtap_job_aux_set (p,
                                        id,
                                        "perilog_proc",
                                        epilog,
                                        NULL) < 0) {
            flux_log_error (h,
                            "%s: failed to start epilog on prolog-finish",
                            idf58 (id));

            /* Since epilog-start event was emitted above, we must emit an
             * epilog-finish event to avoid hanging the job
             */
            if (flux_jobtap_epilog_finish (p,
                                           id,
                                           "job-manager.epilog",
                                           1) < 0) {
                flux_log_error (h,
                                "%s: failed to post epilog-finish event",
                                idf58 (id));
            }
            perilog_proc_delete (epilog);
        }
    }
    /* Drop the reference taken above (no-op when R is NULL). */
    json_decref (R);
}

static void drain_failed_cb (flux_future_t *f, void *arg)
{
Expand All @@ -493,8 +587,7 @@
}
/* future destroyed by perilog_proc_delete()
*/
emit_finish_event (proc, proc->bulk_exec);
perilog_proc_delete (proc);
perilog_proc_finish (proc);
}

static bool perilog_per_rank (struct perilog_proc *proc)
Expand Down Expand Up @@ -532,8 +625,7 @@
idf58 (proc->id),
perilog_proc_name (proc));
}
emit_finish_event (proc, bulk_exec);
perilog_proc_delete (proc);
perilog_proc_finish (proc);
}
}

Expand Down Expand Up @@ -643,7 +735,7 @@
struct bulk_exec *bulk_exec = NULL;
double timeout;

if (!(proc = perilog_proc_create (p, id, pd->prolog))) {
if (!(proc = perilog_proc_create (p, id, userid, pd->prolog))) {
flux_log_error (h,
"%s: proc_create",
pd->prolog ? "prolog" : "epilog");
Expand Down Expand Up @@ -709,6 +801,7 @@
flux_watcher_start (w);
proc->timer = w;
}
proc->R = json_incref (R);
proc->bulk_exec = bulk_exec;
proc->ranks = ranks;
proc->cancel_on_exception = pd->cancel_on_exception;
Expand Down
22 changes: 21 additions & 1 deletion t/t2274-manager-perilog-per-rank.t
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,10 @@ test_expect_success 'perilog: load a basic per-rank prolog config' '
[job-manager.prolog]
per-rank = true
command = [ "sleep", "5" ]
[job-manager.epilog]
per-rank = true
command = [ "sleep", "30" ]
cancel-on-exception = true
EOF
flux jobtap query perilog.so | jq .conf.prolog
'
Expand All @@ -167,11 +171,27 @@ test_expect_success 'perilog: prolog runs on all 4 ranks of a 4 node job' '
| jq -e ".procs.$jobid.active == 4" &&
flux cancel $jobid &&
flux jobtap query perilog.so &&
flux job wait-event $jobid clean
flux job wait-event $jobid prolog-finish
'
# Runs the no_drained_ranks check (helper presumably defined elsewhere in
# this test script) after the prolog of the preceding test was canceled.
test_expect_success 'perilog: canceled prolog does not drain ranks' '
no_drained_ranks
'
# Reuses $jobid set by an earlier test. Waits for the epilog-start event,
# then uses `flux jobtap query perilog.so` to check that the active process
# for the job is named "epilog" and is active on ranks 0-3 (4 of 4).
# Finally cancels the job and waits for its clean event.
# The `flux dmesg` and bare `jq` query lines only emit output to the test log.
test_expect_success 'perilog: epilog runs even if prolog is canceled' '
flux dmesg -H &&
flux job wait-event -vHt 30 $jobid epilog-start &&
flux jobtap query perilog.so | jq .procs &&
flux jobtap query perilog.so \
| jq -e ".procs.$jobid.name == \"epilog\"" &&
flux jobtap query perilog.so \
| jq -e ".procs.$jobid.active_ranks == \"0-3\"" &&
flux jobtap query perilog.so \
| jq -e ".procs.$jobid.total == 4" &&
flux jobtap query perilog.so \
| jq -e ".procs.$jobid.active == 4" &&
flux cancel $jobid &&
flux jobtap query perilog.so | jq &&
flux job wait-event -Hvt 30 $jobid clean
'
test_expect_success 'perilog: signaled prolog is reported' '
flux config load <<-EOF &&
[job-manager.prolog]
Expand Down
Loading