From 94e51329a965850e7fe0375fe87b0ce74e6eefd9 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Wed, 21 Feb 2024 16:30:23 -0800 Subject: [PATCH 01/16] job-exec: fix whitespace in bulk-exec.h Problem: Some tabs snuck into job-exec/bulk-exec.h. Convert tabs to spaces. --- src/modules/job-exec/bulk-exec.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/modules/job-exec/bulk-exec.h b/src/modules/job-exec/bulk-exec.h index 6a344bace8f8..a1565b443431 100644 --- a/src/modules/job-exec/bulk-exec.h +++ b/src/modules/job-exec/bulk-exec.h @@ -25,8 +25,8 @@ typedef void (*exec_exit_f) (struct bulk_exec *, void *arg, typedef void (*exec_io_f) (struct bulk_exec *, flux_subprocess_t *, const char *stream, - const char *data, - int data_len, + const char *data, + int data_len, void *arg); typedef void (*exec_error_f) (struct bulk_exec *, @@ -42,10 +42,10 @@ struct bulk_exec_ops { }; struct bulk_exec * bulk_exec_create (struct bulk_exec_ops *ops, - const char *service, - flux_jobid_t id, - const char *name, - void *arg); + const char *service, + flux_jobid_t id, + const char *name, + void *arg); void *bulk_exec_aux_get (struct bulk_exec *exec, const char *key); From 8bb9112755aa71ccf01318a6c0009f5b8090a2cf Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Thu, 22 Feb 2024 07:34:02 -0800 Subject: [PATCH 02/16] job-exec: bulk-exec: add bulk_exec_get_subprocess() Problem: There is no way to get an underlying flux_subprocess_t handle from the bulk-exec interface used in the job-exec module. Add bulk_exec_get_subprocess() to return a subprocess by rank from the bulk-exec interface. --- src/modules/job-exec/bulk-exec.c | 12 ++++++++++++ src/modules/job-exec/bulk-exec.h | 5 +++++ 2 files changed, 17 insertions(+) diff --git a/src/modules/job-exec/bulk-exec.c b/src/modules/job-exec/bulk-exec.c index 2c492eb9b2ff..4b8b7281f1c6 100644 --- a/src/modules/job-exec/bulk-exec.c +++ b/src/modules/job-exec/bulk-exec.c @@ -752,5 +752,17 @@ const char *bulk_exec_service_name (struct bulk_exec *exec) return exec->service; } +flux_subprocess_t *bulk_exec_get_subprocess (struct bulk_exec *exec, int rank) +{ + flux_subprocess_t *p = zlist_first (exec->processes); + while (p) { + if (flux_subprocess_rank (p) == rank) + return p; + p = zlist_next (exec->processes); + } + errno = ENOENT; + return NULL; +} + /* vi: ts=4 sw=4 expandtab */ diff --git a/src/modules/job-exec/bulk-exec.h b/src/modules/job-exec/bulk-exec.h index a1565b443431..0442a789bd3e 100644 --- a/src/modules/job-exec/bulk-exec.h +++ b/src/modules/job-exec/bulk-exec.h @@ -97,4 +97,9 @@ int bulk_exec_total (struct bulk_exec *exec); /* Get subprocess remote exec service name (never returns NULL) */ const char *bulk_exec_service_name (struct bulk_exec *exec); +/* Get the subprocess handle for a rank + */ +flux_subprocess_t *bulk_exec_get_subprocess (struct bulk_exec *exec, + int rank); + #endif /* !HAVE_JOB_EXEC_BULK_EXEC_H */ From a62c3792da0082f6b6e7bb3d53a18fdcab08dbca Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Thu, 22 Feb 2024 07:35:34 -0800 Subject: [PATCH 03/16] job-exec: make job exception optional in lost_shell() Problem: When a noncritical job shell is lost, the job-exec module always raises a nonfatal job exception and sends an RPC to the leader shell exception service. However, it may be useful to only notify the job shell and not clutter the eventlog with an exception in some circumstances. Add a raise_exception parameter to lost_shell(). A job exception will not be raised if this parameter is false. --- src/modules/job-exec/exec.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/modules/job-exec/exec.c b/src/modules/job-exec/exec.c index 7efffccde771..c7697beac52f 100644 --- a/src/modules/job-exec/exec.c +++ b/src/modules/job-exec/exec.c @@ -162,19 +162,22 @@ static void lost_shell_continuation (flux_future_t *f, void *arg) } static int lost_shell (struct jobinfo *job, + bool raise_exception, int shell_rank, const char *hostname) { flux_future_t *f; - /* Raise a non-fatal job exception */ - jobinfo_raise (job, - "node-failure", - FLUX_JOB_EXCEPTION_CRIT, - "%s on %s (shell rank %d)", - "lost contact with job shell", - hostname, - shell_rank); + if (raise_exception) { + /* Raise a non-fatal job exception */ + jobinfo_raise (job, + "node-failure", + FLUX_JOB_EXCEPTION_CRIT, + "%s on %s (shell rank %d)", + "lost contact with job shell", + hostname, + shell_rank); + } /* Also notify job shell rank 0 of exception */ @@ -207,7 +210,7 @@ static void error_cb (struct bulk_exec *exec, flux_subprocess_t *p, void *arg) if (cmd) { if (errnum == EHOSTUNREACH) { if (!idset_test (job->critical_ranks, shell_rank) - && lost_shell (job, shell_rank, hostname) == 0) + && lost_shell (job, true, shell_rank, hostname) == 0) return; jobinfo_fatal_error (job, 0, From c72efa5ff50f9794d1076718a88b2d866f92a180 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Thu, 22 Feb 2024 07:41:26 -0800 Subject: [PATCH 04/16] job-exec: notify leader shell of abnormal exit of other shells Problem: If a job shell terminates by a signal, then it may not have been able to succesfully complete termination tasks that allow the leader shell to know it no longer needs to wait for data from the terminated shell. This can lead to permanent hangs. For example, the shell output plugin will wait forever for EOF from other shells that terminate before the output plugin shutdown procedure. Enable recovery from this condition by notifying the leader shell of any other shells that are terminated by a signal. --- src/modules/job-exec/exec.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/modules/job-exec/exec.c b/src/modules/job-exec/exec.c index c7697beac52f..916bd30e81b4 100644 --- a/src/modules/job-exec/exec.c +++ b/src/modules/job-exec/exec.c @@ -291,6 +291,23 @@ static void exit_cb (struct bulk_exec *exec, "failed to terminate barrier: %s", strerror (errno)); } + + /* If a shell exits due to signal report the shell as lost to + * the leader shell. This avoids potential hangs in the leader + * shell if it is waiting for data from job shells that did not + * exit cleanly. + */ + unsigned int rank = idset_first (ranks); + while (rank != IDSET_INVALID_ID) { + flux_subprocess_t *p = bulk_exec_get_subprocess (exec, rank); + int signo = flux_subprocess_signaled (p); + if (p && signo > 0) { + int shell_rank = resource_set_rank_index (job->R, rank); + if (shell_rank != 0) + lost_shell (job, false, shell_rank, NULL); + } + rank = idset_next (ranks, rank); + } } static int parse_service_option (json_t *jobspec, From 15b4ad6949c5d7e8afe3a51e0df8d0ae0ba35ad0 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Thu, 7 Mar 2024 20:18:50 +0000 Subject: [PATCH 05/16] job-exec: allow formatted message to be passed to lost_shell() Problem: The lost_shell() function that raises a job exception and notifies the leader of lost shells constructs a message only for the job exception, and that message is specific to a node failure and doesn't capture other reasons a shell might be lost. It would be better to be more flexible. Support passing the message as printf-style arguments instead of only taking a hostname in lost_shell(). If a job exception is being raised, do not also pass the message in the shell exception RPC to avoid duplication. --- src/modules/job-exec/exec.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/src/modules/job-exec/exec.c b/src/modules/job-exec/exec.c index 916bd30e81b4..d19693f7e60c 100644 --- a/src/modules/job-exec/exec.c +++ b/src/modules/job-exec/exec.c @@ -164,19 +164,29 @@ static void lost_shell_continuation (flux_future_t *f, void *arg) static int lost_shell (struct jobinfo *job, bool raise_exception, int shell_rank, - const char *hostname) + const char *fmt, + ...) { flux_future_t *f; + char msgbuf[160]; + int msglen = sizeof (msgbuf); + char *msg = msgbuf; + va_list ap; + + if (fmt) { + va_start (ap, fmt); + if (vsnprintf (msg, msglen, fmt, ap) >= msglen) + (void) snprintf (msg, msglen, "%s", "lost contact with job shell"); + va_end (ap); + } if (raise_exception) { /* Raise a non-fatal job exception */ jobinfo_raise (job, "node-failure", FLUX_JOB_EXCEPTION_CRIT, - "%s on %s (shell rank %d)", - "lost contact with job shell", - hostname, - shell_rank); + "%s", + msg); } /* Also notify job shell rank 0 of exception @@ -210,7 +220,13 @@ static void error_cb (struct bulk_exec *exec, flux_subprocess_t *p, void *arg) if (cmd) { if (errnum == EHOSTUNREACH) { if (!idset_test (job->critical_ranks, shell_rank) - && lost_shell (job, true, shell_rank, hostname) == 0) + && lost_shell (job, + true, + shell_rank, + "%s on %s (shell rank %d)", + "lost contact with job shell", + hostname, + shell_rank) == 0) return; jobinfo_fatal_error (job, 0, From cb033ea3fdcaa1f10ee66fc2bc958d2282e63731 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Thu, 7 Mar 2024 21:32:45 +0000 Subject: [PATCH 06/16] job-exec: send a message when shell is lost due to signal Problem: The RPC sent to the job shell exception service when a shell is lost doesn't include any other details. Include a message in the RPC payload. When a job shell is lost due to a signal, pass an informational message to lost_shell(). --- src/modules/job-exec/exec.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/modules/job-exec/exec.c b/src/modules/job-exec/exec.c index d19693f7e60c..d001355a5410 100644 --- a/src/modules/job-exec/exec.c +++ b/src/modules/job-exec/exec.c @@ -33,6 +33,7 @@ #endif #include +#include #include "src/common/libjob/idf58.h" #include "ccan/str/str.h" @@ -193,10 +194,11 @@ static int lost_shell (struct jobinfo *job, */ if (!(f = jobinfo_shell_rpc_pack (job, "exception", - "{s:s s:i s:i}", + "{s:s s:i s:i s:s}", "type", "lost-shell", "severity", FLUX_JOB_EXCEPTION_CRIT, - "shell_rank", shell_rank))) + "shell_rank", shell_rank, + "message", msg))) return -1; if (flux_future_then (f, -1., lost_shell_continuation, job) < 0) { flux_future_destroy (f); @@ -317,10 +319,16 @@ static void exit_cb (struct bulk_exec *exec, while (rank != IDSET_INVALID_ID) { flux_subprocess_t *p = bulk_exec_get_subprocess (exec, rank); int signo = flux_subprocess_signaled (p); + int shell_rank = resource_set_rank_index (job->R, rank); if (p && signo > 0) { - int shell_rank = resource_set_rank_index (job->R, rank); if (shell_rank != 0) - lost_shell (job, false, shell_rank, NULL); + lost_shell (job, + false, + shell_rank, + "shell rank %d (on %s): %s", + shell_rank, + flux_get_hostbyrank (job->h, rank), + strsignal (signo)); } rank = idset_next (ranks, rank); } From f816dbe229ce3d1a4a9cc8315a68d19502c6e059 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Fri, 8 Mar 2024 00:37:06 +0000 Subject: [PATCH 07/16] job-exec: do not duplicate lost shell message Problem: If a message is set when lost_shell() is called, it will be displayed twice: once when the exception note is printed, and another when the shell displays the message included with the exception RPC payload. If an exception is raised on the job, clear the message so the shell won't bother displaying it. --- src/modules/job-exec/exec.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/modules/job-exec/exec.c b/src/modules/job-exec/exec.c index d001355a5410..46aee6bfe6e4 100644 --- a/src/modules/job-exec/exec.c +++ b/src/modules/job-exec/exec.c @@ -188,6 +188,11 @@ static int lost_shell (struct jobinfo *job, FLUX_JOB_EXCEPTION_CRIT, "%s", msg); + /* If an exception was raised, do not duplicate the message + * to the shell exception service since the message will already + * be displayed as part of the exception note: + */ + msg = ""; } /* Also notify job shell rank 0 of exception From 0a5e622294b522b487cde88150c6b15fb1d6aa2c Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Wed, 21 Feb 2024 07:15:32 -0800 Subject: [PATCH 08/16] shell: exception: add missing return Problem: The job shell exception message handler is missing a return after a successful response, and so falls through to sending an error as well. Add the missing return statement. --- src/shell/exception.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/shell/exception.c b/src/shell/exception.c index 8274d402ad20..886529dc5112 100644 --- a/src/shell/exception.c +++ b/src/shell/exception.c @@ -48,7 +48,7 @@ static void exception_handler (flux_t *h, if (flux_respond (h, msg, NULL) < 0) shell_log_errno ("flux_respond"); - + return; error: if (flux_respond_error (h, msg, errno, NULL) < 0) shell_log_errno ("flux_respond_error"); From 1ac3d73f5cc7d2fe924ada5830b2f34a81f78e7e Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Wed, 21 Feb 2024 08:07:18 -0800 Subject: [PATCH 09/16] shell: exception: notify shell.lost subscribers of shell rank Problem: The exception message sent to the job shell from the execution service contains the shell rank, but this is not passed along to the shell.lost plugin callback. Add plugin arguments containing the lost shell rank to the shell.lost plugin callback. --- src/shell/exception.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/shell/exception.c b/src/shell/exception.c index 886529dc5112..fd9fe1cabb9b 100644 --- a/src/shell/exception.c +++ b/src/shell/exception.c @@ -43,8 +43,19 @@ static void exception_handler (flux_t *h, "shell_rank", &shell_rank) < 0) goto error; - if (streq (type, "lost-shell") && severity > 0) - flux_shell_plugstack_call (shell, "shell.lost", NULL); + if (streq (type, "lost-shell") && severity > 0) { + flux_plugin_arg_t *args = flux_plugin_arg_create (); + if (!args + || flux_plugin_arg_pack (args, + FLUX_PLUGIN_ARG_IN, + "{s:i}", + "shell_rank", shell_rank) < 0) { + flux_plugin_arg_destroy (args); + goto error; + } + flux_shell_plugstack_call (shell, "shell.lost", args); + flux_plugin_arg_destroy (args); + } if (flux_respond (h, msg, NULL) < 0) shell_log_errno ("flux_respond"); From ad083612311eb4148a8a8876879f29dd5d8e68b3 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Thu, 7 Mar 2024 22:08:52 +0000 Subject: [PATCH 10/16] shell: exception: display exception message if not empty Problem: The job execution service may send a message to the shell's exception service, but this is currently ignored. Attempt to unpack any message sent to the shell exception service and print it at warning level if not empty. --- src/shell/exception.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/shell/exception.c b/src/shell/exception.c index fd9fe1cabb9b..a4565244bb4a 100644 --- a/src/shell/exception.c +++ b/src/shell/exception.c @@ -34,15 +34,20 @@ static void exception_handler (flux_t *h, const char *type; int severity = -1; int shell_rank = -1; + const char *message = ""; if (flux_request_unpack (msg, NULL, - "{s:s s:i s:i}", + "{s:s s:i s:i s?s}", "type", &type, "severity", &severity, - "shell_rank", &shell_rank) < 0) + "shell_rank", &shell_rank, + "message", &message) < 0) goto error; + if (strlen (message) > 0) + shell_warn ("%s", message); + if (streq (type, "lost-shell") && severity > 0) { flux_plugin_arg_t *args = flux_plugin_arg_create (); if (!args From c410e20cc3ae7c1a0aca12448b8f7e8c2608f942 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Thu, 22 Feb 2024 07:52:35 -0800 Subject: [PATCH 11/16] shell: output: track active shell ranks for proper refcounting Problem: If a shell is lost after its output plugin sends the eof message to the leader, then the leader could double decrement its internal reference count and exit before other shells have completed. Track active job shells in an idset and only decrement the refcount if a lost job shell or EOF was from a shell that was active. Fixes #2492 --- src/shell/output.c | 55 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/src/shell/output.c b/src/shell/output.c index 23c51c47577a..ec9cc25e2a6b 100644 --- a/src/shell/output.c +++ b/src/shell/output.c @@ -88,6 +88,7 @@ struct shell_output { struct eventlogger *ev; double batch_timeout; int refcount; + struct idset *active_shells; zlist_t *pending_writes; json_t *output; bool stopped; @@ -528,15 +529,25 @@ static void shell_output_decref (struct shell_output *out, } } +static void shell_output_decref_shell_rank (struct shell_output *out, + int shell_rank, + flux_msg_handler_t *mh) +{ + if (idset_test (out->active_shells, shell_rank) + && idset_clear (out->active_shells, shell_rank) == 0) + shell_output_decref (out, mh); +} + static int shell_output_write_leader (struct shell_output *out, const char *type, + int shell_rank, json_t *o, flux_msg_handler_t *mh) // may be NULL { json_t *entry; if (streq (type, "eof")) { - shell_output_decref (out, mh); + shell_output_decref_shell_rank (out, shell_rank, mh); return 0; } if (!(entry = eventlog_entry_pack (0., type, "O", o))) // increfs 'o' @@ -581,16 +592,18 @@ static void shell_output_write_cb (flux_t *h, void *arg) { struct shell_output *out = arg; + int shell_rank; json_t *o; const char *type; if (flux_request_unpack (msg, NULL, - "{s:s s:o}", + "{s:s s:i s:o}", "name", &type, + "shell_rank", &shell_rank, "context", &o) < 0) goto error; - if (shell_output_write_leader (out, type, o, mh) < 0) + if (shell_output_write_leader (out, type, shell_rank, o, mh) < 0) goto error; if (flux_respond (out->shell->h, msg, NULL) < 0) shell_log_errno ("flux_respond"); @@ -618,9 +631,10 @@ static int shell_output_write_type (struct shell_output *out, json_t *context) { flux_future_t *f = NULL; + int shell_rank = out->shell->info->shell_rank; - if (out->shell->info->shell_rank == 0) { - if (shell_output_write_leader (out, type, context, NULL) < 0) + if (shell_rank == 0) { + if (shell_output_write_leader (out, type, 0, context, NULL) < 0) shell_log_errno ("shell_output_write_leader"); } else { @@ -628,8 +642,9 @@ static int shell_output_write_type (struct shell_output *out, "write", 0, 0, - "{s:s s:O}", + "{s:s s:i s:O}", "name", type, + "shell_rank", shell_rank, "context", context))) goto error; if (flux_future_then (f, -1, shell_output_write_completion, out) < 0) @@ -699,10 +714,11 @@ static void shell_output_type_file_cleanup (struct shell_output_type_file *ofp) void shell_output_destroy (struct shell_output *out) { if (out) { + int shell_rank = out->shell->info->shell_rank; int saved_errno = errno; flux_future_t *f = NULL; - if (out->shell->info->shell_rank != 0) { + if (shell_rank != 0) { /* Nonzero shell rank: send EOF to leader shell to notify * that no more messages will be sent to shell.write */ @@ -710,8 +726,10 @@ void shell_output_destroy (struct shell_output *out) "write", 0, 0, - "{s:s s:{}}", - "name", "eof", "context"))) + "{s:s s:i s:{}}", + "name", "eof", + "shell_rank", shell_rank, + "context"))) shell_log_errno ("shell.write: eof"); flux_future_destroy (f); } @@ -1127,11 +1145,19 @@ static int shell_lost (flux_plugin_t *p, void *data) { struct shell_output *out = data; + int shell_rank; + /* A shell has been lost. We need to decref the output refcount by 1 * since we'll never hear from that shell to avoid rank 0 shell from * hanging. */ - shell_output_decref (out, NULL); + if (flux_plugin_arg_unpack (args, + FLUX_PLUGIN_ARG_IN, + "{s:i}", + "shell_rank", &shell_rank) < 0) + return shell_log_errno ("shell.lost: unpack of shell_rank failed"); + shell_output_decref_shell_rank (out, shell_rank, NULL); + shell_debug ("lost shell rank %d", shell_rank); return 0; } @@ -1223,6 +1249,15 @@ struct shell_output *shell_output_create (flux_shell_t *shell) * to be decremented as they send EOF or exit. */ out->refcount = (shell->info->shell_size - 1 + ntasks); + + /* Account for active shells to avoid double-decrement of + * refcount when a shell exits prematurely + */ + if (!(out->active_shells = idset_create (0, IDSET_FLAG_AUTOGROW)) + || idset_range_set (out->active_shells, + 0, + shell->info->shell_size - 1) < 0) + goto error; if (flux_shell_add_completion_ref (shell, "output.write") < 0) goto error; if (!(out->output = json_array ())) { From 00c13a6518036413d6446533bb83c5a18dcae0e3 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Fri, 8 Mar 2024 19:42:23 +0000 Subject: [PATCH 12/16] shell: output: suppress shell_output_write errors with ENOSYS Problem: During almost any abnormal job termination, the following error message is printed from most if not all non leader job shells: flux-shell: ERROR: output: shell_output_write: Function not implemented This error is not useful. Suppress the error with ENOSYS, since this just indicates the rank 0 job shell is no longer around which is probably obvious from other, more specific errors. Fixes #5736 --- src/shell/output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/shell/output.c b/src/shell/output.c index ec9cc25e2a6b..53e9341c53f5 100644 --- a/src/shell/output.c +++ b/src/shell/output.c @@ -617,7 +617,7 @@ static void shell_output_write_completion (flux_future_t *f, void *arg) { struct shell_output *out = arg; - if (flux_future_get (f, NULL) < 0) + if (flux_future_get (f, NULL) < 0 && errno != ENOSYS) shell_log_errno ("shell_output_write"); zlist_remove (out->pending_writes, f); flux_future_destroy (f); @@ -738,7 +738,7 @@ void shell_output_destroy (struct shell_output *out) flux_future_t *f; while ((f = zlist_pop (out->pending_writes))) { // follower only - if (flux_future_get (f, NULL) < 0) + if (flux_future_get (f, NULL) < 0 && errno != ENOSYS) shell_log_errno ("shell_output_write"); flux_future_destroy (f); } From 59c8c3a08104bf713cb990bc9aa74d3000083dbf Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Fri, 8 Mar 2024 19:45:06 +0000 Subject: [PATCH 13/16] shell: doom: support exit-on-error and timeout for lost shells Problem: The doom plugin supports raising a job exception when one or more tasks exit early or with an error, but the plugin does not track lost shells. This may cause jobs that unexpectedly lose a job shell to hang indefinitely. Add a `shell.lost` callback to the doom plugin and treat this the same as an abnormal task exit. Modify the exception note in this case to indicate that a shell and not an individual task exited. --- src/shell/doom.c | 54 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/src/shell/doom.c b/src/shell/doom.c index 98e0c321ed74..dcd6ee0c739d 100644 --- a/src/shell/doom.c +++ b/src/shell/doom.c @@ -52,6 +52,7 @@ struct shell_doom { bool exit_on_error; int exit_rc; int exit_rank; + bool lost_shell; }; static int get_exit_code (json_t *task_info) @@ -78,6 +79,24 @@ static int get_exit_rank (json_t *task_info) return rank; } +static void doom_check (struct shell_doom *doom, + int rank, + int exitcode, + bool lost_shell) +{ + doom->exit_rank = rank; + doom->exit_rc = exitcode; + doom->lost_shell = lost_shell; + if (doom->exit_on_error && doom->exit_rc != 0) { + shell_die (doom->exit_rc, + "%srank %d failed and exit-on-error is set", + doom->lost_shell ? "shell " : "", + doom->exit_rank); + } + else if (doom->timeout != TIMEOUT_NONE) + flux_watcher_start (doom->timer); +} + static void doom_post (struct shell_doom *doom, json_t *task_info) { flux_kvs_txn_t *txn; @@ -100,16 +119,10 @@ static void doom_post (struct shell_doom *doom, json_t *task_info) || !(f = flux_kvs_commit (doom->shell->h, NULL, 0, txn))) shell_log_errno ("error posting task-exit eventlog entry"); - doom->exit_rc = get_exit_code (task_info); - doom->exit_rank = get_exit_rank (task_info); - - if (doom->exit_on_error && doom->exit_rc != 0) { - shell_die (doom->exit_rc, - "rank %d failed and exit-on-error is set", - doom->exit_rank); - } - else if (f && doom->timeout != TIMEOUT_NONE) - flux_watcher_start (doom->timer); + doom_check (doom, + get_exit_rank (task_info), + get_exit_code (task_info), + false); flux_future_destroy (f); // fire and forget free (entrystr); @@ -163,7 +176,8 @@ static void doom_timeout (flux_reactor_t *r, fsd_format_duration (fsd, sizeof (fsd), doom->timeout); shell_die (doom->exit_rc, - "rank %d exited and exit-timeout=%s has expired", + "%srank %d exited and exit-timeout=%s has expired", + doom->lost_shell ? "shell " : "", doom->exit_rank, fsd); } @@ -195,6 +209,22 @@ static int doom_task_exit (flux_plugin_t *p, return 0; } +static int doom_shell_lost (flux_plugin_t *p, + const char *topic, + flux_plugin_arg_t *args, + void *arg) +{ + struct shell_doom *doom = arg; + int shell_rank; + if (flux_plugin_arg_unpack (args, + FLUX_PLUGIN_ARG_IN, + "{s:i}", + "shell_rank", &shell_rank) < 0) + return shell_log_errno ("shell.lost: unpack of shell_rank failed"); + doom_check (doom, shell_rank, 1, true); + return 0; +} + static void doom_destroy (struct shell_doom *doom) { if (doom) { @@ -285,6 +315,8 @@ static int doom_init (flux_plugin_t *p, doom_destroy (doom); return -1; } + if (flux_plugin_add_handler (p, "shell.lost", doom_shell_lost, doom) < 0) + return shell_log_errno ("failed to add shell.lost handler"); return 0; } From be9c79e12f50a12ccd6b13f00186e4588de07067 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Fri, 8 Mar 2024 01:17:19 +0000 Subject: [PATCH 14/16] python: correct exit status for signaled jobs Problem: In JobWatcher._status_to_exitcode() the exitcode computed for a job that was signaled is 127+n, but this should be 128+n as used elsewhere in Flux (modeled after the bash shell). Correct the computation of exitcode to use 128+n. --- src/bindings/python/flux/job/watcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bindings/python/flux/job/watcher.py b/src/bindings/python/flux/job/watcher.py index 9e43572773fb..09b065254b55 100644 --- a/src/bindings/python/flux/job/watcher.py +++ b/src/bindings/python/flux/job/watcher.py @@ -401,7 +401,7 @@ def _status_to_exitcode(status): if os.WIFEXITED(status): status = os.WEXITSTATUS(status) elif os.WIFSIGNALED(status): - status = 127 + os.WTERMSIG(status) + status = 128 + os.WTERMSIG(status) return status def start(self): From 694eb07a35804026ef6a6580dc5d7ad4f6b36279 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Fri, 8 Mar 2024 02:37:36 +0000 Subject: [PATCH 15/16] testsuite: add reproducer for lost shell hang Problem: There's no reproducer for issue #2492, a shell hang in the output plugin when a shell is lost. Add a reproduer under issues/t2492-shell-lost.sh. --- t/Makefile.am | 1 + t/issues/t2492-shell-lost.sh | 66 ++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100755 t/issues/t2492-shell-lost.sh diff --git a/t/Makefile.am b/t/Makefile.am index a377ec78c3d2..2f05858cb719 100644 --- a/t/Makefile.am +++ b/t/Makefile.am @@ -357,6 +357,7 @@ dist_check_SCRIPTS = \ issues/t5308-kvsdir-initial-path.py \ issues/t5368-kvs-commit-clear.py \ issues/t5657-kvs-getroot-namespace.sh \ + issues/t2492-shell-lost.sh \ python/__init__.py \ python/subflux.py \ python/tap \ diff --git a/t/issues/t2492-shell-lost.sh b/t/issues/t2492-shell-lost.sh new file mode 100755 index 000000000000..159685688626 --- /dev/null +++ b/t/issues/t2492-shell-lost.sh @@ -0,0 +1,66 @@ +#!/bin/sh + +log() { printf "t2492: $@\n" >&2; } +die() { log "$@"; exit 1; } + +# Check if we need to start parent job, if so, reexec under flux-start +# using the per-broker for test-pmi-clique so that the instance mapping +# simulates multiple nodes. +# +export FLUX_URI_RESOLVE_LOCAL=t +if test "$T2492_ACTIVE" != "t"; then + export T2492_ACTIVE=t + log "Re-launching test script under flux-start using $(command -v flux)" + exec flux start -s 4 $0 +fi + +CRITICAL_RANKS=$(cat < Date: Fri, 8 Mar 2024 19:52:57 +0000 Subject: [PATCH 16/16] testsuite: test exit-timeout and exit-on-error with lost shell Problem: Behavior of the doom plugin exit-timeout and exit-on-error handling is not tested when a shell is lost instead of an abnormal task exit. Add tests to t2614-job-shell-doom.t that ensure behavior of these shell options if a shell is lost. --- t/t2614-job-shell-doom.t | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/t/t2614-job-shell-doom.t b/t/t2614-job-shell-doom.t index faae8f2449b5..134f70a0c79f 100755 --- a/t/t2614-job-shell-doom.t +++ b/t/t2614-job-shell-doom.t @@ -42,7 +42,23 @@ test_expect_success 'flux-shell: run script with 2 nodes and exit-on-error' ' test_must_fail run_timeout 30 flux run \ -n2 -N2 -o exit-on-error ./testscript.sh ' - +test_expect_success 'flux-shell: exit-timeout catches lost shell' ' + cat >test2.sh <<-"EOF" && + #!/bin/bash + if test $FLUX_TASK_RANK -eq 1; then + kill -9 $PPID + exit + fi + sleep 30 + EOF + chmod +x test2.sh && + test_must_fail run_timeout 30 flux run \ + -n2 -N2 -o exit-timeout=1s ./test2.sh +' +test_expect_success 'flux-shell: exit-on-error catches lost shell' ' + test_must_fail run_timeout 30 flux run \ + -n2 -N2 -o exit-on-error ./test2.sh +' test_expect_success 'flux-shell: exit-timeout=aaa is rejected' ' test_must_fail flux run -o exit-timeout=aaa /bin/true '