From 02b7f06a84bbf48e11e3aac13951fa783fcb1bd8 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Tue, 29 Mar 2022 11:39:13 -0700 Subject: [PATCH 1/4] systemd: skip restart on special exit code Problem: systemd unconditionally restarts the broker when it exits/crashes, but there are situations where that is not desirable. Set RestartPreventExitStatus to an exit code that can be used in cases where restart would be inappropriate. Set this value in a broker attribute 'broker.exit-norestart'. --- etc/flux.service.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/etc/flux.service.in b/etc/flux.service.in index 1444b717b028..169f732d6b2a 100644 --- a/etc/flux.service.in +++ b/etc/flux.service.in @@ -20,11 +20,13 @@ ExecStart=/bin/bash -c '\ -Sbroker.rc2_none \ -Sbroker.quorum=0 \ -Sbroker.quorum-timeout=none \ + -Sbroker.exit-norestart=42 \ ' SyslogIdentifier=flux ExecReload=@X_BINDIR@/flux config reload Restart=always RestartSec=5s +RestartPreventExitStatus=42 User=flux Group=flux RuntimeDirectory=flux From cbcdac3d91ff600b3c93e874102448c675f361d8 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Tue, 29 Mar 2022 15:13:54 -0700 Subject: [PATCH 2/4] broker: support special norestart exit code Problem: need a way for systemd to tell Flux what broker exit code to use to inhibit automatic restart. If the broker attribute 'broker.exit-norestart' is set, store it's value to be used in situations when automatic restart is not desired. The value is set to zero if the attribute is not set (e.g. broker was not started by systemd so there is no implicit restart). --- src/broker/state_machine.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/broker/state_machine.c b/src/broker/state_machine.c index 1f4b96751485..ef97dbf5845d 100644 --- a/src/broker/state_machine.c +++ b/src/broker/state_machine.c @@ -63,6 +63,8 @@ struct state_machine { struct quorum quorum; struct flux_msglist *wait_requests; + + int exit_norestart; }; typedef void (*action_f)(struct state_machine *s); @@ -479,6 +481,21 @@ static void runat_completion_cb (struct runat *r, const char *name, void *arg) } } +/* If '-Sbroker.exit-norestart' was set on the command line, set + * s->exit_norestart to its value; otherwise leave it set it to 0. + */ +static void norestart_configure (struct state_machine *s) +{ + const char *val; + + if (attr_get (s->ctx->attrs, "broker.exit-norestart", &val, NULL) == 0) { + errno = 0; + int rc = strtol (val, NULL, 10); + if (errno == 0 && rc >= 1) + s->exit_norestart = rc; + } +} + static void prep_cb (flux_reactor_t *r, flux_watcher_t *w, int revents, @@ -993,6 +1010,7 @@ struct state_machine *state_machine_create (struct broker *ctx) log_err ("error configuring quorum attributes"); goto error; } + norestart_configure (s); overlay_set_monitor_cb (ctx->overlay, overlay_monitor_cb, s); if (s->ctx->rank == 0) { if (!(s->quorum.f = flux_rpc_pack (ctx->h, From 706422f63cb41e77ad6aa8bcde275b13283ce136 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Tue, 29 Mar 2022 15:14:48 -0700 Subject: [PATCH 3/4] broker: prevent systemd restart if rc1 fails Problem: if rc1 fails in a system instance, the broker is automatically restarted by systemd, but the main anticipated cause of rc1 failure is misconfiguration, so a more appropriate response is to leave Flux off until someone can resolve the problem and restart it manually. If rc1 fails, and the "broker.exit-norestart" attribute is set, exit with that value instead of the script exit code. --- src/broker/state_machine.c | 11 +++++++++-- t/t0025-broker-state-machine.t | 14 +++++++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/broker/state_machine.c b/src/broker/state_machine.c index ef97dbf5845d..fd17926241f9 100644 --- a/src/broker/state_machine.c +++ b/src/broker/state_machine.c @@ -460,8 +460,15 @@ static void runat_completion_cb (struct runat *r, const char *name, void *arg) log_err ("runat_get_exit_code %s", name); if (!strcmp (name, "rc1")) { - if (rc != 0) - s->ctx->exit_rc = rc; + /* If rc1 fails, it most likely will fail again on restart, so if + * running under systemd, exit with the broker.exit-norestart value. + */ + if (rc != 0) { + if (s->exit_norestart != 0) + s->ctx->exit_rc = s->exit_norestart; + else + s->ctx->exit_rc = rc; + } state_machine_post (s, rc == 0 ? "rc1-success" : "rc1-fail"); } else if (!strcmp (name, "rc2")) { diff --git a/t/t0025-broker-state-machine.t b/t/t0025-broker-state-machine.t index f4d343f0337e..55e771493c4b 100755 --- a/t/t0025-broker-state-machine.t +++ b/t/t0025-broker-state-machine.t @@ -183,7 +183,7 @@ test_expect_success 'all expected events and state transitions occurred on rank ' test_expect_success 'capture state transitions from instance with rc1 failure' ' - test_must_fail flux start \ + test_expect_code 1 flux start \ -o,-Slog-filename=states_rc1.log \ -o,-Sbroker.rc1_path=/bin/false \ -o,-Sbroker.rc3_path= \ @@ -199,7 +199,7 @@ test_expect_success 'all expected events and state transitions occurred' ' ' test_expect_success 'capture state transitions from instance with rc2 failure' ' - test_must_fail flux start \ + test_expect_code 1 flux start \ -o,-Slog-filename=states_rc2.log \ ${ARGS} \ /bin/false @@ -217,7 +217,7 @@ test_expect_success 'all expected events and state transitions occurred' ' ' test_expect_success 'capture state transitions from instance with rc3 failure' ' - test_must_fail flux start \ + test_expect_code 1 flux start \ -o,-Slog-filename=states_rc3.log \ -o,-Sbroker.rc1_path= \ -o,-Sbroker.rc3_path=/bin/false \ @@ -235,6 +235,14 @@ test_expect_success 'all expected events and state transitions occurred' ' grep "rc3-fail: finalize->exit" states_rc3.log ' +test_expect_success 'instance rc1 failure exits with norestart code' ' + test_expect_code 99 flux start \ + -o,-Sbroker.exit-norestart=99 \ + -o,-Sbroker.rc1_path=/bin/false \ + -o,-Sbroker.rc3_path= \ + /bin/true +' + test_expect_success 'broker.quorum-timeout=none is accepted' ' flux start ${ARGS} -o,-Sbroker.quorum-timeout=none /bin/true ' From c4d294c1b881a67300fd9b9d17fc308654de48d1 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Tue, 29 Mar 2022 15:07:09 -0700 Subject: [PATCH 4/4] flux-broker-attributes(7): broker.exit-norestart Problem: broker.exit-norestart has no documentation. Add an entry to flux-broker-attributes(7). --- doc/man7/flux-broker-attributes.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/man7/flux-broker-attributes.rst b/doc/man7/flux-broker-attributes.rst index c08acf67f8f5..ec9845760aa7 100644 --- a/doc/man7/flux-broker-attributes.rst +++ b/doc/man7/flux-broker-attributes.rst @@ -100,6 +100,10 @@ broker.rc1_path [Updates: C] broker.rc3_path [Updates: C] The path to the broker's rc3 script. Default: ``${prefix}/etc/flux/rc1``. +broker.exit-restart [Updates: C, R] + A numeric exit code that the broker uses to indicate that it should not be + restarted. This is set by the systemd unit file. Default: unset. + broker.starttime Timestamp of broker startup from :man3:`flux_reactor_now`.