Skip to content

Commit

Permalink
Merge pull request #4246 from garlick/rc1_norestart
Browse files Browse the repository at this point in the history
broker: prevent systemd restart if rc1 fails
  • Loading branch information
mergify[bot] authored Mar 29, 2022
2 parents 4d01f7b + c4d294c commit 227091b
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 5 deletions.
4 changes: 4 additions & 0 deletions doc/man7/flux-broker-attributes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,10 @@ broker.rc1_path [Updates: C]
broker.rc3_path [Updates: C]
The path to the broker's rc3 script. Default: ``${prefix}/etc/flux/rc3``.

broker.exit-norestart [Updates: C, R]
A numeric exit code that the broker uses to indicate that it should not be
restarted. This is set by the systemd unit file. Default: unset.

broker.starttime
Timestamp of broker startup from :man3:`flux_reactor_now`.

Expand Down
2 changes: 2 additions & 0 deletions etc/flux.service.in
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,13 @@ ExecStart=/bin/bash -c '\
-Sbroker.rc2_none \
-Sbroker.quorum=0 \
-Sbroker.quorum-timeout=none \
-Sbroker.exit-norestart=42 \
'
SyslogIdentifier=flux
ExecReload=@X_BINDIR@/flux config reload
Restart=always
RestartSec=5s
RestartPreventExitStatus=42
User=flux
Group=flux
RuntimeDirectory=flux
Expand Down
29 changes: 27 additions & 2 deletions src/broker/state_machine.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ struct state_machine {
struct quorum quorum;

struct flux_msglist *wait_requests;

int exit_norestart;
};

typedef void (*action_f)(struct state_machine *s);
Expand Down Expand Up @@ -458,8 +460,15 @@ static void runat_completion_cb (struct runat *r, const char *name, void *arg)
log_err ("runat_get_exit_code %s", name);

if (!strcmp (name, "rc1")) {
if (rc != 0)
s->ctx->exit_rc = rc;
/* If rc1 fails, it most likely will fail again on restart, so if
* running under systemd, exit with the broker.exit-norestart value.
*/
if (rc != 0) {
if (s->exit_norestart != 0)
s->ctx->exit_rc = s->exit_norestart;
else
s->ctx->exit_rc = rc;
}
state_machine_post (s, rc == 0 ? "rc1-success" : "rc1-fail");
}
else if (!strcmp (name, "rc2")) {
Expand All @@ -479,6 +488,21 @@ static void runat_completion_cb (struct runat *r, const char *name, void *arg)
}
}

/* Initialize s->exit_norestart from the 'broker.exit-norestart' attribute.
 *
 * If '-Sbroker.exit-norestart' was set on the command line and its value is
 * a well-formed decimal integer in the range 1-255, store that value in
 * s->exit_norestart.  Otherwise s->exit_norestart is left unmodified
 * (NOTE(review): presumably zero from allocation - confirm in
 * state_machine_create()).
 *
 * The value is limited to 1-255 because only the low 8 bits of a process
 * exit status are reported to the parent; a larger value would be truncated
 * and, e.g. 256, would be reported as 0 (success).
 */
static void norestart_configure (struct state_machine *s)
{
    const char *val = NULL;
    char *endptr;
    long code;

    if (attr_get (s->ctx->attrs, "broker.exit-norestart", &val, NULL) != 0)
        return; // attribute not set
    if (val == NULL)
        return; // nothing to parse

    errno = 0;
    code = strtol (val, &endptr, 10);
    /* Reject out-of-range input, the empty string, and trailing garbage
     * such as "42abc", rather than silently using a partial parse.
     */
    if (errno != 0 || endptr == val || *endptr != '\0')
        return;
    if (code < 1 || code > 255)
        return;
    s->exit_norestart = (int)code;
}

static void prep_cb (flux_reactor_t *r,
flux_watcher_t *w,
int revents,
Expand Down Expand Up @@ -993,6 +1017,7 @@ struct state_machine *state_machine_create (struct broker *ctx)
log_err ("error configuring quorum attributes");
goto error;
}
norestart_configure (s);
overlay_set_monitor_cb (ctx->overlay, overlay_monitor_cb, s);
if (s->ctx->rank == 0) {
if (!(s->quorum.f = flux_rpc_pack (ctx->h,
Expand Down
14 changes: 11 additions & 3 deletions t/t0025-broker-state-machine.t
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ test_expect_success 'all expected events and state transitions occurred on rank
'

test_expect_success 'capture state transitions from instance with rc1 failure' '
test_must_fail flux start \
test_expect_code 1 flux start \
-o,-Slog-filename=states_rc1.log \
-o,-Sbroker.rc1_path=/bin/false \
-o,-Sbroker.rc3_path= \
Expand All @@ -199,7 +199,7 @@ test_expect_success 'all expected events and state transitions occurred' '
'

test_expect_success 'capture state transitions from instance with rc2 failure' '
test_must_fail flux start \
test_expect_code 1 flux start \
-o,-Slog-filename=states_rc2.log \
${ARGS} \
/bin/false
Expand All @@ -217,7 +217,7 @@ test_expect_success 'all expected events and state transitions occurred' '
'

test_expect_success 'capture state transitions from instance with rc3 failure' '
test_must_fail flux start \
test_expect_code 1 flux start \
-o,-Slog-filename=states_rc3.log \
-o,-Sbroker.rc1_path= \
-o,-Sbroker.rc3_path=/bin/false \
Expand All @@ -235,6 +235,14 @@ test_expect_success 'all expected events and state transitions occurred' '
grep "rc3-fail: finalize->exit" states_rc3.log
'

test_expect_success 'instance rc1 failure exits with norestart code' '
test_expect_code 99 flux start \
-o,-Sbroker.exit-norestart=99 \
-o,-Sbroker.rc1_path=/bin/false \
-o,-Sbroker.rc3_path= \
/bin/true
'

test_expect_success 'broker.quorum-timeout=none is accepted' '
flux start ${ARGS} -o,-Sbroker.quorum-timeout=none /bin/true
'
Expand Down

0 comments on commit 227091b

Please sign in to comment.