Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

connect: rewrite envoy bootstrap on every restart #19787

Merged
merged 8 commits into from
Jan 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/19787.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
connect: Fixed envoy sidecars being unable to restart after node reboots
```
9 changes: 9 additions & 0 deletions client/allocrunner/interfaces/task_lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,15 @@ type TaskPrestartResponse struct {

// Done lets the hook indicate that it completed successfully and
// should not be run again.
//
// Use sparringly! In general hooks should be idempotent and therefore Done
// is unneeded. You never know at what point an agent might crash, and it can
// be hard to reason about how Done=true impacts agent restarts and node
// reboots. See #19787 for example.
//
// Done is useful for expensive operations such as downloading artifacts, or
// for operations which might fail needlessly if rerun while a node is
// disconnected.
Done bool
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

great comment, this actually clarifies a lot

}

Expand Down
4 changes: 2 additions & 2 deletions client/allocrunner/taskrunner/connect_native_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,8 @@ func (h *connectNativeHook) Prestart(
merge(environment, h.bridgeEnv(request.TaskEnv.EnvMap))
merge(environment, h.hostEnv(request.TaskEnv.EnvMap))

// tls/acl setup for native task done
response.Done = true
// tls/acl setup for native task done but since SecretsDir is a tmpfs, don't
// mark Done=true as this hook will need to rerun on node reboots
response.Env = environment
return nil
}
Expand Down
15 changes: 0 additions & 15 deletions client/allocrunner/taskrunner/connect_native_hook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -287,9 +287,6 @@ func TestTaskRunner_ConnectNativeHook_Noop(t *testing.T) {
// Run the hook
require.NoError(t, h.Prestart(context.Background(), request, response))

// Assert the hook is Done
require.True(t, response.Done)

// Assert no environment variables configured to be set
require.Empty(t, response.Env)

Expand Down Expand Up @@ -352,9 +349,6 @@ func TestTaskRunner_ConnectNativeHook_Ok(t *testing.T) {
// Run the Connect Native hook
require.NoError(t, h.Prestart(context.Background(), request, response))

// Assert the hook is Done
require.True(t, response.Done)

// Assert only CONSUL_HTTP_ADDR env variable is set
require.Equal(t, map[string]string{"CONSUL_HTTP_ADDR": testConsul.HTTPAddr}, response.Env)

Expand Down Expand Up @@ -424,9 +418,6 @@ func TestTaskRunner_ConnectNativeHook_with_SI_token(t *testing.T) {
// Run the Connect Native hook
require.NoError(t, h.Prestart(context.Background(), request, response))

// Assert the hook is Done
require.True(t, response.Done)

// Assert environment variable for token is set
require.NotEmpty(t, response.Env)
require.Equal(t, token, response.Env["CONSUL_HTTP_TOKEN"])
Expand Down Expand Up @@ -504,9 +495,6 @@ func TestTaskRunner_ConnectNativeHook_shareTLS(t *testing.T) {
// Run the Connect Native hook
require.NoError(t, h.Prestart(context.Background(), request, response))

// Assert the hook is Done
require.True(t, response.Done)

// Remove variables we are not interested in
delete(response.Env, "CONSUL_HTTP_ADDR")

Expand Down Expand Up @@ -634,9 +622,6 @@ func TestTaskRunner_ConnectNativeHook_shareTLS_override(t *testing.T) {
// Run the Connect Native hook
require.NoError(t, h.Prestart(context.Background(), request, response))

// Assert the hook is Done
require.True(t, response.Done)

// Assert environment variable for CONSUL_HTTP_SSL is set, because it was
// the only one not overridden by task env block config
require.NotEmpty(t, response.Env)
Expand Down
6 changes: 2 additions & 4 deletions client/allocrunner/taskrunner/envoy_bootstrap_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -235,8 +235,6 @@ func (h *envoyBootstrapHook) lookupService(svcKind, svcName string, taskEnv *tas
// Must be aware of both launching envoy as a sidecar proxy, as well as a connect gateway.
func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestartRequest, resp *ifs.TaskPrestartResponse) error {
if !req.Task.Kind.IsConnectProxy() && !req.Task.Kind.IsAnyConnectGateway() {
// Not a Connect proxy sidecar
resp.Done = true
return nil
}

Expand Down Expand Up @@ -358,8 +356,8 @@ func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestart

// Command succeeded, exit.
if cmdErr == nil {
// Bootstrap written. Mark as done and move on.
resp.Done = true
// Bootstrap written. Move on without marking as Done as Prestart needs
// to rerun after node reboots.
return false, nil
}

Expand Down
13 changes: 0 additions & 13 deletions client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -368,9 +368,6 @@ func TestEnvoyBootstrapHook_with_SI_token(t *testing.T) {
// Run the hook
require.NoError(t, h.Prestart(context.Background(), req, resp))

// Assert it is Done
require.True(t, resp.Done)

// Ensure the default path matches
env := map[string]string{
taskenv.SecretsDir: req.TaskDir.SecretsDir,
Expand Down Expand Up @@ -463,9 +460,6 @@ func TestTaskRunner_EnvoyBootstrapHook_sidecar_ok(t *testing.T) {
// Run the hook
require.NoError(t, h.Prestart(context.Background(), req, resp))

// Assert it is Done
require.True(t, resp.Done)

require.NotNil(t, resp.Env)
require.Equal(t, "127.0.0.2:19001", resp.Env[envoyAdminBindEnvPrefix+"foo"])

Expand Down Expand Up @@ -547,7 +541,6 @@ func TestTaskRunner_EnvoyBootstrapHook_gateway_ok(t *testing.T) {
require.NoError(t, h.Prestart(context.Background(), req, &resp))

// Assert the hook is Done
require.True(t, resp.Done)
require.NotNil(t, resp.Env)

// Read the Envoy Config file
Expand Down Expand Up @@ -596,9 +589,6 @@ func TestTaskRunner_EnvoyBootstrapHook_Noop(t *testing.T) {
// Run the hook
require.NoError(t, h.Prestart(context.Background(), req, resp))

// Assert it is Done
require.True(t, resp.Done)

// Assert no file was written
_, err := os.Open(filepath.Join(req.TaskDir.SecretsDir, "envoy_bootstrap.json"))
require.Error(t, err)
Expand Down Expand Up @@ -677,9 +667,6 @@ func TestTaskRunner_EnvoyBootstrapHook_RecoverableError(t *testing.T) {
require.ErrorIs(t, err, errEnvoyBootstrapError)
require.True(t, structs.IsRecoverable(err))

// Assert it is not Done
require.False(t, resp.Done)

// Assert no file was written
_, err = os.Open(filepath.Join(req.TaskDir.SecretsDir, "envoy_bootstrap.json"))
require.Error(t, err)
Expand Down
Loading