From 33c6934057e80f3a4a3d6f1060345262980e334d Mon Sep 17 00:00:00 2001 From: Anderson Queiroz Date: Fri, 6 Oct 2023 10:42:39 +0200 Subject: [PATCH] fix install/enroll cmd to fail when agent restart fails (#3207) * surface errors that might occur during enroll * fail install command if agent cannot be restarted * do not print success message if there was an enroll error. Print an error message and the error instead * add logs to show the different enroll attempts * add more context t errors * refactor internal/pkg/agent/install/perms_unix.go and add more context to errors restore main version * ignore agent restart error on enroll tests as there is no agent to be restarted --- ...-Surface-errors-during-Agent's-enroll.yaml | 32 +++++++++ internal/pkg/agent/cmd/enroll.go | 2 +- internal/pkg/agent/cmd/enroll_cmd.go | 62 +++++++++++----- internal/pkg/agent/cmd/enroll_cmd_test.go | 71 +++++++++++++------ internal/pkg/agent/cmd/install.go | 4 +- internal/pkg/agent/install/perms_unix.go | 32 +++++---- 6 files changed, 151 insertions(+), 52 deletions(-) create mode 100644 changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml diff --git a/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml b/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml new file mode 100644 index 00000000000..f8361f99433 --- /dev/null +++ b/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: bug-fix + +# Change summary; a 80ish characters long description of the change. +summary: Surface errors during Agent's enroll process, failing if any happens. + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; a word indicating the component this changeset affects. +component: install/enroll + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/3207 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/internal/pkg/agent/cmd/enroll.go b/internal/pkg/agent/cmd/enroll.go index 1bce5f7e547..adaa278f32f 100644 --- a/internal/pkg/agent/cmd/enroll.go +++ b/internal/pkg/agent/cmd/enroll.go @@ -351,7 +351,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error { // Error: failed to fix permissions: chown /Library/Elastic/Agent/data/elastic-agent-c13f91/elastic-agent.app: operation not permitted // This is because we are fixing permissions twice, once during installation and again during the enrollment step. // When we are enrolling as part of installation on MacOS, skip the second attempt to fix permissions. - var fixPermissions bool = fromInstall + fixPermissions := fromInstall if runtime.GOOS == "darwin" { fixPermissions = false } diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go index b5992f10188..f43e1483a8a 100644 --- a/internal/pkg/agent/cmd/enroll_cmd.go +++ b/internal/pkg/agent/cmd/enroll_cmd.go @@ -172,7 +172,7 @@ func newEnrollCmd( ) } -// newEnrollCmdWithStore creates an new enrollment and accept a custom store. +// newEnrollCmdWithStore creates a new enrollment and accept a custom store. func newEnrollCmdWithStore( log *logger.Logger, options *enrollCmdOption, @@ -187,10 +187,11 @@ func newEnrollCmdWithStore( }, nil } -// Execute tries to enroll the agent into Fleet. +// Execute enrolls the agent into Fleet. func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { var err error defer c.stopAgent() // ensure its stopped no matter what + span, ctx := apm.StartSpan(ctx, "enroll", "app.internal") defer func() { apm.CaptureError(ctx, err).Send() @@ -235,7 +236,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { // Ensure that the agent does not use a proxy configuration // when connecting to the local fleet server. // Note that when running fleet-server the enroll request will be sent to :8220, - // however when the agent is running afterwards requests will be sent to :8221 + // however when the agent is running afterward requests will be sent to :8221 c.remoteConfig.Transport.Proxy.Disable = true } @@ -256,7 +257,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { err = c.enrollWithBackoff(ctx, persistentConfig) if err != nil { - return errors.New(err, "fail to enroll") + return fmt.Errorf("fail to enroll: %w", err) } if c.options.FixPermissions { @@ -267,17 +268,23 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { } defer func() { - fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.") + if err != nil { + fmt.Fprintf(streams.Err, "Something went wrong while enrolling the Elastic Agent: %v\n", err) + } else { + fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.") + } }() if c.agentProc == nil { - if err := c.daemonReload(ctx); err != nil { - c.log.Infow("Elastic Agent might not be running; unable to trigger restart", "error", err) - } else { - c.log.Info("Successfully triggered restart on running Elastic Agent.") + if err = c.daemonReloadWithBackoff(ctx); err != nil { + c.log.Errorf("Elastic Agent might not be running; unable to trigger restart: %v", err) + return fmt.Errorf("could not reload agent daemon, unable to trigger restart: %w", err) } + + c.log.Info("Successfully triggered restart on running Elastic Agent.") return nil } + c.log.Info("Elastic Agent has been enrolled; start Elastic Agent") return nil } @@ -443,6 +450,11 @@ func (c *enrollCmd) prepareFleetTLS() error { func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error { err := c.daemonReload(ctx) + if err != nil && + (errors.Is(err, context.DeadlineExceeded) || + errors.Is(err, context.Canceled)) { + return fmt.Errorf("could not reload deamon: %w", err) + } if err == nil { return nil } @@ -450,17 +462,20 @@ func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error { signal := make(chan struct{}) backExp := backoff.NewExpBackoff(signal, 10*time.Second, 1*time.Minute) - for i := 5; i >= 0; i-- { + var i int + for ; i < 5; i++ { backExp.Wait() c.log.Info("Retrying to restart...") err = c.daemonReload(ctx) - if err == nil { + if err == nil || + errors.Is(err, context.DeadlineExceeded) || + errors.Is(err, context.Canceled) { break } } close(signal) - return err + return fmt.Errorf("could not reload deamon after %d retries: %w", i+1, err) } func (c *enrollCmd) daemonReload(ctx context.Context) error { @@ -478,8 +493,20 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[ c.log.Infof("Starting enrollment to URL: %s", c.client.URI()) err := c.enroll(ctx, persistentConfig) + if err == nil { + return nil + } + + const deadline = 10 * time.Minute + const frequency = 60 * time.Second + + c.log.Infof("1st enrollment attempt failed, retrying for %s, every %s enrolling to URL: %s", + deadline, + frequency, + c.client.URI()) signal := make(chan struct{}) - backExp := backoff.NewExpBackoff(signal, 60*time.Second, 10*time.Minute) + defer close(signal) + backExp := backoff.NewExpBackoff(signal, frequency, deadline) for { retry := false @@ -498,7 +525,6 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[ err = c.enroll(ctx, persistentConfig) } - close(signal) return err } @@ -547,8 +573,10 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte c.options.FleetServer.ElasticsearchInsecure, ) if err != nil { - return err + return fmt.Errorf( + "failed creating fleet-server bootstrap config: %w", err) } + // no longer need bootstrap at this point serverConfig.Server.Bootstrap = false fleetConfig.Server = serverConfig.Server @@ -568,11 +596,11 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte reader, err := yamlToReader(configToStore) if err != nil { - return err + return fmt.Errorf("yamlToReader failed: %w", err) } if err := safelyStoreAgentInfo(c.configStore, reader); err != nil { - return err + return fmt.Errorf("failed to store agent config: %w", err) } // clear action store diff --git a/internal/pkg/agent/cmd/enroll_cmd_test.go b/internal/pkg/agent/cmd/enroll_cmd_test.go index 189ad7b6563..426924273d1 100644 --- a/internal/pkg/agent/cmd/enroll_cmd_test.go +++ b/internal/pkg/agent/cmd/enroll_cmd_test.go @@ -16,8 +16,11 @@ import ( "os" "runtime" "strconv" + "strings" "testing" + "time" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" @@ -159,14 +162,24 @@ func TestEnroll(t *testing.T) { require.NoError(t, err) streams, _, _, _ := cli.NewTestingIOStreams() - err = cmd.Execute(context.Background(), streams) - require.NoError(t, err) + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() + err = cmd.Execute(ctx, streams) + + if err != nil && + // There is no agent running, therefore nothing to be restarted. + // However, this will cause the Enroll command to return an error + // which we'll ignore here. + !strings.Contains(err.Error(), + "could not reload agent daemon, unable to trigger restart") { + t.Fatalf("enrrol coms returned and unexpected error: %v", err) + } config, err := readConfig(store.Content) - require.NoError(t, err) - require.Equal(t, "my-access-api-key", config.AccessAPIKey) - require.Equal(t, host, config.Client.Host) + + assert.Equal(t, "my-access-api-key", config.AccessAPIKey) + assert.Equal(t, host, config.Client.Host) }, )) @@ -216,16 +229,24 @@ func TestEnroll(t *testing.T) { require.NoError(t, err) streams, _, _, _ := cli.NewTestingIOStreams() - err = cmd.Execute(context.Background(), streams) - require.NoError(t, err) - - require.True(t, store.Called) - + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() + err = cmd.Execute(ctx, streams) + if err != nil && + // There is no agent running, therefore nothing to be restarted. + // However, this will cause the Enroll command to return an error + // which we'll ignore here. + !strings.Contains(err.Error(), + "could not reload agent daemon, unable to trigger restart") { + t.Fatalf("enrrol coms returned and unexpected error: %v", err) + } + + assert.True(t, store.Called) config, err := readConfig(store.Content) - require.NoError(t, err) - require.Equal(t, "my-access-api-key", config.AccessAPIKey) - require.Equal(t, host, config.Client.Host) + assert.NoError(t, err) + assert.Equal(t, "my-access-api-key", config.AccessAPIKey) + assert.Equal(t, host, config.Client.Host) }, )) @@ -275,16 +296,24 @@ func TestEnroll(t *testing.T) { require.NoError(t, err) streams, _, _, _ := cli.NewTestingIOStreams() - err = cmd.Execute(context.Background(), streams) - require.NoError(t, err) - - require.True(t, store.Called) - + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() + err = cmd.Execute(ctx, streams) + + if err != nil && + // There is no agent running, therefore nothing to be restarted. + // However, this will cause the Enroll command to return an error + // which we'll ignore here. + !strings.Contains(err.Error(), + "could not reload agent daemon, unable to trigger restart") { + t.Fatalf("enrrol coms returned and unexpected error: %v", err) + } + + assert.True(t, store.Called) config, err := readConfig(store.Content) - require.NoError(t, err) - require.Equal(t, "my-access-api-key", config.AccessAPIKey) - require.Equal(t, host, config.Client.Host) + assert.Equal(t, "my-access-api-key", config.AccessAPIKey) + assert.Equal(t, host, config.Client.Host) }, )) diff --git a/internal/pkg/agent/cmd/install.go b/internal/pkg/agent/cmd/install.go index 4fbd37f40da..2cb46bd599d 100644 --- a/internal/pkg/agent/cmd/install.go +++ b/internal/pkg/agent/cmd/install.go @@ -154,7 +154,7 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error { return fmt.Errorf("problem reading prompt response") } if url == "" { - fmt.Fprintf(streams.Out, "Enrollment cancelled because no URL was provided.\n") + fmt.Fprintln(streams.Out, "Enrollment cancelled because no URL was provided.") return nil } } @@ -224,6 +224,8 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error { } }() } + + fmt.Fprintln(streams.Out, "Elastic Agent successfully installed, starting enrollment.") } if enroll { diff --git a/internal/pkg/agent/install/perms_unix.go b/internal/pkg/agent/install/perms_unix.go index e84dcd5039c..fc357fd4fde 100644 --- a/internal/pkg/agent/install/perms_unix.go +++ b/internal/pkg/agent/install/perms_unix.go @@ -8,6 +8,7 @@ package install import ( "errors" + "fmt" "io/fs" "os" "path/filepath" @@ -18,19 +19,26 @@ func fixPermissions(topPath string) error { return recursiveRootPermissions(topPath) } -func recursiveRootPermissions(path string) error { - return filepath.Walk(path, func(name string, info fs.FileInfo, err error) error { - if err == nil { - // all files should be owned by root:root - err = os.Chown(name, 0, 0) - if err != nil { - return err - } - // remove any world permissions from the file - err = os.Chmod(name, info.Mode().Perm()&0770) - } else if errors.Is(err, fs.ErrNotExist) { +func recursiveRootPermissions(root string) error { + return filepath.Walk(root, func(path string, info fs.FileInfo, err error) error { + if errors.Is(err, fs.ErrNotExist) { return nil } - return err + if err != nil { + return fmt.Errorf("walk on %q failed: %w", path, err) + } + + // all files should be owned by root:root + err = os.Chown(path, 0, 0) + if err != nil { + return fmt.Errorf("could not fix ownership of %q: %w", path, err) + } + // remove any world permissions from the file + err = os.Chmod(path, info.Mode().Perm()&0770) + if err != nil { + return fmt.Errorf("could not fix permissions of %q: %w", path, err) + } + + return nil }) }