Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

install fails if enroll fails and surface errors #3554

Merged
merged 5 commits into from
Oct 17, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: bug-fix

# Change summary; a 80ish characters long description of the change.
summary: Surface errors during Agent's enroll process, failing if any happens.

# Long description; in case the summary is not enough to describe the change
# this field accommodate a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
#description:

# Affected component; a word indicating the component this changeset affects.
component: install/enroll

# PR URL; optional; the PR number that added the changeset.
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
pr: https://github.com/elastic/elastic-agent/pull/3207

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present is automatically filled by the tooling with the issue linked to the PR number.
#issue: https://github.com/owner/repo/1234
2 changes: 1 addition & 1 deletion dev-tools/mage/godaemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ var (
}
)

// BuildGoDaemon builds the go-deamon binary.
// BuildGoDaemon builds the go-daemon binary.
func BuildGoDaemon() error {
if GOOS != "linux" {
return errors.New("go-daemon only builds for linux")
Expand Down
2 changes: 1 addition & 1 deletion internal/pkg/agent/cmd/enroll.go
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error {
// Error: failed to fix permissions: chown /Library/Elastic/Agent/data/elastic-agent-c13f91/elastic-agent.app: operation not permitted
// This is because we are fixing permissions twice, once during installation and again during the enrollment step.
// When we are enrolling as part of installation on MacOS, skip the second attempt to fix permissions.
var fixPermissions bool = fromInstall
fixPermissions := fromInstall
if runtime.GOOS == "darwin" {
fixPermissions = false
}
Expand Down
67 changes: 49 additions & 18 deletions internal/pkg/agent/cmd/enroll_cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ func newEnrollCmd(
)
}

// newEnrollCmdWithStore creates an new enrollment and accept a custom store.
// newEnrollCmdWithStore creates a new enrollment and accept a custom store.
func newEnrollCmdWithStore(
log *logger.Logger,
options *enrollCmdOption,
Expand All @@ -187,10 +187,11 @@ func newEnrollCmdWithStore(
}, nil
}

// Execute tries to enroll the agent into Fleet.
// Execute enrolls the agent into Fleet.
func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
var err error
defer c.stopAgent() // ensure its stopped no matter what

span, ctx := apm.StartSpan(ctx, "enroll", "app.internal")
defer func() {
apm.CaptureError(ctx, err).Send()
Expand Down Expand Up @@ -235,7 +236,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
// Ensure that the agent does not use a proxy configuration
// when connecting to the local fleet server.
// Note that when running fleet-server the enroll request will be sent to :8220,
// however when the agent is running afterwards requests will be sent to :8221
// however when the agent is running afterward requests will be sent to :8221
c.remoteConfig.Transport.Proxy.Disable = true
}

Expand All @@ -256,7 +257,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {

err = c.enrollWithBackoff(ctx, persistentConfig)
if err != nil {
return errors.New(err, "fail to enroll")
return fmt.Errorf("fail to enroll: %w", err)
}

if c.options.FixPermissions {
Expand All @@ -267,17 +268,23 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
}

defer func() {
fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.")
if err != nil {
fmt.Fprintf(streams.Err, "Something went wrong while enrolling the Elastic Agent: %v\n", err)
} else {
fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.")
}
}()

if c.agentProc == nil {
if err := c.daemonReload(ctx); err != nil {
c.log.Infow("Elastic Agent might not be running; unable to trigger restart", "error", err)
} else {
c.log.Info("Successfully triggered restart on running Elastic Agent.")
if err = c.daemonReloadWithBackoff(ctx); err != nil {
c.log.Errorf("Elastic Agent might not be running; unable to trigger restart: %v", err)
return fmt.Errorf("could not reload agent daemon, unable to trigger restart: %w", err)
ycombinator marked this conversation as resolved.
Show resolved Hide resolved
}

c.log.Info("Successfully triggered restart on running Elastic Agent.")
return nil
}

c.log.Info("Elastic Agent has been enrolled; start Elastic Agent")
return nil
}
Expand Down Expand Up @@ -443,24 +450,35 @@ func (c *enrollCmd) prepareFleetTLS() error {

func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error {
ycombinator marked this conversation as resolved.
Show resolved Hide resolved
err := c.daemonReload(ctx)
if err != nil &&
(errors.Is(err, context.DeadlineExceeded) ||
errors.Is(err, context.Canceled)) {
return fmt.Errorf("could not reload daemon: %w", err)
}
if err == nil {
return nil
}

signal := make(chan struct{})
defer close(signal)
backExp := backoff.NewExpBackoff(signal, 10*time.Second, 1*time.Minute)

for i := 5; i >= 0; i-- {
for i := 0; i < 5; i++ {
backExp.Wait()
c.log.Info("Retrying to restart...")
err = c.daemonReload(ctx)
if err != nil &&
(errors.Is(err, context.DeadlineExceeded) ||
errors.Is(err, context.Canceled)) {
return fmt.Errorf("could not reload daemon after %d retries: %w",
i+1, err)
}
if err == nil {
break
return nil
}
}

close(signal)
return err
return fmt.Errorf("could not reload agent's daemon, all retries failed. Last error: %w", err)
}

func (c *enrollCmd) daemonReload(ctx context.Context) error {
Expand All @@ -478,8 +496,20 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[

c.log.Infof("Starting enrollment to URL: %s", c.client.URI())
err := c.enroll(ctx, persistentConfig)
if err == nil {
return nil
}

const deadline = 10 * time.Minute
const frequency = 60 * time.Second

c.log.Infof("1st enrollment attempt failed, retrying for %s, every %s enrolling to URL: %s",
deadline,
frequency,
c.client.URI())
signal := make(chan struct{})
backExp := backoff.NewExpBackoff(signal, 60*time.Second, 10*time.Minute)
defer close(signal)
backExp := backoff.NewExpBackoff(signal, frequency, deadline)

for {
retry := false
Expand All @@ -498,7 +528,6 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[
err = c.enroll(ctx, persistentConfig)
}

close(signal)
return err
}

Expand Down Expand Up @@ -547,8 +576,10 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte
c.options.FleetServer.ElasticsearchInsecure,
)
if err != nil {
return err
return fmt.Errorf(
"failed creating fleet-server bootstrap config: %w", err)
}

// no longer need bootstrap at this point
serverConfig.Server.Bootstrap = false
fleetConfig.Server = serverConfig.Server
Expand All @@ -568,11 +599,11 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte

reader, err := yamlToReader(configToStore)
if err != nil {
return err
return fmt.Errorf("yamlToReader failed: %w", err)
}

if err := safelyStoreAgentInfo(c.configStore, reader); err != nil {
return err
return fmt.Errorf("failed to store agent config: %w", err)
}

// clear action store
Expand Down
71 changes: 50 additions & 21 deletions internal/pkg/agent/cmd/enroll_cmd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@ import (
"os"
"runtime"
"strconv"
"strings"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/elastic/elastic-agent/internal/pkg/agent/configuration"
Expand Down Expand Up @@ -159,14 +162,24 @@ func TestEnroll(t *testing.T) {
require.NoError(t, err)

streams, _, _, _ := cli.NewTestingIOStreams()
err = cmd.Execute(context.Background(), streams)
require.NoError(t, err)
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
defer cancel()
err = cmd.Execute(ctx, streams)

if err != nil &&
ycombinator marked this conversation as resolved.
Show resolved Hide resolved
// There is no agent running, therefore nothing to be restarted.
// However, this will cause the Enroll command to return an error
// which we'll ignore here.
!strings.Contains(err.Error(),
"could not reload agent daemon, unable to trigger restart") {
t.Fatalf("enrrol coms returned and unexpected error: %v", err)
ycombinator marked this conversation as resolved.
Show resolved Hide resolved
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From the description it seems that we expect the error about not being able to trigger a restart, we can simplify with:

Suggested change
if err != nil &&
// There is no agent running, therefore nothing to be restarted.
// However, this will cause the Enroll command to return an error
// which we'll ignore here.
!strings.Contains(err.Error(),
"could not reload agent daemon, unable to trigger restart") {
t.Fatalf("enrrol coms returned and unexpected error: %v", err)
}
// There is no agent running, therefore nothing to be restarted.
// However, this will cause the Enroll command to return an error
// which we'll ignore here.
require.ErrorContainsf(t, err, "could not reload agent daemon, unable to trigger restart", "enroll command returned an unexpected error: %v", err)


config, err := readConfig(store.Content)

require.NoError(t, err)
require.Equal(t, "my-access-api-key", config.AccessAPIKey)
require.Equal(t, host, config.Client.Host)

assert.Equal(t, "my-access-api-key", config.AccessAPIKey)
assert.Equal(t, host, config.Client.Host)
},
))

Expand Down Expand Up @@ -216,16 +229,24 @@ func TestEnroll(t *testing.T) {
require.NoError(t, err)

streams, _, _, _ := cli.NewTestingIOStreams()
err = cmd.Execute(context.Background(), streams)
require.NoError(t, err)

require.True(t, store.Called)

ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
defer cancel()
err = cmd.Execute(ctx, streams)
if err != nil &&
// There is no agent running, therefore nothing to be restarted.
// However, this will cause the Enroll command to return an error
// which we'll ignore here.
!strings.Contains(err.Error(),
"could not reload agent daemon, unable to trigger restart") {
t.Fatalf("enrrol coms returned and unexpected error: %v", err)
}

assert.True(t, store.Called)
config, err := readConfig(store.Content)

require.NoError(t, err)
require.Equal(t, "my-access-api-key", config.AccessAPIKey)
require.Equal(t, host, config.Client.Host)
assert.NoError(t, err)
assert.Equal(t, "my-access-api-key", config.AccessAPIKey)
assert.Equal(t, host, config.Client.Host)
},
))

Expand Down Expand Up @@ -275,16 +296,24 @@ func TestEnroll(t *testing.T) {
require.NoError(t, err)

streams, _, _, _ := cli.NewTestingIOStreams()
err = cmd.Execute(context.Background(), streams)
require.NoError(t, err)

require.True(t, store.Called)

ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
defer cancel()
err = cmd.Execute(ctx, streams)

if err != nil &&
// There is no agent running, therefore nothing to be restarted.
// However, this will cause the Enroll command to return an error
// which we'll ignore here.
!strings.Contains(err.Error(),
"could not reload agent daemon, unable to trigger restart") {
t.Fatalf("enrrol coms returned and unexpected error: %v", err)
}

assert.True(t, store.Called)
config, err := readConfig(store.Content)

require.NoError(t, err)
require.Equal(t, "my-access-api-key", config.AccessAPIKey)
require.Equal(t, host, config.Client.Host)
assert.Equal(t, "my-access-api-key", config.AccessAPIKey)
assert.Equal(t, host, config.Client.Host)
},
))

Expand Down
4 changes: 3 additions & 1 deletion internal/pkg/agent/cmd/install.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error {
return fmt.Errorf("problem reading prompt response")
}
if url == "" {
fmt.Fprintf(streams.Out, "Enrollment cancelled because no URL was provided.\n")
fmt.Fprintln(streams.Out, "Enrollment cancelled because no URL was provided.")
return nil
}
}
Expand Down Expand Up @@ -224,6 +224,8 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error {
}
}()
}

fmt.Fprintln(streams.Out, "Elastic Agent successfully installed, starting enrollment.")
}

if enroll {
Expand Down
32 changes: 20 additions & 12 deletions internal/pkg/agent/install/perms_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ package install

import (
"errors"
"fmt"
"io/fs"
"os"
"path/filepath"
Expand All @@ -18,19 +19,26 @@ func fixPermissions(topPath string) error {
return recursiveRootPermissions(topPath)
}

func recursiveRootPermissions(path string) error {
return filepath.Walk(path, func(name string, info fs.FileInfo, err error) error {
if err == nil {
// all files should be owned by root:root
err = os.Chown(name, 0, 0)
if err != nil {
return err
}
// remove any world permissions from the file
err = os.Chmod(name, info.Mode().Perm()&0770)
} else if errors.Is(err, fs.ErrNotExist) {
func recursiveRootPermissions(root string) error {
ycombinator marked this conversation as resolved.
Show resolved Hide resolved
return filepath.Walk(root, func(path string, info fs.FileInfo, err error) error {
if errors.Is(err, fs.ErrNotExist) {
return nil
}
return err
if err != nil {
return fmt.Errorf("walk on %q failed: %w", path, err)
}

// all files should be owned by root:root
err = os.Chown(path, 0, 0)
if err != nil {
return fmt.Errorf("could not fix ownership of %q: %w", path, err)
}
// remove any world permissions from the file
err = os.Chmod(path, info.Mode().Perm()&0770)
if err != nil {
return fmt.Errorf("could not fix permissions of %q: %w", path, err)
}

return nil
})
}
Loading