From eddcfa93b45c375471d6530266b02470ffca447d Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Tue, 25 Oct 2022 11:49:59 -0700 Subject: [PATCH 01/32] Add agent diagnostics action Add the ability for the agent to recieve a diagnostics action. This action makes the agent take a diagnostics bundle and upload it to a new fleet-server file upload API. Changed internal details of how diagnostics bundles are taken to better support this: Hooks are provided through a callback func so that log files generated as the agent is running can be included in the bundle. Change the Hook func signature from returning a []byte to a []byte, time.Time so that the file mod times collected through diagnostics hooks can be returned from the same filesystem interaction. Upload client has a built-in retry mechanism that will resend requests if a 429 is recieved. --- .../handlers/handler_action_diagnostics.go | 104 ++++++ .../application/coordinator/coordinator.go | 353 ++++++++++++------ .../pkg/agent/application/managed_mode.go | 10 + internal/pkg/agent/control/server/server.go | 5 +- internal/pkg/core/monitoring/config/config.go | 19 + internal/pkg/diagnostics/diagnostics.go | 107 +++++- internal/pkg/fleetapi/action.go | 27 ++ internal/pkg/fleetapi/uploader/uploader.go | 227 +++++++++++ .../pkg/fleetapi/uploader/uploader_test.go | 216 +++++++++++ 9 files changed, 947 insertions(+), 121 deletions(-) create mode 100644 internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go create mode 100644 internal/pkg/fleetapi/uploader/uploader.go create mode 100644 internal/pkg/fleetapi/uploader/uploader_test.go diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go new file mode 100644 index 00000000000..f5158892752 --- /dev/null +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -0,0 +1,104 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +package handlers + +import ( + "bytes" + "context" + "fmt" + + "github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator" + "github.com/elastic/elastic-agent/internal/pkg/agent/control/client" + "github.com/elastic/elastic-agent/internal/pkg/diagnostics" + "github.com/elastic/elastic-agent/internal/pkg/fleetapi" + "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker" + "github.com/elastic/elastic-agent/pkg/core/logger" +) + +// Uploader is the interface used to upload a diagnostics bundle to fleet-server. +type Uploader interface { + UploadDiagnostics(context.Context, string, *bytes.Buffer) error +} + +// Diagnostics is the handler to process Diagnostics actions. +// When a Diagnostics action is received a full diagnostics bundle is taken and uploaded to fleet-server. +type Diagnostics struct { + log *logger.Logger + coord *coordinator.Coordinator // TODO use of coordinator or control server/client? + uploader Uploader +} + +// NewDiagnostics returns a new Diagnostics handler. +func NewDiagnostics(log *logger.Logger, coord *coordinator.Coordinator, uploader Uploader) *Diagnostics { + return &Diagnostics{ + log: log, + coord: coord, + uploader: uploader, + } +} + +// Handle processes the passed Diagnostics action. 
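// Roughly, the flow below is: collect the global and coordinator diagnostic hooks,
// run the per-unit runtime diagnostics, zip everything into an in-memory buffer,
// upload the archive under the action ID via the Uploader, and ack the action.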
+func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.Acker) error { + h.log.Debugf("handlerDiagnostics: action '%+v' received", a) + action, ok := a.(*fleetapi.ActionDiagnostics) + if !ok { + return fmt.Errorf("invalid type, expected ActionDiagnostics and received %T", a) + } + + // Gather agent diagnostics + diagHooks := append(diagnostics.GlobalHooks(), h.coord.DiagnosticHooks()()...) + aDiag := make([]client.DiagnosticFileResult, 0, len(diagHooks)) + for _, hook := range diagHooks { + if ctx.Err() != nil { + return ctx.Err() + } + + p, ts := hook.Hook(ctx) + aDiag = append(aDiag, client.DiagnosticFileResult{ + Name: hook.Name, + Filename: hook.Filename, + Description: hook.Description, + ContentType: hook.ContentType, + Content: p, + Generated: ts, + }) + } + + runtimeDiag := h.coord.PerformDiagnostics(ctx) + uDiag := make([]client.DiagnosticUnitResult, 0, len(runtimeDiag)) + for _, diag := range runtimeDiag { + files := make([]client.DiagnosticFileResult, 0, diag.Results) + for _, f := range diag.Results { + files = append(files, client.DiagnosticFileResult{ + Name: f.Name, + Filename: f.Filename, + Description: f.Description, + ContentType: f.ContentType, + Content: f.Content, + Generated: f.Generated.AsTime(), + }) + } + uDiag = append(uDiag, client.DiagnosticUnitResult{ + ComponentID: diag.Component.ID, + UnitID: diag.Unit.ID, + UnitType: diag.Unit.Type, + Err: diag.Err, + Results: files, + }) + } + + var b bytes.Buffer + err := diagnostics.ZipArchive(b, aDiag, uDiag) // TODO Do we want to pass a buffer/a reader around? or write the file to a temp dir and read (to avoid memory usage)? file usage may need more thought for containerized deployments + if err != nil { + return fmt.Errorf("error creating diagnostics bundle: %w", err) + } + + err = h.uploader.UploadDiagnostics(ctx, action.ActionID, &b) + _ = ack.Ack(ctx, action) // TODO ack should have the file upload ID in it + if err != nil { + return fmt.Errorf("unable to upload diagnostics: %w", err) + } + return nil +} diff --git a/internal/pkg/agent/application/coordinator/coordinator.go b/internal/pkg/agent/application/coordinator/coordinator.go index 48a476a5164..8302dc39406 100644 --- a/internal/pkg/agent/application/coordinator/coordinator.go +++ b/internal/pkg/agent/application/coordinator/coordinator.go @@ -8,23 +8,28 @@ import ( "context" "errors" "fmt" + "io" + "io/fs" + "os" + "path/filepath" + "strings" + "time" "gopkg.in/yaml.v2" - "github.com/elastic/elastic-agent/internal/pkg/diagnostics" - "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker" - - "github.com/elastic/elastic-agent/internal/pkg/fleetapi" - + "github.com/elastic/elastic-agent-client/v7/pkg/client" "go.elastic.co/apm" - "github.com/elastic/elastic-agent-client/v7/pkg/client" "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/internal/pkg/agent/application/reexec" agentclient "github.com/elastic/elastic-agent/internal/pkg/agent/control/client" "github.com/elastic/elastic-agent/internal/pkg/agent/transpiler" "github.com/elastic/elastic-agent/internal/pkg/capabilities" "github.com/elastic/elastic-agent/internal/pkg/config" + "github.com/elastic/elastic-agent/internal/pkg/diagnostics" + "github.com/elastic/elastic-agent/internal/pkg/fleetapi" + "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker" "github.com/elastic/elastic-agent/pkg/component" 
"github.com/elastic/elastic-agent/pkg/component/runtime" "github.com/elastic/elastic-agent/pkg/core/logger" @@ -285,7 +290,7 @@ func (c *Coordinator) Upgrade(ctx context.Context, version string, sourceURI str return nil } -// AckUpgrade performs acknowledgement for upgrade. +// AckUpgrade is the method used on startup to ack a prevously successful upgrade action. func (c *Coordinator) AckUpgrade(ctx context.Context, acker acker.Acker) error { return c.upgradeMgr.Ack(ctx, acker) } @@ -350,117 +355,245 @@ func (c *Coordinator) Run(ctx context.Context) error { } } -// DiagnosticHooks returns diagnostic hooks that can be connected to the control server to provide diagnostic -// information about the state of the Elastic Agent. -func (c *Coordinator) DiagnosticHooks() diagnostics.Hooks { - return diagnostics.Hooks{ - { - Name: "pre-config", - Filename: "pre-config.yaml", - Description: "current pre-configuration of the running Elastic Agent before variable substitution", - ContentType: "application/yaml", - Hook: func(_ context.Context) []byte { - if c.state.ast == nil { - return []byte("error: failed no configuration by the coordinator") - } - cfg, err := c.state.ast.Map() - if err != nil { - return []byte(fmt.Sprintf("error: %q", err)) - } - o, err := yaml.Marshal(cfg) - if err != nil { - return []byte(fmt.Sprintf("error: %q", err)) - } - return o +// DiagnosticHooks returns diagnostic hooks callback that can be connected to the control server to provide +// diagnostic information about the state of the Elastic Agent. A callback is used in order to all log files +// at the time of diagnostics. +func (c *Coordinator) DiagnosticHooks() func() diagnostics.Hooks { + return func() diagnostics.Hooks { + hooks := diagnostics.Hooks{ + diagnostics.Hook{ + Name: "pre-config", + Filename: "pre-config.yaml", + Description: "current pre-configuration of the running Elastic Agent before variable substitution", + ContentType: "application/yaml", + Hook: func(_ context.Context) ([]byte, time.Time) { + ts := time.Now().UTC() + if c.state.ast == nil { + return []byte("error: failed no configuration by the coordinator"), ts + } + cfg, err := c.state.ast.Map() + if err != nil { + return []byte(fmt.Sprintf("error: %q", err)), ts + } + o, err := yaml.Marshal(cfg) + if err != nil { + return []byte(fmt.Sprintf("error: %q", err)), ts + } + return o, ts + }, }, - }, - { - Name: "variables", - Filename: "variables.yaml", - Description: "current variable contexts of the running Elastic Agent", - ContentType: "application/yaml", - Hook: func(_ context.Context) []byte { - if c.state.vars == nil { - return []byte("error: failed no variables by the coordinator") - } - vars := make([]map[string]interface{}, 0, len(c.state.vars)) - for _, v := range c.state.vars { - m, err := v.Map() + diagnostics.Hook{ + Name: "variables", + Filename: "variables.yaml", + Description: "current variable contexts of the running Elastic Agent", + ContentType: "application/yaml", + Hook: func(_ context.Context) ([]byte, time.Time) { + ts := time.Now().UTC() + if c.state.vars == nil { + return []byte("error: failed no variables by the coordinator"), ts + } + vars := make([]map[string]interface{}, 0, len(c.state.vars)) + for _, v := range c.state.vars { + m, err := v.Map() + if err != nil { + return []byte(fmt.Sprintf("error: %q", err)), ts + } + vars = append(vars, m) + } + o, err := yaml.Marshal(struct { + Variables []map[string]interface{} `yaml:"variables"` + }{ + Variables: vars, + }) if err != nil { - return []byte(fmt.Sprintf("error: %q", 
err)) + return []byte(fmt.Sprintf("error: %q", err)), ts } - vars = append(vars, m) - } - o, err := yaml.Marshal(struct { - Variables []map[string]interface{} `yaml:"variables"` - }{ - Variables: vars, - }) - if err != nil { - return []byte(fmt.Sprintf("error: %q", err)) - } - return o + return o, ts + }, }, - }, - { - Name: "computed-config", - Filename: "computed-config.yaml", - Description: "current computed configuration of the running Elastic Agent after variable substitution", - ContentType: "application/yaml", - Hook: func(_ context.Context) []byte { - if c.state.ast == nil || c.state.vars == nil { - return []byte("error: failed no configuration or variables received by the coordinator") - } - cfg, _, err := c.compute() - if err != nil { - return []byte(fmt.Sprintf("error: %q", err)) - } - o, err := yaml.Marshal(cfg) - if err != nil { - return []byte(fmt.Sprintf("error: %q", err)) - } - return o + diagnostics.Hook{ + Name: "computed-config", + Filename: "computed-config.yaml", + Description: "current computed configuration of the running Elastic Agent after variable substitution", + ContentType: "application/yaml", + Hook: func(_ context.Context) ([]byte, time.Time) { + ts := time.Now().UTC() + if c.state.ast == nil || c.state.vars == nil { + return []byte("error: failed no configuration or variables received by the coordinator"), ts + } + cfg, _, err := c.compute() + if err != nil { + return []byte(fmt.Sprintf("error: %q", err)), ts + } + o, err := yaml.Marshal(cfg) + if err != nil { + return []byte(fmt.Sprintf("error: %q", err)), ts + } + return o, ts + }, }, - }, - { - Name: "components", - Filename: "components.yaml", - Description: "current expected components model of the running Elastic Agent", - ContentType: "application/yaml", - Hook: func(_ context.Context) []byte { - if c.state.ast == nil || c.state.vars == nil { - return []byte("error: failed no configuration or variables received by the coordinator") - } - _, comps, err := c.compute() - if err != nil { - return []byte(fmt.Sprintf("error: %q", err)) - } - o, err := yaml.Marshal(struct { - Components []component.Component `yaml:"components"` - }{ - Components: comps, - }) - if err != nil { - return []byte(fmt.Sprintf("error: %q", err)) - } - return o + diagnostics.Hook{ + Name: "components", + Filename: "components.yaml", + Description: "current expected components model of the running Elastic Agent", + ContentType: "application/yaml", + Hook: func(_ context.Context) ([]byte, time.Time) { + ts := time.Now().UTC() + if c.state.ast == nil || c.state.vars == nil { + return []byte("error: failed no configuration or variables received by the coordinator"), ts + } + _, comps, err := c.compute() + if err != nil { + return []byte(fmt.Sprintf("error: %q", err)), ts + } + o, err := yaml.Marshal(struct { + Components []component.Component `yaml:"components"` + }{ + Components: comps, + }) + if err != nil { + return []byte(fmt.Sprintf("error: %q", err)), ts + } + return o, ts + }, }, - }, - { - Name: "state", - Filename: "state.yaml", - Description: "current state of running components by the Elastic Agent", - ContentType: "application/yaml", - Hook: func(_ context.Context) []byte { - s := c.State(true) - o, err := yaml.Marshal(s) - if err != nil { - return []byte(fmt.Sprintf("error: %q", err)) - } - return o + diagnostics.Hook{ + Name: "state", + Filename: "state.yaml", + Description: "current state of running components by the Elastic Agent", + ContentType: "application/yaml", + Hook: func(_ context.Context) ([]byte, time.Time) { + 
ts := time.Now().UTC() + s := c.State(true) + o, err := yaml.Marshal(s) + if err != nil { + return []byte(fmt.Sprintf("error: %q", err)), ts + } + return o, ts + }, }, - }, + } + } +} + +func (c *Coordinator) addLogHooks() []diagnostics.Hook { + hooks := []diagnostics.Hook{{ + Name: "logs directory", + Filename: "logs/", + Description: "The logs directory for the agent.", + ContentType: diagnostics.ContentTypeDirectory, + Hook: func(_ context.Context) ([]byte, time.Time) { return nil, time.Now().UTC() }, + }} + + logPath := filepath.Join(paths.Home(), "logs") + string(filepath.Separator) + err := filepath.WalkDir(logPath, func(path string, d fs.DirEntry, fErr error) error { + if errors.Is(fErr, fs.ErrNotExist) { + return nil + } + if fErr != nil { + return fmt.Errorf("unable to walk log dir: %w", fErr) + } + name := filepath.ToSlash(strings.TrimPrefix(path, logPath)) + if name == "" { + return nil + } + if d.IsDir() { + // TODO check if subdirectories in the logs dir can exist. + hooks = append(hooks, diagnostics.Hook{ + Name: "logs subdirectory", + Filename: "logs/" + name + "/", + Description: "A logs subdirectory", + ContentType: diagnostics.ContentTypeDirectory, + Hook: func(_ context.Context) ([]byte, time.Time) { return nil, time.Now().UTC() }, + }) + } else { + hooks = append(hooks, diagnostics.Hook{ + Name: "log file", + Filename: "logs/" + name, + ContentType: "text/plain", + Hook: func(_ context.Context) ([]byte, time.Time) { + ts := time.Now().UTC() + lf, err := os.Open(path) + if err != nil { + return []byte(fmt.Sprintf("unable to open log file: %v", err)), ts + } + if stat, err := lf.Stat(); err == nil { + ts = stat.ModTime().UTC() + } + defer lf.Close() + p, err := io.ReadAll(lf) + if err != nil { + return []byte(fmt.Sprintf("unable to read log file: %v", err)), ts + } + return p, ts + }, + }) + } + return nil + }) + if err != nil { + c.logger.With("err", err).Error("Unable to gather agent logs") + } + return hooks +} + +func (c *Coordinator) addServiceLogHooks() []diagnostics.Hook { + hooks := []diagnostics.Hook{{ + Name: "services log dir", + Filename: "services/", + ContentType: diagnostics.ContentTypeDirectory, + Hook: func(_ context.Context) []byte { return nil }, + }} + for _, spec := range c.specs.ServiceSpecs() { + if spec.Spec.Service.Log == nil || spec.Spec.Service.Log.Path == "" { + // no log path set in specification + continue + } + + logPath := filepath.Dir(spec.Spec.Service.Log.Path) + string(filepath.Separator) + err := filepath.WalkDir(logPath, func(path string, d fs.DirEntry, fErr error) error { + if errors.Is(fErr, fs.ErrNotExist) { + return nil + } + if fErr != nil { + return fmt.Errorf("unable to walk log directory %q for service input %s: %w", logPath, spec.InputType, fErr) + } + name := filepath.ToSlash(strings.TrimPrefix(path, logPath)) + if name == "" { + return nil + } + if d.IsDir() { + return nil + } + hooks = append(hooks, diagnostics.Hook{ + Name: "service logs", + Filename: "services/" + name, + ContentType: "text/plain", + Hook: func(_ context.Context) ([]byte, time.Time) { + ts := time.Now().UTC() + lf, err := os.Open(path) + if err != nil { + return []byte(fmt.Sprintf("unable to open log file: %v", err)), ts + } + if stat, err := lf.Stat(); err == nil { + ts = stat.ModTime().UTC() + } + defer lf.Close() + p, err := io.ReadAll(lf) + if err != nil { + return []byte(fmt.Sprintf("unable to read log file: %v", err)), ts + } + return p, ts + + }, + }) + return nil + }) + if err != nil { + c.logger.With("err", err).Error("Unable to gather component 
logs") + } } + return hooks } // runner performs the actual work of running all the managers diff --git a/internal/pkg/agent/application/managed_mode.go b/internal/pkg/agent/application/managed_mode.go index af53e150888..f614029737d 100644 --- a/internal/pkg/agent/application/managed_mode.go +++ b/internal/pkg/agent/application/managed_mode.go @@ -25,6 +25,7 @@ import ( "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker/lazy" "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker/retrier" fleetclient "github.com/elastic/elastic-agent/internal/pkg/fleetapi/client" + "github.com/elastic/elastic-agent/internal/pkg/fleetapi/uploader" "github.com/elastic/elastic-agent/internal/pkg/queue" "github.com/elastic/elastic-agent/internal/pkg/remote" "github.com/elastic/elastic-agent/internal/pkg/runner" @@ -348,6 +349,15 @@ func (m *managedConfigManager) initDispatcher(canceller context.CancelFunc) *han ), ) + m.dispatcher.MustRegister( + &fleetapi.ActionDiagnostics{}, + handlers.NewDiagnostics( + m.log, + m.coord, + uploader.New(m.agentInfo.AgentID(), m.client, m.cfg.Settings.MonitoringConfig.Uploader), + ), + ) + m.dispatcher.MustRegister( &fleetapi.ActionApp{}, handlers.NewAppAction(m.log, m.coord), diff --git a/internal/pkg/agent/control/server/server.go b/internal/pkg/agent/control/server/server.go index 67fe85fab2b..7e6b6799bbb 100644 --- a/internal/pkg/agent/control/server/server.go +++ b/internal/pkg/agent/control/server/server.go @@ -9,7 +9,6 @@ import ( "encoding/json" "fmt" "net" - "time" "github.com/elastic/elastic-agent/pkg/component/runtime" @@ -186,14 +185,14 @@ func (s *Server) DiagnosticAgent(ctx context.Context, _ *cproto.DiagnosticAgentR if ctx.Err() != nil { return nil, ctx.Err() } - r := h.Hook(ctx) + r, ts := h.Hook(ctx) res = append(res, &cproto.DiagnosticFileResult{ Name: h.Name, Filename: h.Filename, Description: h.Description, ContentType: h.ContentType, Content: r, - Generated: timestamppb.New(time.Now().UTC()), + Generated: timestamppb.New(ts), }) } if ctx.Err() != nil { diff --git a/internal/pkg/core/monitoring/config/config.go b/internal/pkg/core/monitoring/config/config.go index bf5edd9716f..7366c0eed44 100644 --- a/internal/pkg/core/monitoring/config/config.go +++ b/internal/pkg/core/monitoring/config/config.go @@ -4,6 +4,8 @@ package config +import "time" + const defaultPort = 6791 const defaultNamespace = "default" @@ -18,6 +20,7 @@ type MonitoringConfig struct { Pprof *PprofConfig `yaml:"pprof" config:"pprof"` MonitorTraces bool `yaml:"traces" config:"traces"` APM APMConfig `yaml:"apm,omitempty" config:"apm,omitempty" json:"apm,omitempty"` + Uploader Uploader `yaml:"uploader,omitempty" json:"uploader,omitempty"` } // MonitoringHTTPConfig is a config defining HTTP endpoint published by agent @@ -57,6 +60,7 @@ func DefaultConfig() *MonitoringConfig { }, Namespace: defaultNamespace, APM: defaultAPMConfig(), + Uploader: defaultUploader(), } } @@ -80,3 +84,18 @@ type APMTLS struct { func defaultAPMConfig() APMConfig { return APMConfig{} } + +// Uploader contains the configuration for retries when uploading a file (diagnostics bundle) to fleet-server. 
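// For illustration only (the exact key path depends on where MonitoringConfig is
// mounted in the agent configuration, typically under agent.monitoring), the defaults
// below would correspond to YAML along these lines:
//
//	uploader:
//	  max_retries: 10
//	  init_duration: 1s
//	  max_duration: 10m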
+type Uploader struct { + MaxRetries int `config:"max_retries"` + InitDur time.Duration `config:"init_duration"` + MaxDur time.Duration `config:"max_duration"` +} + +func defaultUploader() Uploader { + return Uploader{ + MaxRetries: 10, + InitDur: time.Second, + MaxDur: time.Minute * 10, + } +} diff --git a/internal/pkg/diagnostics/diagnostics.go b/internal/pkg/diagnostics/diagnostics.go index a20e990c0ee..cb629496ef1 100644 --- a/internal/pkg/diagnostics/diagnostics.go +++ b/internal/pkg/diagnostics/diagnostics.go @@ -5,13 +5,18 @@ package diagnostics import ( + "archive/zip" "bytes" "context" "fmt" + "io" "runtime/pprof" + "strings" + "time" "gopkg.in/yaml.v2" + "github.com/elastic/elastic-agent/internal/pkg/agent/control/client" "github.com/elastic/elastic-agent/internal/pkg/release" ) @@ -21,7 +26,7 @@ type Hook struct { Filename string Description string ContentType string - Hook func(ctx context.Context) []byte + Hook func(ctx context.Context) ([]byte, time.Time) } // Hooks is a set of diagnostic hooks. @@ -35,13 +40,13 @@ func GlobalHooks() Hooks { Filename: "version.txt", Description: "version information", ContentType: "application/yaml", - Hook: func(_ context.Context) []byte { + Hook: func(_ context.Context) ([]byte, time.Time) { v := release.Info() o, err := yaml.Marshal(v) if err != nil { - return []byte(fmt.Sprintf("error: %q", err)) + return []byte(fmt.Sprintf("error: %q", err)), time.Now().UTC() } - return o + return o, time.Now().UTC() }, }, { @@ -89,14 +94,100 @@ func GlobalHooks() Hooks { } } -func pprofDiag(name string) func(context.Context) []byte { - return func(_ context.Context) []byte { +func pprofDiag(name string) func(context.Context) ([]byte, time.Time) { + return func(_ context.Context) ([]byte, time.Time) { var w bytes.Buffer err := pprof.Lookup(name).WriteTo(&w, 1) if err != nil { // error is returned as the content - return []byte(fmt.Sprintf("failed to write pprof to bytes buffer: %s", err)) + return []byte(fmt.Sprintf("failed to write pprof to bytes buffer: %s", err)), time.Now().UTC() } - return w.Bytes() + return w.Bytes(), time.Now().UTC() } } + +// ZipArchive creates a zipped diagnostics bundle using the passed writer with the passed diagnostics. +// If any error is encounted when writing the contents of the archive it is returned. 
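// The resulting layout, sketched from the implementation below: agent-level results are
// written at the top of the archive (directory entries such as logs/ first, then files,
// using each hook's Generated timestamp as the zip Modified time), and unit results are
// written as components/<component>/<unit>/<file> (IDs sanitized for use as directory
// names), with an error.txt emitted when a unit reported an error instead of results.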
+func ZipArchive(w io.Writer, agentDiag []client.DiagnosticFileResult, unitDiags []client.DiagnosticUnitResult) error { + zw := zip.NewWriter(w) + defer zw.Close() + // Create directories in the zip archive before writing any files + for _, ad := range agentDiag { + if ad.ContentType == ContentTypeDirectory { + _, err := zw.Create(ad.Filename) + if err != nil { + return err + } + } + } + // Write agent diagnostics content + for _, ad := range agentDiag { + if ad.ContentType != ContentTypeDirectory { + zf, err := zw.CreateHeader(&zip.FileHeader{ + Name: ad.Filename, + Method: zip.Deflate, + Modified: ad.Generated, + }) + if err != nil { + return err + } + _, err = zf.Write(ad.Content) + if err != nil { + return err + } + } + } + + // Handle unit diagnostics + // structure each unit into its own component directory + compDirs := make(map[string][]client.DiagnosticUnitResult) + for _, ud := range unitDiags { + compDir := strings.ReplaceAll(ud.ComponentID, "/", "-") + compDirs[compDir] = append(compDirs[compDir], ud) + } + // write each units diagnostics into its own directory + // layout becomes components/// + _, err := zw.Create("components/") + if err != nil { + return err + } + for dirName, units := range compDirs { + _, err = zw.Create(fmt.Sprintf("components/%s/", dirName)) + if err != nil { + return err + } + for _, ud := range units { + unitDir := strings.ReplaceAll(strings.TrimPrefix(ud.UnitID, ud.ComponentID+"-"), "/", "-") + _, err = zw.Create(fmt.Sprintf("components/%s/%s/", dirName, unitDir)) + if err != nil { + return err + } + if ud.Err != nil { + w, err := zw.Create(fmt.Sprintf("components/%s/%s/error.txt", dirName, unitDir)) + if err != nil { + return err + } + _, err = w.Write([]byte(fmt.Sprintf("%s\n", ud.Err))) + if err != nil { + return err + } + continue + } + for _, fr := range ud.Results { + w, err := zw.CreateHeader(&zip.FileHeader{ + Name: fmt.Sprintf("components/%s/%s/%s", dirName, unitDir, fr.Name), + Method: zip.Deflate, + Modified: fr.Generated, + }) + if err != nil { + return err + } + _, err = w.Write(fr.Content) + if err != nil { + return err + } + } + } + } + return nil +} diff --git a/internal/pkg/fleetapi/action.go b/internal/pkg/fleetapi/action.go index f23ec6e89e4..7de4a4b7d40 100644 --- a/internal/pkg/fleetapi/action.go +++ b/internal/pkg/fleetapi/action.go @@ -33,6 +33,8 @@ const ( ActionTypeInputAction = "INPUT_ACTION" // ActionTypeCancel specifies a cancel action. ActionTypeCancel = "CANCEL" + // ActionTypeDiagnostics specifies a diagnostics action. + ActionTypeDiagnostics = "DIAGNOSTICS" ) // Error values that the Action interface can return @@ -348,6 +350,31 @@ func (a *ActionCancel) String() string { return s.String() } +// ActionDiagnostics is a request to gather and upload a diagnostics bundle. +type ActionDiagnostics struct { + ActionID string `json:"action_id"` + ActionType string `json:"type"` +} + +// ID returns the ID of the action. +func (a *ActionDiagnostics) ID() string { + return a.ActionID +} + +// Type returns the type of the action. +func (a *ActionDiagnostics) Type() string { + return a.ActionType +} + +func (a *ActionDiagnostics) String() string { + var s strings.Builder + s.WriteString("action_id: ") + s.WriteString(a.ActionID) + s.WriteString(", type: ") + s.WriteString(a.ActionType) + return s.String() +} + // ActionApp is the application action request. 
type ActionApp struct { ActionID string `json:"id" mapstructure:"id"` diff --git a/internal/pkg/fleetapi/uploader/uploader.go b/internal/pkg/fleetapi/uploader/uploader.go new file mode 100644 index 00000000000..3ffdfbbacfc --- /dev/null +++ b/internal/pkg/fleetapi/uploader/uploader.go @@ -0,0 +1,227 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +// Package uploader contains the methods needed to upload a file using fleet-server's upload endpoints. +package uploader + +import ( + "bytes" + "context" + "crypto/sha256" + "encoding/json" + "fmt" + "io" + "math" + "net/http" + "net/url" + "time" + + "github.com/elastic/elastic-agent/internal/pkg/core/backoff" + "github.com/elastic/elastic-agent/internal/pkg/core/monitoring/config" + "github.com/elastic/elastic-agent/internal/pkg/fleetapi/client" +) + +// The following constants correspond with fleer-server API paths for file uploads. +const ( + PathNewUpload = "/api/fleet/uploads" + PathChunk = "/api/fleet/uploads/%s/%d" + PathFinishUpload = "/api/fleet/uploads/%s" +) + +// FileData contains metadata about a file. +type FileData struct { + Size int64 `json:"size"` + Name string `json:"name"` + Extension string `json:"ext"` + Mime string `json:"mime_type"` + Compression string `json:"Compression"` + Hash struct { + SHA256 string `json:"sha256"` + MD5 string `json:"md5"` + } `json:"hash"` + Accessed string `json:"accessed"` + Attributes []string `json:"attributes"` + Created string `json:"created"` + CTime string `json:"ctime"` + Device string `json:"device"` + Directory string `json:"directory"` + DriveLetter string `json:"drive_letter"` + Ext string `json:"extension"` + GID string `json:"gid"` + Group string `json:"group"` + INode string `json:"inode"` + Mode string `json:"mode"` + MTime string `json:"mtime"` + Owner string `json:"owner"` + Path string `json:"path"` + TargetPath string `json:"target_path"` + Type string `json:"type"` + UID string `json:"uid"` +} + +// NewUploadRequest is the struct that is passed as the request body when starting a new file upload. +type NewUploadRequest struct { + AgentID string `json:"agent_id"` + ActionID string `json:"action_id"` + Source string `json:"source"` + File FileData `json:"file"` + Contents []FileData `json:"contents"` + Event struct { + ID string `json:"id"` + } `json:"event"` + Host struct { + Hostname string `json:"hostname"` + } `json:"host"` +} + +// NewUploadResponse is the body for the success case when requesting a new file upload. +type NewUploadResponse struct { + UploadID string `json:"upload_id"` + ChunkSize int64 `json:"chunk_size"` +} + +// retrySender wraps the underlying Sender with retry logic. +type retrySender struct { + c client.Sender + max int + wait backoff.Backoff +} + +// Send calls the underlying Sender's Send method. If a 429 status code is returned the request is retried after a backoff period. +// TODO What to do if another error or status is received? 
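// Retries are bounded by the configured maximum and spaced by the equal-jitter backoff
// built in New from the Uploader config; the request body is captured through a
// TeeReader on the first attempt so it can be replayed on each retry.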
+func (r *retrySender) Send(ctx context.Context, method, path string, params url.Values, headers http.Header, body io.Reader) (resp *http.Response, err error) { + r.wait.Reset() + + var b bytes.Buffer + tr := io.TeeReader(body, &b) + for i := 0; i < r.max; i++ { + resp, err = r.c.Send(ctx, method, path, params, headers, tr) + if err != nil { + return resp, err + } + if resp.StatusCode == http.StatusTooManyRequests { + tr = bytes.NewReader(b.Bytes()) + r.wait.Wait() + continue + } + return resp, err + } + return resp, err +} + +// URI calls the underlying Sender's URI method. +func (r *retrySender) URI() string { + return r.c.URI() +} + +// Client provides methods to upload a file to ES through fleet-server. +type Client struct { + agentID string + c client.Sender +} + +// New returns a new Client for the agent identified by the passed id. +// The sender is wrapped with retry logic specified by the Uploader config. +// Any request that would return a 429 (too many requests) is retried (up to maxRetries times) with a backoff. +func New(id string, c client.Sender, cfg config.Uploader) *Client { + return &Client{ + agentID: id, + c: &retrySender{ + c: c, + max: cfg.MaxRetries, + wait: backoff.NewEqualJitterBackoff(nil, cfg.InitDur, cfg.MaxDur), + }, + } +} + +// New sends a new file upload request to the fleet-server. +func (c *Client) New(ctx context.Context, r *NewUploadRequest) (*NewUploadResponse, error) { + b, err := json.Marshal(r) + if err != nil { + return nil, err + } + resp, err := c.c.Send(ctx, "POST", PathNewUpload, nil, nil, bytes.NewBuffer(b)) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, client.ExtractError(resp.Body) + } + + var newUploadResp NewUploadResponse + dec := json.NewDecoder(resp.Body) + if err := dec.Decode(&newUploadResp); err != nil { + return nil, err + } + return &newUploadResp, nil +} + +// Chunk uploads a file chunk to fleet-server. +func (c *Client) Chunk(ctx context.Context, uploadID string, chunkID int, r io.Reader) error { + resp, err := c.c.Send(ctx, "PUT", fmt.Sprintf(PathChunk, uploadID, chunkID), nil, nil, r) + if err != nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return client.ExtractError(resp.Body) + } + return nil +} + +// Finish calls the finalize endpoint for the file upload. +func (c *Client) Finish(ctx context.Context, id string) error { + resp, err := c.c.Send(ctx, "POST", fmt.Sprintf(PathFinishUpload, id), nil, nil, nil) + if err != nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return client.ExtractError(resp.Body) + } + return nil +} + +// UploadDiagnostics is a wrapper to upload a diagnostics request identified by the passed action id contained in the buffer to fleet-server. 
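// The bundle is sent as ceil(size/chunk_size) chunks using the chunk size returned by
// the New call; for example (illustrative numbers only), a 5 MiB archive with a 4 MiB
// chunk_size is uploaded as chunks 0 and 1, the second one short, before Finish is called.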
+func (c *Client) UploadDiagnostics(ctx context.Context, id string, b *bytes.Buffer) error { + size := b.Len() + upReq := NewUploadRequest{ + AgentID: c.agentID, + ActionID: id, + File: FileData{ + Size: int64(size), + Name: fmt.Sprintf("elastic-agent-diagnostics-%s-%s.zip", c.agentID, id), + Extension: "zip", + Mime: "application/zip", + Compression: "Deflate", + Hash: struct { + SHA256 string `json:"sha256"` + MD5 string `json:"md5"` + }{ + SHA256: fmt.Sprintf("%x", sha256.Sum256(b.Bytes())), + }, + Created: time.Now().UTC().Format(time.RFC3339), + }, + } + upResp, err := c.New(ctx, &upReq) + if err != nil { + return err + } + + uploadID := upResp.UploadID + chunkSize := upResp.ChunkSize + totalChunks := int(math.Ceil(float64(size) / float64(chunkSize))) + for chunk := 0; chunk < totalChunks; chunk++ { + err := c.Chunk(ctx, uploadID, chunk, io.LimitReader(b, chunkSize)) + if err != nil { + return err + } + } + + return c.Finish(ctx, uploadID) +} diff --git a/internal/pkg/fleetapi/uploader/uploader_test.go b/internal/pkg/fleetapi/uploader/uploader_test.go new file mode 100644 index 00000000000..b15e3453dbb --- /dev/null +++ b/internal/pkg/fleetapi/uploader/uploader_test.go @@ -0,0 +1,216 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +package uploader + +import ( + "bytes" + "context" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" +) + +type mockBackoff struct { + mock.Mock +} + +func (m *mockBackoff) Wait() bool { + args := m.Called() + return args.Bool(0) +} + +func (m *mockBackoff) NextWait() time.Duration { + args := m.Called() + return args.Get(0).(time.Duration) +} + +func (m *mockBackoff) Reset() { + m.Called() +} + +type mockSender struct { + mock.Mock +} + +func (m *mockSender) Send(ctx context.Context, method, path string, params url.Values, headers http.Header, body io.Reader) (*http.Response, error) { + args := m.Called(ctx, method, path, params, headers, body) + return args.Get(0).(*http.Response), args.Error(1) +} + +func (m *mockSender) URI() string { + args := m.Called() + return args.String(0) +} + +func Test_retrySender_Send(t *testing.T) { + tests := []struct { + name string + sender func() *mockSender + status int + err error + }{{ + name: "200 return", + sender: func() *mockSender { + m := &mockSender{} + m.On("Send", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(&http.Response{StatusCode: 200}, nil).Once() + return m + }, + status: 200, + err: nil, + }, { + name: "429 retries", + sender: func() *mockSender { + m := &mockSender{} + m.On("Send", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(&http.Response{StatusCode: 429}, nil).Once() + m.On("Send", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(&http.Response{StatusCode: 200}, nil).Once() + return m + }, + status: 200, + err: nil, + }, { + name: "too many retries", + sender: func() *mockSender { + m := &mockSender{} + m.On("Send", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(&http.Response{StatusCode: 429}, nil).Times(3) + return m + }, + status: 429, + err: nil, + }, { + name: "503 
return", + sender: func() *mockSender { + m := &mockSender{} + m.On("Send", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(&http.Response{StatusCode: 503}, nil).Once() + return m + }, + status: 503, + err: nil, + }, { + name: "error", + sender: func() *mockSender { + m := &mockSender{} + m.On("Send", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(&http.Response{}, errors.New("oh no")).Once() + return m + }, + status: 0, + err: errors.New("oh no"), + }} + + backoff := &mockBackoff{} + backoff.On("Reset").Return() + backoff.On("Wait").Return(true) + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + sender := tc.sender() + c := &retrySender{ + c: sender, + max: 3, + wait: backoff, + } + resp, err := c.Send(context.Background(), "POST", "/", nil, nil, bytes.NewReader([]byte("abcd"))) + if err != nil { + assert.EqualError(t, err, tc.err.Error()) + } else { + assert.Equal(t, tc.err, err) + assert.Equal(t, tc.status, resp.StatusCode) + } + sender.AssertExpectations(t) + }) + } +} + +// This test validates that the body that is sent on a reattempt is the same as the original +func Test_retrySender_bodyValidation(t *testing.T) { + var body1, body2 []byte + var err error + sender := &mockSender{} + sender.On("Send", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Run(func(args mock.Arguments) { + r := args.Get(5).(io.Reader) + body1, err = io.ReadAll(r) + require.NoError(t, err) + }).Return(&http.Response{StatusCode: 429}, nil).Once() + sender.On("Send", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Run(func(args mock.Arguments) { + r := args.Get(5).(io.Reader) + body2, err = io.ReadAll(r) + require.NoError(t, err) + }).Return(&http.Response{StatusCode: 200}, nil).Once() + + backoff := &mockBackoff{} + backoff.On("Reset").Return() + backoff.On("Wait").Return(true) + + c := &retrySender{ + c: sender, + max: 3, + wait: backoff, + } + resp, err := c.Send(context.Background(), "POST", "/", nil, nil, bytes.NewReader([]byte("abcd"))) + require.NoError(t, err) + assert.Equal(t, resp.StatusCode, 200) + assert.Equal(t, []byte("abcd"), body1) + assert.Equal(t, []byte("abcd"), body2) + sender.AssertExpectations(t) +} + +func Test_Client_UploadDiagnostics(t *testing.T) { + var chunk0, chunk1, chunk2 []byte + var err error + sender := &mockSender{} + sender.On("Send", mock.Anything, "POST", PathNewUpload, mock.Anything, mock.Anything, mock.Anything).Return(&http.Response{ + StatusCode: 200, + Body: io.NopCloser(bytes.NewReader([]byte(`{"upload_id":"test-upload","chunk_size":2}`))), + }, nil).Once() + + // Validate that the Chunk endpoint is called 3 times, and that the chunks are uploaded as expected. 
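	// The mocked New response above advertises chunk_size=2, so the 5-byte payload
	// "abcde" sent at the end of the test is expected to arrive as "ab", "cd" and "e".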
+ sender.On("Send", mock.Anything, "PUT", fmt.Sprintf(PathChunk, "test-upload", 0), mock.Anything, mock.Anything, mock.Anything).Run(func(args mock.Arguments) { + r := args.Get(5).(io.Reader) + chunk0, err = io.ReadAll(r) + require.NoError(t, err) + }).Return(&http.Response{ + StatusCode: 200, + Body: io.NopCloser(bytes.NewReader(nil)), + }, nil).Once() + sender.On("Send", mock.Anything, "PUT", fmt.Sprintf(PathChunk, "test-upload", 1), mock.Anything, mock.Anything, mock.Anything).Run(func(args mock.Arguments) { + r := args.Get(5).(io.Reader) + chunk1, err = io.ReadAll(r) + require.NoError(t, err) + }).Return(&http.Response{ + StatusCode: 200, + Body: io.NopCloser(bytes.NewReader(nil)), + }, nil).Once() + sender.On("Send", mock.Anything, "PUT", fmt.Sprintf(PathChunk, "test-upload", 2), mock.Anything, mock.Anything, mock.Anything).Run(func(args mock.Arguments) { + r := args.Get(5).(io.Reader) + chunk2, err = io.ReadAll(r) + require.NoError(t, err) + }).Return(&http.Response{ + StatusCode: 200, + Body: io.NopCloser(bytes.NewReader(nil)), + }, nil).Once() + + sender.On("Send", mock.Anything, "POST", fmt.Sprintf(PathFinishUpload, "test-upload"), mock.Anything, mock.Anything, mock.Anything).Return(&http.Response{ + StatusCode: 200, + Body: io.NopCloser(bytes.NewReader(nil)), + }, nil).Once() + + c := &Client{ + c: sender, + agentID: "test-agent", + } + err = c.UploadDiagnostics(context.Background(), "test-id", bytes.NewBufferString("abcde")) + require.NoError(t, err) + assert.Equal(t, "ab", string(chunk0)) + assert.Equal(t, "cd", string(chunk1)) + assert.Equal(t, "e", string(chunk2)) + sender.AssertExpectations(t) +} From e9b59a2e2a1f1e788c5b841490bc7d935333d446 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Wed, 16 Nov 2022 13:49:50 -0800 Subject: [PATCH 02/32] Fix PR and use control client instead of coord in handler Use the control.Client interface instead of a reference to the coordinator so that less boilerplate code is used. Fix callback setups. Cleanup uploader structs as the API has simplified a little. --- .../handlers/handler_action_diagnostics.go | 53 +---- .../application/coordinator/coordinator.go | 9 +- .../pkg/agent/application/managed_mode.go | 1 - internal/pkg/agent/cmd/diagnostics.go | 205 +----------------- internal/pkg/agent/cmd/run.go | 6 +- internal/pkg/agent/control/server/server.go | 35 +-- internal/pkg/diagnostics/diagnostics.go | 5 +- internal/pkg/fleetapi/action.go | 10 + internal/pkg/fleetapi/uploader/uploader.go | 52 +---- 9 files changed, 71 insertions(+), 305 deletions(-) diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go index f5158892752..66d09fd3404 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -9,7 +9,6 @@ import ( "context" "fmt" - "github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator" "github.com/elastic/elastic-agent/internal/pkg/agent/control/client" "github.com/elastic/elastic-agent/internal/pkg/diagnostics" "github.com/elastic/elastic-agent/internal/pkg/fleetapi" @@ -26,15 +25,15 @@ type Uploader interface { // When a Diagnostics action is received a full diagnostics bundle is taken and uploaded to fleet-server. type Diagnostics struct { log *logger.Logger - coord *coordinator.Coordinator // TODO use of coordinator or control server/client? 
+ client client.Client uploader Uploader } // NewDiagnostics returns a new Diagnostics handler. -func NewDiagnostics(log *logger.Logger, coord *coordinator.Coordinator, uploader Uploader) *Diagnostics { +func NewDiagnostics(log *logger.Logger, uploader Uploader) *Diagnostics { return &Diagnostics{ log: log, - coord: coord, + client: client.New(), uploader: uploader, } } @@ -48,49 +47,17 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A } // Gather agent diagnostics - diagHooks := append(diagnostics.GlobalHooks(), h.coord.DiagnosticHooks()()...) - aDiag := make([]client.DiagnosticFileResult, 0, len(diagHooks)) - for _, hook := range diagHooks { - if ctx.Err() != nil { - return ctx.Err() - } - - p, ts := hook.Hook(ctx) - aDiag = append(aDiag, client.DiagnosticFileResult{ - Name: hook.Name, - Filename: hook.Filename, - Description: hook.Description, - ContentType: hook.ContentType, - Content: p, - Generated: ts, - }) + aDiag, err := h.client.DiagnosticAgent(ctx) + if err != nil { + return fmt.Errorf("unable to gather agent diagnostics: %w", err) } - - runtimeDiag := h.coord.PerformDiagnostics(ctx) - uDiag := make([]client.DiagnosticUnitResult, 0, len(runtimeDiag)) - for _, diag := range runtimeDiag { - files := make([]client.DiagnosticFileResult, 0, diag.Results) - for _, f := range diag.Results { - files = append(files, client.DiagnosticFileResult{ - Name: f.Name, - Filename: f.Filename, - Description: f.Description, - ContentType: f.ContentType, - Content: f.Content, - Generated: f.Generated.AsTime(), - }) - } - uDiag = append(uDiag, client.DiagnosticUnitResult{ - ComponentID: diag.Component.ID, - UnitID: diag.Unit.ID, - UnitType: diag.Unit.Type, - Err: diag.Err, - Results: files, - }) + uDiag, err := h.client.DiagnosticUnits(ctx) + if err != nil { + return fmt.Errorf("unable to gather unit diagnostics: %w", err) } var b bytes.Buffer - err := diagnostics.ZipArchive(b, aDiag, uDiag) // TODO Do we want to pass a buffer/a reader around? or write the file to a temp dir and read (to avoid memory usage)? file usage may need more thought for containerized deployments + err = diagnostics.ZipArchive(&b, aDiag, uDiag) // TODO Do we want to pass a buffer/a reader around? or write the file to a temp dir and read (to avoid memory usage)? file usage may need more thought for containerized deployments if err != nil { return fmt.Errorf("error creating diagnostics bundle: %w", err) } diff --git a/internal/pkg/agent/application/coordinator/coordinator.go b/internal/pkg/agent/application/coordinator/coordinator.go index 8302dc39406..7d897a7c676 100644 --- a/internal/pkg/agent/application/coordinator/coordinator.go +++ b/internal/pkg/agent/application/coordinator/coordinator.go @@ -17,9 +17,10 @@ import ( "gopkg.in/yaml.v2" - "github.com/elastic/elastic-agent-client/v7/pkg/client" "go.elastic.co/apm" + "github.com/elastic/elastic-agent-client/v7/pkg/client" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/internal/pkg/agent/application/reexec" @@ -473,6 +474,10 @@ func (c *Coordinator) DiagnosticHooks() func() diagnostics.Hooks { }, }, } + hooks = append(hooks, c.addLogHooks()...) + hooks = append(hooks, c.addServiceLogHooks()...) 
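		// Note: the log hooks are assembled each time this callback runs, so log files
		// written after startup are still included when a diagnostics bundle is taken.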
+ + return hooks } } @@ -542,7 +547,7 @@ func (c *Coordinator) addServiceLogHooks() []diagnostics.Hook { Name: "services log dir", Filename: "services/", ContentType: diagnostics.ContentTypeDirectory, - Hook: func(_ context.Context) []byte { return nil }, + Hook: func(_ context.Context) ([]byte, time.Time) { return nil, time.Time{} }, }} for _, spec := range c.specs.ServiceSpecs() { if spec.Spec.Service.Log == nil || spec.Spec.Service.Log.Path == "" { diff --git a/internal/pkg/agent/application/managed_mode.go b/internal/pkg/agent/application/managed_mode.go index f614029737d..e20bdb62569 100644 --- a/internal/pkg/agent/application/managed_mode.go +++ b/internal/pkg/agent/application/managed_mode.go @@ -353,7 +353,6 @@ func (m *managedConfigManager) initDispatcher(canceller context.CancelFunc) *han &fleetapi.ActionDiagnostics{}, handlers.NewDiagnostics( m.log, - m.coord, uploader.New(m.agentInfo.AgentID(), m.client, m.cfg.Settings.MonitoringConfig.Uploader), ), ) diff --git a/internal/pkg/agent/cmd/diagnostics.go b/internal/pkg/agent/cmd/diagnostics.go index 9fab842375e..2d548849d15 100644 --- a/internal/pkg/agent/cmd/diagnostics.go +++ b/internal/pkg/agent/cmd/diagnostics.go @@ -5,24 +5,16 @@ package cmd import ( - "archive/zip" "context" - stderrors "errors" "fmt" - "io" - "io/fs" "os" - "path/filepath" - "strings" "time" - "github.com/hashicorp/go-multierror" "github.com/spf13/cobra" - "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/internal/pkg/agent/control/client" "github.com/elastic/elastic-agent/internal/pkg/cli" - "github.com/elastic/elastic-agent/pkg/component" + "github.com/elastic/elastic-agent/internal/pkg/diagnostics" ) func newDiagnosticsCommand(_ []string, streams *cli.IOStreams) *cobra.Command { @@ -74,201 +66,16 @@ func diagnosticCmd(streams *cli.IOStreams, cmd *cobra.Command) error { return fmt.Errorf("failed to fetch component/unit diagnostics: %w", err) } - err = createZip(fileName, agentDiag, unitDiags) - if err != nil { - return fmt.Errorf("unable to create archive %q: %w", fileName, err) - } - fmt.Fprintf(streams.Out, "Created diagnostics archive %q\n", fileName) - fmt.Fprintln(streams.Out, "***** WARNING *****\nCreated archive may contain plain text credentials.\nEnsure that files in archive are redacted before sharing.\n*******************") - return nil -} - -// createZip creates a zip archive with the passed fileName. -// -// The passed DiagnosticsInfo and AgentConfig data is written in the specified output format. -// Any local log files are collected and copied into the archive. 
-func createZip(fileName string, agentDiag []client.DiagnosticFileResult, unitDiags []client.DiagnosticUnitResult) error { f, err := os.Create(fileName) if err != nil { return err } - zw := zip.NewWriter(f) - - // write all Elastic Agent diagnostics at the top level - for _, ad := range agentDiag { - zf, err := zw.Create(ad.Filename) - if err != nil { - return closeHandlers(err, zw, f) - } - _, err = zf.Write(ad.Content) - if err != nil { - return closeHandlers(err, zw, f) - } - } - - // structure each unit into its own component directory - compDirs := make(map[string][]client.DiagnosticUnitResult) - for _, ud := range unitDiags { - compDir := strings.ReplaceAll(ud.ComponentID, "/", "-") - compDirs[compDir] = append(compDirs[compDir], ud) - } - - // write each units diagnostics into its own directory - // layout becomes components/// - _, err = zw.Create("components/") - if err != nil { - return closeHandlers(err, zw, f) - } - for dirName, units := range compDirs { - _, err = zw.Create(fmt.Sprintf("components/%s/", dirName)) - if err != nil { - return closeHandlers(err, zw, f) - } - for _, ud := range units { - unitDir := strings.ReplaceAll(strings.TrimPrefix(ud.UnitID, ud.ComponentID+"-"), "/", "-") - _, err = zw.Create(fmt.Sprintf("components/%s/%s/", dirName, unitDir)) - if err != nil { - return closeHandlers(err, zw, f) - } - if ud.Err != nil { - w, err := zw.Create(fmt.Sprintf("components/%s/%s/error.txt", dirName, unitDir)) - if err != nil { - return closeHandlers(err, zw, f) - } - _, err = w.Write([]byte(fmt.Sprintf("%s\n", ud.Err))) - if err != nil { - return closeHandlers(err, zw, f) - } - continue - } - for _, fr := range ud.Results { - w, err := zw.Create(fmt.Sprintf("components/%s/%s/%s", dirName, unitDir, fr.Name)) - if err != nil { - return closeHandlers(err, zw, f) - } - _, err = w.Write(fr.Content) - if err != nil { - return closeHandlers(err, zw, f) - } - } - } - } - - if err := zipLogs(zw); err != nil { - return closeHandlers(err, zw, f) - } - - return closeHandlers(nil, zw, f) -} - -// zipLogs walks paths.Logs() and copies the file structure into zw in "logs/" -func zipLogs(zw *zip.Writer) error { - _, err := zw.Create("logs/") - if err != nil { - return err - } - - if err := collectServiceComponentsLogs(zw); err != nil { - return fmt.Errorf("failed to collect endpoint-security logs: %w", err) - } - - // using Data() + "/logs", for some reason default paths/Logs() is the home dir... 
- logPath := filepath.Join(paths.Home(), "logs") + string(filepath.Separator) - return filepath.WalkDir(logPath, func(path string, d fs.DirEntry, fErr error) error { - if stderrors.Is(fErr, fs.ErrNotExist) { - return nil - } - if fErr != nil { - return fmt.Errorf("unable to walk log dir: %w", fErr) - } - - // name is the relative dir/file name replacing any filepath seperators with / - // this will clean log names on windows machines and will nop on *nix - name := filepath.ToSlash(strings.TrimPrefix(path, logPath)) - if name == "" { - return nil - } - - if d.IsDir() { - _, err := zw.Create("logs/" + name + "/") - if err != nil { - return fmt.Errorf("unable to create log directory in archive: %w", err) - } - return nil - } - - return saveLogs(name, path, zw) - }) -} - -func collectServiceComponentsLogs(zw *zip.Writer) error { - platform, err := component.LoadPlatformDetail() - if err != nil { - return fmt.Errorf("failed to gather system information: %w", err) - } - specs, err := component.LoadRuntimeSpecs(paths.Components(), platform) - if err != nil { - return fmt.Errorf("failed to detect inputs and outputs: %w", err) - } - for _, spec := range specs.ServiceSpecs() { - if spec.Spec.Service.Log == nil || spec.Spec.Service.Log.Path == "" { - // no log path set in specification - continue - } - - logPath := filepath.Dir(spec.Spec.Service.Log.Path) + string(filepath.Separator) - err = filepath.WalkDir(logPath, func(path string, d fs.DirEntry, fErr error) error { - if fErr != nil { - if stderrors.Is(fErr, fs.ErrNotExist) { - return nil - } - - return fmt.Errorf("unable to walk log directory %q for service input %s: %w", logPath, spec.InputType, fErr) - } - - name := filepath.ToSlash(strings.TrimPrefix(path, logPath)) - if name == "" { - return nil - } - - if d.IsDir() { - return nil - } + defer f.Close() - return saveLogs("services/"+name, path, zw) - }) - if err != nil { - return err - } + if err := diagnostics.ZipArchive(f, agentDiag, unitDiags); err != nil { + return fmt.Errorf("unable to create archive %q: %w", fileName, err) } + fmt.Fprintf(streams.Out, "Created diagnostics archive %q\n", fileName) + fmt.Fprintln(streams.Out, "***** WARNING *****\nCreated archive may contain plain text credentials.\nEnsure that files in archive are redacted before sharing.\n*******************") return nil } - -func saveLogs(name string, logPath string, zw *zip.Writer) error { - lf, err := os.Open(logPath) - if err != nil { - return fmt.Errorf("unable to open log file: %w", err) - } - zf, err := zw.Create("logs/" + name) - if err != nil { - return closeHandlers(fmt.Errorf("unable to create log file in archive: %w", err), lf) - } - _, err = io.Copy(zf, lf) - if err != nil { - return closeHandlers(fmt.Errorf("log file copy failed: %w", err), lf) - } - - return lf.Close() -} - -// closeHandlers will close all passed closers attaching any errors to the passed err and returning the result -func closeHandlers(err error, closers ...io.Closer) error { - var mErr *multierror.Error - mErr = multierror.Append(mErr, err) - for _, c := range closers { - if inErr := c.Close(); inErr != nil { - mErr = multierror.Append(mErr, inErr) - } - } - return mErr.ErrorOrNil() -} diff --git a/internal/pkg/agent/cmd/run.go b/internal/pkg/agent/cmd/run.go index e6f9ec8d0f7..903d59fb21b 100644 --- a/internal/pkg/agent/cmd/run.go +++ b/internal/pkg/agent/cmd/run.go @@ -178,9 +178,7 @@ func run(override cfgOverrider, modifiers ...component.PlatformModifier) error { _ = serverStopFn() }() - diagHooks := diagnostics.GlobalHooks() - 
diagHooks = append(diagHooks, coord.DiagnosticHooks()...) - control := server.New(logger.Named("control"), agentInfo, coord, tracer, diagHooks) + control := server.New(logger.Named("control"), agentInfo, coord, tracer, diagnostics.GlobalHooks, coord.DiagnosticHooks()) // start the control listener if err := control.Start(); err != nil { return err @@ -395,7 +393,7 @@ func initTracer(agentName, version string, mcfg *monitoringCfg.MonitoringConfig) cfg := mcfg.APM - // nolint:godox // the TODO is intentional + //nolint:godox // the TODO is intentional // TODO(stn): Ideally, we'd use apmtransport.NewHTTPTransportOptions() // but it doesn't exist today. Update this code once we have something // available via the APM Go agent. diff --git a/internal/pkg/agent/control/server/server.go b/internal/pkg/agent/control/server/server.go index 7e6b6799bbb..665dd6abea1 100644 --- a/internal/pkg/agent/control/server/server.go +++ b/internal/pkg/agent/control/server/server.go @@ -32,23 +32,23 @@ import ( type Server struct { cproto.UnimplementedElasticAgentControlServer - logger *logger.Logger - agentInfo *info.AgentInfo - coord *coordinator.Coordinator - listener net.Listener - server *grpc.Server - tracer *apm.Tracer - diagHooks diagnostics.Hooks + logger *logger.Logger + agentInfo *info.AgentInfo + coord *coordinator.Coordinator + listener net.Listener + server *grpc.Server + tracer *apm.Tracer + diagHooksFn []func() diagnostics.Hooks } // New creates a new control protocol server. -func New(log *logger.Logger, agentInfo *info.AgentInfo, coord *coordinator.Coordinator, tracer *apm.Tracer, diagHooks diagnostics.Hooks) *Server { +func New(log *logger.Logger, agentInfo *info.AgentInfo, coord *coordinator.Coordinator, tracer *apm.Tracer, diagHooksFn ...func() diagnostics.Hooks) *Server { return &Server{ - logger: log, - agentInfo: agentInfo, - coord: coord, - tracer: tracer, - diagHooks: diagHooks, + logger: log, + agentInfo: agentInfo, + coord: coord, + tracer: tracer, + diagHooksFn: diagHooksFn, } } @@ -180,8 +180,13 @@ func (s *Server) Upgrade(ctx context.Context, request *cproto.UpgradeRequest) (* // DiagnosticAgent returns diagnostic information for this running Elastic Agent. func (s *Server) DiagnosticAgent(ctx context.Context, _ *cproto.DiagnosticAgentRequest) (*cproto.DiagnosticAgentResponse, error) { - res := make([]*cproto.DiagnosticFileResult, 0, len(s.diagHooks)) - for _, h := range s.diagHooks { + diagHooks := make([]diagnostics.Hook, 0) + for _, fn := range s.diagHooksFn { + hooks := fn() + diagHooks = append(diagHooks, hooks...) + } + res := make([]*cproto.DiagnosticFileResult, 0, len(diagHooks)) + for _, h := range diagHooks { if ctx.Err() != nil { return nil, ctx.Err() } diff --git a/internal/pkg/diagnostics/diagnostics.go b/internal/pkg/diagnostics/diagnostics.go index cb629496ef1..75bcbd9232e 100644 --- a/internal/pkg/diagnostics/diagnostics.go +++ b/internal/pkg/diagnostics/diagnostics.go @@ -20,6 +20,9 @@ import ( "github.com/elastic/elastic-agent/internal/pkg/release" ) +// ContentTypeDirectory should be used to indicate that a directory should be made in the resulting bundle +const ContentTypeDirectory = "directory" + // Hook is a hook that gets used when diagnostic information is requested from the Elastic Agent. type Hook struct { Name string @@ -107,7 +110,7 @@ func pprofDiag(name string) func(context.Context) ([]byte, time.Time) { } // ZipArchive creates a zipped diagnostics bundle using the passed writer with the passed diagnostics. 
-// If any error is encounted when writing the contents of the archive it is returned. +// If any error is encountered when writing the contents of the archive it is returned. func ZipArchive(w io.Writer, agentDiag []client.DiagnosticFileResult, unitDiags []client.DiagnosticUnitResult) error { zw := zip.NewWriter(w) defer zw.Close() diff --git a/internal/pkg/fleetapi/action.go b/internal/pkg/fleetapi/action.go index 7de4a4b7d40..45cd2958b36 100644 --- a/internal/pkg/fleetapi/action.go +++ b/internal/pkg/fleetapi/action.go @@ -582,6 +582,16 @@ func (a *Actions) UnmarshalYAML(unmarshal func(interface{}) error) error { "fail to decode CANCEL_ACTION action", errors.TypeConfig) } + case ActionTypeDiagnostics: + action = &ActionDiagnostics{ + ActionID: n.ActionID, + ActionType: n.ActionType, + } + if err := json.Unmarshal(n.Data, action); err != nil { + return errors.New(err, + "fail to decode DIAGNOSTICS_ACTION action", + errors.TypeConfig) + } default: action = &ActionUnknown{ ActionID: n.ActionID, diff --git a/internal/pkg/fleetapi/uploader/uploader.go b/internal/pkg/fleetapi/uploader/uploader.go index 3ffdfbbacfc..514f31c7c83 100644 --- a/internal/pkg/fleetapi/uploader/uploader.go +++ b/internal/pkg/fleetapi/uploader/uploader.go @@ -15,7 +15,6 @@ import ( "math" "net/http" "net/url" - "time" "github.com/elastic/elastic-agent/internal/pkg/core/backoff" "github.com/elastic/elastic-agent/internal/pkg/core/monitoring/config" @@ -31,48 +30,23 @@ const ( // FileData contains metadata about a file. type FileData struct { - Size int64 `json:"size"` - Name string `json:"name"` - Extension string `json:"ext"` - Mime string `json:"mime_type"` - Compression string `json:"Compression"` - Hash struct { + Size int64 `json:"size"` + Name string `json:"name"` + Extension string `json:"ext"` + Mime string `json:"mime_type"` + Hash struct { SHA256 string `json:"sha256"` MD5 string `json:"md5"` } `json:"hash"` - Accessed string `json:"accessed"` - Attributes []string `json:"attributes"` - Created string `json:"created"` - CTime string `json:"ctime"` - Device string `json:"device"` - Directory string `json:"directory"` - DriveLetter string `json:"drive_letter"` - Ext string `json:"extension"` - GID string `json:"gid"` - Group string `json:"group"` - INode string `json:"inode"` - Mode string `json:"mode"` - MTime string `json:"mtime"` - Owner string `json:"owner"` - Path string `json:"path"` - TargetPath string `json:"target_path"` - Type string `json:"type"` - UID string `json:"uid"` } // NewUploadRequest is the struct that is passed as the request body when starting a new file upload. type NewUploadRequest struct { - AgentID string `json:"agent_id"` ActionID string `json:"action_id"` + AgentID string `json:"agent_id"` Source string `json:"source"` File FileData `json:"file"` Contents []FileData `json:"contents"` - Event struct { - ID string `json:"id"` - } `json:"event"` - Host struct { - Hostname string `json:"hostname"` - } `json:"host"` } // NewUploadResponse is the body for the success case when requesting a new file upload. @@ -89,7 +63,6 @@ type retrySender struct { } // Send calls the underlying Sender's Send method. If a 429 status code is returned the request is retried after a backoff period. -// TODO What to do if another error or status is received? 
func (r *retrySender) Send(ctx context.Context, method, path string, params url.Values, headers http.Header, body io.Reader) (resp *http.Response, err error) { r.wait.Reset() @@ -191,21 +164,20 @@ func (c *Client) Finish(ctx context.Context, id string) error { func (c *Client) UploadDiagnostics(ctx context.Context, id string, b *bytes.Buffer) error { size := b.Len() upReq := NewUploadRequest{ - AgentID: c.agentID, ActionID: id, + AgentID: c.agentID, + Source: "elastic-agent", File: FileData{ - Size: int64(size), - Name: fmt.Sprintf("elastic-agent-diagnostics-%s-%s.zip", c.agentID, id), - Extension: "zip", - Mime: "application/zip", - Compression: "Deflate", + Size: int64(size), + Name: fmt.Sprintf("elastic-agent-diagnostics-%s-%s.zip", c.agentID, id), + Extension: "zip", + Mime: "application/zip", Hash: struct { SHA256 string `json:"sha256"` MD5 string `json:"md5"` }{ SHA256: fmt.Sprintf("%x", sha256.Sum256(b.Bytes())), }, - Created: time.Now().UTC().Format(time.RFC3339), }, } upResp, err := c.New(ctx, &upReq) From ba94a2b9fe2060b972d119fb2237d47aaec1c2e9 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Thu, 17 Nov 2022 13:07:22 -0800 Subject: [PATCH 03/32] Change AckEvent construction from fleet acker to an action method Change the construction of AckEvent from being an internal method in the fleet.Acker to a method all Actions implement in order to reduce the acker complexity and increase the clarity of what is sent when an action is acked at the cost some code duplication. --- .../handlers/handler_action_diagnostics.go | 14 ++- .../pkg/fleetapi/acker/fleet/fleet_acker.go | 51 ++------ internal/pkg/fleetapi/action.go | 119 ++++++++++++++++++ internal/pkg/fleetapi/uploader/uploader.go | 9 +- 4 files changed, 143 insertions(+), 50 deletions(-) diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go index 66d09fd3404..d7bc72188a4 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -18,7 +18,7 @@ import ( // Uploader is the interface used to upload a diagnostics bundle to fleet-server. type Uploader interface { - UploadDiagnostics(context.Context, string, *bytes.Buffer) error + UploadDiagnostics(context.Context, string, *bytes.Buffer) (string, error) } // Diagnostics is the handler to process Diagnostics actions. @@ -49,21 +49,29 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A // Gather agent diagnostics aDiag, err := h.client.DiagnosticAgent(ctx) if err != nil { + action.Err = err + _ = ack.Ack(ctx, action) return fmt.Errorf("unable to gather agent diagnostics: %w", err) } uDiag, err := h.client.DiagnosticUnits(ctx) if err != nil { + action.Err = err + _ = ack.Ack(ctx, action) return fmt.Errorf("unable to gather unit diagnostics: %w", err) } var b bytes.Buffer err = diagnostics.ZipArchive(&b, aDiag, uDiag) // TODO Do we want to pass a buffer/a reader around? or write the file to a temp dir and read (to avoid memory usage)? 
file usage may need more thought for containerized deployments if err != nil { + action.Err = err + _ = ack.Ack(ctx, action) return fmt.Errorf("error creating diagnostics bundle: %w", err) } - err = h.uploader.UploadDiagnostics(ctx, action.ActionID, &b) - _ = ack.Ack(ctx, action) // TODO ack should have the file upload ID in it + uploadID, err := h.uploader.UploadDiagnostics(ctx, action.ActionID, &b) + action.Err = err + action.UploadID = uploadID + _ = ack.Ack(ctx, action) if err != nil { return fmt.Errorf("unable to upload diagnostics: %w", err) } diff --git a/internal/pkg/fleetapi/acker/fleet/fleet_acker.go b/internal/pkg/fleetapi/acker/fleet/fleet_acker.go index b78a55069d8..6a6efa29a2a 100644 --- a/internal/pkg/fleetapi/acker/fleet/fleet_acker.go +++ b/internal/pkg/fleetapi/acker/fleet/fleet_acker.go @@ -6,7 +6,6 @@ package fleet import ( "context" - "encoding/json" "fmt" "strings" "time" @@ -60,10 +59,11 @@ func (f *Acker) Ack(ctx context.Context, action fleetapi.Action) (err error) { // checkin agentID := f.agentInfo.AgentID() cmd := fleetapi.NewAckCmd(f.agentInfo, f.client) + event := action.AckEvent() + event.AgentID = agentID + event.Timestamp = time.Now().Format(fleetTimeFormat) req := &fleetapi.AckRequest{ - Events: []fleetapi.AckEvent{ - constructEvent(action, agentID), - }, + Events: []fleetapi.AckEvent{event}, } _, err = cmd.Execute(ctx, req) @@ -89,7 +89,10 @@ func (f *Acker) AckBatch(ctx context.Context, actions []fleetapi.Action) (res *f events := make([]fleetapi.AckEvent, 0, len(actions)) ids := make([]string, 0, len(actions)) for _, action := range actions { - events = append(events, constructEvent(action, agentID)) + event := action.AckEvent() + event.AgentID = agentID + event.Timestamp = time.Now().Format(fleetTimeFormat) + events = append(events, event) ids = append(ids, action.ID()) } @@ -117,41 +120,3 @@ func (f *Acker) AckBatch(ctx context.Context, actions []fleetapi.Action) (res *f func (f *Acker) Commit(ctx context.Context) error { return nil } - -func constructEvent(action fleetapi.Action, agentID string) fleetapi.AckEvent { - ackev := fleetapi.AckEvent{ - EventType: "ACTION_RESULT", - SubType: "ACKNOWLEDGED", - Timestamp: time.Now().Format(fleetTimeFormat), - ActionID: action.ID(), - AgentID: agentID, - Message: fmt.Sprintf("Action '%s' of type '%s' acknowledged.", action.ID(), action.Type()), - } - - if a, ok := action.(fleetapi.RetryableAction); ok { - if err := a.GetError(); err != nil { - ackev.Error = err.Error() - var payload struct { - Retry bool `json:"retry"` - Attempt int `json:"retry_attempt,omitempty"` - } - payload.Retry = true - payload.Attempt = a.RetryAttempt() - if a.RetryAttempt() < 1 { - payload.Retry = false - } - p, _ := json.Marshal(payload) - ackev.Payload = p - } - } - - if a, ok := action.(*fleetapi.ActionApp); ok { - ackev.ActionInputType = a.InputType - ackev.ActionData = a.Data - ackev.ActionResponse = a.Response - ackev.StartedAt = a.StartedAt - ackev.CompletedAt = a.CompletedAt - ackev.Error = a.Error - } - return ackev -} diff --git a/internal/pkg/fleetapi/action.go b/internal/pkg/fleetapi/action.go index 45cd2958b36..b0c9285f861 100644 --- a/internal/pkg/fleetapi/action.go +++ b/internal/pkg/fleetapi/action.go @@ -48,6 +48,7 @@ type Action interface { fmt.Stringer Type() string ID() string + AckEvent() AckEvent } // ScheduledAction is an Action that may be executed at a later date @@ -137,6 +138,15 @@ func (a *ActionUnknown) OriginalType() string { return a.originalType } +func (a *ActionUnknown) AckEvent() AckEvent { + return 
AckEvent{ + EventType: "ERROR", // TODO Discuss EventType/SubType needed - by default only ACTION_RESULT was used - what is (or was) the intended purpose of these attributes? Are they documented? Can we change them to better support acking an error or a retry? + SubType: "FAILED", + ActionID: a.ActionID, + Error: fmt.Sprintf("Action %q of type %q is unknown to the elastic-agent", a.ActionID, a.originalType), + } +} + // ActionPolicyReassign is a request to apply a new type ActionPolicyReassign struct { ActionID string `yaml:"action_id"` @@ -162,6 +172,15 @@ func (a *ActionPolicyReassign) ID() string { return a.ActionID } +func (a *ActionPolicyReassign) AckEvent() AckEvent { + return AckEvent{ + EventType: "ACTION_RESULT", + SubType: "ACKNOWLEDGED", + ActionID: a.ActionID, + Message: fmt.Sprintf("Action %q of type %q acknowledged.", a.ActionID, a.ActionType), + } +} + // ActionPolicyChange is a request to apply a new type ActionPolicyChange struct { ActionID string `yaml:"action_id"` @@ -188,6 +207,15 @@ func (a *ActionPolicyChange) ID() string { return a.ActionID } +func (a *ActionPolicyPolicyChange) AckEvent() AckEvent { + return AckEvent{ + EventType: "ACTION_RESULT", + SubType: "ACKNOWLEDGED", + ActionID: a.ActionID, + Message: fmt.Sprintf("Action %q of type %q acknowledged.", a.ActionID, a.ActionType), + } +} + // ActionUpgrade is a request for agent to upgrade. type ActionUpgrade struct { ActionID string `yaml:"action_id"` @@ -209,6 +237,31 @@ func (a *ActionUpgrade) String() string { return s.String() } +func (a *ActionUpgrade) AckEvent() AckEvent { + event := AckEvent{ + EventType: "ACTION_RESULT", + SubType: "ACKNOWLEDGED", + ActionID: a.ActionID, + Message: fmt.Sprintf("Action %q of type %q acknowledged.", a.ActionID, a.ActionType), + } + if a.Err != nil { + // FIXME Do we want to change EventType/SubType here? + event.Error = a.Err.Error() + var payload struct { + Retry bool `json:"retry"` + Attempt int `json:"retry_attempt,omitempty"` + } + payload.Retry = true + payload.Attempt = a.Retry + if a.Retry < 1 { // retry is set to -1 if it will not re attempt + payload.Retry = false + } + p, _ := json.Marshal(payload) + event.Payload = p + } + return event +} + // Type returns the type of the Action. func (a *ActionUpgrade) Type() string { return a.ActionType @@ -294,6 +347,15 @@ func (a *ActionUnenroll) ID() string { return a.ActionID } +func (a *ActionUnenroll) AckEvent() AckEvent { + return AckEvent{ + EventType: "ACTION_RESULT", + SubType: "ACKNOWLEDGED", + ActionID: a.ActionID, + Message: fmt.Sprintf("Action %q of type %q acknowledged.", a.ActionID, a.ActionType), + } +} + // ActionSettings is a request to change agent settings. type ActionSettings struct { ActionID string `yaml:"action_id"` @@ -322,6 +384,15 @@ func (a *ActionSettings) String() string { return s.String() } +func (a *ActionSettings) AckEvent() AckEvent { + return AckEvent{ + EventType: "ACTION_RESULT", + SubType: "ACKNOWLEDGED", + ActionID: a.ActionID, + Message: fmt.Sprintf("Action %q of type %q acknowledged.", a.ActionID, a.ActionType), + } +} + // ActionCancel is a request to cancel an action. 
type ActionCancel struct { ActionID string `yaml:"action_id"` @@ -350,10 +421,21 @@ func (a *ActionCancel) String() string { return s.String() } +func (a *ActionCancel) AckEvent() AckEvent { + return AckEvent{ + EventType: "ACTION_RESULT", + SubType: "ACKNOWLEDGED", + ActionID: a.ActionID, + Message: fmt.Sprintf("Action %q of type %q acknowledged.", a.ActionID, a.ActionType), + } +} + // ActionDiagnostics is a request to gather and upload a diagnostics bundle. type ActionDiagnostics struct { ActionID string `json:"action_id"` ActionType string `json:"type"` + UploadID string `json:"-"` + Err error `json:"-"` } // ID returns the ID of the action. @@ -375,6 +457,28 @@ func (a *ActionDiagnostics) String() string { return s.String() } +func (a *ActionDiagnostics) AckEvent() AckEvent { + event := AckEvent{ + EventType: "ACTION_RESULT", + SubType: "ACKNOWLEDGED", + ActionID: a.ActionID, + Message: fmt.Sprintf("Action %q of type %q acknowledged.", a.ActionID, a.ActionType), + } + if a.Err != nil { + event.Error = a.Err.Error() + } + if a.UploadID != "" { + var payload struct { + UploadID string `json:"upload_id"` + } + payload.UploadID = a.UploadID + p, _ := json.Marshall(payload) + event.Payload = p + } + + return event +} + // ActionApp is the application action request. type ActionApp struct { ActionID string `json:"id" mapstructure:"id"` @@ -409,6 +513,21 @@ func (a *ActionApp) Type() string { return a.ActionType } +func (a *ActionApp) AckEvent() AckEvent { + return AckEvent{ + EventType: "ACTION_RESULT", + SubType: "ACKNOWLEDGED", + ActionID: a.ActionID, + Message: fmt.Sprintf("Action %q of type %q acknowledged.", a.ActionID, a.ActionType), + ActionInputType: a.InputType, + ActionData: a.Data, + ActionResponse: a.Response, + StartedAt: a.StartedAt, + CompletedAt: a.CompletedAt, + Error: a.Error, + } +} + // MarshalMap marshals ActionApp into a corresponding map func (a *ActionApp) MarshalMap() (map[string]interface{}, error) { var res map[string]interface{} diff --git a/internal/pkg/fleetapi/uploader/uploader.go b/internal/pkg/fleetapi/uploader/uploader.go index 514f31c7c83..77ed15bbf48 100644 --- a/internal/pkg/fleetapi/uploader/uploader.go +++ b/internal/pkg/fleetapi/uploader/uploader.go @@ -161,7 +161,7 @@ func (c *Client) Finish(ctx context.Context, id string) error { } // UploadDiagnostics is a wrapper to upload a diagnostics request identified by the passed action id contained in the buffer to fleet-server. 
-func (c *Client) UploadDiagnostics(ctx context.Context, id string, b *bytes.Buffer) error { +func (c *Client) UploadDiagnostics(ctx context.Context, id string, b *bytes.Buffer) (string, error) { size := b.Len() upReq := NewUploadRequest{ ActionID: id, @@ -182,7 +182,7 @@ func (c *Client) UploadDiagnostics(ctx context.Context, id string, b *bytes.Buff } upResp, err := c.New(ctx, &upReq) if err != nil { - return err + return "", err } uploadID := upResp.UploadID @@ -191,9 +191,10 @@ func (c *Client) UploadDiagnostics(ctx context.Context, id string, b *bytes.Buff for chunk := 0; chunk < totalChunks; chunk++ { err := c.Chunk(ctx, uploadID, chunk, io.LimitReader(b, chunkSize)) if err != nil { - return err + return uploadID, err } } - return c.Finish(ctx, uploadID) + err = c.Finish(ctx, uploadID) + return uploadID, err } From ef7038a844aa83141eba6f2c5e4398e1312e5347 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Thu, 17 Nov 2022 15:47:48 -0800 Subject: [PATCH 04/32] Fix linter and tests --- .../pkg/agent/application/dispatcher/dispatcher_test.go | 4 ++++ internal/pkg/fleetapi/acker/fleet/fleet_acker_test.go | 8 ++++---- internal/pkg/fleetapi/action.go | 9 +++++---- internal/pkg/fleetapi/uploader/uploader_test.go | 3 ++- internal/pkg/queue/actionqueue_test.go | 5 +++++ 5 files changed, 20 insertions(+), 9 deletions(-) diff --git a/internal/pkg/agent/application/dispatcher/dispatcher_test.go b/internal/pkg/agent/application/dispatcher/dispatcher_test.go index c9c1397443c..43b074fbd9d 100644 --- a/internal/pkg/agent/application/dispatcher/dispatcher_test.go +++ b/internal/pkg/agent/application/dispatcher/dispatcher_test.go @@ -53,6 +53,10 @@ func (m *mockAction) String() string { args := m.Called() return args.String(0) } +func (m *mockAction) AckEvent() fleetapi.AckEvent { + args := m.Called() + return args.Get(0).(fleetapi.AckEvent) +} func (m *mockScheduledAction) StartTime() (time.Time, error) { args := m.Called() return args.Get(0).(time.Time), args.Error(1) diff --git a/internal/pkg/fleetapi/acker/fleet/fleet_acker_test.go b/internal/pkg/fleetapi/acker/fleet/fleet_acker_test.go index ebea939a910..6abe28fec72 100644 --- a/internal/pkg/fleetapi/acker/fleet/fleet_acker_test.go +++ b/internal/pkg/fleetapi/acker/fleet/fleet_acker_test.go @@ -86,13 +86,13 @@ func TestAcker_Ack(t *testing.T) { }, { name: "ack", - actions: []fleetapi.Action{&fleetapi.ActionUnknown{ActionID: "ack-test-action-id"}}, + actions: []fleetapi.Action{&fleetapi.ActionUnknown{ActionID: "ack-test-action-id", ActionType: fleetapi.ActionTypeUnknown}}, }, { name: "ackbatch", actions: []fleetapi.Action{ - &fleetapi.ActionUnknown{ActionID: "ack-test-action-id1"}, - &fleetapi.ActionUnknown{ActionID: "ack-test-action-id2"}, + &fleetapi.ActionUnknown{ActionID: "ack-test-action-id1", ActionType: fleetapi.ActionTypeUnknown}, + &fleetapi.ActionUnknown{ActionID: "ack-test-action-id2", ActionType: fleetapi.ActionTypeUnknown}, }, }, { @@ -153,7 +153,7 @@ func TestAcker_Ack(t *testing.T) { assert.EqualValues(t, "ACKNOWLEDGED", req.Events[i].SubType) assert.EqualValues(t, ac.ID(), req.Events[i].ActionID) assert.EqualValues(t, agentInfo.AgentID(), req.Events[i].AgentID) - assert.EqualValues(t, fmt.Sprintf("Action '%s' of type '%s' acknowledged.", ac.ID(), ac.Type()), req.Events[i].Message) + assert.EqualValues(t, fmt.Sprintf("Action %q of type %q acknowledged.", ac.ID(), ac.Type()), req.Events[i].Message) // Check if the fleet acker handles RetryableActions correctly using the UpgradeAction if a, ok := ac.(*fleetapi.ActionUpgrade); ok { 
if a.Err != nil { diff --git a/internal/pkg/fleetapi/action.go b/internal/pkg/fleetapi/action.go index b0c9285f861..23e2450457f 100644 --- a/internal/pkg/fleetapi/action.go +++ b/internal/pkg/fleetapi/action.go @@ -140,9 +140,10 @@ func (a *ActionUnknown) OriginalType() string { func (a *ActionUnknown) AckEvent() AckEvent { return AckEvent{ - EventType: "ERROR", // TODO Discuss EventType/SubType needed - by default only ACTION_RESULT was used - what is (or was) the intended purpose of these attributes? Are they documented? Can we change them to better support acking an error or a retry? - SubType: "FAILED", + EventType: "ACTION_RESULT", // TODO Discuss EventType/SubType needed - by default only ACTION_RESULT was used - what is (or was) the intended purpose of these attributes? Are they documented? Can we change them to better support acking an error or a retry? + SubType: "ACKNOWLEDGED", ActionID: a.ActionID, + Message: fmt.Sprintf("Action %q of type %q acknowledged.", a.ActionID, a.ActionType), Error: fmt.Sprintf("Action %q of type %q is unknown to the elastic-agent", a.ActionID, a.originalType), } } @@ -207,7 +208,7 @@ func (a *ActionPolicyChange) ID() string { return a.ActionID } -func (a *ActionPolicyPolicyChange) AckEvent() AckEvent { +func (a *ActionPolicyChange) AckEvent() AckEvent { return AckEvent{ EventType: "ACTION_RESULT", SubType: "ACKNOWLEDGED", @@ -472,7 +473,7 @@ func (a *ActionDiagnostics) AckEvent() AckEvent { UploadID string `json:"upload_id"` } payload.UploadID = a.UploadID - p, _ := json.Marshall(payload) + p, _ := json.Marshal(payload) event.Payload = p } diff --git a/internal/pkg/fleetapi/uploader/uploader_test.go b/internal/pkg/fleetapi/uploader/uploader_test.go index b15e3453dbb..0c03a16ecdf 100644 --- a/internal/pkg/fleetapi/uploader/uploader_test.go +++ b/internal/pkg/fleetapi/uploader/uploader_test.go @@ -207,8 +207,9 @@ func Test_Client_UploadDiagnostics(t *testing.T) { c: sender, agentID: "test-agent", } - err = c.UploadDiagnostics(context.Background(), "test-id", bytes.NewBufferString("abcde")) + id, err := c.UploadDiagnostics(context.Background(), "test-id", bytes.NewBufferString("abcde")) require.NoError(t, err) + assert.Equal(t, "test-upload", id) assert.Equal(t, "ab", string(chunk0)) assert.Equal(t, "cd", string(chunk1)) assert.Equal(t, "e", string(chunk2)) diff --git a/internal/pkg/queue/actionqueue_test.go b/internal/pkg/queue/actionqueue_test.go index 29643a80326..176c31b93db 100644 --- a/internal/pkg/queue/actionqueue_test.go +++ b/internal/pkg/queue/actionqueue_test.go @@ -37,6 +37,11 @@ func (m *mockAction) ID() string { return args.String(0) } +func (m *mockAction) AckEvent() fleetapi.AckEvent { + args := m.Called() + return args.Get(0).(fleetapi.AckEvent) +} + func (m *mockAction) StartTime() (time.Time, error) { args := m.Called() return args.Get(0).(time.Time), args.Error(1) From 5e2792abc258d65bf29be085cdeeee2e34aa1eb9 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Mon, 21 Nov 2022 14:36:31 -0800 Subject: [PATCH 05/32] Fix linter --- internal/pkg/queue/actionqueue_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/queue/actionqueue_test.go b/internal/pkg/queue/actionqueue_test.go index 176c31b93db..d5a7a5c41d4 100644 --- a/internal/pkg/queue/actionqueue_test.go +++ b/internal/pkg/queue/actionqueue_test.go @@ -2,7 +2,7 @@ // or more contributor license agreements. Licensed under the Elastic License; // you may not use this file except in compliance with the Elastic License. 
-//nolint:errcheck,dupl // lots of casting in test cases +//nolint:dupl // lots of casting in test cases package queue import ( From 267cb6c347401bcf6589a3dbb98521fdcb2639ae Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Mon, 12 Dec 2022 08:11:58 -0800 Subject: [PATCH 06/32] Fix merge --- internal/pkg/agent/control/server/server.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/internal/pkg/agent/control/server/server.go b/internal/pkg/agent/control/server/server.go index 04ceac39489..379d9059b5a 100644 --- a/internal/pkg/agent/control/server/server.go +++ b/internal/pkg/agent/control/server/server.go @@ -45,12 +45,12 @@ type Server struct { // New creates a new control protocol server. func New(log *logger.Logger, agentInfo *info.AgentInfo, coord *coordinator.Coordinator, tracer *apm.Tracer, grpcConfig *configuration.GRPCConfig, diagHooksFn ...func() diagnostics.Hooks) *Server { return &Server{ - logger: log, - agentInfo: agentInfo, - coord: coord, - tracer: tracer, - grpcConfig: grpcConfig, - diagHooks: diagHooksFn, + logger: log, + agentInfo: agentInfo, + coord: coord, + tracer: tracer, + grpcConfig: grpcConfig, + diagHooksFn: diagHooksFn, } } From 3a949f389be885da2688a5994cdd95c6c0f9b296 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Mon, 12 Dec 2022 11:19:18 -0800 Subject: [PATCH 07/32] Fix tests --- internal/pkg/agent/control/control_test.go | 2 +- internal/pkg/basecmd/version/cmd_test.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/pkg/agent/control/control_test.go b/internal/pkg/agent/control/control_test.go index a247588dbdd..a380969689a 100644 --- a/internal/pkg/agent/control/control_test.go +++ b/internal/pkg/agent/control/control_test.go @@ -22,7 +22,7 @@ import ( ) func TestServerClient_Version(t *testing.T) { - srv := server.New(newErrorLogger(t), nil, nil, apmtest.DiscardTracer, nil, configuration.DefaultGRPCConfig()) + srv := server.New(newErrorLogger(t), nil, nil, apmtest.DiscardTracer, configuration.DefaultGRPCConfig()) err := srv.Start() require.NoError(t, err) defer srv.Stop() diff --git a/internal/pkg/basecmd/version/cmd_test.go b/internal/pkg/basecmd/version/cmd_test.go index 7b94fbde7ee..1f8164dd532 100644 --- a/internal/pkg/basecmd/version/cmd_test.go +++ b/internal/pkg/basecmd/version/cmd_test.go @@ -58,7 +58,7 @@ func TestCmdBinaryOnlyYAML(t *testing.T) { } func TestCmdDaemon(t *testing.T) { - srv := server.New(newErrorLogger(t), nil, nil, apmtest.DiscardTracer, nil, configuration.DefaultGRPCConfig()) + srv := server.New(newErrorLogger(t), nil, nil, apmtest.DiscardTracer, configuration.DefaultGRPCConfig()) require.NoError(t, srv.Start()) defer srv.Stop() @@ -74,7 +74,7 @@ func TestCmdDaemon(t *testing.T) { } func TestCmdDaemonYAML(t *testing.T) { - srv := server.New(newErrorLogger(t), nil, nil, apmtest.DiscardTracer, nil, configuration.DefaultGRPCConfig()) + srv := server.New(newErrorLogger(t), nil, nil, apmtest.DiscardTracer, configuration.DefaultGRPCConfig()) require.NoError(t, srv.Start()) defer srv.Stop() From b3164274c4b833a6fc1ccaf2616d23e092626546 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Mon, 12 Dec 2022 13:57:44 -0800 Subject: [PATCH 08/32] remove duplication when creating an ackevent for an action --- internal/pkg/fleetapi/action.go | 58 +++++++++------------------------ 1 file changed, 16 insertions(+), 42 deletions(-) diff --git a/internal/pkg/fleetapi/action.go b/internal/pkg/fleetapi/action.go index 23e2450457f..7a8276d6b05 100644 --- 
a/internal/pkg/fleetapi/action.go +++ b/internal/pkg/fleetapi/action.go @@ -100,6 +100,15 @@ type FleetAction struct { //MinimumExecutionDuration int64 // disabled, used by fleet-server for scheduling } +func newAckEvent(id, aType string) AckEvent { + return AckEvent{ + EventType: "ACTION_RESULT", + SubType: "ACKNOWLEDGED", + ActionID: id, + Message: fmt.Sprintf("Action %q of type %q acknowledged.", id, aType), + } +} + // ActionUnknown is an action that is not know by the current version of the Agent and we don't want // to return an error at parsing time but at execution time we can report or ignore. // @@ -174,12 +183,7 @@ func (a *ActionPolicyReassign) ID() string { } func (a *ActionPolicyReassign) AckEvent() AckEvent { - return AckEvent{ - EventType: "ACTION_RESULT", - SubType: "ACKNOWLEDGED", - ActionID: a.ActionID, - Message: fmt.Sprintf("Action %q of type %q acknowledged.", a.ActionID, a.ActionType), - } + return newAckEvent(a.ActionID, a.ActionType) } // ActionPolicyChange is a request to apply a new @@ -209,12 +213,7 @@ func (a *ActionPolicyChange) ID() string { } func (a *ActionPolicyChange) AckEvent() AckEvent { - return AckEvent{ - EventType: "ACTION_RESULT", - SubType: "ACKNOWLEDGED", - ActionID: a.ActionID, - Message: fmt.Sprintf("Action %q of type %q acknowledged.", a.ActionID, a.ActionType), - } + return newAckEvent(a.ActionID, a.ActionType) } // ActionUpgrade is a request for agent to upgrade. @@ -239,12 +238,7 @@ func (a *ActionUpgrade) String() string { } func (a *ActionUpgrade) AckEvent() AckEvent { - event := AckEvent{ - EventType: "ACTION_RESULT", - SubType: "ACKNOWLEDGED", - ActionID: a.ActionID, - Message: fmt.Sprintf("Action %q of type %q acknowledged.", a.ActionID, a.ActionType), - } + event := newAckEvent(a.ActionID, a.ActionType) if a.Err != nil { // FIXME Do we want to change EventType/SubType here? event.Error = a.Err.Error() @@ -349,12 +343,7 @@ func (a *ActionUnenroll) ID() string { } func (a *ActionUnenroll) AckEvent() AckEvent { - return AckEvent{ - EventType: "ACTION_RESULT", - SubType: "ACKNOWLEDGED", - ActionID: a.ActionID, - Message: fmt.Sprintf("Action %q of type %q acknowledged.", a.ActionID, a.ActionType), - } + return newAckEvent(a.ActionID, a.ActionType) } // ActionSettings is a request to change agent settings. @@ -386,12 +375,7 @@ func (a *ActionSettings) String() string { } func (a *ActionSettings) AckEvent() AckEvent { - return AckEvent{ - EventType: "ACTION_RESULT", - SubType: "ACKNOWLEDGED", - ActionID: a.ActionID, - Message: fmt.Sprintf("Action %q of type %q acknowledged.", a.ActionID, a.ActionType), - } + return newAckEvent(a.ActionID, a.ActionType) } // ActionCancel is a request to cancel an action. @@ -423,12 +407,7 @@ func (a *ActionCancel) String() string { } func (a *ActionCancel) AckEvent() AckEvent { - return AckEvent{ - EventType: "ACTION_RESULT", - SubType: "ACKNOWLEDGED", - ActionID: a.ActionID, - Message: fmt.Sprintf("Action %q of type %q acknowledged.", a.ActionID, a.ActionType), - } + return newAckEvent(a.ActionID, a.ActionType) } // ActionDiagnostics is a request to gather and upload a diagnostics bundle. 
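A minimal sketch, not part of the patch, of how the consolidated newAckEvent helper behaves for an arbitrary action; the id and type values are illustrative, and the check sits in the fleetapi package only because the helper is unexported:

	package fleetapi

	import "testing"

	func TestNewAckEvent(t *testing.T) {
		ev := newAckEvent("action-1", "UPGRADE")
		// The helper always reports a successful acknowledgement; callers such as
		// ActionUpgrade.AckEvent layer error and retry payloads on top of it.
		if ev.EventType != "ACTION_RESULT" || ev.SubType != "ACKNOWLEDGED" {
			t.Fatalf("unexpected type/subtype: %s/%s", ev.EventType, ev.SubType)
		}
		if want := `Action "action-1" of type "UPGRADE" acknowledged.`; ev.Message != want {
			t.Fatalf("unexpected message: %q", ev.Message)
		}
	}
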
@@ -459,12 +438,7 @@ func (a *ActionDiagnostics) String() string { } func (a *ActionDiagnostics) AckEvent() AckEvent { - event := AckEvent{ - EventType: "ACTION_RESULT", - SubType: "ACKNOWLEDGED", - ActionID: a.ActionID, - Message: fmt.Sprintf("Action %q of type %q acknowledged.", a.ActionID, a.ActionType), - } + event := newAckEvent(a.ActionID, a.ActionType) if a.Err != nil { event.Error = a.Err.Error() } From cffeca658de9e7e98cfa804997f2908bef062f24 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Wed, 4 Jan 2023 10:05:45 -0800 Subject: [PATCH 09/32] Retry upload for non-context errors --- .../handlers/handler_action_diagnostics.go | 5 +---- internal/pkg/fleetapi/uploader/uploader.go | 11 +++++++++-- internal/pkg/fleetapi/uploader/uploader_test.go | 16 +++++++++++++--- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go index e23bec87552..2a835143526 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -45,18 +45,17 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A if !ok { return fmt.Errorf("invalid type, expected ActionDiagnostics and received %T", a) } + defer ack.Ack(ctx, action) // Gather agent diagnostics aDiag, err := h.client.DiagnosticAgent(ctx) if err != nil { action.Err = err - _ = ack.Ack(ctx, action) return fmt.Errorf("unable to gather agent diagnostics: %w", err) } uDiag, err := h.client.DiagnosticUnits(ctx) if err != nil { action.Err = err - _ = ack.Ack(ctx, action) return fmt.Errorf("unable to gather unit diagnostics: %w", err) } @@ -64,14 +63,12 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A err = diagnostics.ZipArchive(&b, aDiag, uDiag) // TODO Do we want to pass a buffer/a reader around? or write the file to a temp dir and read (to avoid memory usage)? file usage may need more thought for containerized deployments if err != nil { action.Err = err - _ = ack.Ack(ctx, action) return fmt.Errorf("error creating diagnostics bundle: %w", err) } uploadID, err := h.uploader.UploadDiagnostics(ctx, action.ActionID, &b) action.Err = err action.UploadID = uploadID - _ = ack.Ack(ctx, action) if err != nil { return fmt.Errorf("unable to upload diagnostics: %w", err) } diff --git a/internal/pkg/fleetapi/uploader/uploader.go b/internal/pkg/fleetapi/uploader/uploader.go index 77ed15bbf48..2b805348a12 100644 --- a/internal/pkg/fleetapi/uploader/uploader.go +++ b/internal/pkg/fleetapi/uploader/uploader.go @@ -10,6 +10,7 @@ import ( "context" "crypto/sha256" "encoding/json" + "errors" "fmt" "io" "math" @@ -62,7 +63,8 @@ type retrySender struct { wait backoff.Backoff } -// Send calls the underlying Sender's Send method. If a 429 status code is returned the request is retried after a backoff period. +// Send calls the underlying Sender's Send method. +// If a non context-related error is returned or the 429 status code is returned the request is retried after a backoff period. func (r *retrySender) Send(ctx context.Context, method, path string, params url.Values, headers http.Header, body io.Reader) (resp *http.Response, err error) { r.wait.Reset() @@ -71,7 +73,12 @@ func (r *retrySender) Send(ctx context.Context, method, path string, params url. 
for i := 0; i < r.max; i++ { resp, err = r.c.Send(ctx, method, path, params, headers, tr) if err != nil { - return resp, err + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return resp, err + } + tr = bytes.NewReader(b.Bytes()) + r.wait.Wait() + continue } if resp.StatusCode == http.StatusTooManyRequests { tr = bytes.NewReader(b.Bytes()) diff --git a/internal/pkg/fleetapi/uploader/uploader_test.go b/internal/pkg/fleetapi/uploader/uploader_test.go index 0c03a16ecdf..a0314f433a9 100644 --- a/internal/pkg/fleetapi/uploader/uploader_test.go +++ b/internal/pkg/fleetapi/uploader/uploader_test.go @@ -96,14 +96,24 @@ func Test_retrySender_Send(t *testing.T) { status: 503, err: nil, }, { - name: "error", + name: "context error", sender: func() *mockSender { m := &mockSender{} - m.On("Send", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(&http.Response{}, errors.New("oh no")).Once() + m.On("Send", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(&http.Response{}, context.Canceled).Once() return m }, status: 0, - err: errors.New("oh no"), + err: context.Canceled, + }, { + name: "non-context error will retry", + sender: func() *mockSender { + m := &mockSender{} + m.On("Send", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(&http.Response{}, errors.New("oh no")).Once() + m.On("Send", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(&http.Response{StatusCode: 200}, nil).Once() + return m + }, + status: 200, + err: nil, }} backoff := &mockBackoff{} From 9e6cdb88847c6031c1fe81ab739f1e711599b443 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Thu, 5 Jan 2023 08:04:24 -0800 Subject: [PATCH 10/32] Fix linter --- .../handlers/handler_action_diagnostics.go | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go index 3c6c38f520b..97073c2e6d8 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -14,9 +14,6 @@ import ( "github.com/elastic/elastic-agent/internal/pkg/fleetapi" "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker" "github.com/elastic/elastic-agent/pkg/core/logger" - - "go.uber.org/zap/zapcore" - "go.uber.org/zap/zapio" ) // Uploader is the interface used to upload a diagnostics bundle to fleet-server. @@ -48,7 +45,7 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A if !ok { return fmt.Errorf("invalid type, expected ActionDiagnostics and received %T", a) } - defer ack.Ack(ctx, action) + defer ack.Ack(ctx, action) //nolint:errcheck // no path for a failed ack // Gather agent diagnostics aDiag, err := h.client.DiagnosticAgent(ctx) @@ -63,9 +60,16 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A } var b bytes.Buffer - eLog := zapio.Writer{Log, h.log, Level, zapcore.WarnLevel} // create a writer that outputs to the log at level:warn - defer eLog.Sync() - err = diagnostics.ZipArchive(eLog, &b, aDiag, uDiag) // TODO Do we want to pass a buffer/a reader around? or write the file to a temp dir and read (to avoid memory usage)? 
file usage may need more thought for containerized deployments + // create a buffer that any redaction error messages are written into as warnings. + // if the buffer is not empty after the bundle is assembled then the message is written to the log + // zapio.Writer would be a better way to pass a writer to ZipArchive, but logp embeds the zap.Logger so we are unable to access it here. + var wBuf bytes.Buffer + defer func() { + if str := wBuf.String(); str != "" { + h.log.Warn(str) + } + }() + err = diagnostics.ZipArchive(&wBuf, &b, aDiag, uDiag) // TODO Do we want to pass a buffer/a reader around? or write the file to a temp dir and read (to avoid memory usage)? file usage may need more thought for containerized deployments if err != nil { action.Err = err return fmt.Errorf("error creating diagnostics bundle: %w", err) From 43187542372dd6c1790f05f5627f333f7fba3220 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Mon, 23 Jan 2023 11:13:13 -0800 Subject: [PATCH 11/32] review feedback, fix diagnostics acks --- .../handlers/handler_action_diagnostics.go | 2 +- internal/pkg/fleetapi/ack_cmd.go | 1 + internal/pkg/fleetapi/action.go | 16 ++++++++-------- internal/pkg/fleetapi/uploader/uploader.go | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go index 97073c2e6d8..4ae9cf3da4c 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -77,7 +77,7 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A uploadID, err := h.uploader.UploadDiagnostics(ctx, action.ActionID, &b) action.Err = err - action.UploadID = uploadID + action.FileID = uploadID if err != nil { return fmt.Errorf("unable to upload diagnostics: %w", err) } diff --git a/internal/pkg/fleetapi/ack_cmd.go b/internal/pkg/fleetapi/ack_cmd.go index e8d8ac7e9e3..cd060a10f57 100644 --- a/internal/pkg/fleetapi/ack_cmd.go +++ b/internal/pkg/fleetapi/ack_cmd.go @@ -28,6 +28,7 @@ type AckEvent struct { AgentID string `json:"agent_id"` // : 'agent1', Message string `json:"message,omitempty"` // : 'hello2', Payload json.RawMessage `json:"payload,omitempty"` // : 'payload2', + Data json.RawMessage `json:"data,omitempty"` // : 'data', ActionInputType string `json:"action_input_type,omitempty"` // copy of original action input_type ActionData json.RawMessage `json:"action_data,omitempty"` // copy of original action data diff --git a/internal/pkg/fleetapi/action.go b/internal/pkg/fleetapi/action.go index 7a8276d6b05..40f7110a562 100644 --- a/internal/pkg/fleetapi/action.go +++ b/internal/pkg/fleetapi/action.go @@ -34,7 +34,7 @@ const ( // ActionTypeCancel specifies a cancel action. ActionTypeCancel = "CANCEL" // ActionTypeDiagnostics specifies a diagnostics action. 
- ActionTypeDiagnostics = "DIAGNOSTICS" + ActionTypeDiagnostics = "REQUEST_DIAGNOSTICS" ) // Error values that the Action interface can return @@ -414,7 +414,7 @@ func (a *ActionCancel) AckEvent() AckEvent { type ActionDiagnostics struct { ActionID string `json:"action_id"` ActionType string `json:"type"` - UploadID string `json:"-"` + FileID string `json:"-"` Err error `json:"-"` } @@ -442,13 +442,13 @@ func (a *ActionDiagnostics) AckEvent() AckEvent { if a.Err != nil { event.Error = a.Err.Error() } - if a.UploadID != "" { - var payload struct { - UploadID string `json:"upload_id"` + if a.FileID != "" { + var data struct { + FileID string `json:"file_id"` } - payload.UploadID = a.UploadID - p, _ := json.Marshal(payload) - event.Payload = p + data.FileID = a.FileID + p, _ := json.Marshal(data) + event.Data = p } return event diff --git a/internal/pkg/fleetapi/uploader/uploader.go b/internal/pkg/fleetapi/uploader/uploader.go index 2b805348a12..9fa8c4f252d 100644 --- a/internal/pkg/fleetapi/uploader/uploader.go +++ b/internal/pkg/fleetapi/uploader/uploader.go @@ -121,7 +121,7 @@ func (c *Client) New(ctx context.Context, r *NewUploadRequest) (*NewUploadRespon if err != nil { return nil, err } - resp, err := c.c.Send(ctx, "POST", PathNewUpload, nil, nil, bytes.NewBuffer(b)) + resp, err := c.c.Send(ctx, http.MethodPost, PathNewUpload, nil, nil, bytes.NewBuffer(b)) if err != nil { return nil, err } From 1a1a9d9fb9c729441054835b4d49e1330880e5e1 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Tue, 24 Jan 2023 09:32:39 -0800 Subject: [PATCH 12/32] Add JSON deserialization, fix yaml --- internal/pkg/fleetapi/action.go | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/internal/pkg/fleetapi/action.go b/internal/pkg/fleetapi/action.go index 40f7110a562..5480c0c378a 100644 --- a/internal/pkg/fleetapi/action.go +++ b/internal/pkg/fleetapi/action.go @@ -588,6 +588,16 @@ func (a *Actions) UnmarshalJSON(data []byte) error { "fail to decode CANCEL_ACTION action", errors.TypeConfig) } + case ActionTypeDiagnostics: + action = &ActionDiagnostics{ + ActionID: response.ActionID, + ActionType: response.ActionType, + } + if err := json.Unmarshal(response.Data, action); err != nil { + return errors.New(err, + "fail to decode REQUEST_DIAGNOSTICS_ACTION action", + errors.TypeConfig) + } default: action = &ActionUnknown{ ActionID: response.ActionID, @@ -681,9 +691,9 @@ func (a *Actions) UnmarshalYAML(unmarshal func(interface{}) error) error { ActionID: n.ActionID, ActionType: n.ActionType, } - if err := json.Unmarshal(n.Data, action); err != nil { + if err := yaml.Unmarshal(n.Data, action); err != nil { return errors.New(err, - "fail to decode DIAGNOSTICS_ACTION action", + "fail to decode REQUEST_DIAGNOSTICS_ACTION action", errors.TypeConfig) } default: From 55797c373c85765058174a519863118bab3e8e3e Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Wed, 25 Jan 2023 09:27:11 -0800 Subject: [PATCH 13/32] Add debug messages to handler --- .../actions/handlers/handler_action_diagnostics.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go index 4ae9cf3da4c..a5e3f335943 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -47,12 +47,14 @@ func (h *Diagnostics) Handle(ctx context.Context, a 
fleetapi.Action, ack acker.A } defer ack.Ack(ctx, action) //nolint:errcheck // no path for a failed ack + h.log.Debug("Gathering agent diagnostics.") // Gather agent diagnostics aDiag, err := h.client.DiagnosticAgent(ctx) if err != nil { action.Err = err return fmt.Errorf("unable to gather agent diagnostics: %w", err) } + h.log.Debug("Gathering unit diagnostics.") uDiag, err := h.client.DiagnosticUnits(ctx) if err != nil { action.Err = err @@ -69,17 +71,20 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A h.log.Warn(str) } }() + h.log.Debug("Assembling diagnostics archive.") err = diagnostics.ZipArchive(&wBuf, &b, aDiag, uDiag) // TODO Do we want to pass a buffer/a reader around? or write the file to a temp dir and read (to avoid memory usage)? file usage may need more thought for containerized deployments if err != nil { action.Err = err return fmt.Errorf("error creating diagnostics bundle: %w", err) } + h.log.Debug("Sending diagnostics archive.") uploadID, err := h.uploader.UploadDiagnostics(ctx, action.ActionID, &b) action.Err = err action.FileID = uploadID if err != nil { return fmt.Errorf("unable to upload diagnostics: %w", err) } + h.log.Debugf("Diagnostics action '%+v' complete.", a) return nil } From 3fa3e1b1d036751409f7a6f8c0911638ae064ebf Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Wed, 25 Jan 2023 12:27:36 -0800 Subject: [PATCH 14/32] Fix uploader implementation and handler bug --- .../handlers/handler_action_diagnostics.go | 5 +++ internal/pkg/fleetapi/uploader/uploader.go | 42 +++++++++++++------ 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go index a5e3f335943..e903fe92ffd 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -47,6 +47,11 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A } defer ack.Ack(ctx, action) //nolint:errcheck // no path for a failed ack + if err := h.client.Connect(ctx); err != nil { + action.Err = err + return fmt.Errorf("failed to connect to control server: %w", err) + } + h.log.Debug("Gathering agent diagnostics.") // Gather agent diagnostics aDiag, err := h.client.DiagnosticAgent(ctx) diff --git a/internal/pkg/fleetapi/uploader/uploader.go b/internal/pkg/fleetapi/uploader/uploader.go index 9fa8c4f252d..decbe809cb9 100644 --- a/internal/pkg/fleetapi/uploader/uploader.go +++ b/internal/pkg/fleetapi/uploader/uploader.go @@ -45,7 +45,7 @@ type FileData struct { type NewUploadRequest struct { ActionID string `json:"action_id"` AgentID string `json:"agent_id"` - Source string `json:"source"` + Source string `json:"src"` File FileData `json:"file"` Contents []FileData `json:"contents"` } @@ -56,6 +56,13 @@ type NewUploadResponse struct { ChunkSize int64 `json:"chunk_size"` } +// FinishRequest is the struct that is used when finalizing an upload. +type FinishRequest struct { + TransitHash struct { + SHA256 string `json:"sha256"` + } `json:"transithash"` +} + // retrySender wraps the underlying Sender with retry logic. type retrySender struct { c client.Sender @@ -140,8 +147,9 @@ func (c *Client) New(ctx context.Context, r *NewUploadRequest) (*NewUploadRespon } // Chunk uploads a file chunk to fleet-server. 
-func (c *Client) Chunk(ctx context.Context, uploadID string, chunkID int, r io.Reader) error { - resp, err := c.c.Send(ctx, "PUT", fmt.Sprintf(PathChunk, uploadID, chunkID), nil, nil, r) +func (c *Client) Chunk(ctx context.Context, uploadID string, chunkID int, sha256Hash []byte, r io.Reader) error { + h := http.Header{"X-Chunk-Sha2": {fmt.Sprintf("%x", sha256Hash)}} + resp, err := c.c.Send(ctx, "PUT", fmt.Sprintf(PathChunk, uploadID, chunkID), nil, h, r) if err != nil { return err } @@ -154,8 +162,13 @@ func (c *Client) Chunk(ctx context.Context, uploadID string, chunkID int, r io.R } // Finish calls the finalize endpoint for the file upload. -func (c *Client) Finish(ctx context.Context, id string) error { - resp, err := c.c.Send(ctx, "POST", fmt.Sprintf(PathFinishUpload, id), nil, nil, nil) +func (c *Client) Finish(ctx context.Context, id string, r *FinishRequest) error { + b, err := json.Marshal(r) + if err != nil { + return err + } + + resp, err := c.c.Send(ctx, "POST", fmt.Sprintf(PathFinishUpload, id), nil, nil, bytes.NewBuffer(b)) if err != nil { return err } @@ -168,6 +181,8 @@ func (c *Client) Finish(ctx context.Context, id string) error { } // UploadDiagnostics is a wrapper to upload a diagnostics request identified by the passed action id contained in the buffer to fleet-server. +// +// A buffer is used instead of a reader as we need to know the file length before uploading. func (c *Client) UploadDiagnostics(ctx context.Context, id string, b *bytes.Buffer) (string, error) { size := b.Len() upReq := NewUploadRequest{ @@ -179,12 +194,6 @@ func (c *Client) UploadDiagnostics(ctx context.Context, id string, b *bytes.Buff Name: fmt.Sprintf("elastic-agent-diagnostics-%s-%s.zip", c.agentID, id), Extension: "zip", Mime: "application/zip", - Hash: struct { - SHA256 string `json:"sha256"` - MD5 string `json:"md5"` - }{ - SHA256: fmt.Sprintf("%x", sha256.Sum256(b.Bytes())), - }, }, } upResp, err := c.New(ctx, &upReq) @@ -195,13 +204,20 @@ func (c *Client) UploadDiagnostics(ctx context.Context, id string, b *bytes.Buff uploadID := upResp.UploadID chunkSize := upResp.ChunkSize totalChunks := int(math.Ceil(float64(size) / float64(chunkSize))) + transitHash := sha256.New() for chunk := 0; chunk < totalChunks; chunk++ { - err := c.Chunk(ctx, uploadID, chunk, io.LimitReader(b, chunkSize)) + var data bytes.Buffer + io.CopyN(&data, b, chunkSize) //nolint:errcheck // copy chunkSize bytes to a buffer so we can get the checksum + hash := sha256.Sum256(data.Bytes()) + err := c.Chunk(ctx, uploadID, chunk, hash[:], &data) // hash[:] uses the array as a slice if err != nil { return uploadID, err } + transitHash.Write(hash[:]) //nolint:errcheck // used to calculate transit hash, no need to check errors on this write } + var fr FinishRequest + fr.TransitHash.SHA256 = fmt.Sprintf("%x", transitHash.Sum(nil)) - err = c.Finish(ctx, uploadID) + err = c.Finish(ctx, uploadID, &fr) return uploadID, err } From abe952e6d444b85e822b00408f5367529947c5af Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Wed, 25 Jan 2023 13:11:22 -0800 Subject: [PATCH 15/32] Review feedback --- internal/pkg/fleetapi/uploader/uploader.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/pkg/fleetapi/uploader/uploader.go b/internal/pkg/fleetapi/uploader/uploader.go index decbe809cb9..ae0f6cd805a 100644 --- a/internal/pkg/fleetapi/uploader/uploader.go +++ b/internal/pkg/fleetapi/uploader/uploader.go @@ -188,7 +188,7 @@ func (c *Client) UploadDiagnostics(ctx context.Context, id string, b *bytes.Buff upReq 
:= NewUploadRequest{ ActionID: id, AgentID: c.agentID, - Source: "elastic-agent", + Source: "agent", File: FileData{ Size: int64(size), Name: fmt.Sprintf("elastic-agent-diagnostics-%s-%s.zip", c.agentID, id), @@ -207,13 +207,13 @@ func (c *Client) UploadDiagnostics(ctx context.Context, id string, b *bytes.Buff transitHash := sha256.New() for chunk := 0; chunk < totalChunks; chunk++ { var data bytes.Buffer - io.CopyN(&data, b, chunkSize) //nolint:errcheck // copy chunkSize bytes to a buffer so we can get the checksum + io.CopyN(&data, b, chunkSize) //nolint:errorcheck // copy chunkSize bytes to a buffer so we can get the checksum hash := sha256.Sum256(data.Bytes()) err := c.Chunk(ctx, uploadID, chunk, hash[:], &data) // hash[:] uses the array as a slice if err != nil { return uploadID, err } - transitHash.Write(hash[:]) //nolint:errcheck // used to calculate transit hash, no need to check errors on this write + transitHash.Write(hash[:]) //nolint:errorcheck // used to calculate transit hash, no need to check errors on this write } var fr FinishRequest fr.TransitHash.SHA256 = fmt.Sprintf("%x", transitHash.Sum(nil)) From aea2b959dafc894df64dec17520c227f4a104045 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Wed, 25 Jan 2023 15:38:53 -0800 Subject: [PATCH 16/32] Fix linter --- internal/pkg/fleetapi/uploader/uploader.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/pkg/fleetapi/uploader/uploader.go b/internal/pkg/fleetapi/uploader/uploader.go index ae0f6cd805a..7ebebf26046 100644 --- a/internal/pkg/fleetapi/uploader/uploader.go +++ b/internal/pkg/fleetapi/uploader/uploader.go @@ -207,13 +207,13 @@ func (c *Client) UploadDiagnostics(ctx context.Context, id string, b *bytes.Buff transitHash := sha256.New() for chunk := 0; chunk < totalChunks; chunk++ { var data bytes.Buffer - io.CopyN(&data, b, chunkSize) //nolint:errorcheck // copy chunkSize bytes to a buffer so we can get the checksum + io.CopyN(&data, b, chunkSize) //nolint:errcheck // copy chunkSize bytes to a buffer so we can get the checksum hash := sha256.Sum256(data.Bytes()) err := c.Chunk(ctx, uploadID, chunk, hash[:], &data) // hash[:] uses the array as a slice if err != nil { return uploadID, err } - transitHash.Write(hash[:]) //nolint:errorcheck // used to calculate transit hash, no need to check errors on this write + transitHash.Write(hash[:]) // used to calculate transit hash, no need to check errors on this write } var fr FinishRequest fr.TransitHash.SHA256 = fmt.Sprintf("%x", transitHash.Sum(nil)) From db52a043253b6c8430e26ba9fa9c26173486d0b8 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Thu, 26 Jan 2023 10:54:12 -0800 Subject: [PATCH 17/32] Change diag ack to use upload_id add dates to diag directories Change the data attribute of a diagnostics request ack to use upload_id to be consistent with the upload endpoints values. Add creation times for directories when creating a zip archive so they do not use a 0 timestamp value. 
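A minimal sketch, outside the patch itself, of the zip directory handling described above: directory entries are written through CreateHeader so the archive records a real modification time instead of the zero value. It assumes a caller that already holds a *zip.Writer; the package and helper names are illustrative only.

	package diagnostics

	import (
		"archive/zip"
		"time"
	)

	// addDir writes an explicit directory entry (name should end in "/") with the
	// supplied timestamp so extracted directories carry a meaningful mtime
	// rather than the zero value.
	func addDir(zw *zip.Writer, name string, ts time.Time) error {
		_, err := zw.CreateHeader(&zip.FileHeader{
			Name:     name,
			Method:   zip.Deflate,
			Modified: ts,
		})
		return err
	}
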
--- .../handlers/handler_action_diagnostics.go | 6 ++-- internal/pkg/diagnostics/diagnostics.go | 31 ++++++++++++++++--- internal/pkg/fleetapi/action.go | 8 ++--- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go index e903fe92ffd..6b42c8d9214 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -8,6 +8,7 @@ import ( "bytes" "context" "fmt" + "time" "github.com/elastic/elastic-agent/internal/pkg/agent/control/v2/client" "github.com/elastic/elastic-agent/internal/pkg/diagnostics" @@ -45,6 +46,7 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A if !ok { return fmt.Errorf("invalid type, expected ActionDiagnostics and received %T", a) } + ts := time.Now().UTC() defer ack.Ack(ctx, action) //nolint:errcheck // no path for a failed ack if err := h.client.Connect(ctx); err != nil { @@ -84,9 +86,9 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A } h.log.Debug("Sending diagnostics archive.") - uploadID, err := h.uploader.UploadDiagnostics(ctx, action.ActionID, &b) + uploadID, err := h.uploader.UploadDiagnostics(ctx, ts.Format("2006-01-02T15-04-05Z07-00"), &b) // RFC3339 format that uses - instead of : so it works on Windows action.Err = err - action.FileID = uploadID + action.UploadID = uploadID if err != nil { return fmt.Errorf("unable to upload diagnostics: %w", err) } diff --git a/internal/pkg/diagnostics/diagnostics.go b/internal/pkg/diagnostics/diagnostics.go index 083d61e63f4..d5262841bcc 100644 --- a/internal/pkg/diagnostics/diagnostics.go +++ b/internal/pkg/diagnostics/diagnostics.go @@ -117,12 +117,17 @@ func pprofDiag(name string) func(context.Context) ([]byte, time.Time) { // ZipArchive creates a zipped diagnostics bundle using the passed writer with the passed diagnostics. // If any error is encountered when writing the contents of the archive it is returned. 
func ZipArchive(errOut, w io.Writer, agentDiag []client.DiagnosticFileResult, unitDiags []client.DiagnosticUnitResult) error { + ts := time.Now().UTC() zw := zip.NewWriter(w) defer zw.Close() // Create directories in the zip archive before writing any files for _, ad := range agentDiag { if ad.ContentType == ContentTypeDirectory { - _, err := zw.Create(ad.Filename) + _, err := zw.CreateHeader(&zip.FileHeader{ + Name: ad.Filename, + Method: zip.Deflate, + Modified: ts, + }) if err != nil { return err } @@ -155,23 +160,39 @@ func ZipArchive(errOut, w io.Writer, agentDiag []client.DiagnosticFileResult, un } // write each units diagnostics into its own directory // layout becomes components/// - _, err := zw.Create("components/") + _, err := zw.CreateHeader(&zip.FileHeader{ + Name: "components/", + Method: zip.Deflate, + Modified: ts, + }) if err != nil { return err } for dirName, units := range compDirs { - _, err = zw.Create(fmt.Sprintf("components/%s/", dirName)) + _, err := zw.CreateHeader(&zip.FileHeader{ + Name: fmt.Sprintf("components/%s/", dirName), + Method: zip.Deflate, + Modified: ts, + }) if err != nil { return err } for _, ud := range units { unitDir := strings.ReplaceAll(strings.TrimPrefix(ud.UnitID, ud.ComponentID+"-"), "/", "-") - _, err = zw.Create(fmt.Sprintf("components/%s/%s/", dirName, unitDir)) + _, err := zw.CreateHeader(&zip.FileHeader{ + Name: fmt.Sprintf("components/%s/%s/", dirName, unitDir), + Method: zip.Deflate, + Modified: ts, + }) if err != nil { return err } if ud.Err != nil { - w, err := zw.Create(fmt.Sprintf("components/%s/%s/error.txt", dirName, unitDir)) + w, err := zw.CreateHeader(&zip.FileHeader{ + Name: fmt.Sprintf("components/%s/%s/error.txt", dirName, unitDir), + Method: zip.Deflate, + Modified: ts, + }) if err != nil { return err } diff --git a/internal/pkg/fleetapi/action.go b/internal/pkg/fleetapi/action.go index 5480c0c378a..dc40da63a22 100644 --- a/internal/pkg/fleetapi/action.go +++ b/internal/pkg/fleetapi/action.go @@ -414,7 +414,7 @@ func (a *ActionCancel) AckEvent() AckEvent { type ActionDiagnostics struct { ActionID string `json:"action_id"` ActionType string `json:"type"` - FileID string `json:"-"` + UploadID string `json:"-"` Err error `json:"-"` } @@ -442,11 +442,11 @@ func (a *ActionDiagnostics) AckEvent() AckEvent { if a.Err != nil { event.Error = a.Err.Error() } - if a.FileID != "" { + if a.UploadID != "" { var data struct { - FileID string `json:"file_id"` + UploadID string `json:"file_id"` } - data.FileID = a.FileID + data.UploadID = a.UploadID p, _ := json.Marshal(data) event.Data = p } From 214fb22595c599c97070e97d7034353a5e9b5320 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Thu, 26 Jan 2023 17:55:19 -0800 Subject: [PATCH 18/32] Add rate limiter to diagnostics action handler --- .../handlers/handler_action_diagnostics.go | 18 +++++++++- .../pkg/agent/application/managed_mode.go | 3 +- internal/pkg/core/monitoring/config/config.go | 34 ++++++++++++++++--- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go index 6b42c8d9214..5dc199cc743 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -11,12 +11,21 @@ import ( "time" "github.com/elastic/elastic-agent/internal/pkg/agent/control/v2/client" + 
"github.com/elastic/elastic-agent/internal/pkg/core/monitoring/config" "github.com/elastic/elastic-agent/internal/pkg/diagnostics" "github.com/elastic/elastic-agent/internal/pkg/fleetapi" "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker" "github.com/elastic/elastic-agent/pkg/core/logger" + + "golang.org/x/time/rate" ) +// ErrRateLimit is the rate limit error that is returned if the handler is ran too often. +// This may occur if the user sends multiple diagnostics actions to an agent in a short duration +// or if the agent goes offline and retrieves multiple diagnostics actions. +// In either case the 1st action will succeed and the others will ack with an the error. +var ErrRateLimit = fmt.Errorf("rate limit exceeded") + // Uploader is the interface used to upload a diagnostics bundle to fleet-server. type Uploader interface { UploadDiagnostics(context.Context, string, *bytes.Buffer) (string, error) @@ -26,14 +35,16 @@ type Uploader interface { // When a Diagnostics action is received a full diagnostics bundle is taken and uploaded to fleet-server. type Diagnostics struct { log *logger.Logger + limiter *rate.Limiter client client.Client uploader Uploader } // NewDiagnostics returns a new Diagnostics handler. -func NewDiagnostics(log *logger.Logger, uploader Uploader) *Diagnostics { +func NewDiagnostics(log *logger.Logger, cfg config.Limit, uploader Uploader) *Diagnostics { return &Diagnostics{ log: log, + limiter: rate.NewLimiter(rate.Every(cfg.Interval), cfg.Burst), client: client.New(), uploader: uploader, } @@ -49,6 +60,11 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A ts := time.Now().UTC() defer ack.Ack(ctx, action) //nolint:errcheck // no path for a failed ack + if !h.limiter.Allow() { + action.Err = ErrRateLimit + return ErrRateLimit + } + if err := h.client.Connect(ctx); err != nil { action.Err = err return fmt.Errorf("failed to connect to control server: %w", err) diff --git a/internal/pkg/agent/application/managed_mode.go b/internal/pkg/agent/application/managed_mode.go index 6b1a9941746..929524de692 100644 --- a/internal/pkg/agent/application/managed_mode.go +++ b/internal/pkg/agent/application/managed_mode.go @@ -353,7 +353,8 @@ func (m *managedConfigManager) initDispatcher(canceller context.CancelFunc) *han &fleetapi.ActionDiagnostics{}, handlers.NewDiagnostics( m.log, - uploader.New(m.agentInfo.AgentID(), m.client, m.cfg.Settings.MonitoringConfig.Uploader), + m.cfg.Settings.MonitoringConfig.Diagnostics.Limit, + uploader.New(m.agentInfo.AgentID(), m.client, m.cfg.Settings.MonitoringConfig.Diagnostics.Uploader), ), ) diff --git a/internal/pkg/core/monitoring/config/config.go b/internal/pkg/core/monitoring/config/config.go index 7366c0eed44..8444264dc91 100644 --- a/internal/pkg/core/monitoring/config/config.go +++ b/internal/pkg/core/monitoring/config/config.go @@ -20,7 +20,7 @@ type MonitoringConfig struct { Pprof *PprofConfig `yaml:"pprof" config:"pprof"` MonitorTraces bool `yaml:"traces" config:"traces"` APM APMConfig `yaml:"apm,omitempty" config:"apm,omitempty" json:"apm,omitempty"` - Uploader Uploader `yaml:"uploader,omitempty" json:"uploader,omitempty"` + Diagnostics Diagnostics `yaml:"diagnostics,omitempty" json:"diagnostics,omitempty"` } // MonitoringHTTPConfig is a config defining HTTP endpoint published by agent @@ -58,9 +58,9 @@ func DefaultConfig() *MonitoringConfig { Host: "localhost", Port: defaultPort, }, - Namespace: defaultNamespace, - APM: defaultAPMConfig(), - Uploader: defaultUploader(), + Namespace: 
defaultNamespace, + APM: defaultAPMConfig(), + Diagnostics: defaultDiagnostics(), } } @@ -99,3 +99,29 @@ func defaultUploader() Uploader { MaxDur: time.Minute * 10, } } + +// Limit contains the configuration for rate-limiting operations +type Limit struct { + Interval time.Duration `config:"interval"` + Burst int `config:"burst"` +} + +func defaultLimit() Limit { + return Limit{ + Interval: time.Minute, + Burst: 1, + } +} + +// Diagnostics containts the configuration needed to configure the diagnostics handler. +type Diagnostics struct { + Uploader Uploader `config:"uploader"` + Limit Limit `config:"limit"` +} + +func defaultDiagnostics() Diagnostics { + return Diagnostics{ + Uploader: defaultUploader(), + Limit: defaultLimit(), + } +} From 78ba518282a22fffab0837c9aafd0bae24fed421 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Thu, 26 Jan 2023 18:08:00 -0800 Subject: [PATCH 19/32] update config --- _meta/config/common.p2.yml.tmpl | 16 ++++++++++++++++ _meta/config/common.reference.p2.yml.tmpl | 16 ++++++++++++++++ elastic-agent.reference.yml | 16 ++++++++++++++++ elastic-agent.yml | 16 ++++++++++++++++ 4 files changed, 64 insertions(+) diff --git a/_meta/config/common.p2.yml.tmpl b/_meta/config/common.p2.yml.tmpl index f6d930898a6..160b8de5181 100644 --- a/_meta/config/common.p2.yml.tmpl +++ b/_meta/config/common.p2.yml.tmpl @@ -49,6 +49,22 @@ inputs: # port: 6791 # # Metrics buffer endpoint # buffer.enabled: false +# # Configuration for the diagnostics action handler +# diagnostics: +# # Rate limit for the action handler. Does not affect diagnostics collected through the CLI. +# limit: +# # Rate limit interval. +# interval: 1m +# # Rate limit burst. +# burst: 1 +# # Configuration for the file-upload client. Client may retry failed requests with an exponential backoff. +# uploader: +# # Max retries allowed when uploading a chunk. +# max_retries: 10 +# # Initial duration of the backoff. +# init_dur: 1s +# # Max duration of the backoff. +# max_dur: 1m # # Allow fleet to reload its configuration locally on disk. # # Notes: Only specific process configuration will be reloaded. diff --git a/_meta/config/common.reference.p2.yml.tmpl b/_meta/config/common.reference.p2.yml.tmpl index d48d590d618..d6ade3ca971 100644 --- a/_meta/config/common.reference.p2.yml.tmpl +++ b/_meta/config/common.reference.p2.yml.tmpl @@ -125,6 +125,22 @@ inputs: # port: 6791 # # Metrics buffer endpoint # buffer.enabled: false +# # Configuration for the diagnostics action handler +# diagnostics: +# # Rate limit for the action handler. Does not affect diagnostics collected through the CLI. +# limit: +# # Rate limit interval. +# interval: 1m +# # Rate limit burst. +# burst: 1 +# # Configuration for the file-upload client. Client may retry failed requests with an exponential backoff. +# uploader: +# # Max retries allowed when uploading a chunk. +# max_retries: 10 +# # Initial duration of the backoff. +# init_dur: 1s +# # Max duration of the backoff. +# max_dur: 1m # # Allow fleet to reload its configuration locally on disk. # # Notes: Only specific process configuration and external input configurations will be reloaded. diff --git a/elastic-agent.reference.yml b/elastic-agent.reference.yml index 55482b5bf81..d4e029f47eb 100644 --- a/elastic-agent.reference.yml +++ b/elastic-agent.reference.yml @@ -131,6 +131,22 @@ inputs: # port: 6791 # # Metrics buffer endpoint # buffer.enabled: false +# # Configuration for the diagnostics action handler +# diagnostics: +# # Rate limit for the action handler. 
Does not affect diagnostics collected through the CLI. +# limit: +# # Rate limit interval. +# interval: 1m +# # Rate limit burst. +# burst: 1 +# # Configuration for the file-upload client. Client may retry failed requests with an exponential backoff. +# uploader: +# # Max retries allowed when uploading a chunk. +# max_retries: 10 +# # Initial duration of the backoff. +# init_dur: 1s +# # Max duration of the backoff. +# max_dur: 1m # # Allow fleet to reload its configuration locally on disk. # # Notes: Only specific process configuration and external input configurations will be reloaded. diff --git a/elastic-agent.yml b/elastic-agent.yml index 5da59d4fd81..a427917da9e 100644 --- a/elastic-agent.yml +++ b/elastic-agent.yml @@ -55,6 +55,22 @@ inputs: # port: 6791 # # Metrics buffer endpoint # buffer.enabled: false +# # Configuration for the diagnostics action handler +# diagnostics: +# # Rate limit for the action handler. Does not affect diagnostics collected through the CLI. +# limit: +# # Rate limit interval. +# interval: 1m +# # Rate limit burst. +# burst: 1 +# # Configuration for the file-upload client. Client may retry failed requests with an exponential backoff. +# uploader: +# # Max retries allowed when uploading a chunk. +# max_retries: 10 +# # Initial duration of the backoff. +# init_dur: 1s +# # Max duration of the backoff. +# max_dur: 1m # # Allow fleet to reload its configuration locally on disk. # # Notes: Only specific process configuration will be reloaded. From 2693ce2db559f86c55b04dac000ae08d1d7aa356 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Thu, 26 Jan 2023 18:18:23 -0800 Subject: [PATCH 20/32] Add changelog fragment, fix linter --- ...785407-add-diagnostics-action-handler.yaml | 36 +++++++++++++++++++ internal/pkg/core/monitoring/config/config.go | 2 +- 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 changelog/fragments/1674785407-add-diagnostics-action-handler.yaml diff --git a/changelog/fragments/1674785407-add-diagnostics-action-handler.yaml b/changelog/fragments/1674785407-add-diagnostics-action-handler.yaml new file mode 100644 index 00000000000..a1c24f7e0d1 --- /dev/null +++ b/changelog/fragments/1674785407-add-diagnostics-action-handler.yaml @@ -0,0 +1,36 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: feature + +# Change summary; a 80ish characters long description of the change. +summary: add diagnostics action handler + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +description: | + Add support for the REQUEST_DIAGNOSTICS action. + When this action is recieved the agent will collect a diagnostics bundle and + uploads it to fleet-server using the file upload APIs. + The handler has a configurable rate limit in order to prevent DOS attacks. + The uploader may retry failures with a configurable exponential backoff. 
+ +# Affected component; a word indicating the component this changeset affects. +component: diagnostics + +# PR number; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +#pr: 1234 + +# Issue number; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: 1234 diff --git a/internal/pkg/core/monitoring/config/config.go b/internal/pkg/core/monitoring/config/config.go index 8444264dc91..50027b4830f 100644 --- a/internal/pkg/core/monitoring/config/config.go +++ b/internal/pkg/core/monitoring/config/config.go @@ -113,7 +113,7 @@ func defaultLimit() Limit { } } -// Diagnostics containts the configuration needed to configure the diagnostics handler. +// Diagnostics contains the configuration needed to configure the diagnostics handler. type Diagnostics struct { Uploader Uploader `config:"uploader"` Limit Limit `config:"limit"` From b29e75f89646e7d3f78d4fa4a6d447c4b189533a Mon Sep 17 00:00:00 2001 From: Julia Bardi Date: Fri, 27 Jan 2023 11:11:09 +0100 Subject: [PATCH 21/32] changed file_id to upload_id, updated changelog --- .../fragments/1674785407-add-diagnostics-action-handler.yaml | 4 ++-- internal/pkg/fleetapi/action.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/changelog/fragments/1674785407-add-diagnostics-action-handler.yaml b/changelog/fragments/1674785407-add-diagnostics-action-handler.yaml index a1c24f7e0d1..08136b6cbbe 100644 --- a/changelog/fragments/1674785407-add-diagnostics-action-handler.yaml +++ b/changelog/fragments/1674785407-add-diagnostics-action-handler.yaml @@ -29,8 +29,8 @@ component: diagnostics # If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. # NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. # Please provide it if you are adding a fragment for a different PR. -#pr: 1234 +pr: 1703 # Issue number; optional; the GitHub issue related to this changeset (either closes or is part of). # If not present is automatically filled by the tooling with the issue linked to the PR number. 
-#issue: 1234 +issue: 1883 diff --git a/internal/pkg/fleetapi/action.go b/internal/pkg/fleetapi/action.go index dc40da63a22..4e4c8d8e2b3 100644 --- a/internal/pkg/fleetapi/action.go +++ b/internal/pkg/fleetapi/action.go @@ -444,7 +444,7 @@ func (a *ActionDiagnostics) AckEvent() AckEvent { } if a.UploadID != "" { var data struct { - UploadID string `json:"file_id"` + UploadID string `json:"upload_id"` } data.UploadID = a.UploadID p, _ := json.Marshal(data) From abd31a8289a9dd4cfe4c86b6f37d131e039161d4 Mon Sep 17 00:00:00 2001 From: Julia Bardi Date: Fri, 27 Jan 2023 11:25:26 +0100 Subject: [PATCH 22/32] updated diagnostics file name --- internal/pkg/fleetapi/uploader/uploader.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/fleetapi/uploader/uploader.go b/internal/pkg/fleetapi/uploader/uploader.go index 7ebebf26046..bfa0f17a38f 100644 --- a/internal/pkg/fleetapi/uploader/uploader.go +++ b/internal/pkg/fleetapi/uploader/uploader.go @@ -191,7 +191,7 @@ func (c *Client) UploadDiagnostics(ctx context.Context, id string, b *bytes.Buff Source: "agent", File: FileData{ Size: int64(size), - Name: fmt.Sprintf("elastic-agent-diagnostics-%s-%s.zip", c.agentID, id), + Name: fmt.Sprintf("elastic-agent-diagnostics-%s.zip", id), Extension: "zip", Mime: "application/zip", }, From 383a576b0bd00fff9bd92755d90b857346c523ea Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Fri, 27 Jan 2023 16:37:52 -0800 Subject: [PATCH 23/32] Revert hooks changes, move log collection to ZipArchive --- .../handlers/handler_action_diagnostics.go | 72 +++- .../application/coordinator/coordinator.go | 356 ++++++------------ .../pkg/agent/application/managed_mode.go | 1 + internal/pkg/agent/cmd/run.go | 4 +- internal/pkg/agent/control/v2/control_test.go | 2 +- .../pkg/agent/control/v2/server/server.go | 44 +-- internal/pkg/diagnostics/diagnostics.go | 183 +++++++-- 7 files changed, 337 insertions(+), 325 deletions(-) diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go index 5dc199cc743..fb8e96abab2 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -10,6 +10,7 @@ import ( "fmt" "time" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator" "github.com/elastic/elastic-agent/internal/pkg/agent/control/v2/client" "github.com/elastic/elastic-agent/internal/pkg/core/monitoring/config" "github.com/elastic/elastic-agent/internal/pkg/diagnostics" @@ -35,17 +36,17 @@ type Uploader interface { // When a Diagnostics action is received a full diagnostics bundle is taken and uploaded to fleet-server. type Diagnostics struct { log *logger.Logger + coord *coordinator.Coordinator limiter *rate.Limiter - client client.Client uploader Uploader } // NewDiagnostics returns a new Diagnostics handler. 
-func NewDiagnostics(log *logger.Logger, cfg config.Limit, uploader Uploader) *Diagnostics { +func NewDiagnostics(log *logger.Logger, coord *coordinator.Coordinator, cfg config.Limit, uploader Uploader) *Diagnostics { return &Diagnostics{ log: log, + coord: coord, limiter: rate.NewLimiter(rate.Every(cfg.Interval), cfg.Burst), - client: client.New(), uploader: uploader, } } @@ -65,24 +66,14 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A return ErrRateLimit } - if err := h.client.Connect(ctx); err != nil { - action.Err = err - return fmt.Errorf("failed to connect to control server: %w", err) - } - h.log.Debug("Gathering agent diagnostics.") - // Gather agent diagnostics - aDiag, err := h.client.DiagnosticAgent(ctx) + aDiag, err := h.runHooks(ctx) if err != nil { action.Err = err return fmt.Errorf("unable to gather agent diagnostics: %w", err) } h.log.Debug("Gathering unit diagnostics.") - uDiag, err := h.client.DiagnosticUnits(ctx) - if err != nil { - action.Err = err - return fmt.Errorf("unable to gather unit diagnostics: %w", err) - } + uDiag := h.diagUnits(ctx) var b bytes.Buffer // create a buffer that any redaction error messages are written into as warnings. @@ -95,7 +86,7 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A } }() h.log.Debug("Assembling diagnostics archive.") - err = diagnostics.ZipArchive(&wBuf, &b, aDiag, uDiag) // TODO Do we want to pass a buffer/a reader around? or write the file to a temp dir and read (to avoid memory usage)? file usage may need more thought for containerized deployments + err = diagnostics.ZipArchive(&wBuf, &b, aDiag, uDiag) if err != nil { action.Err = err return fmt.Errorf("error creating diagnostics bundle: %w", err) @@ -111,3 +102,52 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A h.log.Debugf("Diagnostics action '%+v' complete.", a) return nil } + +func (h *Diagnostics) runHooks(ctx context.Context) ([]client.DiagnosticFileResult, error) { + hooks := append(h.coord.DiagnosticHooks(), diagnostics.GlobalHooks()...) 
+ diags := make([]client.DiagnosticFileResult, 0, len(hooks)) + for _, hook := range hooks { + if ctx.Err != nil { + return diags, ctx.Err() + } + diags = append(diags, client.DiagnosticFileResult{ + Name: hook.Name, + Filename: hook.Filename, + Description: hook.Description, + ContentType: hook.ContentType, + Content: hook.Hook(), + Generated: time.Now().UTC(), + }) + } + return diags, nil +} + +func (h *Diagnostics) diagUnits(ctx context.Context) []client.DiagnosticUnitResult { + uDiag := make([]client.DiagnosticUnitResult, 0) + rr := h.coord.PerformDiagnostics(ctx) + for _, r := range rr { + diag := client.DiagnosticUnitResult{ + ComponentID: r.Component.ID, + UnitID: r.Unit.ID, + UnitType: r.Unit.Type, + } + if r.Err != nil { + diag.Error = r.Err.Error() + } else { + results := make([]client.DiagnosticFileResult, 0, len(r.Results)) + for _, res := range r.Results { + results = append(results, client.DiagnosticFileResult{ + Name: res.Name, + Filename: res.Filename, + Description: res.Description, + ContentType: res.ContentType, + Content: res.Content, + Generated: res.Generated, + }) + } + diag.Results = results + } + uDiag = append(uDiag, diag) + } + return uDiag +} diff --git a/internal/pkg/agent/application/coordinator/coordinator.go b/internal/pkg/agent/application/coordinator/coordinator.go index 75e5331e11e..2a393f8957f 100644 --- a/internal/pkg/agent/application/coordinator/coordinator.go +++ b/internal/pkg/agent/application/coordinator/coordinator.go @@ -8,31 +8,25 @@ import ( "context" "errors" "fmt" - "io" - "io/fs" - "os" - "path/filepath" - "strings" - "time" "github.com/elastic/elastic-agent-libs/logp" "gopkg.in/yaml.v2" + "github.com/elastic/elastic-agent/internal/pkg/diagnostics" + "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker" + + "github.com/elastic/elastic-agent/internal/pkg/fleetapi" + "go.elastic.co/apm" "github.com/elastic/elastic-agent-client/v7/pkg/client" - "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" - "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/internal/pkg/agent/application/reexec" agentclient "github.com/elastic/elastic-agent/internal/pkg/agent/control/v2/client" "github.com/elastic/elastic-agent/internal/pkg/agent/transpiler" "github.com/elastic/elastic-agent/internal/pkg/capabilities" "github.com/elastic/elastic-agent/internal/pkg/config" - "github.com/elastic/elastic-agent/internal/pkg/diagnostics" - "github.com/elastic/elastic-agent/internal/pkg/fleetapi" - "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker" "github.com/elastic/elastic-agent/pkg/component" "github.com/elastic/elastic-agent/pkg/component/runtime" "github.com/elastic/elastic-agent/pkg/core/logger" @@ -299,7 +293,7 @@ func (c *Coordinator) Upgrade(ctx context.Context, version string, sourceURI str return nil } -// AckUpgrade is the method used on startup to ack a prevously successful upgrade action. +// AckUpgrade performs acknowledgement for upgrade. func (c *Coordinator) AckUpgrade(ctx context.Context, acker acker.Acker) error { return c.upgradeMgr.Ack(ctx, acker) } @@ -417,249 +411,117 @@ func (c *Coordinator) Run(ctx context.Context) error { } } -// DiagnosticHooks returns diagnostic hooks callback that can be connected to the control server to provide -// diagnostic information about the state of the Elastic Agent. A callback is used in order to all log files -// at the time of diagnostics. 
-func (c *Coordinator) DiagnosticHooks() func() diagnostics.Hooks { - return func() diagnostics.Hooks { - hooks := diagnostics.Hooks{ - diagnostics.Hook{ - Name: "pre-config", - Filename: "pre-config.yaml", - Description: "current pre-configuration of the running Elastic Agent before variable substitution", - ContentType: "application/yaml", - Hook: func(_ context.Context) ([]byte, time.Time) { - ts := time.Now().UTC() - if c.state.ast == nil { - return []byte("error: failed no configuration by the coordinator"), ts - } - cfg, err := c.state.ast.Map() - if err != nil { - return []byte(fmt.Sprintf("error: %q", err)), ts - } - o, err := yaml.Marshal(cfg) - if err != nil { - return []byte(fmt.Sprintf("error: %q", err)), ts - } - return o, ts - }, +// DiagnosticHooks returns diagnostic hooks that can be connected to the control server to provide diagnostic +// information about the state of the Elastic Agent. +func (c *Coordinator) DiagnosticHooks() diagnostics.Hooks { + return diagnostics.Hooks{ + { + Name: "pre-config", + Filename: "pre-config.yaml", + Description: "current pre-configuration of the running Elastic Agent before variable substitution", + ContentType: "application/yaml", + Hook: func(_ context.Context) []byte { + if c.state.ast == nil { + return []byte("error: failed no configuration by the coordinator") + } + cfg, err := c.state.ast.Map() + if err != nil { + return []byte(fmt.Sprintf("error: %q", err)) + } + o, err := yaml.Marshal(cfg) + if err != nil { + return []byte(fmt.Sprintf("error: %q", err)) + } + return o }, - diagnostics.Hook{ - Name: "variables", - Filename: "variables.yaml", - Description: "current variable contexts of the running Elastic Agent", - ContentType: "application/yaml", - Hook: func(_ context.Context) ([]byte, time.Time) { - ts := time.Now().UTC() - if c.state.vars == nil { - return []byte("error: failed no variables by the coordinator"), ts - } - vars := make([]map[string]interface{}, 0, len(c.state.vars)) - for _, v := range c.state.vars { - m, err := v.Map() - if err != nil { - return []byte(fmt.Sprintf("error: %q", err)), ts - } - vars = append(vars, m) - } - o, err := yaml.Marshal(struct { - Variables []map[string]interface{} `yaml:"variables"` - }{ - Variables: vars, - }) + }, + { + Name: "variables", + Filename: "variables.yaml", + Description: "current variable contexts of the running Elastic Agent", + ContentType: "application/yaml", + Hook: func(_ context.Context) []byte { + if c.state.vars == nil { + return []byte("error: failed no variables by the coordinator") + } + vars := make([]map[string]interface{}, 0, len(c.state.vars)) + for _, v := range c.state.vars { + m, err := v.Map() if err != nil { - return []byte(fmt.Sprintf("error: %q", err)), ts + return []byte(fmt.Sprintf("error: %q", err)) } - return o, ts - }, + vars = append(vars, m) + } + o, err := yaml.Marshal(struct { + Variables []map[string]interface{} `yaml:"variables"` + }{ + Variables: vars, + }) + if err != nil { + return []byte(fmt.Sprintf("error: %q", err)) + } + return o }, - diagnostics.Hook{ - Name: "computed-config", - Filename: "computed-config.yaml", - Description: "current computed configuration of the running Elastic Agent after variable substitution", - ContentType: "application/yaml", - Hook: func(_ context.Context) ([]byte, time.Time) { - ts := time.Now().UTC() - if c.state.ast == nil || c.state.vars == nil { - return []byte("error: failed no configuration or variables received by the coordinator"), ts - } - cfg, _, err := c.compute() - if err != nil { - return 
[]byte(fmt.Sprintf("error: %q", err)), ts - } - o, err := yaml.Marshal(cfg) - if err != nil { - return []byte(fmt.Sprintf("error: %q", err)), ts - } - return o, ts - }, + }, + { + Name: "computed-config", + Filename: "computed-config.yaml", + Description: "current computed configuration of the running Elastic Agent after variable substitution", + ContentType: "application/yaml", + Hook: func(_ context.Context) []byte { + if c.state.ast == nil || c.state.vars == nil { + return []byte("error: failed no configuration or variables received by the coordinator") + } + cfg, _, err := c.compute() + if err != nil { + return []byte(fmt.Sprintf("error: %q", err)) + } + o, err := yaml.Marshal(cfg) + if err != nil { + return []byte(fmt.Sprintf("error: %q", err)) + } + return o }, - diagnostics.Hook{ - Name: "components", - Filename: "components.yaml", - Description: "current expected components model of the running Elastic Agent", - ContentType: "application/yaml", - Hook: func(_ context.Context) ([]byte, time.Time) { - ts := time.Now().UTC() - if c.state.ast == nil || c.state.vars == nil { - return []byte("error: failed no configuration or variables received by the coordinator"), ts - } - _, comps, err := c.compute() - if err != nil { - return []byte(fmt.Sprintf("error: %q", err)), ts - } - o, err := yaml.Marshal(struct { - Components []component.Component `yaml:"components"` - }{ - Components: comps, - }) - if err != nil { - return []byte(fmt.Sprintf("error: %q", err)), ts - } - return o, ts - }, + }, + { + Name: "components", + Filename: "components.yaml", + Description: "current expected components model of the running Elastic Agent", + ContentType: "application/yaml", + Hook: func(_ context.Context) []byte { + if c.state.ast == nil || c.state.vars == nil { + return []byte("error: failed no configuration or variables received by the coordinator") + } + _, comps, err := c.compute() + if err != nil { + return []byte(fmt.Sprintf("error: %q", err)) + } + o, err := yaml.Marshal(struct { + Components []component.Component `yaml:"components"` + }{ + Components: comps, + }) + if err != nil { + return []byte(fmt.Sprintf("error: %q", err)) + } + return o }, - diagnostics.Hook{ - Name: "state", - Filename: "state.yaml", - Description: "current state of running components by the Elastic Agent", - ContentType: "application/yaml", - Hook: func(_ context.Context) ([]byte, time.Time) { - ts := time.Now().UTC() - s := c.State(true) - o, err := yaml.Marshal(s) - if err != nil { - return []byte(fmt.Sprintf("error: %q", err)), ts - } - return o, ts - }, + }, + { + Name: "state", + Filename: "state.yaml", + Description: "current state of running components by the Elastic Agent", + ContentType: "application/yaml", + Hook: func(_ context.Context) []byte { + s := c.State(true) + o, err := yaml.Marshal(s) + if err != nil { + return []byte(fmt.Sprintf("error: %q", err)) + } + return o }, - } - hooks = append(hooks, c.addLogHooks()...) - hooks = append(hooks, c.addServiceLogHooks()...) 
- - return hooks - } -} - -func (c *Coordinator) addLogHooks() []diagnostics.Hook { - hooks := []diagnostics.Hook{{ - Name: "logs directory", - Filename: "logs/", - Description: "The logs directory for the agent.", - ContentType: diagnostics.ContentTypeDirectory, - Hook: func(_ context.Context) ([]byte, time.Time) { return nil, time.Now().UTC() }, - }} - - logPath := filepath.Join(paths.Home(), "logs") + string(filepath.Separator) - err := filepath.WalkDir(logPath, func(path string, d fs.DirEntry, fErr error) error { - if errors.Is(fErr, fs.ErrNotExist) { - return nil - } - if fErr != nil { - return fmt.Errorf("unable to walk log dir: %w", fErr) - } - name := filepath.ToSlash(strings.TrimPrefix(path, logPath)) - if name == "" { - return nil - } - if d.IsDir() { - // TODO check if subdirectories in the logs dir can exist. - hooks = append(hooks, diagnostics.Hook{ - Name: "logs subdirectory", - Filename: "logs/" + name + "/", - Description: "A logs subdirectory", - ContentType: diagnostics.ContentTypeDirectory, - Hook: func(_ context.Context) ([]byte, time.Time) { return nil, time.Now().UTC() }, - }) - } else { - hooks = append(hooks, diagnostics.Hook{ - Name: "log file", - Filename: "logs/" + name, - ContentType: "text/plain", - Hook: func(_ context.Context) ([]byte, time.Time) { - ts := time.Now().UTC() - lf, err := os.Open(path) - if err != nil { - return []byte(fmt.Sprintf("unable to open log file: %v", err)), ts - } - if stat, err := lf.Stat(); err == nil { - ts = stat.ModTime().UTC() - } - defer lf.Close() - p, err := io.ReadAll(lf) - if err != nil { - return []byte(fmt.Sprintf("unable to read log file: %v", err)), ts - } - return p, ts - }, - }) - } - return nil - }) - if err != nil { - c.logger.With("err", err).Error("Unable to gather agent logs") - } - return hooks -} - -func (c *Coordinator) addServiceLogHooks() []diagnostics.Hook { - hooks := []diagnostics.Hook{{ - Name: "services log dir", - Filename: "services/", - ContentType: diagnostics.ContentTypeDirectory, - Hook: func(_ context.Context) ([]byte, time.Time) { return nil, time.Time{} }, - }} - for _, spec := range c.specs.ServiceSpecs() { - if spec.Spec.Service.Log == nil || spec.Spec.Service.Log.Path == "" { - // no log path set in specification - continue - } - - logPath := filepath.Dir(spec.Spec.Service.Log.Path) + string(filepath.Separator) - err := filepath.WalkDir(logPath, func(path string, d fs.DirEntry, fErr error) error { - if errors.Is(fErr, fs.ErrNotExist) { - return nil - } - if fErr != nil { - return fmt.Errorf("unable to walk log directory %q for service input %s: %w", logPath, spec.InputType, fErr) - } - name := filepath.ToSlash(strings.TrimPrefix(path, logPath)) - if name == "" { - return nil - } - if d.IsDir() { - return nil - } - hooks = append(hooks, diagnostics.Hook{ - Name: "service logs", - Filename: "services/" + name, - ContentType: "text/plain", - Hook: func(_ context.Context) ([]byte, time.Time) { - ts := time.Now().UTC() - lf, err := os.Open(path) - if err != nil { - return []byte(fmt.Sprintf("unable to open log file: %v", err)), ts - } - if stat, err := lf.Stat(); err == nil { - ts = stat.ModTime().UTC() - } - defer lf.Close() - p, err := io.ReadAll(lf) - if err != nil { - return []byte(fmt.Sprintf("unable to read log file: %v", err)), ts - } - return p, ts - - }, - }) - return nil - }) - if err != nil { - c.logger.With("err", err).Error("Unable to gather component logs") - } + }, } - return hooks } // runner performs the actual work of running all the managers diff --git 
a/internal/pkg/agent/application/managed_mode.go b/internal/pkg/agent/application/managed_mode.go index 929524de692..eec5084975a 100644 --- a/internal/pkg/agent/application/managed_mode.go +++ b/internal/pkg/agent/application/managed_mode.go @@ -353,6 +353,7 @@ func (m *managedConfigManager) initDispatcher(canceller context.CancelFunc) *han &fleetapi.ActionDiagnostics{}, handlers.NewDiagnostics( m.log, + m.coord, m.cfg.Settings.MonitoringConfig.Diagnostics.Limit, uploader.New(m.agentInfo.AgentID(), m.client, m.cfg.Settings.MonitoringConfig.Diagnostics.Uploader), ), diff --git a/internal/pkg/agent/cmd/run.go b/internal/pkg/agent/cmd/run.go index 5a48e1388ba..a5450e54c0a 100644 --- a/internal/pkg/agent/cmd/run.go +++ b/internal/pkg/agent/cmd/run.go @@ -201,7 +201,9 @@ func run(override cfgOverrider, modifiers ...component.PlatformModifier) error { _ = serverStopFn() }() - control := server.New(l.Named("control"), agentInfo, coord, tracer, cfg.Settings.GRPC, diagnostics.GlobalHooks, coord.DiagnosticHooks()) + diagHooks := diagnostics.GlobalHooks() + diagHooks = append(diagHooks, coord.DiagnosticHooks()...) + control := server.New(l.Named("control"), agentInfo, coord, tracer, diagHooks, cfg.Settings.GRPC) // start the control listener if err := control.Start(); err != nil { diff --git a/internal/pkg/agent/control/v2/control_test.go b/internal/pkg/agent/control/v2/control_test.go index 4910012d467..f4ec7a26452 100644 --- a/internal/pkg/agent/control/v2/control_test.go +++ b/internal/pkg/agent/control/v2/control_test.go @@ -22,7 +22,7 @@ import ( ) func TestServerClient_Version(t *testing.T) { - srv := server.New(newErrorLogger(t), nil, nil, apmtest.DiscardTracer, configuration.DefaultGRPCConfig()) + srv := server.New(newErrorLogger(t), nil, nil, apmtest.DiscardTracer, nil, configuration.DefaultGRPCConfig()) err := srv.Start() require.NoError(t, err) defer srv.Stop() diff --git a/internal/pkg/agent/control/v2/server/server.go b/internal/pkg/agent/control/v2/server/server.go index cd54e6e038f..58e5735a045 100644 --- a/internal/pkg/agent/control/v2/server/server.go +++ b/internal/pkg/agent/control/v2/server/server.go @@ -9,6 +9,7 @@ import ( "encoding/json" "fmt" "net" + "time" "github.com/elastic/elastic-agent/pkg/component/runtime" @@ -34,25 +35,25 @@ import ( // Server is the daemon side of the control protocol. type Server struct { cproto.UnimplementedElasticAgentControlServer - logger *logger.Logger - agentInfo *info.AgentInfo - coord *coordinator.Coordinator - listener net.Listener - server *grpc.Server - tracer *apm.Tracer - grpcConfig *configuration.GRPCConfig - diagHooksFn []func() diagnostics.Hooks + logger *logger.Logger + agentInfo *info.AgentInfo + coord *coordinator.Coordinator + listener net.Listener + server *grpc.Server + tracer *apm.Tracer + diagHooks diagnostics.Hooks + grpcConfig *configuration.GRPCConfig } // New creates a new control protocol server. 
-func New(log *logger.Logger, agentInfo *info.AgentInfo, coord *coordinator.Coordinator, tracer *apm.Tracer, grpcConfig *configuration.GRPCConfig, diagHooksFn ...func() diagnostics.Hooks) *Server { +func New(log *logger.Logger, agentInfo *info.AgentInfo, coord *coordinator.Coordinator, tracer *apm.Tracer, diagHooks diagnostics.Hooks, grpcConfig *configuration.GRPCConfig) *Server { return &Server{ - logger: log, - agentInfo: agentInfo, - coord: coord, - tracer: tracer, - grpcConfig: grpcConfig, - diagHooksFn: diagHooksFn, + logger: log, + agentInfo: agentInfo, + coord: coord, + tracer: tracer, + diagHooks: diagHooks, + grpcConfig: grpcConfig, } } @@ -187,24 +188,19 @@ func (s *Server) Upgrade(ctx context.Context, request *cproto.UpgradeRequest) (* // DiagnosticAgent returns diagnostic information for this running Elastic Agent. func (s *Server) DiagnosticAgent(ctx context.Context, _ *cproto.DiagnosticAgentRequest) (*cproto.DiagnosticAgentResponse, error) { - diagHooks := make([]diagnostics.Hook, 0) - for _, fn := range s.diagHooksFn { - hooks := fn() - diagHooks = append(diagHooks, hooks...) - } - res := make([]*cproto.DiagnosticFileResult, 0, len(diagHooks)) - for _, h := range diagHooks { + res := make([]*cproto.DiagnosticFileResult, 0, len(s.diagHooks)) + for _, h := range s.diagHooks { if ctx.Err() != nil { return nil, ctx.Err() } - r, ts := h.Hook(ctx) + r := h.Hook(ctx) res = append(res, &cproto.DiagnosticFileResult{ Name: h.Name, Filename: h.Filename, Description: h.Description, ContentType: h.ContentType, Content: r, - Generated: timestamppb.New(ts), + Generated: timestamppb.New(time.Now().UTC()), }) } if ctx.Err() != nil { diff --git a/internal/pkg/diagnostics/diagnostics.go b/internal/pkg/diagnostics/diagnostics.go index d5262841bcc..10f03332e8f 100644 --- a/internal/pkg/diagnostics/diagnostics.go +++ b/internal/pkg/diagnostics/diagnostics.go @@ -8,8 +8,12 @@ import ( "archive/zip" "bytes" "context" + "errors" "fmt" "io" + "io/fs" + "os" + "path/filepath" "reflect" "runtime/pprof" "strings" @@ -17,8 +21,10 @@ import ( "gopkg.in/yaml.v2" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/internal/pkg/agent/control/v2/client" "github.com/elastic/elastic-agent/internal/pkg/release" + "github.com/elastic/elastic-agent/pkg/component" ) const ( @@ -34,7 +40,7 @@ type Hook struct { Filename string Description string ContentType string - Hook func(ctx context.Context) ([]byte, time.Time) + Hook func(ctx context.Context) []byte } // Hooks is a set of diagnostic hooks. 
@@ -48,13 +54,13 @@ func GlobalHooks() Hooks { Filename: "version.txt", Description: "version information", ContentType: "application/yaml", - Hook: func(_ context.Context) ([]byte, time.Time) { + Hook: func(_ context.Context) []byte { v := release.Info() o, err := yaml.Marshal(v) if err != nil { - return []byte(fmt.Sprintf("error: %q", err)), time.Now().UTC() + return []byte(fmt.Sprintf("error: %q", err)) } - return o, time.Now().UTC() + return o }, }, { @@ -102,52 +108,38 @@ func GlobalHooks() Hooks { } } -func pprofDiag(name string) func(context.Context) ([]byte, time.Time) { - return func(_ context.Context) ([]byte, time.Time) { +func pprofDiag(name string) func(context.Context) []byte { + return func(_ context.Context) []byte { var w bytes.Buffer err := pprof.Lookup(name).WriteTo(&w, 1) if err != nil { // error is returned as the content - return []byte(fmt.Sprintf("failed to write pprof to bytes buffer: %s", err)), time.Now().UTC() + return []byte(fmt.Sprintf("failed to write pprof to bytes buffer: %s", err)) } - return w.Bytes(), time.Now().UTC() + return w.Bytes() } } -// ZipArchive creates a zipped diagnostics bundle using the passed writer with the passed diagnostics. +// ZipArchive creates a zipped diagnostics bundle using the passed writer with the passed diagnostics and local logs. // If any error is encountered when writing the contents of the archive it is returned. func ZipArchive(errOut, w io.Writer, agentDiag []client.DiagnosticFileResult, unitDiags []client.DiagnosticUnitResult) error { ts := time.Now().UTC() zw := zip.NewWriter(w) defer zw.Close() - // Create directories in the zip archive before writing any files - for _, ad := range agentDiag { - if ad.ContentType == ContentTypeDirectory { - _, err := zw.CreateHeader(&zip.FileHeader{ - Name: ad.Filename, - Method: zip.Deflate, - Modified: ts, - }) - if err != nil { - return err - } - } - } + // Write agent diagnostics content for _, ad := range agentDiag { - if ad.ContentType != ContentTypeDirectory { - zf, err := zw.CreateHeader(&zip.FileHeader{ - Name: ad.Filename, - Method: zip.Deflate, - Modified: ad.Generated, - }) - if err != nil { - return err - } - err = writeRedacted(errOut, zf, ad.Filename, ad) - if err != nil { - return err - } + zf, err := zw.CreateHeader(&zip.FileHeader{ + Name: ad.Filename, + Method: zip.Deflate, + Modified: ad.Generated, + }) + if err != nil { + return err + } + err = writeRedacted(errOut, zf, ad.Filename, ad) + if err != nil { + return err } } @@ -219,7 +211,9 @@ func ZipArchive(errOut, w io.Writer, agentDiag []client.DiagnosticFileResult, un } } } - return nil + + // Gather Logs: + return zipLogs(zw, ts) } func writeRedacted(errOut, w io.Writer, fullFilePath string, fr client.DiagnosticFileResult) error { @@ -286,3 +280,120 @@ func redactKey(k string) bool { strings.Contains(k, "token") || strings.Contains(k, "key") } + +// zipLogs walks paths.Logs() and copies the file structure into zw in "logs/" +func zipLogs(zw *zip.Writer, ts time.Time) error { + _, err := zw.CreateHeader(&zip.FileHeader{ + Name: "logs", + Method: zip.Deflate, + Modified: ts, + }) + if err != nil { + return err + } + + if err := collectServiceComponentsLogs(zw); err != nil { + return fmt.Errorf("failed to collect endpoint-security logs: %w", err) + } + + // using Data() + "/logs", for some reason default paths/Logs() is the home dir... 
+ logPath := filepath.Join(paths.Home(), "logs") + string(filepath.Separator) + return filepath.WalkDir(logPath, func(path string, d fs.DirEntry, fErr error) error { + if errors.Is(fErr, fs.ErrNotExist) { + return nil + } + if fErr != nil { + return fmt.Errorf("unable to walk log dir: %w", fErr) + } + + // name is the relative dir/file name replacing any filepath seperators with / + // this will clean log names on windows machines and will nop on *nix + name := filepath.ToSlash(strings.TrimPrefix(path, logPath)) + if name == "" { + return nil + } + + if d.IsDir() { + _, err := zw.CreateHeader(&zip.FileHeader{ + Name: "logs" + name + "/", + Method: zip.Deflate, + Modified: ts, + }) + if err != nil { + return fmt.Errorf("unable to create log directory in archive: %w", err) + } + return nil + } + + return saveLogs(name, path, zw) + }) +} + +func collectServiceComponentsLogs(zw *zip.Writer) error { + platform, err := component.LoadPlatformDetail() + if err != nil { + return fmt.Errorf("failed to gather system information: %w", err) + } + specs, err := component.LoadRuntimeSpecs(paths.Components(), platform) + if err != nil { + return fmt.Errorf("failed to detect inputs and outputs: %w", err) + } + for _, spec := range specs.ServiceSpecs() { + if spec.Spec.Service.Log == nil || spec.Spec.Service.Log.Path == "" { + // no log path set in specification + continue + } + + logPath := filepath.Dir(spec.Spec.Service.Log.Path) + string(filepath.Separator) + err = filepath.WalkDir(logPath, func(path string, d fs.DirEntry, fErr error) error { + if fErr != nil { + if errors.Is(fErr, fs.ErrNotExist) { + return nil + } + + return fmt.Errorf("unable to walk log directory %q for service input %s: %w", logPath, spec.InputType, fErr) + } + + name := filepath.ToSlash(strings.TrimPrefix(path, logPath)) + if name == "" { + return nil + } + + if d.IsDir() { + return nil + } + + return saveLogs("services/"+name, path, zw) + }) + if err != nil { + return err + } + } + return nil +} + +func saveLogs(name string, logPath string, zw *zip.Writer) error { + ts := time.Now().UTC() + lf, err := os.Open(logPath) + if err != nil { + return fmt.Errorf("unable to open log file: %w", err) + } + defer lf.Close() + if li, err := lf.Stat(); err == nil { + ts = li.ModTime() + } + zf, err := zw.CreateHeader(&zip.FileHeader{ + Name: "logs/" + name, + Method: zip.Deflate, + Modified: ts, + }) + if err != nil { + return err + } + _, err = io.Copy(zf, lf) + if err != nil { + return err + } + + return nil +} From 45467515c78cd4bb10647c96a8d8ed8b5c0740c3 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Fri, 27 Jan 2023 17:31:00 -0800 Subject: [PATCH 24/32] Cleanup and yaml redaction fix --- internal/pkg/agent/cmd/run.go | 1 - internal/pkg/agent/control/v2/server/server.go | 1 + internal/pkg/basecmd/version/cmd_test.go | 4 ++-- internal/pkg/diagnostics/diagnostics.go | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/internal/pkg/agent/cmd/run.go b/internal/pkg/agent/cmd/run.go index b52b803de97..003c137a132 100644 --- a/internal/pkg/agent/cmd/run.go +++ b/internal/pkg/agent/cmd/run.go @@ -208,7 +208,6 @@ func run(override cfgOverrider, modifiers ...component.PlatformModifier) error { diagHooks := diagnostics.GlobalHooks() diagHooks = append(diagHooks, coord.DiagnosticHooks()...) 
control := server.New(l.Named("control"), agentInfo, coord, tracer, diagHooks, cfg.Settings.GRPC) - // start the control listener if err := control.Start(); err != nil { return err diff --git a/internal/pkg/agent/control/v2/server/server.go b/internal/pkg/agent/control/v2/server/server.go index 58e5735a045..057761f686e 100644 --- a/internal/pkg/agent/control/v2/server/server.go +++ b/internal/pkg/agent/control/v2/server/server.go @@ -35,6 +35,7 @@ import ( // Server is the daemon side of the control protocol. type Server struct { cproto.UnimplementedElasticAgentControlServer + logger *logger.Logger agentInfo *info.AgentInfo coord *coordinator.Coordinator diff --git a/internal/pkg/basecmd/version/cmd_test.go b/internal/pkg/basecmd/version/cmd_test.go index 9c95119d8e6..60d91cf629d 100644 --- a/internal/pkg/basecmd/version/cmd_test.go +++ b/internal/pkg/basecmd/version/cmd_test.go @@ -58,7 +58,7 @@ func TestCmdBinaryOnlyYAML(t *testing.T) { } func TestCmdDaemon(t *testing.T) { - srv := server.New(newErrorLogger(t), nil, nil, apmtest.DiscardTracer, configuration.DefaultGRPCConfig()) + srv := server.New(newErrorLogger(t), nil, nil, apmtest.DiscardTracer, nil, configuration.DefaultGRPCConfig()) require.NoError(t, srv.Start()) defer srv.Stop() @@ -74,7 +74,7 @@ func TestCmdDaemon(t *testing.T) { } func TestCmdDaemonYAML(t *testing.T) { - srv := server.New(newErrorLogger(t), nil, nil, apmtest.DiscardTracer, configuration.DefaultGRPCConfig()) + srv := server.New(newErrorLogger(t), nil, nil, apmtest.DiscardTracer, nil, configuration.DefaultGRPCConfig()) require.NoError(t, srv.Start()) defer srv.Stop() diff --git a/internal/pkg/diagnostics/diagnostics.go b/internal/pkg/diagnostics/diagnostics.go index 0f7880b1864..f5fa2853c20 100644 --- a/internal/pkg/diagnostics/diagnostics.go +++ b/internal/pkg/diagnostics/diagnostics.go @@ -221,7 +221,7 @@ func writeRedacted(errOut, w io.Writer, fullFilePath string, fr client.Diagnosti // Should we support json too? if fr.ContentType == "application/yaml" { - unmarshalled := map[string]interface{}{} + unmarshalled := map[interface{}]interface{}{} err := yaml.Unmarshal(fr.Content, &unmarshalled) if err != nil { // Best effort, output a warning but still include the file From 37767735df312b5a143e1794e8f139ffd563d1e9 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Fri, 27 Jan 2023 18:48:32 -0800 Subject: [PATCH 25/32] handler and redact fixes --- .../handlers/handler_action_diagnostics.go | 2 +- .../application/coordinator/coordinator.go | 2 +- internal/pkg/diagnostics/diagnostics.go | 25 ++++++------------- 3 files changed, 9 insertions(+), 20 deletions(-) diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go index 9767870003d..fad8e52c954 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -108,7 +108,7 @@ func (h *Diagnostics) runHooks(ctx context.Context) ([]client.DiagnosticFileResu hooks := append(h.coord.DiagnosticHooks(), diagnostics.GlobalHooks()...) 
diags := make([]client.DiagnosticFileResult, 0, len(hooks)) for _, hook := range hooks { - if ctx.Err != nil { + if ctx.Err() != nil { return diags, ctx.Err() } diags = append(diags, client.DiagnosticFileResult{ diff --git a/internal/pkg/agent/application/coordinator/coordinator.go b/internal/pkg/agent/application/coordinator/coordinator.go index aeb54067887..e68ecccc58c 100644 --- a/internal/pkg/agent/application/coordinator/coordinator.go +++ b/internal/pkg/agent/application/coordinator/coordinator.go @@ -293,7 +293,7 @@ func (c *Coordinator) Upgrade(ctx context.Context, version string, sourceURI str return nil } -// AckUpgrade performs acknowledgement for upgrade. +// AckUpgrade is the method used on startup to ack a previously successful upgrade action. func (c *Coordinator) AckUpgrade(ctx context.Context, acker acker.Acker) error { return c.upgradeMgr.Ack(ctx, acker) } diff --git a/internal/pkg/diagnostics/diagnostics.go b/internal/pkg/diagnostics/diagnostics.go index f5fa2853c20..01103f9a1fd 100644 --- a/internal/pkg/diagnostics/diagnostics.go +++ b/internal/pkg/diagnostics/diagnostics.go @@ -241,32 +241,21 @@ func writeRedacted(errOut, w io.Writer, fullFilePath string, fr client.Diagnosti return err } -func redactMap(m map[string]interface{}) map[string]interface{} { +func redactMap(m map[interface{}]interface{}) map[interface{}]interface{} { for k, v := range m { if v != nil && reflect.TypeOf(v).Kind() == reflect.Map { - v = redactMap(toMapStr(v)) + v = redactMap(v.(map[interface{}]interface{})) } - if redactKey(k) { - v = REDACTED + if s, ok := k.(string); ok { + if redactKey(s) { + v = REDACTED + } + m[k] = v } - m[k] = v } return m } -func toMapStr(v interface{}) map[string]interface{} { - mm := map[string]interface{}{} - m, ok := v.(map[interface{}]interface{}) - if !ok { - return mm - } - - for k, v := range m { - mm[k.(string)] = v - } - return mm -} - func redactKey(k string) bool { // "routekey" shouldn't be redacted. // Add any other exceptions here. From 3f868d245a2c4e8d46bfabf0bb26b77863126070 Mon Sep 17 00:00:00 2001 From: Julia Bardi Date: Mon, 30 Jan 2023 11:07:01 +0100 Subject: [PATCH 26/32] fixed storing action_id correctly in files index --- .../actions/handlers/handler_action_diagnostics.go | 4 ++-- internal/pkg/fleetapi/uploader/uploader.go | 6 +++--- internal/pkg/fleetapi/uploader/uploader_test.go | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go index fad8e52c954..87814c11393 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -30,7 +30,7 @@ var ErrRateLimit = fmt.Errorf("rate limit exceeded") // Uploader is the interface used to upload a diagnostics bundle to fleet-server. type Uploader interface { - UploadDiagnostics(context.Context, string, *bytes.Buffer) (string, error) + UploadDiagnostics(context.Context, string, string, *bytes.Buffer) (string, error) } // Diagnostics is the handler to process Diagnostics actions. 
@@ -94,7 +94,7 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A } h.log.Debug("Sending diagnostics archive.") - uploadID, err := h.uploader.UploadDiagnostics(ctx, ts.Format("2006-01-02T15-04-05Z07-00"), &b) // RFC3339 format that uses - instead of : so it works on Windows + uploadID, err := h.uploader.UploadDiagnostics(ctx, action.ActionID, ts.Format("2006-01-02T15-04-05Z07-00"), &b) // RFC3339 format that uses - instead of : so it works on Windows action.Err = err action.UploadID = uploadID if err != nil { diff --git a/internal/pkg/fleetapi/uploader/uploader.go b/internal/pkg/fleetapi/uploader/uploader.go index bfa0f17a38f..e102ecfb301 100644 --- a/internal/pkg/fleetapi/uploader/uploader.go +++ b/internal/pkg/fleetapi/uploader/uploader.go @@ -183,15 +183,15 @@ func (c *Client) Finish(ctx context.Context, id string, r *FinishRequest) error // UploadDiagnostics is a wrapper to upload a diagnostics request identified by the passed action id contained in the buffer to fleet-server. // // A buffer is used instead of a reader as we need to know the file length before uploading. -func (c *Client) UploadDiagnostics(ctx context.Context, id string, b *bytes.Buffer) (string, error) { +func (c *Client) UploadDiagnostics(ctx context.Context, actionId string, timestamp string, b *bytes.Buffer) (string, error) { size := b.Len() upReq := NewUploadRequest{ - ActionID: id, + ActionID: actionId, AgentID: c.agentID, Source: "agent", File: FileData{ Size: int64(size), - Name: fmt.Sprintf("elastic-agent-diagnostics-%s.zip", id), + Name: fmt.Sprintf("elastic-agent-diagnostics-%s.zip", timestamp), Extension: "zip", Mime: "application/zip", }, diff --git a/internal/pkg/fleetapi/uploader/uploader_test.go b/internal/pkg/fleetapi/uploader/uploader_test.go index a0314f433a9..515ddaf108f 100644 --- a/internal/pkg/fleetapi/uploader/uploader_test.go +++ b/internal/pkg/fleetapi/uploader/uploader_test.go @@ -217,7 +217,7 @@ func Test_Client_UploadDiagnostics(t *testing.T) { c: sender, agentID: "test-agent", } - id, err := c.UploadDiagnostics(context.Background(), "test-id", bytes.NewBufferString("abcde")) + id, err := c.UploadDiagnostics(context.Background(), "test-id", "2023-01-30T09-40-02Z-00", bytes.NewBufferString("abcde")) require.NoError(t, err) assert.Equal(t, "test-upload", id) assert.Equal(t, "ab", string(chunk0)) From 3adfdaab3b4b7fd31ed8e4d183217e200f27e669 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Mon, 30 Jan 2023 07:09:13 -0800 Subject: [PATCH 27/32] commit the ack --- .../actions/handlers/handler_action_diagnostics.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go index 87814c11393..cb4e535f1d3 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -60,7 +60,10 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A return fmt.Errorf("invalid type, expected ActionDiagnostics and received %T", a) } ts := time.Now().UTC() - defer ack.Ack(ctx, action) //nolint:errcheck // no path for a failed ack + defer func() { + ack.Ack(ctx, action) //nolint:errcheck // no path for a failed ack + ack.Commit(ctx) //nolint:errcheck //no path for failing a commit + }() if !h.limiter.Allow() { action.Err = ErrRateLimit From 
8d96c5371ed6b59bb63a48aa98ce8ba014cbe80d Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Mon, 30 Jan 2023 17:31:56 -0800 Subject: [PATCH 28/32] Diagnostics handler will use temp file Diagnostics handler will write the bundle to a temp file on disk before reading chunks out to requests. This is done in an attempt to avoid loading a large file in memory. If the file creation on disk fails an in-memory buffer is used instead. --- .../handlers/handler_action_diagnostics.go | 81 +++++++++++++++---- internal/pkg/fleetapi/uploader/uploader.go | 12 +-- .../pkg/fleetapi/uploader/uploader_test.go | 2 +- 3 files changed, 71 insertions(+), 24 deletions(-) diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go index cb4e535f1d3..e1a5d584a06 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -8,6 +8,8 @@ import ( "bytes" "context" "fmt" + "io" + "os" "time" "github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator" @@ -30,7 +32,7 @@ var ErrRateLimit = fmt.Errorf("rate limit exceeded") // Uploader is the interface used to upload a diagnostics bundle to fleet-server. type Uploader interface { - UploadDiagnostics(context.Context, string, string, *bytes.Buffer) (string, error) + UploadDiagnostics(context.Context, string, string, int64, io.Reader) (string, error) } // Diagnostics is the handler to process Diagnostics actions. @@ -79,25 +81,36 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A h.log.Debug("Gathering unit diagnostics.") uDiag := h.diagUnits(ctx) - var b bytes.Buffer - // create a buffer that any redaction error messages are written into as warnings. - // if the buffer is not empty after the bundle is assembled then the message is written to the log - // zapio.Writer would be a better way to pass a writer to ZipArchive, but logp embeds the zap.Logger so we are unable to access it here. - var wBuf bytes.Buffer - defer func() { - if str := wBuf.String(); str != "" { - h.log.Warn(str) - } - }() - h.log.Debug("Assembling diagnostics archive.") - err = diagnostics.ZipArchive(&wBuf, &b, aDiag, uDiag) + var r io.Reader + // attempt to create the a temporary diagnostics file on disk in order to avoid loading a + // potentially large file in memory. + // if on-disk creation fails an in-memory buffer is used. 
+ f, s, err := h.diagFile(aDiag, uDiag) if err != nil { - action.Err = err - return fmt.Errorf("error creating diagnostics bundle: %w", err) + var b bytes.Buffer + h.log.Warnw("Diagnostics action unable to use tempoary file, using buffer instead.", "err", err) + var wBuf bytes.Buffer + defer func() { + if str := wBuf.String(); str != "" { + h.log.Warn(str) + } + }() + err := diagnostics.ZipArchive(&wBuf, &b, aDiag, uDiag) + if err != nil { + action.Err = err + return fmt.Errorf("error creating diagnostics bundle: %w", err) + } + r = &b + s = int64(b.Len()) + } else { + defer func() { + f.Close() + os.Remove(f.Name()) + }() + r = f } - h.log.Debug("Sending diagnostics archive.") - uploadID, err := h.uploader.UploadDiagnostics(ctx, action.ActionID, ts.Format("2006-01-02T15-04-05Z07-00"), &b) // RFC3339 format that uses - instead of : so it works on Windows + uploadID, err := h.uploader.UploadDiagnostics(ctx, action.ActionID, ts.Format("2006-01-02T15-04-05Z07-00"), s, r) // RFC3339 format that uses - instead of : so it works on Windows action.Err = err action.UploadID = uploadID if err != nil { @@ -155,3 +168,37 @@ func (h *Diagnostics) diagUnits(ctx context.Context) []client.DiagnosticUnitResu } return uDiag } + +// diagFile will write the diagnostics to a temporary file and return the file ready to be read +func (h *Diagnostics) diagFile(aDiag []client.DiagnosticFileResult, uDiag []client.DiagnosticUnitResult) (*os.File, int64, error) { + f, err := os.CreateTemp("", "elastic-agent-diagnostics") + if err != nil { + return nil, 0, err + } + + name := f.Name() + var wBuf bytes.Buffer + defer func() { + if str := wBuf.String(); str != "" { + h.log.Warn(str) + } + }() + if err := diagnostics.ZipArchive(&wBuf, f, aDiag, uDiag); err != nil { + os.Remove(name) + return nil, 0, err + } + f.Sync() + + _, err = f.Seek(0, 0) + if err != nil { + os.Remove(name) + return nil, 0, err + } + + fi, err := f.Stat() + if err != nil { + os.Remove(name) + return nil, 0, err + } + return f, fi.Size(), nil +} diff --git a/internal/pkg/fleetapi/uploader/uploader.go b/internal/pkg/fleetapi/uploader/uploader.go index e102ecfb301..dbe5c759c1a 100644 --- a/internal/pkg/fleetapi/uploader/uploader.go +++ b/internal/pkg/fleetapi/uploader/uploader.go @@ -123,6 +123,9 @@ func New(id string, c client.Sender, cfg config.Uploader) *Client { } // New sends a new file upload request to the fleet-server. +// +// Request may return a 400 if the specified file.size is too large. +// Default max file size is 100mb func (c *Client) New(ctx context.Context, r *NewUploadRequest) (*NewUploadResponse, error) { b, err := json.Marshal(r) if err != nil { @@ -181,16 +184,13 @@ func (c *Client) Finish(ctx context.Context, id string, r *FinishRequest) error } // UploadDiagnostics is a wrapper to upload a diagnostics request identified by the passed action id contained in the buffer to fleet-server. -// -// A buffer is used instead of a reader as we need to know the file length before uploading. 
-func (c *Client) UploadDiagnostics(ctx context.Context, actionId string, timestamp string, b *bytes.Buffer) (string, error) { - size := b.Len() +func (c *Client) UploadDiagnostics(ctx context.Context, actionId string, timestamp string, size int64, r io.Reader) (string, error) { upReq := NewUploadRequest{ ActionID: actionId, AgentID: c.agentID, Source: "agent", File: FileData{ - Size: int64(size), + Size: size, Name: fmt.Sprintf("elastic-agent-diagnostics-%s.zip", timestamp), Extension: "zip", Mime: "application/zip", @@ -207,7 +207,7 @@ func (c *Client) UploadDiagnostics(ctx context.Context, actionId string, timesta transitHash := sha256.New() for chunk := 0; chunk < totalChunks; chunk++ { var data bytes.Buffer - io.CopyN(&data, b, chunkSize) //nolint:errcheck // copy chunkSize bytes to a buffer so we can get the checksum + io.CopyN(&data, r, chunkSize) //nolint:errcheck // copy chunkSize bytes to a buffer so we can get the checksum hash := sha256.Sum256(data.Bytes()) err := c.Chunk(ctx, uploadID, chunk, hash[:], &data) // hash[:] uses the array as a slice if err != nil { diff --git a/internal/pkg/fleetapi/uploader/uploader_test.go b/internal/pkg/fleetapi/uploader/uploader_test.go index 515ddaf108f..903019bfbc8 100644 --- a/internal/pkg/fleetapi/uploader/uploader_test.go +++ b/internal/pkg/fleetapi/uploader/uploader_test.go @@ -217,7 +217,7 @@ func Test_Client_UploadDiagnostics(t *testing.T) { c: sender, agentID: "test-agent", } - id, err := c.UploadDiagnostics(context.Background(), "test-id", "2023-01-30T09-40-02Z-00", bytes.NewBufferString("abcde")) + id, err := c.UploadDiagnostics(context.Background(), "test-id", "2023-01-30T09-40-02Z-00", 5, bytes.NewBufferString("abcde")) require.NoError(t, err) assert.Equal(t, "test-upload", id) assert.Equal(t, "ab", string(chunk0)) From cc1cd33d7826e402f704c57cb66a2301fa94c9bb Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Tue, 31 Jan 2023 08:40:09 -0800 Subject: [PATCH 29/32] Change to async handler, add panic recover --- .../handlers/handler_action_diagnostics.go | 29 +++++++++++++++---- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go index e1a5d584a06..8ccca8380cd 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -54,14 +54,30 @@ func NewDiagnostics(log *logger.Logger, coord *coordinator.Coordinator, cfg conf } } -// Handle processes the passed Diagnostics action. +// Handle processes the passed Diagnostics action asynchronously. +// +// The handler has a rate limiter to limit the number of diagnostics actions that are run at once. func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.Acker) error { h.log.Debugf("handlerDiagnostics: action '%+v' received", a) action, ok := a.(*fleetapi.ActionDiagnostics) if !ok { return fmt.Errorf("invalid type, expected ActionDiagnostics and received %T", a) } + go h.collectDiag(ctx, action, ack) + return nil +} + +// collectDiag will attempt to assemble a diagnostics bundle and upload it with the file upload APIs on fleet-server. +// +// The bundle is assembled on disk, however if it encounters any errors an in-memory-buffer is used. 
+func (h *Diagnostics) collectDiag(ctx context.Context, action *fleeetapi.ActioNDiagnostics, ack acker.Acker) { ts := time.Now().UTC() + defer func() { + if err := recover(); err != nil { + action.Err = err + h.log.Errorw("diagnostics handler paniced", "err", err) + } + }() defer func() { ack.Ack(ctx, action) //nolint:errcheck // no path for a failed ack ack.Commit(ctx) //nolint:errcheck //no path for failing a commit @@ -69,14 +85,14 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A if !h.limiter.Allow() { action.Err = ErrRateLimit - return ErrRateLimit + return } h.log.Debug("Gathering agent diagnostics.") aDiag, err := h.runHooks(ctx) if err != nil { action.Err = err - return fmt.Errorf("unable to gather agent diagnostics: %w", err) + return } h.log.Debug("Gathering unit diagnostics.") uDiag := h.diagUnits(ctx) @@ -114,12 +130,12 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A action.Err = err action.UploadID = uploadID if err != nil { - return fmt.Errorf("unable to upload diagnostics: %w", err) + return } h.log.Debugf("Diagnostics action '%+v' complete.", a) - return nil } +// runHooks runs the agent diagnostics hooks. func (h *Diagnostics) runHooks(ctx context.Context) ([]client.DiagnosticFileResult, error) { hooks := append(h.coord.DiagnosticHooks(), diagnostics.GlobalHooks()...) diags := make([]client.DiagnosticFileResult, 0, len(hooks)) @@ -139,6 +155,7 @@ func (h *Diagnostics) runHooks(ctx context.Context) ([]client.DiagnosticFileResu return diags, nil } +// diagUnits gathers diagnostics from units. func (h *Diagnostics) diagUnits(ctx context.Context) []client.DiagnosticUnitResult { uDiag := make([]client.DiagnosticUnitResult, 0) rr := h.coord.PerformDiagnostics(ctx) @@ -187,7 +204,7 @@ func (h *Diagnostics) diagFile(aDiag []client.DiagnosticFileResult, uDiag []clie os.Remove(name) return nil, 0, err } - f.Sync() + _ = f.Sync() _, err = f.Seek(0, 0) if err != nil { From f97d6d7f2bfb39a9356c984782e0eb7637e4b403 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Tue, 31 Jan 2023 09:06:42 -0800 Subject: [PATCH 30/32] fix linter --- .../actions/handlers/handler_action_diagnostics.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go index 8ccca8380cd..415f088a635 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -70,7 +70,7 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A // collectDiag will attempt to assemble a diagnostics bundle and upload it with the file upload APIs on fleet-server. // // The bundle is assembled on disk, however if it encounters any errors an in-memory-buffer is used. 
-func (h *Diagnostics) collectDiag(ctx context.Context, action *fleeetapi.ActioNDiagnostics, ack acker.Acker) { +func (h *Diagnostics) collectDiag(ctx context.Context, action *fleetapi.ActioNDiagnostics, ack acker.Acker) { ts := time.Now().UTC() defer func() { if err := recover(); err != nil { @@ -114,7 +114,7 @@ func (h *Diagnostics) collectDiag(ctx context.Context, action *fleeetapi.ActioND err := diagnostics.ZipArchive(&wBuf, &b, aDiag, uDiag) if err != nil { action.Err = err - return fmt.Errorf("error creating diagnostics bundle: %w", err) + return } r = &b s = int64(b.Len()) @@ -132,7 +132,7 @@ func (h *Diagnostics) collectDiag(ctx context.Context, action *fleeetapi.ActioND if err != nil { return } - h.log.Debugf("Diagnostics action '%+v' complete.", a) + h.log.Debugf("Diagnostics action '%+v' complete.", action) } // runHooks runs the agent diagnostics hooks. From cfa3e965bfb685ed8da7b4c75e9b03eed53462e3 Mon Sep 17 00:00:00 2001 From: Michel Laterman <82832767+michel-laterman@users.noreply.github.com> Date: Tue, 31 Jan 2023 09:29:13 -0800 Subject: [PATCH 31/32] Update internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go Co-authored-by: Blake Rouse --- .../application/actions/handlers/handler_action_diagnostics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go index 415f088a635..07178ec87e8 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -70,7 +70,7 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A // collectDiag will attempt to assemble a diagnostics bundle and upload it with the file upload APIs on fleet-server. // // The bundle is assembled on disk, however if it encounters any errors an in-memory-buffer is used. -func (h *Diagnostics) collectDiag(ctx context.Context, action *fleetapi.ActioNDiagnostics, ack acker.Acker) { +func (h *Diagnostics) collectDiag(ctx context.Context, action *fleetapi.ActionDiagnostics, ack acker.Acker) { ts := time.Now().UTC() defer func() { if err := recover(); err != nil { From 09293ffd022d2fada7104414e0a0b6da5cb3c335 Mon Sep 17 00:00:00 2001 From: michel-laterman Date: Tue, 31 Jan 2023 09:35:29 -0800 Subject: [PATCH 32/32] build error out of recovered item --- .../application/actions/handlers/handler_action_diagnostics.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go index 07178ec87e8..819c49aba1a 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_diagnostics.go @@ -73,7 +73,8 @@ func (h *Diagnostics) Handle(ctx context.Context, a fleetapi.Action, ack acker.A func (h *Diagnostics) collectDiag(ctx context.Context, action *fleetapi.ActionDiagnostics, ack acker.Acker) { ts := time.Now().UTC() defer func() { - if err := recover(); err != nil { + if r := recover(); r != nil { + err := fmt.Errorf("panic detected: %v", r) action.Err = err h.log.Errorw("diagnostics handler paniced", "err", err) }
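For reference, the pattern introduced in PATCH 28 can be summarized with a standalone sketch. This is an illustrative reconstruction, not code from the patches; the helper name bundleReader and its writeFn callback are assumptions made for the example. It shows why the handler ends up with both a reader and an explicit size: the archive is written to a temp file so the size can come from Stat and the file can be streamed, and only if temp-file creation fails is the whole bundle buffered in memory.

package main

import (
	"archive/zip"
	"bytes"
	"fmt"
	"io"
	"os"
)

// bundleReader writes an archive via writeFn and returns a reader for it along
// with its size. It prefers a temporary file on disk so a large bundle is not
// held in memory; if the temp file cannot be created it falls back to a
// bytes.Buffer. cleanup must be called once the reader has been consumed.
func bundleReader(writeFn func(io.Writer) error) (r io.Reader, size int64, cleanup func(), err error) {
	noop := func() {}
	f, ferr := os.CreateTemp("", "diag-bundle")
	if ferr != nil {
		// fallback: assemble the bundle in memory
		var b bytes.Buffer
		if err = writeFn(&b); err != nil {
			return nil, 0, noop, err
		}
		return &b, int64(b.Len()), noop, nil
	}
	cleanup = func() { f.Close(); os.Remove(f.Name()) }
	if err = writeFn(f); err != nil {
		cleanup()
		return nil, 0, noop, err
	}
	// rewind so the upload starts reading from the beginning of the file
	if _, err = f.Seek(0, io.SeekStart); err != nil {
		cleanup()
		return nil, 0, noop, err
	}
	fi, serr := f.Stat()
	if serr != nil {
		cleanup()
		return nil, 0, noop, serr
	}
	return f, fi.Size(), cleanup, nil
}

func main() {
	// write a tiny zip archive as a stand-in for the diagnostics bundle
	r, size, cleanup, err := bundleReader(func(w io.Writer) error {
		zw := zip.NewWriter(w)
		fw, err := zw.Create("hello.txt")
		if err != nil {
			return err
		}
		if _, err := fw.Write([]byte("hello")); err != nil {
			return err
		}
		return zw.Close()
	})
	if err != nil {
		panic(err)
	}
	defer cleanup()
	fmt.Printf("bundle is %d bytes, reader type %T\n", size, r)
}

The explicit size is needed because the initial fleet-server upload request declares file.size before any chunks are sent, which is why UploadDiagnostics now takes (size int64, r io.Reader) instead of a *bytes.Buffer.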
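On the uploader side, that reader is consumed in fixed-size chunks, each chunk is hashed, and a transit hash is kept alongside (assumed here to cover the full payload). The following standalone sketch mirrors that loop; chunkSize and the sendChunk callback are illustrative assumptions, not the client's real values or API.

package main

import (
	"bytes"
	"crypto/sha256"
	"fmt"
	"io"
	"strings"
)

// chunkSize is deliberately tiny for the example; a real client uses a much larger value.
const chunkSize = 4

// chunkAndHash reads size bytes from r in chunkSize pieces, hashes each piece,
// feeds a running "transit" hash over all bytes, and hands each piece to
// sendChunk. It returns the transit hash once all chunks have been sent.
func chunkAndHash(r io.Reader, size int64, sendChunk func(i int, sum []byte, data *bytes.Buffer) error) ([]byte, error) {
	totalChunks := int((size + chunkSize - 1) / chunkSize) // ceiling division
	transit := sha256.New()
	for i := 0; i < totalChunks; i++ {
		var data bytes.Buffer
		// the final chunk may be short; io.EOF is expected there
		if _, err := io.CopyN(&data, r, chunkSize); err != nil && err != io.EOF {
			return nil, err
		}
		sum := sha256.Sum256(data.Bytes())
		transit.Write(data.Bytes()) // hash.Hash.Write never returns an error
		if err := sendChunk(i, sum[:], &data); err != nil {
			return nil, err
		}
	}
	return transit.Sum(nil), nil
}

func main() {
	payload := "0123456789"
	th, err := chunkAndHash(strings.NewReader(payload), int64(len(payload)), func(i int, sum []byte, data *bytes.Buffer) error {
		fmt.Printf("chunk %d: %d bytes sha256=%x...\n", i, data.Len(), sum[:4])
		return nil
	})
	if err != nil {
		panic(err)
	}
	fmt.Printf("transit hash: %x...\n", th[:4])
}

Buffering one chunk at a time keeps memory bounded regardless of bundle size, which is the same motivation as the temp-file change in the handler.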