From cb9bfa7437a9c38687b4bb19406b3139cda24c6e Mon Sep 17 00:00:00 2001 From: bryan Date: Sun, 12 Dec 2021 22:31:44 -0800 Subject: [PATCH 1/2] added more detailed logs around ES communication failure --- cmd/fleet/error.go | 12 ++++++++++++ internal/pkg/coordinator/monitor.go | 13 +++++++++++++ 2 files changed, 25 insertions(+) diff --git a/cmd/fleet/error.go b/cmd/fleet/error.go index 375521516..d27de8467 100644 --- a/cmd/fleet/error.go +++ b/cmd/fleet/error.go @@ -9,6 +9,7 @@ import ( "encoding/json" "net/http" "os" + "strings" "github.com/elastic/fleet-server/v7/internal/pkg/dl" "github.com/elastic/fleet-server/v7/internal/pkg/limit" @@ -144,6 +145,17 @@ func NewErrorResp(err error) errResp { } } + // Check if we have encountered a connectivity error + // Predicate taken from https://github.com/golang/go/blob/master/src/net/dial_test.go#L810 + if strings.Contains(err.Error(), "connection refused") { + return errResp{ + http.StatusServiceUnavailable, + "ServiceUnavailable", + "Fleet server unable to communicate with Elasticsearch", + zerolog.InfoLevel, + } + } + // Default return errResp{ StatusCode: http.StatusBadRequest, diff --git a/internal/pkg/coordinator/monitor.go b/internal/pkg/coordinator/monitor.go index 03019ac71..c1d31aa5f 100644 --- a/internal/pkg/coordinator/monitor.go +++ b/internal/pkg/coordinator/monitor.go @@ -128,11 +128,17 @@ func (m *monitorT) Run(ctx context.Context) (err error) { // Start timer loop to ensure leadership lT := time.NewTimer(m.checkInterval) defer lT.Stop() + + // Keep track of errored statuses + erroredOnLastRequest := false + numFailedRequests := 0 for { select { case hits := <-s.Output(): err = m.handlePolicies(ctx, hits) if err != nil { + erroredOnLastRequest = true + numFailedRequests++ m.log.Warn().Err(err).Msgf("Encountered an error while policy leadership changes; continuing to retry.") } case <-mT.C: @@ -141,6 +147,8 @@ func (m *monitorT) Run(ctx context.Context) (err error) { case <-lT.C: err = m.ensureLeadership(ctx) if err != nil { + erroredOnLastRequest = true + numFailedRequests++ m.log.Warn().Err(err).Msgf("Encountered an error while checking/assigning policy leaders; continuing to retry.") } lT.Reset(m.checkInterval) @@ -148,6 +156,11 @@ func (m *monitorT) Run(ctx context.Context) (err error) { m.releaseLeadership() return ctx.Err() } + if err == nil && erroredOnLastRequest { + erroredOnLastRequest = false + m.log.Info().Msgf("Policy leader monitor successfully recovered after %d attempts", numFailedRequests) + numFailedRequests = 0 + } } } From 1e555848fdd905095637acb4825ea30646088661 Mon Sep 17 00:00:00 2001 From: bryan Date: Tue, 28 Dec 2021 08:47:19 -0800 Subject: [PATCH 2/2] pinned clarifying comment to a version tag --- cmd/fleet/error.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/fleet/error.go b/cmd/fleet/error.go index d27de8467..73707dc9d 100644 --- a/cmd/fleet/error.go +++ b/cmd/fleet/error.go @@ -146,7 +146,7 @@ func NewErrorResp(err error) errResp { } // Check if we have encountered a connectivity error - // Predicate taken from https://github.com/golang/go/blob/master/src/net/dial_test.go#L810 + // Predicate taken from https://github.com/golang/go/blob/go1.17.5/src/net/dial_test.go#L798 if strings.Contains(err.Error(), "connection refused") { return errResp{ http.StatusServiceUnavailable,