Skip to content

Commit

Permalink
added more detailed logs around ES communication failure
Browse files Browse the repository at this point in the history
  • Loading branch information
lykkin committed Dec 13, 2021
1 parent f898297 commit cb9bfa7
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 0 deletions.
12 changes: 12 additions & 0 deletions cmd/fleet/error.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"encoding/json"
"net/http"
"os"
"strings"

"github.com/elastic/fleet-server/v7/internal/pkg/dl"
"github.com/elastic/fleet-server/v7/internal/pkg/limit"
Expand Down Expand Up @@ -144,6 +145,17 @@ func NewErrorResp(err error) errResp {
}
}

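// Check whether the error reports a refused connection, which is treated here as
// Fleet Server being unable to reach Elasticsearch.
// The "connection refused" substring predicate is taken from Go's own net tests:
// https://github.com/golang/go/blob/master/src/net/dial_test.go#L810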
if strings.Contains(err.Error(), "connection refused") {
return errResp{
http.StatusServiceUnavailable,
"ServiceUnavailable",
"Fleet server unable to communicate with Elasticsearch",
zerolog.InfoLevel,
}
}

// Default
return errResp{
StatusCode: http.StatusBadRequest,
Expand Down
13 changes: 13 additions & 0 deletions internal/pkg/coordinator/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,11 +128,17 @@ func (m *monitorT) Run(ctx context.Context) (err error) {
// Start timer loop to ensure leadership
lT := time.NewTimer(m.checkInterval)
defer lT.Stop()

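// Track whether an earlier request failed without a logged recovery yet, and how many
// requests have failed since the last recovery, so that the recovery can be logged once.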
erroredOnLastRequest := false
numFailedRequests := 0
for {
select {
case hits := <-s.Output():
err = m.handlePolicies(ctx, hits)
if err != nil {
erroredOnLastRequest = true
numFailedRequests++
m.log.Warn().Err(err).Msgf("Encountered an error while policy leadership changes; continuing to retry.")
}
case <-mT.C:
Expand All @@ -141,13 +147,20 @@ func (m *monitorT) Run(ctx context.Context) (err error) {
case <-lT.C:
err = m.ensureLeadership(ctx)
if err != nil {
erroredOnLastRequest = true
numFailedRequests++
m.log.Warn().Err(err).Msgf("Encountered an error while checking/assigning policy leaders; continuing to retry.")
}
lT.Reset(m.checkInterval)
case <-ctx.Done():
m.releaseLeadership()
return ctx.Err()
}
if err == nil && erroredOnLastRequest {
erroredOnLastRequest = false
m.log.Info().Msgf("Policy leader monitor successfully recovered after %d attempts", numFailedRequests)
numFailedRequests = 0
}
}
}

Expand Down

0 comments on commit cb9bfa7

Please sign in to comment.