Metricbeat: add configurable failure threshold before reporting streams as degraded #41570

Merged
merged 10 commits on Nov 19, 2024
86 changes: 55 additions & 31 deletions metricbeat/mb/module/wrapper.go
@@ -36,11 +36,14 @@ import (
"github.com/elastic/elastic-agent-libs/testing"
)

// Expvar metric names.
const (
// Expvar metric names.
successesKey = "success"
failuresKey = "failures"
eventsKey = "events"

// Failure threshold config key
failureThresholdKey = "failure_threshold"
)

var (
@@ -70,7 +73,10 @@ type metricSetWrapper struct {
module *Wrapper // Parent Module.
stats *stats // stats for this MetricSet.

periodic bool // Set to true if this metricset is a periodic fetcher
periodic bool // Set to true if this metricset is a periodic fetcher
failureThreshold uint // threshold of consecutive errors needed to set the stream as degraded

consecutiveErrors uint // consecutive errors counter
}

// stats bundles common metricset stats.
@@ -106,11 +112,28 @@ func createWrapper(module mb.Module, metricSets []mb.MetricSet, options ...Optio
applyOption(wrapper)
}

failureThreshold := uint(1)

var streamHealthSettings struct {
FailureThreshold *uint `config:"failure_threshold"`
}

err := module.UnpackConfig(&streamHealthSettings)

if err != nil {
return nil, fmt.Errorf("unpacking raw config: %w", err)
}

if streamHealthSettings.FailureThreshold != nil {
failureThreshold = *streamHealthSettings.FailureThreshold
}

for i, metricSet := range metricSets {
wrapper.metricSets[i] = &metricSetWrapper{
MetricSet: metricSet,
module: wrapper,
stats: getMetricSetStats(wrapper.Name(), metricSet.Name()),
MetricSet: metricSet,
module: wrapper,
stats: getMetricSetStats(wrapper.Name(), metricSet.Name()),
failureThreshold: failureThreshold,
}
}
return wrapper, nil
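The pointer in the anonymous config struct above is what lets the code tell an unset failure_threshold apart from an explicit 0: unset falls back to the default of 1 (degrade on the first error, matching the previous behaviour), while 0 disables the degraded transition entirely because of the > 0 guard further down. A minimal standalone sketch of that resolution, with a hypothetical resolveThreshold helper:

```go
package main

import "fmt"

// resolveThreshold mirrors the unpacking logic in createWrapper: a nil pointer
// means failure_threshold was not configured, so the default of 1 applies; an
// explicit 0 means the stream is never reported as degraded.
func resolveThreshold(configured *uint) uint {
	threshold := uint(1)
	if configured != nil {
		threshold = *configured
	}
	return threshold
}

func main() {
	zero, five := uint(0), uint(5)
	fmt.Println(resolveThreshold(nil))   // 1: degrade on the first error (previous behaviour)
	fmt.Println(resolveThreshold(&zero)) // 0: never mark the stream as degraded
	fmt.Println(resolveThreshold(&five)) // 5: degrade after five consecutive errors
}
```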
@@ -254,35 +277,11 @@ func (msw *metricSetWrapper) fetch(ctx context.Context, reporter reporter) {
case mb.ReportingMetricSetV2Error:
reporter.StartFetchTimer()
err := fetcher.Fetch(reporter.V2())
if err != nil {
reporter.V2().Error(err)
if errors.As(err, &mb.PartialMetricsError{}) {
// mark module as running if metrics are partially available and display the error message
msw.module.UpdateStatus(status.Running, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
} else {
// mark it as degraded for any other issue encountered
msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
}
logp.Err("Error fetching data for metricset %s.%s: %s", msw.module.Name(), msw.Name(), err)
} else {
msw.module.UpdateStatus(status.Running, "")
}
msw.handleFetchError(err, reporter.V2())
case mb.ReportingMetricSetV2WithContext:
reporter.StartFetchTimer()
err := fetcher.Fetch(ctx, reporter.V2())
if err != nil {
reporter.V2().Error(err)
if errors.As(err, &mb.PartialMetricsError{}) {
// mark module as running if metrics are partially available and display the error message
msw.module.UpdateStatus(status.Running, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
} else {
// mark it as degraded for any other issue encountered
msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
}
logp.Err("Error fetching data for metricset %s.%s: %s", msw.module.Name(), msw.Name(), err)
} else {
msw.module.UpdateStatus(status.Running, "")
}
msw.handleFetchError(err, reporter.V2())
default:
panic(fmt.Sprintf("unexpected fetcher type for %v", msw))
}
@@ -311,6 +310,31 @@ func (msw *metricSetWrapper) Test(d testing.Driver) {
})
}

func (msw *metricSetWrapper) handleFetchError(err error, reporter mb.PushReporterV2) {
switch {

case errors.As(err, &mb.PartialMetricsError{}):
reporter.Error(err)
msw.consecutiveErrors = 0
// mark module as running if metrics are partially available and display the error message
msw.module.UpdateStatus(status.Running, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
logp.Err("Error fetching data for metricset %s.%s: %s", msw.module.Name(), msw.Name(), err)

case err != nil:
reporter.Error(err)
msw.consecutiveErrors++
Contributor:
nit: I am not an expert in the Metricbeat/MetricSets code, but if fetch can be called by multiple goroutines, shouldn't we go with atomics here? If it isn't supposed to be called that way, maybe we should enrich the fetch godoc to mark it explicitly as not safe for concurrent use.

Member Author:
My understanding from here is that each metricset runs within its own goroutine, so I didn't add any extra synchronization around the counters.
@leehinman can maybe have a look, and if there's a chance of a race condition we can easily switch to atomics.

Contributor:
Let's move consecutiveErrors to the stats struct in metricSetWrapper and make it a *monitoring.Int; that way we can observe it and it will be atomic.
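A rough sketch of what that suggestion amounts to, assuming the monitoring.NewInt/Inc/Set/Get API from elastic-agent-libs; the actual wiring in the PR lives in the stats struct and getMetricSetStats(), and the names below are placeholders:

```go
package module

import "github.com/elastic/elastic-agent-libs/monitoring"

// streamStats is a placeholder for the real stats struct; storing the counter
// as a *monitoring.Int makes updates atomic and exposes the value via expvar.
type streamStats struct {
	consecutiveFailures *monitoring.Int
}

func newStreamStats(reg *monitoring.Registry) *streamStats {
	return &streamStats{
		consecutiveFailures: monitoring.NewInt(reg, "consecutive_failures"),
	}
}

// recordError increments the counter and reports whether the configured
// threshold has been reached.
func (s *streamStats) recordError(threshold uint) bool {
	s.consecutiveFailures.Inc()
	return threshold > 0 && s.consecutiveFailures.Get() >= int64(threshold)
}

// recordSuccess resets the counter on a successful (or partial) fetch.
func (s *streamStats) recordSuccess() {
	s.consecutiveFailures.Set(0)
}
```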

Member Author:
@leehinman I had a look at using a *monitoring.Int (which is just a struct wrapping an atomic.Int64) and was quite surprised to find some of the new unit tests failing.
I then realized that stats keeps state from previous tests, thanks to this and the getMetricSetStats() function.

Is it normal that we keep state from a previous wrapper instance (in the unit tests the metricSetWrapper is recreated along with the mocks for each test case), matching only on the metricset name? I am not sure we want to do the same for the consecutive-errors part just because a previous wrapper existed that failed all the time...
Any thoughts? Is remembering state from previous wrappers of the metricset the standard behavior in Metricbeat?

Member Author:
After a quick zoom with @leehinman we determined that the stats struct is shared by design between different metricSetWrapper instances that may run the same metricset against different hosts from the same module config block, in order to aggregate the success/failures/events counters (and now also the consecutive failures).
Fixed the tests (correctly releasing the stats structs) in caeafcb.
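For context on the "shared by design" point, here is an illustrative sketch of the pattern being described (type and function names are placeholders, not the actual wrapper.go code): a package-level registry keyed by module/metricset name hands the same stats object to every wrapper, so counters aggregate across hosts but also leak between test cases unless the stats are released.

```go
package module

import "sync"

// sharedStats stands in for the real stats struct; refs counts how many
// wrappers currently hold it.
type sharedStats struct {
	refs                int
	consecutiveFailures int64
}

var (
	statsMu       sync.Mutex
	statsRegistry = map[string]*sharedStats{}
)

// getStats returns the same object for every wrapper of a given
// module/metricset pair, aggregating counters across hosts.
func getStats(key string) *sharedStats {
	statsMu.Lock()
	defer statsMu.Unlock()
	s, ok := statsRegistry[key]
	if !ok {
		s = &sharedStats{}
		statsRegistry[key] = s
	}
	s.refs++
	return s
}

// releaseStats drops a reference and deletes the entry once unused, which is
// what the unit-test fix boils down to.
func releaseStats(key string) {
	statsMu.Lock()
	defer statsMu.Unlock()
	if s, ok := statsRegistry[key]; ok {
		if s.refs--; s.refs <= 0 {
			delete(statsRegistry, key)
		}
	}
}
```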

if msw.failureThreshold > 0 && msw.consecutiveErrors >= msw.failureThreshold {
// mark it as degraded for any other issue encountered
msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
}
logp.Err("Error fetching data for metricset %s.%s: %s", msw.module.Name(), msw.Name(), err)

default:
msw.consecutiveErrors = 0
msw.module.UpdateStatus(status.Running, "")
}
}
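Putting it together, the counter semantics can be summarized with a small standalone sketch (hypothetical names; it mirrors only the branching above, not the real reporter and status plumbing): partial errors and successes reset the counter, and only the Nth consecutive hard error flips the stream to degraded.

```go
package main

import "fmt"

type streamHealth struct {
	failureThreshold  uint
	consecutiveErrors uint
}

// observe mirrors handleFetchError's branching: a hard error increments the
// counter and degrades the stream once the threshold is reached; a success or
// partial error resets it.
func (s *streamHealth) observe(hardError bool) string {
	if !hardError {
		s.consecutiveErrors = 0
		return "running"
	}
	s.consecutiveErrors++
	if s.failureThreshold > 0 && s.consecutiveErrors >= s.failureThreshold {
		return "degraded"
	}
	return "running (error tolerated)"
}

func main() {
	s := &streamHealth{failureThreshold: 3}
	for i, hardError := range []bool{true, true, false, true, true, true} {
		fmt.Printf("fetch %d -> %s\n", i+1, s.observe(hardError))
	}
	// Fetches 1-2 fail but stay below the threshold, fetch 3 succeeds and
	// resets the counter, and only fetch 6 (the third consecutive failure)
	// reports the stream as degraded.
}
```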

type reporter interface {
StartFetchTimer()
V1() mb.PushReporter //nolint:staticcheck // PushReporter is deprecated but not removed