feat: add counter to track alerts dropped outside of time_intervals #3565

Merged
Changes from 6 commits
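For context, the change below introduces a single CounterVec with a reason label and increments it from the mute and time-interval stages of the notification pipeline. The following is a minimal, standalone sketch (not taken from the diff; the package name and shortened help text are illustrative) of defining, incrementing, and reading back such a counter with client_golang:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	// Stand-in for Metrics.numNotificationSuppressedTotal in notify.go:
	// one counter per suppression reason, under the alertmanager namespace.
	suppressed := prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "alertmanager",
		Name:      "notification_suppressed_total",
		Help:      "The total number of notifications suppressed.", // shortened for this sketch
	}, []string{"reason"})

	reg := prometheus.NewRegistry()
	reg.MustRegister(suppressed)

	// A pipeline stage records how many alerts it dropped, labelled by why.
	suppressed.WithLabelValues("mute_time_interval").Add(3)

	// Read one child of the vector back, as the updated tests do.
	fmt.Println(testutil.ToFloat64(suppressed.WithLabelValues("mute_time_interval"))) // 3
}
```

The updated tests in notify_test.go read the counter the same way, through prom_testutil.ToFloat64.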
51 changes: 36 additions & 15 deletions notify/notify.go
@@ -251,6 +251,7 @@ type Metrics struct {
numTotalFailedNotifications *prometheus.CounterVec
numNotificationRequestsTotal *prometheus.CounterVec
numNotificationRequestsFailedTotal *prometheus.CounterVec
numNotificationSuppressedTotal *prometheus.CounterVec
notificationLatencySeconds *prometheus.HistogramVec

ff featurecontrol.Flagger
@@ -284,6 +285,11 @@ func NewMetrics(r prometheus.Registerer, ff featurecontrol.Flagger) *Metrics {
Name: "notification_requests_failed_total",
Help: "The total number of failed notification requests.",
}, labels),
numNotificationSuppressedTotal: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "alertmanager",
Name: "notification_suppressed_total",
Help: "The total number of notifications suppressed for being outside of active time intervals or within muted time intervals.",
}, []string{"reason"}),
notificationLatencySeconds: prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "alertmanager",
Name: "notification_latency_seconds",
@@ -296,7 +302,7 @@ func NewMetrics(r prometheus.Registerer, ff featurecontrol.Flagger) *Metrics {
r.MustRegister(
m.numNotifications, m.numTotalFailedNotifications,
m.numNotificationRequestsTotal, m.numNotificationRequestsFailedTotal,
m.notificationLatencySeconds,
m.numNotificationSuppressedTotal, m.notificationLatencySeconds,
)

return m
@@ -381,10 +387,10 @@ func (pb *PipelineBuilder) New(
rs := make(RoutingStage, len(receivers))

ms := NewGossipSettleStage(peer)
is := NewMuteStage(inhibitor)
tas := NewTimeActiveStage(intervener)
tms := NewTimeMuteStage(intervener)
ss := NewMuteStage(silencer)
is := NewMuteStage(inhibitor, pb.metrics)
tas := NewTimeActiveStage(intervener, pb.metrics)
tms := NewTimeMuteStage(intervener, pb.metrics)
ss := NewMuteStage(silencer, pb.metrics)

for name := range receivers {
st := createReceiverStage(name, receivers[name], wait, notificationLog, pb.metrics)
@@ -509,12 +515,13 @@ func (n *GossipSettleStage) Exec(ctx context.Context, _ log.Logger, alerts ...*t

// MuteStage filters alerts through a Muter.
type MuteStage struct {
muter types.Muter
muter types.Muter
metrics *Metrics
}

// NewMuteStage return a new MuteStage.
func NewMuteStage(m types.Muter) *MuteStage {
return &MuteStage{muter: m}
func NewMuteStage(m types.Muter, metrics *Metrics) *MuteStage {
return &MuteStage{muter: m, metrics: metrics}
}

// Exec implements the Stage interface.
@@ -535,7 +542,18 @@ func (n *MuteStage) Exec(ctx context.Context, logger log.Logger, alerts ...*type
}
if len(muted) > 0 {
level.Debug(logger).Log("msg", "Notifications will not be sent for muted alerts", "alerts", fmt.Sprintf("%v", muted))

var reason string
switch n.muter.(type) {
case *silence.Silencer:
reason = "silence"
case *inhibit.Inhibitor:
reason = "inhibition"
default:
}
n.metrics.numNotificationSuppressedTotal.WithLabelValues(reason).Add(float64(len(muted)))
}

return ctx, filtered, nil
}

@@ -877,13 +895,14 @@ func (n SetNotifiesStage) Exec(ctx context.Context, l log.Logger, alerts ...*typ
}

type timeStage struct {
muter types.TimeMuter
muter types.TimeMuter
metrics *Metrics
}

type TimeMuteStage timeStage

func NewTimeMuteStage(m types.TimeMuter) *TimeMuteStage {
return &TimeMuteStage{m}
func NewTimeMuteStage(m types.TimeMuter, metrics *Metrics) *TimeMuteStage {
return &TimeMuteStage{m, metrics}
}

// Exec implements the stage interface for TimeMuteStage.
@@ -910,16 +929,17 @@ func (tms TimeMuteStage) Exec(ctx context.Context, l log.Logger, alerts ...*type

// If the current time is inside a mute time, all alerts are removed from the pipeline.
if muted {
level.Debug(l).Log("msg", "Notifications not sent, route is within mute time")
tms.metrics.numNotificationSuppressedTotal.WithLabelValues("mute_time_interval").Add(float64(len(alerts)))
level.Debug(l).Log("msg", "Notifications not sent, route is within mute time", "alerts", len(alerts))
return ctx, nil, nil
}
return ctx, alerts, nil
}

type TimeActiveStage timeStage

func NewTimeActiveStage(m types.TimeMuter) *TimeActiveStage {
return &TimeActiveStage{m}
func NewTimeActiveStage(m types.TimeMuter, metrics *Metrics) *TimeActiveStage {
return &TimeActiveStage{m, metrics}
}

// Exec implements the stage interface for TimeActiveStage.
@@ -947,7 +967,8 @@ func (tas TimeActiveStage) Exec(ctx context.Context, l log.Logger, alerts ...*ty

// If the current time is not inside an active time, all alerts are removed from the pipeline
if !muted {
level.Debug(l).Log("msg", "Notifications not sent, route is not within active time")
tas.metrics.numNotificationSuppressedTotal.WithLabelValues("active_time_interval").Add(float64(len(alerts)))
level.Debug(l).Log("msg", "Notifications not sent, route is not within active time", "alerts", len(alerts))
return ctx, nil, nil
}

45 changes: 40 additions & 5 deletions notify/notify_test.go
@@ -633,7 +633,8 @@ func TestMuteStage(t *testing.T) {
return ok
})

stage := NewMuteStage(muter)
metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{})
stage := NewMuteStage(muter, metrics)

in := []model.LabelSet{
{},
@@ -672,6 +673,10 @@
if !reflect.DeepEqual(got, out) {
t.Fatalf("Muting failed, expected: %v\ngot %v", out, got)
}
suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if (len(in) - len(got)) != suppressed {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressed)
}
}

func TestMuteStageWithSilences(t *testing.T) {
@@ -687,9 +692,11 @@ func TestMuteStageWithSilences(t *testing.T) {
t.Fatal(err)
}

marker := types.NewMarker(prometheus.NewRegistry())
reg := prometheus.NewRegistry()
marker := types.NewMarker(reg)
silencer := silence.NewSilencer(silences, marker, log.NewNopLogger())
stage := NewMuteStage(silencer)
metrics := NewMetrics(reg, featurecontrol.NoopFlags{})
stage := NewMuteStage(silencer, metrics)

in := []model.LabelSet{
{},
@@ -732,6 +739,10 @@ func TestMuteStageWithSilences(t *testing.T) {
if !reflect.DeepEqual(got, out) {
t.Fatalf("Muting failed, expected: %v\ngot %v", out, got)
}
suppressedRoundOne := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if (len(in) - len(got)) != suppressedRoundOne {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressedRoundOne)
}

// Do it again to exercise the version tracking of silences.
_, alerts, err = stage.Exec(context.Background(), log.NewNopLogger(), inAlerts...)
@@ -748,6 +759,11 @@ func TestMuteStageWithSilences(t *testing.T) {
t.Fatalf("Muting failed, expected: %v\ngot %v", out, got)
}

suppressedRoundTwo := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if (len(in) - len(got) + suppressedRoundOne) != suppressedRoundTwo {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressedRoundTwo)
}

// Expire the silence and verify that no alerts are silenced now.
if err := silences.Expire(silID); err != nil {
t.Fatal(err)
@@ -765,6 +781,10 @@ func TestMuteStageWithSilences(t *testing.T) {
if !reflect.DeepEqual(got, in) {
t.Fatalf("Unmuting failed, expected: %v\ngot %v", in, got)
}
suppressedRoundThree := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if (len(in) - len(got) + suppressedRoundTwo) != suppressedRoundThree {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressedRoundThree)
}
}

func TestTimeMuteStage(t *testing.T) {
@@ -841,7 +861,8 @@ func TestTimeMuteStage(t *testing.T) {
}
m := map[string][]timeinterval.TimeInterval{"test": intervals}
intervener := timeinterval.NewIntervener(m)
stage := NewTimeMuteStage(intervener)
metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{})
stage := NewTimeMuteStage(intervener, metrics)

outAlerts := []*types.Alert{}
nonMuteCount := 0
@@ -875,6 +896,10 @@ func TestTimeMuteStage(t *testing.T) {
if len(outAlerts) != nonMuteCount {
t.Fatalf("Expected %d alerts after time mute stage but got %d", nonMuteCount, len(outAlerts))
}
suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if (len(cases) - nonMuteCount) != suppressed {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(cases) - nonMuteCount), suppressed)
}
}

func TestTimeActiveStage(t *testing.T) {
@@ -900,6 +925,11 @@ func TestTimeActiveStage(t *testing.T) {
labels: model.LabelSet{"mute": "me"},
shouldMute: true,
},
{
fireTime: "02 Dec 20 16:59 +0000",
labels: model.LabelSet{"mute": "me"},
shouldMute: true,
},
{
// Tuesday before 5pm
fireTime: "01 Dec 20 16:59 +0000",
@@ -926,7 +956,8 @@ func TestTimeActiveStage(t *testing.T) {
}
m := map[string][]timeinterval.TimeInterval{"test": intervals}
intervener := timeinterval.NewIntervener(m)
stage := NewTimeActiveStage(intervener)
metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{})
stage := NewTimeActiveStage(intervener, metrics)

outAlerts := []*types.Alert{}
nonMuteCount := 0
@@ -960,6 +991,10 @@ func TestTimeActiveStage(t *testing.T) {
if len(outAlerts) != nonMuteCount {
t.Fatalf("Expected %d alerts after time mute stage but got %d", nonMuteCount, len(outAlerts))
}
suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if (len(cases) - nonMuteCount) != suppressed {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(cases) - nonMuteCount), suppressed)
}
}

func BenchmarkHashAlert(b *testing.B) {
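Taken together, the diff records four reason label values on the new alertmanager_notification_suppressed_total series: silence and inhibition from MuteStage, mute_time_interval from TimeMuteStage, and active_time_interval from TimeActiveStage; a Muter that matches neither type in the switch is counted with an empty reason.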