Remove immediate flush on reload/restart

This commit changes Alertmanager so it no longer flushes aggregation groups on configuration reload or restart of Alertmanager as this behavior causes a number of issues: 1. Alertmanager will send notifications for inhibited alerts if the inhibited alert is sent to Alertmanager before the inhibiting alert following a restart 2. Reloading Alertmanager via /-/reload can cause incomplete flushes of aggregation groups (#3407) A potential issue with this change is that following a reload or restart of Alertmanager, alerts that were waiting for group_wait will have to wait from the beginning of group_wait again. If group_wait is large then notifications could take longer to send then expected.
prometheus · Jul 5, 2023 · bd8ae85 · bd8ae85
1 parent 487db13
commit bd8ae85
Show file tree

Hide file tree

Showing 2 changed files with 1 addition and 18 deletions.
diff --git a/dispatch/dispatch.go b/dispatch/dispatch.go
@@ -383,7 +383,6 @@ type aggrGroup struct {
 	timeout func(time.Duration) time.Duration
 
 	mtx        sync.RWMutex
-	hasFlushed bool
 }
 
 // newAggrGroup returns a new aggregation group.
@@ -450,7 +449,6 @@ func (ag *aggrGroup) run(nf notifyFunc) {
 			// Wait the configured interval before calling flush again.
 			ag.mtx.Lock()
 			ag.next.Reset(ag.opts.GroupInterval)
-			ag.hasFlushed = true
 			ag.mtx.Unlock()
 
 			ag.flush(func(alerts ...*types.Alert) bool {
@@ -477,14 +475,6 @@ func (ag *aggrGroup) insert(alert *types.Alert) {
 	if err := ag.alerts.Set(alert); err != nil {
 		level.Error(ag.logger).Log("msg", "error on set alert", "err", err)
 	}
-
-	// Immediately trigger a flush if the wait duration for this
-	// alert is already over.
-	ag.mtx.Lock()
-	defer ag.mtx.Unlock()
-	if !ag.hasFlushed && alert.StartsAt.Add(ag.opts.GroupWait).Before(time.Now()) {
-		ag.next.Reset(0)
-	}
 }
 
 func (ag *aggrGroup) empty() bool {

diff --git a/dispatch/dispatch_test.go b/dispatch/dispatch_test.go
@@ -188,21 +188,14 @@ func TestAggrGroup(t *testing.T) {
 
 	ag.stop()
 
-	// Add an alert that started more than group_interval in the past. We expect
-	// immediate flushing.
-	// Finally, set all alerts to be resolved. After successful notify the aggregation group
+	// Set all alerts to be resolved. After successful notify the aggregation group
 	// should empty itself.
 	ag = newAggrGroup(context.Background(), lset, route, nil, log.NewNopLogger())
 	go ag.run(ntfy)
-
 	ag.insert(a1)
 	ag.insert(a2)
 
-	// a2 lies way in the past so the initial group_wait should be skipped.
 	select {
-	case <-time.After(opts.GroupWait / 2):
-		t.Fatalf("expected immediate alert but received none")
-
 	case batch := <-alertsCh:
 		exp := removeEndsAt(types.AlertSlice{a1, a2})
 		sort.Sort(batch)