Skip to content

Commit

Permalink
Remove immediate flush on reload/restart
Browse files Browse the repository at this point in the history
This commit changes Alertmanager so it no longer flushes aggregation
groups on configuration reload or restart of Alertmanager as this
behavior causes a number of issues:

1. Alertmanager will send notifications for inhibited alerts if the
   inhibited alert is sent to Alertmanager before the inhibiting alert
   following a restart

2. Reloading Alertmanager via /-/reload can cause incomplete flushes
   of aggregation groups (#3407)

A potential issue with this change is that following a reload or restart
of Alertmanager, alerts that were waiting for group_wait will have to
wait from the beginning of group_wait again. If group_wait is large
then notifications could take longer to send then expected.
  • Loading branch information
grobinson-grafana committed Jul 5, 2023
1 parent 487db13 commit bd8ae85
Show file tree
Hide file tree
Showing 2 changed files with 1 addition and 18 deletions.
10 changes: 0 additions & 10 deletions dispatch/dispatch.go
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,6 @@ type aggrGroup struct {
timeout func(time.Duration) time.Duration

mtx sync.RWMutex
hasFlushed bool
}

// newAggrGroup returns a new aggregation group.
Expand Down Expand Up @@ -450,7 +449,6 @@ func (ag *aggrGroup) run(nf notifyFunc) {
// Wait the configured interval before calling flush again.
ag.mtx.Lock()
ag.next.Reset(ag.opts.GroupInterval)
ag.hasFlushed = true
ag.mtx.Unlock()

ag.flush(func(alerts ...*types.Alert) bool {
Expand All @@ -477,14 +475,6 @@ func (ag *aggrGroup) insert(alert *types.Alert) {
if err := ag.alerts.Set(alert); err != nil {
level.Error(ag.logger).Log("msg", "error on set alert", "err", err)
}

// Immediately trigger a flush if the wait duration for this
// alert is already over.
ag.mtx.Lock()
defer ag.mtx.Unlock()
if !ag.hasFlushed && alert.StartsAt.Add(ag.opts.GroupWait).Before(time.Now()) {
ag.next.Reset(0)
}
}

func (ag *aggrGroup) empty() bool {
Expand Down
9 changes: 1 addition & 8 deletions dispatch/dispatch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,21 +188,14 @@ func TestAggrGroup(t *testing.T) {

ag.stop()

// Add an alert that started more than group_interval in the past. We expect
// immediate flushing.
// Finally, set all alerts to be resolved. After successful notify the aggregation group
// Set all alerts to be resolved. After successful notify the aggregation group
// should empty itself.
ag = newAggrGroup(context.Background(), lset, route, nil, log.NewNopLogger())
go ag.run(ntfy)

ag.insert(a1)
ag.insert(a2)

// a2 lies way in the past so the initial group_wait should be skipped.
select {
case <-time.After(opts.GroupWait / 2):
t.Fatalf("expected immediate alert but received none")

case batch := <-alertsCh:
exp := removeEndsAt(types.AlertSlice{a1, a2})
sort.Sort(batch)
Expand Down

0 comments on commit bd8ae85

Please sign in to comment.