Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix MaxSilences limit causes incomplete updates of existing silences #3877

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 21 additions & 15 deletions silence/silence.go
Original file line number Diff line number Diff line change
Expand Up @@ -564,6 +564,13 @@ func (s *Silences) getSilence(id string) (*pb.Silence, bool) {
return msil.Silence, true
}

func (s *Silences) toMeshSilence(sil *pb.Silence) *pb.MeshSilence {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've moved this to a function for two reasons:

  1. It allows us to check the full error in unit tests. For example:

    require.EqualError(t, s.Set(sil3), fmt.Sprintf("silence exceeded maximum size: %d bytes (limit: 4096 bytes)", s.toMeshSilence(sil3).Size()))
  2. We will use it in the next PR where we fix the same bug but for maximum size limits and validation errors.

return &pb.MeshSilence{
Silence: sil,
ExpiresAt: sil.EndsAt.Add(s.retention),
}
}

func (s *Silences) setSilence(sil *pb.Silence, now time.Time, skipValidate bool) error {
sil.UpdatedAt = now

Expand All @@ -573,10 +580,7 @@ func (s *Silences) setSilence(sil *pb.Silence, now time.Time, skipValidate bool)
}
}

msil := &pb.MeshSilence{
Silence: sil,
ExpiresAt: sil.EndsAt.Add(s.retention),
}
msil := s.toMeshSilence(sil)
b, err := marshalMeshSilence(msil)
if err != nil {
return err
Expand Down Expand Up @@ -612,25 +616,27 @@ func (s *Silences) Set(sil *pb.Silence) error {
return ErrNotFound
}

if ok {
if canUpdate(prev, sil, now) {
return s.setSilence(sil, now, false)
}
if getState(prev, s.nowUTC()) != types.SilenceStateExpired {
// We cannot update the silence, expire the old one.
if err := s.expire(prev.Id); err != nil {
return fmt.Errorf("expire previous silence: %w", err)
}
}
if ok && canUpdate(prev, sil, now) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code should run before checking the MaxSilences() limit because we don't want to prevent updating an existing silence if we are at the limit and the update does not require expiring and creating a new silence.

return s.setSilence(sil, now, false)
}

// If we got here it's either a new silence or a replacing one.
// If we got here it's either a new silence or a replacing one (which would
// also create a new silence) so we need to make sure we have capacity for
// the new silence.
if s.limits.MaxSilences != nil {
if m := s.limits.MaxSilences(); m > 0 && len(s.st)+1 > m {
return fmt.Errorf("exceeded maximum number of silences: %d (limit: %d)", len(s.st), m)
}
}

if ok && getState(prev, s.nowUTC()) != types.SilenceStateExpired {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The entire change here is moving the expiration of the existing silence to run after checking the MaxSilences() limit. This ensures that an existing silence is not expired if we are at the limit, as what would happen is the existing silence is expired and then the new silence cannot be created.

// We cannot update the silence, expire the old one to leave a history of
// the silence before modification.
if err := s.expire(prev.Id); err != nil {
return fmt.Errorf("expire previous silence: %w", err)
}
}

uid, err := uuid.NewV4()
if err != nil {
return fmt.Errorf("generate uuid: %w", err)
Expand Down
74 changes: 48 additions & 26 deletions silence/silence_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ package silence

import (
"bytes"
"fmt"
"os"
"runtime"
"sort"
Expand Down Expand Up @@ -470,32 +471,24 @@ func TestSilenceLimits(t *testing.T) {
EndsAt: time.Now().Add(5 * time.Minute),
}
require.NoError(t, s.Set(sil1))
require.NotEqual(t, "", sil1.Id)

// Insert sil2 should fail because maximum number of silences
// has been exceeded.
// Insert sil2 should fail because maximum number of silences has been
// exceeded.
sil2 := &pb.Silence{
Matchers: []*pb.Matcher{{Name: "a", Pattern: "b"}},
Matchers: []*pb.Matcher{{Name: "c", Pattern: "d"}},
StartsAt: time.Now(),
EndsAt: time.Now().Add(5 * time.Minute),
}
require.EqualError(t, s.Set(sil2), "exceeded maximum number of silences: 1 (limit: 1)")
require.Equal(t, "", sil2.Id)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have removed all checks for require.Equal(t, "", id) in this test because it's unrelated to limits.


// Expire sil1 and run the GC. This should allow sil2 to be
// inserted.
// Expire sil1 and run the GC. This should allow sil2 to be inserted.
require.NoError(t, s.Expire(sil1.Id))
n, err := s.GC()
require.NoError(t, err)
require.Equal(t, 1, n)

require.NoError(t, s.Set(sil2))
require.NotEqual(t, "", sil2.Id)

// Should be able to update sil2 without hitting the limit.
require.NoError(t, s.Set(sil2))

// Expire sil2.
// Expire sil2 and run the GC.
require.NoError(t, s.Expire(sil2.Id))
n, err = s.GC()
require.NoError(t, err)
Expand All @@ -505,26 +498,55 @@ func TestSilenceLimits(t *testing.T) {
sil3 := &pb.Silence{
Matchers: []*pb.Matcher{
{
Name: strings.Repeat("a", 2<<9),
Pattern: strings.Repeat("b", 2<<9),
Name: strings.Repeat("e", 2<<9),
Pattern: strings.Repeat("f", 2<<9),
},
{
Name: strings.Repeat("c", 2<<9),
Pattern: strings.Repeat("d", 2<<9),
Name: strings.Repeat("g", 2<<9),
Pattern: strings.Repeat("h", 2<<9),
},
},
CreatedBy: strings.Repeat("e", 2<<9),
Comment: strings.Repeat("f", 2<<9),
CreatedBy: strings.Repeat("i", 2<<9),
Comment: strings.Repeat("j", 2<<9),
StartsAt: time.Now(),
EndsAt: time.Now().Add(5 * time.Minute),
}
err = s.Set(sil3)
require.Error(t, err)
// Do not check the exact size as it can change between consecutive runs
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed this.

// due to padding.
require.Contains(t, err.Error(), "silence exceeded maximum size")
// TODO: Once we fix https://github.com/prometheus/alertmanager/issues/3878 this should be require.Equal.
require.NotEqual(t, "", sil3.Id)
require.EqualError(t, s.Set(sil3), fmt.Sprintf("silence exceeded maximum size: %d bytes (limit: 4096 bytes)", s.toMeshSilence(sil3).Size()))

// Should be able to insert sil4.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added additional tests here to cover the following cases:

  1. Updating an existing silence without modifications.
  2. Adding a comment to an existing silence.
  3. Failing to update an existing silence because the maximum number of silences has been exceeded should not expire the existing silence.

sil4 := &pb.Silence{
Matchers: []*pb.Matcher{{Name: "k", Pattern: "l"}},
StartsAt: time.Now(),
EndsAt: time.Now().Add(5 * time.Minute),
}
require.NoError(t, s.Set(sil4))

// Should be able to update sil4 without modifications. It is expected to
// keep the same ID.
sil5 := cloneSilence(sil4)
require.NoError(t, s.Set(sil5))
require.Equal(t, sil4.Id, sil5.Id)

// Should be able to update the comment. It is also expected to keep the
// same ID.
sil6 := cloneSilence(sil5)
sil6.Comment = "m"
require.NoError(t, s.Set(sil6))
require.Equal(t, sil5.Id, sil6.Id)

// Should not be able to update the start and end time as this requires
// sil6 to be expired and a new silence to be created. However, this would
// exceed the maximum number of silences, which counts both active and
// expired silences.
sil7 := cloneSilence(sil6)
sil7.StartsAt = time.Now().Add(5 * time.Minute)
sil7.EndsAt = time.Now().Add(10 * time.Minute)
require.EqualError(t, s.Set(sil7), "exceeded maximum number of silences: 1 (limit: 1)")

// sil6 should not be expired because the update failed.
sil6, err = s.QueryOne(QIDs(sil6.Id))
require.NoError(t, err)
require.Equal(t, types.SilenceStateActive, getState(sil6, s.nowUTC()))
}

func TestSilenceNoLimits(t *testing.T) {
Expand Down
Loading