From 8de387e725ab3103b04bad850ebbe6d749f68b89 Mon Sep 17 00:00:00 2001 From: George Robinson Date: Fri, 14 Jun 2024 11:21:23 +0100 Subject: [PATCH 1/3] Fix MaxSilences limit causes incomplete updates of existing silences This commit fixes a bug where the MaxSilences limit can cause an incomplete update of existing silences, where the old silence can be expired but the new silence cannot be created because it would exceeded the maximum number of silences. Signed-off-by: George Robinson --- silence/silence.go | 31 +++++++++-------- silence/silence_test.go | 74 ++++++++++++++++++++++++++--------------- 2 files changed, 65 insertions(+), 40 deletions(-) diff --git a/silence/silence.go b/silence/silence.go index 828c282c21..568b5d171e 100644 --- a/silence/silence.go +++ b/silence/silence.go @@ -564,6 +564,13 @@ func (s *Silences) getSilence(id string) (*pb.Silence, bool) { return msil.Silence, true } +func (s *Silences) toMeshSilence(sil *pb.Silence) *pb.MeshSilence { + return &pb.MeshSilence{ + Silence: sil, + ExpiresAt: sil.EndsAt.Add(s.retention), + } +} + func (s *Silences) setSilence(sil *pb.Silence, now time.Time, skipValidate bool) error { sil.UpdatedAt = now @@ -573,10 +580,7 @@ func (s *Silences) setSilence(sil *pb.Silence, now time.Time, skipValidate bool) } } - msil := &pb.MeshSilence{ - Silence: sil, - ExpiresAt: sil.EndsAt.Add(s.retention), - } + msil := s.toMeshSilence(sil) b, err := marshalMeshSilence(msil) if err != nil { return err @@ -612,16 +616,8 @@ func (s *Silences) Set(sil *pb.Silence) error { return ErrNotFound } - if ok { - if canUpdate(prev, sil, now) { - return s.setSilence(sil, now, false) - } - if getState(prev, s.nowUTC()) != types.SilenceStateExpired { - // We cannot update the silence, expire the old one. - if err := s.expire(prev.Id); err != nil { - return fmt.Errorf("expire previous silence: %w", err) - } - } + if ok && canUpdate(prev, sil, now) { + return s.setSilence(sil, now, false) } // If we got here it's either a new silence or a replacing one. @@ -631,6 +627,13 @@ func (s *Silences) Set(sil *pb.Silence) error { } } + if ok && getState(prev, s.nowUTC()) != types.SilenceStateExpired { + // We cannot update the silence, expire the old one. + if err := s.expire(prev.Id); err != nil { + return fmt.Errorf("expire previous silence: %w", err) + } + } + uid, err := uuid.NewV4() if err != nil { return fmt.Errorf("generate uuid: %w", err) diff --git a/silence/silence_test.go b/silence/silence_test.go index 98e70cda59..4ce2d070bb 100644 --- a/silence/silence_test.go +++ b/silence/silence_test.go @@ -15,6 +15,7 @@ package silence import ( "bytes" + "fmt" "os" "runtime" "sort" @@ -470,32 +471,24 @@ func TestSilenceLimits(t *testing.T) { EndsAt: time.Now().Add(5 * time.Minute), } require.NoError(t, s.Set(sil1)) - require.NotEqual(t, "", sil1.Id) - // Insert sil2 should fail because maximum number of silences - // has been exceeded. + // Insert sil2 should fail because maximum number of silences has been + // exceeded. sil2 := &pb.Silence{ - Matchers: []*pb.Matcher{{Name: "a", Pattern: "b"}}, + Matchers: []*pb.Matcher{{Name: "c", Pattern: "d"}}, StartsAt: time.Now(), EndsAt: time.Now().Add(5 * time.Minute), } require.EqualError(t, s.Set(sil2), "exceeded maximum number of silences: 1 (limit: 1)") - require.Equal(t, "", sil2.Id) - // Expire sil1 and run the GC. This should allow sil2 to be - // inserted. + // Expire sil1 and run the GC. This should allow sil2 to be inserted. require.NoError(t, s.Expire(sil1.Id)) n, err := s.GC() require.NoError(t, err) require.Equal(t, 1, n) - require.NoError(t, s.Set(sil2)) - require.NotEqual(t, "", sil2.Id) - // Should be able to update sil2 without hitting the limit. - require.NoError(t, s.Set(sil2)) - - // Expire sil2. + // Expire sil2 and run the GC. require.NoError(t, s.Expire(sil2.Id)) n, err = s.GC() require.NoError(t, err) @@ -505,26 +498,55 @@ func TestSilenceLimits(t *testing.T) { sil3 := &pb.Silence{ Matchers: []*pb.Matcher{ { - Name: strings.Repeat("a", 2<<9), - Pattern: strings.Repeat("b", 2<<9), + Name: strings.Repeat("e", 2<<9), + Pattern: strings.Repeat("f", 2<<9), }, { - Name: strings.Repeat("c", 2<<9), - Pattern: strings.Repeat("d", 2<<9), + Name: strings.Repeat("g", 2<<9), + Pattern: strings.Repeat("h", 2<<9), }, }, - CreatedBy: strings.Repeat("e", 2<<9), - Comment: strings.Repeat("f", 2<<9), + CreatedBy: strings.Repeat("i", 2<<9), + Comment: strings.Repeat("j", 2<<9), StartsAt: time.Now(), EndsAt: time.Now().Add(5 * time.Minute), } - err = s.Set(sil3) - require.Error(t, err) - // Do not check the exact size as it can change between consecutive runs - // due to padding. - require.Contains(t, err.Error(), "silence exceeded maximum size") - // TODO: Once we fix https://github.com/prometheus/alertmanager/issues/3878 this should be require.Equal. - require.NotEqual(t, "", sil3.Id) + require.EqualError(t, s.Set(sil3), fmt.Sprintf("silence exceeded maximum size: %d bytes (limit: 4096 bytes)", s.toMeshSilence(sil3).Size())) + + // Should be able to insert sil4. + sil4 := &pb.Silence{ + Matchers: []*pb.Matcher{{Name: "k", Pattern: "l"}}, + StartsAt: time.Now(), + EndsAt: time.Now().Add(5 * time.Minute), + } + require.NoError(t, s.Set(sil4)) + + // Should be able to update sil4 without modifications. It is expected to + // keep the same ID. + sil5 := cloneSilence(sil4) + require.NoError(t, s.Set(sil5)) + require.Equal(t, sil4.Id, sil5.Id) + + // Should be able to update the comment. It is also expected to keep the + // same ID. + sil6 := cloneSilence(sil5) + sil6.Comment = "m" + require.NoError(t, s.Set(sil6)) + require.Equal(t, sil5.Id, sil6.Id) + + // Should not be able to update the start and end time as this requires + // sil5 to be expired and a new silence to be created. However, this would + // exceed the maximum number of silences, which counts both active and + // expired silences. + sil7 := cloneSilence(sil6) + sil7.StartsAt = time.Now().Add(5 * time.Minute) + sil7.EndsAt = time.Now().Add(10 * time.Minute) + require.EqualError(t, s.Set(sil7), "exceeded maximum number of silences: 1 (limit: 1)") + + // sil6 should not be expired because the update failed. + sil6, err = s.QueryOne(QIDs(sil6.Id)) + require.NoError(t, err) + require.Equal(t, types.SilenceStateActive, getState(sil6, s.nowUTC())) } func TestSilenceNoLimits(t *testing.T) { From 37e0a0ec32a206d92f6edcacfdc3d14f54a435e4 Mon Sep 17 00:00:00 2001 From: George Robinson Date: Mon, 24 Jun 2024 14:43:22 +0100 Subject: [PATCH 2/3] Fix comment Signed-off-by: George Robinson --- silence/silence_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/silence/silence_test.go b/silence/silence_test.go index 4ce2d070bb..155a154454 100644 --- a/silence/silence_test.go +++ b/silence/silence_test.go @@ -535,7 +535,7 @@ func TestSilenceLimits(t *testing.T) { require.Equal(t, sil5.Id, sil6.Id) // Should not be able to update the start and end time as this requires - // sil5 to be expired and a new silence to be created. However, this would + // sil6 to be expired and a new silence to be created. However, this would // exceed the maximum number of silences, which counts both active and // expired silences. sil7 := cloneSilence(sil6) From 0fad78df081c023916b5cc22341ba5a6b7a6a56b Mon Sep 17 00:00:00 2001 From: George Robinson Date: Mon, 24 Jun 2024 14:48:22 +0100 Subject: [PATCH 3/3] Feedback Signed-off-by: George Robinson --- silence/silence.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/silence/silence.go b/silence/silence.go index 568b5d171e..37eadc60c7 100644 --- a/silence/silence.go +++ b/silence/silence.go @@ -620,7 +620,9 @@ func (s *Silences) Set(sil *pb.Silence) error { return s.setSilence(sil, now, false) } - // If we got here it's either a new silence or a replacing one. + // If we got here it's either a new silence or a replacing one (which would + // also create a new silence) so we need to make sure we have capacity for + // the new silence. if s.limits.MaxSilences != nil { if m := s.limits.MaxSilences(); m > 0 && len(s.st)+1 > m { return fmt.Errorf("exceeded maximum number of silences: %d (limit: %d)", len(s.st), m) @@ -628,7 +630,8 @@ func (s *Silences) Set(sil *pb.Silence) error { } if ok && getState(prev, s.nowUTC()) != types.SilenceStateExpired { - // We cannot update the silence, expire the old one. + // We cannot update the silence, expire the old one to leave a history of + // the silence before modification. if err := s.expire(prev.Id); err != nil { return fmt.Errorf("expire previous silence: %w", err) }