diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go index 0378fd3dc3..45aa497527 100644 --- a/cmd/alertmanager/main.go +++ b/cmd/alertmanager/main.go @@ -145,6 +145,8 @@ func run() int { dataDir = kingpin.Flag("storage.path", "Base path for data storage.").Default("data/").String() retention = kingpin.Flag("data.retention", "How long to keep data for.").Default("120h").Duration() maintenanceInterval = kingpin.Flag("data.maintenance-interval", "Interval between garbage collection and snapshotting to disk of the silences and the notification logs.").Default("15m").Duration() + maxSilences = kingpin.Flag("silences.max-silences", "Maximum number of active and pending silences, excluding expired silences. If negative or zero, no limit is set.").Default("0").Int() + maxPerSilenceBytes = kingpin.Flag("silences.max-per-silence-bytes", "Maximum per silence size in bytes. If negative or zero, no limit is set.").Default("0").Int() alertGCInterval = kingpin.Flag("alerts.gc-interval", "Interval between alert GC.").Default("30m").Duration() webConfig = webflag.AddFlags(kingpin.CommandLine, ":9093") @@ -258,8 +260,12 @@ func run() int { silenceOpts := silence.Options{ SnapshotFile: filepath.Join(*dataDir, "silences"), Retention: *retention, - Logger: log.With(logger, "component", "silences"), - Metrics: prometheus.DefaultRegisterer, + Limits: silence.Limits{ + MaxSilences: *maxSilences, + MaxPerSilenceBytes: *maxPerSilenceBytes, + }, + Logger: log.With(logger, "component", "silences"), + Metrics: prometheus.DefaultRegisterer, } silences, err := silence.New(silenceOpts) diff --git a/docs/configuration.md b/docs/configuration.md index fa9878399c..5bc0979543 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -22,6 +22,17 @@ is not well-formed, the changes will not be applied and an error is logged. A configuration reload is triggered by sending a `SIGHUP` to the process or sending an HTTP POST request to the `/-/reload` endpoint. 
+## Limits + +Alertmanager supports a number of configurable limits via command-line flags. + +To limit the maximum number of active and pending silences, excluding expired ones, +use the `--silences.max-silences` flag. +You can limit the maximum size of individual silences with `--silences.max-per-silence-bytes`, +where the size is specified in bytes. + +Both limits are disabled by default. + ## Configuration file introduction To specify which configuration file to load, use the `--config.file` flag. diff --git a/silence/silence.go b/silence/silence.go index 84507bc5be..ec063edd54 100644 --- a/silence/silence.go +++ b/silence/silence.go @@ -193,6 +193,7 @@ type Silences struct { logger log.Logger metrics *metrics retention time.Duration + limits Limits mtx sync.RWMutex st state @@ -201,6 +202,16 @@ type Silences struct { mc matcherCache } +// Limits contains the limits for silences. +type Limits struct { + // MaxSilences limits the maximum number of active and pending silences. + // It does not include expired silences. + MaxSilences int + // MaxPerSilenceBytes is the maximum size of an individual silence as + // stored on disk. + MaxPerSilenceBytes int +} + // MaintenanceFunc represents the function to run as part of the periodic maintenance for silences. // It returns the size of the snapshot taken or an error if it failed. type MaintenanceFunc func() (int64, error) @@ -318,6 +329,7 @@ type Options struct { // Retention time for newly created Silences. Silences may be // garbage collected after the given duration after they ended. Retention time.Duration + Limits Limits // A logger used by background processing. 
Logger log.Logger @@ -342,6 +354,7 @@ func New(o Options) (*Silences, error) { mc: matcherCache{}, logger: log.NewNopLogger(), retention: o.Retention, + limits: o.Limits, broadcast: func([]byte) {}, st: state{}, } @@ -569,6 +582,13 @@ func (s *Silences) setSilence(sil *pb.Silence, now time.Time, skipValidate bool) return err } + // Check the limit unless the silence has been expired. This is to avoid + // situations where silences cannot be expired after the limit has been + // reduced. + if n := msil.Size(); s.limits.MaxPerSilenceBytes > 0 && n > s.limits.MaxPerSilenceBytes && sil.EndsAt.After(now) { + return fmt.Errorf("silence exceeded maximum size: %d bytes (limit: %d bytes)", n, s.limits.MaxPerSilenceBytes) + } + if s.st.merge(msil, now) { s.version++ } @@ -608,10 +628,10 @@ func (s *Silences) Set(sil *pb.Silence) (string, error) { func (s *Silences) set(sil *pb.Silence) (string, error) { now := s.nowUTC() prev, ok := s.getSilence(sil.Id) - if sil.Id != "" && !ok { return "", ErrNotFound } + if ok { if canUpdate(prev, sil, now) { return sil.Id, s.setSilence(sil, now, false) @@ -623,7 +643,24 @@ func (s *Silences) set(sil *pb.Silence) (string, error) { } } } + // If we got here it's either a new silence or a replacing one. + if s.limits.MaxSilences > 0 { + // Get the number of active and pending silences to enforce limits. 
+ q := &query{} + err := QState(types.SilenceStateActive, types.SilenceStatePending)(q) + if err != nil { + return "", fmt.Errorf("unable to query silences while checking limits: %w", err) + } + sils, _, err := s.query(q, s.nowUTC()) + if err != nil { + return "", fmt.Errorf("unable to query silences while checking limits: %w", err) + } + if len(sils)+1 > s.limits.MaxSilences { + return "", fmt.Errorf("exceeded maximum number of silences: %d (limit: %d)", len(sils), s.limits.MaxSilences) + } + } + uid, err := uuid.NewV4() if err != nil { return "", fmt.Errorf("generate uuid: %w", err) @@ -634,7 +671,11 @@ func (s *Silences) set(sil *pb.Silence) (string, error) { sil.StartsAt = now } - return sil.Id, s.setSilence(sil, now, false) + if err = s.setSilence(sil, now, false); err != nil { + return "", err + } + + return sil.Id, nil } // canUpdate returns true if silence a can be updated to b without @@ -778,6 +819,9 @@ func (s *Silences) QueryOne(params ...QueryParam) (*pb.Silence, error) { // Query for silences based on the given query parameters. It returns the // resulting silences and the state version the result is based on. func (s *Silences) Query(params ...QueryParam) ([]*pb.Silence, int, error) { + s.mtx.Lock() + defer s.mtx.Unlock() + s.metrics.queriesTotal.Inc() defer prometheus.NewTimer(s.metrics.queryDuration).ObserveDuration() @@ -817,9 +861,6 @@ func (s *Silences) query(q *query, now time.Time) ([]*pb.Silence, int, error) { // the use of post-filter functions is the trivial solution for now. 
var res []*pb.Silence - s.mtx.Lock() - defer s.mtx.Unlock() - if q.ids != nil { for _, id := range q.ids { if s, ok := s.st[id]; ok { diff --git a/silence/silence_test.go b/silence/silence_test.go index 864950eb5a..41f0861be8 100644 --- a/silence/silence_test.go +++ b/silence/silence_test.go @@ -18,6 +18,7 @@ import ( "os" "runtime" "sort" + "strings" "sync" "testing" "time" @@ -458,6 +459,74 @@ func TestSilenceSet(t *testing.T) { require.Equal(t, want, s.st, "unexpected state after silence creation") } +func TestSilenceLimits(t *testing.T) { + s, err := New(Options{ + Limits: Limits{ + MaxSilences: 1, + MaxPerSilenceBytes: 2 << 11, // 4KB + }, + }) + require.NoError(t, err) + + // Insert sil1 should succeed without error. + sil1 := &pb.Silence{ + Matchers: []*pb.Matcher{{Name: "a", Pattern: "b"}}, + StartsAt: time.Now(), + EndsAt: time.Now().Add(5 * time.Minute), + } + id1, err := s.Set(sil1) + require.NoError(t, err) + require.NotEqual(t, "", id1) + + // Insert sil2 should fail because maximum number of silences + // has been exceeded. + sil2 := &pb.Silence{ + Matchers: []*pb.Matcher{{Name: "a", Pattern: "b"}}, + StartsAt: time.Now(), + EndsAt: time.Now().Add(5 * time.Minute), + } + id2, err := s.Set(sil2) + require.EqualError(t, err, "exceeded maximum number of silences: 1 (limit: 1)") + require.Equal(t, "", id2) + + // Expire sil1. This should allow sil2 to be inserted. + require.NoError(t, s.Expire(id1)) + id2, err = s.Set(sil2) + require.NoError(t, err) + require.NotEqual(t, "", id2) + + // Should be able to update sil2 without hitting the limit. + _, err = s.Set(sil2) + require.NoError(t, err) + + // Expire sil2. + require.NoError(t, s.Expire(id2)) + + // Insert sil3 should fail because it exceeds maximum size. 
+ sil3 := &pb.Silence{ + Matchers: []*pb.Matcher{ + { + Name: strings.Repeat("a", 2<<9), + Pattern: strings.Repeat("b", 2<<9), + }, + { + Name: strings.Repeat("c", 2<<9), + Pattern: strings.Repeat("d", 2<<9), + }, + }, + CreatedBy: strings.Repeat("e", 2<<9), + Comment: strings.Repeat("f", 2<<9), + StartsAt: time.Now(), + EndsAt: time.Now().Add(5 * time.Minute), + } + id3, err := s.Set(sil3) + require.Error(t, err) + // Do not check the exact size as it can change between consecutive runs + // due to padding. + require.Contains(t, err.Error(), "silence exceeded maximum size") + require.Equal(t, "", id3) +} + func TestSilenceUpsert(t *testing.T) { s, err := New(Options{ Retention: time.Hour,