Metrics: Notification log maintenance success and failure
For various reasons we've observed different kinds of errors in this area, from read-only disks to silly code bugs. Errors during maintenance effectively amount to data loss, so we should encourage proper monitoring of this area.

Similar to #3285

Signed-off-by: gotjosh <[email protected]>
gotjosh committed Mar 7, 2023
1 parent 72b0cb0 commit c2e081a
Showing 2 changed files with 65 additions and 21 deletions.
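The heart of the change is the counter pattern in `runMaintenance`: count every run up front, and count a failure only when the callback returns an error, so an error rate can be derived from the two series. Below is a minimal, self-contained sketch of that pattern using the Prometheus Go client; the `example_*` metric names and the `maintenanceMetrics` type are illustrative and not part of Alertmanager.

```go
package main

import (
	"errors"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// maintenanceMetrics pairs the two counters the commit introduces:
// one incremented on every run, one incremented only on failed runs.
type maintenanceMetrics struct {
	total  prometheus.Counter
	errors prometheus.Counter
}

func newMaintenanceMetrics(r prometheus.Registerer) *maintenanceMetrics {
	m := &maintenanceMetrics{
		total: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "example_maintenance_total",
			Help: "How many maintenance runs were executed.",
		}),
		errors: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "example_maintenance_errors_total",
			Help: "How many maintenance runs failed.",
		}),
	}
	r.MustRegister(m.total, m.errors)
	return m
}

// runMaintenance mirrors the pattern from the diff: record the attempt
// before doing the work, and record the error only if the work fails.
func (m *maintenanceMetrics) runMaintenance(do func() error) error {
	m.total.Inc()
	if err := do(); err != nil {
		m.errors.Inc()
		return err
	}
	return nil
}

func main() {
	reg := prometheus.NewRegistry()
	m := newMaintenanceMetrics(reg)

	_ = m.runMaintenance(func() error { return nil })                          // succeeds
	_ = m.runMaintenance(func() error { return errors.New("read-only disk") }) // fails

	fmt.Println("total:", testutil.ToFloat64(m.total))   // 2
	fmt.Println("errors:", testutil.ToFloat64(m.errors)) // 1
}
```

With both counters exposed, operators can alert whenever `alertmanager_nflog_maintenance_errors_total` increases relative to `alertmanager_nflog_maintenance_total`, which is the monitoring the commit message asks for.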
21 changes: 19 additions & 2 deletions nflog/nflog.go
@@ -101,6 +101,8 @@ type metrics struct {
queryErrorsTotal prometheus.Counter
queryDuration prometheus.Histogram
propagatedMessagesTotal prometheus.Counter
maintenanceTotal prometheus.Counter
maintenanceErrorsTotal prometheus.Counter
}

func newMetrics(r prometheus.Registerer) *metrics {
@@ -120,6 +122,14 @@ func newMetrics(r prometheus.Registerer) *metrics {
Name: "alertmanager_nflog_snapshot_size_bytes",
Help: "Size of the last notification log snapshot in bytes.",
})
m.maintenanceTotal = prometheus.NewCounter(prometheus.CounterOpts{
Name: "alertmanager_nflog_maintenance_total",
Help: "How many maintenances were executed for the notification log.",
})
m.maintenanceErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{
Name: "alertmanager_nflog_maintenance_errors_total",
Help: "How many maintenances were executed for the notification log that failed.",
})
m.queriesTotal = prometheus.NewCounter(prometheus.CounterOpts{
Name: "alertmanager_nflog_queries_total",
Help: "Number of notification log queries were received.",
@@ -146,6 +156,8 @@ func newMetrics(r prometheus.Registerer) *metrics {
m.queryErrorsTotal,
m.queryDuration,
m.propagatedMessagesTotal,
m.maintenanceTotal,
m.maintenanceErrorsTotal,
)
}
return m
@@ -317,12 +329,17 @@ func (l *Log) Maintenance(interval time.Duration, snapf string, stopc <-chan str
}

runMaintenance := func(do func() (int64, error)) error {
l.metrics.maintenanceTotal.Inc()
start := l.now().UTC()
level.Debug(l.logger).Log("msg", "Running maintenance")
size, err := do()
level.Debug(l.logger).Log("msg", "Maintenance done", "duration", l.now().Sub(start), "size", size)
l.metrics.snapshotSize.Set(float64(size))
return err
if err != nil {
l.metrics.maintenanceErrorsTotal.Inc()
return err
}
level.Debug(l.logger).Log("msg", "Maintenance done", "duration", l.now().Sub(start), "size", size)
return nil
}

Loop:
65 changes: 46 additions & 19 deletions nflog/nflog_test.go
@@ -18,11 +18,13 @@ import (
"io"
"os"
"path/filepath"
"runtime"
"sync"
"testing"
"time"

"github.com/prometheus/client_golang/prometheus/testutil"
"go.uber.org/atomic"

pb "github.com/prometheus/alertmanager/nflog/nflogpb"

"github.com/benbjohnson/clock"
@@ -133,35 +135,54 @@ func TestLogSnapshot(t *testing.T) {
func TestWithMaintenance_SupportsCustomCallback(t *testing.T) {
f, err := os.CreateTemp("", "snapshot")
require.NoError(t, err, "creating temp file failed")

stopc := make(chan struct{})
var mtx sync.Mutex
var mc int
reg := prometheus.NewPedanticRegistry()
opts := Options{
Metrics: prometheus.NewPedanticRegistry(),
Metrics: reg,
SnapshotFile: f.Name(),
}

l, err := New(opts)
mockClock := clock.NewMock()
l.clock = mockClock
clock := clock.NewMock()
l.clock = clock
require.NoError(t, err)

go l.Maintenance(100*time.Millisecond, f.Name(), stopc, func() (int64, error) {
mtx.Lock()
mc++
mtx.Unlock()
var calls atomic.Int32
var wg sync.WaitGroup

return 0, nil
})
runtime.Gosched() // ensure that the ticker is running.
wg.Add(1)
go func() {
defer wg.Done()
l.Maintenance(100*time.Millisecond, f.Name(), stopc, func() (int64, error) {
calls.Add(1)
return 0, nil
})

mockClock.Add(200 * time.Millisecond)
close(stopc)
}()
gosched()

// Before the first tick, no maintenance executed.
clock.Add(99 * time.Millisecond)
require.EqualValues(t, 0, calls.Load())

mtx.Lock()
defer mtx.Unlock()
require.Equal(t, 2, mc)
// Tick once.
clock.Add(1 * time.Millisecond)
require.EqualValues(t, 1, calls.Load())

// Stop the maintenance loop. We should get exactly one more execution of the maintenance func.
close(stopc)
wg.Wait()

require.EqualValues(t, 2, calls.Load())
// Check the maintenance metrics.
require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP alertmanager_nflog_maintenance_errors_total How many maintenances were executed for the notification log that failed.
# TYPE alertmanager_nflog_maintenance_errors_total counter
alertmanager_nflog_maintenance_errors_total 0
# HELP alertmanager_nflog_maintenance_total How many maintenances were executed for the notification log.
# TYPE alertmanager_nflog_maintenance_total counter
alertmanager_nflog_maintenance_total 2
`), "alertmanager_nflog_maintenance_total", "alertmanager_nflog_maintenance_errors_total"))
}

func TestReplaceFile(t *testing.T) {
@@ -356,3 +377,9 @@ func TestStateDecodingError(t *testing.T) {
_, err = decodeState(bytes.NewReader(msg))
require.Equal(t, ErrInvalidState, err)
}

// runtime.Gosched() does not "suspend" the current goroutine so there's no guarantee that the main goroutine won't
// be able to continue. For more see https://pkg.go.dev/runtime#Gosched.
func gosched() {
time.Sleep(1 * time.Millisecond)
}
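
The rewritten test drives Maintenance with the mock clock from github.com/benbjohnson/clock instead of wall-clock waits, which is why the ticks can be asserted exactly (no call at 99ms, one at 100ms, one more on shutdown). The sketch below shows that pattern in isolation; the worker loop, channel names, and fixed two-tick count are hypothetical, and it leans on the same small-sleep scheduling assumption the test's gosched() helper documents.

```go
package main

import (
	"fmt"
	"time"

	"github.com/benbjohnson/clock"
	"go.uber.org/atomic"
)

func main() {
	mock := clock.NewMock()
	var ticks atomic.Int32
	done := make(chan struct{})

	// A worker that ticks on the mock clock, as Maintenance does on l.clock.
	go func() {
		defer close(done)
		t := mock.Ticker(100 * time.Millisecond)
		defer t.Stop()
		for i := 0; i < 2; i++ {
			<-t.C
			ticks.Add(1)
		}
	}()

	// Let the goroutine install its ticker before advancing mock time;
	// this is the same reason the test calls gosched().
	time.Sleep(time.Millisecond)

	mock.Add(100 * time.Millisecond) // first tick
	mock.Add(100 * time.Millisecond) // second tick
	<-done

	fmt.Println("ticks:", ticks.Load()) // expected 2, given the scheduling caveat above
}
```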
