diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ac858e80b..88fee87e48 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel ### Added - [#2502](https://github.com/thanos-io/thanos/pull/2502) Added `hints` field to `SeriesResponse`. Hints in an opaque data structure that can be used to carry additional information from the store and its content is implementation specific. +- [#2521](https://github.com/thanos-io/thanos/pull/2521) Sidecar: add `thanos_sidecar_reloader_reloads_failed_total`, `thanos_sidecar_reloader_reloads_total`, `thanos_sidecar_reloader_watch_errors_total`, `thanos_sidecar_reloader_watch_events_total` and `thanos_sidecar_reloader_watches` metrics. ### Changed diff --git a/cmd/thanos/sidecar.go b/cmd/thanos/sidecar.go index caed499963..82411ded41 100644 --- a/cmd/thanos/sidecar.go +++ b/cmd/thanos/sidecar.go @@ -80,6 +80,7 @@ func registerSidecar(m map[string]setupFunc, app *kingpin.Application) { m[component.Sidecar.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, _ bool) error { rl := reloader.New( log.With(logger, "component", "reloader"), + extprom.WrapRegistererWithPrefix("thanos_sidecar_", reg), reloader.ReloadURLFromBase(*promURL), *reloaderCfgFile, *reloaderCfgOutputFile, diff --git a/pkg/reloader/example_test.go b/pkg/reloader/example_test.go index 5f67a5caf3..dc1ec3ee65 100644 --- a/pkg/reloader/example_test.go +++ b/pkg/reloader/example_test.go @@ -20,6 +20,7 @@ func ExampleReloader() { log.Fatal(err) } rl := reloader.New( + nil, nil, reloader.ReloadURLFromBase(u), "/path/to/cfg", diff --git a/pkg/reloader/reloader.go b/pkg/reloader/reloader.go index 508c14280f..02de1771f3 100644 --- a/pkg/reloader/reloader.go +++ b/pkg/reloader/reloader.go @@ -72,6 +72,8 @@ import ( "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" "github.com/thanos-io/thanos/pkg/runutil" ) @@ -89,6 +91,12 @@ type Reloader struct { lastCfgHash []byte lastRuleHash []byte + + reloads prometheus.Counter + reloadErrors prometheus.Counter + watches prometheus.Gauge + watchEvents prometheus.Counter + watchErrors prometheus.Counter } var firstGzipBytes = []byte{0x1f, 0x8b, 0x08} @@ -98,11 +106,11 @@ var firstGzipBytes = []byte{0x1f, 0x8b, 0x08} // If cfgOutputFile is not empty the config file will be decompressed if needed, environment variables // will be substituted and the output written into the given path. Prometheus should then use // cfgOutputFile as its config file path. -func New(logger log.Logger, reloadURL *url.URL, cfgFile string, cfgOutputFile string, ruleDirs []string) *Reloader { +func New(logger log.Logger, reg prometheus.Registerer, reloadURL *url.URL, cfgFile string, cfgOutputFile string, ruleDirs []string) *Reloader { if logger == nil { logger = log.NewNopLogger() } - return &Reloader{ + r := &Reloader{ logger: logger, reloadURL: reloadURL, cfgFile: cfgFile, @@ -110,7 +118,39 @@ func New(logger log.Logger, reloadURL *url.URL, cfgFile string, cfgOutputFile st ruleDirs: ruleDirs, watchInterval: 3 * time.Minute, retryInterval: 5 * time.Second, + + reloads: promauto.With(reg).NewCounter( + prometheus.CounterOpts{ + Name: "reloader_reloads_total", + Help: "Total number of reload requests.", + }, + ), + reloadErrors: promauto.With(reg).NewCounter( + prometheus.CounterOpts{ + Name: "reloader_reloads_failed_total", + Help: "Total number of reload requests that failed.", + }, + ), + watches: promauto.With(reg).NewGauge( + prometheus.GaugeOpts{ + Name: "reloader_watches", + Help: "Number of resources watched by the reloader.", + }, + ), + watchEvents: promauto.With(reg).NewCounter( + prometheus.CounterOpts{ + Name: "reloader_watch_events_total", + Help: "Total number of events received by the reloader from the watcher.", + }, + ), + watchErrors: promauto.With(reg).NewCounter( + prometheus.CounterOpts{ + Name: "reloader_watch_errors_total", + Help: "Total number of errors received by the reloader from the watcher.", + }, + ), } + return r } // We cannot detect everything via watch. Watch interval controls how often we re-read given dirs non-recursively. @@ -154,6 +194,7 @@ func (r *Reloader) Watch(ctx context.Context) error { tick := time.NewTicker(r.watchInterval) defer tick.Stop() + r.watches.Set(float64(len(watchables))) level.Info(r.logger).Log( "msg", "started watching config file and non-recursively rule dirs for changes", "cfg", r.cfgFile, @@ -166,11 +207,12 @@ func (r *Reloader) Watch(ctx context.Context) error { return nil case <-tick.C: case event := <-watcher.Events: - // TODO(bwplotka): Add metric if we are not cycling CPU here too much. + r.watchEvents.Inc() if _, ok := watchables[filepath.Dir(event.Name)]; !ok { continue } case err := <-watcher.Errors: + r.watchErrors.Inc() level.Error(r.logger).Log("msg", "watch error", "err", err) continue } @@ -280,7 +322,9 @@ func (r *Reloader) apply(ctx context.Context) error { defer cancel() if err := runutil.RetryWithLog(r.logger, r.retryInterval, retryCtx.Done(), func() error { + r.reloads.Inc() if err := r.triggerReload(ctx); err != nil { + r.reloadErrors.Inc() return errors.Wrap(err, "trigger reload") } diff --git a/pkg/reloader/reloader_test.go b/pkg/reloader/reloader_test.go index 3784635349..f9f93bee4c 100644 --- a/pkg/reloader/reloader_test.go +++ b/pkg/reloader/reloader_test.go @@ -62,7 +62,7 @@ func TestReloader_ConfigApply(t *testing.T) { input = path.Join(dir, "in", "cfg.yaml.tmpl") output = path.Join(dir, "out", "cfg.yaml") ) - reloader := New(nil, reloadURL, input, output, nil) + reloader := New(nil, nil, reloadURL, input, output, nil) reloader.watchInterval = 9999 * time.Hour // Disable interval to test watch logic only. reloader.retryInterval = 100 * time.Millisecond @@ -205,7 +205,7 @@ func TestReloader_RuleApply(t *testing.T) { testutil.Ok(t, os.Mkdir(path.Join(dir2, "rule-dir"), os.ModePerm)) testutil.Ok(t, os.Symlink(path.Join(dir2, "rule-dir"), path.Join(dir, "rule-dir"))) - reloader := New(nil, reloadURL, "", "", []string{dir, path.Join(dir, "rule-dir")}) + reloader := New(nil, nil, reloadURL, "", "", []string{dir, path.Join(dir, "rule-dir")}) reloader.watchInterval = 100 * time.Millisecond reloader.retryInterval = 100 * time.Millisecond