Skip to content

Commit

Permalink
Apply error in loop should not kill the sidecar (#2996)
Browse files Browse the repository at this point in the history
* Log prometheus config reload error instead of stopping sidecar

Signed-off-by: Max Neverov <[email protected]>

* Add reloader option; new flags to control reloader time intervals

Signed-off-by: Max Neverov <[email protected]>

* Fix documentation after changes

Signed-off-by: Max Neverov <[email protected]>

* Apply suggestions from PR review

Co-authored-by: Bartlomiej Plotka <[email protected]>
Signed-off-by: Max Neverov <[email protected]>

Co-authored-by: Bartlomiej Plotka <[email protected]>
  • Loading branch information
mneverov and bwplotka authored Aug 7, 2020
1 parent 0d033bf commit 34a2ff1
Show file tree
Hide file tree
Showing 7 changed files with 107 additions and 52 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel
- [#2892](https://github.com/thanos-io/thanos/pull/2892) Receive: Receiver fails when the initial upload fails.
- [#2865](https://github.com/thanos-io/thanos/pull/2865) ui: Migrate Thanos Ruler UI to React
- [#2964](https://github.com/thanos-io/thanos/pull/2964) Query: Add time range parameters to label APIs. Add `start` and `end` fields to Store API `LabelNamesRequest` and `LabelValuesRequest`.
- [#2996](https://github.com/thanos-io/thanos/pull/2996) Sidecar: Add `reloader_config_apply_errors_total` metric. Add new flags `--reloader.watch-interval`, and `--reloader.retry-interval`.

### Changed

Expand Down
9 changes: 9 additions & 0 deletions cmd/thanos/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ type reloaderConfig struct {
confFile string
envVarConfFile string
ruleDirectories []string
watchInterval time.Duration
retryInterval time.Duration
}

func (rc *reloaderConfig) registerFlag(cmd *kingpin.CmdClause) *reloaderConfig {
Expand All @@ -110,6 +112,13 @@ func (rc *reloaderConfig) registerFlag(cmd *kingpin.CmdClause) *reloaderConfig {
cmd.Flag("reloader.rule-dir",
"Rule directories for the reloader to refresh (repeated field).").
StringsVar(&rc.ruleDirectories)
cmd.Flag("reloader.watch-interval",
"Controls how often reloader re-reads config and rules.").
Default("3m").DurationVar(&rc.watchInterval)
cmd.Flag("reloader.retry-interval",
"Controls how often reloader retries config reload in case of error.").
Default("5s").DurationVar(&rc.retryInterval)

return rc
}

Expand Down
16 changes: 9 additions & 7 deletions cmd/thanos/sidecar.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,16 @@ func registerSidecar(m map[string]setupFunc, app *kingpin.Application) {
conf.registerFlag(cmd)

m[component.Sidecar.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, _ bool) error {
rl := reloader.New(
log.With(logger, "component", "reloader"),
rl := reloader.New(log.With(logger, "component", "reloader"),
extprom.WrapRegistererWithPrefix("thanos_sidecar_", reg),
reloader.ReloadURLFromBase(conf.prometheus.url),
conf.reloader.confFile,
conf.reloader.envVarConfFile,
conf.reloader.ruleDirectories,
)
&reloader.Options{
ReloadURL: reloader.ReloadURLFromBase(conf.prometheus.url),
CfgFile: conf.reloader.confFile,
CfgOutputFile: conf.reloader.envVarConfFile,
RuleDirs: conf.reloader.ruleDirectories,
WatchInterval: conf.reloader.watchInterval,
RetryInterval: conf.reloader.retryInterval,
})

return runSidecar(
g,
Expand Down
6 changes: 6 additions & 0 deletions docs/components/sidecar.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,12 @@ Flags:
--reloader.rule-dir=RELOADER.RULE-DIR ...
Rule directories for the reloader to refresh
(repeated field).
--reloader.watch-interval=3m
Controls how often reloader re-reads config and
rules.
--reloader.retry-interval=5s
Controls how often reloader retries config
reload in case of error.
--objstore.config-file=<file-path>
Path to YAML file that contains object store
configuration. See format details:
Expand Down
17 changes: 9 additions & 8 deletions pkg/reloader/example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"log"
"net/http"
"net/url"
"time"

"github.com/thanos-io/thanos/pkg/reloader"
)
Expand All @@ -19,14 +20,14 @@ func ExampleReloader() {
if err != nil {
log.Fatal(err)
}
rl := reloader.New(
nil,
nil,
reloader.ReloadURLFromBase(u),
"/path/to/cfg",
"/path/to/cfg.out",
[]string{"/path/to/dirs"},
)
rl := reloader.New(nil, nil, &reloader.Options{
ReloadURL: reloader.ReloadURLFromBase(u),
CfgFile: "/path/to/cfg",
CfgOutputFile: "/path/to/cfg.out",
RuleDirs: []string{"/path/to/dirs"},
WatchInterval: 3 * time.Minute,
RetryInterval: 5 * time.Second,
})

ctx, cancel := context.WithCancel(context.Background())
go func() {
Expand Down
67 changes: 45 additions & 22 deletions pkg/reloader/reloader.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,14 @@
// This and below for reloader:
//
// u, _ := url.Parse("http://localhost:9090")
// rl := reloader.New(
// nil,
// reloader.ReloadURLFromBase(u),
// "/path/to/cfg",
// "/path/to/cfg.out",
// []string{"/path/to/dirs"},
// )
// rl := reloader.New(nil, nil, &reloader.Options{
// ReloadURL: reloader.ReloadURLFromBase(u),
// CfgFile: "/path/to/cfg",
// CfgOutputFile: "/path/to/cfg.out",
// RuleDirs: []string{"/path/to/dirs"},
// WatchInterval: 3 * time.Minute,
// RetryInterval: 5 * time.Second,
// })
//
// The url of reloads can be generated with function ReloadURLFromBase().
// It will append the default path of reload into the given url:
Expand All @@ -41,7 +42,7 @@
// // ...
// cancel()
//
// By default, reloader will make a schedule to check the given config files and dirs of sum of hash with the last result,
// Reloader will make a schedule to check the given config files and dirs of sum of hash with the last result,
// even if it is no changes.
//
// A basic example of configuration template with environment variables:
Expand Down Expand Up @@ -97,27 +98,44 @@ type Reloader struct {
watches prometheus.Gauge
watchEvents prometheus.Counter
watchErrors prometheus.Counter
configErrors prometheus.Counter
}

// Options bundles options for the Reloader.
type Options struct {
// ReloadURL is a prometheus URL to trigger reloads.
ReloadURL *url.URL
// CfgFile is a path to the prometheus config file to watch.
CfgFile string
// CfgOutputFile is a path for the output config file.
// If cfgOutputFile is not empty the config file will be decompressed if needed, environment variables
// will be substituted and the output written into the given path. Prometheus should then use
// cfgOutputFile as its config file path.
CfgOutputFile string
// RuleDirs is a collection of paths for this reloader to watch over.
RuleDirs []string
// WatchInterval controls how often reloader re-reads config and rules.
WatchInterval time.Duration
// RetryInterval controls how often reloader retries config reload in case of error.
RetryInterval time.Duration
}

var firstGzipBytes = []byte{0x1f, 0x8b, 0x08}

// New creates a new reloader that watches the given config file and rule directory
// and triggers a Prometheus reload upon changes.
// If cfgOutputFile is not empty the config file will be decompressed if needed, environment variables
// will be substituted and the output written into the given path. Prometheus should then use
// cfgOutputFile as its config file path.
func New(logger log.Logger, reg prometheus.Registerer, reloadURL *url.URL, cfgFile string, cfgOutputFile string, ruleDirs []string) *Reloader {
func New(logger log.Logger, reg prometheus.Registerer, o *Options) *Reloader {
if logger == nil {
logger = log.NewNopLogger()
}
r := &Reloader{
logger: logger,
reloadURL: reloadURL,
cfgFile: cfgFile,
cfgOutputFile: cfgOutputFile,
ruleDirs: ruleDirs,
watchInterval: 3 * time.Minute,
retryInterval: 5 * time.Second,
reloadURL: o.ReloadURL,
cfgFile: o.CfgFile,
cfgOutputFile: o.CfgOutputFile,
ruleDirs: o.RuleDirs,
watchInterval: o.WatchInterval,
retryInterval: o.RetryInterval,

reloads: promauto.With(reg).NewCounter(
prometheus.CounterOpts{
Expand All @@ -131,6 +149,12 @@ func New(logger log.Logger, reg prometheus.Registerer, reloadURL *url.URL, cfgFi
Help: "Total number of reload requests that failed.",
},
),
configErrors: promauto.With(reg).NewCounter(
prometheus.CounterOpts{
Name: "reloader_config_apply_errors_total",
Help: "Total number of config applies that failed.",
},
),
watches: promauto.With(reg).NewGauge(
prometheus.GaugeOpts{
Name: "reloader_watches",
Expand Down Expand Up @@ -196,7 +220,7 @@ func (r *Reloader) Watch(ctx context.Context) error {

r.watches.Set(float64(len(watchables)))
level.Info(r.logger).Log(
"msg", "started watching config file and non-recursively rule dirs for changes",
"msg", "started watching config file and recursively rule dirs for changes",
"cfg", r.cfgFile,
"out", r.cfgOutputFile,
"dirs", strings.Join(r.ruleDirs, ","))
Expand All @@ -218,9 +242,8 @@ func (r *Reloader) Watch(ctx context.Context) error {
}

if err := r.apply(ctx); err != nil {
// Critical error.
// TODO(bwplotka): There is no need to get process down in this case and decrease availability, handle the error in different way.
return err
r.configErrors.Inc()
level.Error(r.logger).Log("msg", "apply error", "err", err)
}
}
}
Expand Down
43 changes: 28 additions & 15 deletions pkg/reloader/reloader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,14 @@ func TestReloader_ConfigApply(t *testing.T) {
input = filepath.Join(dir, "in", "cfg.yaml.tmpl")
output = filepath.Join(dir, "out", "cfg.yaml")
)
reloader := New(nil, nil, reloadURL, input, output, nil)
reloader.watchInterval = 9999 * time.Hour // Disable interval to test watch logic only.
reloader.retryInterval = 100 * time.Millisecond
reloader := New(nil, nil, &Options{
ReloadURL: reloadURL,
CfgFile: input,
CfgOutputFile: output,
RuleDirs: nil,
WatchInterval: 9999 * time.Hour, // Disable interval to test watch logic only.
RetryInterval: 100 * time.Millisecond,
})

// Fail without config.
err = reloader.Watch(ctx)
Expand Down Expand Up @@ -97,6 +102,7 @@ config:
}()

reloadsSeen := 0
attemptsCnt := 0
for {
select {
case <-ctx.Done():
Expand All @@ -105,12 +111,9 @@ config:
}

rel := reloads.Load().(int)
if rel <= reloadsSeen {
// Nothing new.
continue
}
reloadsSeen = rel

if reloadsSeen == 0 {
if reloadsSeen == 1 {
// Initial apply seen (without doing nothing).
f, err := ioutil.ReadFile(output)
testutil.Ok(t, err)
Expand All @@ -128,7 +131,7 @@ config:
b: $(TEST_RELOADER_THANOS_ENV)
c: $(TEST_RELOADER_THANOS_ENV2)
`), os.ModePerm))
} else {
} else if reloadsSeen == 2 {
// Another apply, ensure we see change.
f, err := ioutil.ReadFile(output)
testutil.Ok(t, err)
Expand All @@ -138,10 +141,15 @@ config:
b: 2
c: 3
`, string(f))
// All good, break
break

// Change the mode so reloader can't read the file.
testutil.Ok(t, os.Chmod(input, os.ModeDir))
attemptsCnt += 1
// That was the second attempt to reload config. All good, break.
if attemptsCnt == 2 {
break
}
}
reloadsSeen = rel
}
cancel2()
g.Wait()
Expand Down Expand Up @@ -191,9 +199,14 @@ func TestReloader_RuleApply(t *testing.T) {
testutil.Ok(t, os.Mkdir(path.Join(dir2, "rule-dir"), os.ModePerm))
testutil.Ok(t, os.Symlink(path.Join(dir2, "rule-dir"), path.Join(dir, "rule-dir")))

reloader := New(nil, nil, reloadURL, "", "", []string{dir, path.Join(dir, "rule-dir")})
reloader.watchInterval = 100 * time.Millisecond
reloader.retryInterval = 100 * time.Millisecond
reloader := New(nil, nil, &Options{
ReloadURL: reloadURL,
CfgFile: "",
CfgOutputFile: "",
RuleDirs: []string{dir, path.Join(dir, "rule-dir")},
WatchInterval: 100 * time.Millisecond,
RetryInterval: 100 * time.Millisecond,
})

// Some initial state.
testutil.Ok(t, ioutil.WriteFile(path.Join(dir, "rule1.yaml"), []byte("rule"), os.ModePerm))
Expand Down

0 comments on commit 34a2ff1

Please sign in to comment.