From 4a56ae09aad1f9cf6ccec51a4e6ee1c960c31554 Mon Sep 17 00:00:00 2001 From: Kartik-Garg Date: Tue, 17 Jan 2023 17:45:14 +0530 Subject: [PATCH] Store: Make initial sync more robust Added a retry mechanism for the store initial sync: if the initial sync fails, it is retried until the given timeout duration elapses. Signed-off-by: Kartik-Garg --- CHANGELOG.md | 1 + cmd/thanos/store.go | 20 ++++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f05885dbbee..c9be78a554b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5995](https://github.com/thanos-io/thanos/pull/5995) Sidecar: Loads the TLS certificate during startup. - [#6044](https://github.com/thanos-io/thanos/pull/6044) Receive: mark ouf of window errors as conflict, if out-of-window samples ingestion is activated +- [#6050](https://github.com/thanos-io/thanos/pull/6050) Store: Re-try bucket store initial sync upon failure. - [#6066](https://github.com/thanos-io/thanos/pull/6066) Tracing: fixed panic because of nil sampler - [#6067](https://github.com/thanos-io/thanos/pull/6067) Receive: fixed panic when querying uninitialized TSDBs. diff --git a/cmd/thanos/store.go b/cmd/thanos/store.go index 30df09ba5e2..f3bc40c9dc2 100644 --- a/cmd/thanos/store.go +++ b/cmd/thanos/store.go @@ -49,6 +49,11 @@ import ( "github.com/thanos-io/thanos/pkg/ui" ) +const ( + timeoutDuration = 30 + intervalDuration = 10 +) + type storeConfig struct { indexCacheConfigs extflag.PathOrContent objStoreConfig extflag.PathOrContent @@ -381,14 +386,25 @@ func runStore( level.Info(logger).Log("msg", "initializing bucket store") begin := time.Now() - if err := bs.InitialSync(ctx); err != nil { + + //This will stop retrying after set timeout duration. 
+ initialSyncCtx, cancel := context.WithTimeout(ctx, timeoutDuration*time.Second) + defer cancel() + + // Retry on error until initialSyncCtx expires; it derives from ctx so a shutdown also stops the retries. + err := runutil.Retry(intervalDuration*time.Second, initialSyncCtx.Done(), func() error { + return bs.InitialSync(ctx) + }) + + if err != nil { close(bucketStoreReady) return errors.Wrap(err, "bucket store initial sync") } + level.Info(logger).Log("msg", "bucket store ready", "init_duration", time.Since(begin).String()) close(bucketStoreReady) - err := runutil.Repeat(conf.syncInterval, ctx.Done(), func() error { + err = runutil.Repeat(conf.syncInterval, ctx.Done(), func() error { if err := bs.SyncBlocks(ctx); err != nil { level.Warn(logger).Log("msg", "syncing blocks failed", "err", err) }