From 57429c40813588f42132d14e0f82627eee147a95 Mon Sep 17 00:00:00 2001 From: Seena Fallah Date: Sun, 13 Nov 2022 20:12:55 +0100 Subject: [PATCH] compact: retry on sync metas error (#5865) As SyncMetas is surrounded by Repeat func and can return retry errors, in some cases (like S3) errors (network issue, timeout, etc.) can be retried. Signed-off-by: Seena Fallah Signed-off-by: Seena Fallah --- CHANGELOG.md | 1 + cmd/thanos/compact.go | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b7d45e882e..6741eb6e84 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5801](https://github.com/thanos-io/thanos/pull/5801) Store: add a new limiter `--store.grpc.downloaded-bytes-limit` that limits the number of bytes downloaded in each Series/LabelNames/LabelValues call. Use `thanos_bucket_store_postings_size_bytes` for determining the limits. - [#5839](https://github.com/thanos-io/thanos/pull/5839) Receive: Add parameter `--tsdb.out-of-order.time-window` to set time window for experimental out-of-order samples ingestion. Disabled by default (set to 0s). Please note if you enable this option and you use compactor, make sure you set the `--enable-vertical-compaction` flag, otherwise you might risk compactor halt. - [#5836](https://github.com/thanos-io/thanos/pull/5836) Receive: Add hidden flag `tsdb.memory-snapshot-on-shutdown` to enable experimental TSDB feature to snapshot on shutdown. This is intended to speed up receiver restart. +- [#5865](https://github.com/thanos-io/thanos/pull/5865) Compact: Retry on sync metas error. 
### Changed diff --git a/cmd/thanos/compact.go b/cmd/thanos/compact.go index 97639d4840..33a7618416 100644 --- a/cmd/thanos/compact.go +++ b/cmd/thanos/compact.go @@ -576,6 +576,15 @@ func runCompact( return runutil.Repeat(conf.progressCalculateInterval, ctx.Done(), func() error { if err := sy.SyncMetas(ctx); err != nil { + // The RetryError signals that we hit a retriable error (transient error, no connection). + // You should alert on this being triggered too frequently. + if compact.IsRetryError(err) { + level.Error(logger).Log("msg", "retriable error", "err", err) + compactMetrics.retried.Inc() + + return nil + } + return errors.Wrapf(err, "could not sync metas") }