diff --git a/CHANGELOG.md b/CHANGELOG.md index a1ea3d9713b..734ab4fb27b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,8 @@ We use *breaking* word for marking changes that are not backward compatible (rel ### Fixed -- [#2411](https://github.com/thanos-io/thanos/pull/2411) Query: fix a bug where queries might not time out sometimes due to issues with one or more StoreAPIs +- [#2472](https://github.com/thanos-io/thanos/pull/2472) compact: Fixed bug where partial blocks were never deleted and causing spam of warnings. +- [#2411](https://github.com/thanos-io/thanos/pull/2411) Query: fix a bug where queries might not time out sometimes due to issues with one or more StoreAPIs. ## [v0.12.0](https://github.com/thanos-io/thanos/releases/tag/v0.12.0) - 2020.04.15 diff --git a/cmd/thanos/compact.go b/cmd/thanos/compact.go index 1bad45f773e..627010f1795 100644 --- a/cmd/thanos/compact.go +++ b/cmd/thanos/compact.go @@ -420,7 +420,7 @@ func runCompact( } // No need to resync before partial uploads and delete marked blocks. Last sync should be valid. - compact.BestEffortCleanAbortedPartialUploads(ctx, logger, sy.Partial(), bkt, partialUploadDeleteAttempts, blocksMarkedForDeletion) + compact.BestEffortCleanAbortedPartialUploads(ctx, logger, sy.Partial(), bkt, partialUploadDeleteAttempts, blocksCleaned, blockCleanupFailures) if err := blocksCleaner.DeleteMarkedBlocks(ctx); err != nil { return errors.Wrap(err, "error cleaning blocks") } diff --git a/pkg/compact/clean.go b/pkg/compact/clean.go index 53cfb169083..f70851bdb58 100644 --- a/pkg/compact/clean.go +++ b/pkg/compact/clean.go @@ -27,7 +27,8 @@ func BestEffortCleanAbortedPartialUploads( partial map[ulid.ULID]error, bkt objstore.Bucket, deleteAttempts prometheus.Counter, - blocksMarkedForDeletion prometheus.Counter, + blockCleanups prometheus.Counter, + blockCleanupFailures prometheus.Counter, ) { level.Info(logger).Log("msg", "started cleaning of aborted partial uploads") @@ -45,10 +46,16 @@ func BestEffortCleanAbortedPartialUploads( deleteAttempts.Inc() level.Info(logger).Log("msg", "found partially uploaded block; marking for deletion", "block", id) - if err := block.MarkForDeletion(ctx, logger, bkt, id, blocksMarkedForDeletion); err != nil { - level.Warn(logger).Log("msg", "failed to delete aborted partial upload; skipping", "block", id, "thresholdAge", PartialUploadThresholdAge, "err", err) - return + // We don't gather any information about deletion marks for partial blocks, so let's simply remove it. We waited + // long PartialUploadThresholdAge already. + // TODO(bwplotka): Fix some edge cases: https://github.com/thanos-io/thanos/issues/2470 . + if err := block.Delete(ctx, logger, bkt, id); err != nil { + blockCleanups.Inc() + level.Warn(logger).Log("msg", "failed to delete aborted partial upload; will retry in next iteration", "block", id, "thresholdAge", PartialUploadThresholdAge, "err", err) + continue } + + blockCleanupFailures.Inc() level.Info(logger).Log("msg", "deleted aborted partial upload", "block", id, "thresholdAge", PartialUploadThresholdAge) } level.Info(logger).Log("msg", "cleaning of aborted partial uploads done") diff --git a/pkg/compact/clean_test.go b/pkg/compact/clean_test.go index 1321c888ba6..51d603799bc 100644 --- a/pkg/compact/clean_test.go +++ b/pkg/compact/clean_test.go @@ -59,22 +59,17 @@ func TestBestEffortCleanAbortedPartialUploads(t *testing.T) { testutil.Ok(t, bkt.Upload(ctx, path.Join(shouldIgnoreID2.String(), "chunks", "000001"), &fakeChunk)) deleteAttempts := promauto.With(nil).NewCounter(prometheus.CounterOpts{}) - blocksMarkedForDeletion := promauto.With(nil).NewCounter(prometheus.CounterOpts{}) - + blockCleanupFailures := promauto.With(nil).NewCounter(prometheus.CounterOpts{}) _, partial, err := metaFetcher.Fetch(ctx) testutil.Ok(t, err) - BestEffortCleanAbortedPartialUploads(ctx, logger, partial, bkt, deleteAttempts, blocksMarkedForDeletion) + BestEffortCleanAbortedPartialUploads(ctx, logger, partial, bkt, deleteAttempts, blockCleanupFailures) testutil.Equals(t, 1.0, promtest.ToFloat64(deleteAttempts)) + testutil.Equals(t, 1.0, promtest.ToFloat64(blockCleanupFailures)) exists, err := bkt.Exists(ctx, path.Join(shouldDeleteID.String(), "chunks", "000001")) testutil.Ok(t, err) - testutil.Equals(t, true, exists) - - exists, err = bkt.Exists(ctx, path.Join(shouldDeleteID.String(), metadata.DeletionMarkFilename)) - testutil.Ok(t, err) - testutil.Equals(t, true, exists) - testutil.Equals(t, 1.0, promtest.ToFloat64(blocksMarkedForDeletion)) + testutil.Equals(t, false, exists) exists, err = bkt.Exists(ctx, path.Join(shouldIgnoreID1.String(), "chunks", "000001")) testutil.Ok(t, err) diff --git a/test/e2e/compact_test.go b/test/e2e/compact_test.go index 9524b960dd7..6212259b6b9 100644 --- a/test/e2e/compact_test.go +++ b/test/e2e/compact_test.go @@ -504,11 +504,11 @@ func TestCompactWithStoreGateway(t *testing.T) { // NOTE: We cannot assert on intermediate `thanos_blocks_meta_` metrics as those are gauge and change dynamically due to many // compaction groups. Wait for at least first compaction iteration (next is in 5m). testutil.Ok(t, c.WaitSumMetrics(e2e.Greater(0), "thanos_compactor_iterations_total")) - testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_blocks_cleaned_total")) // This should be 1 [BUG no 1]. TODO: fix. + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(1), "thanos_compactor_blocks_cleaned_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(2*4+2+2*3+1), "thanos_compactor_blocks_marked_for_deletion_total")) // 17. testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(2), "thanos_compactor_aborted_partial_uploads_deletion_attempts_total")) // We should have 1. testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(5), "thanos_compact_group_compactions_total")) - // TODO(bwplotka): This is confusing, should be either normal compaction or vertical not both (?) + // TODO(bwplotka): This is confusing, should be either normal compaction or vertical not both (https://github.com/thanos-io/thanos/issues/2469). testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(3), "thanos_compact_group_vertical_compactions_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compact_group_compactions_failures_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(12), "thanos_compact_group_compaction_runs_started_total")) @@ -519,7 +519,8 @@ func TestCompactWithStoreGateway(t *testing.T) { testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64( len(rawBlockIDs)+7+ - 5, // 5 compactions, 5 newly added blocks. + 5+ // 5 compactions, 5 newly added blocks. + -1, // Partial block removed. )), "thanos_blocks_meta_synced")) testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_sync_failures_total"))