From 8f8767c842b917697444acbd7739fa141502bce6 Mon Sep 17 00:00:00 2001
From: David Taylor
Date: Mon, 6 Feb 2023 19:15:55 +0000
Subject: [PATCH 1/3] backup: fail instead of pausing when out of retries

This changes backups to fail with the most recent error instead of pausing
when they run out of retries while hitting non-permanent errors. This is
desired because a failed backup is more likely to be noticed in monitoring
(the number of failed jobs increments, they show up red on the jobs page,
etc.), and because a backup schedule is then able to trigger subsequent jobs
instead of waiting on the incomplete paused job, which will never resume or
complete without human intervention.

Release note (ops change): a BACKUP which encounters too many retryable
errors will now fail instead of pausing, to allow subsequent backups the
chance to succeed.
---
 pkg/ccl/backupccl/backup_job.go | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/pkg/ccl/backupccl/backup_job.go b/pkg/ccl/backupccl/backup_job.go
index 5acebaaeb35a..0a78f6856738 100644
--- a/pkg/ccl/backupccl/backup_job.go
+++ b/pkg/ccl/backupccl/backup_job.go
@@ -722,14 +722,12 @@ func (b *backupResumer) Resume(ctx context.Context, execCtx interface{}) error {
 		}
 	}
 
-	// We have exhausted retries, but we have not seen a "PermanentBulkJobError" so
-	// it is possible that this is a transient error that is taking longer than
-	// our configured retry to go away.
-	//
-	// Let's pause the job instead of failing it so that the user can decide
-	// whether to resume it or cancel it.
+	// We have exhausted retries without getting a "PermanentBulkJobError", but
+	// something must be wrong if we keep seeing errors so give up and fail to
+	// ensure that any alerting on failures is triggered and that any subsequent
+	// schedule runs are not blocked.
 	if err != nil {
-		return jobs.MarkPauseRequestError(errors.Wrap(err, "exhausted retries"))
+		return errors.Wrap(err, "exhausted retries")
 	}
 
 	var backupDetails jobspb.BackupDetails

From 4ed1551b0720d64b6f9eb0345b5d0a8aae729aa4 Mon Sep 17 00:00:00 2001
From: Radu Berinde
Date: Mon, 6 Feb 2023 21:10:28 -0800
Subject: [PATCH 2/3] clusterversion: don't assume BinaryMinSupportedVersionKey is first

The code that applies a "dev offset" to versions assumes that the minimum
supported version key is the first (non-primordial) key. This forces us to
remove keys when bumping `BinaryMinSupportedVersionKey`.

This change updates the code to check specifically for
`BinaryMinSupportedVersionKey` so that we can clean up old keys after bumping
the min supported version.

Release note: None
Epic: none
---
 pkg/clusterversion/cockroach_versions.go | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/pkg/clusterversion/cockroach_versions.go b/pkg/clusterversion/cockroach_versions.go
index 69249a29b130..61103933520a 100644
--- a/pkg/clusterversion/cockroach_versions.go
+++ b/pkg/clusterversion/cockroach_versions.go
@@ -756,10 +756,8 @@ var versionsSingleton = func() keyedVersions {
 	// version! For example, on a cluster that is on released version 3, a dev
 	// binary containing versions 1, 2, 3, and 4 started with this flag would
 	// renumber only 2-4 to be +1M. It would then step from 3 "up" to 1000002 --
-	// which conceptually is actually back down to 2 -- then back to to 1000003,
+	// which conceptually is actually back down to 2 -- then back to 1000003,
 	// then on to 1000004, etc.
-	skipFirst := allowUpgradeToDev
-	first := true
 	for i := range rawVersionsSingleton {
 		// VPrimordial versions are not offset; they don't matter for the logic
 		// offsetting is used for.
@@ -767,10 +765,11 @@ var versionsSingleton = func() keyedVersions {
 			continue
 		}
 
-		if skipFirst && first {
-			first = false
+		if allowUpgradeToDev && rawVersionsSingleton[i].Key <= BinaryMinSupportedVersionKey {
+			// Support upgrading from the non-development version of BinaryMinSupportedVersionKey.
 			continue
 		}
+
 		rawVersionsSingleton[i].Major += DevOffset
 	}
 }

From 0c1013d736300e7091067a6b0f744ea65cba7c01 Mon Sep 17 00:00:00 2001
From: Pavel Kalinnikov
Date: Tue, 7 Feb 2023 12:05:09 +0000
Subject: [PATCH 3/3] roachtest: fix inconsistency test

Fix usage of "find" util so that the test works on Linux as well.

Release note: none
Epic: none
---
 pkg/cmd/roachtest/tests/inconsistency.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/cmd/roachtest/tests/inconsistency.go b/pkg/cmd/roachtest/tests/inconsistency.go
index 9ca222f1c4e2..4c4a95034128 100644
--- a/pkg/cmd/roachtest/tests/inconsistency.go
+++ b/pkg/cmd/roachtest/tests/inconsistency.go
@@ -145,9 +145,9 @@ func runInconsistency(ctx context.Context, t test.Test, c cluster.Cluster) {
 		c.Run(ctx, c.Node(n), "find {store-dir}/auxiliary/checkpoints -name checkpoint.txt")
 		// The checkpoint can be inspected by the tooling.
 		c.Run(ctx, c.Node(n), "./cockroach debug range-descriptors "+
-			"$(find {store-dir}/auxiliary/checkpoints/* -type d -depth 0 | head -n1)")
+			"$(find {store-dir}/auxiliary/checkpoints/* -maxdepth 0 -type d | head -n1)")
 		c.Run(ctx, c.Node(n), "./cockroach debug range-data --limit 10 "+
-			"$(find {store-dir}/auxiliary/checkpoints/* -type d -depth 0 | head -n1) 1")
+			"$(find {store-dir}/auxiliary/checkpoints/* -maxdepth 0 -type d | head -n1) 1")
 	}
 
 	// NB: we can't easily verify the error because there's a lot of output which
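
For readers following the clusterversion change in the second patch, the new rule is simply: when upgrading to a dev binary is allowed, every version key at or below the minimum supported version keeps its released numbering, while later keys are moved into the dev-offset space. Below is a minimal, self-contained Go sketch of that rule, not the CockroachDB implementation; the devOffset constant, version struct, and applyDevOffset helper are illustrative stand-ins.

package main

import "fmt"

// devOffset stands in for the large offset applied to dev-binary versions.
const devOffset = 1000000

// version is a simplified stand-in for a keyed cluster version.
type version struct {
	key   int // position of the key in the ordered key list
	major int
}

// applyDevOffset mirrors the patched check: instead of skipping only the
// first non-primordial entry, it skips every entry up to and including the
// minimum supported version key, so an upgrade from the non-dev minimum
// supported version stays possible.
func applyDevOffset(versions []version, minSupportedKey int, allowUpgradeToDev bool) {
	for i := range versions {
		if allowUpgradeToDev && versions[i].key <= minSupportedKey {
			// Keep the released numbering for the min supported version
			// (and anything below it).
			continue
		}
		versions[i].major += devOffset
	}
}

func main() {
	vs := []version{{key: 0, major: 22}, {key: 1, major: 23}, {key: 2, major: 23}}
	applyDevOffset(vs, 0, true)
	fmt.Println(vs) // [{0 22} {1 1000023} {2 1000023}]
}

With a minimum supported key of 0, the first entry keeps its released major while the later entries are pushed past the offset, which is the behavior the patch's BinaryMinSupportedVersionKey comparison is meant to guarantee.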