From 36e38cdbcb939c68aa6bf39b5e893658fa147fc4 Mon Sep 17 00:00:00 2001 From: adityamaru Date: Tue, 8 Aug 2023 14:47:46 +0000 Subject: [PATCH] jobs: fix mixed-version jobs flake Similar to https://github.com/cockroachdb/cockroach/pull/107570, this is a short-term fix for when a query executed with AS OF SYSTEM TIME picks a transaction timestamp before the job_info migration has run, in which case parts of the jobs infrastructure will attempt to query the job_info column even though it doesn't exist at the transaction's timestamp. As a short-term fix, when we encounter an UndefinedObject error for the job_info table, we generate a synthetic retryable error so that the txn is pushed to a higher timestamp at which the upgrade will have completed and the job_info table will be visible. The longer-term fix is being tracked in #106764. On master I can no longer reproduce the failure in #105032, but on 23.1 with this change I can successfully run 30 iterations of the test on a seed (-8690666577594439584) which previously saw occurrences of this flake. Fixes: #103239 Fixes: #105032 Release note: None --- pkg/jobs/job_info_storage.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/jobs/job_info_storage.go b/pkg/jobs/job_info_storage.go index 69bd3ba093fe..d749910f835c 100644 --- a/pkg/jobs/job_info_storage.go +++ b/pkg/jobs/job_info_storage.go @@ -235,7 +235,10 @@ func (i InfoStorage) Write(ctx context.Context, infoKey string, value []byte) er if value == nil { return errors.AssertionFailedf("missing value (infoKey %q)", infoKey) } - return i.write(ctx, infoKey, value) + if err := i.write(ctx, infoKey, value); err != nil { + return MaybeGenerateForcedRetryableError(ctx, i.txn.KV(), err) + } + return nil } // Delete removes the info record for the provided infoKey.