From ac5571536bc904da9662aa34f97518f494287cd1 Mon Sep 17 00:00:00 2001 From: adityamaru Date: Tue, 25 Jul 2023 15:58:22 -0400 Subject: [PATCH] sql,jobs: short-term fix for `UndefinedColumn` job_type error In https://github.com/cockroachdb/cockroach/issues/106762 we noticed that if a query is executed with an AS OF SYSTEM TIME clause that picks a transaction timestamp before the job_type migration, then parts of the jobs infrastructure will attempt to query the job_type column even though it doesn't exist at the transaction's timestamp. As a short term fix, when we encounter an `UndefinedColumn` error for the `job_type` column in `crdb_internal.jobs` we generate a synthetic retryable error so that the txn is pushed to a higher timestamp at which the upgrade will have completed and the `job_type` column will be visible. The longer term fix is being tracked in https://github.com/cockroachdb/cockroach/issues/106764. We are intentionally approaching this issue with a whack-a-mole approach to stabilize the tests the are running into this issue. We think time is better spent designing and investing in the longer term solution that will be tracked in #106764. Fixes: #107169 Informs: #106762 Release note: None --- pkg/jobs/BUILD.bazel | 1 + pkg/jobs/utils.go | 32 +++++++++++++++++++ pkg/sql/crdb_internal.go | 4 +-- .../upgrades/json_forward_indexes_test.go | 1 - 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/pkg/jobs/BUILD.bazel b/pkg/jobs/BUILD.bazel index 9b288826a91e..08c053692117 100644 --- a/pkg/jobs/BUILD.bazel +++ b/pkg/jobs/BUILD.bazel @@ -49,6 +49,7 @@ go_library( "//pkg/sql/catalog/descpb", "//pkg/sql/catalog/descs", "//pkg/sql/isql", + "//pkg/sql/pgwire/pgcode", "//pkg/sql/pgwire/pgerror", "//pkg/sql/protoreflect", "//pkg/sql/sem/builtins", diff --git a/pkg/jobs/utils.go b/pkg/jobs/utils.go index 9ba2145fb6ea..17c39fee9724 100644 --- a/pkg/jobs/utils.go +++ b/pkg/jobs/utils.go @@ -19,6 +19,8 @@ import ( "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" "github.com/cockroachdb/cockroach/pkg/kv" "github.com/cockroachdb/cockroach/pkg/sql/isql" + "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" + "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/errors" ) @@ -153,3 +155,33 @@ func JobExists( } return row != nil, nil } + +// IsJobTypeColumnDoesNotExistError returns true if the error is of the form +// `column "job_type" does not exist`. +func isJobTypeColumnDoesNotExistError(err error) bool { + return pgerror.GetPGCode(err) == pgcode.UndefinedColumn && + strings.Contains(err.Error(), "column \"job_type\" does not exist") +} + +// MaybeGenerateForcedRetryableError returns a +// TransactionRetryWithProtoRefreshError that will cause the txn to be retried +// if the error is because of an undefined job_type column. +// +// In https://github.com/cockroachdb/cockroach/issues/106762 we noticed that if +// a query is executed with an AS OF SYSTEM TIME clause that picks a transaction +// timestamp before the job_type migration, then parts of the jobs +// infrastructure will attempt to query the job_type column even though it +// doesn't exist at the transaction's timestamp. +// +// As a short term fix, when we encounter an `UndefinedColumn` error we +// generate a synthetic retryable error so that the txn is pushed to a +// higher timestamp at which the upgrade will have completed and the +// `job_type` column will be visible. The longer term fix is being tracked +// in https://github.com/cockroachdb/cockroach/issues/106764. +func MaybeGenerateForcedRetryableError(ctx context.Context, txn *kv.Txn, err error) error { + if err != nil && isJobTypeColumnDoesNotExistError(err) { + return txn.GenerateForcedRetryableError(ctx, "synthetic error "+ + "to push timestamp to after the `job_type` upgrade has run") + } + return err +} diff --git a/pkg/sql/crdb_internal.go b/pkg/sql/crdb_internal.go index cc4cb654b2d3..51b7cd1c5679 100644 --- a/pkg/sql/crdb_internal.go +++ b/pkg/sql/crdb_internal.go @@ -1069,7 +1069,7 @@ func populateSystemJobsTableRows( params..., ) if err != nil { - return matched, err + return matched, jobs.MaybeGenerateForcedRetryableError(ctx, p.Txn(), err) } cleanup := func(ctx context.Context) { @@ -1082,7 +1082,7 @@ func populateSystemJobsTableRows( for { hasNext, err := it.Next(ctx) if !hasNext || err != nil { - return matched, err + return matched, jobs.MaybeGenerateForcedRetryableError(ctx, p.Txn(), err) } currentRow := it.Cur() diff --git a/pkg/upgrade/upgrades/json_forward_indexes_test.go b/pkg/upgrade/upgrades/json_forward_indexes_test.go index e3e51d9eb5ee..a3ceddc965fe 100644 --- a/pkg/upgrade/upgrades/json_forward_indexes_test.go +++ b/pkg/upgrade/upgrades/json_forward_indexes_test.go @@ -25,7 +25,6 @@ import ( ) func TestJSONForwardingIndexes(t *testing.T) { - skip.WithIssue(t, 107169, "flaky test") var err error skip.UnderStressRace(t) defer leaktest.AfterTest(t)()