From 346d60b184262718d8abe80b8063ed1efeabddcb Mon Sep 17 00:00:00 2001
From: Rafi Shamim <rafi@cockroachlabs.com>
Date: Tue, 20 Aug 2024 14:50:16 -0400
Subject: [PATCH] scjob: treat descriptor not found as permanent job error

Retrying a schema change after hitting a descriptor not found error just
means the job will keep being unable to find it. We mark them as
permanent now to avoid causing infinite retries.

Note that most errors already cause the schema change job to fail, but
if the job is non-cancelable (which it might be in the PostCommit
phase), then it needs to be marked in order to not be retried.

Release note: None
---
 pkg/jobs/errors.go                      | 8 ++++++--
 pkg/sql/schemachanger/scjob/BUILD.bazel | 2 ++
 pkg/sql/schemachanger/scjob/job.go      | 8 ++++++++
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/pkg/jobs/errors.go b/pkg/jobs/errors.go
index 80e992a5af2a..7c99284181bc 100644
--- a/pkg/jobs/errors.go
+++ b/pkg/jobs/errors.go
@@ -30,6 +30,8 @@ var errRetryJobSentinel = errors.New("retriable job error")
 
 // MarkAsRetryJobError marks an error as a retriable job error which
 // indicates that the registry should retry the job.
+// Note that if a job is _not_ in the NonCancelable state, it will _only_ be
+// retried if the error has been marked as a retry job error.
 func MarkAsRetryJobError(err error) error {
 	return errors.Mark(err, errRetryJobSentinel)
 }
@@ -42,8 +44,10 @@ func IsRetryJobError(err error) bool {
 // Registry does not retry a job that fails due to a permanent error.
 var errJobPermanentSentinel = errors.New("permanent job error")
 
-// MarkAsPermanentJobError marks an error as a permanent job error, which indicates
-// Registry to not retry the job when it fails due to this error.
+// MarkAsPermanentJobError marks an error as a permanent job error, which
+// indicates Registry to not retry the job when it fails due to this error.
+// Note that if a job is in the NonCancelable state, it will always be retried
+// _unless_ the error has been marked as permanent job error.
 func MarkAsPermanentJobError(err error) error {
 	return errors.Mark(err, errJobPermanentSentinel)
 }
diff --git a/pkg/sql/schemachanger/scjob/BUILD.bazel b/pkg/sql/schemachanger/scjob/BUILD.bazel
index 8fccfd343f0c..9a0dffad1328 100644
--- a/pkg/sql/schemachanger/scjob/BUILD.bazel
+++ b/pkg/sql/schemachanger/scjob/BUILD.bazel
@@ -16,6 +16,7 @@ go_library(
         "//pkg/roachpb",
         "//pkg/settings/cluster",
         "//pkg/sql",
+        "//pkg/sql/catalog",
         "//pkg/sql/catalog/descs",
         "//pkg/sql/descmetadata",
         "//pkg/sql/isql",
@@ -25,5 +26,6 @@ go_library(
         "//pkg/sql/schemachanger/scrun",
         "//pkg/util/log",
         "//pkg/util/timeutil",
+        "@com_github_cockroachdb_errors//:errors",
     ],
 )
diff --git a/pkg/sql/schemachanger/scjob/job.go b/pkg/sql/schemachanger/scjob/job.go
index c35aadb1f9e9..f318e7520f17 100644
--- a/pkg/sql/schemachanger/scjob/job.go
+++ b/pkg/sql/schemachanger/scjob/job.go
@@ -18,6 +18,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
 	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
 	"github.com/cockroachdb/cockroach/pkg/sql"
+	"github.com/cockroachdb/cockroach/pkg/sql/catalog"
 	"github.com/cockroachdb/cockroach/pkg/sql/catalog/descs"
 	"github.com/cockroachdb/cockroach/pkg/sql/descmetadata"
 	"github.com/cockroachdb/cockroach/pkg/sql/isql"
@@ -26,6 +27,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/sql/schemachanger/scrun"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
+	"github.com/cockroachdb/errors"
 )
 
 func init() {
@@ -147,6 +149,12 @@ func (n *newSchemaChangeResumer) run(ctx context.Context, execCtxI interface{})
 	)
 	// Return permanent errors back, otherwise we will try to retry
 	if sql.IsPermanentSchemaChangeError(err) {
+		// If a descriptor can't be found, we additionally mark the error as a
+		// permanent job error, so that non-cancelable jobs don't get retried. If a
+		// descriptor has gone missing, it isn't likely to come back.
+		if errors.IsAny(err, catalog.ErrDescriptorNotFound, catalog.ErrDescriptorDropped, catalog.ErrReferencedDescriptorNotFound) {
+			err = jobs.MarkAsPermanentJobError(err)
+		}
 		return err
 	}
 	if err != nil {