From 21c3ad14845767a6ae2d88c86dad8c36c437281f Mon Sep 17 00:00:00 2001 From: Andrew Werner Date: Mon, 21 Dec 2020 20:27:17 -0500 Subject: [PATCH] jobs: fix panic when a job has lost its claim during update Jobs can lose their claims. We detect when the claim doesn't match a new claim but we don't detect when the claim has been set to NULL (which is effectively equivalent). This is important because the way we clear claims is to set them to `NULL` and then we adopt jobs which have a `NULL` claim. This bug was introduced in #55120 and has been released only in the 21.1 alpha. Fixes #58049. Release note (bug fix): Fixed a bug from the previous alpha where the binary could crash if a running node lost its claim to a job while updating. --- pkg/jobs/jobs_test.go | 54 +++++++++++++++++++++++++++++++++++++++++++ pkg/jobs/update.go | 5 ++++ 2 files changed, 59 insertions(+) diff --git a/pkg/jobs/jobs_test.go b/pkg/jobs/jobs_test.go index 053c09696b3f..220b68e004bc 100644 --- a/pkg/jobs/jobs_test.go +++ b/pkg/jobs/jobs_test.go @@ -34,6 +34,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/catalog/colinfo" "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/sql/sqlutil" "github.com/cockroachdb/cockroach/pkg/sql/tests" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" @@ -2642,3 +2643,56 @@ func TestMetrics(t *testing.T) { } }) } + +// TestLoseLeaseDuringExecution tests that it is safe to call update during +// job execution when the job has lost its lease and that that update operation +// will fail. +// +// This is a regression test for #58049. +func TestLoseLeaseDuringExecution(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + defer jobs.ResetConstructors()() + + // Disable the loops from messing with the job execution. + defer jobs.TestingSetAdoptAndCancelIntervals(time.Hour, time.Hour)() + + ctx := context.Background() + + s, _, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer s.Stopper().Stop(ctx) + registry := s.JobRegistry().(*jobs.Registry) + + // The default FormatVersion value in SchemaChangeDetails corresponds to a + // pre-20.1 job. + rec := jobs.Record{ + DescriptorIDs: []descpb.ID{1}, + Details: jobspb.BackupDetails{}, + Progress: jobspb.BackupProgress{}, + } + + defer jobs.ResetConstructors()() + resumed := make(chan error, 1) + jobs.RegisterConstructor(jobspb.TypeBackup, func(j *jobs.Job, _ *cluster.Settings) jobs.Resumer { + return jobs.FakeResumer{ + OnResume: func(ctx context.Context, _ chan<- tree.Datums) error { + defer close(resumed) + _, err := s.InternalExecutor().(sqlutil.InternalExecutor).Exec( + ctx, "set-claim-null", nil, /* txn */ + `UPDATE system.jobs SET claim_session_id = NULL WHERE id = $1`, + *j.ID()) + assert.NoError(t, err) + err = j.WithTxn(nil).Update(ctx, func(txn *kv.Txn, md jobs.JobMetadata, ju *jobs.JobUpdater) error { + return nil + }) + resumed <- err + return err + }, + } + }) + + _, err := registry.CreateJobWithTxn(ctx, rec, nil /* txn */) + require.NoError(t, err) + registry.TestingNudgeAdoptionQueue() + require.Regexp(t, `expected session '\w+' but found NULL`, <-resumed) +} diff --git a/pkg/jobs/update.go b/pkg/jobs/update.go index ec72b6a02017..181b50921014 100644 --- a/pkg/jobs/update.go +++ b/pkg/jobs/update.go @@ -130,6 +130,11 @@ func (j *Job) Update(ctx context.Context, updateFn UpdateFn) error { } if j.sessionID != "" { + if row[3] == tree.DNull { + return errors.Errorf( + "job %d: with status '%s': expected session '%s' but found NULL", + *j.ID(), statusString, j.sessionID) + } storedSession := []byte(*row[3].(*tree.DBytes)) if !bytes.Equal(storedSession, j.sessionID.UnsafeBytes()) { return errors.Errorf(