From d76c4073a0c041e6f561a8dbd8e62dcdd1aaf577 Mon Sep 17 00:00:00 2001 From: Michael Butler Date: Thu, 17 Aug 2023 15:31:55 -0400 Subject: [PATCH] jobs: pass explicit transaction to lookup-num-running query MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the `lookup-num-running` query was passed an implicit transaction. This could cause the query to hang forever if the explicit transaction invoked by its caller, `executeCandidateSchedule()`, retried. This patch passes the explicit transaction to the `lookup-num-running` query. Here's a step-by-step timeline of what could cause a deadlock: - Run `executeCandidateSchedule` with Txn A. Do a few reads. - Within `executeCandidateSchedule`, `lookup-num-running` uses an implicit Txn B to read. - Txn A reads and writes a few intents. - `executeCandidateSchedule` retries, using Txn A. - `lookup-num-running` deadlocks because it observes write intents from Txn A’s first attempt. To use the example above, this patch changes `lookup-num-running` to use Txn A. 
Release note: none Epic: none --- pkg/jobs/job_scheduler.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/jobs/job_scheduler.go b/pkg/jobs/job_scheduler.go index 23fcfa48c611..a23179aaaed6 100644 --- a/pkg/jobs/job_scheduler.go +++ b/pkg/jobs/job_scheduler.go @@ -112,13 +112,14 @@ func lookupNumRunningJobs( scheduleID int64, env scheduledjobs.JobSchedulerEnv, ie sqlutil.InternalExecutor, + txn *kv.Txn, ) (int64, error) { lookupStmt := fmt.Sprintf( "SELECT count(*) FROM %s WHERE created_by_type = '%s' AND created_by_id = %d AND status IN %s", env.SystemJobsTableName(), CreatedByScheduledJobs, scheduleID, NonTerminalStatusTupleString) row, err := ie.QueryRowEx( ctx, "lookup-num-running", - /*txn=*/ nil, + txn, sessiondata.InternalExecutorOverride{User: username.RootUserName()}, lookupStmt) if err != nil { @@ -254,7 +255,7 @@ func (s *jobScheduler) executeCandidateSchedule( return nil } - numRunning, err := lookupNumRunningJobs(ctx, schedule.ScheduleID(), s.env, s.InternalExecutor) + numRunning, err := lookupNumRunningJobs(ctx, schedule.ScheduleID(), s.env, s.InternalExecutor, txn) if err != nil { return err }