From 3e6aae4ced7a7de13e65ac7a3c0f8f9c5cae20e8 Mon Sep 17 00:00:00 2001 From: Faizan Qazi Date: Fri, 16 Feb 2024 11:16:35 -0500 Subject: [PATCH] sql: crdb_internal.leases could cause a deadlock with the lease manager Previously, a deadlock could occur between crdb_internal.leases and the lease manager when attempting to renew or release leases on the role_member inside crdb_internal.leases table. This deadlock occurs because lease manager lock is held while visting leases. The issue was easily reproducible when the allow_role_memberships_to_change_during_transaction setting was enabled. To resolve this, the patch introduces in-memory caching of all leases within crdb_internal.leases before privilege checks are performed, and any locks are subsequently released. Fixes: #119253 Release note (bug fix): crdb_internal.leases could cause a node to become unavailable due to a deadlock in the leasing subsystem. --- pkg/sql/crdb_internal.go | 51 ++++++++++++------- .../testdata/logic_test/crdb_internal | 15 ++++++ 2 files changed, 49 insertions(+), 17 deletions(-) diff --git a/pkg/sql/crdb_internal.go b/pkg/sql/crdb_internal.go index 42b6cd886d87..1e5079c531c8 100644 --- a/pkg/sql/crdb_internal.go +++ b/pkg/sql/crdb_internal.go @@ -840,9 +840,14 @@ CREATE TABLE crdb_internal.schema_changes ( }, } +type crdbInternalLeasesTableEntry struct { + desc catalog.Descriptor + takenOffline bool + expiration tree.DTimestamp +} + // TODO(tbg): prefix with node_. var crdbInternalLeasesTable = virtualSchemaTable{ - comment: `acquired table leases (RAM; local node only)`, schema: ` CREATE TABLE crdb_internal.leases ( node_id INT NOT NULL, @@ -852,33 +857,45 @@ CREATE TABLE crdb_internal.leases ( expiration TIMESTAMP NOT NULL, deleted BOOL NOT NULL )`, + comment: `acquired table leases (RAM; local node only)`, populate: func( ctx context.Context, p *planner, _ catalog.DatabaseDescriptor, addRow func(...tree.Datum) error, ) error { nodeID, _ := p.execCfg.NodeInfo.NodeID.OptionalNodeID() // zero if not available - var iterErr error + var leaseEntries []crdbInternalLeasesTableEntry p.LeaseMgr().VisitLeases(func(desc catalog.Descriptor, takenOffline bool, _ int, expiration tree.DTimestamp) (wantMore bool) { - if ok, err := p.HasAnyPrivilege(ctx, desc); err != nil { - iterErr = err - return false + // Because we have locked the lease manager while visiting every lease, + // it not safe to *acquire* or *release* any leases in the process. As + // a result, build an in memory cache of entries and process them after + // all the locks have been released. Previously we would check privileges, + // which would need to get the lease on the role_member table. Simply skipping + // this check on that table will not be sufficient since any active lease + // on it could expire or need an renewal and cause the sample problem. + leaseEntries = append(leaseEntries, crdbInternalLeasesTableEntry{ + desc: desc, + takenOffline: takenOffline, + expiration: expiration, + }) + return true + }) + for _, entry := range leaseEntries { + if ok, err := p.HasAnyPrivilege(ctx, entry.desc); err != nil { + return err } else if !ok { - return true + continue } - if err := addRow( tree.NewDInt(tree.DInt(nodeID)), - tree.NewDInt(tree.DInt(int64(desc.GetID()))), - tree.NewDString(desc.GetName()), - tree.NewDInt(tree.DInt(int64(desc.GetParentID()))), - &expiration, - tree.MakeDBool(tree.DBool(takenOffline)), + tree.NewDInt(tree.DInt(int64(entry.desc.GetID()))), + tree.NewDString(entry.desc.GetName()), + tree.NewDInt(tree.DInt(int64(entry.desc.GetParentID()))), + &entry.expiration, + tree.MakeDBool(tree.DBool(entry.takenOffline)), ); err != nil { - iterErr = err - return false + return err } - return true - }) - return iterErr + } + return nil }, } diff --git a/pkg/sql/logictest/testdata/logic_test/crdb_internal b/pkg/sql/logictest/testdata/logic_test/crdb_internal index ff8a1cfa35e0..4adc6f177946 100644 --- a/pkg/sql/logictest/testdata/logic_test/crdb_internal +++ b/pkg/sql/logictest/testdata/logic_test/crdb_internal @@ -1417,3 +1417,18 @@ CREATE TABLE t76710_1 AS SELECT * FROM crdb_internal.statement_statistics; statement ok CREATE MATERIALIZED VIEW t76710_2 AS SELECT fingerprint_id FROM crdb_internal.cluster_statement_statistics; + +# In #119253 we observed a deadlock when visiting leases inside crdb_internal.lease +# between the lease manager and the authorization check for each descriptor. This +# test validates that this is no longer the case. +subtest allow_role_memberships_to_change_during_transaction + +user testuser + +statement ok +set allow_role_memberships_to_change_during_transaction=true + +statement ok +SELECT * FROM crdb_internal.leases; + +subtest end