From a767cdda788abbb7dd6a2e3b52df0d867f0b9bf8 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Mon, 11 Jan 2021 15:38:55 -0500 Subject: [PATCH] kvserver: don't refuse to fwd lease proposals in some edge cases This patch backpedals a little bit on the logic introduced in #55148. That patch said that, if a leader is known, every other replica refuses to propose a lease acquisition. Instead, the replica in question redirects whoever was triggering the lease acquisition to the leader, thinking that the leader should take the lease. That patch introduced a deadlock: some replicas refuse to take the lease because they are not VOTER_FULL (see CheckCanReceiveLease()). To fix the deadlock, this patch incorporates that check in the proposal buffer's decision about whether or not to reject a proposal: if the leader is believed to refuse to take the lease, then we again forward our own lease request. An edge case to the edge case is when the leader is not even part of the proposer's range descriptor. This can happen if the proposer is far behind. In this case, we assume that the leader is eligible. If it isn't, the deadlock will resolve once the proposer catches up. A future patch will relax the conditions under which a replica agrees to take the lease. VOTER_INCOMING replicas should take the lease. VOTER_DEMOTING are more controversial. 
Fixes #57798 Release note: None --- pkg/kv/kvserver/batcheval/cmd_lease.go | 4 +- .../kvserver/batcheval/cmd_lease_request.go | 2 +- pkg/kv/kvserver/batcheval/cmd_lease_test.go | 35 +++++++ .../kvserver/batcheval/cmd_lease_transfer.go | 2 +- pkg/kv/kvserver/replica_proposal_buf.go | 94 +++++++++++++++---- pkg/kv/kvserver/replica_proposal_buf_test.go | 88 +++++++++++++++-- 6 files changed, 195 insertions(+), 30 deletions(-) diff --git a/pkg/kv/kvserver/batcheval/cmd_lease.go b/pkg/kv/kvserver/batcheval/cmd_lease.go index 1cd213321b83..d03ceabce4c4 100644 --- a/pkg/kv/kvserver/batcheval/cmd_lease.go +++ b/pkg/kv/kvserver/batcheval/cmd_lease.go @@ -33,7 +33,7 @@ func newFailedLeaseTrigger(isTransfer bool) result.Result { return trigger } -// checkCanReceiveLease checks whether `wouldbeLeaseholder` can receive a lease. +// CheckCanReceiveLease checks whether `wouldbeLeaseholder` can receive a lease. // Returns an error if the respective replica is not eligible. // // An error is also returned is the replica is not part of `rngDesc`. @@ -45,7 +45,7 @@ func newFailedLeaseTrigger(isTransfer bool) result.Result { // latencies. Additionally, as of the time of writing, learner replicas are // only used for a short time in replica addition, so it's not worth working // out the edge cases. -func checkCanReceiveLease( +func CheckCanReceiveLease( wouldbeLeaseholder roachpb.ReplicaDescriptor, rngDesc *roachpb.RangeDescriptor, ) error { repDesc, ok := rngDesc.GetReplicaDescriptorByID(wouldbeLeaseholder.ReplicaID) diff --git a/pkg/kv/kvserver/batcheval/cmd_lease_request.go b/pkg/kv/kvserver/batcheval/cmd_lease_request.go index 0dd42ce97567..c41c09802642 100644 --- a/pkg/kv/kvserver/batcheval/cmd_lease_request.go +++ b/pkg/kv/kvserver/batcheval/cmd_lease_request.go @@ -58,7 +58,7 @@ func RequestLease( // If this check is removed at some point, the filtering of learners on the // sending side would have to be removed as well. 
- if err := checkCanReceiveLease(args.Lease.Replica, cArgs.EvalCtx.Desc()); err != nil { + if err := CheckCanReceiveLease(args.Lease.Replica, cArgs.EvalCtx.Desc()); err != nil { rErr.Message = err.Error() return newFailedLeaseTrigger(false /* isTransfer */), rErr } diff --git a/pkg/kv/kvserver/batcheval/cmd_lease_test.go b/pkg/kv/kvserver/batcheval/cmd_lease_test.go index ec707f9d4f8e..e90af83e7ddd 100644 --- a/pkg/kv/kvserver/batcheval/cmd_lease_test.go +++ b/pkg/kv/kvserver/batcheval/cmd_lease_test.go @@ -156,3 +156,38 @@ func TestLeaseCommandLearnerReplica(t *testing.T) { `replica (n2,s2):2LEARNER of type LEARNER cannot hold lease` require.EqualError(t, err, expForLearner) } + +func TestCheckCanReceiveLease(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + for _, tc := range []struct { + leaseholderType roachpb.ReplicaType + eligible bool + }{ + {leaseholderType: roachpb.VOTER_FULL, eligible: true}, + {leaseholderType: roachpb.VOTER_INCOMING, eligible: false}, + {leaseholderType: roachpb.VOTER_OUTGOING, eligible: false}, + {leaseholderType: roachpb.VOTER_DEMOTING, eligible: false}, + {leaseholderType: roachpb.LEARNER, eligible: false}, + {leaseholderType: roachpb.NON_VOTER, eligible: false}, + } { + t.Run(tc.leaseholderType.String(), func(t *testing.T) { + repDesc := roachpb.ReplicaDescriptor{ + ReplicaID: 1, + Type: &tc.leaseholderType, + } + rngDesc := roachpb.RangeDescriptor{ + InternalReplicas: []roachpb.ReplicaDescriptor{repDesc}, + } + err := CheckCanReceiveLease(rngDesc.InternalReplicas[0], &rngDesc) + require.Equal(t, tc.eligible, err == nil, "err: %v", err) + }) + } + + t.Run("replica not in range desc", func(t *testing.T) { + repDesc := roachpb.ReplicaDescriptor{ReplicaID: 1} + rngDesc := roachpb.RangeDescriptor{} + require.Regexp(t, "replica.*not found", CheckCanReceiveLease(repDesc, &rngDesc)) + }) +} diff --git a/pkg/kv/kvserver/batcheval/cmd_lease_transfer.go b/pkg/kv/kvserver/batcheval/cmd_lease_transfer.go index 
6335b5514800..33f9c6094c61 100644 --- a/pkg/kv/kvserver/batcheval/cmd_lease_transfer.go +++ b/pkg/kv/kvserver/batcheval/cmd_lease_transfer.go @@ -51,7 +51,7 @@ func TransferLease( // If this check is removed at some point, the filtering of learners on the // sending side would have to be removed as well. - if err := checkCanReceiveLease(args.Lease.Replica, cArgs.EvalCtx.Desc()); err != nil { + if err := CheckCanReceiveLease(args.Lease.Replica, cArgs.EvalCtx.Desc()); err != nil { return newFailedLeaseTrigger(true /* isTransfer */), err } diff --git a/pkg/kv/kvserver/replica_proposal_buf.go b/pkg/kv/kvserver/replica_proposal_buf.go index 041163e272d3..c7488f385a66 100644 --- a/pkg/kv/kvserver/replica_proposal_buf.go +++ b/pkg/kv/kvserver/replica_proposal_buf.go @@ -15,6 +15,7 @@ import ( "sync" "sync/atomic" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/protoutil" @@ -154,6 +155,21 @@ type propBuf struct { } } +type rangeLeaderInfo struct { + // leaderKnown is set if the local Raft machinery knows who the leader is. If + // not set, all other fields are empty. + leaderKnown bool + + // leader represents the Raft group's leader. Not set if leaderKnown is not + // set. + leader roachpb.ReplicaID + // iAmTheLeader is set if the local replica is the leader. + iAmTheLeader bool + // leaderEligibleForLease is set if the leader is known and its type of + // replica allows it to acquire a lease. + leaderEligibleForLease bool +} + // A proposer is an object that uses a propBuf to coordinate Raft proposals. type proposer interface { locker() sync.Locker @@ -166,6 +182,7 @@ type proposer interface { // The following require the proposer to hold an exclusive lock. 
withGroupLocked(func(proposerRaft) error) error registerProposalLocked(*ProposalData) + leaderStatusRLocked(raftGroup proposerRaft) rangeLeaderInfo // rejectProposalWithRedirectLocked rejects a proposal and redirects the // proposer to try it on another node. This is used to sometimes reject lease // acquisitions when another replica is the leader; the intended consequence @@ -430,19 +447,14 @@ func (b *propBuf) FlushLockedWithRaftGroup( // Figure out leadership info. We'll use it to conditionally drop some // requests. - var leaderKnown, iAmTheLeader bool - var leader roachpb.ReplicaID + var leaderInfo rangeLeaderInfo if raftGroup != nil { - status := raftGroup.BasicStatus() - iAmTheLeader = status.RaftState == raft.StateLeader - leaderKnown = status.Lead != raft.None - if leaderKnown { - leader = roachpb.ReplicaID(status.Lead) - if !iAmTheLeader && leader == b.p.replicaID() { - log.Fatalf(ctx, - "inconsistent Raft state: state %s while the current replica is also the lead: %d", - status.RaftState, leader) - } + leaderInfo = b.p.leaderStatusRLocked(raftGroup) + // Sanity check. + if leaderInfo.leaderKnown && leaderInfo.leader == b.p.replicaID() && !leaderInfo.iAmTheLeader { + log.Fatalf(ctx, + "inconsistent Raft state: state %s while the current replica is also the lead: %d", + raftGroup.BasicStatus().RaftState, leaderInfo.leader) } } @@ -484,14 +496,24 @@ func (b *propBuf) FlushLockedWithRaftGroup( // ErrProposalDropped. We'll eventually re-propose it once a leader is // known, at which point it will either go through or be rejected based on // whether or not it is this replica that became the leader. - if !iAmTheLeader && p.Request.IsLeaseRequest() { - if leaderKnown && !b.testing.allowLeaseProposalWhenNotLeader { + // + // A special case is when the leader is known, but is ineligible to get the + // lease. In that case, we have no choice but to continue with the proposal. 
+ if !leaderInfo.iAmTheLeader && p.Request.IsLeaseRequest() { + leaderKnownAndEligible := leaderInfo.leaderKnown && leaderInfo.leaderEligibleForLease + if leaderKnownAndEligible && !b.testing.allowLeaseProposalWhenNotLeader { log.VEventf(ctx, 2, "not proposing lease acquisition because we're not the leader; replica %d is", - leader) - b.p.rejectProposalWithRedirectLocked(ctx, p, leader) + leaderInfo.leader) + b.p.rejectProposalWithRedirectLocked(ctx, p, leaderInfo.leader) continue } - // If the leader is not known, continue with the proposal as explained above. + // If the leader is not known, or if it is known but it's ineligible for + // the lease, continue with the proposal as explained above. + if !leaderInfo.leaderKnown { + log.VEventf(ctx, 2, "proposing lease acquisition even though we're not the leader; the leader is unknown") + } else { + log.VEventf(ctx, 2, "proposing lease acquisition even though we're not the leader; the leader is ineligible") + } } // Raft processing bookkeeping. @@ -721,6 +743,38 @@ func (rp *replicaProposer) registerProposalLocked(p *ProposalData) { rp.mu.proposals[p.idKey] = p } +func (rp *replicaProposer) leaderStatusRLocked(raftGroup proposerRaft) rangeLeaderInfo { + r := (*Replica)(rp) + + status := raftGroup.BasicStatus() + iAmTheLeader := status.RaftState == raft.StateLeader + leader := status.Lead + leaderKnown := leader != raft.None + var leaderEligibleForLease bool + rangeDesc := r.descRLocked() + if leaderKnown { + // Figure out if the leader is eligible for getting a lease. + leaderRep, ok := rangeDesc.GetReplicaDescriptorByID(roachpb.ReplicaID(leader)) + if !ok { + // There is a leader, but it's not part of our descriptor. The descriptor + // must be stale, so we are behind in applying the log. We don't want the + // lease ourselves (as we're behind), so let's assume that the leader is + // eligible. If it proves that it isn't, we might be asked to get the + // lease again, and by then hopefully we will have caught up. 
+ leaderEligibleForLease = true + } else { + err := batcheval.CheckCanReceiveLease(leaderRep, rangeDesc) + leaderEligibleForLease = err == nil + } + } + return rangeLeaderInfo{ + leaderKnown: leaderKnown, + leader: roachpb.ReplicaID(leader), + iAmTheLeader: iAmTheLeader, + leaderEligibleForLease: leaderEligibleForLease, + } +} + // rejectProposalWithRedirectLocked is part of the proposer interface. func (rp *replicaProposer) rejectProposalWithRedirectLocked( ctx context.Context, prop *ProposalData, redirectTo roachpb.ReplicaID, @@ -728,11 +782,11 @@ func (rp *replicaProposer) rejectProposalWithRedirectLocked( r := (*Replica)(rp) rangeDesc := r.descRLocked() storeID := r.store.StoreID() - leaderRep, _ /* ok */ := rangeDesc.GetReplicaDescriptorByID(redirectTo) + redirectRep, _ /* ok */ := rangeDesc.GetReplicaDescriptorByID(redirectTo) speculativeLease := &roachpb.Lease{ - Replica: leaderRep, + Replica: redirectRep, } - log.VEventf(ctx, 2, "redirecting proposal to node %s; request: %s", leaderRep.NodeID, prop.Request) + log.VEventf(ctx, 2, "redirecting proposal to node %s; request: %s", redirectRep.NodeID, prop.Request) r.cleanupFailedProposalLocked(prop) prop.finishApplication(ctx, proposalResult{ Err: roachpb.NewError(newNotLeaseHolderError( diff --git a/pkg/kv/kvserver/replica_proposal_buf_test.go b/pkg/kv/kvserver/replica_proposal_buf_test.go index 534a7dcffb00..6447859c000d 100644 --- a/pkg/kv/kvserver/replica_proposal_buf_test.go +++ b/pkg/kv/kvserver/replica_proposal_buf_test.go @@ -17,6 +17,7 @@ import ( "testing" "time" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/util/leaktest" @@ -44,6 +45,15 @@ type testProposer struct { // If not nil, this is called by RejectProposalWithRedirectLocked(). If nil, // RejectProposalWithRedirectLocked() panics. 
onRejectProposalWithRedirectLocked func(prop *ProposalData, redirectTo roachpb.ReplicaID) + + // leaderReplicaInDescriptor is set if the leader (as indicated by raftGroup) + // is known, and that leader is part of the range's descriptor (as seen by the + // current replica). This can be used to simulate the local replica being so + // far behind that it doesn't have an up to date descriptor. + leaderReplicaInDescriptor bool + // If leaderReplicaInDescriptor is set, this specifies what type of replica it + // is. Some types of replicas are not eligible to get a lease. + leaderReplicaType roachpb.ReplicaType } type testProposerRaft struct { @@ -99,6 +109,39 @@ func (t *testProposer) registerProposalLocked(p *ProposalData) { t.registered++ } +func (t *testProposer) leaderStatusRLocked(raftGroup proposerRaft) rangeLeaderInfo { + leaderKnown := raftGroup.BasicStatus().Lead != raft.None + var leaderRep roachpb.ReplicaID + var iAmTheLeader, leaderEligibleForLease bool + if leaderKnown { + leaderRep = roachpb.ReplicaID(raftGroup.BasicStatus().Lead) + iAmTheLeader = leaderRep == t.replicaID() + repDesc := roachpb.ReplicaDescriptor{ + ReplicaID: leaderRep, + Type: &t.leaderReplicaType, + } + + if t.leaderReplicaInDescriptor { + // Fill in a RangeDescriptor just enough for the CheckCanReceiveLease() + // call. + rngDesc := roachpb.RangeDescriptor{ + InternalReplicas: []roachpb.ReplicaDescriptor{repDesc}, + } + err := batcheval.CheckCanReceiveLease(repDesc, &rngDesc) + leaderEligibleForLease = err == nil + } else { + // This matches replicaProposed.leaderStatusRLocked(). 
+ leaderEligibleForLease = true + } + } + return rangeLeaderInfo{ + leaderKnown: leaderKnown, + leader: leaderRep, + iAmTheLeader: iAmTheLeader, + leaderEligibleForLease: leaderEligibleForLease, + } +} + func (t *testProposer) rejectProposalWithRedirectLocked( ctx context.Context, prop *ProposalData, redirectTo roachpb.ReplicaID, ) { @@ -391,9 +434,16 @@ func TestProposalBufferRejectLeaseAcqOnFollower(t *testing.T) { // Each subtest will try to propose a lease acquisition in a different Raft // scenario. Some proposals should be allowed, some should be rejected. for _, tc := range []struct { - name string - state raft.StateType - leader uint64 + name string + state raft.StateType + // raft.None means there's no leader, or the leader is unknown. + leader uint64 + // Empty means VOTER_FULL. + leaderRepType roachpb.ReplicaType + // Set to simulate situations where the local replica is so behind that the + // leader is not even part of the range descriptor. + leaderNotInRngDesc bool + expRejection bool }{ { @@ -404,7 +454,7 @@ func TestProposalBufferRejectLeaseAcqOnFollower(t *testing.T) { expRejection: false, }, { - name: "follower known leader", + name: "follower, known eligible leader", state: raft.StateFollower, // Someone else is leader. leader: self + 1, @@ -412,7 +462,28 @@ func TestProposalBufferRejectLeaseAcqOnFollower(t *testing.T) { expRejection: true, }, { - name: "follower unknown leader", + name: "follower, known ineligible leader", + state: raft.StateFollower, + // Someone else is leader. + leader: self + 1, + // The leader type makes it ineligible to get the lease. Thus, the local + // proposal will not be rejected. + leaderRepType: roachpb.VOTER_DEMOTING, + expRejection: false, + }, + { + // Here we simulate the leader being known by Raft, but the local replica + // is so far behind that it doesn't contain the leader replica. + name: "follower, known leader not in range descriptor", + state: raft.StateFollower, + // Someone else is leader. 
+ leader: self + 1, + leaderNotInRngDesc: true, + // We assume that the leader is eligible, and redirect. + expRejection: true, + }, + { + name: "follower, unknown leader", state: raft.StateFollower, // Unknown leader. leader: raft.None, @@ -448,8 +519,13 @@ func TestProposalBufferRejectLeaseAcqOnFollower(t *testing.T) { Lead: tc.leader, }, } - r := testProposerRaft{status: raftStatus} + r := testProposerRaft{ + status: raftStatus, + } p.raftGroup = r + p.leaderReplicaInDescriptor = !tc.leaderNotInRngDesc + p.leaderReplicaType = tc.leaderRepType + var b propBuf b.Init(&p)