From f4990e9ae8a59b06396cbf6cb17f919334dc79df Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Tue, 4 Oct 2022 17:58:38 -0400 Subject: [PATCH] kv: bypass lease transfer safety checks during joint consensus Fixes #88667. This commit adds logic to bypass lease transfer safety checks (added in 034611b) when in a joint configuration and transferring the lease from a VOTER_DEMOTING to a VOTER_INCOMING. We do so because we could get stuck without a path to exit the joint configuration if we rejected this lease transfer while waiting to confirm that the target is up-to-date on its log. That confirmation may never arrive if the target is dead or partitioned away, and while we'd rather not transfer the lease to a dead node, at least we have a mechanism to recovery from that state. We also just sent the VOTER_INCOMING a snapshot (as a LEARNER, before promotion), so it is unlikely that the replica is actually dead or behind on its log. A better alternative here would be to introduce a mechanism to choose an alternate lease transfer target after some amount of time, if the lease transfer to the VOTER_INCOMING cannot be confirmed to be safe. We may do this in the future, but given the proximity to the release and given that this matches the behavior in v22.1, we choose this approach for now. Release note: None Release justification: Needed to resolve release blocker. --- pkg/kv/batch.go | 7 ++++-- pkg/kv/db.go | 13 +++++++++++- pkg/kv/kvserver/client_lease_test.go | 9 +++++--- pkg/kv/kvserver/client_replica_test.go | 23 ++++++++++---------- pkg/kv/kvserver/replica_command.go | 16 ++++++++++++-- pkg/kv/kvserver/replica_init.go | 1 - pkg/kv/kvserver/replica_proposal_buf.go | 6 +----- pkg/kv/kvserver/replica_range_lease.go | 26 +++++++++++++++++------ pkg/kv/kvserver/replica_rangefeed_test.go | 2 +- pkg/kv/kvserver/replica_send.go | 2 +- pkg/kv/kvserver/replica_test.go | 7 +++--- pkg/kv/kvserver/replicate_queue.go | 4 ++-- pkg/kv/kvserver/testing_knobs.go | 4 ---- pkg/roachpb/api.proto | 16 ++++++++++++++ 14 files changed, 92 insertions(+), 44 deletions(-) diff --git a/pkg/kv/batch.go b/pkg/kv/batch.go index 06c21ca2a1d5..449a4a342ce1 100644 --- a/pkg/kv/batch.go +++ b/pkg/kv/batch.go @@ -735,7 +735,9 @@ func (b *Batch) adminUnsplit(splitKeyIn interface{}) { // adminTransferLease is only exported on DB. It is here for symmetry with the // other operations. -func (b *Batch) adminTransferLease(key interface{}, target roachpb.StoreID) { +func (b *Batch) adminTransferLease( + key interface{}, target roachpb.StoreID, bypassSafetyChecks bool, +) { k, err := marshalKey(key) if err != nil { b.initResult(0, 0, notRaw, err) @@ -745,7 +747,8 @@ func (b *Batch) adminTransferLease(key interface{}, target roachpb.StoreID) { RequestHeader: roachpb.RequestHeader{ Key: k, }, - Target: target, + Target: target, + BypassSafetyChecks: bypassSafetyChecks, } b.appendReqs(req) b.initResult(1, 0, notRaw, nil) diff --git a/pkg/kv/db.go b/pkg/kv/db.go index c45d71fa99bd..a0bce7f5f1be 100644 --- a/pkg/kv/db.go +++ b/pkg/kv/db.go @@ -649,7 +649,18 @@ func (db *DB) AdminTransferLease( ctx context.Context, key interface{}, target roachpb.StoreID, ) error { b := &Batch{} - b.adminTransferLease(key, target) + b.adminTransferLease(key, target, false /* bypassSafetyChecks */) + return getOneErr(db.Run(ctx, b), b) +} + +// AdminTransferLeaseBypassingSafetyChecks is like AdminTransferLease, but +// configures the lease transfer to bypass safety checks. See the comment on +// AdminTransferLeaseRequest.BypassSafetyChecks for details. +func (db *DB) AdminTransferLeaseBypassingSafetyChecks( + ctx context.Context, key interface{}, target roachpb.StoreID, +) error { + b := &Batch{} + b.adminTransferLease(key, target, true /* bypassSafetyChecks */) return getOneErr(db.Run(ctx, b), b) } diff --git a/pkg/kv/kvserver/client_lease_test.go b/pkg/kv/kvserver/client_lease_test.go index 44f428a6030d..72f47ac77314 100644 --- a/pkg/kv/kvserver/client_lease_test.go +++ b/pkg/kv/kvserver/client_lease_test.go @@ -398,13 +398,16 @@ func internalTransferLeaseFailureDuringJointConfig(t *testing.T, isManual bool) // range when they are being proposed. var scratchRangeID int64 shouldFailProposal := func(args kvserverbase.ProposalFilterArgs) bool { - // Block if a ChangeReplicas command is removing a node from our range. return args.Req.RangeID == roachpb.RangeID(atomic.LoadInt64(&scratchRangeID)) && args.Req.IsSingleTransferLeaseRequest() } + const failureMsg = "injected lease transfer" knobs.Store.(*kvserver.StoreTestingKnobs).TestingProposalFilter = func(args kvserverbase.ProposalFilterArgs) *roachpb.Error { if shouldFailProposal(args) { - return roachpb.NewErrorf("Injecting lease transfer failure") + // The lease transfer should be configured to bypass safety checks. + // See maybeTransferLeaseDuringLeaveJoint for an explanation. + require.True(t, args.Req.Requests[0].GetTransferLease().BypassSafetyChecks) + return roachpb.NewErrorf(failureMsg) } return nil } @@ -430,7 +433,7 @@ func internalTransferLeaseFailureDuringJointConfig(t *testing.T, isManual bool) {ChangeType: roachpb.ADD_VOTER, Target: tc.Target(3)}, }) require.Error(t, err) - require.Regexp(t, "Injecting lease transfer failure", err) + require.Regexp(t, failureMsg, err) // We're now in a joint configuration, n1 already revoked its lease but all // other replicas think n1 is the leaseholder. As long as n1 is alive, it is diff --git a/pkg/kv/kvserver/client_replica_test.go b/pkg/kv/kvserver/client_replica_test.go index 64e702d94149..4b50b3e397b7 100644 --- a/pkg/kv/kvserver/client_replica_test.go +++ b/pkg/kv/kvserver/client_replica_test.go @@ -1589,7 +1589,7 @@ func TestLeaseExpirationBasedRangeTransfer(t *testing.T) { origLease, _ := l.replica0.GetLease() { // Transferring the lease to ourself should be a no-op. - if err := l.replica0.AdminTransferLease(ctx, l.replica0Desc.StoreID); err != nil { + if err := l.replica0.AdminTransferLease(ctx, l.replica0Desc.StoreID, false /* bypassSafetyChecks */); err != nil { t.Fatal(err) } newLease, _ := l.replica0.GetLease() @@ -1601,12 +1601,12 @@ func TestLeaseExpirationBasedRangeTransfer(t *testing.T) { { // An invalid target should result in an error. const expected = "unable to find store .* in range" - if err := l.replica0.AdminTransferLease(ctx, 1000); !testutils.IsError(err, expected) { + if err := l.replica0.AdminTransferLease(ctx, 1000, false /* bypassSafetyChecks */); !testutils.IsError(err, expected) { t.Fatalf("expected %s, but found %v", expected, err) } } - if err := l.replica0.AdminTransferLease(ctx, l.replica1Desc.StoreID); err != nil { + if err := l.replica0.AdminTransferLease(ctx, l.replica1Desc.StoreID, false /* bypassSafetyChecks */); err != nil { t.Fatal(err) } @@ -1651,7 +1651,7 @@ func TestLeaseExpirationBasedRangeTransferWithExtension(t *testing.T) { l := setupLeaseTransferTest(t) defer l.tc.Stopper().Stop(ctx) // Ensure that replica1 has the lease. - if err := l.replica0.AdminTransferLease(ctx, l.replica1Desc.StoreID); err != nil { + if err := l.replica0.AdminTransferLease(ctx, l.replica1Desc.StoreID, false /* bypassSafetyChecks */); err != nil { t.Fatal(err) } l.checkHasLease(t, 1) @@ -1673,7 +1673,7 @@ func TestLeaseExpirationBasedRangeTransferWithExtension(t *testing.T) { transferErrCh := make(chan error) go func() { // Transfer back from replica1 to replica0. - err := l.replica1.AdminTransferLease(context.Background(), l.replica0Desc.StoreID) + err := l.replica1.AdminTransferLease(context.Background(), l.replica0Desc.StoreID, false /* bypassSafetyChecks */) // Ignore not leaseholder errors which can arise due to re-proposals. if errors.HasType(err, (*roachpb.NotLeaseHolderError)(nil)) { err = nil @@ -1738,7 +1738,7 @@ func TestLeaseExpirationBasedDrainTransferWithExtension(t *testing.T) { l := setupLeaseTransferTest(t) defer l.tc.Stopper().Stop(ctx) // Ensure that replica1 has the lease. - if err := l.replica0.AdminTransferLease(ctx, l.replica1Desc.StoreID); err != nil { + if err := l.replica0.AdminTransferLease(ctx, l.replica1Desc.StoreID, false /* bypassSafetyChecks */); err != nil { t.Fatal(err) } l.checkHasLease(t, 1) @@ -1791,7 +1791,7 @@ func TestLeaseExpirationBelowFutureTimeRequest(t *testing.T) { // Ensure that replica1 has the lease, and that replica0 has also picked up // on the lease transfer. - require.NoError(t, l.replica0.AdminTransferLease(ctx, l.replica1Desc.StoreID)) + require.NoError(t, l.replica0.AdminTransferLease(ctx, l.replica1Desc.StoreID, false /* bypassSafetyChecks */)) l.checkHasLease(t, 1) preLease, _ := l.replica1.GetLease() require.Eventually(t, func() bool { @@ -2780,9 +2780,6 @@ func TestLeaseTransferInSnapshotUpdatesTimestampCache(t *testing.T) { ReplicationMode: base.ReplicationManual, ServerArgs: base.TestServerArgs{ Knobs: base.TestingKnobs{ - Store: &kvserver.StoreTestingKnobs{ - AllowLeaseTransfersWhenTargetMayNeedSnapshot: true, - }, Server: &server.TestingKnobs{ WallClock: manualClock, }, @@ -2872,7 +2869,11 @@ func TestLeaseTransferInSnapshotUpdatesTimestampCache(t *testing.T) { // Finally, transfer the lease to node 2 while it is still unavailable and // behind. We try to avoid this case when picking new leaseholders in practice, // but we're never 100% successful. - tc.TransferRangeLeaseOrFatal(t, *repl0.Desc(), tc.Target(2)) + // NOTE: we bypass safety checks because the target node is behind on its log, + // so the lease transfer would be rejected otherwise. + err = tc.Servers[0].DB().AdminTransferLeaseBypassingSafetyChecks(ctx, + repl0.Desc().StartKey.AsRawKey(), tc.Target(2).StoreID) + require.Nil(t, err) // Remove the partition. A snapshot to node 2 should follow. This snapshot // will inform node 2 that it is the new leaseholder for the range. Node 2 diff --git a/pkg/kv/kvserver/replica_command.go b/pkg/kv/kvserver/replica_command.go index 0d239b12ad48..7d4b57575afd 100644 --- a/pkg/kv/kvserver/replica_command.go +++ b/pkg/kv/kvserver/replica_command.go @@ -1230,7 +1230,19 @@ func (r *Replica) maybeTransferLeaseDuringLeaveJoint( } log.VEventf(ctx, 2, "leaseholder %v is being removed through an atomic "+ "replication change, transferring lease to %v", voterDemotingTarget, voterIncomingTarget) - err = r.store.DB().AdminTransferLease(ctx, r.startKey, voterIncomingTarget.StoreID) + // We bypass safety checks when transferring the lease to the VOTER_INCOMING. + // We do so because we could get stuck without a path to exit the joint + // configuration if we rejected this lease transfer while waiting to confirm + // that the target is up-to-date on its log. That confirmation may never + // arrive if the target is dead or partitioned away, and while we'd rather not + // transfer the lease to a dead node, at least we have a mechanism to recovery + // from that state. We also just sent the VOTER_INCOMING a snapshot (as a + // LEARNER, before promotion), so it is unlikely that the replica is actually + // dead or behind on its log. + // TODO(nvanbenschoten): this isn't great. Instead of bypassing safety checks, + // we should build a mechanism to choose an alternate lease transfer target + // after some amount of time. + err = r.store.DB().AdminTransferLeaseBypassingSafetyChecks(ctx, r.startKey, voterIncomingTarget.StoreID) if err != nil { return err } @@ -3637,7 +3649,7 @@ func (r *Replica) adminScatter( targetStoreID := potentialLeaseTargets[newLeaseholderIdx].StoreID if targetStoreID != r.store.StoreID() { log.VEventf(ctx, 2, "randomly transferring lease to s%d", targetStoreID) - if err := r.AdminTransferLease(ctx, targetStoreID); err != nil { + if err := r.AdminTransferLease(ctx, targetStoreID, false /* bypassSafetyChecks */); err != nil { log.Warningf(ctx, "failed to scatter lease to s%d: %+v", targetStoreID, err) } } diff --git a/pkg/kv/kvserver/replica_init.go b/pkg/kv/kvserver/replica_init.go index c25f4fde2817..d60bf9600cfb 100644 --- a/pkg/kv/kvserver/replica_init.go +++ b/pkg/kv/kvserver/replica_init.go @@ -103,7 +103,6 @@ func newUnloadedReplica( r.mu.checksums = map[uuid.UUID]*replicaChecksum{} r.mu.proposalBuf.Init((*replicaProposer)(r), tracker.NewLockfreeTracker(), r.Clock(), r.ClusterSettings()) r.mu.proposalBuf.testing.allowLeaseProposalWhenNotLeader = store.cfg.TestingKnobs.AllowLeaseRequestProposalsWhenNotLeader - r.mu.proposalBuf.testing.allowLeaseTransfersWhenTargetMayNeedSnapshot = store.cfg.TestingKnobs.AllowLeaseTransfersWhenTargetMayNeedSnapshot r.mu.proposalBuf.testing.dontCloseTimestamps = store.cfg.TestingKnobs.DontCloseTimestamps r.mu.proposalBuf.testing.submitProposalFilter = store.cfg.TestingKnobs.TestingProposalSubmitFilter diff --git a/pkg/kv/kvserver/replica_proposal_buf.go b/pkg/kv/kvserver/replica_proposal_buf.go index 5eb0e402e6f5..3f6d56d0e9f8 100644 --- a/pkg/kv/kvserver/replica_proposal_buf.go +++ b/pkg/kv/kvserver/replica_proposal_buf.go @@ -111,10 +111,6 @@ type propBuf struct { // heartbeats and then expect other replicas to take the lease without // worrying about Raft). allowLeaseProposalWhenNotLeader bool - // allowLeaseTransfersWhenTargetMayNeedSnapshot, if set, makes the proposal - // buffer allow lease request proposals even when the proposer cannot prove - // that the lease transfer target does not need a Raft snapshot. - allowLeaseTransfersWhenTargetMayNeedSnapshot bool // dontCloseTimestamps inhibits the closing of timestamps. dontCloseTimestamps bool } @@ -699,7 +695,7 @@ func (b *propBuf) maybeRejectUnsafeProposalLocked( newLease := p.command.ReplicatedEvalResult.State.Lease newLeaseTarget := newLease.Replica.ReplicaID snapStatus := raftutil.ReplicaMayNeedSnapshot(&status, firstIndex, newLeaseTarget) - if snapStatus != raftutil.NoSnapshotNeeded && !b.testing.allowLeaseTransfersWhenTargetMayNeedSnapshot { + if snapStatus != raftutil.NoSnapshotNeeded && !p.Request.Requests[0].GetTransferLease().BypassSafetyChecks { b.p.rejectProposalWithLeaseTransferRejectedLocked(ctx, p, newLease, snapStatus) return true } diff --git a/pkg/kv/kvserver/replica_range_lease.go b/pkg/kv/kvserver/replica_range_lease.go index e24c8afb2822..d924631b8e8f 100644 --- a/pkg/kv/kvserver/replica_range_lease.go +++ b/pkg/kv/kvserver/replica_range_lease.go @@ -198,6 +198,7 @@ func (p *pendingLeaseRequest) InitOrJoinRequest( status kvserverpb.LeaseStatus, startKey roachpb.Key, transfer bool, + bypassSafetyChecks bool, ) *leaseRequestHandle { if nextLease, ok := p.RequestPending(); ok { if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID { @@ -292,11 +293,19 @@ func (p *pendingLeaseRequest) InitOrJoinRequest( var leaseReq roachpb.Request if transfer { leaseReq = &roachpb.TransferLeaseRequest{ - RequestHeader: reqHeader, - Lease: reqLease, - PrevLease: status.Lease, + RequestHeader: reqHeader, + Lease: reqLease, + PrevLease: status.Lease, + BypassSafetyChecks: bypassSafetyChecks, } } else { + if bypassSafetyChecks { + // TODO(nvanbenschoten): we could support a similar bypassSafetyChecks + // flag for RequestLeaseRequest, which would disable the protection in + // propBuf.maybeRejectUnsafeProposalLocked. For now, we use a testing + // knob. + log.Fatal(ctx, "bypassSafetyChecks not supported for RequestLeaseRequest") + } minProposedTS := p.repl.mu.minLeaseProposedTS leaseReq = &roachpb.RequestLeaseRequest{ RequestHeader: reqHeader, @@ -842,7 +851,8 @@ func (r *Replica) requestLeaseLocked( return r.mu.pendingLeaseRequest.newResolvedHandle(roachpb.NewError(err)) } return r.mu.pendingLeaseRequest.InitOrJoinRequest( - ctx, repDesc, status, r.mu.state.Desc.StartKey.AsRawKey(), false /* transfer */) + ctx, repDesc, status, r.mu.state.Desc.StartKey.AsRawKey(), + false /* transfer */, false /* bypassSafetyChecks */) } // AdminTransferLease transfers the LeaderLease to another replica. Only the @@ -867,7 +877,9 @@ func (r *Replica) requestLeaseLocked( // replica. Otherwise, a NotLeaseHolderError is returned. // // AdminTransferLease implements the ReplicaLeaseMover interface. -func (r *Replica) AdminTransferLease(ctx context.Context, target roachpb.StoreID) error { +func (r *Replica) AdminTransferLease( + ctx context.Context, target roachpb.StoreID, bypassSafetyChecks bool, +) error { // initTransferHelper inits a transfer if no extension is in progress. // It returns a channel for waiting for the result of a pending // extension (if any is in progress) and a channel for waiting for the @@ -921,7 +933,7 @@ func (r *Replica) AdminTransferLease(ctx context.Context, target roachpb.StoreID raftStatus := r.raftStatusRLocked() raftFirstIndex := r.raftFirstIndexRLocked() snapStatus := raftutil.ReplicaMayNeedSnapshot(raftStatus, raftFirstIndex, nextLeaseHolder.ReplicaID) - if snapStatus != raftutil.NoSnapshotNeeded && !r.store.TestingKnobs().AllowLeaseTransfersWhenTargetMayNeedSnapshot { + if snapStatus != raftutil.NoSnapshotNeeded && !bypassSafetyChecks { r.store.metrics.LeaseTransferErrorCount.Inc(1) log.VEventf(ctx, 2, "not initiating lease transfer because the target %s may "+ "need a snapshot: %s", nextLeaseHolder, snapStatus) @@ -930,7 +942,7 @@ func (r *Replica) AdminTransferLease(ctx context.Context, target roachpb.StoreID } transfer = r.mu.pendingLeaseRequest.InitOrJoinRequest( - ctx, nextLeaseHolder, status, desc.StartKey.AsRawKey(), true, /* transfer */ + ctx, nextLeaseHolder, status, desc.StartKey.AsRawKey(), true /* transfer */, bypassSafetyChecks, ) return nil, transfer, nil } diff --git a/pkg/kv/kvserver/replica_rangefeed_test.go b/pkg/kv/kvserver/replica_rangefeed_test.go index 680c4890012d..55f0a2e7a06d 100644 --- a/pkg/kv/kvserver/replica_rangefeed_test.go +++ b/pkg/kv/kvserver/replica_rangefeed_test.go @@ -830,7 +830,7 @@ func TestReplicaRangefeedRetryErrors(t *testing.T) { if raftStatus != nil && raftStatus.RaftState == raft.StateFollower { return nil } - err = repl.AdminTransferLease(ctx, roachpb.StoreID(1)) + err = repl.AdminTransferLease(ctx, roachpb.StoreID(1), false /* bypassSafetyChecks */) // NB: errors.Wrapf(nil, ...) returns nil. // nolint:errwrap return errors.Errorf("not raft follower: %+v, transferred lease: %v", raftStatus, err) diff --git a/pkg/kv/kvserver/replica_send.go b/pkg/kv/kvserver/replica_send.go index 395c2e8a1a94..89c05635011d 100644 --- a/pkg/kv/kvserver/replica_send.go +++ b/pkg/kv/kvserver/replica_send.go @@ -934,7 +934,7 @@ func (r *Replica) executeAdminBatch( resp = &reply case *roachpb.AdminTransferLeaseRequest: - pErr = roachpb.NewError(r.AdminTransferLease(ctx, tArgs.Target)) + pErr = roachpb.NewError(r.AdminTransferLease(ctx, tArgs.Target, tArgs.BypassSafetyChecks)) resp = &roachpb.AdminTransferLeaseResponse{} case *roachpb.AdminChangeReplicasRequest: diff --git a/pkg/kv/kvserver/replica_test.go b/pkg/kv/kvserver/replica_test.go index a434d55ea422..e250338698ff 100644 --- a/pkg/kv/kvserver/replica_test.go +++ b/pkg/kv/kvserver/replica_test.go @@ -616,9 +616,6 @@ func TestBehaviorDuringLeaseTransfer(t *testing.T) { tsc := TestStoreConfig(clock) var leaseAcquisitionTrap atomic.Value tsc.TestingKnobs.DisableAutomaticLeaseRenewal = true - // We're transferring the lease to a bogus replica, so disable protection - // which would otherwise notice this and reject the lease transfer. - tsc.TestingKnobs.AllowLeaseTransfersWhenTargetMayNeedSnapshot = true tsc.TestingKnobs.LeaseRequestEvent = func(ts hlc.Timestamp, _ roachpb.StoreID, _ roachpb.RangeID) *roachpb.Error { val := leaseAcquisitionTrap.Load() if val == nil { @@ -669,7 +666,9 @@ func TestBehaviorDuringLeaseTransfer(t *testing.T) { // Initiate a transfer (async) and wait for it to be blocked. transferResChan := make(chan error) go func() { - err := tc.repl.AdminTransferLease(ctx, secondReplica.StoreID) + // We're transferring the lease to a bogus replica, so disable protection + // which would otherwise notice this and reject the lease transfer. + err := tc.repl.AdminTransferLease(ctx, secondReplica.StoreID, true /* bypassSafetyChecks */) if !testutils.IsError(err, "injected") { transferResChan <- err } else { diff --git a/pkg/kv/kvserver/replicate_queue.go b/pkg/kv/kvserver/replicate_queue.go index de99b16a494b..201c6b59dda8 100644 --- a/pkg/kv/kvserver/replicate_queue.go +++ b/pkg/kv/kvserver/replicate_queue.go @@ -1827,7 +1827,7 @@ func (rq *replicateQueue) shedLease( // ReplicaLeaseMover handles lease transfers for a single range. type ReplicaLeaseMover interface { // AdminTransferLease moves the lease to the requested store. - AdminTransferLease(ctx context.Context, target roachpb.StoreID) error + AdminTransferLease(ctx context.Context, target roachpb.StoreID, bypassSafetyChecks bool) error // String returns info about the replica. String() string @@ -1860,7 +1860,7 @@ func (rq *replicateQueue) TransferLease( ) error { rq.metrics.TransferLeaseCount.Inc(1) log.KvDistribution.Infof(ctx, "transferring lease to s%d", target) - if err := rlm.AdminTransferLease(ctx, target); err != nil { + if err := rlm.AdminTransferLease(ctx, target, false /* bypassSafetyChecks */); err != nil { return errors.Wrapf(err, "%s: unable to transfer lease to s%d", rlm, target) } rq.lastLeaseTransfer.Store(timeutil.Now()) diff --git a/pkg/kv/kvserver/testing_knobs.go b/pkg/kv/kvserver/testing_knobs.go index 3b38d0f655ef..ea7f2d28a644 100644 --- a/pkg/kv/kvserver/testing_knobs.go +++ b/pkg/kv/kvserver/testing_knobs.go @@ -340,10 +340,6 @@ type StoreTestingKnobs struct { // heartbeats and then expect other replicas to take the lease without // worrying about Raft). AllowLeaseRequestProposalsWhenNotLeader bool - // AllowLeaseTransfersWhenTargetMayNeedSnapshot, if set, makes the Replica - // and proposal buffer allow lease request proposals even when the proposer - // cannot prove that the lease transfer target does not need a Raft snapshot. - AllowLeaseTransfersWhenTargetMayNeedSnapshot bool // LeaseTransferRejectedRetryLoopCount, if set, configures the maximum number // of retries for the retry loop used during lease transfers. This retry loop // retries after transfer attempts are rejected because the transfer is deemed diff --git a/pkg/roachpb/api.proto b/pkg/roachpb/api.proto index a0d6b29b9abc..7aa217014cc8 100644 --- a/pkg/roachpb/api.proto +++ b/pkg/roachpb/api.proto @@ -880,6 +880,14 @@ message AdminMergeResponse { message AdminTransferLeaseRequest { RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true]; int32 target = 2 [(gogoproto.casttype) = "StoreID"]; + // When set to true, bypass_safety_checks configures the lease transfer to + // skip safety checks that ensure that the transfer target is known to be + // (according to the outgoing leaseholder) alive and sufficiently caught up on + // its log. This option should be used sparingly — typically only by outgoing + // leaseholders who both have some other reason to believe that the target is + // alive and caught up on its log (e.g. they just sent it a snapshot) and also + // can't tolerate rejected lease transfers. + bool bypass_safety_checks = 3; } message AdminTransferLeaseResponse { @@ -1380,6 +1388,14 @@ message TransferLeaseRequest { // The previous lease is specified by the caller to verify // it has not changed when executing this command. Lease prev_lease = 3 [(gogoproto.nullable) = false]; + // When set to true, bypass_safety_checks configures the lease transfer to + // skip safety checks that ensure that the transfer target is known to be + // (according to the outgoing leaseholder) alive and sufficiently caught up on + // its log. This option should be used sparingly — typically only by outgoing + // leaseholders who both have some other reason to believe that the target is + // alive and caught up on its log (e.g. they just sent it a snapshot) and also + // can't tolerate rejected lease transfers. + bool bypass_safety_checks = 4; } // LeaseInfoRequest is the argument to the LeaseInfo() method, for getting