From d89a7e4588eac007444cbe411380789ec2468663 Mon Sep 17 00:00:00 2001
From: Nathan VanBenschoten
Date: Mon, 12 Sep 2022 19:07:06 -0400
Subject: [PATCH] kv: remove broken attempt to reject lease acquisitions on draining nodes

Related to #83261.

This commit removes "protection" that rejected lease acquisitions on
draining nodes. This protection had already been effectively disabled by
acc1ad1, which allowed Raft leaders to bypass the check. As the comment
here (added in 5ffaa9e) explained, Raft followers are already unable to
acquire the lease. If leaders bypass the check and followers (and
candidates) don't need it, the check is useless, so we remove it.

The commit also removes `TestReplicaDrainLease`, which was supposed to
test this behavior. We remove the test not because it started failing
after the change, but because it kept passing; it must not have been
testing anything real anymore after acc1ad1.

Release justification: low-risk change related to a release blocker.

Release note: None
---
 pkg/kv/kvserver/replica_proposal.go    |  15 ++-
 pkg/kv/kvserver/replica_range_lease.go |  25 -----
 pkg/kv/kvserver/replica_test.go        | 121 -------------------------
 3 files changed, 7 insertions(+), 154 deletions(-)

diff --git a/pkg/kv/kvserver/replica_proposal.go b/pkg/kv/kvserver/replica_proposal.go
index 308deef3c54b..6026021933dd 100644
--- a/pkg/kv/kvserver/replica_proposal.go
+++ b/pkg/kv/kvserver/replica_proposal.go
@@ -263,10 +263,13 @@ func (r *Replica) leasePostApplyLocked(
   leaseChangingHands := prevLease.Replica.StoreID != newLease.Replica.StoreID ||
     prevLease.Sequence != newLease.Sequence
   if iAmTheLeaseHolder {
-    // Log lease acquisition whenever an Epoch-based lease changes hands (or verbose
-    // logging is enabled).
-    if newLease.Type() == roachpb.LeaseEpoch && leaseChangingHands || log.V(1) {
-      log.VEventf(ctx, 1, "new range lease %s following %s", newLease, prevLease)
-    }
+    // Log lease acquisitions loudly when verbose logging is enabled or when the
+    // new leaseholder is draining, in which case it should be shedding leases.
+    // Otherwise, log a trace event.
+    if log.V(1) || r.store.IsDraining() {
+      log.Infof(ctx, "new range lease %s following %s", newLease, prevLease)
+    } else {
+      log.Eventf(ctx, "new range lease %s following %s", newLease, prevLease)
+    }
   }
 
@@ -421,10 +424,6 @@ func (r *Replica) leasePostApplyLocked(
         log.Errorf(ctx, "%v", err)
       }
     })
-    if leaseChangingHands && log.V(1) {
-      // This logging is useful to troubleshoot incomplete drains.
-      log.Info(ctx, "is now leaseholder")
-    }
   }
 
   // Inform the store of this lease.
diff --git a/pkg/kv/kvserver/replica_range_lease.go b/pkg/kv/kvserver/replica_range_lease.go
index fb318fdffe46..7643904c80c3 100644
--- a/pkg/kv/kvserver/replica_range_lease.go
+++ b/pkg/kv/kvserver/replica_range_lease.go
@@ -63,7 +63,6 @@ import (
   "github.com/cockroachdb/errors"
   "github.com/cockroachdb/logtags"
   "github.com/cockroachdb/redact"
-  "go.etcd.io/etcd/raft/v3"
 )
 
 var leaseStatusLogLimiter = func() *log.EveryN {
@@ -800,30 +799,6 @@ func (r *Replica) requestLeaseLocked(
     return r.mu.pendingLeaseRequest.newResolvedHandle(pErr)
   }
 
-  // If we're draining, we'd rather not take any new leases (since we're also
-  // trying to move leases away elsewhere). But if we're the leader, we don't
-  // really have a choice and we take the lease - there might not be any other
-  // replica available to take this lease (perhaps they're all draining).
-  if r.store.IsDraining() {
-    // NB: Replicas that are not the Raft leader will not take leases anyway
-    // (see the check inside propBuf.FlushLockedWithRaftGroup()), so we don't
-    // really need any special behavior for draining nodes here. This check
-    // serves mostly as a means to get more granular logging and as a defensive
-    // precaution.
-    if r.raftBasicStatusRLocked().RaftState != raft.StateLeader {
-      log.VEventf(ctx, 2, "refusing to take the lease because we're draining")
-      return r.mu.pendingLeaseRequest.newResolvedHandle(
-        roachpb.NewError(
-          newNotLeaseHolderError(
-            roachpb.Lease{}, r.store.StoreID(), r.mu.state.Desc,
-            "refusing to take the lease; node is draining",
-          ),
-        ),
-      )
-    }
-    log.Info(ctx, "trying to take the lease while we're draining since we're the raft leader")
-  }
-
   // Propose a Raft command to get a lease for this replica.
   repDesc, err := r.getReplicaDescriptorRLocked()
   if err != nil {
diff --git a/pkg/kv/kvserver/replica_test.go b/pkg/kv/kvserver/replica_test.go
index 6802942f2b08..c7da218fa72e 100644
--- a/pkg/kv/kvserver/replica_test.go
+++ b/pkg/kv/kvserver/replica_test.go
@@ -1351,127 +1351,6 @@ func TestReplicaLeaseRejectUnknownRaftNodeID(t *testing.T) {
   }
 }
 
-// Test that draining nodes only take the lease if they're the leader.
-func TestReplicaDrainLease(t *testing.T) {
-  defer leaktest.AfterTest(t)()
-  defer log.Scope(t).Close(t)
-  ctx := context.Background()
-  clusterArgs := base.TestClusterArgs{
-    ReplicationMode: base.ReplicationManual,
-    ServerArgs: base.TestServerArgs{
-      Knobs: base.TestingKnobs{
-        NodeLiveness: NodeLivenessTestingKnobs{
-          // This test waits for an epoch-based lease to expire, so we're setting the
-          // liveness duration as low as possible while still keeping the test stable.
-          LivenessDuration: 2000 * time.Millisecond,
-          RenewalDuration:  1000 * time.Millisecond,
-        },
-        Store: &StoreTestingKnobs{
-          // We eliminate clock offsets in order to eliminate the stasis period of
-          // leases. Otherwise we'd need to make leases longer.
-          MaxOffset: time.Nanosecond,
-        },
-      },
-    },
-  }
-  tc := serverutils.StartNewTestCluster(t, 2, clusterArgs)
-  defer tc.Stopper().Stop(ctx)
-  rngKey := tc.ScratchRange(t)
-  tc.AddVotersOrFatal(t, rngKey, tc.Target(1))
-
-  s1 := tc.Server(0)
-  s2 := tc.Server(1)
-  store1, err := s1.GetStores().(*Stores).GetStore(s1.GetFirstStoreID())
-  require.NoError(t, err)
-  store2, err := s2.GetStores().(*Stores).GetStore(s2.GetFirstStoreID())
-  require.NoError(t, err)
-
-  rd := tc.LookupRangeOrFatal(t, rngKey)
-  r1, err := store1.GetReplica(rd.RangeID)
-  require.NoError(t, err)
-  status := r1.CurrentLeaseStatus(ctx)
-  require.True(t, status.Lease.OwnedBy(store1.StoreID()), "someone else got the lease: %s", status)
-  // We expect the lease to be valid, but don't check that because, under race, it might have
-  // expired already.
-
-  // Stop n1's heartbeats and wait for the lease to expire.
-  log.Infof(ctx, "test: suspending heartbeats for n1")
-  cleanup := s1.NodeLiveness().(*liveness.NodeLiveness).PauseAllHeartbeatsForTest()
-  defer cleanup()
-
-  testutils.SucceedsSoon(t, func() error {
-    status := r1.CurrentLeaseStatus(ctx)
-    require.True(t, status.Lease.OwnedBy(store1.StoreID()), "someone else got the lease: %s", status)
-    if status.State == kvserverpb.LeaseState_VALID {
-      return errors.New("lease still valid")
-    }
-    // We need to wait for the stasis state to pass too; during stasis other
-    // replicas can't take the lease.
-    if status.State == kvserverpb.LeaseState_UNUSABLE {
-      return errors.New("lease still in stasis")
-    }
-    return nil
-  })
-
-  require.Equal(t, r1.RaftStatus().Lead, uint64(r1.ReplicaID()),
-    "expected leadership to still be on the first replica")
-
-  // Wait until n1 has heartbeat its liveness record (epoch >= 1) and n2
-  // knows about it. Otherwise, the following could occur:
-  //
-  // - n1's heartbeats to epoch 1 and acquires lease
-  // - n2 doesn't receive this yet (gossip)
-  // - when n2 is asked to acquire the lease, it uses a lease with epoch 1
-  //   but the liveness record with epoch zero
-  // - lease status is ERROR, lease acquisition (and thus test) fails.
-  testutils.SucceedsSoon(t, func() error {
-    nl, ok := s2.NodeLiveness().(*liveness.NodeLiveness).GetLiveness(s1.NodeID())
-    if !ok {
-      return errors.New("no liveness record for n1")
-    }
-    if nl.Epoch < 1 {
-      return errors.New("epoch for n1 still zero")
-    }
-    return nil
-  })
-
-  // Mark the stores as draining. We'll then start checking how acquiring leases
-  // behaves while draining.
-  store1.draining.Store(true)
-  store2.draining.Store(true)
-
-  r2, err := store2.GetReplica(rd.RangeID)
-  require.NoError(t, err)
-  // Check that a draining replica that's not the leader does NOT take the
-  // lease.
-  _, pErr := r2.redirectOnOrAcquireLease(ctx)
-  require.NotNil(t, pErr)
-  require.IsType(t, &roachpb.NotLeaseHolderError{}, pErr.GetDetail())
-
-  // Now transfer the leadership from r1 to r2 and check that r1 can now acquire
-  // the lease.
-
-  // Initiate the leadership transfer.
-  r1.mu.Lock()
-  r1.mu.internalRaftGroup.TransferLeader(uint64(r2.ReplicaID()))
-  r1.mu.Unlock()
-  // Run the range through the Raft scheduler, otherwise the leadership messages
-  // doesn't get sent because the range is quiesced.
-  store1.EnqueueRaftUpdateCheck(r1.RangeID)
-
-  // Wait for the leadership transfer to happen.
-  testutils.SucceedsSoon(t, func() error {
-    if r2.RaftStatus().SoftState.RaftState != raft.StateLeader {
-      return errors.Newf("r1 not yet leader")
-    }
-    return nil
-  })
-
-  // Check that r2 can now acquire the lease.
-  _, pErr = r2.redirectOnOrAcquireLease(ctx)
-  require.NoError(t, pErr.GoError())
-}
-
 // TestReplicaGossipFirstRange verifies that the first range gossips its
 // location and the cluster ID.
 func TestReplicaGossipFirstRange(t *testing.T) {
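
For context, here is a minimal, self-contained Go sketch of the decision flow the commit message describes. This is not CockroachDB code: the names (`leaseAcquisitionOutcome`, `isRaftLeader`, `isDraining`) are hypothetical and the logic is deliberately simplified. It only illustrates the argument made above: non-leader replicas have their lease requests rejected anyway when the proposal buffer is flushed, and Raft leaders were already allowed to bypass the draining check, so the removed check never changed an outcome.

package main

import "fmt"

// leaseAcquisitionOutcome is a simplified, hypothetical model of the flow
// described in the commit message. None of these names appear in the real
// kvserver code.
func leaseAcquisitionOutcome(isRaftLeader, isDraining bool) string {
  // This branch mirrors the removed check: a draining non-leader was
  // rejected here. It is redundant because the next branch rejects
  // non-leaders regardless of draining state.
  if isDraining && !isRaftLeader {
    return "rejected (redundant draining check)"
  }
  // Non-leaders cannot acquire the lease anyway; only the Raft leader's
  // lease request makes it through the proposal path.
  if !isRaftLeader {
    return "rejected (not the Raft leader)"
  }
  // Leaders bypassed the draining check, so a draining leader still
  // acquires the lease.
  return "acquired"
}

func main() {
  for _, tc := range []struct{ leader, draining bool }{
    {leader: true, draining: false},
    {leader: true, draining: true},
    {leader: false, draining: false},
    {leader: false, draining: true},
  } {
    fmt.Printf("leader=%v draining=%v -> %s\n",
      tc.leader, tc.draining, leaseAcquisitionOutcome(tc.leader, tc.draining))
  }
}

Every leader/draining combination produces the same result with or without the first branch, which is why the check (and the test that was supposed to exercise it) could be removed without changing behavior.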