55624: kvserver: don't always refuse to take leases when draining r=andreimatei a=andreimatei

Before this patch, a draining node would refuse to take any new
leases. This is a problem when all replicas of a range are draining at
the same time (e.g. when draining a single-node cluster, or when
draining multiple nodes simultaneously, perhaps by mistake): nobody
wants the lease. In particular, because the liveness range is
expiration-based (and thus permanently in need of new leases being
granted), this quickly results in nodes failing their liveness.
It also becomes more of a problem with #55148, where we start refusing
to take the lease on replicas that are not the leader - so if the
leader is draining, we deadlock.

This patch makes an exception for leaders, which no longer refuse the
lease even when they're draining. The reasoning is that it's too easy
to deadlock otherwise.
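
As an illustration only (not the patch itself; the helper below is
hypothetical), the resulting policy boils down to:

```go
package main

import (
	"fmt"

	"go.etcd.io/etcd/raft"
)

// shouldRefuseLease sketches the policy described above: a draining replica
// refuses new lease requests unless it is the Raft leader, because refusing
// on the leader can leave the range with nobody willing to take the lease.
func shouldRefuseLease(draining bool, state raft.StateType) bool {
	return draining && state != raft.StateLeader
}

func main() {
	fmt.Println(shouldRefuseLease(true, raft.StateFollower)) // true: refuse the lease
	fmt.Println(shouldRefuseLease(true, raft.StateLeader))   // false: take the lease anyway
}
```

The actual change (see the pkg/kv/kvserver/replica_range_lease.go diff
below) adds the leader check to the existing IsDraining() condition in
requestLeaseLocked.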

Release note: None

56004: build: add instructions to rectify code that is not regenerated r=irfansharif a=otan

Make it easier for developers to know how to fix code that has not
been regenerated by spelling out the required commands.

Release note: None

56016: backfill: remove unused bound account from column backfiller r=yuzefovich a=adityamaru

Previously, we created a bound account in the column backfiller that
was not being used to track anything. This change removes it.
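
For context, a minimal sketch of the lifecycle such a bound account
normally has (the helper name and sizes are illustrative, not the
backfiller's actual code; the mon package API is assumed to match its
use elsewhere in the tree):

```go
package backfillexample

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/util/mon"
)

// trackWork is a hypothetical helper showing how a bound account is normally
// used: open it against a bytes monitor, reserve memory before doing work,
// and release the reservation when done. The column backfiller's account was
// opened like this but never had anything registered against it, which is
// why it could be removed.
func trackWork(ctx context.Context, monitor *mon.BytesMonitor) error {
	acc := monitor.MakeBoundAccount()
	defer acc.Close(ctx)

	// Reserve 1 MiB from the monitor's budget before buffering rows; an error
	// means the budget would be exceeded.
	if err := acc.Grow(ctx, 1<<20); err != nil {
		return err
	}
	// ... buffer rows using the reserved memory ...
	return nil
}
```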

Release note: None

Co-authored-by: Andrei Matei <[email protected]>
Co-authored-by: Oliver Tan <[email protected]>
Co-authored-by: Aditya Maru <[email protected]>
4 people committed Oct 27, 2020
4 parents 8652b7a + acc1ad1 + 9a94690 + 2c85868 commit d058e01
Showing 9 changed files with 156 additions and 189 deletions.
14 changes: 9 additions & 5 deletions build/teamcity-check.sh
@@ -14,6 +14,9 @@ function check_clean() {
if [[ "$(git status --porcelain 2>&1)" != "" ]]; then
git status >&2 || true
git diff -a >&2 || true
echo "====================================================" >&2
echo "Some automatically generated code is not up to date." >&2
echo "$1" >&2
exit 1
fi
}
@@ -35,22 +38,23 @@ fi
tc_start_block "Ensure generated code is up-to-date"
# Buffer noisy output and only print it on failure.
run build/builder.sh make generate &> artifacts/generate.log || (cat artifacts/generate.log && false)
run build/builder.sh make buildshort &> artifacts/buildshort.log || (cat artifacts/buildshort.log && false)
rm artifacts/generate.log
check_clean "Run \`make generate\` to automatically regenerate these."
run build/builder.sh make buildshort &> artifacts/buildshort.log || (cat artifacts/buildshort.log && false)
rm artifacts/buildshort.log
check_clean
check_clean "Run \`make buildshort\` to automatically regenerate these."
tc_end_block "Ensure generated code is up-to-date"

# generated code can generate new dependencies; check dependencies after generated code.
tc_start_block "Ensure dependencies are up-to-date"
# Run go mod tidy and `make -k vendor_rebuild` and ensure nothing changes.
run build/builder.sh go mod tidy
check_clean
check_clean "Run \`go mod tidy\` and \`make -k vendor_rebuild\` to automatically regenerate these."
run build/builder.sh make -k vendor_rebuild
cd vendor
check_clean
check_clean "Run \`make -k vendor_rebuild\` to automatically regenerate these."
cd ..
check_clean
check_clean "Run \`make -k vendor_rebuild\` to automatically regenerate these. If it is only BUILD.bazel files that are affected, run \`bazel run //:gazelle\` to automatically regenerate these."
tc_end_block "Ensure dependencies are up-to-date"

tc_start_block "Lint"
1 change: 1 addition & 0 deletions pkg/base/testing_knobs.go
@@ -31,6 +31,7 @@ type TestingKnobs struct {
DistSQL ModuleTestingKnobs
SQLEvalContext ModuleTestingKnobs
RegistryLiveness ModuleTestingKnobs
NodeLiveness ModuleTestingKnobs
Server ModuleTestingKnobs
TenantTestingKnobs ModuleTestingKnobs
JobsTestingKnobs ModuleTestingKnobs
152 changes: 0 additions & 152 deletions pkg/kv/kvserver/client_raft_test.go
@@ -1234,158 +1234,6 @@ func TestReplicateAfterRemoveAndSplit(t *testing.T) {
mtc.stores[2].SetReplicaGCQueueActive(true)
}

// Test various mechanisms for refreshing pending commands.
func TestRefreshPendingCommands(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

// In this scenario, three different mechanisms detect the need to repropose
// commands. Test that each one is sufficient individually. We have this
// redundancy because some mechanisms respond with lower latency than others,
// but each has some scenarios (not currently tested) in which it is
// insufficient on its own. In addition, there is a fourth reproposal
// mechanism (reasonNewLeaderOrConfigChange) which is not relevant to this
// scenario.
//
// We don't test with only reasonNewLeader because that mechanism is less
// robust than refreshing due to snapshot or ticks. In particular, it is
// possible for node 3 to propose the RequestLease command and have that
// command executed by the other nodes but to never see the execution locally
// because it is caught up by applying a snapshot.
testCases := map[string]kvserver.StoreTestingKnobs{
"reasonSnapshotApplied": {
DisableRefreshReasonNewLeader: true,
DisableRefreshReasonTicks: true,
},
"reasonTicks": {
DisableRefreshReasonNewLeader: true,
DisableRefreshReasonSnapshotApplied: true,
},
}
for name, c := range testCases {
t.Run(name, func(t *testing.T) {
sc := kvserver.TestStoreConfig(nil)
sc.TestingKnobs = c
// Disable periodic gossip tasks which can move the range 1 lease
// unexpectedly.
sc.TestingKnobs.DisablePeriodicGossips = true
sc.Clock = nil // manual clock
mtc := &multiTestContext{
storeConfig: &sc,
// This test was written before the multiTestContext started creating
// many system ranges at startup, and hasn't been updated to take that
// into account.
startWithSingleRange: true,
}
defer mtc.Stop()
mtc.Start(t, 3)

const rangeID = roachpb.RangeID(1)
mtc.replicateRange(rangeID, 1, 2)

// Put some data in the range so we'll have something to test for.
incArgs := incrementArgs([]byte("a"), 5)
if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
t.Fatal(err)
}

// Wait for all nodes to catch up.
mtc.waitForValues(roachpb.Key("a"), []int64{5, 5, 5})

// Stop node 2; while it is down write some more data.
mtc.stopStore(2)

if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
t.Fatal(err)
}

// Get the last increment's log index.
repl, err := mtc.stores[0].GetReplica(1)
if err != nil {
t.Fatal(err)
}
index, err := repl.GetLastIndex()
if err != nil {
t.Fatal(err)
}

// Truncate the log at index+1 (log entries < N are removed, so this includes
// the increment).
truncArgs := truncateLogArgs(index+1, rangeID)
if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), truncArgs); err != nil {
t.Fatal(err)
}

// Stop and restart node 0 in order to make sure that any in-flight Raft
// messages have been sent.
mtc.stopStore(0)
mtc.restartStore(0)

////////////////////////////////////////////////////////////////////
// We want store 2 to take the lease later, so we'll drain the other
// stores and expire the lease.
////////////////////////////////////////////////////////////////////

// Disable node liveness heartbeats which can reacquire leases when we're
// trying to expire them. We pause liveness heartbeats here after node 0
// was restarted (which creates a new NodeLiveness).
pauseNodeLivenessHeartbeatLoops(mtc)

// Start draining stores 0 and 1 to prevent them from grabbing any new
// leases.
mtc.advanceClock(context.Background())
var wg sync.WaitGroup
for i := 0; i < 2; i++ {
wg.Add(1)
go func(i int) {
mtc.stores[i].SetDraining(true, nil /* reporter */)
wg.Done()
}(i)
}

// Wait for the stores 0 and 1 to have entered draining mode, and then
// advance the clock. Advancing the clock will leave the liveness records
// of draining nodes in an expired state, so the SetDraining() call above
// will be able to terminate.
draining := false
for !draining {
draining = true
for i := 0; i < 2; i++ {
draining = draining && mtc.stores[i].IsDraining()
}
// Allow this loop to be preempted. Failure to do so can cause a
// deadlock because a non-preemptible loop will prevent GC from
// starting which in turn will cause all other goroutines to be stuck
// as soon as they are called on to assist the GC (this shows up as
// goroutines stuck in "GC assist wait"). With all of the other
// goroutines stuck, nothing will be able to set mtc.stores[i].draining
// to true.
//
// See #18554.
runtime.Gosched()
}
mtc.advanceClock(context.Background())

wg.Wait()

// Restart node 2 and wait for the snapshot to be applied. Note that
// waitForValues reads directly from the engine and thus isn't executing
// a Raft command.
mtc.restartStore(2)
mtc.waitForValues(roachpb.Key("a"), []int64{10, 10, 10})

// Send an increment to the restarted node. If we don't refresh pending
// commands appropriately, the range lease command will not get
// re-proposed when we discover the new leader.
if _, err := kv.SendWrapped(context.Background(), mtc.stores[2].TestSender(), incArgs); err != nil {
t.Fatal(err)
}

mtc.waitForValues(roachpb.Key("a"), []int64{15, 15, 15})
})
}
}

// Test that when a Raft group is not able to establish a quorum, its Raft log
// does not grow without bound. It tests two different scenarios where this used
// to be possible (see #27772):
20 changes: 18 additions & 2 deletions pkg/kv/kvserver/replica_range_lease.go
@@ -58,6 +58,7 @@ import (
"github.com/cockroachdb/errors"
"github.com/cockroachdb/logtags"
"github.com/opentracing/opentracing-go"
"go.etcd.io/etcd/raft"
)

var leaseStatusLogLimiter = log.Every(5 * time.Second)
@@ -583,6 +584,15 @@ func (r *Replica) leaseStatus(
return status
}

// currentLeaseStatus returns the status of the current lease as of the
// current timestamp.
func (r *Replica) currentLeaseStatus(ctx context.Context) kvserverpb.LeaseStatus {
timestamp := r.store.Clock().Now()
r.mu.RLock()
defer r.mu.RUnlock()
return r.leaseStatus(ctx, *r.mu.state.Lease, timestamp, r.mu.minLeaseProposedTS)
}

// requiresExpiringLeaseRLocked returns whether this range uses an
// expiration-based lease; false if epoch-based. Ranges located before or
// including the node liveness table must use expiration leases to avoid
@@ -616,8 +626,14 @@ func (r *Replica) requestLeaseLocked(
return r.mu.pendingLeaseRequest.newResolvedHandle(roachpb.NewError(
newNotLeaseHolderError(&transferLease, r.store.StoreID(), r.mu.state.Desc)))
}
if r.store.IsDraining() {
// We've retired from active duty.
// If we're draining, we'd rather not take any new leases (since we're also
// trying to move leases away elsewhere). But if we're the leader, we don't
// really have a choice and we take the lease - there might not be any other
// replica available to take this lease (perhaps they're all draining).
if r.store.IsDraining() && (r.raftBasicStatusRLocked().RaftState != raft.StateLeader) {
// TODO(andrei): If we start refusing to take leases on followers elsewhere,
// this code can go away.
log.VEventf(ctx, 2, "refusing to take the lease because we're draining")
return r.mu.pendingLeaseRequest.newResolvedHandle(roachpb.NewError(
newNotLeaseHolderError(nil, r.store.StoreID(), r.mu.state.Desc)))
}
120 changes: 96 additions & 24 deletions pkg/kv/kvserver/replica_test.go
@@ -1428,36 +1428,108 @@ func TestReplicaLeaseRejectUnknownRaftNodeID(t *testing.T) {
}
}

// TestReplicaDrainLease makes sure that no new leases are granted when
// the Store is draining.
// Test that draining nodes only take the lease if they're the leader.
func TestReplicaDrainLease(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
tc := testContext{}
stopper := stop.NewStopper()
defer stopper.Stop(context.Background())
tc.Start(t, stopper)

// Acquire initial lease.
ctx := context.Background()
status, pErr := tc.repl.redirectOnOrAcquireLease(ctx)
if pErr != nil {
t.Fatal(pErr)
clusterArgs := base.TestClusterArgs{
ReplicationMode: base.ReplicationManual,
ServerArgs: base.TestServerArgs{
Knobs: base.TestingKnobs{
NodeLiveness: NodeLivenessTestingKnobs{
// This test waits for an epoch-based lease to expire, so we're setting the
// liveness duration as low as possible while still keeping the test stable.
LivenessDuration: 2000 * time.Millisecond,
RenewalDuration: 1000 * time.Millisecond,
},
Store: &StoreTestingKnobs{
// We eliminate clock offsets in order to eliminate the stasis period of
// leases. Otherwise we'd need to make leases longer.
MaxOffset: time.Nanosecond,
},
},
},
}
tc := serverutils.StartNewTestCluster(t, 2, clusterArgs)
defer tc.Stopper().Stop(ctx)
rngKey := tc.ScratchRange(t)
tc.AddReplicasOrFatal(t, rngKey, tc.Target(1))

tc.store.SetDraining(true, nil /* reporter */)
tc.repl.mu.Lock()
pErr = <-tc.repl.requestLeaseLocked(ctx, status).C()
tc.repl.mu.Unlock()
_, ok := pErr.GetDetail().(*roachpb.NotLeaseHolderError)
if !ok {
t.Fatalf("expected NotLeaseHolderError, not %v", pErr)
}
tc.store.SetDraining(false, nil /* reporter */)
// Newly undrained, leases work again.
if _, pErr := tc.repl.redirectOnOrAcquireLease(ctx); pErr != nil {
t.Fatal(pErr)
}
s1 := tc.Server(0)
s2 := tc.Server(1)
store1, err := s1.GetStores().(*Stores).GetStore(s1.GetFirstStoreID())
require.NoError(t, err)
store2, err := s2.GetStores().(*Stores).GetStore(s2.GetFirstStoreID())
require.NoError(t, err)

rd := tc.LookupRangeOrFatal(t, rngKey)
r1, err := store1.GetReplica(rd.RangeID)
require.NoError(t, err)
status := r1.currentLeaseStatus(ctx)
require.True(t, status.Lease.OwnedBy(store1.StoreID()), "someone else got the lease: %s", status)
// We expect the lease to be valid, but don't check that because, under race, it might have
// expired already.

// Stop n1's heartbeats and wait for the lease to expire.

log.Infof(ctx, "test: suspending heartbeats for n1")
cleanup := s1.NodeLiveness().(*NodeLiveness).PauseAllHeartbeatsForTest()
defer cleanup()

require.NoError(t, err)
testutils.SucceedsSoon(t, func() error {
status := r1.currentLeaseStatus(ctx)
require.True(t, status.Lease.OwnedBy(store1.StoreID()), "someone else got the lease: %s", status)
if status.State == kvserverpb.LeaseState_VALID {
return errors.New("lease still valid")
}
// We need to wait for the stasis state to pass too; during stasis other
// replicas can't take the lease.
if status.State == kvserverpb.LeaseState_STASIS {
return errors.New("lease still in stasis")
}
return nil
})

require.Equal(t, r1.RaftStatus().Lead, uint64(r1.ReplicaID()),
"expected leadership to still be on the first replica")

// Mark the stores as draining. We'll then start checking how acquiring leases
// behaves while draining.
store1.draining.Store(true)
store2.draining.Store(true)

r2, err := store2.GetReplica(rd.RangeID)
require.NoError(t, err)
// Check that a draining replica that's not the leader does NOT take the
// lease.
_, pErr := r2.redirectOnOrAcquireLease(ctx)
require.NotNil(t, pErr)
require.IsType(t, &roachpb.NotLeaseHolderError{}, pErr.GetDetail())

// Now transfer the leadership from r1 to r2 and check that r2 can now acquire
// the lease.

// Initiate the leadership transfer.
r1.mu.Lock()
r1.mu.internalRaftGroup.TransferLeader(uint64(r2.ReplicaID()))
r1.mu.Unlock()
// Run the range through the Raft scheduler, otherwise the leadership messages
// don't get sent because the range is quiesced.
store1.EnqueueRaftUpdateCheck(r1.RangeID)

// Wait for the leadership transfer to happen.
testutils.SucceedsSoon(t, func() error {
if r2.RaftStatus().SoftState.RaftState != raft.StateLeader {
return errors.Newf("r1 not yet leader")
}
return nil
})

// Check that r2 can now acquire the lease.
_, pErr = r2.redirectOnOrAcquireLease(ctx)
require.NoError(t, pErr.GoError())
}

// TestReplicaGossipFirstRange verifies that the first range gossips its