Merge #51894

51894: kvserver: don't unquiesce on liveness changes of up-to-date replicas r=nvanbenschoten a=nvanbenschoten Mitigates the runtime regression in #50865. Implements the proposal from #50865 (comment). In #26911, we introduced the ability to quiesce ranges with dead replicas that were behind on the Raft log. This was a big win, as it prevented a node failure from stopping any range with a replica on that node from quiescing, even if the range was dormant. However, in order to ensure that the replica was caught up quickly once its node being brought back on-line, we began aggressively unquiescing when nodes moved from not-live to live. This turns out to be a destabilizing behavior. In scenarios like #50865 where nodes contain large numbers of replicas (i.e. O(100k)), suddenly unquiescing all replicas on a node due to a transient liveness blip can bring a cluster to its knees. Like #51888 and #45013, this is another case where we add work to the system when it gets sufficiently loaded, causing stability to be divergent instead of convergent and resulting in an unstable system. This fix removes the need to unquiesce ranges that have up-to-date replicas on nodes that undergo liveness changes. It does so by socializing the liveness state of lagging replicas during Range quiescence. In dong so through a new `lagging_quiescence` set attached to quiescence requests, it allows all quiesced replicas to agree on which node liveness events would need to result in the range being unquiesced and which ones can be ignored. This allows us to continue to provide the guarantee that: > If at least one up-to-date replica in a Raft group is alive, the > Raft group will catch up any lagging replicas that are also alive. For a pseudo-proof of this guarantee, see #50865 (comment). A secondary effect of this change is that it fixes a bug in quiescence as it exists today. Before this change, we did not provide this guarantee in the case that a leader was behind on its node liveness information, broadcasted a quiescence notification, and then crashed. In that case, a node that was brought back online might not be caught up for minutes because the range wouldn't unquiesce until the lagging replica reached out to it. Interestingly, this may be a semi-common occurrence during rolling restart scenarios. This is a large improvement on top of the existing behavior because it reduces the set of ranges with a replica on a given dead node that unquiesce when that node becomes live to only those ranges that had write activity after the node went down. This is typically only a small fraction of the total number of ranges on each node (operating on the usual assumption that in an average large cluster, most data is write-cold), especially when the outage is short. However, if all ranges had write activity and subsequently quiesced before the outage resolved, we would still have to unquiesce all ranges upon the node coming back up. For this reason, the change is primarily targeted towards reducing the impact of node liveness blips and not reducing the impact of bringing nodes back up after sustained node outages. This is intentional, as the former category of outage is the one more likely to be caused by overload due to unquiescence traffic itself (as we see in #50865), so it is the category we choose to focus on here. Release note (performance improvement): transient node liveness blips no longer cause up-to-date Ranges to unquiesce, which makes these events less destabilizing. @petermattis I'm adding you as a reviewer because you've done more thinking about Range quiescence than anyone else. But I can find a new second reviewer if you don't have time to take a look at this. Co-authored-by: Nathan VanBenschoten <[email protected]>
cockroachdb · Aug 20, 2020 · 9d2062c · 9d2062c
2 parents 0c8e734 + f7c9558
commit 9d2062c
Show file tree

Hide file tree

Showing 12 changed files with 782 additions and 217 deletions.
diff --git a/pkg/kv/kvserver/kvserverpb/liveness.go b/pkg/kv/kvserver/kvserverpb/liveness.go
@@ -43,6 +43,25 @@ func (l *Liveness) IsDead(now time.Time, threshold time.Duration) bool {
 	return !now.Before(deadAsOf)
 }
 
+// Compare returns an integer comparing two pieces of liveness information,
+// based on which liveness information is more recent.
+func (l *Liveness) Compare(o Liveness) int {
+	// Compare Epoch, and if no change there, Expiration.
+	if l.Epoch != o.Epoch {
+		if l.Epoch < o.Epoch {
+			return -1
+		}
+		return +1
+	}
+	if l.Expiration != o.Expiration {
+		if l.Expiration.Less(o.Expiration) {
+			return -1
+		}
+		return +1
+	}
+	return 0
+}
+
 func (l *Liveness) String() string {
 	var extra string
 	if l.Draining || l.Membership.Decommissioning() || l.Membership.Decommissioned() {

diff --git a/pkg/kv/kvserver/node_liveness.go b/pkg/kv/kvserver/node_liveness.go
@@ -124,7 +124,7 @@ type LivenessMetrics struct {
 
 // IsLiveCallback is invoked when a node's IsLive state changes to true.
 // Callbacks can be registered via NodeLiveness.RegisterCallback().
-type IsLiveCallback func(nodeID roachpb.NodeID)
+type IsLiveCallback func(kvserverpb.Liveness)
 
 // HeartbeatCallback is invoked whenever this node updates its own liveness status,
 // indicating that it is alive.
@@ -847,8 +847,8 @@ func (nl *NodeLiveness) SelfEx() (LivenessRecord, error) {
 // IsLiveMapEntry encapsulates data about current liveness for a
 // node.
 type IsLiveMapEntry struct {
+	kvserverpb.Liveness
 	IsLive bool
-	Epoch  int64
 }
 
 // IsLiveMap is a type alias for a map from NodeID to IsLiveMapEntry.
@@ -869,8 +869,8 @@ func (nl *NodeLiveness) GetIsLiveMap() IsLiveMap {
 			continue
 		}
 		lMap[nID] = IsLiveMapEntry{
-			IsLive: isLive,
-			Epoch:  l.Epoch,
+			Liveness: l.Liveness,
+			IsLive:   isLive,
 		}
 	}
 	return lMap
@@ -1139,7 +1139,7 @@ func (nl *NodeLiveness) maybeUpdate(new LivenessRecord) {
 	now := nl.clock.Now().GoTime()
 	if !old.IsLive(now) && new.IsLive(now) {
 		for _, fn := range callbacks {
-			fn(new.NodeID)
+			fn(new.Liveness)
 		}
 	}
 }
@@ -1151,12 +1151,9 @@ func shouldReplaceLiveness(old, new kvserverpb.Liveness) bool {
 		return true
 	}
 
-	// Compare Epoch, and if no change there, Expiration.
-	if old.Epoch != new.Epoch {
-		return old.Epoch < new.Epoch
-	}
-	if old.Expiration != new.Expiration {
-		return old.Expiration.Less(new.Expiration)
+	// Compare liveness information. If old < new, replace.
+	if cmp := old.Compare(new); cmp != 0 {
+		return cmp < 0
 	}
 
 	// If Epoch and Expiration are unchanged, assume that the update is newer

diff --git a/pkg/kv/kvserver/node_liveness_test.go b/pkg/kv/kvserver/node_liveness_test.go
@@ -262,10 +262,10 @@ func TestNodeIsLiveCallback(t *testing.T) {
 
 	var cbMu syncutil.Mutex
 	cbs := map[roachpb.NodeID]struct{}{}
-	mtc.nodeLivenesses[0].RegisterCallback(func(nodeID roachpb.NodeID) {
+	mtc.nodeLivenesses[0].RegisterCallback(func(l kvserverpb.Liveness) {
 		cbMu.Lock()
 		defer cbMu.Unlock()
-		cbs[nodeID] = struct{}{}
+		cbs[l.NodeID] = struct{}{}
 	})
 
 	// Advance clock past the liveness threshold.
@@ -555,10 +555,13 @@ func TestNodeLivenessGetIsLiveMap(t *testing.T) {
 	verifyLiveness(t, mtc)
 	pauseNodeLivenessHeartbeatLoops(mtc)
 	lMap := mtc.nodeLivenesses[0].GetIsLiveMap()
+	l1, _ := mtc.nodeLivenesses[0].GetLiveness(1)
+	l2, _ := mtc.nodeLivenesses[0].GetLiveness(2)
+	l3, _ := mtc.nodeLivenesses[0].GetLiveness(3)
 	expectedLMap := kvserver.IsLiveMap{
-		1: {IsLive: true, Epoch: 1},
-		2: {IsLive: true, Epoch: 1},
-		3: {IsLive: true, Epoch: 1},
+		1: {Liveness: l1.Liveness, IsLive: true},
+		2: {Liveness: l2.Liveness, IsLive: true},
+		3: {Liveness: l3.Liveness, IsLive: true},
 	}
 	if !reflect.DeepEqual(expectedLMap, lMap) {
 		t.Errorf("expected liveness map %+v; got %+v", expectedLMap, lMap)
@@ -580,10 +583,13 @@ func TestNodeLivenessGetIsLiveMap(t *testing.T) {
 
 	// Now verify only node 0 is live.
 	lMap = mtc.nodeLivenesses[0].GetIsLiveMap()
+	l1, _ = mtc.nodeLivenesses[0].GetLiveness(1)
+	l2, _ = mtc.nodeLivenesses[0].GetLiveness(2)
+	l3, _ = mtc.nodeLivenesses[0].GetLiveness(3)
 	expectedLMap = kvserver.IsLiveMap{
-		1: {IsLive: true, Epoch: 1},
-		2: {IsLive: false, Epoch: 1},
-		3: {IsLive: false, Epoch: 1},
+		1: {Liveness: l1.Liveness, IsLive: true},
+		2: {Liveness: l2.Liveness, IsLive: false},
+		3: {Liveness: l3.Liveness, IsLive: false},
 	}
 	if !reflect.DeepEqual(expectedLMap, lMap) {
 		t.Errorf("expected liveness map %+v; got %+v", expectedLMap, lMap)