From 91c2a6ba993f3104c69d68f60d2584a82786333b Mon Sep 17 00:00:00 2001
From: Alex Sarkesian
Date: Thu, 22 Dec 2022 16:54:52 -0500
Subject: [PATCH] kvserver: refactor upreplication fragile quorum check into allocator

While previously we checked that we can avoid a fragile quorum during
the addition of new voters in the replicate queue, this change moves
the check logic into the allocator code itself, allowing it to be
reused by other users of the allocator. This enables us to perform this
check when evaluating decommission viability, or anywhere else that
uses the allocator for new voter replicas.

Part of #91570.

Release note: None
---
 .../allocator/allocatorimpl/allocator.go | 59 +++++++++++++++++++
 pkg/kv/kvserver/replicate_queue.go       | 42 ++++---------
 2 files changed, 71 insertions(+), 30 deletions(-)

diff --git a/pkg/kv/kvserver/allocator/allocatorimpl/allocator.go b/pkg/kv/kvserver/allocator/allocatorimpl/allocator.go
index f77b10265a2d..5210fa9ef308 100644
--- a/pkg/kv/kvserver/allocator/allocatorimpl/allocator.go
+++ b/pkg/kv/kvserver/allocator/allocatorimpl/allocator.go
@@ -675,6 +675,21 @@ func GetNeededNonVoters(numVoters, zoneConfigNonVoterCount, clusterNodes int) in
 	return need
 }
 
+// WillHaveFragileQuorum determines, based on the number of existing voters,
+// incoming voters, and needed voters, if we will be upreplicating to a state
+// in which we don't have enough needed voters and yet will have a fragile quorum
+// due to an even number of voter replicas.
+func WillHaveFragileQuorum(
+	numExistingVoters, numNewVoters, zoneConfigVoterCount, clusterNodes int,
+) bool {
+	neededVoters := GetNeededVoters(int32(zoneConfigVoterCount), clusterNodes)
+	willHave := numExistingVoters + numNewVoters
+	// NB: If willHave >= neededVoters, then always allow up-replicating as that
+	// will be the case when up-replicating a range with a decommissioning
+	// replica.
+	return numNewVoters > 0 && willHave < neededVoters && willHave%2 == 0
+}
+
 // LiveAndDeadVoterAndNonVoterReplicas splits up the replica in the given range
 // descriptor by voters vs non-voters and live replicas vs dead replicas.
 func LiveAndDeadVoterAndNonVoterReplicas(
@@ -1195,6 +1210,50 @@ func (a *Allocator) AllocateTarget(
 	}
 }
 
+// CheckAvoidsFragileQuorum ensures that if we are allocating a new voter and
+// will result in an even number of voters, that we can allocate another voter
+// target in order to avoid a fragile quorum state. This check should be
+// performed whenever we are planning or testing allocation of a new voter.
+//
+// We can skip this check if we're swapping a replica or allocating a non-voter,
+// since that does not change the quorum size.
+func (a *Allocator) CheckAvoidsFragileQuorum(
+	ctx context.Context,
+	storePool storepool.AllocatorStorePool,
+	conf roachpb.SpanConfig,
+	existingVoters, remainingLiveNonVoters []roachpb.ReplicaDescriptor,
+	replicaStatus ReplicaStatus,
+	replicaType TargetReplicaType,
+	newTarget roachpb.ReplicationTarget,
+	isReplacement bool,
+) error {
+	// Validation is only applicable when allocating new voters.
+	if replicaType != VoterTarget {
+		return nil
+	}
+	newVoters := 0
+	if !isReplacement {
+		newVoters = 1
+	}
+	clusterNodes := storePool.ClusterNodeCount()
+	neededVoters := GetNeededVoters(conf.GetNumVoters(), clusterNodes)
+
+	if WillHaveFragileQuorum(len(existingVoters), newVoters, neededVoters, clusterNodes) {
+		// This means we are going to up-replicate to an even replica state.
+		// Check if it is possible to go to an odd replica state beyond it.
+		oldPlusNewReplicas := existingVoters
+		oldPlusNewReplicas = append(
+			oldPlusNewReplicas,
+			roachpb.ReplicaDescriptor{NodeID: newTarget.NodeID, StoreID: newTarget.StoreID},
+		)
+
+		_, _, err := a.AllocateVoter(ctx, storePool, conf, oldPlusNewReplicas, remainingLiveNonVoters, replicaStatus)
+		return err
+	}
+
+	return nil
+}
+
 // AllocateVoter returns a suitable store for a new allocation of a voting
 // replica with the required attributes. Nodes already accommodating existing
 // voting replicas are ruled out as targets.
diff --git a/pkg/kv/kvserver/replicate_queue.go b/pkg/kv/kvserver/replicate_queue.go
index fdd5fc2c6f3e..1233bc6444ff 100644
--- a/pkg/kv/kvserver/replicate_queue.go
+++ b/pkg/kv/kvserver/replicate_queue.go
@@ -1154,6 +1154,7 @@ func (rq *replicateQueue) addOrReplaceVoters(
 ) (op AllocationOp, _ error) {
 	effects := effectBuilder{}
 	desc, conf := repl.DescAndSpanConfig()
+	isReplace := removeIdx >= 0
 
 	// The allocator should not try to re-add this replica since there is a reason
 	// we're removing it (i.e. dead or decommissioning). If we left the replica in
@@ -1163,42 +1164,23 @@ func (rq *replicateQueue) addOrReplaceVoters(
 	if err != nil {
 		return nil, err
 	}
-	if removeIdx >= 0 && newVoter.StoreID == existingVoters[removeIdx].StoreID {
+	if isReplace && newVoter.StoreID == existingVoters[removeIdx].StoreID {
 		return nil, errors.AssertionFailedf("allocator suggested to replace replica on s%d with itself", newVoter.StoreID)
 	}
 
-	clusterNodes := rq.storePool.ClusterNodeCount()
-	neededVoters := allocatorimpl.GetNeededVoters(conf.GetNumVoters(), clusterNodes)
-
-	// Only up-replicate if there are suitable allocation targets such that,
-	// either the replication goal is met, or it is possible to get to the next
+	// We only want to up-replicate if there are suitable allocation targets such
+	// that, either the replication goal is met, or it is possible to get to the next
 	// odd number of replicas. A consensus group of size 2n has worse failure
 	// tolerance properties than a group of size 2n - 1 because it has a larger
 	// quorum. For example, up-replicating from 1 to 2 replicas only makes sense
 	// if it is possible to be able to go to 3 replicas.
-	//
-	// NB: If willHave > neededVoters, then always allow up-replicating as that
-	// will be the case when up-replicating a range with a decommissioning
-	// replica.
-	//
-	// We skip this check if we're swapping a replica, since that does not
-	// change the quorum size.
-	// TODO(sarkesian): extract this bit into a reusable function
-	if willHave := len(existingVoters) + 1; removeIdx < 0 && willHave < neededVoters && willHave%2 == 0 {
-		// This means we are going to up-replicate to an even replica state.
-		// Check if it is possible to go to an odd replica state beyond it.
-		oldPlusNewReplicas := append([]roachpb.ReplicaDescriptor(nil), existingVoters...)
-		oldPlusNewReplicas = append(
-			oldPlusNewReplicas,
-			roachpb.ReplicaDescriptor{NodeID: newVoter.NodeID, StoreID: newVoter.StoreID},
-		)
-		_, _, err := rq.allocator.AllocateVoter(ctx, rq.storePool, conf, oldPlusNewReplicas, remainingLiveNonVoters, replicaStatus)
-		if err != nil {
-			// It does not seem possible to go to the next odd replica state. Note
-			// that AllocateVoter returns an allocatorError (a PurgatoryError)
-			// when purgatory is requested.
-			return nil, errors.Wrap(err, "avoid up-replicating to fragile quorum")
-		}
+	if err := rq.allocator.CheckAvoidsFragileQuorum(ctx, rq.storePool, conf,
+		existingVoters, remainingLiveNonVoters,
+		replicaStatus, allocatorimpl.VoterTarget, newVoter, isReplace); err != nil {
+		// It does not seem possible to go to the next odd replica state. Note
+		// that AllocateVoter returns an allocatorError (a PurgatoryError)
+		// when purgatory is requested.
+		return nil, errors.Wrap(err, "avoid up-replicating to fragile quorum")
 	}
 
 	// Figure out whether we should be promoting an existing non-voting replica to
@@ -1222,7 +1204,7 @@ func (rq *replicateQueue) addOrReplaceVoters(
 		})
 		ops = roachpb.MakeReplicationChanges(roachpb.ADD_VOTER, newVoter)
 	}
-	if removeIdx < 0 {
+	if !isReplace {
 		log.KvDistribution.Infof(ctx, "adding voter %+v: %s", newVoter,
 			rangeRaftProgress(repl.RaftStatus(), existingVoters))
 	} else {
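
Note on the "fragile quorum" rationale referenced in the comments above: a Raft group of size 2n needs a quorum of n+1, so it tolerates no more failures than a group of size 2n-1 while requiring one more replica to be available. The standalone Go sketch below (illustrative only, not part of the patch or the CockroachDB packages) makes that arithmetic concrete.

package main

import "fmt"

// quorum is the minimum number of live replicas a consensus group of the
// given size needs in order to make progress.
func quorum(replicas int) int { return replicas/2 + 1 }

// failuresTolerated is how many replicas can be lost before quorum is lost.
func failuresTolerated(replicas int) int { return replicas - quorum(replicas) }

func main() {
	for _, n := range []int{1, 2, 3, 4, 5} {
		fmt.Printf("replicas=%d quorum=%d tolerates=%d failure(s)\n",
			n, quorum(n), failuresTolerated(n))
	}
	// replicas=1 quorum=1 tolerates=0 failure(s)
	// replicas=2 quorum=2 tolerates=0 failure(s)  <- no better than 1, but two nodes must be up
	// replicas=3 quorum=2 tolerates=1 failure(s)
	// replicas=4 quorum=3 tolerates=1 failure(s)  <- no better than 3, but a larger quorum
	// replicas=5 quorum=3 tolerates=2 failure(s)
}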
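And a minimal, self-contained sketch of the decision the new WillHaveFragileQuorum helper encodes. The helpers below are simplified local copies for illustration (neededVoters omits the extra bounds handling in allocatorimpl.GetNeededVoters); they are not imports of the CockroachDB packages.

package main

import "fmt"

// neededVoters is a simplified stand-in for allocatorimpl.GetNeededVoters: it
// clamps the configured voter count to the cluster size and, when clamped,
// rounds down to an odd number.
func neededVoters(zoneConfigVoterCount, clusterNodes int) int {
	need := zoneConfigVoterCount
	if clusterNodes < need {
		need = clusterNodes
	}
	if need != zoneConfigVoterCount && need%2 == 0 {
		need--
	}
	return need
}

// willHaveFragileQuorum mirrors the check added by the patch: adding the new
// voters would leave the range below its target voter count and at an even
// size, i.e. a larger quorum with no added failure tolerance.
func willHaveFragileQuorum(numExistingVoters, numNewVoters, zoneConfigVoterCount, clusterNodes int) bool {
	needed := neededVoters(zoneConfigVoterCount, clusterNodes)
	willHave := numExistingVoters + numNewVoters
	return numNewVoters > 0 && willHave < needed && willHave%2 == 0
}

func main() {
	// Three existing voters, a config asking for five, five nodes in the
	// cluster: adding one voter lands on four (even and below target), so the
	// allocator must first confirm that a fifth target exists, which is what
	// CheckAvoidsFragileQuorum does by calling AllocateVoter speculatively.
	fmt.Println(willHaveFragileQuorum(3, 1, 5, 5)) // true

	// A replacement adds no net voters and never triggers the check.
	fmt.Println(willHaveFragileQuorum(3, 0, 5, 5)) // false
}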