kvserver: refactor upreplication fragile quorum check into allocator
Previously, the check that adding new voters does not create a fragile
quorum lived in the replicate queue. This change moves that check into
the allocator itself, allowing it to be reused by other users of the
allocator. This enables us to perform the check when evaluating
decommission viability, or anywhere else the allocator is used to
allocate new voter replicas.

Part of cockroachdb#91570.

Release note: None
AlexTalks committed Jan 7, 2023
1 parent f41f680 commit 91c2a6b
Showing 2 changed files with 71 additions and 30 deletions.
59 changes: 59 additions & 0 deletions pkg/kv/kvserver/allocator/allocatorimpl/allocator.go
@@ -675,6 +675,21 @@ func GetNeededNonVoters(numVoters, zoneConfigNonVoterCount, clusterNodes int) int {
return need
}

// WillHaveFragileQuorum determines, based on the number of existing voters,
// the number of incoming voters, and the number of voters needed (derived
// from the zone config voter count and the cluster size), whether we would be
// up-replicating to a state that is still short of the needed voter count yet
// has an even number of voter replicas, i.e. a fragile quorum.
func WillHaveFragileQuorum(
numExistingVoters, numNewVoters, zoneConfigVoterCount, clusterNodes int,
) bool {
neededVoters := GetNeededVoters(int32(zoneConfigVoterCount), clusterNodes)
willHave := numExistingVoters + numNewVoters
// NB: If willHave >= neededVoters, then always allow up-replicating as that
// will be the case when up-replicating a range with a decommissioning
// replica.
return numNewVoters > 0 && willHave < neededVoters && willHave%2 == 0
}
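
For intuition, a minimal sketch of how this predicate behaves, written as a hypothetical example function that could live in package allocatorimpl (it is not part of this commit, and assumes fmt is imported). The inputs assume a zone config of 3 voters on a 3-node cluster, for which GetNeededVoters returns 3; a 2-replica group needs both replicas for quorum and tolerates no failures, which is what makes stopping at an even count below the target fragile.

// Hypothetical, for illustration only.
func ExampleWillHaveFragileQuorum() {
	// 3 voters configured on a 3-node cluster, so 3 voters are needed.
	fmt.Println(WillHaveFragileQuorum(1, 1, 3, 3)) // true:  1+1 = 2 voters, even and still short of 3
	fmt.Println(WillHaveFragileQuorum(2, 1, 3, 3)) // false: 2+1 = 3 reaches the needed voter count
	fmt.Println(WillHaveFragileQuorum(3, 1, 3, 3)) // false: willHave (4) >= neededVoters (3)
	fmt.Println(WillHaveFragileQuorum(1, 0, 3, 3)) // false: no net-new voter is being added
	// Output:
	// true
	// false
	// false
	// false
}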

// LiveAndDeadVoterAndNonVoterReplicas splits up the replica in the given range
// descriptor by voters vs non-voters and live replicas vs dead replicas.
func LiveAndDeadVoterAndNonVoterReplicas(
@@ -1195,6 +1210,50 @@ func (a *Allocator) AllocateTarget(
}
}

// CheckAvoidsFragileQuorum ensures that, if allocating a new voter would
// result in an even number of voters, another voter target can still be
// allocated beyond it, so that we avoid a fragile quorum state. This check
// should be performed whenever we are planning or testing allocation of a
// new voter.
//
// We can skip this check if we're swapping a replica or allocating a non-voter,
// since neither changes the quorum size.
func (a *Allocator) CheckAvoidsFragileQuorum(
ctx context.Context,
storePool storepool.AllocatorStorePool,
conf roachpb.SpanConfig,
existingVoters, remainingLiveNonVoters []roachpb.ReplicaDescriptor,
replicaStatus ReplicaStatus,
replicaType TargetReplicaType,
newTarget roachpb.ReplicationTarget,
isReplacement bool,
) error {
// Validation is only applicable when allocating new voters.
if replicaType != VoterTarget {
return nil
}
newVoters := 0
if !isReplacement {
newVoters = 1
}
clusterNodes := storePool.ClusterNodeCount()
neededVoters := GetNeededVoters(conf.GetNumVoters(), clusterNodes)

if WillHaveFragileQuorum(len(existingVoters), newVoters, neededVoters, clusterNodes) {
// This means we are going to up-replicate to an even replica state.
// Check if it is possible to go to an odd replica state beyond it.
oldPlusNewReplicas := existingVoters
oldPlusNewReplicas = append(
oldPlusNewReplicas,
roachpb.ReplicaDescriptor{NodeID: newTarget.NodeID, StoreID: newTarget.StoreID},
)

_, _, err := a.AllocateVoter(ctx, storePool, conf, oldPlusNewReplicas, remainingLiveNonVoters, replicaStatus)
return err
}

return nil
}
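
As a usage sketch, the call below mirrors how the replicate queue invokes this method in the second file of this commit; the surrounding variables (rq, ctx, conf, existingVoters, remainingLiveNonVoters, replicaStatus, newVoter, isReplace) are assumed to be in scope as they are there.

if err := rq.allocator.CheckAvoidsFragileQuorum(ctx, rq.storePool, conf,
	existingVoters, remainingLiveNonVoters,
	replicaStatus, allocatorimpl.VoterTarget, newVoter, isReplace,
); err != nil {
	// The allocator could not find a target for the next odd voter count, so
	// we refuse to up-replicate into a fragile (even) quorum.
	return nil, errors.Wrap(err, "avoid up-replicating to fragile quorum")
}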

// AllocateVoter returns a suitable store for a new allocation of a voting
// replica with the required attributes. Nodes already accommodating existing
// voting replicas are ruled out as targets.
42 changes: 12 additions & 30 deletions pkg/kv/kvserver/replicate_queue.go
@@ -1154,6 +1154,7 @@ func (rq *replicateQueue) addOrReplaceVoters(
) (op AllocationOp, _ error) {
effects := effectBuilder{}
desc, conf := repl.DescAndSpanConfig()
isReplace := removeIdx >= 0

// The allocator should not try to re-add this replica since there is a reason
// we're removing it (i.e. dead or decommissioning). If we left the replica in
@@ -1163,42 +1164,23 @@
if err != nil {
return nil, err
}
if removeIdx >= 0 && newVoter.StoreID == existingVoters[removeIdx].StoreID {
if isReplace && newVoter.StoreID == existingVoters[removeIdx].StoreID {
return nil, errors.AssertionFailedf("allocator suggested to replace replica on s%d with itself", newVoter.StoreID)
}

clusterNodes := rq.storePool.ClusterNodeCount()
neededVoters := allocatorimpl.GetNeededVoters(conf.GetNumVoters(), clusterNodes)

// Only up-replicate if there are suitable allocation targets such that,
// either the replication goal is met, or it is possible to get to the next
// We only want to up-replicate if there are suitable allocation targets such
// that, either the replication goal is met, or it is possible to get to the next
// odd number of replicas. A consensus group of size 2n has worse failure
// tolerance properties than a group of size 2n - 1 because it has a larger
// quorum. For example, up-replicating from 1 to 2 replicas only makes sense
// if it is possible to be able to go to 3 replicas.
//
// NB: If willHave > neededVoters, then always allow up-replicating as that
// will be the case when up-replicating a range with a decommissioning
// replica.
//
// We skip this check if we're swapping a replica, since that does not
// change the quorum size.
// TODO(sarkesian): extract this bit into a reusable function
if willHave := len(existingVoters) + 1; removeIdx < 0 && willHave < neededVoters && willHave%2 == 0 {
// This means we are going to up-replicate to an even replica state.
// Check if it is possible to go to an odd replica state beyond it.
oldPlusNewReplicas := append([]roachpb.ReplicaDescriptor(nil), existingVoters...)
oldPlusNewReplicas = append(
oldPlusNewReplicas,
roachpb.ReplicaDescriptor{NodeID: newVoter.NodeID, StoreID: newVoter.StoreID},
)
_, _, err := rq.allocator.AllocateVoter(ctx, rq.storePool, conf, oldPlusNewReplicas, remainingLiveNonVoters, replicaStatus)
if err != nil {
// It does not seem possible to go to the next odd replica state. Note
// that AllocateVoter returns an allocatorError (a PurgatoryError)
// when purgatory is requested.
return nil, errors.Wrap(err, "avoid up-replicating to fragile quorum")
}
if err := rq.allocator.CheckAvoidsFragileQuorum(ctx, rq.storePool, conf,
existingVoters, remainingLiveNonVoters,
replicaStatus, allocatorimpl.VoterTarget, newVoter, isReplace); err != nil {
// It does not seem possible to go to the next odd replica state. Note
// that AllocateVoter returns an allocatorError (a PurgatoryError)
// when purgatory is requested.
return nil, errors.Wrap(err, "avoid up-replicating to fragile quorum")
}

// Figure out whether we should be promoting an existing non-voting replica to
@@ -1222,7 +1204,7 @@
})
ops = roachpb.MakeReplicationChanges(roachpb.ADD_VOTER, newVoter)
}
if removeIdx < 0 {
if !isReplace {
log.KvDistribution.Infof(ctx, "adding voter %+v: %s",
newVoter, rangeRaftProgress(repl.RaftStatus(), existingVoters))
} else {