Skip to content

Commit

Permalink
allocator: fix priority inversion between replica replace and remove
Browse files Browse the repository at this point in the history
Previously, when a change in cluster size lowers the effective
replication factor (e.g. a system range with RF=5 on a 5-node cluster
and 1 or more decommissioning nodes which results in an effective RF=3)
the allocator would always compute a needed action of "remove", rather
than any higher-priority actions that should be acted upon first, such
as to replace any decommissioning/dead replicas. This change modifies
the allocator to consider how many replicas are being removed.

Epic: CRDB-20924

Release note: None
  • Loading branch information
AlexTalks committed Feb 7, 2023
1 parent 247a543 commit 93dd982
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 23 deletions.
15 changes: 9 additions & 6 deletions pkg/kv/kvserver/allocator/allocatorimpl/allocator.go
Original file line number Diff line number Diff line change
Expand Up @@ -934,6 +934,7 @@ func (a *Allocator) computeAction(
// actions.
haveVoters := len(voterReplicas)
decommissioningVoters := storePool.DecommissioningReplicas(voterReplicas)
postDecommissionVoters := haveVoters - len(decommissioningVoters)
// Node count including dead nodes but excluding
// decommissioning/decommissioned nodes.
clusterNodes := storePool.ClusterNodeCount()
Expand Down Expand Up @@ -975,9 +976,9 @@ func (a *Allocator) computeAction(
return action, action.Priority()
}

if haveVoters == neededVoters && len(deadVoters) > 0 {
if postDecommissionVoters <= neededVoters && len(deadVoters) > 0 {
// Range has dead voter(s). We should up-replicate to add another before
// before removing the dead one. This can avoid permanent data loss in cases
// removing the dead one. This can avoid permanent data loss in cases
// where the node is only temporarily dead, but we remove it from the range
// and lose a second node before we can up-replicate (#25392).
action = AllocatorReplaceDeadVoter
Expand All @@ -986,7 +987,7 @@ func (a *Allocator) computeAction(
return action, action.Priority()
}

if haveVoters == neededVoters && len(decommissioningVoters) > 0 {
if postDecommissionVoters < neededVoters {
// Range has decommissioning voter(s), which should be replaced.
action = AllocatorReplaceDecommissioningVoter
log.KvDistribution.VEventf(ctx, 3, "%s - replacement for %d decommissioning voters priority=%.2f",
Expand Down Expand Up @@ -1041,19 +1042,21 @@ func (a *Allocator) computeAction(
return action, action.Priority()
}

decommissioningNonVoters := storePool.DecommissioningReplicas(nonVoterReplicas)
postDecommissionNonVoters := haveNonVoters - len(decommissioningNonVoters)
liveNonVoters, deadNonVoters := storePool.LiveAndDeadReplicas(
nonVoterReplicas, includeSuspectAndDrainingStores,
)
if haveNonVoters == neededNonVoters && len(deadNonVoters) > 0 {

if postDecommissionNonVoters <= neededNonVoters && len(deadNonVoters) > 0 {
// The range has non-voter(s) on a dead node that we should replace.
action = AllocatorReplaceDeadNonVoter
log.KvDistribution.VEventf(ctx, 3, "%s - replacement for %d dead non-voters priority=%.2f",
action, len(deadNonVoters), action.Priority())
return action, action.Priority()
}

decommissioningNonVoters := storePool.DecommissioningReplicas(nonVoterReplicas)
if haveNonVoters == neededNonVoters && len(decommissioningNonVoters) > 0 {
if postDecommissionNonVoters < neededNonVoters {
// The range has non-voter(s) on a decommissioning node that we should
// replace.
action = AllocatorReplaceDecommissioningNonVoter
Expand Down
31 changes: 23 additions & 8 deletions pkg/kv/kvserver/allocator/allocatorimpl/allocator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6810,7 +6810,7 @@ func TestAllocatorComputeActionDecommission(t *testing.T) {
},
},
},
expectedAction: AllocatorRemoveDeadVoter,
expectedAction: AllocatorReplaceDeadVoter,
live: []roachpb.StoreID{1, 4},
dead: []roachpb.StoreID{2},
decommissioning: []roachpb.StoreID{3},
Expand Down Expand Up @@ -6843,7 +6843,7 @@ func TestAllocatorComputeActionDecommission(t *testing.T) {
},
},
},
expectedAction: AllocatorRemoveDeadVoter,
expectedAction: AllocatorReplaceDeadVoter,
live: []roachpb.StoreID{1, 4},
dead: nil,
decommissioning: []roachpb.StoreID{3},
Expand Down Expand Up @@ -6903,7 +6903,7 @@ func TestAllocatorComputeActionDecommission(t *testing.T) {
},
},
},
expectedAction: AllocatorRemoveDecommissioningVoter,
expectedAction: AllocatorReplaceDecommissioningVoter,
live: []roachpb.StoreID{4},
dead: nil,
decommissioning: []roachpb.StoreID{1, 2, 3},
Expand Down Expand Up @@ -7090,7 +7090,7 @@ func TestAllocatorComputeActionWithStorePoolDecommission(t *testing.T) {
},
},
},
expectedAction: AllocatorRemoveDeadVoter,
expectedAction: AllocatorReplaceDeadVoter,
live: []roachpb.StoreID{1, 4},
dead: []roachpb.StoreID{2},
decommissioning: []roachpb.StoreID{3},
Expand Down Expand Up @@ -7123,7 +7123,7 @@ func TestAllocatorComputeActionWithStorePoolDecommission(t *testing.T) {
},
},
},
expectedAction: AllocatorRemoveDeadVoter,
expectedAction: AllocatorReplaceDeadVoter,
live: []roachpb.StoreID{1, 4},
dead: nil,
decommissioning: []roachpb.StoreID{3},
Expand Down Expand Up @@ -7183,7 +7183,7 @@ func TestAllocatorComputeActionWithStorePoolDecommission(t *testing.T) {
},
},
},
expectedAction: AllocatorRemoveDecommissioningVoter,
expectedAction: AllocatorReplaceDecommissioningVoter,
live: []roachpb.StoreID{4},
dead: nil,
decommissioning: []roachpb.StoreID{1, 2, 3},
Expand Down Expand Up @@ -7330,6 +7330,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
// set. We are checking that the effective replication factor is rounded down
// to the number of stores which are not decommissioned or decommissioning.
testCases := []struct {
name string
storeList []roachpb.StoreID
expectedNumReplicas int
expectedAction AllocatorAction
Expand All @@ -7342,16 +7343,18 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
// Four known stores, three of them are decommissioning, so effective
// replication factor would be 1 if we hadn't decided that we'll never
// drop past 3, so 3 it is.
name: "four replicas with three decommissioning",
storeList: []roachpb.StoreID{1, 2, 3, 4},
expectedNumReplicas: 3,
expectedAction: AllocatorRemoveDecommissioningVoter,
expectedAction: AllocatorReplaceDecommissioningVoter,
live: []roachpb.StoreID{4},
unavailable: nil,
dead: nil,
decommissioning: []roachpb.StoreID{1, 2, 3},
},
{
// Ditto.
name: "three replicas with three decommissioning",
storeList: []roachpb.StoreID{1, 2, 3},
expectedNumReplicas: 3,
expectedAction: AllocatorReplaceDecommissioningVoter,
Expand All @@ -7365,6 +7368,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
// factor would be even (four), in which case we drop down one more
// to three. Then the right thing becomes removing the dead replica
// from the range at hand, rather than trying to replace it.
name: "four replicas with one dead",
storeList: []roachpb.StoreID{1, 2, 3, 4},
expectedNumReplicas: 3,
expectedAction: AllocatorRemoveDeadVoter,
Expand All @@ -7378,6 +7382,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
// in the system which amounts to an effective replication factor
// of three (avoiding the even number). Adding a replica is more
// important than replacing the dead one.
name: "two replicas with one dead",
storeList: []roachpb.StoreID{1, 4},
expectedNumReplicas: 3,
expectedAction: AllocatorAddVoter,
Expand All @@ -7388,6 +7393,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
},
{
// Similar to above, but nothing to do.
name: "three replicas with nothing to do",
storeList: []roachpb.StoreID{1, 2, 3},
expectedNumReplicas: 3,
expectedAction: AllocatorConsiderRebalance,
Expand All @@ -7400,6 +7406,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
// Effective replication factor can't dip below three (unless the
// span config explicitly asks for that, which it does not), so three
// it is and we are under-replicaed.
name: "RF stays above three",
storeList: []roachpb.StoreID{1, 2},
expectedNumReplicas: 3,
expectedAction: AllocatorAddVoter,
Expand All @@ -7410,6 +7417,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
},
{
// Three and happy.
name: "three and happy",
storeList: []roachpb.StoreID{1, 2, 3},
expectedNumReplicas: 3,
expectedAction: AllocatorConsiderRebalance,
Expand All @@ -7420,6 +7428,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
},
{
// Three again, on account of avoiding the even four.
name: "avoid even replicas",
storeList: []roachpb.StoreID{1, 2, 3, 4},
expectedNumReplicas: 3,
expectedAction: AllocatorRemoveVoter,
Expand All @@ -7431,6 +7440,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
{
// The usual case in which there are enough nodes to accommodate the
// span config.
name: "five and happy",
storeList: []roachpb.StoreID{1, 2, 3, 4, 5},
expectedNumReplicas: 5,
expectedAction: AllocatorConsiderRebalance,
Expand All @@ -7442,6 +7452,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
{
// No dead or decommissioning node and enough nodes around, so
// sticking with the span config.
name: "five and happy with one unavailable",
storeList: []roachpb.StoreID{1, 2, 3, 4, 5},
expectedNumReplicas: 5,
expectedAction: AllocatorConsiderRebalance,
Expand All @@ -7452,6 +7463,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
},
{
// Ditto.
name: "five and happy with two unavailable",
storeList: []roachpb.StoreID{1, 2, 3, 4, 5},
expectedNumReplicas: 5,
expectedAction: AllocatorConsiderRebalance,
Expand All @@ -7462,6 +7474,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
},
{
// Ditto, but we've lost quorum.
name: "five with lost quorum",
storeList: []roachpb.StoreID{1, 2, 3, 4, 5},
expectedNumReplicas: 5,
expectedAction: AllocatorRangeUnavailable,
Expand All @@ -7474,6 +7487,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
// Ditto (dead nodes don't reduce NumReplicas, only decommissioning
// or decommissioned do, and both correspond to the 'decommissioning'
// slice in these tests).
name: "five with lost quorum on one dead and one unavailable",
storeList: []roachpb.StoreID{1, 2, 3, 4, 5},
expectedNumReplicas: 5,
expectedAction: AllocatorReplaceDeadVoter,
Expand All @@ -7485,6 +7499,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
{
// Avoiding four, so getting three, and since there is no dead store
// the most important thing is removing a decommissioning replica.
name: "five with one decommissioning and one unavailable",
storeList: []roachpb.StoreID{1, 2, 3, 4, 5},
expectedNumReplicas: 3,
expectedAction: AllocatorRemoveDecommissioningVoter,
Expand Down Expand Up @@ -7514,7 +7529,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
roachpb.RKey(keys.SystemPrefix),
} {
for _, c := range testCases {
t.Run(prefixKey.String(), func(t *testing.T) {
t.Run(fmt.Sprintf("%s%s", c.name, prefixKey), func(t *testing.T) {
numNodes = len(c.storeList) - len(c.decommissioning)
mockStorePool(sp, c.live, c.unavailable, c.dead,
c.decommissioning, nil, nil)
Expand Down
95 changes: 86 additions & 9 deletions pkg/kv/kvserver/store_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3469,7 +3469,7 @@ func TestAllocatorCheckRange(t *testing.T) {
}{
{
name: "overreplicated",
stores: multiRegionStores,
stores: multiRegionStores, // Nine stores
existingReplicas: []roachpb.ReplicaDescriptor{
{NodeID: 1, StoreID: 1, ReplicaID: 1},
{NodeID: 2, StoreID: 2, ReplicaID: 2},
Expand All @@ -3482,7 +3482,7 @@ func TestAllocatorCheckRange(t *testing.T) {
},
{
name: "overreplicated but store dead",
stores: multiRegionStores,
stores: multiRegionStores, // Nine stores
existingReplicas: []roachpb.ReplicaDescriptor{
{NodeID: 1, StoreID: 1, ReplicaID: 1},
{NodeID: 2, StoreID: 2, ReplicaID: 2},
Expand All @@ -3497,7 +3497,7 @@ func TestAllocatorCheckRange(t *testing.T) {
},
{
name: "decommissioning but underreplicated",
stores: multiRegionStores,
stores: multiRegionStores, // Nine stores
existingReplicas: []roachpb.ReplicaDescriptor{
{NodeID: 1, StoreID: 1, ReplicaID: 1},
{NodeID: 2, StoreID: 2, ReplicaID: 2},
Expand All @@ -3511,7 +3511,7 @@ func TestAllocatorCheckRange(t *testing.T) {
},
{
name: "decommissioning with replacement",
stores: multiRegionStores,
stores: multiRegionStores, // Nine stores
existingReplicas: []roachpb.ReplicaDescriptor{
{NodeID: 1, StoreID: 1, ReplicaID: 1},
{NodeID: 2, StoreID: 2, ReplicaID: 2},
Expand Down Expand Up @@ -3541,7 +3541,7 @@ func TestAllocatorCheckRange(t *testing.T) {
},
{
name: "five to four nodes at RF five",
stores: noLocalityStores,
stores: noLocalityStores, // Five stores
existingReplicas: []roachpb.ReplicaDescriptor{
{NodeID: 1, StoreID: 1, ReplicaID: 1},
{NodeID: 2, StoreID: 2, ReplicaID: 2},
Expand All @@ -3558,6 +3558,83 @@ func TestAllocatorCheckRange(t *testing.T) {
expectErr: false,
expectValidTarget: false,
},
{
name: "five to two nodes at RF five replaces first",
stores: noLocalityStores, // Five stores
existingReplicas: []roachpb.ReplicaDescriptor{
{NodeID: 1, StoreID: 1, ReplicaID: 1},
{NodeID: 2, StoreID: 2, ReplicaID: 2},
{NodeID: 3, StoreID: 3, ReplicaID: 3},
{NodeID: 4, StoreID: 4, ReplicaID: 4},
{NodeID: 5, StoreID: 5, ReplicaID: 5},
},
zoneConfig: zonepb.DefaultSystemZoneConfigRef(),
livenessOverrides: map[roachpb.NodeID]livenesspb.NodeLivenessStatus{
3: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
4: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
5: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
},
baselineExpNoop: true,
expectedAction: allocatorimpl.AllocatorReplaceDecommissioningVoter,
expectValidTarget: false,
expectAllocatorErr: true,
expectedErrStr: "likely not enough nodes in cluster",
},
{
name: "replace first when lowering effective RF",
stores: multiRegionStores, // Nine stores
existingReplicas: []roachpb.ReplicaDescriptor{
// Region "a"
{NodeID: 1, StoreID: 1, ReplicaID: 1},
{NodeID: 2, StoreID: 2, ReplicaID: 2},
// Region "b"
{NodeID: 4, StoreID: 4, ReplicaID: 3},
{NodeID: 5, StoreID: 5, ReplicaID: 4},
// Region "c"
{NodeID: 7, StoreID: 7, ReplicaID: 5},
},
zoneConfig: zonepb.DefaultSystemZoneConfigRef(),
livenessOverrides: map[roachpb.NodeID]livenesspb.NodeLivenessStatus{
// Downsize to one node per region: 3,6,9.
1: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
2: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
4: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
5: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
7: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
8: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
},
expectedAction: allocatorimpl.AllocatorReplaceDecommissioningVoter,
expectErr: false,
expectValidTarget: true,
},
{
name: "replace dead first when lowering effective RF",
stores: multiRegionStores, // Nine stores
existingReplicas: []roachpb.ReplicaDescriptor{
// Region "a"
{NodeID: 1, StoreID: 1, ReplicaID: 1},
{NodeID: 2, StoreID: 2, ReplicaID: 2},
// Region "b"
{NodeID: 4, StoreID: 4, ReplicaID: 3},
{NodeID: 5, StoreID: 5, ReplicaID: 4},
// Region "c"
{NodeID: 7, StoreID: 7, ReplicaID: 5},
},
zoneConfig: zonepb.DefaultSystemZoneConfigRef(),
livenessOverrides: map[roachpb.NodeID]livenesspb.NodeLivenessStatus{
// Downsize to: 1,4,7,9 but 7 is dead.
// Replica on n7 should be replaced first.
2: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
3: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
5: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
6: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
7: livenesspb.NodeLivenessStatus_DEAD,
8: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
},
expectedAction: allocatorimpl.AllocatorReplaceDeadVoter,
expectErr: false,
expectValidTarget: true,
},
} {
t.Run(tc.name, func(t *testing.T) {
// Setup store pool based on store descriptors and configure test store.
Expand Down Expand Up @@ -3629,6 +3706,10 @@ func TestAllocatorCheckRange(t *testing.T) {
true /* collectTraces */, storePoolOverride,
)

require.Equalf(t, tc.expectedAction, action,
"expected action \"%s\", got action \"%s\"", tc.expectedAction, action,
)

// Validate expectations from test case.
if tc.expectErr || tc.expectAllocatorErr {
require.Error(t, err)
Expand All @@ -3647,10 +3728,6 @@ func TestAllocatorCheckRange(t *testing.T) {
require.NoError(t, err)
}

require.Equalf(t, tc.expectedAction, action,
"expected action \"%s\", got action \"%s\"", tc.expectedAction, action,
)

if tc.expectValidTarget {
require.NotEqualf(t, roachpb.ReplicationTarget{}, target, "expected valid target")
}
Expand Down

0 comments on commit 93dd982

Please sign in to comment.