Skip to content

Commit

Permalink
kvserver: rebalance ranges with one voter using joint configurations
Browse files Browse the repository at this point in the history
The allocator would add a voter, instead of both adding and removing the
existing voter when rebalancing ranges with one replica. Removing the
leaseholder replica was not possible prior to cockroachdb#74077, so the addition
only was necessary.

This restriction is no longer necessary. Allow rebalancing a one voter
range between stores using joint configurations, where the lease will be
transferred to the incoming voter store, from the outgoing demoting
voter.

Scattering ranges with one voter will now leave the range with exactly
one voter, where previously both the leaseholder voter evaluating the
scatter, and the new voter would be left.

Before this patch, scattering 1000 ranges with RF=1 on a 5 store
cluster:

```
  store_id | replica_count | replica_distribution | lease_count | lease_distribution
-----------+---------------+----------------------+-------------+---------------------
         1 |          1001 | ##########           |         500 | ##########
         5 |           291 | ###                  |         147 | ###
         4 |           275 | ###                  |         137 | ###
         3 |           229 | ###                  |         118 | ###
         2 |           206 | ###                  |          99 | ##
```

After:

```
  store_id | replica_count | replica_distribution | lease_count | lease_distribution
-----------+---------------+----------------------+-------------+---------------------
         2 |           242 | ##########           |         241 | ##########
         4 |           227 | ##########           |         227 | ##########
         5 |           217 | #########            |         216 | #########
         3 |           209 | #########            |         208 | #########
         1 |           106 | #####                |         109 | #####
```

Fixes: cockroachdb#108420
Fixes: cockroachdb#124171

Release note (bug fix): Scattering a range with replication factor=1, no
longer erroneously up-replicates the range to two replicas. Leases will
also no longer thrash between nodes when perturbed with replication
factor=1.
  • Loading branch information
kvoli committed May 16, 2024
1 parent 122a1b4 commit e6eaca8
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 159 deletions.
46 changes: 0 additions & 46 deletions pkg/kv/kvserver/allocator/plan/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ import (
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/allocatorimpl"
"github.com/cockroachdb/cockroach/pkg/raft"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
)
Expand All @@ -36,51 +35,6 @@ func ReplicationChangesForRebalance(
rebalanceTargetType allocatorimpl.TargetReplicaType,
) (chgs []kvpb.ReplicationChange, performingSwap bool, err error) {
rdesc, found := desc.GetReplicaDescriptor(addTarget.StoreID)
if rebalanceTargetType == allocatorimpl.VoterTarget && numExistingVoters == 1 {
// If there's only one replica, the removal target is the
// leaseholder and this is unsupported and will fail. However,
// this is also the only way to rebalance in a single-replica
// range. If we try the atomic swap here, we'll fail doing
// nothing, and so we stay locked into the current distribution
// of replicas. (Note that maybeTransferLeaseAway above will not
// have found a target, and so will have returned (false, nil).
//
// Do the best thing we can, which is carry out the addition
// only, which should succeed, and the next time we touch this
// range, we will have one more replica and hopefully it will
// take the lease and remove the current leaseholder.
//
// It's possible that "rebalancing deadlock" can occur in other
// scenarios, it's really impossible to tell from the code given
// the constraints we support. However, the lease transfer often
// does not happen spuriously, and we can't enter dangerous
// configurations sporadically, so this code path is only hit
// when we know it's necessary, picking the smaller of two evils.
//
// See https://github.com/cockroachdb/cockroach/issues/40333.
log.KvDistribution.Infof(ctx, "can't swap replica due to lease; falling back to add")

// Even when there is only 1 existing voter, there may be other replica
// types in the range. Check if the add target already has a replica, if so
// it must be a non-voter or the rebalance is invalid.
if found && rdesc.Type == roachpb.NON_VOTER {
// The receiving store already has a non-voting replica. Instead of just
// adding a voter to the receiving store, we *must* promote the non-voting
// replica to a voter.
chgs = kvpb.ReplicationChangesForPromotion(addTarget)
} else if !found {
chgs = []kvpb.ReplicationChange{
{ChangeType: roachpb.ADD_VOTER, Target: addTarget},
}
} else {
return nil, false, errors.AssertionFailedf(
"invalid rebalancing decision: trying to"+
" move voter to a store that already has a replica %s for the range", rdesc,
)
}
return chgs, false, err
}

switch rebalanceTargetType {
case allocatorimpl.VoterTarget:
// Check if the target being added already has a non-voting replica.
Expand Down
56 changes: 28 additions & 28 deletions pkg/kv/kvserver/asim/tests/testdata/non_rand/one_voter
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,18 @@ plot stat=replicas
----
----

100.00 ┼──────╭───────────────
93.33 ┤ ╭╯ ╰╮ ╰─╮
86.67 ┤ │ ╰╮ ╭╰╮─────────
80.00 ┤ │ │ ╭╯ │ ╭─────────╮╮
73.33 ┤ ╭╯ ╰╮ ╭─╯ ╰╮ ╭╯ ╰╮ ╭╮╰────╮─────╭
66.67 ┤ │ ╰╮ │ ╰╮ ╭╯ ╰╮ ╭─╯╰╯ ╰─╮╭──╯╰───────╮─╮
60.00 ┤ ╭╯ │ ╭─╯ │ ╭──╯ ╰╮│ ╰╯ ╰╯ ╰──╰─────────────
53.33 ┤ │ ╰╮ ╭╯ ╰──╯ ╰╯ ╰───╯╰
46.67 ┤ ╭╯ │ │
40.00 ┤ │ ╰╮ ╭╯
33.33 ┤ │ │╭╯
26.67 ┤ ╭╯ ╰╯
100.00 ┼╮
93.33 ┤
86.67 ┤
80.00 ┤
73.33 ┤
66.67 ┤
60.00 ┤
53.33 ┤ ╰────────────────────────────────────────────────────────────────────────────
46.67 ┤ ╭────────────────────────────────────────────────────────────────────────────
40.00 ┤ │
33.33 ┤ │
26.67 ┤ ╭╯
20.00 ┤ │
13.33 ┤╭╯
6.67 ┤│
Expand All @@ -38,22 +38,22 @@ plot stat=leases
----
----

100.00 ┼──────╮ ╭───────
93.33 ┤ ╰╮ ╭╯ ╰╮
86.67 ┤ │ │ │ ╭──
80.00 ┤ ╰╮ ╭╯ ╰╮ ╭─╯ ╰╮
73.33 ┤ │ │ │ ╭╯ ╰╮ ╭────╮ ╭───
66.67 ┤ ╰╮╭╯ ╰╮ ╭╯ ╰╮ ╭╯ ╰╮ ╭╯ ╰╮
60.00 ┤ ││ ╰╮│ ╰╮╭╯ ╰╮╭╯ ╰╮ ╭───╮ ╭───╮
53.33 ┤ ╭╯ │╯ ╭╯ ╰╮ ╰╭─╯ ╰──╮─╯ ╰─╭────
46.67 ┤ │╮ ╰╮ │╮ ╭│ ╭╯─╮ ╭─╯╰─╮ ╭─╯────
40.00 ┤ ││ ╭╯│ ╭╯╰╮ ╭╯╰╮ ╭╯ ╰───╯ ╰───╯
33.33 ┤ ╭╯╰╮ ╭╯ ╰╮ ╭╯ ╰╮ ╭╯ ╰╮ ╭╯
26.67 ┤ │ │ │ ╰╮ ╭╯ ╰────╯ ╰───
20.00 ┤ ╭╯ ╰╮ ╭╯ ╰─╮ ╭╯
13.33 ┤ │ │ ╭╯ ╰──
6.67 ┤ ╭╯ ╰╮
0.00 ┼──────╯ ╰───────
100.00 ┼╮
93.33 ┤
86.67 ┤
80.00 ┤
73.33 ┤
66.67 ┤
60.00 ┤
53.33 ┤ ╰────────────────────────────────────────────────────────────────────────────
46.67 ┤ ╭────────────────────────────────────────────────────────────────────────────
40.00 ┤
33.33 ┤
26.67 ┤
20.00 ┤
13.33 ┤
6.67 ┤│
0.00 ┼╯
leases
----
----
Expand Down
87 changes: 2 additions & 85 deletions pkg/kv/kvserver/asim/tests/testdata/rand/rand_ranges
Original file line number Diff line number Diff line change
Expand Up @@ -177,48 +177,7 @@ configurations generated using seed 2643318057788968173
randomized ranges with placement_type=random, ranges=944, key_space=150098, replication_factor=1, bytes=0
basic load with rw_ratio=0.00, rate=0.00, skewed_access=false, min_block_size=1, max_block_size=1, min_key=1, max_key=200000
number of mutation events=0, number of assertion events=0
initial state at 2022-03-21 11:00:00:
stores(15)=[s1n1=(replicas(64)),s2n2=(replicas(51)),s3n3=(replicas(12)),s4n4=(replicas(25)),s5n5=(replicas(115)),s6n6=(replicas(90)),s7n7=(replicas(114)),s8n8=(replicas(25)),s9n9=(replicas(26)),s10n10=(replicas(64)),s11n11=(replicas(0)),s12n12=(replicas(114)),s13n13=(replicas(103)),s14n14=(replicas(51)),s15n15=(replicas(90))]
topology:
US
US_1
└── [1 2 3 4 5]
US_2
└── [6 7 8 9 10]
US_3
└── [11 12 13 14 15]
no events were scheduled
sample2: failed assertion
conformance unavailable=0 under=0 over=0 violating=0 lease-violating=0 lease-less-preferred=0
actual unavailable=0 under=0, over=27 violating=0 lease-violating=0 lease-less-preferred=0
over replicated:
r63:00000{09858-10017} [(n1,s1):4, (n9,s9):5] applying ttl_seconds=0 num_replicas=1 num_voters=1
r77:0000012{084-243} [(n4,s4):2, (n9,s9):3] applying ttl_seconds=0 num_replicas=1 num_voters=1
r174:0000027{507-666} [(n6,s6):4, (n9,s9):5] applying ttl_seconds=0 num_replicas=1 num_voters=1
r192:0000030{369-528} [(n6,s6):3, (n9,s9):4] applying ttl_seconds=0 num_replicas=1 num_voters=1
r237:0000037{524-683} [(n5,s5):3, (n9,s9):4] applying ttl_seconds=0 num_replicas=1 num_voters=1
r255:0000040{386-545} [(n8,s8):2, (n9,s9):3] applying ttl_seconds=0 num_replicas=1 num_voters=1
r348:0000055{173-332} [(n1,s1):1, (n9,s9):2] applying ttl_seconds=0 num_replicas=1 num_voters=1
r370:0000058{671-830} [(n5,s5):3, (n9,s9):4] applying ttl_seconds=0 num_replicas=1 num_voters=1
r378:00000{59943-60102} [(n8,s8):3, (n9,s9):4] applying ttl_seconds=0 num_replicas=1 num_voters=1
r387:0000061{374-533} [(n12,s12):3, (n9,s9):4] applying ttl_seconds=0 num_replicas=1 num_voters=1
r419:0000066{462-621} [(n1,s1):6, (n9,s9):7] applying ttl_seconds=0 num_replicas=1 num_voters=1
r483:0000076{638-797} [(n2,s2):4, (n9,s9):5] applying ttl_seconds=0 num_replicas=1 num_voters=1
r513:0000081{408-567} [(n6,s6):3, (n9,s9):4] applying ttl_seconds=0 num_replicas=1 num_voters=1
r599:0000095{082-241} [(n11,s11):2, (n9,s9):3] applying ttl_seconds=0 num_replicas=1 num_voters=1
r621:0000098{580-739} [(n14,s14):2, (n9,s9):3] applying ttl_seconds=0 num_replicas=1 num_voters=1
r637:0000101{124-283} [(n13,s13):4, (n9,s9):5] applying ttl_seconds=0 num_replicas=1 num_voters=1
r644:0000102{237-396} [(n2,s2):2, (n9,s9):3] applying ttl_seconds=0 num_replicas=1 num_voters=1
r645:0000102{396-555} [(n3,s3):2, (n9,s9):3] applying ttl_seconds=0 num_replicas=1 num_voters=1
r681:0000108{120-279} [(n12,s12):3, (n9,s9):4] applying ttl_seconds=0 num_replicas=1 num_voters=1
r693:0000110{028-187} [(n14,s14):3, (n9,s9):4] applying ttl_seconds=0 num_replicas=1 num_voters=1
r712:0000113{049-208} [(n11,s11):3, (n9,s9):4] applying ttl_seconds=0 num_replicas=1 num_voters=1
r732:0000116{229-388} [(n5,s5):3, (n9,s9):4] applying ttl_seconds=0 num_replicas=1 num_voters=1
r740:0000117{501-660} [(n12,s12):3, (n9,s9):4] applying ttl_seconds=0 num_replicas=1 num_voters=1
r763:0000121{158-317} [(n10,s10):4, (n9,s9):5] applying ttl_seconds=0 num_replicas=1 num_voters=1
r796:0000126{405-564} [(n10,s10):4, (n9,s9):5] applying ttl_seconds=0 num_replicas=1 num_voters=1
r805:0000127{836-995} [(n5,s5):3, (n9,s9):4] applying ttl_seconds=0 num_replicas=1 num_voters=1
r928:0000147{393-552} [(n12,s12):2, (n9,s9):3] applying ttl_seconds=0 num_replicas=1 num_voters=1
sample2: pass
----------------------------------
sample3: start running
configurations generated using seed 6972490225919430754
Expand All @@ -227,47 +186,5 @@ configurations generated using seed 6972490225919430754
randomized ranges with placement_type=random, ranges=479, key_space=199954, replication_factor=1, bytes=0
basic load with rw_ratio=0.00, rate=0.00, skewed_access=false, min_block_size=1, max_block_size=1, min_key=1, max_key=200000
number of mutation events=0, number of assertion events=0
initial state at 2022-03-21 11:00:00:
stores(15)=[s1n1=(replicas(27)),s2n2=(replicas(1)),s3n3=(replicas(40)),s4n4=(replicas(13)),s5n5=(replicas(26)),s6n6=(replicas(13)),s7n7=(replicas(60)),s8n8=(replicas(60)),s9n9=(replicas(53)),s10n10=(replicas(14)),s11n11=(replicas(59)),s12n12=(replicas(6)),s13n13=(replicas(13)),s14n14=(replicas(40)),s15n15=(replicas(54))]
topology:
US
US_1
└── [1 2 3 4 5]
US_2
└── [6 7 8 9 10]
US_3
└── [11 12 13 14 15]
no events were scheduled
sample3: failed assertion
conformance unavailable=0 under=0 over=0 violating=0 lease-violating=0 lease-less-preferred=0
actual unavailable=0 under=0, over=28 violating=0 lease-violating=0 lease-less-preferred=0
over replicated:
r17:000000{6672-7089} [(n10,s10):4, (n5,s5):5] applying ttl_seconds=0 num_replicas=1 num_voters=1
r39:000001{5846-6263} [(n8,s8):6, (n1,s1):7] applying ttl_seconds=0 num_replicas=1 num_voters=1
r88:0000036{279-696} [(n6,s6):6, (n14,s14):7] applying ttl_seconds=0 num_replicas=1 num_voters=1
r93:0000038{364-781} [(n14,s14):5, (n8,s8):6] applying ttl_seconds=0 num_replicas=1 num_voters=1
r115:0000047{538-955} [(n8,s8):7, (n10,s10):8] applying ttl_seconds=0 num_replicas=1 num_voters=1
r128:000005{2959-3376} [(n6,s6):4, (n1,s1):5] applying ttl_seconds=0 num_replicas=1 num_voters=1
r137:000005{6712-7129} [(n1,s1):7, (n3,s3):8] applying ttl_seconds=0 num_replicas=1 num_voters=1
r139:0000057{546-963} [(n6,s6):3, (n8,s8):4] applying ttl_seconds=0 num_replicas=1 num_voters=1
r197:000008{1732-2149} [(n14,s14):5, (n10,s10):6] applying ttl_seconds=0 num_replicas=1 num_voters=1
r198:0000082{149-566} [(n6,s6):2, (n14,s14):3] applying ttl_seconds=0 num_replicas=1 num_voters=1
r201:0000083{400-817} [(n10,s10):4, (n1,s1):5] applying ttl_seconds=0 num_replicas=1 num_voters=1
r207:000008{5902-6319} [(n6,s6):4, (n14,s14):5] applying ttl_seconds=0 num_replicas=1 num_voters=1
r211:0000087{570-987} [(n10,s10):3, (n4,s4):4] applying ttl_seconds=0 num_replicas=1 num_voters=1
r214:000008{8821-9238} [(n1,s1):5, (n10,s10):6] applying ttl_seconds=0 num_replicas=1 num_voters=1
r219:000009{0906-1323} [(n8,s8):4, (n1,s1):5] applying ttl_seconds=0 num_replicas=1 num_voters=1
r230:0000095{493-910} [(n8,s8):4, (n10,s10):5] applying ttl_seconds=0 num_replicas=1 num_voters=1
r244:0000101{331-748} [(n10,s10):2, (n1,s1):3] applying ttl_seconds=0 num_replicas=1 num_voters=1
r253:0000105{084-501} [(n10,s10):2, (n1,s1):3] applying ttl_seconds=0 num_replicas=1 num_voters=1
r267:000011{0922-1339} [(n14,s14):5, (n10,s10):6] applying ttl_seconds=0 num_replicas=1 num_voters=1
r349:0000145{116-533} [(n10,s10):3, (n9,s9):4] applying ttl_seconds=0 num_replicas=1 num_voters=1
r353:000014{6784-7201} [(n10,s10):5, (n1,s1):6] applying ttl_seconds=0 num_replicas=1 num_voters=1
r403:000016{7634-8051} [(n14,s14):4, (n8,s8):5] applying ttl_seconds=0 num_replicas=1 num_voters=1
r421:0000175{140-557} [(n12,s12):4, (n10,s10):5] applying ttl_seconds=0 num_replicas=1 num_voters=1
r425:000017{6808-7225} [(n14,s14):4, (n1,s1):5] applying ttl_seconds=0 num_replicas=1 num_voters=1
r431:0000179{310-727} [(n12,s12):5, (n8,s8):6] applying ttl_seconds=0 num_replicas=1 num_voters=1
r433:0000180{144-561} [(n10,s10):4, (n1,s1):5] applying ttl_seconds=0 num_replicas=1 num_voters=1
r455:0000189{318-735} [(n6,s6):3, (n10,s10):4] applying ttl_seconds=0 num_replicas=1 num_voters=1
r477:0000198{492-909} [(n1,s1):4, (n7,s7):5] applying ttl_seconds=0 num_replicas=1 num_voters=1
sample3: pass
----------------------------------
99 changes: 99 additions & 0 deletions pkg/sql/scatter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ import (
"testing"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/config/zonepb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/server"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/desctestutils"
"github.com/cockroachdb/cockroach/pkg/sql/randgen"
"github.com/cockroachdb/cockroach/pkg/testutils"
Expand All @@ -27,6 +29,8 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/errors"
"github.com/gogo/protobuf/proto"
"github.com/stretchr/testify/require"
)

func TestScatterRandomizeLeases(t *testing.T) {
Expand Down Expand Up @@ -176,3 +180,98 @@ func TestScatterResponse(t *testing.T) {
t.Fatalf("expected %d rows, but got %d", e, a)
}
}

// TestScatterWithOneVoter tests that the scatter command works when the
// replication factor is set to 1. We expect that the total number of replicas
// remains unchanged and that the scattering store loses some replicas. Note we
// don't assert on the final distribution being even across all stores, scatter
// promises randomness, not necessarily uniformity.
func TestScatterWithOneVoter(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

skip.UnderRace(t) // Too slow under stressrace.
skip.UnderDeadlock(t)
skip.UnderShort(t)

zcfg := zonepb.DefaultZoneConfig()
zcfg.NumReplicas = proto.Int32(1)

tc := serverutils.StartCluster(t, 3, base.TestClusterArgs{
ReplicationMode: base.ReplicationManual,
ServerArgs: base.TestServerArgs{
Knobs: base.TestingKnobs{
Server: &server.TestingKnobs{
DefaultZoneConfigOverride: &zcfg,
},
},
},
})
defer tc.Stopper().Stop(context.Background())

sqlutils.CreateTable(
t, tc.ServerConn(0), "t",
"k INT PRIMARY KEY, v INT",
500,
sqlutils.ToRowFn(sqlutils.RowIdxFn, sqlutils.RowModuloFn(10)),
)

r := sqlutils.MakeSQLRunner(tc.ServerConn(0))

// Create 49 splits, for 50 ranges in the test table.
r.Exec(t, "ALTER TABLE test.t SPLIT AT (SELECT i*10 FROM generate_series(1, 49) AS g(i))")

getReplicaCounts := func() (map[int]int, int, error) {
rows := r.Query(t, `
WITH ranges_info AS (
SHOW RANGES FROM TABLE test.t
)
SELECT
store_id,
count(*) AS replica_count
FROM
(
SELECT
unnest(replicas) AS store_id
FROM
ranges_info
) AS store_replicas
GROUP BY
store_id;`)
replicaCounts := make(map[int]int)
totalReplicas := 0
numRows := 0
for ; rows.Next(); numRows++ {
var storeID, replicaCount int
if err := rows.Scan(&storeID, &replicaCount); err != nil {
return nil, 0, err
}
replicaCounts[storeID] = replicaCount
totalReplicas += replicaCount
}
if err := rows.Err(); err != nil {
return nil, 0, err
}
return replicaCounts, totalReplicas, nil
}

oldReplicaCounts, oldTotalReplicas, err := getReplicaCounts()
if err != nil {
t.Fatal(err)
}

// Expect that the number of replicas on store 1 to have changed. We can't
// assert that the distribution will be even across all three stores, but s1
// (the initial leaseholder and replica) should have a different number of
// replicas than before. rebalancing is otherwise disabled in this test, so
// the only replica movements are from the scatter.
r.Exec(t, "ALTER TABLE test.t SCATTER")
newReplicaCounts, newTotalReplicas, err := getReplicaCounts()
require.NoError(t, err)

require.Equal(t, oldTotalReplicas, newTotalReplicas,
"expected total replica count to remain the same post-scatter, "+
"old replica counts(%d): %v, new replica counts(%d): %v",
oldTotalReplicas, oldReplicaCounts, newTotalReplicas, newReplicaCounts)
require.NotEqual(t, oldReplicaCounts[1], newReplicaCounts[1])
}

0 comments on commit e6eaca8

Please sign in to comment.