Skip to content

Commit

Permalink
kvserver: de-flake TestReplicateQueueUpAndDownReplicateNonVoters
Browse files Browse the repository at this point in the history
Fixes #75135. This test asserted on span configs applying to a scratch
range. When stressing, it appeared that some time we were not seeing the
scratch range adopt the prescribed number of voters/non-voters. Staring
at the test itself, we were only nudging the replication queues for the
first node in the three node test. It's possible for the scratch range
to have been housed on a node other than the first; this commit makes it
so that the test nudges queues on all nodes. For good measure, lets also
ensure that the split queues process everything, ditto for the snapshot
queues.

To repro:

    dev test pkg/kv/kvserver \
      -f TestReplicateQueueUpAndDownReplicateNonVoters \
      -v --show-logs --timeout 2m --stress

Release note: None
  • Loading branch information
irfansharif committed Jan 20, 2022
1 parent fb72381 commit f85d165
Showing 1 changed file with 16 additions and 13 deletions.
29 changes: 16 additions & 13 deletions pkg/kv/kvserver/replicate_queue_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,22 +301,29 @@ func TestReplicateQueueDownReplicate(t *testing.T) {
}

func scanAndGetNumNonVoters(
ctx context.Context,
t *testing.T,
tc *testcluster.TestCluster,
store *kvserver.Store,
scratchKey roachpb.Key,
) (numNonVoters int) {
// Nudge the replicateQueue to up/down-replicate our scratch range.
if err := store.ForceReplicationScanAndProcess(); err != nil {
t.Fatal(err)
for _, s := range tc.Servers {
// Nudge internal queues to up/down-replicate our scratch range.
require.NoError(t, s.Stores().VisitStores(func(s *kvserver.Store) error {
require.NoError(t, s.ForceSplitScanAndProcess())
require.NoError(t, s.ForceReplicationScanAndProcess())
require.NoError(t, s.ForceRaftSnapshotQueueProcess())
return nil
}))
}
scratchRange := tc.LookupRangeOrFatal(t, scratchKey)
row := tc.ServerConn(0).QueryRow(
`SELECT array_length(non_voting_replicas, 1) FROM crdb_internal.ranges_no_leases WHERE range_id=$1`,
scratchRange.GetRangeID())
err := row.Scan(&numNonVoters)
log.Warningf(ctx, "error while retrieving the number of non-voters: %s", err)
if err := row.Scan(&numNonVoters); err != nil {
if testutils.IsError(err, "converting NULL to int is unsupported") {
return 0
}
t.Fatal(err)
}

return numNonVoters
}
Expand All @@ -329,7 +336,6 @@ func TestReplicateQueueUpAndDownReplicateNonVoters(t *testing.T) {
skip.UnderRace(t)
defer log.Scope(t).Close(t)

ctx := context.Background()
tc := testcluster.StartTestCluster(t, 1,
base.TestClusterArgs{
ReplicationMode: base.ReplicationAuto,
Expand Down Expand Up @@ -360,13 +366,10 @@ func TestReplicateQueueUpAndDownReplicateNonVoters(t *testing.T) {
// Add two new servers and expect that 2 non-voters are added to the range.
tc.AddAndStartServer(t, base.TestServerArgs{})
tc.AddAndStartServer(t, base.TestServerArgs{})
store, err := tc.Server(0).GetStores().(*kvserver.Stores).GetStore(
tc.Server(0).GetFirstStoreID())
require.NoError(t, err)

var expectedNonVoterCount = 2
testutils.SucceedsSoon(t, func() error {
if found := scanAndGetNumNonVoters(ctx, t, tc, store, scratchKey); found != expectedNonVoterCount {
if found := scanAndGetNumNonVoters(t, tc, scratchKey); found != expectedNonVoterCount {
return errors.Errorf("expected upreplication to %d non-voters; found %d",
expectedNonVoterCount, found)
}
Expand All @@ -379,7 +382,7 @@ func TestReplicateQueueUpAndDownReplicateNonVoters(t *testing.T) {
require.NoError(t, err)
expectedNonVoterCount = 0
testutils.SucceedsSoon(t, func() error {
if found := scanAndGetNumNonVoters(ctx, t, tc, store, scratchKey); found != expectedNonVoterCount {
if found := scanAndGetNumNonVoters(t, tc, scratchKey); found != expectedNonVoterCount {
return errors.Errorf("expected downreplication to %d non-voters; found %d",
expectedNonVoterCount, found)
}
Expand Down

0 comments on commit f85d165

Please sign in to comment.