From f85d165c8e640392adeae24458254e1457e64ecf Mon Sep 17 00:00:00 2001 From: irfan sharif Date: Thu, 20 Jan 2022 09:31:51 -0500 Subject: [PATCH] kvserver: de-flake TestReplicateQueueUpAndDownReplicateNonVoters Fixes #75135. This test asserted on span configs applying to a scratch range. When stressing, it appeared that sometimes we were not seeing the scratch range adopt the prescribed number of voters/non-voters. Staring at the test itself, we were only nudging the replication queues for the first node in the three node test. It's possible for the scratch range to have been housed on a node other than the first; this commit makes it so that the test nudges queues on all nodes. For good measure, let's also ensure that the split queues process everything, ditto for the snapshot queues. To repro: dev test pkg/kv/kvserver \ -f TestReplicateQueueUpAndDownReplicateNonVoters \ -v --show-logs --timeout 2m --stress Release note: None --- pkg/kv/kvserver/replicate_queue_test.go | 29 ++++++++++++++----------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/pkg/kv/kvserver/replicate_queue_test.go b/pkg/kv/kvserver/replicate_queue_test.go index 51f17bfc3b92..ddfc0b945fa2 100644 --- a/pkg/kv/kvserver/replicate_queue_test.go +++ b/pkg/kv/kvserver/replicate_queue_test.go @@ -301,22 +301,29 @@ func TestReplicateQueueDownReplicate(t *testing.T) { } func scanAndGetNumNonVoters( - ctx context.Context, t *testing.T, tc *testcluster.TestCluster, - store *kvserver.Store, scratchKey roachpb.Key, ) (numNonVoters int) { - // Nudge the replicateQueue to up/down-replicate our scratch range. - if err := store.ForceReplicationScanAndProcess(); err != nil { - t.Fatal(err) + for _, s := range tc.Servers { + // Nudge internal queues to up/down-replicate our scratch range. 
+ require.NoError(t, s.Stores().VisitStores(func(s *kvserver.Store) error { + require.NoError(t, s.ForceSplitScanAndProcess()) + require.NoError(t, s.ForceReplicationScanAndProcess()) + require.NoError(t, s.ForceRaftSnapshotQueueProcess()) + return nil + })) } scratchRange := tc.LookupRangeOrFatal(t, scratchKey) row := tc.ServerConn(0).QueryRow( `SELECT array_length(non_voting_replicas, 1) FROM crdb_internal.ranges_no_leases WHERE range_id=$1`, scratchRange.GetRangeID()) - err := row.Scan(&numNonVoters) - log.Warningf(ctx, "error while retrieving the number of non-voters: %s", err) + if err := row.Scan(&numNonVoters); err != nil { + if testutils.IsError(err, "converting NULL to int is unsupported") { + return 0 + } + t.Fatal(err) + } return numNonVoters } @@ -329,7 +336,6 @@ func TestReplicateQueueUpAndDownReplicateNonVoters(t *testing.T) { skip.UnderRace(t) defer log.Scope(t).Close(t) - ctx := context.Background() tc := testcluster.StartTestCluster(t, 1, base.TestClusterArgs{ ReplicationMode: base.ReplicationAuto, @@ -360,13 +366,10 @@ func TestReplicateQueueUpAndDownReplicateNonVoters(t *testing.T) { // Add two new servers and expect that 2 non-voters are added to the range. 
tc.AddAndStartServer(t, base.TestServerArgs{}) tc.AddAndStartServer(t, base.TestServerArgs{}) - store, err := tc.Server(0).GetStores().(*kvserver.Stores).GetStore( - tc.Server(0).GetFirstStoreID()) - require.NoError(t, err) var expectedNonVoterCount = 2 testutils.SucceedsSoon(t, func() error { - if found := scanAndGetNumNonVoters(ctx, t, tc, store, scratchKey); found != expectedNonVoterCount { + if found := scanAndGetNumNonVoters(t, tc, scratchKey); found != expectedNonVoterCount { return errors.Errorf("expected upreplication to %d non-voters; found %d", expectedNonVoterCount, found) } @@ -379,7 +382,7 @@ func TestReplicateQueueUpAndDownReplicateNonVoters(t *testing.T) { require.NoError(t, err) expectedNonVoterCount = 0 testutils.SucceedsSoon(t, func() error { - if found := scanAndGetNumNonVoters(ctx, t, tc, store, scratchKey); found != expectedNonVoterCount { + if found := scanAndGetNumNonVoters(t, tc, scratchKey); found != expectedNonVoterCount { return errors.Errorf("expected downreplication to %d non-voters; found %d", expectedNonVoterCount, found) }