Skip to content

Commit

Permalink
Merge #60935
Browse files Browse the repository at this point in the history
60935: kvserver: fix flake in TestReportUnreachableRemoveRace r=lunevalex a=lunevalex

Fixes #59209

This cyles through adding/removing replicas, we need to wait for
the removal and addition to go through, before trying the next round
otherwise the cluster could be still taking out a replica when we try to
add it back, or it could still be adding it when we try to remove it.
Additionally the circuit breaker was triggered on the wrong node, we need
to trigger it on all the nodes but the new leader instead of the otherway around.

Release note: None

Co-authored-by: Alex Lunev <[email protected]>
  • Loading branch information
craig[bot] and lunevalex committed Feb 23, 2021
2 parents 2572200 + 0ed03f8 commit 4317460
Showing 1 changed file with 24 additions and 6 deletions.
30 changes: 24 additions & 6 deletions pkg/kv/kvserver/client_raft_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2520,7 +2520,6 @@ func TestReportUnreachableHeartbeats(t *testing.T) {
// races (primarily in asynchronous coalesced heartbeats).
func TestReportUnreachableRemoveRace(t *testing.T) {
defer leaktest.AfterTest(t)()
skip.WithIssue(t, 59209, "flaky test")
defer log.Scope(t).Close(t)

ctx := context.Background()
Expand All @@ -2529,9 +2528,9 @@ func TestReportUnreachableRemoveRace(t *testing.T) {
ReplicationMode: base.ReplicationManual,
})
defer tc.Stopper().Stop(ctx)

key := tc.ScratchRange(t)
key := tc.ScratchRangeWithExpirationLease(t)
tc.AddVotersOrFatal(t, key, tc.Targets(1, 2)...)
require.NoError(t, tc.WaitForVoters(key, tc.Targets(1, 2)...))

outer:
for i := 0; i < 5; i++ {
Expand All @@ -2551,11 +2550,30 @@ outer:
tc.TransferRangeLeaseOrFatal(t, *repl.Desc(), tc.Target(replicaIdx))
}
tc.RemoveVotersOrFatal(t, key, tc.Target(leaderIdx))
cb := tc.Servers[replicaIdx].RaftTransport().GetCircuitBreaker(tc.Target(replicaIdx).NodeID, rpc.DefaultClass)
cb.Break()
// We want to stop all nodes from talking to the replicaIdx, so need
// to trip the breaker on all servers but it.
for i := range tc.Servers {
if i != replicaIdx {
cb := tc.Servers[i].RaftTransport().GetCircuitBreaker(tc.Target(replicaIdx).NodeID, rpc.DefaultClass)
cb.Break()
}
}
time.Sleep(tc.GetFirstStoreFromServer(t, replicaIdx).GetStoreConfig().CoalescedHeartbeatsInterval)
cb.Reset()
for i := range tc.Servers {
if i != replicaIdx {
cb := tc.Servers[i].RaftTransport().GetCircuitBreaker(tc.Target(replicaIdx).NodeID, rpc.DefaultClass)
cb.Reset()
}
}
// Make sure the old replica was actually removed, before we try to re-adding it.
testutils.SucceedsSoon(t, func() error {
if oldRepl := tc.GetFirstStoreFromServer(t, leaderIdx).LookupReplica(roachpb.RKey(key)); oldRepl != nil {
return errors.Errorf("Expected replica %s to be removed", oldRepl)
}
return nil
})
tc.AddVotersOrFatal(t, key, tc.Target(leaderIdx))
require.NoError(t, tc.WaitForVoters(key, tc.Target(leaderIdx)))
continue outer
}
t.Fatal("could not find raft replica")
Expand Down

0 comments on commit 4317460

Please sign in to comment.