diff --git a/pkg/storage/replica.go b/pkg/storage/replica.go index c683e2a9268d..1362b3ecd5c7 100644 --- a/pkg/storage/replica.go +++ b/pkg/storage/replica.go @@ -93,6 +93,8 @@ const ( // simpler with this being turned off. var txnAutoGC = true +var tickQuiesced = envutil.EnvOrDefaultBool("COCKROACH_TICK_QUIESCED", true) + // raftInitialLog{Index,Term} are the starting points for the raft log. We // bootstrap the raft membership by synthesizing a snapshot as if there were // some discarded prefix to the log, so we must begin the log at an arbitrary @@ -2216,6 +2218,29 @@ func (r *Replica) tickRaftMuLocked() (bool, error) { return false, nil } if r.mu.quiescent { + // While a replica is quiesced we still advance its logical clock. This is + // necessary to avoid a scenario where the leader quiesces and a follower + // does not. The follower calls an election, but the election fails because + // the leader and other follower believe that no time in the current term + // has passed. The Raft group is then in a state where one member has an + // advanced term, which causes subsequent heartbeats from the existing + // leader to be rejected in a way that makes the leader step down. + // This situation is caused by an interaction between quiescence and + // the Raft CheckQuorum feature, which relies on the logical clock ticking + // at roughly the same rate on all members of the group. + // + // By ticking the logical clock (incrementing an integer) we avoid this + // situation. If one of the followers does not quiesce, it will call an + // election, but the election will succeed. Note that while we expect such + // elections from quiesced followers to be extremely rare, it is very + // difficult to completely eliminate them, so we want to minimize the + // disruption when they do occur. + // + // For more details, see #9372. 
+ // TODO(bdarnell): remove this once we have fully switched to PreVote. + if tickQuiesced { + r.mu.internalRaftGroup.TickQuiesced() + } return false, nil } if r.maybeQuiesceLocked() { diff --git a/pkg/storage/store.go b/pkg/storage/store.go index 0cfbe4821a0b..174ec7e0a94f 100644 --- a/pkg/storage/store.go +++ b/pkg/storage/store.go @@ -98,6 +98,9 @@ var changeTypeInternalToRaft = map[roachpb.ReplicaChangeType]raftpb.ConfChangeTy var storeSchedulerConcurrency = envutil.EnvOrDefaultInt( "COCKROACH_SCHEDULER_CONCURRENCY", 2*runtime.NumCPU()) +var enablePreVote = envutil.EnvOrDefaultBool( + "COCKROACH_ENABLE_PREVOTE", false) + // RaftElectionTimeout returns the raft election timeout, as computed // from the specified tick interval and number of election timeout // ticks. If raftElectionTimeoutTicks is 0, uses the value of @@ -150,7 +153,14 @@ func newRaftConfig( HeartbeatTick: storeCfg.RaftHeartbeatIntervalTicks, Storage: strg, Logger: logger, - PreVote: true, + + // TODO(bdarnell): PreVote and CheckQuorum are two ways of + // achieving the same thing. PreVote is more compatible with + // quiesced ranges, so we want to switch to it once we've worked + // out the bugs. + PreVote: enablePreVote, + CheckQuorum: !enablePreVote, + // TODO(bdarnell): make these configurable; evaluate defaults. MaxSizePerMsg: 1024 * 1024, MaxInflightMsgs: 256,