From b8b0cf83d1e56a8a23304203b6a5a14e7d6feff9 Mon Sep 17 00:00:00 2001 From: Benjamin Wang Date: Fri, 9 Aug 2024 11:26:15 +0100 Subject: [PATCH] Skip leadership check if the etcd instance is active processing heartbeat Signed-off-by: Benjamin Wang --- server/etcdserver/raft.go | 14 +++++++++-- server/etcdserver/server.go | 16 ++++++++++++ server/etcdserver/server_test.go | 43 ++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 2 deletions(-) diff --git a/server/etcdserver/raft.go b/server/etcdserver/raft.go index d397612af9c..fd4b5dac337 100644 --- a/server/etcdserver/raft.go +++ b/server/etcdserver/raft.go @@ -80,7 +80,9 @@ type toApply struct { type raftNode struct { lg *zap.Logger - tickMu *sync.Mutex + tickMu *sync.RWMutex + // latestTickTs is the time of the most recent tick(); tick() and getLatestTickTs() access it under tickMu + latestTickTs time.Time raftNodeConfig // a chan to send/receive snapshot @@ -132,8 +134,9 @@ func newRaftNode(cfg raftNodeConfig) *raftNode { raft.SetLogger(lg) r := &raftNode{ lg: cfg.lg, - tickMu: new(sync.Mutex), + tickMu: new(sync.RWMutex), raftNodeConfig: cfg, + latestTickTs: time.Now(), // set up contention detectors for raft heartbeat message. // expect to send a heartbeat within 2 heartbeat intervals. td: contention.NewTimeoutDetector(2 * cfg.heartbeat), @@ -155,9 +158,16 @@ func newRaftNode(cfg raftNodeConfig) *raftNode { func (r *raftNode) tick() { r.tickMu.Lock() r.Tick() + r.latestTickTs = time.Now() r.tickMu.Unlock() } +func (r *raftNode) getLatestTickTs() time.Time { + r.tickMu.RLock() + defer r.tickMu.RUnlock() + return r.latestTickTs +} + // start prepares and starts raftNode in a new goroutine. It is no longer safe // to modify the fields after it has been started. 
func (r *raftNode) start(rh *raftReadyHandler) { diff --git a/server/etcdserver/server.go b/server/etcdserver/server.go index 6708f71bf9a..0600a31b896 100644 --- a/server/etcdserver/server.go +++ b/server/etcdserver/server.go @@ -904,10 +904,26 @@ func (s *EtcdServer) revokeExpiredLeases(leases []*lease.Lease) { }) } +// isActive reports whether the raft node has ticked recently. It returns +// true only if the latest tick recorded by raftNode.tick() is less than +// 3 * TickMs milliseconds old, and false otherwise. +func (s *EtcdServer) isActive() bool { + latestTickTs := s.r.getLatestTickTs() + threshold := 3 * time.Duration(s.Cfg.TickMs) * time.Millisecond + return latestTickTs.Add(threshold).After(time.Now()) +} + // ensureLeadership checks whether current member is still the leader. func (s *EtcdServer) ensureLeadership() bool { lg := s.Logger() + if s.isActive() { + lg.Debug("The member is active, skip checking leadership", + zap.Time("latestTickTs", s.r.getLatestTickTs()), + zap.Time("now", time.Now())) + return true + } + ctx, cancel := context.WithTimeout(s.ctx, s.Cfg.ReqTimeout()) defer cancel() if err := s.linearizableReadNotify(ctx); err != nil { diff --git a/server/etcdserver/server_test.go b/server/etcdserver/server_test.go index 215c86abbc9..858aa32a076 100644 --- a/server/etcdserver/server_test.go +++ b/server/etcdserver/server_test.go @@ -29,6 +29,7 @@ import ( "github.com/coreos/go-semver/semver" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "go.uber.org/zap" "go.uber.org/zap/zaptest" @@ -1538,3 +1539,45 @@ func TestWaitAppliedIndex(t *testing.T) { }) } } + +func TestIsActive(t *testing.T) { + cases := []struct { + name string + tickMs uint + durationSinceLastTick time.Duration + expectActive bool + }{ + { + name: "1.5*tickMs,active", + tickMs: 100, + durationSinceLastTick: 150 * time.Millisecond, + expectActive: true, + }, + { + name: "2*tickMs,active", + tickMs: 200, + durationSinceLastTick: 400 * time.Millisecond, + expectActive: true, + 
}, + { + name: "4*tickMs,not active", + tickMs: 150, + durationSinceLastTick: 600 * time.Millisecond, + expectActive: false, + }, + } + + for _, tc := range cases { + s := EtcdServer{ + Cfg: config.ServerConfig{ + TickMs: tc.tickMs, + }, + r: raftNode{ + tickMu: new(sync.RWMutex), + latestTickTs: time.Now().Add(-tc.durationSinceLastTick), + }, + } + + require.Equal(t, tc.expectActive, s.isActive()) + } +}