server: make the leader lose its lease as soon as possible (#3712) (#3719)
* This is an automated cherry-pick of #3712

Signed-off-by: ti-chi-bot <[email protected]>

* resolve conflict

Signed-off-by: nolouch <[email protected]>

Co-authored-by: ShuNing <[email protected]>
ti-chi-bot and nolouch authored May 28, 2021
1 parent 8a2cfe9 commit 04ed1ff
Showing 3 changed files with 39 additions and 6 deletions.
1 change: 1 addition & 0 deletions server/cluster/cluster.go
@@ -364,6 +364,7 @@ func (c *RaftCluster) Stop() {
c.coordinator.stop()
c.Unlock()
c.wg.Wait()
log.Info("raftcluster is stopped")
}

// IsRunning returns whether the cluster is running.
22 changes: 17 additions & 5 deletions server/server.go
@@ -647,6 +647,7 @@ func (s *Server) createRaftCluster() error {
}

func (s *Server) stopRaftCluster() {
failpoint.Inject("raftclusterIsBusy", func() {})
s.cluster.Stop()
}
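The empty failpoint.Inject marker added above is a no-op in regular builds; it only takes effect when PD is built with failpoint-ctl and the failpoint is enabled, which gives tests a hook inside stopRaftCluster. The new test in this commit enables github.com/tikv/pd/server/raftclusterIsBusy with the pause action to simulate a raft cluster that is slow to stop.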

@@ -1219,9 +1220,13 @@ func (s *Server) campaignLeader() {
// 1. lease based approach is not affected by thread pause, slow runtime schedule, etc.
// 2. load region could be slow. Based on lease we can recover TSO service faster.
ctx, cancel := context.WithCancel(s.serverLoopCtx)
defer cancel()
defer s.member.ResetLeader()
// maintain the PD leader
var resetLeaderOnce sync.Once
defer resetLeaderOnce.Do(func() {
cancel()
s.member.ResetLeader()
})

// maintain the PD leadership; after this, TSO can be served.
go s.member.KeepLeader(ctx)
log.Info("campaign pd leader ok", zap.String("campaign-pd-leader-name", s.Name()))

@@ -1236,8 +1241,6 @@ func (s *Server) campaignLeader() {
return
}
defer s.tsoAllocatorManager.ResetAllocatorGroup(tso.GlobalDCLocation)
// Check the cluster dc-location after the PD leader is elected
go s.tsoAllocatorManager.ClusterDCLocationChecker()

if err := s.reloadConfigFromKV(); err != nil {
log.Error("failed to reload configuration", errs.ZapError(err))
@@ -1263,7 +1266,16 @@ func (s *Server) campaignLeader() {
log.Error("failed to sync id from etcd", errs.ZapError(err))
return
}
// EnableLeader to accept the remaining services, such as GetStore and GetRegion.
s.member.EnableLeader()
// Check the cluster dc-location after the PD leader is elected.
go s.tsoAllocatorManager.ClusterDCLocationChecker()
defer resetLeaderOnce.Do(func() {
// cancel the leadership keepalive as soon as possible so that other members
// have a chance to become the new leader.
cancel()
s.member.ResetLeader()
})

CheckPDVersion(s.persistOptions)
log.Info("PD cluster leader is ready to serve", zap.String("pd-leader-name", s.Name()))
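The two deferred resetLeaderOnce.Do calls above are the core of the change. Go runs defers in LIFO order, so the Do registered after the leader is fully serving fires first on return and releases the leadership (cancel the keepalive context, reset the leader) before the slower cleanup defers registered in between, while the Do registered at the top of campaignLeader is only a safety net for early returns; sync.Once guarantees the release happens exactly once either way. A minimal standalone sketch of the pattern, with illustrative names and prints standing in for PD's real cancel/ResetLeader and cleanup steps:

package main

import (
	"context"
	"fmt"
	"sync"
)

func campaign() {
	_, cancel := context.WithCancel(context.Background())

	var releaseOnce sync.Once
	release := func() {
		cancel()
		fmt.Println("leadership released") // other members can campaign from here on
	}
	// Safety net: registered early, runs last (defers are LIFO) and is a
	// no-op if the later Do has already released the leadership.
	defer releaseOnce.Do(release)

	// Stands in for slow cleanup such as stopping the raft cluster.
	defer fmt.Println("slow cleanup finished")

	// Registered once the leader is fully serving: runs first on return,
	// so the leadership is released before the slow cleanup above.
	defer releaseOnce.Do(release)

	fmt.Println("serving as leader")
}

func main() { campaign() }

// Prints: serving as leader, leadership released, slow cleanup finished.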
22 changes: 21 additions & 1 deletion tests/server/member/member_test.go
@@ -26,6 +26,7 @@ import (

. "github.com/pingcap/check"
"github.com/pingcap/errors"
"github.com/pingcap/failpoint"
"github.com/pingcap/kvproto/pkg/pdpb"
"github.com/tikv/pd/pkg/etcdutil"
"github.com/tikv/pd/pkg/testutil"
@@ -238,12 +239,31 @@ func (s *serverTestSuite) TestLeaderResign(c *C) {
c.Assert(leader3, Equals, leader1)
}

func (s *serverTestSuite) TestLeaderResignWithBlock(c *C) {
cluster, err := tests.NewTestCluster(s.ctx, 3)
defer cluster.Destroy()
c.Assert(err, IsNil)

err = cluster.RunInitialServers()
c.Assert(err, IsNil)

leader1 := cluster.WaitLeader()
addr1 := cluster.GetServer(leader1).GetConfig().ClientUrls

err = failpoint.Enable("github.com/tikv/pd/server/raftclusterIsBusy", `pause`)
c.Assert(err, IsNil)
defer failpoint.Disable("github.com/tikv/pd/server/raftclusterIsBusy")
s.post(c, addr1+"/pd/api/v1/leader/resign", "")
leader2 := s.waitLeaderChange(c, cluster, leader1)
c.Log("leader2:", leader2)
c.Assert(leader2, Not(Equals), leader1)
}

func (s *serverTestSuite) waitLeaderChange(c *C, cluster *tests.TestCluster, old string) string {
var leader string
testutil.WaitUntil(c, func(c *C) bool {
leader = cluster.GetLeader()
if leader == old || leader == "" {
time.Sleep(time.Second)
return false
}
return true
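With the raftclusterIsBusy failpoint enabled with the pause action, the injected hook blocks stopRaftCluster until the failpoint is disabled, simulating a raft cluster that is slow to stop; the test then resigns the leader and asserts that a different member still becomes leader, i.e. the old leader releases its lease without waiting for the cluster shutdown to finish.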
