From 79d2713ea9e081f0583b6bd11e49efafa7c88e7e Mon Sep 17 00:00:00 2001 From: ShuNing Date: Thu, 27 May 2021 20:43:35 +0800 Subject: [PATCH 1/2] This is an automated cherry-pick of #3712 Signed-off-by: ti-chi-bot --- server/cluster/cluster.go | 1 + server/server.go | 22 +++++++++++++++++----- tests/server/member/member_test.go | 26 +++++++++++++++++++++++++- 3 files changed, 43 insertions(+), 6 deletions(-) diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index 92b58c24427..9bed409312f 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -364,6 +364,7 @@ func (c *RaftCluster) Stop() { c.coordinator.stop() c.Unlock() c.wg.Wait() + log.Info("raftcluster is stopped") } // IsRunning return if the cluster is running. diff --git a/server/server.go b/server/server.go index 97d2df8c0ac..b4d23bbb965 100644 --- a/server/server.go +++ b/server/server.go @@ -647,6 +647,7 @@ func (s *Server) createRaftCluster() error { } func (s *Server) stopRaftCluster() { + failpoint.Inject("raftclusterIsBusy", func() {}) s.cluster.Stop() } @@ -1219,9 +1220,13 @@ func (s *Server) campaignLeader() { // 1. lease based approach is not affected by thread pause, slow runtime schedule, etc. // 2. load region could be slow. Based on lease we can recover TSO service faster. ctx, cancel := context.WithCancel(s.serverLoopCtx) - defer cancel() - defer s.member.ResetLeader() - // maintain the PD leader + var resetLeaderOnce sync.Once + defer resetLeaderOnce.Do(func() { + cancel() + s.member.ResetLeader() + }) + + // maintain the PD leadership, after this, TSO can be service. go s.member.KeepLeader(ctx) log.Info("campaign pd leader ok", zap.String("campaign-pd-leader-name", s.Name())) @@ -1236,8 +1241,6 @@ func (s *Server) campaignLeader() { return } defer s.tsoAllocatorManager.ResetAllocatorGroup(tso.GlobalDCLocation) - // Check the cluster dc-location after the PD leader is elected - go s.tsoAllocatorManager.ClusterDCLocationChecker() if err := s.reloadConfigFromKV(); err != nil { log.Error("failed to reload configuration", errs.ZapError(err)) @@ -1263,7 +1266,16 @@ func (s *Server) campaignLeader() { log.Error("failed to sync id from etcd", errs.ZapError(err)) return } + // EnableLeader to accept the remaining service, such as GetStore, GetRegion. s.member.EnableLeader() + // Check the cluster dc-location after the PD leader is elected. + go s.tsoAllocatorManager.ClusterDCLocationChecker() + defer resetLeaderOnce.Do(func() { + // as soon as cancel the leadership keepalive, then other member have chance + // to be new leader. + cancel() + s.member.ResetLeader() + }) CheckPDVersion(s.persistOptions) log.Info("PD cluster leader is ready to serve", zap.String("pd-leader-name", s.Name())) diff --git a/tests/server/member/member_test.go b/tests/server/member/member_test.go index 02a72cedb3b..a2b2440ac15 100644 --- a/tests/server/member/member_test.go +++ b/tests/server/member/member_test.go @@ -26,6 +26,7 @@ import ( . "github.com/pingcap/check" "github.com/pingcap/errors" + "github.com/pingcap/failpoint" "github.com/pingcap/kvproto/pkg/pdpb" "github.com/tikv/pd/pkg/etcdutil" "github.com/tikv/pd/pkg/testutil" @@ -238,12 +239,35 @@ func (s *serverTestSuite) TestLeaderResign(c *C) { c.Assert(leader3, Equals, leader1) } +<<<<<<< HEAD func (s *serverTestSuite) waitLeaderChange(c *C, cluster *tests.TestCluster, old string) string { +======= +func (s *memberTestSuite) TestLeaderResignWithBlock(c *C) { + cluster, err := tests.NewTestCluster(s.ctx, 3) + defer cluster.Destroy() + c.Assert(err, IsNil) + + err = cluster.RunInitialServers() + c.Assert(err, IsNil) + + leader1 := cluster.WaitLeader() + addr1 := cluster.GetServer(leader1).GetConfig().ClientUrls + + err = failpoint.Enable("github.com/tikv/pd/server/raftclusterIsBusy", `pause`) + c.Assert(err, IsNil) + defer failpoint.Disable("github.com/tikv/pd/server/raftclusterIsBusy") + s.post(c, addr1+"/pd/api/v1/leader/resign", "") + leader2 := s.waitLeaderChange(c, cluster, leader1) + c.Log("leader2:", leader2) + c.Assert(leader2, Not(Equals), leader1) +} + +func (s *memberTestSuite) waitLeaderChange(c *C, cluster *tests.TestCluster, old string) string { +>>>>>>> 71b12e40a (server: make leader can lost lease as soon as possible (#3712)) var leader string testutil.WaitUntil(c, func(c *C) bool { leader = cluster.GetLeader() if leader == old || leader == "" { - time.Sleep(time.Second) return false } return true From d83603ebb7cc5dd08d13b893924631cfb7ada0a2 Mon Sep 17 00:00:00 2001 From: nolouch Date: Thu, 27 May 2021 21:07:50 +0800 Subject: [PATCH 2/2] reslove conflict Signed-off-by: nolouch --- tests/server/member/member_test.go | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/server/member/member_test.go b/tests/server/member/member_test.go index a2b2440ac15..cde316c5101 100644 --- a/tests/server/member/member_test.go +++ b/tests/server/member/member_test.go @@ -239,10 +239,7 @@ func (s *serverTestSuite) TestLeaderResign(c *C) { c.Assert(leader3, Equals, leader1) } -<<<<<<< HEAD -func (s *serverTestSuite) waitLeaderChange(c *C, cluster *tests.TestCluster, old string) string { -======= -func (s *memberTestSuite) TestLeaderResignWithBlock(c *C) { +func (s *serverTestSuite) TestLeaderResignWithBlock(c *C) { cluster, err := tests.NewTestCluster(s.ctx, 3) defer cluster.Destroy() c.Assert(err, IsNil) @@ -262,8 +259,7 @@ func (s *memberTestSuite) TestLeaderResignWithBlock(c *C) { c.Assert(leader2, Not(Equals), leader1) } -func (s *memberTestSuite) waitLeaderChange(c *C, cluster *tests.TestCluster, old string) string { ->>>>>>> 71b12e40a (server: make leader can lost lease as soon as possible (#3712)) +func (s *serverTestSuite) waitLeaderChange(c *C, cluster *tests.TestCluster, old string) string { var leader string testutil.WaitUntil(c, func(c *C) bool { leader = cluster.GetLeader()