Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

balance: slow down interval increase speed. #585

Merged
merged 3 commits into from
Mar 28, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 15 additions & 18 deletions server/coordinator.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,12 @@ import (
)

const (
historiesCacheSize = 1000
eventsCacheSize = 1000
maxScheduleRetries = 10
maxScheduleInterval = time.Minute
minScheduleInterval = time.Millisecond * 10
historiesCacheSize = 1000
eventsCacheSize = 1000
maxScheduleRetries = 10
maxScheduleInterval = time.Minute
minScheduleInterval = time.Millisecond * 10
scheduleIntervalFactor = 1.3
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

any reason to use 1.3 here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was chosen arbitrarily. We need a slower growth rate here.
With a factor of 2, the interval reaches its maximum value of 1 minute after about 13 retries, which takes less than 1.5 minutes in total.
If we can't schedule an operator within 1.5 minutes, that doesn't necessarily mean the cluster is balanced; it may instead be caused by slow heartbeats or slow snapshots.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we construct a test to verify the change is ok?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll see what I can do.

)

var (
Expand Down Expand Up @@ -162,14 +163,8 @@ func (c *coordinator) runScheduler(s *scheduleController) {
if !s.AllowSchedule() {
continue
}
for i := 0; i < maxScheduleRetries; i++ {
op := s.Schedule(c.cluster)
if op == nil {
continue
}
if c.addOperator(op) {
break
}
if op := s.Schedule(c.cluster); op != nil {
c.addOperator(op)
}
case <-s.Ctx().Done():
log.Infof("%v stopped: %v", s.GetName(), s.Ctx().Err())
Expand Down Expand Up @@ -300,14 +295,16 @@ func (s *scheduleController) Stop() {
}

func (s *scheduleController) Schedule(cluster *clusterInfo) Operator {
// If we have schedule, reset interval to the minimal interval.
if op := s.Scheduler.Schedule(cluster); op != nil {
s.interval = minScheduleInterval
return op
for i := 0; i < maxScheduleRetries; i++ {
// If we have schedule, reset interval to the minimal interval.
if op := s.Scheduler.Schedule(cluster); op != nil {
s.interval = minScheduleInterval
return op
}
}

// If we have no schedule, increase the interval exponentially.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please update the comment here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Multiplying by 1.3 every time is still exponential growth.

s.interval = minDuration(s.interval*2, maxScheduleInterval)
s.interval = minDuration(time.Duration(float64(s.interval)*scheduleIntervalFactor), maxScheduleInterval)
return nil
}

Expand Down
20 changes: 19 additions & 1 deletion server/coordinator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ func (s *testScheduleControllerSuite) TestController(c *C) {
lb := newBalanceLeaderScheduler(opt)
sc := newScheduleController(co, lb)

for i := minScheduleInterval; sc.GetInterval() != maxScheduleInterval; i *= 2 {
for i := minScheduleInterval; sc.GetInterval() != maxScheduleInterval; i = time.Duration(float64(i) * scheduleIntervalFactor) {
c.Assert(sc.GetInterval(), Equals, i)
c.Assert(sc.Schedule(cluster), IsNil)
}
Expand Down Expand Up @@ -327,6 +327,24 @@ func (s *testScheduleControllerSuite) TestController(c *C) {
c.Assert(sc.AllowSchedule(), IsTrue)
}

// TestInterval verifies that the schedule interval backs off slowly enough.
// For each idle period of n seconds it resets the controller to the minimal
// interval, keeps calling Schedule (which is expected to return no operator)
// while accumulating the resulting intervals as simulated elapsed time, and
// then asserts that the interval is still below n/2 seconds.
func (s *testScheduleControllerSuite) TestInterval(c *C) {
	cluster := newClusterInfo(newMockIDAllocator())
	_, opt := newTestScheduleConfig()
	co := newCoordinator(cluster, opt)
	lb := newBalanceLeaderScheduler(opt)
	sc := newScheduleController(co, lb)

	for _, n := range []int{5, 10, 20, 30, 60} {
		idle := time.Second * time.Duration(n)
		// n/2 is an integer division, so for n = 5 the bound is 2 seconds.
		limit := time.Second * time.Duration(n/2)

		// Every idle period starts over from the minimal interval.
		sc.interval = minScheduleInterval

		// No real sleeping happens: the intervals reported by the controller
		// are summed up to simulate the passage of time.
		elapsed := time.Duration(0)
		for elapsed <= idle {
			c.Assert(sc.Schedule(cluster), IsNil)
			elapsed += sc.GetInterval()
		}

		c.Assert(sc.GetInterval(), Less, limit)
	}
}

func checkAddPeerResp(c *C, resp *pdpb.RegionHeartbeatResponse, storeID uint64) {
changePeer := resp.GetChangePeer()
c.Assert(changePeer.GetChangeType(), Equals, raftpb.ConfChangeType_AddNode)
Expand Down