Skip to content

Commit

Permalink
autopilot: fix dead server removal condition to use correct failure t…
Browse files Browse the repository at this point in the history
…olerance (#4017)

* Make dead server removal condition in autopilot use correct failure tolerance rules
* Introduce func with explanation
  • Loading branch information
preetapan authored and hanshasselberg committed Dec 16, 2019
1 parent 4f5d502 commit c47dbff
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 46 deletions.
70 changes: 42 additions & 28 deletions agent/consul/autopilot/autopilot.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,20 @@ func (a *Autopilot) RemoveDeadServers() {
}
}

func canRemoveServers(peers, minQuorum, deadServers int) (bool, string) {
if peers-deadServers < int(minQuorum) {
return false, fmt.Sprintf("denied, because removing %d/%d servers would leave less then minimal allowed quorum of %d servers", deadServers, peers, minQuorum)
}

// Only do removals if a minority of servers will be affected.
// For failure tolerance of F we need n = 2F+1 servers.
// This means we can safely remove up to (n-1)/2 servers.
if deadServers > (peers-1)/2 {
return false, fmt.Sprintf("denied, because removing the majority of servers %d/%d is not safe", deadServers, peers)
}
return true, fmt.Sprintf("allowed, because removing %d/%d servers leaves a majority of servers above the minimal allowed quorum %d", deadServers, peers, minQuorum)
}

// pruneDeadServers removes up to numPeers/2 failed servers
func (a *Autopilot) pruneDeadServers() error {
conf := a.delegate.AutopilotConfig()
Expand Down Expand Up @@ -226,42 +240,42 @@ func (a *Autopilot) pruneDeadServers() error {
}
}

// We can bail early if there's nothing to do.
removalCount := len(failed) + len(staleRaftServers)
if removalCount == 0 {
deadServers := len(failed) + len(staleRaftServers)

// nothing to do
if deadServers == 0 {
return nil
}

// Only do removals if a minority of servers will be affected.
peers := NumPeers(raftConfig)
if peers-removalCount >= int(conf.MinQuorum) && removalCount < peers/2 {
for _, node := range failed {
a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", node.Name)
go serfLAN.RemoveFailedNode(node.Name)
if serfWAN != nil {
go serfWAN.RemoveFailedNode(fmt.Sprintf("%s.%s", node.Name, node.Tags["dc"]))
}
if ok, msg := canRemoveServers(NumPeers(raftConfig), int(conf.MinQuorum), deadServers); !ok {
a.logger.Printf("[DEBUG] autopilot: Failed to remove dead servers: %s.", msg)
return nil
}

for _, node := range failed {
a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", node.Name)
go serfLAN.RemoveFailedNode(node.Name)
if serfWAN != nil {
go serfWAN.RemoveFailedNode(fmt.Sprintf("%s.%s", node.Name, node.Tags["dc"]))
}

minRaftProtocol, err := a.MinRaftProtocol()
if err != nil {
return err
}

minRaftProtocol, err := a.MinRaftProtocol()
if err != nil {
return err
}
for _, raftServer := range staleRaftServers {
a.logger.Printf("[INFO] autopilot: Attempting removal of stale %s", fmtServer(raftServer))
var future raft.Future
if minRaftProtocol >= 2 {
future = raftNode.RemoveServer(raftServer.ID, 0, 0)
} else {
future = raftNode.RemovePeer(raftServer.Address)
}
for _, raftServer := range staleRaftServers {
a.logger.Printf("[INFO] autopilot: Attempting removal of stale %s", fmtServer(raftServer))
var future raft.Future
if minRaftProtocol >= 2 {
future = raftNode.RemoveServer(raftServer.ID, 0, 0)
} else {
future = raftNode.RemovePeer(raftServer.Address)
}
if err := future.Error(); err != nil {
return err
}
if err := future.Error(); err != nil {
return err
}
} else {
a.logger.Printf("[DEBUG] autopilot: Failed to remove dead servers: too many dead servers: %d/%d", removalCount, peers)
}

return nil
Expand Down
25 changes: 25 additions & 0 deletions agent/consul/autopilot/autopilot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"testing"

"github.com/hashicorp/serf/serf"
"github.com/stretchr/testify/require"
)

func TestMinRaftProtocol(t *testing.T) {
Expand Down Expand Up @@ -84,3 +85,27 @@ func TestMinRaftProtocol(t *testing.T) {
}
}
}

func TestAutopilot_canRemoveServers(t *testing.T) {
type test struct {
peers int
minQuorum int
deadServers int
ok bool
}

tests := []test{
{1, 1, 1, false},
{3, 3, 1, false},
{4, 3, 3, false},
{5, 3, 3, false},
{5, 3, 2, true},
{5, 3, 1, true},
{9, 3, 5, false},
}
for _, test := range tests {
ok, msg := canRemoveServers(test.peers, test.minQuorum, test.deadServers)
require.Equal(t, test.ok, ok)
t.Logf("%+v: %s", test, msg)
}
}
62 changes: 44 additions & 18 deletions agent/consul/autopilot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/hashicorp/consul/testrpc"
"github.com/hashicorp/raft"
"github.com/hashicorp/serf/serf"
"github.com/stretchr/testify/require"
)

func TestAutopilot_IdempotentShutdown(t *testing.T) {
Expand Down Expand Up @@ -37,7 +38,7 @@ func testCleanupDeadServer(t *testing.T, raftVersion int) {
conf := func(c *Config) {
c.Datacenter = dc
c.Bootstrap = false
c.BootstrapExpect = 3
c.BootstrapExpect = 5
c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(raftVersion)
}
dir1, s1 := testServerWithConfig(t, conf)
Expand All @@ -52,43 +53,68 @@ func testCleanupDeadServer(t *testing.T, raftVersion int) {
defer os.RemoveAll(dir3)
defer s3.Shutdown()

servers := []*Server{s1, s2, s3}
dir4, s4 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir4)
defer s4.Shutdown()

dir5, s5 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir5)
defer s5.Shutdown()

servers := []*Server{s1, s2, s3, s4, s5}

// Try to join
joinLAN(t, s2, s1)
joinLAN(t, s3, s1)
joinLAN(t, s4, s1)
joinLAN(t, s5, s1)

for _, s := range servers {
testrpc.WaitForLeader(t, s.RPC, dc)
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 5)) })
}

// Bring up a new server
dir4, s4 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir4)
defer s4.Shutdown()
require := require.New(t)
testrpc.WaitForLeader(t, s1.RPC, "dc1")
leaderIndex := -1
for i, s := range servers {
if s.IsLeader() {
leaderIndex = i
break
}
}
require.NotEqual(leaderIndex, -1)

// Shutdown two non-leader servers
killed := make(map[string]struct{})
for i, s := range servers {
if i != leaderIndex {
s.Shutdown()
killed[string(s.config.NodeID)] = struct{}{}
}
if len(killed) == 2 {
break
}
}

// Kill a non-leader server
s3.Shutdown()
retry.Run(t, func(r *retry.R) {
alive := 0
for _, m := range s1.LANMembers() {
for _, m := range servers[leaderIndex].LANMembers() {
if m.Status == serf.StatusAlive {
alive++
}
}
if alive != 2 {
r.Fatal(nil)
if alive != 3 {
r.Fatalf("Expected three alive servers instead of %d", alive)
}
})

// Join the new server
joinLAN(t, s4, s1)
servers[2] = s4

// Make sure the dead server is removed and we're back to 3 total peers
// Make sure the dead servers are removed and we're back to 3 total peers
for _, s := range servers {
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
_, killed := killed[string(s.config.NodeID)]
if !killed {
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
}
}
}

Expand Down

0 comments on commit c47dbff

Please sign in to comment.