autopilot: fix dead server removal condition to use correct failure tolerance (#4017)
* Make dead server removal condition in autopilot use correct failure tolerance rules
* Introduce func with explanation
This commit is contained in:
parent e3e56ff3c2
commit f607a00138
@@ -174,6 +174,20 @@ func (a *Autopilot) RemoveDeadServers() {
     }
 }
 
+func canRemoveServers(peers, minQuorum, deadServers int) (bool, string) {
+    if peers-deadServers < int(minQuorum) {
+        return false, fmt.Sprintf("denied, because removing %d/%d servers would leave less then minimal allowed quorum of %d servers", deadServers, peers, minQuorum)
+    }
+
+    // Only do removals if a minority of servers will be affected.
+    // For failure tolerance of F we need n = 2F+1 servers.
+    // This means we can safely remove up to (n-1)/2 servers.
+    if deadServers > (peers-1)/2 {
+        return false, fmt.Sprintf("denied, because removing the majority of servers %d/%d is not safe", deadServers, peers)
+    }
+    return true, fmt.Sprintf("allowed, because removing %d/%d servers leaves a majority of servers above the minimal allowed quorum %d", deadServers, peers, minQuorum)
+}
+
 // pruneDeadServers removes up to numPeers/2 failed servers
 func (a *Autopilot) pruneDeadServers() error {
     conf := a.delegate.AutopilotConfig()
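The comment lines above are the heart of the fix: a Raft cluster of n = 2F+1 voting servers tolerates F failures, so at most (n-1)/2 members may be dead before removal becomes unsafe. A minimal standalone sketch of the same arithmetic (illustrative only, not part of the commit; failureTolerance is a made-up name):

package main

import "fmt"

// failureTolerance returns how many of n voting servers can be lost while
// the remainder can still elect a leader: n = 2F+1 implies F = (n-1)/2.
func failureTolerance(n int) int { return (n - 1) / 2 }

func main() {
    for _, n := range []int{1, 3, 4, 5, 7, 9} {
        // quorum is a strict majority: n/2 + 1
        fmt.Printf("peers=%d quorum=%d tolerance=%d\n", n, n/2+1, failureTolerance(n))
    }
}

For a 5-node cluster this prints quorum=3 and tolerance=2, which is exactly the case the new canRemoveServers now permits.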
@@ -226,42 +240,42 @@ func (a *Autopilot) pruneDeadServers() error {
         }
     }
 
-    // We can bail early if there's nothing to do.
-    removalCount := len(failed) + len(staleRaftServers)
-    if removalCount == 0 {
+    deadServers := len(failed) + len(staleRaftServers)
+
+    // nothing to do
+    if deadServers == 0 {
         return nil
     }
 
-    // Only do removals if a minority of servers will be affected.
-    peers := NumPeers(raftConfig)
-    if peers-removalCount >= int(conf.MinQuorum) && removalCount < peers/2 {
-        for _, node := range failed {
-            a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", node.Name)
-            go serfLAN.RemoveFailedNode(node.Name)
-            if serfWAN != nil {
-                go serfWAN.RemoveFailedNode(fmt.Sprintf("%s.%s", node.Name, node.Tags["dc"]))
-            }
-        }
+    if ok, msg := canRemoveServers(NumPeers(raftConfig), int(conf.MinQuorum), deadServers); !ok {
+        a.logger.Printf("[DEBUG] autopilot: Failed to remove dead servers: %s.", msg)
+        return nil
+    }
 
-        minRaftProtocol, err := a.MinRaftProtocol()
-        if err != nil {
-            return err
-        }
-        for _, raftServer := range staleRaftServers {
-            a.logger.Printf("[INFO] autopilot: Attempting removal of stale %s", fmtServer(raftServer))
-            var future raft.Future
-            if minRaftProtocol >= 2 {
-                future = raftNode.RemoveServer(raftServer.ID, 0, 0)
-            } else {
-                future = raftNode.RemovePeer(raftServer.Address)
-            }
-            if err := future.Error(); err != nil {
-                return err
-            }
-        }
-    } else {
-        a.logger.Printf("[DEBUG] autopilot: Failed to remove dead servers: too many dead servers: %d/%d", removalCount, peers)
-    }
+    for _, node := range failed {
+        a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", node.Name)
+        go serfLAN.RemoveFailedNode(node.Name)
+        if serfWAN != nil {
+            go serfWAN.RemoveFailedNode(fmt.Sprintf("%s.%s", node.Name, node.Tags["dc"]))
+        }
+    }
+
+    minRaftProtocol, err := a.MinRaftProtocol()
+    if err != nil {
+        return err
+    }
+    for _, raftServer := range staleRaftServers {
+        a.logger.Printf("[INFO] autopilot: Attempting removal of stale %s", fmtServer(raftServer))
+        var future raft.Future
+        if minRaftProtocol >= 2 {
+            future = raftNode.RemoveServer(raftServer.ID, 0, 0)
+        } else {
+            future = raftNode.RemovePeer(raftServer.Address)
+        }
+        if err := future.Error(); err != nil {
+            return err
+        }
+    }
 
     return nil
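The behavioral change is easiest to see with concrete numbers. The old gate used removalCount < peers/2, which under integer division denies removing exactly (n-1)/2 servers; the new canRemoveServers allows it. A small comparison sketch (oldGate and newGate are illustrative names, not from the commit):

package main

import "fmt"

// oldGate reproduces the condition this commit removes.
func oldGate(peers, minQuorum, dead int) bool {
    return peers-dead >= minQuorum && dead < peers/2
}

// newGate mirrors the logic of the canRemoveServers helper introduced above.
func newGate(peers, minQuorum, dead int) bool {
    return peers-dead >= minQuorum && dead <= (peers-1)/2
}

func main() {
    // A 5-node cluster (quorum 3) with 2 dead servers: Raft tolerates
    // 2 failures here, but the old check denied the removal.
    fmt.Println(oldGate(5, 3, 2)) // false - wrongly denied
    fmt.Println(newGate(5, 3, 2)) // true  - correctly allowed
}

The refactor also moves the safety check to the top of the function as an early return, replacing the long if/else that previously wrapped the whole removal path.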
@@ -6,6 +6,7 @@ import (
     "testing"
 
     "github.com/hashicorp/serf/serf"
+    "github.com/stretchr/testify/require"
 )
 
 func TestMinRaftProtocol(t *testing.T) {
@@ -84,3 +85,27 @@ func TestMinRaftProtocol(t *testing.T) {
         }
     }
 }
+
+func TestAutopilot_canRemoveServers(t *testing.T) {
+    type test struct {
+        peers       int
+        minQuorum   int
+        deadServers int
+        ok          bool
+    }
+
+    tests := []test{
+        {1, 1, 1, false},
+        {3, 3, 1, false},
+        {4, 3, 3, false},
+        {5, 3, 3, false},
+        {5, 3, 2, true},
+        {5, 3, 1, true},
+        {9, 3, 5, false},
+    }
+    for _, test := range tests {
+        ok, msg := canRemoveServers(test.peers, test.minQuorum, test.deadServers)
+        require.Equal(t, test.ok, ok)
+        t.Logf("%+v: %s", test, msg)
+    }
+}
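The table pins down the boundary cases: {5, 3, 2, true} is precisely the scenario the old gate rejected, while {9, 3, 5, false} shows that removing a majority is denied even when the survivors would still meet the minimum quorum. Assuming the test sits in Consul's autopilot package, it can be run on its own with something like:

go test -v -run TestAutopilot_canRemoveServers ./agent/consul/autopilot/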
@@ -9,6 +9,7 @@ import (
     "github.com/hashicorp/consul/testrpc"
     "github.com/hashicorp/raft"
     "github.com/hashicorp/serf/serf"
+    "github.com/stretchr/testify/require"
 )
 
 func TestAutopilot_IdempotentShutdown(t *testing.T) {
@@ -37,7 +38,7 @@ func testCleanupDeadServer(t *testing.T, raftVersion int) {
     conf := func(c *Config) {
         c.Datacenter = dc
         c.Bootstrap = false
-        c.BootstrapExpect = 3
+        c.BootstrapExpect = 5
         c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(raftVersion)
     }
     dir1, s1 := testServerWithConfig(t, conf)
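Raising BootstrapExpect from 3 to 5 is what makes the stricter scenario testable: with n = 5 the failure tolerance is (5-1)/2 = 2, so the test can kill two non-leader servers and still leave three live peers, the minimum quorum. A 3-node cluster would only ever allow a single dead server to be reaped.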
@@ -52,43 +53,68 @@ func testCleanupDeadServer(t *testing.T, raftVersion int) {
     defer os.RemoveAll(dir3)
     defer s3.Shutdown()
 
-    servers := []*Server{s1, s2, s3}
-
-    // Try to join
-    joinLAN(t, s2, s1)
-    joinLAN(t, s3, s1)
-
-    for _, s := range servers {
-        testrpc.WaitForLeader(t, s.RPC, dc)
-        retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
-    }
-
-    // Bring up a new server
     dir4, s4 := testServerWithConfig(t, conf)
     defer os.RemoveAll(dir4)
     defer s4.Shutdown()
 
-    // Kill a non-leader server
-    s3.Shutdown()
+    dir5, s5 := testServerWithConfig(t, conf)
+    defer os.RemoveAll(dir5)
+    defer s5.Shutdown()
+
+    servers := []*Server{s1, s2, s3, s4, s5}
+
+    // Try to join
+    joinLAN(t, s2, s1)
+    joinLAN(t, s3, s1)
+    joinLAN(t, s4, s1)
+    joinLAN(t, s5, s1)
+
+    for _, s := range servers {
+        testrpc.WaitForLeader(t, s.RPC, dc)
+        retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 5)) })
+    }
+
+    require := require.New(t)
+    testrpc.WaitForLeader(t, s1.RPC, "dc1")
+    leaderIndex := -1
+    for i, s := range servers {
+        if s.IsLeader() {
+            leaderIndex = i
+            break
+        }
+    }
+    require.NotEqual(leaderIndex, -1)
+
+    // Shutdown two non-leader servers
+    killed := make(map[string]struct{})
+    for i, s := range servers {
+        if i != leaderIndex {
+            s.Shutdown()
+            killed[string(s.config.NodeID)] = struct{}{}
+        }
+        if len(killed) == 2 {
+            break
+        }
+    }
+
     retry.Run(t, func(r *retry.R) {
         alive := 0
-        for _, m := range s1.LANMembers() {
+        for _, m := range servers[leaderIndex].LANMembers() {
             if m.Status == serf.StatusAlive {
                 alive++
             }
         }
-        if alive != 2 {
-            r.Fatal(nil)
+        if alive != 3 {
+            r.Fatalf("Expected three alive servers instead of %d", alive)
         }
     })
 
-    // Join the new server
-    joinLAN(t, s4, s1)
-    servers[2] = s4
-
-    // Make sure the dead server is removed and we're back to 3 total peers
+    // Make sure the dead servers are removed and we're back to 3 total peers
    for _, s := range servers {
-        retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
+        _, killed := killed[string(s.config.NodeID)]
+        if !killed {
+            retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
+        }
     }
 }
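Two details of the rewritten test are worth noting. The leader is located dynamically rather than assumed to be s1, since with five servers any of them may win the election, and the liveness count is taken from the leader's member list. The final wantPeers(s, 3) assertion is skipped for the two killed servers, which were shut down and can no longer report their peer set.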