Add configurable cleanup of dead servers when a new server joins

This commit is contained in:
Kyle Havlovitz 2017-02-17 10:49:16 -08:00
parent 2dcfd9115d
commit 2c9001a389
No known key found for this signature in database
GPG key ID: 8A5E6B173056AD6C
4 changed files with 90 additions and 0 deletions

View file

@ -274,6 +274,10 @@ type Config struct {
// This period is meant to be long enough for a leader election to take
// place, and a small jitter is applied to avoid a thundering herd.
RPCHoldTimeout time.Duration
// AutopilotServerCleanup controls whether to remove dead servers when a new
// server is added to the Raft peers
AutopilotServerCleanup bool
}
// CheckVersion is used to check if the ProtocolVersion is valid
@ -346,6 +350,8 @@ func DefaultConfig() *Config {
RPCHoldTimeout: 7 * time.Second,
TLSMinVersion: "tls10",
AutopilotServerCleanup: true,
}
// Increase our reap interval to 3 days instead of 24h.

View file

@ -567,6 +567,20 @@ func (s *Server) joinConsulServer(m serf.Member, parts *agent.Server) error {
s.logger.Printf("[ERR] consul: failed to add raft peer: %v", err)
return err
}
// Look for dead servers to clean up
if s.config.AutopilotServerCleanup {
for _, member := range s.serfLAN.Members() {
valid, _ := agent.IsConsulServer(member)
if valid && member.Name != m.Name && member.Status == serf.StatusFailed {
if err := s.handleDeregisterMember("Removing failed server", member); err != nil {
return fmt.Errorf("[ERROR] consul: Couldn't deregister failed server (%s): %v", member.Name, err)
}
s.logger.Printf("[INFO] consul: Removed failed server: %v", member.Name)
}
}
}
return nil
}

View file

@ -622,3 +622,72 @@ func TestLeader_ReapTombstones(t *testing.T) {
t.Fatalf("err: %v", err)
})
}
// TestLeader_DeadServerCleanup verifies that when AutopilotServerCleanup is
// enabled (the default), a server in the serf failed state is automatically
// deregistered when a new server joins, keeping the Raft peer set at the
// expected size (3 in this test).
func TestLeader_DeadServerCleanup(t *testing.T) {
	dir1, s1 := testServerDCExpect(t, "dc1", 3)
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	dir2, s2 := testServerDCExpect(t, "dc1", 3)
	defer os.RemoveAll(dir2)
	defer s2.Shutdown()

	dir3, s3 := testServerDCExpect(t, "dc1", 3)
	defer os.RemoveAll(dir3)
	defer s3.Shutdown()

	servers := []*Server{s1, s2, s3}

	// Join s2 and s3 to s1's LAN serf to form a 3-node cluster.
	addr := fmt.Sprintf("127.0.0.1:%d",
		s1.config.SerfLANConfig.MemberlistConfig.BindPort)
	if _, err := s2.JoinLAN([]string{addr}); err != nil {
		t.Fatalf("err: %v", err)
	}
	if _, err := s3.JoinLAN([]string{addr}); err != nil {
		t.Fatalf("err: %v", err)
	}

	// Wait until every server agrees there are 3 Raft peers.
	for _, s := range servers {
		testutil.WaitForResult(func() (bool, error) {
			peers, _ := s.numPeers()
			return peers == 3, nil
		}, func(err error) {
			t.Fatalf("should have 3 peers")
		})
	}

	// Kill a non-leader server other than s1 (i > 0 excludes s1), so the
	// replacement server can still join the cluster through s1's address.
	var nonLeader *Server
	var removedIndex int
	for i, s := range servers {
		if !s.IsLeader() && i > 0 {
			nonLeader = s
			removedIndex = i
			break
		}
	}
	// Guard against a nil dereference if leadership hasn't settled such
	// that neither s2 nor s3 qualifies; fail with a clear message instead.
	if nonLeader == nil {
		t.Fatalf("no non-leader server (other than s1) found to shut down")
	}
	nonLeader.Shutdown()

	// Give serf time to mark the downed server as failed.
	time.Sleep(1 * time.Second)

	// Bring up a new server and join it; the dead-server cleanup in
	// joinConsulServer should deregister the failed peer during the join.
	dir4, s4 := testServerDCExpect(t, "dc1", 3)
	defer os.RemoveAll(dir4)
	defer s4.Shutdown()
	if _, err := s4.JoinLAN([]string{addr}); err != nil {
		t.Fatalf("err: %v", err)
	}

	// The failed server should be removed and replaced by s4, leaving
	// exactly 3 total peers from every remaining server's point of view.
	servers[removedIndex] = s4
	for _, s := range servers {
		testutil.WaitForResult(func() (bool, error) {
			peers, _ := s.numPeers()
			return peers == 3, nil
		}, func(err error) {
			t.Fatalf("should have 3 peers")
		})
	}
}

View file

@ -317,6 +317,7 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string, w
conf.Tags["vsn"] = fmt.Sprintf("%d", s.config.ProtocolVersion)
conf.Tags["vsn_min"] = fmt.Sprintf("%d", ProtocolVersionMin)
conf.Tags["vsn_max"] = fmt.Sprintf("%d", ProtocolVersionMax)
conf.Tags["raft_vsn"] = fmt.Sprintf("%d", s.config.RaftConfig.ProtocolVersion)
conf.Tags["build"] = s.config.Build
conf.Tags["port"] = fmt.Sprintf("%d", addr.Port)
if s.config.Bootstrap {