diff --git a/CHANGELOG.md b/CHANGELOG.md index 748d4c854..7a7fea872 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ BUG FIXES: * Fixing issue with Session ID and ACL ID generation. Previously, sessions and tokens could be invalidated when the leader changed. * Fixing multiple headers for /v1/event/list endpoint [GH-361] + * Fixing graceful leave of leader causing invalid Raft peers [GH-360] ## 0.4.0 (September 5, 2014) diff --git a/consul/leader.go b/consul/leader.go index ba7280f9a..87e065068 100644 --- a/consul/leader.go +++ b/consul/leader.go @@ -399,22 +399,30 @@ func (s *Server) handleReapMember(member serf.Member) error { // handleDeregisterMember is used to deregister a member of a given reason func (s *Server) handleDeregisterMember(reason string, member serf.Member) error { state := s.fsm.State() - // Check if the node does not exists _, found, _ := state.GetNode(member.Name) if !found { return nil } - s.logger.Printf("[INFO] consul: member '%s' %s, deregistering", member.Name, reason) // Remove from Raft peers if this was a server if valid, parts := isConsulServer(member); valid { + s.logger.Printf("[INFO] consul: server '%s' %s, removing as peer", member.Name, reason) if err := s.removeConsulServer(member, parts.Port); err != nil { return err } } + // Do not deregister ourself. This can only happen if the current leader + // is leaving. Instead, we should allow a follower to take-over and + // deregister us later. + if member.Name == s.config.NodeName { + s.logger.Printf("[WARN] consul: deregistering self (%s) should be done by follower", s.config.NodeName) + return nil + } + // Deregister the node + s.logger.Printf("[INFO] consul: member '%s' %s, deregistering", member.Name, reason) req := structs.DeregisterRequest{ Datacenter: s.config.Datacenter, Node: member.Name, @@ -454,11 +462,6 @@ func (s *Server) joinConsulServer(m serf.Member, parts *serverParts) error { // removeConsulServer is used to try to remove a consul server that has left func (s *Server) removeConsulServer(m serf.Member, port int) error { - // Do not remove ourself - if m.Name == s.config.NodeName { - return nil - } - // Attempt to remove as peer peer := &net.TCPAddr{IP: m.Addr, Port: port} future := s.raft.RemovePeer(peer) diff --git a/consul/leader_test.go b/consul/leader_test.go index fc1c16cdf..6cad9cd70 100644 --- a/consul/leader_test.go +++ b/consul/leader_test.go @@ -3,12 +3,13 @@ package consul import ( "errors" "fmt" - "github.com/hashicorp/consul/consul/structs" - "github.com/hashicorp/consul/testutil" - "github.com/hashicorp/serf/serf" "os" "testing" "time" + + "github.com/hashicorp/consul/consul/structs" + "github.com/hashicorp/consul/testutil" + "github.com/hashicorp/serf/serf" ) func TestLeader_RegisterMember(t *testing.T) { @@ -328,6 +329,73 @@ func TestLeader_LeftServer(t *testing.T) { } } +func TestLeader_LeftLeader(t *testing.T) { + dir1, s1 := testServer(t) + defer os.RemoveAll(dir1) + defer s1.Shutdown() + + dir2, s2 := testServerDCBootstrap(t, "dc1", false) + defer os.RemoveAll(dir2) + defer s2.Shutdown() + + dir3, s3 := testServerDCBootstrap(t, "dc1", false) + defer os.RemoveAll(dir3) + defer s3.Shutdown() + servers := []*Server{s1, s2, s3} + + // Try to join + addr := fmt.Sprintf("127.0.0.1:%d", + s1.config.SerfLANConfig.MemberlistConfig.BindPort) + if _, err := s2.JoinLAN([]string{addr}); err != nil { + t.Fatalf("err: %v", err) + } + if _, err := s3.JoinLAN([]string{addr}); err != nil { + t.Fatalf("err: %v", err) + } + + for _, s := range servers { + testutil.WaitForResult(func() (bool, error) { + peers, _ := s.raftPeers.Peers() + return len(peers) == 3, nil + }, func(err error) { + t.Fatalf("should have 3 peers") + }) + } + + // Kill the leader! + var leader *Server + for _, s := range servers { + if s.IsLeader() { + leader = s + break + } + } + leader.Leave() + leader.Shutdown() + time.Sleep(100 * time.Millisecond) + + var remain *Server + for _, s := range servers { + if s == leader { + continue + } + remain = s + testutil.WaitForResult(func() (bool, error) { + peers, _ := s.raftPeers.Peers() + return len(peers) == 2, errors.New(fmt.Sprintf("%v", peers)) + }, func(err error) { + t.Fatalf("should have 2 peers: %v", err) + }) + } + + // Verify the old leader is deregistered + state := remain.fsm.State() + _, found, _ := state.GetNode(leader.config.NodeName) + if found { + t.Fatalf("leader should be deregistered") + } +} + func TestLeader_MultiBootstrap(t *testing.T) { dir1, s1 := testServer(t) defer os.RemoveAll(dir1)