consul: Fixing graceful leave of current leader. Fixes #360.

This commit is contained in:
Armon Dadgar 2014-10-13 22:14:43 -07:00
parent b1d0611e0e
commit b6c5d77cf8
3 changed files with 82 additions and 10 deletions

View File

@ -7,6 +7,7 @@ BUG FIXES:
* Fixing issue with Session ID and ACL ID generation. Previously, * Fixing issue with Session ID and ACL ID generation. Previously,
sessions and tokens could be invalidated when the leader changed. sessions and tokens could be invalidated when the leader changed.
* Fixing multiple headers for /v1/event/list endpoint [GH-361] * Fixing multiple headers for /v1/event/list endpoint [GH-361]
* Fixing graceful leave of leader causing invalid Raft peers [GH-360]
## 0.4.0 (September 5, 2014) ## 0.4.0 (September 5, 2014)

View File

@ -399,22 +399,30 @@ func (s *Server) handleReapMember(member serf.Member) error {
// handleDeregisterMember is used to deregister a member of a given reason // handleDeregisterMember is used to deregister a member of a given reason
func (s *Server) handleDeregisterMember(reason string, member serf.Member) error { func (s *Server) handleDeregisterMember(reason string, member serf.Member) error {
state := s.fsm.State() state := s.fsm.State()
// Check if the node does not exists // Check if the node does not exists
_, found, _ := state.GetNode(member.Name) _, found, _ := state.GetNode(member.Name)
if !found { if !found {
return nil return nil
} }
s.logger.Printf("[INFO] consul: member '%s' %s, deregistering", member.Name, reason)
// Remove from Raft peers if this was a server // Remove from Raft peers if this was a server
if valid, parts := isConsulServer(member); valid { if valid, parts := isConsulServer(member); valid {
s.logger.Printf("[INFO] consul: server '%s' %s, removing as peer", member.Name, reason)
if err := s.removeConsulServer(member, parts.Port); err != nil { if err := s.removeConsulServer(member, parts.Port); err != nil {
return err return err
} }
} }
// Do not deregister ourself. This can only happen if the current leader
// is leaving. Instead, we should allow a follower to take-over and
// deregister us later.
if member.Name == s.config.NodeName {
s.logger.Printf("[WARN] consul: deregistering self (%s) should be done by follower", s.config.NodeName)
return nil
}
// Deregister the node // Deregister the node
s.logger.Printf("[INFO] consul: member '%s' %s, deregistering", member.Name, reason)
req := structs.DeregisterRequest{ req := structs.DeregisterRequest{
Datacenter: s.config.Datacenter, Datacenter: s.config.Datacenter,
Node: member.Name, Node: member.Name,
@ -454,11 +462,6 @@ func (s *Server) joinConsulServer(m serf.Member, parts *serverParts) error {
// removeConsulServer is used to try to remove a consul server that has left // removeConsulServer is used to try to remove a consul server that has left
func (s *Server) removeConsulServer(m serf.Member, port int) error { func (s *Server) removeConsulServer(m serf.Member, port int) error {
// Do not remove ourself
if m.Name == s.config.NodeName {
return nil
}
// Attempt to remove as peer // Attempt to remove as peer
peer := &net.TCPAddr{IP: m.Addr, Port: port} peer := &net.TCPAddr{IP: m.Addr, Port: port}
future := s.raft.RemovePeer(peer) future := s.raft.RemovePeer(peer)

View File

@ -3,12 +3,13 @@ package consul
import ( import (
"errors" "errors"
"fmt" "fmt"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/consul/testutil"
"github.com/hashicorp/serf/serf"
"os" "os"
"testing" "testing"
"time" "time"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/consul/testutil"
"github.com/hashicorp/serf/serf"
) )
func TestLeader_RegisterMember(t *testing.T) { func TestLeader_RegisterMember(t *testing.T) {
@ -328,6 +329,73 @@ func TestLeader_LeftServer(t *testing.T) {
} }
} }
func TestLeader_LeftLeader(t *testing.T) {
dir1, s1 := testServer(t)
defer os.RemoveAll(dir1)
defer s1.Shutdown()
dir2, s2 := testServerDCBootstrap(t, "dc1", false)
defer os.RemoveAll(dir2)
defer s2.Shutdown()
dir3, s3 := testServerDCBootstrap(t, "dc1", false)
defer os.RemoveAll(dir3)
defer s3.Shutdown()
servers := []*Server{s1, s2, s3}
// Try to join
addr := fmt.Sprintf("127.0.0.1:%d",
s1.config.SerfLANConfig.MemberlistConfig.BindPort)
if _, err := s2.JoinLAN([]string{addr}); err != nil {
t.Fatalf("err: %v", err)
}
if _, err := s3.JoinLAN([]string{addr}); err != nil {
t.Fatalf("err: %v", err)
}
for _, s := range servers {
testutil.WaitForResult(func() (bool, error) {
peers, _ := s.raftPeers.Peers()
return len(peers) == 3, nil
}, func(err error) {
t.Fatalf("should have 3 peers")
})
}
// Kill the leader!
var leader *Server
for _, s := range servers {
if s.IsLeader() {
leader = s
break
}
}
leader.Leave()
leader.Shutdown()
time.Sleep(100 * time.Millisecond)
var remain *Server
for _, s := range servers {
if s == leader {
continue
}
remain = s
testutil.WaitForResult(func() (bool, error) {
peers, _ := s.raftPeers.Peers()
return len(peers) == 2, errors.New(fmt.Sprintf("%v", peers))
}, func(err error) {
t.Fatalf("should have 2 peers: %v", err)
})
}
// Verify the old leader is deregistered
state := remain.fsm.State()
_, found, _ := state.GetNode(leader.config.NodeName)
if found {
t.Fatalf("leader should be deregistered")
}
}
func TestLeader_MultiBootstrap(t *testing.T) { func TestLeader_MultiBootstrap(t *testing.T) {
dir1, s1 := testServer(t) dir1, s1 := testServer(t)
defer os.RemoveAll(dir1) defer os.RemoveAll(dir1)