open-consul/agent/consul/serf.go

279 lines
8.1 KiB
Go
Raw Normal View History

2013-12-07 01:18:09 +00:00
package consul
2013-12-07 00:54:33 +00:00
2013-12-09 23:29:44 +00:00
import (
"strings"
"time"
pkg refactor command/agent/* -> agent/* command/consul/* -> agent/consul/* command/agent/command{,_test}.go -> command/agent{,_test}.go command/base/command.go -> command/base.go command/base/* -> command/* commands.go -> command/commands.go The script which did the refactor is: ( cd $GOPATH/src/github.com/hashicorp/consul git mv command/agent/command.go command/agent.go git mv command/agent/command_test.go command/agent_test.go git mv command/agent/flag_slice_value{,_test}.go command/ git mv command/agent . git mv command/base/command.go command/base.go git mv command/base/config_util{,_test}.go command/ git mv commands.go command/ git mv consul agent rmdir command/base/ gsed -i -e 's|package agent|package command|' command/agent{,_test}.go gsed -i -e 's|package agent|package command|' command/flag_slice_value{,_test}.go gsed -i -e 's|package base|package command|' command/base.go command/config_util{,_test}.go gsed -i -e 's|package main|package command|' command/commands.go gsed -i -e 's|base.Command|BaseCommand|' command/commands.go gsed -i -e 's|agent.Command|AgentCommand|' command/commands.go gsed -i -e 's|\tCommand:|\tBaseCommand:|' command/commands.go gsed -i -e 's|base\.||' command/commands.go gsed -i -e 's|command\.||' command/commands.go gsed -i -e 's|command|c|' main.go gsed -i -e 's|range Commands|range command.Commands|' main.go gsed -i -e 's|Commands: Commands|Commands: command.Commands|' main.go gsed -i -e 's|base\.BoolValue|BoolValue|' command/operator_autopilot_set.go gsed -i -e 's|base\.DurationValue|DurationValue|' command/operator_autopilot_set.go gsed -i -e 's|base\.StringValue|StringValue|' command/operator_autopilot_set.go gsed -i -e 's|base\.UintValue|UintValue|' command/operator_autopilot_set.go gsed -i -e 's|\bCommand\b|BaseCommand|' command/base.go gsed -i -e 's|BaseCommand Options|Command Options|' command/base.go gsed -i -e 's|base.Command|BaseCommand|' command/*.go gsed -i -e 's|c\.Command|c.BaseCommand|g' command/*.go gsed -i -e 's|\tCommand:|\tBaseCommand:|' command/*_test.go gsed -i -e 's|base\.||' command/*_test.go gsed -i -e 's|\bCommand\b|AgentCommand|' command/agent{,_test}.go gsed -i -e 's|cmd.AgentCommand|cmd.BaseCommand|' command/agent.go gsed -i -e 's|cli.AgentCommand = new(Command)|cli.Command = new(AgentCommand)|' command/agent_test.go gsed -i -e 's|exec.AgentCommand|exec.Command|' command/agent_test.go gsed -i -e 's|exec.BaseCommand|exec.Command|' command/agent_test.go gsed -i -e 's|NewTestAgent|agent.NewTestAgent|' command/agent_test.go gsed -i -e 's|= TestConfig|= agent.TestConfig|' command/agent_test.go gsed -i -e 's|: RetryJoin|: agent.RetryJoin|' command/agent_test.go gsed -i -e 's|\.\./\.\./|../|' command/config_util_test.go gsed -i -e 's|\bverifyUniqueListeners|VerifyUniqueListeners|' agent/config{,_test}.go command/agent.go gsed -i -e 's|\bserfLANKeyring\b|SerfLANKeyring|g' agent/{agent,keyring,testagent}.go command/agent.go gsed -i -e 's|\bserfWANKeyring\b|SerfWANKeyring|g' agent/{agent,keyring,testagent}.go command/agent.go gsed -i -e 's|\bNewAgent\b|agent.New|g' command/agent{,_test}.go gsed -i -e 's|\bNewAgent|New|' agent/{acl_test,agent,testagent}.go gsed -i -e 's|\bAgent\b|agent.&|g' command/agent{,_test}.go gsed -i -e 's|\bBool\b|agent.&|g' command/agent{,_test}.go gsed -i -e 's|\bConfig\b|agent.&|g' command/agent{,_test}.go gsed -i -e 's|\bDefaultConfig\b|agent.&|g' command/agent{,_test}.go gsed -i -e 's|\bDevConfig\b|agent.&|g' command/agent{,_test}.go gsed -i -e 's|\bMergeConfig\b|agent.&|g' command/agent{,_test}.go gsed -i -e 's|\bReadConfigPaths\b|agent.&|g' command/agent{,_test}.go gsed -i -e 's|\bParseMetaPair\b|agent.&|g' command/agent{,_test}.go gsed -i -e 's|\bSerfLANKeyring\b|agent.&|g' command/agent{,_test}.go gsed -i -e 's|\bSerfWANKeyring\b|agent.&|g' command/agent{,_test}.go gsed -i -e 's|circonus\.agent|circonus|g' command/agent{,_test}.go gsed -i -e 's|logger\.agent|logger|g' command/agent{,_test}.go gsed -i -e 's|metrics\.agent|metrics|g' command/agent{,_test}.go gsed -i -e 's|// agent.Agent|// agent|' command/agent{,_test}.go gsed -i -e 's|a\.agent\.Config|a.Config|' command/agent{,_test}.go gsed -i -e 's|agent\.AppendSliceValue|AppendSliceValue|' command/{configtest,validate}.go gsed -i -e 's|consul/consul|agent/consul|' GNUmakefile gsed -i -e 's|\.\./test|../../test|' agent/consul/server_test.go # fix imports f=$(grep -rl 'github.com/hashicorp/consul/command/agent' * | grep '\.go') gsed -i -e 's|github.com/hashicorp/consul/command/agent|github.com/hashicorp/consul/agent|' $f goimports -w $f f=$(grep -rl 'github.com/hashicorp/consul/consul' * | grep '\.go') gsed -i -e 's|github.com/hashicorp/consul/consul|github.com/hashicorp/consul/agent/consul|' $f goimports -w $f goimports -w command/*.go main.go )
2017-06-09 22:28:28 +00:00
"github.com/hashicorp/consul/agent/consul/agent"
"github.com/hashicorp/raft"
"github.com/hashicorp/serf/serf"
2013-12-09 23:29:44 +00:00
)
2014-03-20 19:51:49 +00:00
const (
// StatusReap is used to update the status of a node if we
// are handling a EventMemberReap
StatusReap = serf.MemberStatus(-1)
// userEventPrefix is pre-pended to a user event to distinguish it
userEventPrefix = "consul:event:"
// maxPeerRetries limits how many invalidate attempts are made
maxPeerRetries = 6
// peerRetryBase is a baseline retry time
peerRetryBase = 1 * time.Second
2014-03-20 19:51:49 +00:00
)
2014-08-27 01:50:03 +00:00
// userEventName computes the name of a user event
func userEventName(name string) string {
return userEventPrefix + name
}
// isUserEvent checks if a serf event is a user event
func isUserEvent(name string) bool {
return strings.HasPrefix(name, userEventPrefix)
}
// rawUserEventName is used to get the raw user event name
func rawUserEventName(name string) string {
return strings.TrimPrefix(name, userEventPrefix)
2014-08-27 01:50:03 +00:00
}
2013-12-07 00:54:33 +00:00
// lanEventHandler is used to handle events from the lan Serf cluster
func (s *Server) lanEventHandler() {
for {
select {
case e := <-s.eventChLAN:
2013-12-09 23:29:44 +00:00
switch e.EventType() {
case serf.EventMemberJoin:
s.lanNodeJoin(e.(serf.MemberEvent))
s.localMemberEvent(e.(serf.MemberEvent))
case serf.EventMemberLeave, serf.EventMemberFailed:
s.lanNodeFailed(e.(serf.MemberEvent))
s.localMemberEvent(e.(serf.MemberEvent))
2014-03-20 19:51:49 +00:00
case serf.EventMemberReap:
2014-01-09 23:49:09 +00:00
s.localMemberEvent(e.(serf.MemberEvent))
2013-12-09 23:29:44 +00:00
case serf.EventUser:
s.localEvent(e.(serf.UserEvent))
2017-03-21 23:36:44 +00:00
case serf.EventMemberUpdate:
s.localMemberEvent(e.(serf.MemberEvent))
2014-03-12 19:46:14 +00:00
case serf.EventQuery: // Ignore
2013-12-09 23:29:44 +00:00
default:
s.logger.Printf("[WARN] consul: Unhandled LAN Serf Event: %#v", e)
2013-12-09 23:29:44 +00:00
}
2013-12-07 00:54:33 +00:00
case <-s.shutdownCh:
return
}
}
}
2014-01-09 23:49:09 +00:00
// localMemberEvent is used to reconcile Serf events with the strongly
// consistent store if we are the current leader
func (s *Server) localMemberEvent(me serf.MemberEvent) {
// Do nothing if we are not the leader
if !s.IsLeader() {
return
}
2014-03-20 19:51:49 +00:00
// Check if this is a reap event
isReap := me.EventType() == serf.EventMemberReap
// Queue the members for reconciliation
2014-01-09 23:49:09 +00:00
for _, m := range me.Members {
2014-03-20 19:51:49 +00:00
// Change the status if this is a reap event
if isReap {
m.Status = StatusReap
}
select {
case s.reconcileCh <- m:
default:
}
}
2013-12-09 23:29:44 +00:00
}
// localEvent is called when we receive an event on the local Serf
func (s *Server) localEvent(event serf.UserEvent) {
// Handle only consul events
if !strings.HasPrefix(event.Name, "consul:") {
return
}
switch name := event.Name; {
case name == newLeaderEvent:
s.logger.Printf("[INFO] consul: New leader elected: %s", event.Payload)
// Trigger the callback
if s.config.ServerUp != nil {
s.config.ServerUp()
}
case isUserEvent(name):
event.Name = rawUserEventName(name)
s.logger.Printf("[DEBUG] consul: User event: %s", event.Name)
// Trigger the callback
if s.config.UserEventHandler != nil {
s.config.UserEventHandler(event)
}
default:
s.logger.Printf("[WARN] consul: Unhandled local event: %v", event)
}
}
// lanNodeJoin is used to handle join events on the LAN pool.
func (s *Server) lanNodeJoin(me serf.MemberEvent) {
2013-12-12 00:24:34 +00:00
for _, m := range me.Members {
2016-03-30 00:39:19 +00:00
ok, parts := agent.IsConsulServer(m)
2013-12-12 00:24:34 +00:00
if !ok {
continue
}
s.logger.Printf("[INFO] consul: Adding LAN server %s", parts)
// See if it's configured as part of our DC.
if parts.Datacenter == s.config.Datacenter {
s.localLock.Lock()
s.localConsuls[raft.ServerAddress(parts.Addr.String())] = parts
s.localLock.Unlock()
}
// If we're still expecting to bootstrap, may need to handle this.
if s.config.BootstrapExpect != 0 {
s.maybeBootstrap()
}
2017-03-15 19:26:54 +00:00
// Kick the join flooders.
s.FloodNotify()
}
}
// maybeBootstrap is used to handle bootstrapping when a new consul server joins.
2014-06-18 23:15:28 +00:00
func (s *Server) maybeBootstrap() {
// Bootstrap can only be done if there are no committed logs, remove our
// expectations of bootstrapping. This is slightly cheaper than the full
// check that BootstrapCluster will do, so this is a good pre-filter.
2014-06-18 23:15:28 +00:00
index, err := s.raftStore.LastIndex()
if err != nil {
s.logger.Printf("[ERR] consul: Failed to read last raft index: %v", err)
2014-06-18 23:15:28 +00:00
return
}
if index != 0 {
s.logger.Printf("[INFO] consul: Raft data found, disabling bootstrap mode")
s.config.BootstrapExpect = 0
2014-06-18 23:15:28 +00:00
return
}
// Scan for all the known servers.
2014-06-18 23:15:28 +00:00
members := s.serfLAN.Members()
var servers []agent.Server
2014-06-18 23:15:28 +00:00
for _, member := range members {
2016-03-30 00:39:19 +00:00
valid, p := agent.IsConsulServer(member)
2014-06-18 23:15:28 +00:00
if !valid {
continue
}
if p.Datacenter != s.config.Datacenter {
s.logger.Printf("[ERR] consul: Member %v has a conflicting datacenter, ignoring", member)
continue
}
if p.Expect != 0 && p.Expect != s.config.BootstrapExpect {
2014-06-18 23:15:28 +00:00
s.logger.Printf("[ERR] consul: Member %v has a conflicting expect value. All nodes should expect the same number.", member)
return
}
if p.Bootstrap {
s.logger.Printf("[ERR] consul: Member %v has bootstrap mode. Expect disabled.", member)
return
}
servers = append(servers, *p)
2014-06-18 23:15:28 +00:00
}
// Skip if we haven't met the minimum expect count.
if len(servers) < s.config.BootstrapExpect {
2014-06-18 23:15:28 +00:00
return
}
// Query each of the servers and make sure they report no Raft peers.
for _, server := range servers {
var peers []string
// Retry with exponential backoff to get peer status from this server
for attempt := uint(0); attempt < maxPeerRetries; attempt++ {
if err := s.connPool.RPC(s.config.Datacenter, server.Addr, server.Version,
"Status.Peers", server.UseTLS, &struct{}{}, &peers); err != nil {
nextRetry := time.Duration((1 << attempt) * peerRetryBase)
s.logger.Printf("[ERR] consul: Failed to confirm peer status for %s: %v. Retrying in "+
"%v...", server.Name, err, nextRetry.String())
time.Sleep(nextRetry)
} else {
break
}
}
// Found a node with some Raft peers, stop bootstrap since there's
// evidence of an existing cluster. We should get folded in by the
2016-09-01 06:54:53 +00:00
// existing servers if that's the case, so it's cleaner to sit as a
// candidate with no peers so we don't cause spurious elections.
// It's OK this is racy, because even with an initial bootstrap
// as long as one peer runs bootstrap things will work, and if we
// have multiple peers bootstrap in the same way, that's OK. We
// just don't want a server added much later to do a live bootstrap
// and interfere with the cluster. This isn't required for Raft's
// correctness because no server in the existing cluster will vote
// for this server, but it makes things much more stable.
if len(peers) > 0 {
s.logger.Printf("[INFO] consul: Existing Raft peers reported by %s, disabling bootstrap mode", server.Name)
s.config.BootstrapExpect = 0
return
}
}
// Attempt a live bootstrap!
var configuration raft.Configuration
var addrs []string
2017-02-22 20:53:32 +00:00
minRaftVersion, err := ServerMinRaftProtocol(members)
if err != nil {
s.logger.Printf("[ERR] consul: Failed to read server raft versions: %v", err)
}
for _, server := range servers {
addr := server.Addr.String()
addrs = append(addrs, addr)
2017-02-22 20:53:32 +00:00
var id raft.ServerID
if minRaftVersion >= 3 {
2017-02-22 20:53:32 +00:00
id = raft.ServerID(server.ID)
} else {
id = raft.ServerID(addr)
}
peer := raft.Server{
2017-02-22 20:53:32 +00:00
ID: id,
Address: raft.ServerAddress(addr),
}
configuration.Servers = append(configuration.Servers, peer)
}
s.logger.Printf("[INFO] consul: Found expected number of peers, attempting bootstrap: %s",
strings.Join(addrs, ","))
future := s.raft.BootstrapCluster(configuration)
if err := future.Error(); err != nil {
s.logger.Printf("[ERR] consul: Failed to bootstrap cluster: %v", err)
2014-06-18 23:15:28 +00:00
}
// Bootstrapping complete, or failed for some reason, don't enter this
// again.
s.config.BootstrapExpect = 0
2014-06-18 23:15:28 +00:00
}
// lanNodeFailed is used to handle fail events on the LAN pool.
func (s *Server) lanNodeFailed(me serf.MemberEvent) {
for _, m := range me.Members {
2016-03-30 00:39:19 +00:00
ok, parts := agent.IsConsulServer(m)
if !ok {
continue
}
s.logger.Printf("[INFO] consul: Removing LAN server %s", parts)
s.localLock.Lock()
delete(s.localConsuls, raft.ServerAddress(parts.Addr.String()))
s.localLock.Unlock()
}
}