2017-03-01 22:04:40 +00:00
|
|
|
package consul
|
|
|
|
|
|
|
|
import (
	"context"
	"fmt"
	"strconv"
	"sync"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/consul/agent/consul/agent"
	"github.com/hashicorp/consul/agent/consul/structs"
	"github.com/hashicorp/go-version"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
)
|
|
|
|
|
2017-03-08 19:31:32 +00:00
|
|
|
// AutopilotPolicy is the pluggable interface for the autopilot promotion
// mechanism; implementations decide how non-voting servers become voters.
type AutopilotPolicy interface {
	// PromoteNonVoters defines the handling of non-voting servers
	PromoteNonVoters(*structs.AutopilotConfig) error
}
|
|
|
|
|
2017-03-01 22:04:40 +00:00
|
|
|
// startAutopilot launches the background autopilot goroutine. The shutdown
// channel and wait group are re-created on each call so the loop can be
// started again after a previous stopAutopilot.
func (s *Server) startAutopilot() {
	s.autopilotShutdownCh = make(chan struct{})
	s.autopilotWaitGroup = sync.WaitGroup{}
	s.autopilotWaitGroup.Add(1)

	go s.autopilotLoop()
}
|
|
|
|
|
|
|
|
// stopAutopilot signals the autopilot goroutine to exit and blocks until it
// has finished.
func (s *Server) stopAutopilot() {
	close(s.autopilotShutdownCh)
	s.autopilotWaitGroup.Wait()
}
|
|
|
|
|
2017-04-13 00:09:57 +00:00
|
|
|
// minAutopilotVersion is the minimum Consul server version required for
// autopilot features.
var minAutopilotVersion = version.Must(version.NewVersion("0.8.0"))
|
2017-04-12 22:28:18 +00:00
|
|
|
|
2017-03-10 00:43:07 +00:00
|
|
|
// autopilotLoop periodically looks for nonvoting servers to promote and dead servers to remove.
|
|
|
|
func (s *Server) autopilotLoop() {
|
2017-03-10 19:41:17 +00:00
|
|
|
defer s.autopilotWaitGroup.Done()
|
|
|
|
|
2017-03-01 22:04:40 +00:00
|
|
|
// Monitor server health until shutdown
|
2017-03-10 00:43:07 +00:00
|
|
|
ticker := time.NewTicker(s.config.AutopilotInterval)
|
2017-03-10 19:41:17 +00:00
|
|
|
defer ticker.Stop()
|
|
|
|
|
2017-03-01 22:04:40 +00:00
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-s.autopilotShutdownCh:
|
|
|
|
return
|
|
|
|
case <-ticker.C:
|
2017-04-13 01:38:36 +00:00
|
|
|
autopilotConfig, ok := s.getOrCreateAutopilotConfig()
|
2017-04-13 00:09:57 +00:00
|
|
|
if !ok {
|
2017-04-12 22:28:18 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2017-04-13 00:09:57 +00:00
|
|
|
if err := s.autopilotPolicy.PromoteNonVoters(autopilotConfig); err != nil {
|
2017-04-05 00:21:49 +00:00
|
|
|
s.logger.Printf("[ERR] autopilot: error checking for non-voters to promote: %s", err)
|
2017-03-01 22:04:40 +00:00
|
|
|
}
|
|
|
|
|
2017-04-13 00:09:57 +00:00
|
|
|
if err := s.pruneDeadServers(autopilotConfig); err != nil {
|
2017-04-05 00:21:49 +00:00
|
|
|
s.logger.Printf("[ERR] autopilot: error checking for dead servers to remove: %s", err)
|
2017-03-01 22:04:40 +00:00
|
|
|
}
|
|
|
|
case <-s.autopilotRemoveDeadCh:
|
2017-04-13 01:38:36 +00:00
|
|
|
autopilotConfig, ok := s.getOrCreateAutopilotConfig()
|
2017-04-13 00:09:57 +00:00
|
|
|
if !ok {
|
|
|
|
continue
|
2017-03-01 22:04:40 +00:00
|
|
|
}
|
2017-04-12 22:28:18 +00:00
|
|
|
|
2017-04-13 00:09:57 +00:00
|
|
|
if err := s.pruneDeadServers(autopilotConfig); err != nil {
|
|
|
|
s.logger.Printf("[ERR] autopilot: error checking for dead servers to remove: %s", err)
|
2017-04-12 22:28:18 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-03-01 22:04:40 +00:00
|
|
|
// pruneDeadServers removes up to numPeers/2 failed servers
|
2017-04-13 00:09:57 +00:00
|
|
|
func (s *Server) pruneDeadServers(autopilotConfig *structs.AutopilotConfig) error {
|
2017-03-07 21:58:06 +00:00
|
|
|
// Find any failed servers
|
|
|
|
var failed []string
|
2017-03-29 19:52:00 +00:00
|
|
|
staleRaftServers := make(map[string]raft.Server)
|
2017-04-13 00:09:57 +00:00
|
|
|
if autopilotConfig.CleanupDeadServers {
|
2017-03-29 19:52:00 +00:00
|
|
|
future := s.raft.GetConfiguration()
|
2017-04-13 00:09:57 +00:00
|
|
|
if err := future.Error(); err != nil {
|
2017-03-29 19:52:00 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, server := range future.Configuration().Servers {
|
|
|
|
staleRaftServers[string(server.Address)] = server
|
|
|
|
}
|
|
|
|
|
2017-03-01 22:04:40 +00:00
|
|
|
for _, member := range s.serfLAN.Members() {
|
2017-03-29 19:52:00 +00:00
|
|
|
valid, parts := agent.IsConsulServer(member)
|
|
|
|
|
|
|
|
if valid {
|
|
|
|
// Remove this server from the stale list; it has a serf entry
|
|
|
|
if _, ok := staleRaftServers[parts.Addr.String()]; ok {
|
|
|
|
delete(staleRaftServers, parts.Addr.String())
|
|
|
|
}
|
|
|
|
|
|
|
|
if member.Status == serf.StatusFailed {
|
|
|
|
failed = append(failed, member.Name)
|
|
|
|
}
|
2017-03-01 22:04:40 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-03-29 19:52:00 +00:00
|
|
|
removalCount := len(failed) + len(staleRaftServers)
|
|
|
|
|
2017-03-10 00:43:07 +00:00
|
|
|
// Nothing to remove, return early
|
2017-03-29 19:52:00 +00:00
|
|
|
if removalCount == 0 {
|
2017-03-10 00:43:07 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-03-07 21:58:06 +00:00
|
|
|
peers, err := s.numPeers()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Only do removals if a minority of servers will be affected
|
2017-03-29 19:52:00 +00:00
|
|
|
if removalCount < peers/2 {
|
2017-03-07 21:58:06 +00:00
|
|
|
for _, server := range failed {
|
2017-04-05 00:21:49 +00:00
|
|
|
s.logger.Printf("[INFO] autopilot: Attempting removal of failed server: %v", server)
|
2017-03-07 21:58:06 +00:00
|
|
|
go s.serfLAN.RemoveFailedNode(server)
|
|
|
|
}
|
2017-03-29 20:38:40 +00:00
|
|
|
|
|
|
|
minRaftProtocol, err := ServerMinRaftProtocol(s.serfLAN.Members())
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-03-29 19:52:00 +00:00
|
|
|
for _, raftServer := range staleRaftServers {
|
2017-03-29 20:38:40 +00:00
|
|
|
var future raft.Future
|
|
|
|
if minRaftProtocol >= 2 {
|
2017-04-05 00:21:49 +00:00
|
|
|
s.logger.Printf("[INFO] autopilot: Attempting removal of stale raft server : %v", raftServer.ID)
|
2017-03-29 20:38:40 +00:00
|
|
|
future = s.raft.RemoveServer(raftServer.ID, 0, 0)
|
|
|
|
} else {
|
2017-04-05 00:21:49 +00:00
|
|
|
s.logger.Printf("[INFO] autopilot: Attempting removal of stale raft server : %v", raftServer.ID)
|
2017-03-29 20:38:40 +00:00
|
|
|
future = s.raft.RemovePeer(raftServer.Address)
|
|
|
|
}
|
|
|
|
if err := future.Error(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-03-29 19:52:00 +00:00
|
|
|
}
|
2017-03-07 21:58:06 +00:00
|
|
|
} else {
|
2017-04-05 00:21:49 +00:00
|
|
|
s.logger.Printf("[DEBUG] autopilot: Failed to remove dead servers: too many dead servers: %d/%d", removalCount, peers)
|
2017-03-07 21:58:06 +00:00
|
|
|
}
|
|
|
|
|
2017-03-01 22:04:40 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-03-08 19:31:32 +00:00
|
|
|
// BasicAutopilot defines a policy for promoting non-voting servers in a way
// that maintains an odd-numbered voter count.
type BasicAutopilot struct {
	// server is the Server whose non-voters this policy promotes.
	server *Server
}
|
|
|
|
|
|
|
|
// PromoteNonVoters promotes eligible non-voting servers to voters. It scans
// the current raft configuration for non-voters whose health has been stable
// long enough (per autopilotConfig) and hands them to handlePromotions, which
// keeps the voter count odd. Requires raft protocol >= 3 cluster-wide.
func (b *BasicAutopilot) PromoteNonVoters(autopilotConfig *structs.AutopilotConfig) error {
	minRaftProtocol, err := ServerMinRaftProtocol(b.server.LANMembers())
	if err != nil {
		return fmt.Errorf("error getting server raft protocol versions: %s", err)
	}

	// If we don't meet the minimum version for non-voter features, bail early
	if minRaftProtocol < 3 {
		return nil
	}

	future := b.server.raft.GetConfiguration()
	if err := future.Error(); err != nil {
		return fmt.Errorf("failed to get raft configuration: %v", err)
	}

	// Find any non-voters eligible for promotion
	var promotions []raft.Server
	voterCount := 0
	for _, server := range future.Configuration().Servers {
		// If this server has been stable and passing for long enough, promote it to a voter
		if !isVoter(server.Suffrage) {
			// NOTE(review): getServerHealth returns nil for unknown IDs;
			// this assumes IsStable tolerates a nil receiver — confirm.
			health := b.server.getServerHealth(string(server.ID))
			if health.IsStable(time.Now(), autopilotConfig) {
				promotions = append(promotions, server)
			}
		} else {
			voterCount++
		}
	}

	if _, err := b.server.handlePromotions(voterCount, promotions); err != nil {
		return err
	}

	return nil
}
|
|
|
|
|
|
|
|
func (s *Server) handlePromotions(voterCount int, promotions []raft.Server) (bool, error) {
|
2017-03-07 21:58:06 +00:00
|
|
|
if len(promotions) == 0 {
|
2017-03-21 23:36:44 +00:00
|
|
|
return false, nil
|
2017-03-07 21:58:06 +00:00
|
|
|
}
|
2017-03-01 22:04:40 +00:00
|
|
|
|
2017-03-07 21:58:06 +00:00
|
|
|
// If there's currently an even number of servers, we can promote the first server in the list
|
|
|
|
// to get to an odd-sized quorum
|
|
|
|
newServers := false
|
|
|
|
if voterCount%2 == 0 {
|
2017-03-21 23:36:44 +00:00
|
|
|
addFuture := s.raft.AddVoter(promotions[0].ID, promotions[0].Address, 0, 0)
|
2017-03-07 21:58:06 +00:00
|
|
|
if err := addFuture.Error(); err != nil {
|
2017-03-21 23:36:44 +00:00
|
|
|
return newServers, fmt.Errorf("failed to add raft peer: %v", err)
|
2017-03-01 22:04:40 +00:00
|
|
|
}
|
2017-03-07 21:58:06 +00:00
|
|
|
promotions = promotions[1:]
|
|
|
|
newServers = true
|
|
|
|
}
|
2017-03-01 22:04:40 +00:00
|
|
|
|
2017-03-07 21:58:06 +00:00
|
|
|
// Promote remaining servers in twos to maintain an odd quorum size
|
|
|
|
for i := 0; i < len(promotions)-1; i += 2 {
|
2017-03-21 23:36:44 +00:00
|
|
|
addFirst := s.raft.AddVoter(promotions[i].ID, promotions[i].Address, 0, 0)
|
2017-03-07 21:58:06 +00:00
|
|
|
if err := addFirst.Error(); err != nil {
|
2017-03-21 23:36:44 +00:00
|
|
|
return newServers, fmt.Errorf("failed to add raft peer: %v", err)
|
2017-03-01 22:04:40 +00:00
|
|
|
}
|
2017-03-21 23:36:44 +00:00
|
|
|
addSecond := s.raft.AddVoter(promotions[i+1].ID, promotions[i+1].Address, 0, 0)
|
2017-03-07 21:58:06 +00:00
|
|
|
if err := addSecond.Error(); err != nil {
|
2017-03-21 23:36:44 +00:00
|
|
|
return newServers, fmt.Errorf("failed to add raft peer: %v", err)
|
2017-03-07 21:58:06 +00:00
|
|
|
}
|
|
|
|
newServers = true
|
|
|
|
}
|
2017-03-01 22:04:40 +00:00
|
|
|
|
2017-03-07 21:58:06 +00:00
|
|
|
// If we added a new server, trigger a check to remove dead servers
|
|
|
|
if newServers {
|
|
|
|
select {
|
2017-03-21 23:36:44 +00:00
|
|
|
case s.autopilotRemoveDeadCh <- struct{}{}:
|
2017-03-07 21:58:06 +00:00
|
|
|
default:
|
2017-03-01 22:04:40 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-03-21 23:36:44 +00:00
|
|
|
return newServers, nil
|
2017-03-01 22:04:40 +00:00
|
|
|
}
|
|
|
|
|
2017-03-10 00:43:07 +00:00
|
|
|
// serverHealthLoop monitors the health of the servers in the cluster
|
|
|
|
func (s *Server) serverHealthLoop() {
|
|
|
|
// Monitor server health until shutdown
|
|
|
|
ticker := time.NewTicker(s.config.ServerHealthInterval)
|
2017-03-10 19:41:17 +00:00
|
|
|
defer ticker.Stop()
|
|
|
|
|
2017-03-10 00:43:07 +00:00
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-s.shutdownCh:
|
|
|
|
return
|
|
|
|
case <-ticker.C:
|
2017-03-16 02:57:54 +00:00
|
|
|
if err := s.updateClusterHealth(); err != nil {
|
2017-04-05 00:21:49 +00:00
|
|
|
s.logger.Printf("[ERR] autopilot: error updating cluster health: %s", err)
|
2017-03-16 02:57:54 +00:00
|
|
|
}
|
2017-03-16 01:27:17 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2017-03-10 00:43:07 +00:00
|
|
|
|
2017-03-16 01:27:17 +00:00
|
|
|
// updateClusterHealth fetches the Raft stats of the other servers and updates
// s.clusterHealth based on the configured Autopilot thresholds. It is a no-op
// until all servers speak raft protocol >= 3 and an autopilot config exists.
func (s *Server) updateClusterHealth() error {
	// Don't do anything if the min Raft version is too low
	minRaftProtocol, err := ServerMinRaftProtocol(s.LANMembers())
	if err != nil {
		return fmt.Errorf("error getting server raft protocol versions: %s", err)
	}
	if minRaftProtocol < 3 {
		return nil
	}

	state := s.fsm.State()
	_, autopilotConf, err := state.AutopilotConfig()
	if err != nil {
		return fmt.Errorf("error retrieving autopilot config: %s", err)
	}
	// Bail early if autopilot config hasn't been initialized yet
	if autopilotConf == nil {
		return nil
	}

	// Get the serf members which are Consul servers, keyed by server ID so
	// they can be matched against raft configuration entries below.
	serverMap := make(map[string]*agent.Server)
	for _, member := range s.LANMembers() {
		if member.Status == serf.StatusLeft {
			continue
		}

		valid, parts := agent.IsConsulServer(member)
		if valid {
			serverMap[parts.ID] = parts
		}
	}

	future := s.raft.GetConfiguration()
	if err := future.Error(); err != nil {
		return fmt.Errorf("error getting Raft configuration %s", err)
	}
	servers := future.Configuration().Servers

	// Fetch the health for each of the servers in parallel so we get as
	// consistent of a sample as possible. We capture the leader's index
	// here as well so it roughly lines up with the same point in time.
	targetLastIndex := s.raft.LastIndex()
	var fetchList []*agent.Server
	for _, server := range servers {
		if parts, ok := serverMap[string(server.ID)]; ok {
			fetchList = append(fetchList, parts)
		}
	}
	// Bound the fetch to half the health interval so a slow server cannot
	// make this pass overlap the next one.
	d := time.Now().Add(s.config.ServerHealthInterval / 2)
	ctx, cancel := context.WithDeadline(context.Background(), d)
	defer cancel()
	fetchedStats := s.statsFetcher.Fetch(ctx, fetchList)

	// Build a current list of server healths
	leader := s.raft.Leader()
	var clusterHealth structs.OperatorHealthReply
	voterCount := 0
	healthyCount := 0
	healthyVoterCount := 0
	for _, server := range servers {
		health := structs.ServerHealth{
			ID:          string(server.ID),
			Address:     string(server.Address),
			Leader:      server.Address == leader,
			LastContact: -1,
			Voter:       server.Suffrage == raft.Voter,
		}

		parts, ok := serverMap[string(server.ID)]
		if ok {
			health.Name = parts.Name
			health.SerfStatus = parts.Status
			health.Version = parts.Build.String()
			if stats, ok := fetchedStats[string(server.ID)]; ok {
				// A failed health update is logged but does not abort the
				// pass; the server is simply left marked unhealthy.
				if err := s.updateServerHealth(&health, parts, stats, autopilotConf, targetLastIndex); err != nil {
					s.logger.Printf("[WARN] autopilot: error updating server health: %s", err)
				}
			}
		} else {
			// In raft config but not in serf: no live member entry.
			health.SerfStatus = serf.StatusNone
		}

		if health.Voter {
			voterCount++
		}
		if health.Healthy {
			healthyCount++
			if health.Voter {
				healthyVoterCount++
			}
		}

		clusterHealth.Servers = append(clusterHealth.Servers, health)
	}
	clusterHealth.Healthy = healthyCount == len(servers)

	// If we have extra healthy voters, update FailureTolerance
	requiredQuorum := voterCount/2 + 1
	if healthyVoterCount > requiredQuorum {
		clusterHealth.FailureTolerance = healthyVoterCount - requiredQuorum
	}

	// Heartbeat a metric for monitoring if we're the leader
	if s.IsLeader() {
		metrics.SetGauge([]string{"consul", "autopilot", "failure_tolerance"}, float32(clusterHealth.FailureTolerance))
		if clusterHealth.Healthy {
			metrics.SetGauge([]string{"consul", "autopilot", "healthy"}, 1)
		} else {
			metrics.SetGauge([]string{"consul", "autopilot", "healthy"}, 0)
		}
	}

	s.clusterHealthLock.Lock()
	s.clusterHealth = clusterHealth
	s.clusterHealthLock.Unlock()

	return nil
}
|
|
|
|
|
2017-03-20 03:48:42 +00:00
|
|
|
// updateServerHealth computes the resulting health of the server based on its
|
|
|
|
// fetched stats and the state of the leader.
|
|
|
|
func (s *Server) updateServerHealth(health *structs.ServerHealth,
|
|
|
|
server *agent.Server, stats *structs.ServerStats,
|
|
|
|
autopilotConf *structs.AutopilotConfig, targetLastIndex uint64) error {
|
2017-03-07 21:58:06 +00:00
|
|
|
|
2017-03-16 01:27:17 +00:00
|
|
|
health.LastTerm = stats.LastTerm
|
|
|
|
health.LastIndex = stats.LastIndex
|
2017-03-01 22:04:40 +00:00
|
|
|
|
2017-03-10 00:43:07 +00:00
|
|
|
if stats.LastContact != "never" {
|
2017-03-20 03:48:42 +00:00
|
|
|
var err error
|
2017-03-10 00:43:07 +00:00
|
|
|
health.LastContact, err = time.ParseDuration(stats.LastContact)
|
2017-03-01 22:04:40 +00:00
|
|
|
if err != nil {
|
2017-03-16 01:27:17 +00:00
|
|
|
return fmt.Errorf("error parsing last_contact duration: %s", err)
|
2017-03-01 22:04:40 +00:00
|
|
|
}
|
|
|
|
}
|
2017-03-07 21:58:06 +00:00
|
|
|
|
2017-03-10 00:43:07 +00:00
|
|
|
lastTerm, err := strconv.ParseUint(s.raft.Stats()["last_log_term"], 10, 64)
|
|
|
|
if err != nil {
|
2017-03-16 01:27:17 +00:00
|
|
|
return fmt.Errorf("error parsing last_log_term: %s", err)
|
2017-03-10 00:43:07 +00:00
|
|
|
}
|
2017-03-20 03:48:42 +00:00
|
|
|
health.Healthy = health.IsHealthy(lastTerm, targetLastIndex, autopilotConf)
|
2017-03-01 22:04:40 +00:00
|
|
|
|
|
|
|
// If this is a new server or the health changed, reset StableSince
|
2017-03-10 00:43:07 +00:00
|
|
|
lastHealth := s.getServerHealth(server.ID)
|
2017-03-01 22:04:40 +00:00
|
|
|
if lastHealth == nil || lastHealth.Healthy != health.Healthy {
|
|
|
|
health.StableSince = time.Now()
|
|
|
|
} else {
|
|
|
|
health.StableSince = lastHealth.StableSince
|
|
|
|
}
|
|
|
|
|
2017-03-16 01:27:17 +00:00
|
|
|
return nil
|
2017-03-01 22:04:40 +00:00
|
|
|
}
|
|
|
|
|
2017-03-15 23:09:55 +00:00
|
|
|
func (s *Server) getClusterHealth() structs.OperatorHealthReply {
|
|
|
|
s.clusterHealthLock.RLock()
|
|
|
|
defer s.clusterHealthLock.RUnlock()
|
|
|
|
return s.clusterHealth
|
|
|
|
}
|
|
|
|
|
2017-03-10 00:43:07 +00:00
|
|
|
func (s *Server) getServerHealth(id string) *structs.ServerHealth {
|
2017-03-15 23:09:55 +00:00
|
|
|
s.clusterHealthLock.RLock()
|
|
|
|
defer s.clusterHealthLock.RUnlock()
|
|
|
|
for _, health := range s.clusterHealth.Servers {
|
|
|
|
if health.ID == id {
|
|
|
|
return &health
|
|
|
|
}
|
2017-03-01 22:04:40 +00:00
|
|
|
}
|
2017-03-15 23:09:55 +00:00
|
|
|
return nil
|
2017-03-01 22:04:40 +00:00
|
|
|
}
|
2017-03-21 23:36:44 +00:00
|
|
|
|
|
|
|
func isVoter(suffrage raft.ServerSuffrage) bool {
|
|
|
|
switch suffrage {
|
|
|
|
case raft.Voter, raft.Staging:
|
|
|
|
return true
|
|
|
|
default:
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|