7921f044e5
Nomad's original autopilot was importing from a private package in Consul. It has been moved out to a shared library. Switch Nomad to use this library so that we can eliminate the import of Consul, which is necessary to build Nomad ENT with the current version of the Consul SDK. This also will let us pick up autopilot improvements shared with Consul more easily.
247 lines
7 KiB
Go
247 lines
7 KiB
Go
package nomad
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"strconv"
|
|
|
|
metrics "github.com/armon/go-metrics"
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
"github.com/hashicorp/raft"
|
|
autopilot "github.com/hashicorp/raft-autopilot"
|
|
"github.com/hashicorp/serf/serf"
|
|
)
|
|
|
|
const (
|
|
// AutopilotRZTag is the Serf tag to use for the redundancy zone value
|
|
// when passing the server metadata to Autopilot.
|
|
AutopilotRZTag = "ap_zone"
|
|
|
|
// AutopilotRZTag is the Serf tag to use for the custom version value
|
|
// when passing the server metadata to Autopilot.
|
|
AutopilotVersionTag = "ap_version"
|
|
)
|
|
|
|
// AutopilotDelegate is a Nomad delegate for autopilot operations. It implements
|
|
// the autopilot.ApplicationIntegration interface, and the methods required for
|
|
// that interface have been documented as such below.
|
|
type AutopilotDelegate struct {
|
|
server *Server
|
|
}
|
|
|
|
// AutopilotConfig is used to retrieve the latest configuration from the Nomad
|
|
// delegate. This method is required to implement the ApplicationIntegration
|
|
// interface.
|
|
func (d *AutopilotDelegate) AutopilotConfig() *autopilot.Config {
|
|
c := d.server.getOrCreateAutopilotConfig()
|
|
if c == nil {
|
|
return nil
|
|
}
|
|
conf := &autopilot.Config{
|
|
CleanupDeadServers: c.CleanupDeadServers,
|
|
LastContactThreshold: c.LastContactThreshold,
|
|
MaxTrailingLogs: c.MaxTrailingLogs,
|
|
MinQuorum: c.MinQuorum,
|
|
ServerStabilizationTime: c.ServerStabilizationTime,
|
|
Ext: autopilotConfigExt(c),
|
|
}
|
|
|
|
return conf
|
|
}
|
|
|
|
// FetchServerStats will be called by autopilot to request Nomad fetch the
|
|
// server stats out of band. This method is required to implement the
|
|
// ApplicationIntegration interface
|
|
func (d *AutopilotDelegate) FetchServerStats(ctx context.Context, servers map[raft.ServerID]*autopilot.Server) map[raft.ServerID]*autopilot.ServerStats {
|
|
return d.server.statsFetcher.Fetch(ctx, servers)
|
|
}
|
|
|
|
// KnownServers will be called by autopilot to request the list of servers known
|
|
// to Nomad. This method is required to implement the ApplicationIntegration
|
|
// interface
|
|
func (d *AutopilotDelegate) KnownServers() map[raft.ServerID]*autopilot.Server {
|
|
return d.server.autopilotServers()
|
|
}
|
|
|
|
// NotifyState will be called when the autopilot state is updated. The Nomad
|
|
// leader heartbeats a metric for monitoring based on this information. This
|
|
// method is required to implement the ApplicationIntegration interface
|
|
func (d *AutopilotDelegate) NotifyState(state *autopilot.State) {
|
|
if d.server.raft.State() == raft.Leader {
|
|
metrics.SetGauge([]string{"nomad", "autopilot", "failure_tolerance"}, float32(state.FailureTolerance))
|
|
if state.Healthy {
|
|
metrics.SetGauge([]string{"nomad", "autopilot", "healthy"}, 1)
|
|
} else {
|
|
metrics.SetGauge([]string{"nomad", "autopilot", "healthy"}, 0)
|
|
}
|
|
}
|
|
}
|
|
|
|
// RemoveFailedServer will be called by autopilot to notify Nomad to remove the
|
|
// server in a failed state. This method is required to implement the
|
|
// ApplicationIntegration interface. (Note this is expected to return
|
|
// immediately so we'll spawn a goroutine for it.)
|
|
func (d *AutopilotDelegate) RemoveFailedServer(failedSrv *autopilot.Server) {
|
|
go func() {
|
|
err := d.server.RemoveFailedNode(failedSrv.Name)
|
|
if err != nil {
|
|
d.server.logger.Error("could not remove failed server", "server", string(failedSrv.ID))
|
|
}
|
|
}()
|
|
}
|
|
|
|
// MinRaftProtocol returns the lowest supported Raft protocol among alive
|
|
// servers
|
|
func (s *Server) MinRaftProtocol() (int, error) {
|
|
return minRaftProtocol(s.serf.Members(), isNomadServer)
|
|
}
|
|
|
|
// GetClusterHealth is used to get the current health of the servers, as known
|
|
// by the leader.
|
|
func (s *Server) GetClusterHealth() *structs.OperatorHealthReply {
|
|
|
|
state := s.autopilot.GetState()
|
|
if state == nil {
|
|
// this behavior seems odd but its functionally equivalent to 1.8.5 where if
|
|
// autopilot didn't have a health reply yet it would just return no error
|
|
return nil
|
|
}
|
|
|
|
health := &structs.OperatorHealthReply{
|
|
Healthy: state.Healthy,
|
|
FailureTolerance: state.FailureTolerance,
|
|
}
|
|
|
|
for _, srv := range state.Servers {
|
|
srvHealth := structs.ServerHealth{
|
|
ID: string(srv.Server.ID),
|
|
Name: srv.Server.Name,
|
|
Address: string(srv.Server.Address),
|
|
Version: srv.Server.Version,
|
|
Leader: srv.State == autopilot.RaftLeader,
|
|
Voter: srv.State == autopilot.RaftLeader || srv.State == autopilot.RaftVoter,
|
|
LastContact: srv.Stats.LastContact,
|
|
LastTerm: srv.Stats.LastTerm,
|
|
LastIndex: srv.Stats.LastIndex,
|
|
Healthy: srv.Health.Healthy,
|
|
StableSince: srv.Health.StableSince,
|
|
}
|
|
|
|
switch srv.Server.NodeStatus {
|
|
case autopilot.NodeAlive:
|
|
srvHealth.SerfStatus = serf.StatusAlive
|
|
case autopilot.NodeLeft:
|
|
srvHealth.SerfStatus = serf.StatusLeft
|
|
case autopilot.NodeFailed:
|
|
srvHealth.SerfStatus = serf.StatusFailed
|
|
default:
|
|
srvHealth.SerfStatus = serf.StatusNone
|
|
}
|
|
|
|
health.Servers = append(health.Servers, srvHealth)
|
|
}
|
|
|
|
return health
|
|
}
|
|
|
|
// -------------------
|
|
// helper functions
|
|
|
|
func minRaftProtocol(members []serf.Member, serverFunc func(serf.Member) (bool, *serverParts)) (int, error) {
|
|
minVersion := -1
|
|
for _, m := range members {
|
|
if m.Status != serf.StatusAlive {
|
|
continue
|
|
}
|
|
|
|
ok, server := serverFunc(m)
|
|
if !ok {
|
|
return -1, fmt.Errorf("not a Nomad server")
|
|
}
|
|
if server == nil {
|
|
continue
|
|
}
|
|
|
|
vsn, ok := m.Tags["raft_vsn"]
|
|
if !ok {
|
|
vsn = "1"
|
|
}
|
|
raftVsn, err := strconv.Atoi(vsn)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
|
|
if minVersion == -1 || raftVsn < minVersion {
|
|
minVersion = raftVsn
|
|
}
|
|
}
|
|
|
|
if minVersion == -1 {
|
|
return minVersion, fmt.Errorf("No servers found")
|
|
}
|
|
|
|
return minVersion, nil
|
|
}
|
|
|
|
func (s *Server) autopilotServers() map[raft.ServerID]*autopilot.Server {
|
|
servers := make(map[raft.ServerID]*autopilot.Server)
|
|
|
|
for _, member := range s.serf.Members() {
|
|
srv, err := s.autopilotServer(member)
|
|
if err != nil {
|
|
s.logger.Warn("Error parsing server info", "name", member.Name, "error", err)
|
|
continue
|
|
} else if srv == nil {
|
|
// this member was a client
|
|
continue
|
|
}
|
|
|
|
servers[srv.ID] = srv
|
|
}
|
|
|
|
return servers
|
|
}
|
|
|
|
func (s *Server) autopilotServer(m serf.Member) (*autopilot.Server, error) {
|
|
ok, srv := isNomadServer(m)
|
|
if !ok {
|
|
return nil, nil
|
|
}
|
|
|
|
return s.autopilotServerFromMetadata(srv)
|
|
}
|
|
|
|
func (s *Server) autopilotServerFromMetadata(srv *serverParts) (*autopilot.Server, error) {
|
|
server := &autopilot.Server{
|
|
Name: srv.Name,
|
|
ID: raft.ServerID(srv.ID),
|
|
Address: raft.ServerAddress(srv.Addr.String()),
|
|
Version: srv.Build.String(),
|
|
RaftVersion: srv.RaftVersion,
|
|
Ext: s.autopilotServerExt(srv),
|
|
}
|
|
|
|
switch srv.Status {
|
|
case serf.StatusLeft:
|
|
server.NodeStatus = autopilot.NodeLeft
|
|
case serf.StatusAlive, serf.StatusLeaving:
|
|
// we want to treat leaving as alive to prevent autopilot from
|
|
// prematurely removing the node.
|
|
server.NodeStatus = autopilot.NodeAlive
|
|
case serf.StatusFailed:
|
|
server.NodeStatus = autopilot.NodeFailed
|
|
default:
|
|
server.NodeStatus = autopilot.NodeUnknown
|
|
}
|
|
|
|
members := s.serf.Members()
|
|
for _, member := range members {
|
|
if member.Name == srv.Name {
|
|
server.Meta = member.Tags
|
|
break
|
|
}
|
|
}
|
|
|
|
return server, nil
|
|
}
|