open-nomad/nomad/autopilot.go
Tim Gross 7921f044e5
migrate autopilot implementation to raft-autopilot (#14441)
Nomad's original autopilot was importing from a private package in Consul. It
has been moved out to a shared library. Switch Nomad to use this library so that
we can eliminate the import of Consul, which is necessary to build Nomad ENT
with the current version of the Consul SDK. This also will let us pick up
autopilot improvements shared with Consul more easily.
2022-09-01 14:27:10 -04:00

247 lines
7 KiB
Go

package nomad
import (
"context"
"fmt"
"strconv"
metrics "github.com/armon/go-metrics"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/raft"
autopilot "github.com/hashicorp/raft-autopilot"
"github.com/hashicorp/serf/serf"
)
const (
// AutopilotRZTag is the Serf tag to use for the redundancy zone value
// when passing the server metadata to Autopilot.
AutopilotRZTag = "ap_zone"
// AutopilotRZTag is the Serf tag to use for the custom version value
// when passing the server metadata to Autopilot.
AutopilotVersionTag = "ap_version"
)
// AutopilotDelegate is a Nomad delegate for autopilot operations. It implements
// the autopilot.ApplicationIntegration interface, and the methods required for
// that interface have been documented as such below.
type AutopilotDelegate struct {
server *Server
}
// AutopilotConfig is used to retrieve the latest configuration from the Nomad
// delegate. This method is required to implement the ApplicationIntegration
// interface.
func (d *AutopilotDelegate) AutopilotConfig() *autopilot.Config {
c := d.server.getOrCreateAutopilotConfig()
if c == nil {
return nil
}
conf := &autopilot.Config{
CleanupDeadServers: c.CleanupDeadServers,
LastContactThreshold: c.LastContactThreshold,
MaxTrailingLogs: c.MaxTrailingLogs,
MinQuorum: c.MinQuorum,
ServerStabilizationTime: c.ServerStabilizationTime,
Ext: autopilotConfigExt(c),
}
return conf
}
// FetchServerStats will be called by autopilot to request Nomad fetch the
// server stats out of band. This method is required to implement the
// ApplicationIntegration interface
func (d *AutopilotDelegate) FetchServerStats(ctx context.Context, servers map[raft.ServerID]*autopilot.Server) map[raft.ServerID]*autopilot.ServerStats {
return d.server.statsFetcher.Fetch(ctx, servers)
}
// KnownServers will be called by autopilot to request the list of servers known
// to Nomad. This method is required to implement the ApplicationIntegration
// interface
func (d *AutopilotDelegate) KnownServers() map[raft.ServerID]*autopilot.Server {
return d.server.autopilotServers()
}
// NotifyState will be called when the autopilot state is updated. The Nomad
// leader heartbeats a metric for monitoring based on this information. This
// method is required to implement the ApplicationIntegration interface
func (d *AutopilotDelegate) NotifyState(state *autopilot.State) {
if d.server.raft.State() == raft.Leader {
metrics.SetGauge([]string{"nomad", "autopilot", "failure_tolerance"}, float32(state.FailureTolerance))
if state.Healthy {
metrics.SetGauge([]string{"nomad", "autopilot", "healthy"}, 1)
} else {
metrics.SetGauge([]string{"nomad", "autopilot", "healthy"}, 0)
}
}
}
// RemoveFailedServer will be called by autopilot to notify Nomad to remove the
// server in a failed state. This method is required to implement the
// ApplicationIntegration interface. (Note this is expected to return
// immediately so we'll spawn a goroutine for it.)
func (d *AutopilotDelegate) RemoveFailedServer(failedSrv *autopilot.Server) {
go func() {
err := d.server.RemoveFailedNode(failedSrv.Name)
if err != nil {
d.server.logger.Error("could not remove failed server", "server", string(failedSrv.ID))
}
}()
}
// MinRaftProtocol returns the lowest supported Raft protocol among alive
// servers
func (s *Server) MinRaftProtocol() (int, error) {
return minRaftProtocol(s.serf.Members(), isNomadServer)
}
// GetClusterHealth is used to get the current health of the servers, as known
// by the leader.
func (s *Server) GetClusterHealth() *structs.OperatorHealthReply {
state := s.autopilot.GetState()
if state == nil {
// this behavior seems odd but its functionally equivalent to 1.8.5 where if
// autopilot didn't have a health reply yet it would just return no error
return nil
}
health := &structs.OperatorHealthReply{
Healthy: state.Healthy,
FailureTolerance: state.FailureTolerance,
}
for _, srv := range state.Servers {
srvHealth := structs.ServerHealth{
ID: string(srv.Server.ID),
Name: srv.Server.Name,
Address: string(srv.Server.Address),
Version: srv.Server.Version,
Leader: srv.State == autopilot.RaftLeader,
Voter: srv.State == autopilot.RaftLeader || srv.State == autopilot.RaftVoter,
LastContact: srv.Stats.LastContact,
LastTerm: srv.Stats.LastTerm,
LastIndex: srv.Stats.LastIndex,
Healthy: srv.Health.Healthy,
StableSince: srv.Health.StableSince,
}
switch srv.Server.NodeStatus {
case autopilot.NodeAlive:
srvHealth.SerfStatus = serf.StatusAlive
case autopilot.NodeLeft:
srvHealth.SerfStatus = serf.StatusLeft
case autopilot.NodeFailed:
srvHealth.SerfStatus = serf.StatusFailed
default:
srvHealth.SerfStatus = serf.StatusNone
}
health.Servers = append(health.Servers, srvHealth)
}
return health
}
// -------------------
// helper functions
func minRaftProtocol(members []serf.Member, serverFunc func(serf.Member) (bool, *serverParts)) (int, error) {
minVersion := -1
for _, m := range members {
if m.Status != serf.StatusAlive {
continue
}
ok, server := serverFunc(m)
if !ok {
return -1, fmt.Errorf("not a Nomad server")
}
if server == nil {
continue
}
vsn, ok := m.Tags["raft_vsn"]
if !ok {
vsn = "1"
}
raftVsn, err := strconv.Atoi(vsn)
if err != nil {
return -1, err
}
if minVersion == -1 || raftVsn < minVersion {
minVersion = raftVsn
}
}
if minVersion == -1 {
return minVersion, fmt.Errorf("No servers found")
}
return minVersion, nil
}
func (s *Server) autopilotServers() map[raft.ServerID]*autopilot.Server {
servers := make(map[raft.ServerID]*autopilot.Server)
for _, member := range s.serf.Members() {
srv, err := s.autopilotServer(member)
if err != nil {
s.logger.Warn("Error parsing server info", "name", member.Name, "error", err)
continue
} else if srv == nil {
// this member was a client
continue
}
servers[srv.ID] = srv
}
return servers
}
func (s *Server) autopilotServer(m serf.Member) (*autopilot.Server, error) {
ok, srv := isNomadServer(m)
if !ok {
return nil, nil
}
return s.autopilotServerFromMetadata(srv)
}
func (s *Server) autopilotServerFromMetadata(srv *serverParts) (*autopilot.Server, error) {
server := &autopilot.Server{
Name: srv.Name,
ID: raft.ServerID(srv.ID),
Address: raft.ServerAddress(srv.Addr.String()),
Version: srv.Build.String(),
RaftVersion: srv.RaftVersion,
Ext: s.autopilotServerExt(srv),
}
switch srv.Status {
case serf.StatusLeft:
server.NodeStatus = autopilot.NodeLeft
case serf.StatusAlive, serf.StatusLeaving:
// we want to treat leaving as alive to prevent autopilot from
// prematurely removing the node.
server.NodeStatus = autopilot.NodeAlive
case serf.StatusFailed:
server.NodeStatus = autopilot.NodeFailed
default:
server.NodeStatus = autopilot.NodeUnknown
}
members := s.serf.Members()
for _, member := range members {
if member.Name == srv.Name {
server.Meta = member.Tags
break
}
}
return server, nil
}