open-nomad/nomad/autopilot.go
2023-11-28 13:52:25 -05:00

256 lines
7.2 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package nomad
import (
"context"
"fmt"
"strconv"
metrics "github.com/hashicorp/go-metrics"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/raft"
autopilot "github.com/hashicorp/raft-autopilot"
"github.com/hashicorp/serf/serf"
)
const (
// AutopilotRZTag is the Serf tag to use for the redundancy zone value
// when passing the server metadata to Autopilot.
AutopilotRZTag = "ap_zone"
// AutopilotRZTag is the Serf tag to use for the custom version value
// when passing the server metadata to Autopilot.
AutopilotVersionTag = "ap_version"
)
// AutopilotDelegate is a Nomad delegate for autopilot operations. It implements
// the autopilot.ApplicationIntegration interface, and the methods required for
// that interface have been documented as such below.
type AutopilotDelegate struct {
server *Server
}
// AutopilotConfig is used to retrieve the latest configuration from the Nomad
// delegate. This method is required to implement the ApplicationIntegration
// interface.
func (d *AutopilotDelegate) AutopilotConfig() *autopilot.Config {
c := d.server.getOrCreateAutopilotConfig()
if c == nil {
return nil
}
conf := &autopilot.Config{
CleanupDeadServers: c.CleanupDeadServers,
LastContactThreshold: c.LastContactThreshold,
MaxTrailingLogs: c.MaxTrailingLogs,
MinQuorum: c.MinQuorum,
ServerStabilizationTime: c.ServerStabilizationTime,
Ext: autopilotConfigExt(c),
}
return conf
}
// FetchServerStats will be called by autopilot to request Nomad fetch the
// server stats out of band. This method is required to implement the
// ApplicationIntegration interface
func (d *AutopilotDelegate) FetchServerStats(ctx context.Context, servers map[raft.ServerID]*autopilot.Server) map[raft.ServerID]*autopilot.ServerStats {
return d.server.statsFetcher.Fetch(ctx, servers)
}
// KnownServers will be called by autopilot to request the list of servers known
// to Nomad. This method is required to implement the ApplicationIntegration
// interface
func (d *AutopilotDelegate) KnownServers() map[raft.ServerID]*autopilot.Server {
return d.server.autopilotServers()
}
// NotifyState will be called when the autopilot state is updated. The Nomad
// leader heartbeats a metric for monitoring based on this information. This
// method is required to implement the ApplicationIntegration interface
func (d *AutopilotDelegate) NotifyState(state *autopilot.State) {
if d.server.raft.State() == raft.Leader {
metrics.SetGauge([]string{"nomad", "autopilot", "failure_tolerance"}, float32(state.FailureTolerance))
if state.Healthy {
metrics.SetGauge([]string{"nomad", "autopilot", "healthy"}, 1)
} else {
metrics.SetGauge([]string{"nomad", "autopilot", "healthy"}, 0)
}
}
}
// RemoveFailedServer will be called by autopilot to notify Nomad to remove the
// server in a failed state. This method is required to implement the
// ApplicationIntegration interface. (Note this is expected to return
// immediately so we'll spawn a goroutine for it.)
func (d *AutopilotDelegate) RemoveFailedServer(failedSrv *autopilot.Server) {
go func() {
err := d.server.RemoveFailedNode(failedSrv.Name)
if err != nil {
d.server.logger.Error("could not remove failed server",
"server", string(failedSrv.ID),
"error", err,
)
}
}()
}
// MinRaftProtocol returns the lowest supported Raft protocol among alive
// servers
func (s *Server) MinRaftProtocol() (int, error) {
return minRaftProtocol(s.serf.Members(), isNomadServer)
}
// GetClusterHealth is used to get the current health of the servers, as known
// by the leader.
func (s *Server) GetClusterHealth() *structs.OperatorHealthReply {
state := s.autopilot.GetState()
if state == nil {
// this behavior seems odd but its functionally equivalent to 1.8.5 where if
// autopilot didn't have a health reply yet it would just return no error
return nil
}
health := &structs.OperatorHealthReply{
Healthy: state.Healthy,
FailureTolerance: state.FailureTolerance,
}
for _, srv := range state.Servers {
srvHealth := structs.ServerHealth{
ID: string(srv.Server.ID),
Name: srv.Server.Name,
Address: string(srv.Server.Address),
Version: srv.Server.Version,
Leader: srv.State == autopilot.RaftLeader,
Voter: srv.State == autopilot.RaftLeader || srv.State == autopilot.RaftVoter,
LastContact: srv.Stats.LastContact,
LastTerm: srv.Stats.LastTerm,
LastIndex: srv.Stats.LastIndex,
Healthy: srv.Health.Healthy,
StableSince: srv.Health.StableSince,
}
switch srv.Server.NodeStatus {
case autopilot.NodeAlive:
srvHealth.SerfStatus = serf.StatusAlive
case autopilot.NodeLeft:
srvHealth.SerfStatus = serf.StatusLeft
case autopilot.NodeFailed:
srvHealth.SerfStatus = serf.StatusFailed
default:
srvHealth.SerfStatus = serf.StatusNone
}
health.Servers = append(health.Servers, srvHealth)
}
return health
}
// -------------------
// helper functions
func minRaftProtocol(members []serf.Member, serverFunc func(serf.Member) (bool, *serverParts)) (int, error) {
minVersion := -1
for _, m := range members {
if m.Status != serf.StatusAlive {
continue
}
ok, server := serverFunc(m)
if !ok {
return -1, fmt.Errorf("not a Nomad server")
}
if server == nil {
continue
}
vsn, ok := m.Tags["raft_vsn"]
if !ok {
vsn = "1"
}
raftVsn, err := strconv.Atoi(vsn)
if err != nil {
return -1, err
}
if minVersion == -1 || raftVsn < minVersion {
minVersion = raftVsn
}
}
if minVersion == -1 {
return minVersion, fmt.Errorf("No servers found")
}
return minVersion, nil
}
func (s *Server) autopilotServers() map[raft.ServerID]*autopilot.Server {
servers := make(map[raft.ServerID]*autopilot.Server)
for _, member := range s.serf.Members() {
srv, err := s.autopilotServer(member)
if err != nil {
s.logger.Warn("Error parsing server info", "name", member.Name, "error", err)
continue
} else if srv == nil {
// this member was a client or in another region
continue
}
servers[srv.ID] = srv
}
return servers
}
func (s *Server) autopilotServer(m serf.Member) (*autopilot.Server, error) {
ok, srv := isNomadServer(m)
if !ok {
return nil, nil
}
if srv.Region != s.Region() {
return nil, nil
}
return s.autopilotServerFromMetadata(srv)
}
func (s *Server) autopilotServerFromMetadata(srv *serverParts) (*autopilot.Server, error) {
server := &autopilot.Server{
Name: srv.Name,
ID: raft.ServerID(srv.ID),
Address: raft.ServerAddress(srv.Addr.String()),
Version: srv.Build.String(),
RaftVersion: srv.RaftVersion,
Ext: s.autopilotServerExt(srv),
}
switch srv.Status {
case serf.StatusLeft:
server.NodeStatus = autopilot.NodeLeft
case serf.StatusAlive, serf.StatusLeaving:
// we want to treat leaving as alive to prevent autopilot from
// prematurely removing the node.
server.NodeStatus = autopilot.NodeAlive
case serf.StatusFailed:
server.NodeStatus = autopilot.NodeFailed
default:
server.NodeStatus = autopilot.NodeUnknown
}
members := s.serf.Members()
for _, member := range members {
if member.Name == srv.Name {
server.Meta = member.Tags
break
}
}
return server, nil
}