2017-03-01 22:04:40 +00:00
|
|
|
package consul
|
|
|
|
|
|
|
|
import (
|
2017-03-20 03:48:42 +00:00
|
|
|
"context"
|
2017-03-01 22:04:40 +00:00
|
|
|
"fmt"
|
|
|
|
|
2017-12-13 01:45:03 +00:00
|
|
|
"github.com/armon/go-metrics"
|
2020-11-13 02:12:12 +00:00
|
|
|
"github.com/armon/go-metrics/prometheus"
|
2017-03-01 22:04:40 +00:00
|
|
|
"github.com/hashicorp/raft"
|
2020-09-25 17:46:38 +00:00
|
|
|
autopilot "github.com/hashicorp/raft-autopilot"
|
2017-03-01 22:04:40 +00:00
|
|
|
"github.com/hashicorp/serf/serf"
|
2021-08-19 21:17:59 +00:00
|
|
|
|
2022-04-19 17:03:03 +00:00
|
|
|
"github.com/hashicorp/consul/agent/consul/autopilotevents"
|
2021-08-19 21:17:59 +00:00
|
|
|
"github.com/hashicorp/consul/agent/metadata"
|
|
|
|
"github.com/hashicorp/consul/agent/structs"
|
2022-04-07 14:48:48 +00:00
|
|
|
"github.com/hashicorp/consul/logging"
|
2021-08-19 21:17:59 +00:00
|
|
|
"github.com/hashicorp/consul/types"
|
2017-03-01 22:04:40 +00:00
|
|
|
)
|
|
|
|
|
2020-11-13 02:12:12 +00:00
|
|
|
var AutopilotGauges = []prometheus.GaugeDefinition{
|
|
|
|
{
|
2020-11-13 21:18:04 +00:00
|
|
|
Name: []string{"autopilot", "failure_tolerance"},
|
2020-11-16 19:02:11 +00:00
|
|
|
Help: "Tracks the number of voting servers that the cluster can lose while continuing to function.",
|
2020-11-13 02:12:12 +00:00
|
|
|
},
|
|
|
|
{
|
2020-11-13 21:18:04 +00:00
|
|
|
Name: []string{"autopilot", "healthy"},
|
2020-11-16 19:02:11 +00:00
|
|
|
Help: "Tracks the overall health of the local server cluster. 1 if all servers are healthy, 0 if one or more are unhealthy.",
|
2020-11-13 02:12:12 +00:00
|
|
|
},
|
|
|
|
}
|
|
|
|
|
2017-12-12 00:38:52 +00:00
|
|
|
// AutopilotDelegate is a Consul delegate for autopilot operations.
|
|
|
|
type AutopilotDelegate struct {
|
2022-04-19 17:03:03 +00:00
|
|
|
server *Server
|
|
|
|
readyServersPublisher *autopilotevents.ReadyServersEventPublisher
|
2017-03-08 19:31:32 +00:00
|
|
|
}
|
|
|
|
|
2017-12-13 01:45:03 +00:00
|
|
|
func (d *AutopilotDelegate) AutopilotConfig() *autopilot.Config {
|
2022-04-07 14:48:48 +00:00
|
|
|
return d.server.getAutopilotConfigOrDefault().ToAutopilotLibraryConfig()
|
2017-03-16 01:27:17 +00:00
|
|
|
}
|
2017-03-10 00:43:07 +00:00
|
|
|
|
2020-09-25 17:46:38 +00:00
|
|
|
func (d *AutopilotDelegate) KnownServers() map[raft.ServerID]*autopilot.Server {
|
|
|
|
return d.server.autopilotServers()
|
2017-03-10 00:43:07 +00:00
|
|
|
}
|
|
|
|
|
2020-09-25 17:46:38 +00:00
|
|
|
func (d *AutopilotDelegate) FetchServerStats(ctx context.Context, servers map[raft.ServerID]*autopilot.Server) map[raft.ServerID]*autopilot.ServerStats {
|
|
|
|
return d.server.statsFetcher.Fetch(ctx, servers)
|
2017-03-01 22:04:40 +00:00
|
|
|
}
|
|
|
|
|
2020-09-25 17:46:38 +00:00
|
|
|
func (d *AutopilotDelegate) NotifyState(state *autopilot.State) {
|
2022-04-07 14:48:48 +00:00
|
|
|
metrics.SetGauge([]string{"autopilot", "failure_tolerance"}, float32(state.FailureTolerance))
|
|
|
|
if state.Healthy {
|
|
|
|
metrics.SetGauge([]string{"autopilot", "healthy"}, 1)
|
2021-10-08 17:31:50 +00:00
|
|
|
} else {
|
2022-04-07 14:48:48 +00:00
|
|
|
metrics.SetGauge([]string{"autopilot", "healthy"}, 0)
|
2017-12-13 01:45:03 +00:00
|
|
|
}
|
2022-04-19 17:03:03 +00:00
|
|
|
|
|
|
|
d.readyServersPublisher.PublishReadyServersEvents(state)
|
2017-03-15 23:09:55 +00:00
|
|
|
}
|
|
|
|
|
2020-11-05 16:18:59 +00:00
|
|
|
func (d *AutopilotDelegate) RemoveFailedServer(srv *autopilot.Server) {
|
2021-10-26 20:08:55 +00:00
|
|
|
serverEntMeta := structs.DefaultEnterpriseMetaInDefaultPartition()
|
2020-11-05 16:18:59 +00:00
|
|
|
go func() {
|
2021-10-26 20:08:55 +00:00
|
|
|
if err := d.server.RemoveFailedNode(srv.Name, false, serverEntMeta); err != nil {
|
2021-09-20 12:40:58 +00:00
|
|
|
d.server.logger.Error("failed to remove server", "name", srv.Name, "id", srv.ID, "error", err)
|
2020-11-05 16:18:59 +00:00
|
|
|
}
|
|
|
|
}()
|
2017-03-21 23:36:44 +00:00
|
|
|
}
|
2017-12-13 01:45:03 +00:00
|
|
|
|
2020-09-25 17:46:38 +00:00
|
|
|
func (s *Server) initAutopilot(config *Config) {
|
2022-04-19 17:03:03 +00:00
|
|
|
apDelegate := &AutopilotDelegate{
|
|
|
|
server: s,
|
|
|
|
readyServersPublisher: autopilotevents.NewReadyServersEventPublisher(autopilotevents.Config{
|
|
|
|
Publisher: s.publisher,
|
|
|
|
GetStore: func() autopilotevents.StateStore { return s.fsm.State() },
|
|
|
|
}),
|
|
|
|
}
|
2020-09-25 17:46:38 +00:00
|
|
|
|
|
|
|
s.autopilot = autopilot.New(
|
|
|
|
s.raft,
|
|
|
|
apDelegate,
|
|
|
|
autopilot.WithLogger(s.logger),
|
|
|
|
autopilot.WithReconcileInterval(config.AutopilotInterval),
|
|
|
|
autopilot.WithUpdateInterval(config.ServerHealthInterval),
|
|
|
|
autopilot.WithPromoter(s.autopilotPromoter()),
|
2022-04-07 14:48:48 +00:00
|
|
|
autopilot.WithReconciliationDisabled(),
|
2020-09-25 17:46:38 +00:00
|
|
|
)
|
2022-04-19 17:03:03 +00:00
|
|
|
|
|
|
|
// registers a snapshot handler for the event publisher to send as the first event for a new stream
|
|
|
|
s.publisher.RegisterHandler(autopilotevents.EventTopicReadyServers, apDelegate.readyServersPublisher.HandleSnapshot)
|
2017-12-13 01:45:03 +00:00
|
|
|
}
|
|
|
|
|
2020-09-25 17:46:38 +00:00
|
|
|
func (s *Server) autopilotServers() map[raft.ServerID]*autopilot.Server {
|
|
|
|
servers := make(map[raft.ServerID]*autopilot.Server)
|
|
|
|
for _, member := range s.serfLAN.Members() {
|
|
|
|
srv, err := s.autopilotServer(member)
|
|
|
|
if err != nil {
|
|
|
|
s.logger.Warn("Error parsing server info", "name", member.Name, "error", err)
|
|
|
|
continue
|
|
|
|
} else if srv == nil {
|
|
|
|
// this member was a client
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
servers[srv.ID] = srv
|
|
|
|
}
|
|
|
|
|
|
|
|
return servers
|
|
|
|
}
|
|
|
|
|
|
|
|
func (s *Server) autopilotServer(m serf.Member) (*autopilot.Server, error) {
|
|
|
|
ok, srv := metadata.IsConsulServer(m)
|
|
|
|
if !ok {
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
return s.autopilotServerFromMetadata(srv)
|
2017-12-13 01:45:03 +00:00
|
|
|
}
|
2019-06-28 17:40:07 +00:00
|
|
|
|
2020-09-25 17:46:38 +00:00
|
|
|
func (s *Server) autopilotServerFromMetadata(srv *metadata.Server) (*autopilot.Server, error) {
|
|
|
|
server := &autopilot.Server{
|
|
|
|
Name: srv.ShortName,
|
|
|
|
ID: raft.ServerID(srv.ID),
|
|
|
|
Address: raft.ServerAddress(srv.Addr.String()),
|
|
|
|
Version: srv.Build.String(),
|
|
|
|
RaftVersion: srv.RaftVersion,
|
|
|
|
Ext: s.autopilotServerExt(srv),
|
|
|
|
}
|
|
|
|
|
|
|
|
switch srv.Status {
|
|
|
|
case serf.StatusLeft:
|
|
|
|
server.NodeStatus = autopilot.NodeLeft
|
|
|
|
case serf.StatusAlive, serf.StatusLeaving:
|
|
|
|
// we want to treat leaving as alive to prevent autopilot from
|
|
|
|
// prematurely removing the node.
|
|
|
|
server.NodeStatus = autopilot.NodeAlive
|
|
|
|
case serf.StatusFailed:
|
|
|
|
server.NodeStatus = autopilot.NodeFailed
|
|
|
|
default:
|
|
|
|
server.NodeStatus = autopilot.NodeUnknown
|
|
|
|
}
|
|
|
|
|
|
|
|
// populate the node meta if there is any. When a node first joins or if
|
|
|
|
// there are ACL issues then this could be empty if the server has not
|
|
|
|
// yet been able to register itself in the catalog
|
peering: initial sync (#12842)
- Add endpoints related to peering: read, list, generate token, initiate peering
- Update node/service/check table indexing to account for peers
- Foundational changes for pushing service updates to a peer
- Plumb peer name through Health.ServiceNodes path
see: ENT-1765, ENT-1280, ENT-1283, ENT-1283, ENT-1756, ENT-1739, ENT-1750, ENT-1679,
ENT-1709, ENT-1704, ENT-1690, ENT-1689, ENT-1702, ENT-1701, ENT-1683, ENT-1663,
ENT-1650, ENT-1678, ENT-1628, ENT-1658, ENT-1640, ENT-1637, ENT-1597, ENT-1634,
ENT-1613, ENT-1616, ENT-1617, ENT-1591, ENT-1588, ENT-1596, ENT-1572, ENT-1555
Co-authored-by: R.B. Boyer <rb@hashicorp.com>
Co-authored-by: freddygv <freddy@hashicorp.com>
Co-authored-by: Chris S. Kim <ckim@hashicorp.com>
Co-authored-by: Evan Culver <eculver@hashicorp.com>
Co-authored-by: Nitya Dhanushkodi <nitya@hashicorp.com>
2022-04-21 22:34:40 +00:00
|
|
|
_, node, err := s.fsm.State().GetNodeID(types.NodeID(srv.ID), structs.NodeEnterpriseMetaInDefaultPartition(), structs.DefaultPeerKeyword)
|
2020-09-25 17:46:38 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("error retrieving node from state store: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
if node != nil {
|
|
|
|
server.Meta = node.Meta
|
|
|
|
}
|
|
|
|
|
|
|
|
return server, nil
|
2019-06-28 17:40:07 +00:00
|
|
|
}
|
2022-04-07 14:48:48 +00:00
|
|
|
|
|
|
|
func (s *Server) getAutopilotConfigOrDefault() *structs.AutopilotConfig {
|
|
|
|
logger := s.loggers.Named(logging.Autopilot)
|
|
|
|
state := s.fsm.State()
|
|
|
|
_, config, err := state.AutopilotConfig()
|
|
|
|
if err != nil {
|
|
|
|
logger.Error("failed to get config", "error", err)
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if config != nil {
|
|
|
|
return config
|
|
|
|
}
|
|
|
|
|
|
|
|
// autopilot may start running prior to there ever being a leader
|
|
|
|
// and having an autopilot configuration created. In that case
|
|
|
|
// use the one from the local configuration for now.
|
|
|
|
return s.config.AutopilotConfig
|
|
|
|
}
|