9fe6c33c0d
Prior to #13244, connect proxies and gateways could only be configured by an xDS session served by the local client agent. In an upcoming release, it will be possible to deploy a Consul service mesh without client agents. In this model, xDS sessions will be handled by the servers themselves, which necessitates load-balancing to prevent a single server from receiving a disproportionate amount of load and becoming overwhelmed.

This introduces a simple form of load-balancing where Consul will attempt to achieve an even spread of load (xDS sessions) between all healthy servers. It does so by implementing a concurrent session limiter (limiter.SessionLimiter) and adjusting the limit according to autopilot state and proxy service registrations in the catalog.

If a server is over capacity (i.e. the session limit has been lowered below its current number of sessions), Consul will begin draining sessions to rebalance the load. This will result in the client receiving a `RESOURCE_EXHAUSTED` status code. It is the client's responsibility to observe this response and reconnect to a different server. Users of the gRPC client connection brokered by the consul-server-connection-manager library will get this for free. The rate at which Consul drains sessions to rebalance load is scaled dynamically based on the number of proxies in the catalog.
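As a rough sketch of the even-spread idea described above (this is not the actual limiter.SessionLimiter implementation; the function name, signature, and 10% headroom below are assumptions for illustration), the per-server session limit can be thought of as the catalog's proxy count divided by the number of healthy servers, plus a small margin so rebalancing does not oscillate:

package main

import (
	"fmt"
	"math"
)

// maxSessionsPerServer is a hypothetical helper: spread the total xDS load
// (roughly one session per registered proxy) evenly across the healthy
// servers, with ~10% headroom. The margin and names are illustrative only.
func maxSessionsPerServer(proxyCount, healthyServers uint32) uint32 {
	if healthyServers == 0 {
		return math.MaxUint32 // effectively unlimited until the server count is known
	}
	evenShare := float64(proxyCount) / float64(healthyServers)
	return uint32(math.Ceil(evenShare * 1.1))
}

func main() {
	// e.g. 5000 proxies across 3 healthy servers -> a limit of about 1834 each.
	fmt.Println(maxSessionsPerServer(5000, 3))
}

Under this model, when the limit drops below the number of active sessions on a server, the excess sessions are drained and those clients see `RESOURCE_EXHAUSTED`, prompting them to reconnect to another server. The actual autopilot-side plumbing that feeds the server count into this calculation is in the file below.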
package consul

import (
	"context"
	"fmt"

	"github.com/armon/go-metrics"
	"github.com/armon/go-metrics/prometheus"
	"github.com/hashicorp/raft"
	autopilot "github.com/hashicorp/raft-autopilot"
	"github.com/hashicorp/serf/serf"

	"github.com/hashicorp/consul/agent/consul/autopilotevents"
	"github.com/hashicorp/consul/agent/metadata"
	"github.com/hashicorp/consul/agent/structs"
	"github.com/hashicorp/consul/logging"
	"github.com/hashicorp/consul/types"
)

var AutopilotGauges = []prometheus.GaugeDefinition{
	{
		Name: []string{"autopilot", "failure_tolerance"},
		Help: "Tracks the number of voting servers that the cluster can lose while continuing to function.",
	},
	{
		Name: []string{"autopilot", "healthy"},
		Help: "Tracks the overall health of the local server cluster. 1 if all servers are healthy, 0 if one or more are unhealthy.",
	},
}

// AutopilotDelegate is a Consul delegate for autopilot operations.
type AutopilotDelegate struct {
	server                *Server
	readyServersPublisher *autopilotevents.ReadyServersEventPublisher
}

func (d *AutopilotDelegate) AutopilotConfig() *autopilot.Config {
	return d.server.getAutopilotConfigOrDefault().ToAutopilotLibraryConfig()
}

func (d *AutopilotDelegate) KnownServers() map[raft.ServerID]*autopilot.Server {
	return d.server.autopilotServers()
}

func (d *AutopilotDelegate) FetchServerStats(ctx context.Context, servers map[raft.ServerID]*autopilot.Server) map[raft.ServerID]*autopilot.ServerStats {
	return d.server.statsFetcher.Fetch(ctx, servers)
}

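// NotifyState is called by the autopilot library whenever it recomputes the
// cluster state. It updates the autopilot gauges, publishes ready-server
// events, and feeds the ready-server count into the xDS capacity controller.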
func (d *AutopilotDelegate) NotifyState(state *autopilot.State) {
	metrics.SetGauge([]string{"autopilot", "failure_tolerance"}, float32(state.FailureTolerance))
	if state.Healthy {
		metrics.SetGauge([]string{"autopilot", "healthy"}, 1)
	} else {
		metrics.SetGauge([]string{"autopilot", "healthy"}, 0)
	}

	d.readyServersPublisher.PublishReadyServersEvents(state)

	var readyServers uint32
	for _, server := range state.Servers {
		if autopilotevents.IsServerReady(server) {
			readyServers++
		}
	}
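	// Tell the xDS capacity controller how many servers are ready, so that it
	// can adjust this server's concurrent xDS session limit (see
	// limiter.SessionLimiter) and drain sessions if this server is over capacity.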
	d.server.xdsCapacityController.SetServerCount(readyServers)
}

func (d *AutopilotDelegate) RemoveFailedServer(srv *autopilot.Server) {
	serverEntMeta := structs.DefaultEnterpriseMetaInDefaultPartition()
	go func() {
		if err := d.server.RemoveFailedNode(srv.Name, false, serverEntMeta); err != nil {
			d.server.logger.Error("failed to remove server", "name", srv.Name, "id", srv.ID, "error", err)
		}
	}()
}

func (s *Server) initAutopilot(config *Config) {
	apDelegate := &AutopilotDelegate{
		server: s,
		readyServersPublisher: autopilotevents.NewReadyServersEventPublisher(autopilotevents.Config{
			Publisher: s.publisher,
			GetStore:  func() autopilotevents.StateStore { return s.fsm.State() },
		}),
	}

	s.autopilot = autopilot.New(
		s.raft,
		apDelegate,
		autopilot.WithLogger(s.logger),
		autopilot.WithReconcileInterval(config.AutopilotInterval),
		autopilot.WithUpdateInterval(config.ServerHealthInterval),
		autopilot.WithPromoter(s.autopilotPromoter()),
		autopilot.WithReconciliationDisabled(),
	)

	// registers a snapshot handler for the event publisher to send as the first event for a new stream
	s.publisher.RegisterHandler(autopilotevents.EventTopicReadyServers, apDelegate.readyServersPublisher.HandleSnapshot, false)
}

func (s *Server) autopilotServers() map[raft.ServerID]*autopilot.Server {
	servers := make(map[raft.ServerID]*autopilot.Server)
	for _, member := range s.serfLAN.Members() {
		srv, err := s.autopilotServer(member)
		if err != nil {
			s.logger.Warn("Error parsing server info", "name", member.Name, "error", err)
			continue
		} else if srv == nil {
			// this member was a client
			continue
		}

		servers[srv.ID] = srv
	}

	return servers
}

func (s *Server) autopilotServer(m serf.Member) (*autopilot.Server, error) {
	ok, srv := metadata.IsConsulServer(m)
	if !ok {
		return nil, nil
	}

	return s.autopilotServerFromMetadata(srv)
}

func (s *Server) autopilotServerFromMetadata(srv *metadata.Server) (*autopilot.Server, error) {
	server := &autopilot.Server{
		Name:        srv.ShortName,
		ID:          raft.ServerID(srv.ID),
		Address:     raft.ServerAddress(srv.Addr.String()),
		Version:     srv.Build.String(),
		RaftVersion: srv.RaftVersion,
		Ext:         s.autopilotServerExt(srv),
	}

	switch srv.Status {
	case serf.StatusLeft:
		server.NodeStatus = autopilot.NodeLeft
	case serf.StatusAlive, serf.StatusLeaving:
		// we want to treat leaving as alive to prevent autopilot from
		// prematurely removing the node.
		server.NodeStatus = autopilot.NodeAlive
	case serf.StatusFailed:
		server.NodeStatus = autopilot.NodeFailed
	default:
		server.NodeStatus = autopilot.NodeUnknown
	}

	// populate the node meta if there is any. When a node first joins or if
	// there are ACL issues then this could be empty if the server has not
	// yet been able to register itself in the catalog
	_, node, err := s.fsm.State().GetNodeID(types.NodeID(srv.ID), structs.NodeEnterpriseMetaInDefaultPartition(), structs.DefaultPeerKeyword)
	if err != nil {
		return nil, fmt.Errorf("error retrieving node from state store: %w", err)
	}

	if node != nil {
		server.Meta = node.Meta
	}

	return server, nil
}

func (s *Server) getAutopilotConfigOrDefault() *structs.AutopilotConfig {
	logger := s.loggers.Named(logging.Autopilot)
	state := s.fsm.State()
	_, config, err := state.AutopilotConfig()
	if err != nil {
		logger.Error("failed to get config", "error", err)
		return nil
	}

	if config != nil {
		return config
	}

	// autopilot may start running prior to there ever being a leader
	// and having an autopilot configuration created. In that case
	// use the one from the local configuration for now.
	return s.config.AutopilotConfig
}