open-nomad/command/agent/agent.go

package agent
import (
"context"
"fmt"
"io"
"io/ioutil"
golog "log"
"net"
"os"
"path/filepath"
"runtime"
"strings"
"sync"
"time"
metrics "github.com/armon/go-metrics"
consulapi "github.com/hashicorp/consul/api"
log "github.com/hashicorp/go-hclog"
uuidparse "github.com/hashicorp/go-uuid"
"github.com/hashicorp/nomad/client"
clientconfig "github.com/hashicorp/nomad/client/config"
"github.com/hashicorp/nomad/client/lib/cgutil"
"github.com/hashicorp/nomad/client/state"
"github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/command/agent/event"
"github.com/hashicorp/nomad/helper/bufconndialer"
"github.com/hashicorp/nomad/helper/escapingfs"
"github.com/hashicorp/nomad/helper/pluginutils/loader"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/lib/cpuset"
"github.com/hashicorp/nomad/nomad"
"github.com/hashicorp/nomad/nomad/deploymentwatcher"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/nomad/structs/config"
"github.com/hashicorp/raft"
)
const (
agentHttpCheckInterval = 10 * time.Second
agentHttpCheckTimeout = 5 * time.Second
serverRpcCheckInterval = 10 * time.Second
serverRpcCheckTimeout = 3 * time.Second
serverSerfCheckInterval = 10 * time.Second
serverSerfCheckTimeout = 3 * time.Second
// roles used in identifying Consul entries for Nomad agents
consulRoleServer = "server"
consulRoleClient = "client"
// DefaultRaftMultiplier is used as a baseline Raft configuration that
// will be reliable on a very basic server.
DefaultRaftMultiplier = 1
// MaxRaftMultiplier is a fairly arbitrary upper bound that limits the
// amount of performance detuning that's possible.
MaxRaftMultiplier = 10
)
// Agent is a long running daemon that is used to run both
// clients and servers. Servers are responsible for managing
// state and making scheduling decisions. Clients can be
// scheduled to, and are responsible for interfacing with
// servers to run allocations.
type Agent struct {
config *Config
configLock sync.Mutex
logger log.InterceptLogger
auditor event.Auditor
httpLogger log.Logger
logOutput io.Writer
// EnterpriseAgent holds information and methods for enterprise functionality
EnterpriseAgent *EnterpriseAgent
// consulService is Nomad's custom Consul client for managing services
// and checks.
consulService *consul.ServiceClient
// consulProxies is the subset of Consul's Agent API Nomad uses.
consulProxies *consul.ConnectProxies
// consulCatalog is the subset of Consul's Catalog API Nomad uses.
consulCatalog consul.CatalogAPI
// consulConfigEntries is the subset of Consul's Configuration Entries API Nomad uses.
consulConfigEntries consul.ConfigAPI
// consulACLs is the subset of Consul's ACL API Nomad uses.
consulACLs consul.ACLsAPI
// client is the launched Nomad Client. Can be nil if the agent isn't
// configured to run a client.
client *client.Client
// server is the launched Nomad Server. Can be nil if the agent isn't
// configured to run a server.
server *nomad.Server
// pluginLoader is used to load plugins
pluginLoader loader.PluginCatalog
// pluginSingletonLoader is a plugin loader that returns singleton
// instances of the plugins.
pluginSingletonLoader loader.PluginCatalog
shutdown bool
shutdownCh chan struct{}
shutdownLock sync.Mutex
// builtinDialer dials the builtinListener. It is used for connecting
// consul-template to the HTTP API in process. In the event this agent is
// not running in client mode, these two fields will be nil.
builtinListener net.Listener
builtinDialer *bufconndialer.BufConnWrapper
InmemSink *metrics.InmemSink
}
// NewAgent is used to create a new agent with the given configuration
func NewAgent(config *Config, logger log.InterceptLogger, logOutput io.Writer, inmem *metrics.InmemSink) (*Agent, error) {
a := &Agent{
config: config,
logOutput: logOutput,
shutdownCh: make(chan struct{}),
InmemSink: inmem,
}
// Create the loggers
a.logger = logger
a.httpLogger = a.logger.ResetNamed("http")
// Global logger should match internal logger as much as possible
golog.SetFlags(golog.LstdFlags | golog.Lmicroseconds)
if err := a.setupConsul(config.Consul); err != nil {
return nil, fmt.Errorf("Failed to initialize Consul client: %v", err)
}
if err := a.setupPlugins(); err != nil {
return nil, err
}
if err := a.setupServer(); err != nil {
return nil, err
}
if err := a.setupClient(); err != nil {
return nil, err
}
if err := a.setupEnterpriseAgent(logger); err != nil {
return nil, err
}
if a.client == nil && a.server == nil {
return nil, fmt.Errorf("must have at least client or server mode enabled")
}
return a, nil
}
// convertServerConfig takes an agent config and returns a Nomad Config. There
// may be missing fields that must be set by the agent. To do this, call
// finalizeServerConfig.
func convertServerConfig(agentConfig *Config) (*nomad.Config, error) {
conf := agentConfig.NomadConfig
if conf == nil {
conf = nomad.DefaultConfig()
}
conf.DevMode = agentConfig.DevMode
conf.EnableDebug = agentConfig.EnableDebug
conf.Build = agentConfig.Version.VersionNumber()
conf.Revision = agentConfig.Version.Revision
if agentConfig.Region != "" {
conf.Region = agentConfig.Region
}
// Set the Authoritative Region if set, otherwise default to
// the same as the local region.
if agentConfig.Server.AuthoritativeRegion != "" {
conf.AuthoritativeRegion = agentConfig.Server.AuthoritativeRegion
} else if agentConfig.Region != "" {
conf.AuthoritativeRegion = agentConfig.Region
}
if agentConfig.Datacenter != "" {
conf.Datacenter = agentConfig.Datacenter
}
if agentConfig.NodeName != "" {
conf.NodeName = agentConfig.NodeName
}
if agentConfig.Server.BootstrapExpect > 0 {
conf.BootstrapExpect = agentConfig.Server.BootstrapExpect
}
if agentConfig.DataDir != "" {
conf.DataDir = filepath.Join(agentConfig.DataDir, "server")
}
if agentConfig.Server.DataDir != "" {
conf.DataDir = agentConfig.Server.DataDir
}
if agentConfig.Server.RaftProtocol != 0 {
conf.RaftConfig.ProtocolVersion = raft.ProtocolVersion(agentConfig.Server.RaftProtocol)
}
if v := conf.RaftConfig.ProtocolVersion; v != 3 {
return nil, fmt.Errorf("raft_protocol must be 3 in Nomad v1.4 and later, got %d", v)
}
raftMultiplier := int(DefaultRaftMultiplier)
if agentConfig.Server.RaftMultiplier != nil && *agentConfig.Server.RaftMultiplier != 0 {
raftMultiplier = *agentConfig.Server.RaftMultiplier
if raftMultiplier < 1 || raftMultiplier > MaxRaftMultiplier {
return nil, fmt.Errorf("raft_multiplier cannot be %d. Must be between 1 and %d", *agentConfig.Server.RaftMultiplier, MaxRaftMultiplier)
}
}
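// Scale the base Raft timing values by the multiplier. Larger values make
// leader elections and heartbeats more tolerant of slow servers and networks
// at the cost of slower failure detection.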
conf.RaftConfig.ElectionTimeout *= time.Duration(raftMultiplier)
conf.RaftConfig.HeartbeatTimeout *= time.Duration(raftMultiplier)
conf.RaftConfig.LeaderLeaseTimeout *= time.Duration(raftMultiplier)
conf.RaftConfig.CommitTimeout *= time.Duration(raftMultiplier)
if agentConfig.Server.NumSchedulers != nil {
conf.NumSchedulers = *agentConfig.Server.NumSchedulers
}
if len(agentConfig.Server.EnabledSchedulers) != 0 {
// Convert to a set and require the core scheduler
set := make(map[string]struct{}, 4)
set[structs.JobTypeCore] = struct{}{}
for _, sched := range agentConfig.Server.EnabledSchedulers {
set[sched] = struct{}{}
}
schedulers := make([]string, 0, len(set))
for k := range set {
schedulers = append(schedulers, k)
}
conf.EnabledSchedulers = schedulers
}
if agentConfig.ACL.Enabled {
conf.ACLEnabled = true
}
if agentConfig.ACL.ReplicationToken != "" {
conf.ReplicationToken = agentConfig.ACL.ReplicationToken
}
if agentConfig.ACL.TokenMinExpirationTTL != 0 {
conf.ACLTokenMinExpirationTTL = agentConfig.ACL.TokenMinExpirationTTL
}
if agentConfig.ACL.TokenMaxExpirationTTL != 0 {
conf.ACLTokenMaxExpirationTTL = agentConfig.ACL.TokenMaxExpirationTTL
}
if agentConfig.Sentinel != nil {
conf.SentinelConfig = agentConfig.Sentinel
}
if agentConfig.Server.NonVotingServer {
conf.NonVoter = true
}
if agentConfig.Server.RedundancyZone != "" {
conf.RedundancyZone = agentConfig.Server.RedundancyZone
}
if agentConfig.Server.UpgradeVersion != "" {
conf.UpgradeVersion = agentConfig.Server.UpgradeVersion
}
if agentConfig.Server.EnableEventBroker != nil {
conf.EnableEventBroker = *agentConfig.Server.EnableEventBroker
}
if agentConfig.Server.EventBufferSize != nil {
if *agentConfig.Server.EventBufferSize < 0 {
return nil, fmt.Errorf("Invalid Config, event_buffer_size must be non-negative")
}
conf.EventBufferSize = int64(*agentConfig.Server.EventBufferSize)
}
if agentConfig.Autopilot != nil {
if agentConfig.Autopilot.CleanupDeadServers != nil {
conf.AutopilotConfig.CleanupDeadServers = *agentConfig.Autopilot.CleanupDeadServers
}
if agentConfig.Autopilot.ServerStabilizationTime != 0 {
conf.AutopilotConfig.ServerStabilizationTime = agentConfig.Autopilot.ServerStabilizationTime
}
if agentConfig.Autopilot.LastContactThreshold != 0 {
conf.AutopilotConfig.LastContactThreshold = agentConfig.Autopilot.LastContactThreshold
}
if agentConfig.Autopilot.MaxTrailingLogs != 0 {
conf.AutopilotConfig.MaxTrailingLogs = uint64(agentConfig.Autopilot.MaxTrailingLogs)
}
if agentConfig.Autopilot.MinQuorum != 0 {
conf.AutopilotConfig.MinQuorum = uint(agentConfig.Autopilot.MinQuorum)
}
if agentConfig.Autopilot.EnableRedundancyZones != nil {
conf.AutopilotConfig.EnableRedundancyZones = *agentConfig.Autopilot.EnableRedundancyZones
}
if agentConfig.Autopilot.DisableUpgradeMigration != nil {
conf.AutopilotConfig.DisableUpgradeMigration = *agentConfig.Autopilot.DisableUpgradeMigration
}
if agentConfig.Autopilot.EnableCustomUpgrades != nil {
conf.AutopilotConfig.EnableCustomUpgrades = *agentConfig.Autopilot.EnableCustomUpgrades
}
}
// Set up the bind addresses
rpcAddr, err := net.ResolveTCPAddr("tcp", agentConfig.normalizedAddrs.RPC)
if err != nil {
return nil, fmt.Errorf("Failed to parse RPC address %q: %v", agentConfig.normalizedAddrs.RPC, err)
}
serfAddr, err := net.ResolveTCPAddr("tcp", agentConfig.normalizedAddrs.Serf)
if err != nil {
return nil, fmt.Errorf("Failed to parse Serf address %q: %v", agentConfig.normalizedAddrs.Serf, err)
}
conf.RPCAddr.Port = rpcAddr.Port
conf.RPCAddr.IP = rpcAddr.IP
conf.SerfConfig.MemberlistConfig.BindPort = serfAddr.Port
conf.SerfConfig.MemberlistConfig.BindAddr = serfAddr.IP.String()
// Set up the advertise addresses
rpcAddr, err = net.ResolveTCPAddr("tcp", agentConfig.AdvertiseAddrs.RPC)
if err != nil {
return nil, fmt.Errorf("Failed to parse RPC advertise address %q: %v", agentConfig.AdvertiseAddrs.RPC, err)
}
serfAddr, err = net.ResolveTCPAddr("tcp", agentConfig.AdvertiseAddrs.Serf)
if err != nil {
return nil, fmt.Errorf("Failed to parse Serf advertise address %q: %v", agentConfig.AdvertiseAddrs.Serf, err)
}
// Server address is the serf advertise address and rpc port. This is the
// address that all servers should be able to communicate over RPC with.
serverAddr, err := net.ResolveTCPAddr("tcp", net.JoinHostPort(serfAddr.IP.String(), fmt.Sprintf("%d", rpcAddr.Port)))
if err != nil {
return nil, fmt.Errorf("Failed to resolve Serf advertise address %q: %v", agentConfig.AdvertiseAddrs.Serf, err)
}
conf.SerfConfig.MemberlistConfig.AdvertiseAddr = serfAddr.IP.String()
conf.SerfConfig.MemberlistConfig.AdvertisePort = serfAddr.Port
conf.ClientRPCAdvertise = rpcAddr
conf.ServerRPCAdvertise = serverAddr
// Set up gc threshold and heartbeat grace period
if gcThreshold := agentConfig.Server.NodeGCThreshold; gcThreshold != "" {
dur, err := time.ParseDuration(gcThreshold)
if err != nil {
return nil, err
}
conf.NodeGCThreshold = dur
}
if gcInterval := agentConfig.Server.JobGCInterval; gcInterval != "" {
dur, err := time.ParseDuration(gcInterval)
if err != nil {
return nil, fmt.Errorf("failed to parse job_gc_interval: %v", err)
} else if dur <= time.Duration(0) {
return nil, fmt.Errorf("job_gc_interval should be greater than 0s")
}
conf.JobGCInterval = dur
}
if gcThreshold := agentConfig.Server.JobGCThreshold; gcThreshold != "" {
dur, err := time.ParseDuration(gcThreshold)
if err != nil {
return nil, err
}
conf.JobGCThreshold = dur
}
if gcThreshold := agentConfig.Server.EvalGCThreshold; gcThreshold != "" {
dur, err := time.ParseDuration(gcThreshold)
if err != nil {
return nil, err
}
conf.EvalGCThreshold = dur
}
if gcThreshold := agentConfig.Server.DeploymentGCThreshold; gcThreshold != "" {
dur, err := time.ParseDuration(gcThreshold)
if err != nil {
return nil, err
}
conf.DeploymentGCThreshold = dur
}
if gcThreshold := agentConfig.Server.CSIVolumeClaimGCThreshold; gcThreshold != "" {
dur, err := time.ParseDuration(gcThreshold)
if err != nil {
return nil, err
}
conf.CSIVolumeClaimGCThreshold = dur
}
if gcThreshold := agentConfig.Server.CSIPluginGCThreshold; gcThreshold != "" {
dur, err := time.ParseDuration(gcThreshold)
if err != nil {
return nil, err
}
conf.CSIPluginGCThreshold = dur
}
if gcThreshold := agentConfig.Server.ACLTokenGCThreshold; gcThreshold != "" {
dur, err := time.ParseDuration(gcThreshold)
if err != nil {
return nil, err
}
conf.ACLTokenExpirationGCThreshold = dur
}
if gcThreshold := agentConfig.Server.RootKeyGCThreshold; gcThreshold != "" {
dur, err := time.ParseDuration(gcThreshold)
if err != nil {
return nil, err
}
conf.RootKeyGCThreshold = dur
}
if gcInterval := agentConfig.Server.RootKeyGCInterval; gcInterval != "" {
dur, err := time.ParseDuration(gcInterval)
if err != nil {
return nil, err
}
conf.RootKeyGCInterval = dur
}
if rotationThreshold := agentConfig.Server.RootKeyRotationThreshold; rotationThreshold != "" {
dur, err := time.ParseDuration(rotationThreshold)
if err != nil {
return nil, err
}
conf.RootKeyRotationThreshold = dur
}
if heartbeatGrace := agentConfig.Server.HeartbeatGrace; heartbeatGrace != 0 {
conf.HeartbeatGrace = heartbeatGrace
}
if min := agentConfig.Server.MinHeartbeatTTL; min != 0 {
conf.MinHeartbeatTTL = min
}
if maxHPS := agentConfig.Server.MaxHeartbeatsPerSecond; maxHPS != 0 {
conf.MaxHeartbeatsPerSecond = maxHPS
}
if failoverTTL := agentConfig.Server.FailoverHeartbeatTTL; failoverTTL != 0 {
conf.FailoverHeartbeatTTL = failoverTTL
}
if *agentConfig.Consul.AutoAdvertise && agentConfig.Consul.ServerServiceName == "" {
return nil, fmt.Errorf("server_service_name must be set when auto_advertise is enabled")
}
// handle system scheduler preemption default
if agentConfig.Server.DefaultSchedulerConfig != nil {
conf.DefaultSchedulerConfig = *agentConfig.Server.DefaultSchedulerConfig
}
// Add the Consul and Vault configs
conf.ConsulConfig = agentConfig.Consul
conf.VaultConfig = agentConfig.Vault
// Set the TLS config
conf.TLSConfig = agentConfig.TLSConfig
// Setup telemetry related config
conf.StatsCollectionInterval = agentConfig.Telemetry.collectionInterval
conf.DisableDispatchedJobSummaryMetrics = agentConfig.Telemetry.DisableDispatchedJobSummaryMetrics
if d, err := time.ParseDuration(agentConfig.Limits.RPCHandshakeTimeout); err != nil {
return nil, fmt.Errorf("error parsing rpc_handshake_timeout: %v", err)
} else if d < 0 {
return nil, fmt.Errorf("rpc_handshake_timeout must be >= 0")
} else {
conf.RPCHandshakeTimeout = d
}
// Set max rpc conns; nil/0 == unlimited
// Leave a little room for streaming RPCs
minLimit := config.LimitsNonStreamingConnsPerClient + 5
if agentConfig.Limits.RPCMaxConnsPerClient == nil || *agentConfig.Limits.RPCMaxConnsPerClient == 0 {
conf.RPCMaxConnsPerClient = 0
} else if limit := *agentConfig.Limits.RPCMaxConnsPerClient; limit <= minLimit {
return nil, fmt.Errorf("rpc_max_conns_per_client must be > %d; found: %d", minLimit, limit)
} else {
conf.RPCMaxConnsPerClient = limit
}
// Set deployment rate limit
if rate := agentConfig.Server.DeploymentQueryRateLimit; rate == 0 {
conf.DeploymentQueryRateLimit = deploymentwatcher.LimitStateQueriesPerSecond
} else if rate > 0 {
conf.DeploymentQueryRateLimit = rate
} else {
return nil, fmt.Errorf("deploy_query_rate_limit must be greater than 0")
}
// Set plan rejection tracker configuration.
if planRejectConf := agentConfig.Server.PlanRejectionTracker; planRejectConf != nil {
if planRejectConf.Enabled != nil {
conf.NodePlanRejectionEnabled = *planRejectConf.Enabled
}
conf.NodePlanRejectionThreshold = planRejectConf.NodeThreshold
if planRejectConf.NodeWindow == 0 {
return nil, fmt.Errorf("plan_rejection_tracker.node_window must be greater than 0")
} else {
conf.NodePlanRejectionWindow = planRejectConf.NodeWindow
}
}
// Add Enterprise license configs
conf.LicenseEnv = agentConfig.Server.LicenseEnv
conf.LicensePath = agentConfig.Server.LicensePath
conf.LicenseConfig.AdditionalPubKeys = agentConfig.Server.licenseAdditionalPublicKeys
// Add the search configuration
if search := agentConfig.Server.Search; search != nil {
conf.SearchConfig = &structs.SearchConfig{
FuzzyEnabled: search.FuzzyEnabled,
LimitQuery: search.LimitQuery,
LimitResults: search.LimitResults,
MinTermLength: search.MinTermLength,
}
}
// Set the raft bolt parameters
if bolt := agentConfig.Server.RaftBoltConfig; bolt != nil {
conf.RaftBoltNoFreelistSync = bolt.NoFreelistSync
}
return conf, nil
}
// serverConfig is used to generate a new server configuration struct
// for initializing a nomad server.
func (a *Agent) serverConfig() (*nomad.Config, error) {
c, err := convertServerConfig(a.config)
if err != nil {
return nil, err
}
a.finalizeServerConfig(c)
return c, nil
}
// finalizeServerConfig sets configuration fields on the server config that are
// not statically convertible and come from the agent.
func (a *Agent) finalizeServerConfig(c *nomad.Config) {
// Setup the logging
c.Logger = a.logger
c.LogOutput = a.logOutput
// Setup the plugin loaders
c.PluginLoader = a.pluginLoader
c.PluginSingletonLoader = a.pluginSingletonLoader
c.AgentShutdown = func() error { return a.Shutdown() }
}
// clientConfig is used to generate a new client configuration struct for
// initializing a Nomad client.
func (a *Agent) clientConfig() (*clientconfig.Config, error) {
c, err := convertClientConfig(a.config)
if err != nil {
return nil, err
}
if err := a.finalizeClientConfig(c); err != nil {
return nil, err
}
return c, nil
}
// finalizeClientConfig sets configuration fields on the client config that are
// not statically convertible and come from the agent.
func (a *Agent) finalizeClientConfig(c *clientconfig.Config) error {
// Setup the logging
c.Logger = a.logger
// If we are running a server, append both its bind and advertise address so
// we are able to at least talk to the local server even if that isn't
// configured explicitly. This handles both running server and client on one
// host and -dev mode.
if a.server != nil {
advertised := a.config.AdvertiseAddrs
normalized := a.config.normalizedAddrs
if advertised == nil || advertised.RPC == "" {
return fmt.Errorf("AdvertiseAddrs is nil or empty")
} else if normalized == nil || normalized.RPC == "" {
return fmt.Errorf("normalizedAddrs is nil or empty")
}
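// Avoid listing the same address twice when the bind and advertise RPC
// addresses are identical.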
if normalized.RPC == advertised.RPC {
c.Servers = append(c.Servers, normalized.RPC)
} else {
c.Servers = append(c.Servers, normalized.RPC, advertised.RPC)
}
}
// Setup the plugin loaders
c.PluginLoader = a.pluginLoader
c.PluginSingletonLoader = a.pluginSingletonLoader
// Log deprecation messages about Consul related configuration in client
// options
var invalidConsulKeys []string
for key := range c.Options {
if strings.HasPrefix(key, "consul") {
invalidConsulKeys = append(invalidConsulKeys, fmt.Sprintf("options.%s", key))
}
}
if len(invalidConsulKeys) > 0 {
a.logger.Warn("invalid consul keys", "keys", strings.Join(invalidConsulKeys, ","))
a.logger.Warn(`Nomad client ignores consul related configuration in client options.
Please refer to the guide https://www.nomadproject.io/docs/agent/configuration/consul.html
to configure Nomad to work with Consul.`)
}
return nil
}
// convertClientConfig takes an agent config and returns a client Config. There
// may be missing fields that must be set by the agent. To do this, call
// finalizeClientConfig.
func convertClientConfig(agentConfig *Config) (*clientconfig.Config, error) {
// Set up the configuration
conf := agentConfig.ClientConfig
if conf == nil {
conf = clientconfig.DefaultConfig()
}
conf.Servers = agentConfig.Client.Servers
conf.DevMode = agentConfig.DevMode
conf.EnableDebug = agentConfig.EnableDebug
if agentConfig.Region != "" {
conf.Region = agentConfig.Region
}
if agentConfig.DataDir != "" {
conf.StateDir = filepath.Join(agentConfig.DataDir, "client")
conf.AllocDir = filepath.Join(agentConfig.DataDir, "alloc")
}
if agentConfig.Client.StateDir != "" {
conf.StateDir = agentConfig.Client.StateDir
}
if agentConfig.Client.AllocDir != "" {
conf.AllocDir = agentConfig.Client.AllocDir
}
if agentConfig.Client.NetworkInterface != "" {
conf.NetworkInterface = agentConfig.Client.NetworkInterface
}
conf.ChrootEnv = agentConfig.Client.ChrootEnv
conf.Options = agentConfig.Client.Options
if agentConfig.Client.NetworkSpeed != 0 {
conf.NetworkSpeed = agentConfig.Client.NetworkSpeed
}
if agentConfig.Client.CpuCompute != 0 {
conf.CpuCompute = agentConfig.Client.CpuCompute
}
if agentConfig.Client.MemoryMB != 0 {
conf.MemoryMB = agentConfig.Client.MemoryMB
}
if agentConfig.Client.MaxKillTimeout != "" {
dur, err := time.ParseDuration(agentConfig.Client.MaxKillTimeout)
if err != nil {
return nil, fmt.Errorf("Error parsing max kill timeout: %s", err)
}
conf.MaxKillTimeout = dur
}
conf.ClientMaxPort = uint(agentConfig.Client.ClientMaxPort)
conf.ClientMinPort = uint(agentConfig.Client.ClientMinPort)
conf.MaxDynamicPort = agentConfig.Client.MaxDynamicPort
conf.MinDynamicPort = agentConfig.Client.MinDynamicPort
conf.DisableRemoteExec = agentConfig.Client.DisableRemoteExec
if agentConfig.Client.TemplateConfig != nil {
conf.TemplateConfig = agentConfig.Client.TemplateConfig.Copy()
}
hvMap := make(map[string]*structs.ClientHostVolumeConfig, len(agentConfig.Client.HostVolumes))
for _, v := range agentConfig.Client.HostVolumes {
hvMap[v.Name] = v
}
conf.HostVolumes = hvMap
// Setup the node
conf.Node = new(structs.Node)
conf.Node.Datacenter = agentConfig.Datacenter
conf.Node.Name = agentConfig.NodeName
conf.Node.Meta = agentConfig.Client.Meta
conf.Node.NodeClass = agentConfig.Client.NodeClass
// Set up the HTTP advertise address
conf.Node.HTTPAddr = agentConfig.AdvertiseAddrs.HTTP
// Canonicalize Node struct
conf.Node.Canonicalize()
// Reserve resources on the node.
// COMPAT(0.10): Remove in 0.10
r := conf.Node.Reserved
if r == nil {
r = new(structs.Resources)
conf.Node.Reserved = r
}
r.CPU = agentConfig.Client.Reserved.CPU
r.MemoryMB = agentConfig.Client.Reserved.MemoryMB
r.DiskMB = agentConfig.Client.Reserved.DiskMB
res := conf.Node.ReservedResources
if res == nil {
res = new(structs.NodeReservedResources)
conf.Node.ReservedResources = res
}
res.Cpu.CpuShares = int64(agentConfig.Client.Reserved.CPU)
res.Memory.MemoryMB = int64(agentConfig.Client.Reserved.MemoryMB)
res.Disk.DiskMB = int64(agentConfig.Client.Reserved.DiskMB)
res.Networks.ReservedHostPorts = agentConfig.Client.Reserved.ReservedPorts
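// Reserved cores are specified as a cpuset list string (for example "0-3,8")
// and expanded into an explicit list of core IDs.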
if agentConfig.Client.Reserved.Cores != "" {
cores, err := cpuset.Parse(agentConfig.Client.Reserved.Cores)
if err != nil {
return nil, fmt.Errorf("failed to parse client > reserved > cores value %q: %v", agentConfig.Client.Reserved.Cores, err)
}
res.Cpu.ReservedCpuCores = cores.ToSlice()
}
conf.Version = agentConfig.Version
if *agentConfig.Consul.AutoAdvertise && agentConfig.Consul.ClientServiceName == "" {
return nil, fmt.Errorf("client_service_name must be set when auto_advertise is enabled")
}
conf.ConsulConfig = agentConfig.Consul
conf.VaultConfig = agentConfig.Vault
// Set up Telemetry configuration
conf.StatsCollectionInterval = agentConfig.Telemetry.collectionInterval
conf.PublishNodeMetrics = agentConfig.Telemetry.PublishNodeMetrics
conf.PublishAllocationMetrics = agentConfig.Telemetry.PublishAllocationMetrics
// Set the TLS related configs
conf.TLSConfig = agentConfig.TLSConfig
conf.Node.TLSEnabled = conf.TLSConfig.EnableHTTP
// Set the GC related configs
conf.GCInterval = agentConfig.Client.GCInterval
conf.GCParallelDestroys = agentConfig.Client.GCParallelDestroys
conf.GCDiskUsageThreshold = agentConfig.Client.GCDiskUsageThreshold
conf.GCInodeUsageThreshold = agentConfig.Client.GCInodeUsageThreshold
conf.GCMaxAllocs = agentConfig.Client.GCMaxAllocs
if agentConfig.Client.NoHostUUID != nil {
conf.NoHostUUID = *agentConfig.Client.NoHostUUID
} else {
// Default no_host_uuid to true
conf.NoHostUUID = true
}
// Setup the ACLs
conf.ACLEnabled = agentConfig.ACL.Enabled
conf.ACLTokenTTL = agentConfig.ACL.TokenTTL
conf.ACLPolicyTTL = agentConfig.ACL.PolicyTTL
conf.ACLRoleTTL = agentConfig.ACL.RoleTTL
// Setup networking configuration
conf.CNIPath = agentConfig.Client.CNIPath
conf.CNIConfigDir = agentConfig.Client.CNIConfigDir
conf.BridgeNetworkName = agentConfig.Client.BridgeNetworkName
conf.BridgeNetworkAllocSubnet = agentConfig.Client.BridgeNetworkSubnet
for _, hn := range agentConfig.Client.HostNetworks {
conf.HostNetworks[hn.Name] = hn
}
conf.BindWildcardDefaultHostNetwork = agentConfig.Client.BindWildcardDefaultHostNetwork
conf.CgroupParent = cgutil.GetCgroupParent(agentConfig.Client.CgroupParent)
if agentConfig.Client.ReserveableCores != "" {
cores, err := cpuset.Parse(agentConfig.Client.ReserveableCores)
if err != nil {
return nil, fmt.Errorf("failed to parse 'reservable_cores': %v", err)
}
conf.ReservableCores = cores.ToSlice()
}
if agentConfig.Client.NomadServiceDiscovery != nil {
conf.NomadServiceDiscovery = *agentConfig.Client.NomadServiceDiscovery
}
artifactConfig, err := clientconfig.ArtifactConfigFromAgent(agentConfig.Client.Artifact)
if err != nil {
return nil, fmt.Errorf("invalid artifact config: %v", err)
}
conf.Artifact = artifactConfig
return conf, nil
}
// setupServer is used to setup the server if enabled
func (a *Agent) setupServer() error {
if !a.config.Server.Enabled {
return nil
}
// Setup the configuration
conf, err := a.serverConfig()
if err != nil {
return fmt.Errorf("server config setup failed: %s", err)
}
// Generate a node ID and persist it if it is the first instance, otherwise
// read the persisted node ID.
if err := a.setupNodeID(conf); err != nil {
return fmt.Errorf("setting up server node ID failed: %s", err)
}
// Sets up the keyring for gossip encryption
if err := a.setupKeyrings(conf); err != nil {
return fmt.Errorf("failed to configure keyring: %v", err)
}
// Create the server
server, err := nomad.NewServer(conf, a.consulCatalog, a.consulConfigEntries, a.consulACLs)
if err != nil {
return fmt.Errorf("server setup failed: %v", err)
}
a.server = server
// Consul check addresses default to bind but can be toggled to use advertise
rpcCheckAddr := a.config.normalizedAddrs.RPC
serfCheckAddr := a.config.normalizedAddrs.Serf
if *a.config.Consul.ChecksUseAdvertise {
rpcCheckAddr = a.config.AdvertiseAddrs.RPC
serfCheckAddr = a.config.AdvertiseAddrs.Serf
}
// Create the Nomad Server services for Consul
if *a.config.Consul.AutoAdvertise {
httpServ := &structs.Service{
Name: a.config.Consul.ServerServiceName,
PortLabel: a.config.AdvertiseAddrs.HTTP,
Tags: append([]string{consul.ServiceTagHTTP}, a.config.Consul.Tags...),
}
const isServer = true
if check := a.agentHTTPCheck(isServer); check != nil {
httpServ.Checks = []*structs.ServiceCheck{check}
}
rpcServ := &structs.Service{
Name: a.config.Consul.ServerServiceName,
PortLabel: a.config.AdvertiseAddrs.RPC,
Tags: append([]string{consul.ServiceTagRPC}, a.config.Consul.Tags...),
Checks: []*structs.ServiceCheck{
{
Name: a.config.Consul.ServerRPCCheckName,
Type: "tcp",
Interval: serverRpcCheckInterval,
Timeout: serverRpcCheckTimeout,
PortLabel: rpcCheckAddr,
},
},
}
serfServ := &structs.Service{
Name: a.config.Consul.ServerServiceName,
PortLabel: a.config.AdvertiseAddrs.Serf,
Tags: append([]string{consul.ServiceTagSerf}, a.config.Consul.Tags...),
Checks: []*structs.ServiceCheck{
{
Name: a.config.Consul.ServerSerfCheckName,
Type: "tcp",
Interval: serverSerfCheckInterval,
Timeout: serverSerfCheckTimeout,
PortLabel: serfCheckAddr,
},
},
}
// Register the Nomad server services (RPC, Serf, HTTP) and their checks
consulServices := []*structs.Service{
rpcServ,
serfServ,
httpServ,
}
if err := a.consulService.RegisterAgent(consulRoleServer, consulServices); err != nil {
return err
}
}
return nil
}
// setupNodeID will pull the persisted node ID, if any, or create a random one
// and persist it.
func (a *Agent) setupNodeID(config *nomad.Config) error {
// For dev mode we have no filesystem access so just make a node ID.
if a.config.DevMode {
config.NodeID = uuid.Generate()
return nil
}
// Load saved state, if any. Since a user could edit this, we also
// validate it. Saved state overwrites any configured node ID.
fileID := filepath.Join(config.DataDir, "node-id")
if _, err := os.Stat(fileID); err == nil {
rawID, err := ioutil.ReadFile(fileID)
if err != nil {
return err
}
nodeID := strings.TrimSpace(string(rawID))
nodeID = strings.ToLower(nodeID)
if _, err := uuidparse.ParseUUID(nodeID); err != nil {
return err
}
config.NodeID = nodeID
return nil
}
// If they've configured a node ID manually then just use that, as
// long as it's valid.
if config.NodeID != "" {
config.NodeID = strings.ToLower(config.NodeID)
if _, err := uuidparse.ParseUUID(config.NodeID); err != nil {
return err
}
// Persist this configured nodeID to our data directory
if err := escapingfs.EnsurePath(fileID, false); err != nil {
return err
}
if err := ioutil.WriteFile(fileID, []byte(config.NodeID), 0600); err != nil {
return err
}
return nil
}
// If we still don't have a valid node ID, make one.
if config.NodeID == "" {
id := uuid.Generate()
if err := escapingfs.EnsurePath(fileID, false); err != nil {
return err
}
if err := ioutil.WriteFile(fileID, []byte(id), 0600); err != nil {
return err
}
config.NodeID = id
}
return nil
}
// setupKeyrings is used to initialize and load keyrings during agent startup
func (a *Agent) setupKeyrings(config *nomad.Config) error {
file := filepath.Join(a.config.DataDir, serfKeyring)
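// If an encrypt key is configured and no keyring file exists yet, write one
// to disk first; in all cases, load whatever keyring file is present below.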
if a.config.Server.EncryptKey == "" {
goto LOAD
}
if _, err := os.Stat(file); err != nil {
if err := initKeyring(file, a.config.Server.EncryptKey, a.logger); err != nil {
return err
}
}
LOAD:
if _, err := os.Stat(file); err == nil {
config.SerfConfig.KeyringFile = file
}
if err := loadKeyringFile(config.SerfConfig); err != nil {
return err
}
// Success!
return nil
}
// setupClient is used to setup the client if enabled
func (a *Agent) setupClient() error {
if !a.config.Client.Enabled {
return nil
}
// Setup the configuration
conf, err := a.clientConfig()
if err != nil {
return fmt.Errorf("client setup failed: %v", err)
}
// Reserve some ports for the plugins if we are on Windows
if runtime.GOOS == "windows" {
if err := a.reservePortsForClient(conf); err != nil {
return err
}
}
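// Fall back to the default state database factory when one has not been
// injected; the factory chosen depends on whether the client runs in dev mode.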
if conf.StateDBFactory == nil {
conf.StateDBFactory = state.GetStateDBFactory(conf.DevMode)
}
// Set up a custom listener and dialer. This is used by Nomad clients when
// running consul-template functions that utilise the Nomad API. We lazy
// load this into the client config, therefore this needs to happen before
// we call NewClient.
a.builtinListener, a.builtinDialer = bufconndialer.New()
conf.TemplateDialer = a.builtinDialer
nomadClient, err := client.NewClient(
conf, a.consulCatalog, a.consulProxies, a.consulService, nil)
if err != nil {
return fmt.Errorf("client setup failed: %v", err)
}
a.client = nomadClient
// Create the Nomad Client services for Consul
if *a.config.Consul.AutoAdvertise {
httpServ := &structs.Service{
Name: a.config.Consul.ClientServiceName,
PortLabel: a.config.AdvertiseAddrs.HTTP,
Tags: append([]string{consul.ServiceTagHTTP}, a.config.Consul.Tags...),
}
const isServer = false
if check := a.agentHTTPCheck(isServer); check != nil {
httpServ.Checks = []*structs.ServiceCheck{check}
}
if err := a.consulService.RegisterAgent(consulRoleClient, []*structs.Service{httpServ}); err != nil {
return err
}
}
return nil
}
// agentHTTPCheck returns a health check for the agent's HTTP API if possible.
// If no HTTP health check can be supported, nil is returned.
func (a *Agent) agentHTTPCheck(server bool) *structs.ServiceCheck {
// Resolve the http check address
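// normalizedAddrs.HTTP may hold multiple addresses; the check targets the first.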
httpCheckAddr := a.config.normalizedAddrs.HTTP[0]
if *a.config.Consul.ChecksUseAdvertise {
httpCheckAddr = a.config.AdvertiseAddrs.HTTP
}
check := structs.ServiceCheck{
Name: a.config.Consul.ClientHTTPCheckName,
Type: "http",
Path: "/v1/agent/health?type=client",
Protocol: "http",
Interval: agentHttpCheckInterval,
Timeout: agentHttpCheckTimeout,
PortLabel: httpCheckAddr,
}
// Switch to endpoint that doesn't require a leader for servers
if server {
check.Name = a.config.Consul.ServerHTTPCheckName
check.Path = "/v1/agent/health?type=server"
}
if !a.config.TLSConfig.EnableHTTP {
// No HTTPS, return a plain http check
return &check
}
if a.config.TLSConfig.VerifyHTTPSClient {
a.logger.Warn("not registering Nomad HTTPS Health Check because verify_https_client enabled")
return nil
}
// HTTPS enabled; skip verification
check.Protocol = "https"
check.TLSSkipVerify = true
return &check
}
// reservePortsForClient reserves a range of ports for the client to use when
// it creates various plugins for log collection, executors, drivers, etc.
func (a *Agent) reservePortsForClient(conf *clientconfig.Config) error {
if conf.Node.ReservedResources == nil {
conf.Node.ReservedResources = &structs.NodeReservedResources{}
}
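// Append the client's dynamic plugin port range (client_min_port-client_max_port,
// e.g. "14000-14512" with the defaults) to any reserved host ports already
// configured, as a comma-separated list of ranges.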
res := conf.Node.ReservedResources.Networks.ReservedHostPorts
if res == "" {
res = fmt.Sprintf("%d-%d", conf.ClientMinPort, conf.ClientMaxPort)
} else {
res += fmt.Sprintf(",%d-%d", conf.ClientMinPort, conf.ClientMaxPort)
}
conf.Node.ReservedResources.Networks.ReservedHostPorts = res
return nil
}
// Leave is used to gracefully exit. Clients will inform servers
// of their departure so that allocations can be rescheduled.
func (a *Agent) Leave() error {
if a.client != nil {
if err := a.client.Leave(); err != nil {
a.logger.Error("client leave failed", "error", err)
}
}
if a.server != nil {
if err := a.server.Leave(); err != nil {
a.logger.Error("server leave failed", "error", err)
}
}
return nil
}
// Shutdown is used to terminate the agent.
func (a *Agent) Shutdown() error {
a.shutdownLock.Lock()
defer a.shutdownLock.Unlock()
if a.shutdown {
return nil
}
a.logger.Info("requesting shutdown")
if a.client != nil {
if err := a.client.Shutdown(); err != nil {
a.logger.Error("client shutdown failed", "error", err)
}
}
if a.server != nil {
if err := a.server.Shutdown(); err != nil {
a.logger.Error("server shutdown failed", "error", err)
}
}
if err := a.consulService.Shutdown(); err != nil {
a.logger.Error("shutting down Consul client failed", "error", err)
}
a.logger.Info("shutdown complete")
a.shutdown = true
close(a.shutdownCh)
return nil
}
// RPC is used to make an RPC call to the Nomad servers
func (a *Agent) RPC(method string, args interface{}, reply interface{}) error {
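// Servers handle the call locally; client-only agents forward it to a server.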
if a.server != nil {
return a.server.RPC(method, args, reply)
}
return a.client.RPC(method, args, reply)
}
// Client returns the configured client or nil
func (a *Agent) Client() *client.Client {
return a.client
}
// Server returns the configured server or nil
func (a *Agent) Server() *nomad.Server {
return a.server
}
// Stats is used to return statistics for debugging and insight
// for various sub-systems
func (a *Agent) Stats() map[string]map[string]string {
stats := make(map[string]map[string]string)
if a.server != nil {
subStat := a.server.Stats()
for k, v := range subStat {
stats[k] = v
}
}
if a.client != nil {
subStat := a.client.Stats()
for k, v := range subStat {
stats[k] = v
}
}
return stats
}
// ShouldReload determines if we should reload the configuration and agent
// connections. If the TLS Configuration has not changed, we shouldn't reload.
func (a *Agent) ShouldReload(newConfig *Config) (agent, http bool) {
a.configLock.Lock()
defer a.configLock.Unlock()
if newConfig.LogLevel != "" && newConfig.LogLevel != a.config.LogLevel {
agent = true
}
isEqual, err := a.config.TLSConfig.CertificateInfoIsEqual(newConfig.TLSConfig)
if err != nil {
a.logger.Error("parsing TLS certificate", "error", err)
return agent, false
} else if !isEqual {
return true, true
}
// Allow the ability to only reload HTTP connections
if a.config.TLSConfig.EnableHTTP != newConfig.TLSConfig.EnableHTTP {
http = true
agent = true
}
// Allow the ability to only reload HTTP connections
if a.config.TLSConfig.EnableRPC != newConfig.TLSConfig.EnableRPC {
agent = true
}
if a.config.TLSConfig.RPCUpgradeMode != newConfig.TLSConfig.RPCUpgradeMode {
agent = true
}
return agent, http
}
// Reload handles configuration changes for the agent. Provides a method that
// is easier to unit test, as this action is invoked via SIGHUP.
func (a *Agent) Reload(newConfig *Config) error {
a.configLock.Lock()
defer a.configLock.Unlock()
current := a.config.Copy()
updatedLogging := newConfig != nil && (newConfig.LogLevel != current.LogLevel)
if newConfig == nil || newConfig.TLSConfig == nil && !updatedLogging {
return fmt.Errorf("cannot reload agent with nil configuration")
}
if updatedLogging {
current.LogLevel = newConfig.LogLevel
a.logger.SetLevel(log.LevelFromString(current.LogLevel))
}
// Update eventer config
if newConfig.Audit != nil {
if err := a.entReloadEventer(newConfig.Audit); err != nil {
return err
}
}
// Allow auditor to call reopen regardless of config changes
// This is primarily for enterprise audit logging to allow the underlying
// file to be reopened if necessary
if err := a.auditor.Reopen(); err != nil {
return err
}
fullUpdateTLSConfig := func() {
// Completely reload the agent's TLS configuration (moving from non-TLS to
// TLS, or vice versa)
// This does not handle errors in loading the new TLS configuration
current.TLSConfig = newConfig.TLSConfig.Copy()
}
if !current.TLSConfig.IsEmpty() && !newConfig.TLSConfig.IsEmpty() {
// This is just a TLS configuration reload, we don't need to refresh
// existing network connections
// Reload the certificates on the keyloader and on success store the
// updated TLS config. It is important to reuse the same keyloader
// as this allows us to dynamically reload configurations not only
// on the Agent but on the Server and Client too (they are
// referencing the same keyloader).
keyloader := current.TLSConfig.GetKeyLoader()
_, err := keyloader.LoadKeyPair(newConfig.TLSConfig.CertFile, newConfig.TLSConfig.KeyFile)
if err != nil {
return err
}
current.TLSConfig = newConfig.TLSConfig
current.TLSConfig.KeyLoader = keyloader
a.config = current
return nil
} else if newConfig.TLSConfig.IsEmpty() && !current.TLSConfig.IsEmpty() {
a.logger.Warn("downgrading agent's existing TLS configuration to plaintext")
fullUpdateTLSConfig()
} else if !newConfig.TLSConfig.IsEmpty() && current.TLSConfig.IsEmpty() {
a.logger.Info("upgrading from plaintext configuration to TLS")
fullUpdateTLSConfig()
}
// Set agent config to the updated config
a.config = current
return nil
}
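
// Illustrative usage sketch, not part of the original source: assuming the
// branch above lives in the agent's Reload(newConfig *Config) error method,
// a caller can rotate certificates at runtime by handing it a config whose
// TLSConfig points at the new files (the paths below are hypothetical):
//
//	newCfg := agent.GetConfig().Copy()
//	newCfg.TLSConfig.CertFile = "/etc/nomad.d/tls/agent.crt" // hypothetical path
//	newCfg.TLSConfig.KeyFile = "/etc/nomad.d/tls/agent.key"  // hypothetical path
//	if err := agent.Reload(newCfg); err != nil {
//		// the key loader rejected the new pair; the existing TLS config stays active
//	}
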
// GetConfig returns the current agent configuration. The Config should *not*
// be mutated directly. First call Config.Copy.
func (a *Agent) GetConfig() *Config {
a.configLock.Lock()
defer a.configLock.Unlock()
return a.config
}
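
// Illustrative usage sketch, not part of the original source: the pointer
// returned by GetConfig is shared with other goroutines, so a caller that
// wants to change anything should work on a copy, as required above. The
// mutated field below is purely for illustration; agent is an *Agent.
//
//	cfg := agent.GetConfig().Copy()
//	cfg.LogLevel = "DEBUG" // change the private copy, never the shared config
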
// setupConsul creates the Consul client and starts its main Run loop.
func (a *Agent) setupConsul(consulConfig *config.ConsulConfig) error {
apiConf, err := consulConfig.ApiConfig()
if err != nil {
return err
}
consulClient, err := consulapi.NewClient(apiConf)
if err != nil {
return err
}
// Create Consul Catalog client for service discovery.
a.consulCatalog = consulClient.Catalog()
// Create Consul ConfigEntries client for managing Config Entries.
a.consulConfigEntries = consulClient.ConfigEntries()
// Create Consul ACL client for managing tokens.
a.consulACLs = consulClient.ACL()
// Create Consul Service client for service advertisement and checks.
isClient := false
if a.config.Client != nil && a.config.Client.Enabled {
isClient = true
}
// Create Consul Agent client for looking up info about the local Consul agent.
consulAgentClient := consulClient.Agent()
namespacesClient := consul.NewNamespacesClient(consulClient.Namespaces(), consulAgentClient)
a.consulService = consul.NewServiceClient(consulAgentClient, namespacesClient, a.logger, isClient)
a.consulProxies = consul.NewConnectProxiesClient(consulAgentClient)
// Run the Consul service client's syncing main loop.
go a.consulService.Run()
return nil
}
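
// Illustrative call-site sketch, not part of the original source: setupConsul
// is expected to be invoked during agent setup with the Consul block of the
// agent configuration; the a.config.Consul field name is an assumption made
// for this example.
//
//	if err := a.setupConsul(a.config.Consul); err != nil {
//		return fmt.Errorf("failed to initialize Consul client: %v", err)
//	}
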
// noOpAuditor is a no-op Auditor that fulfills the
// event.Auditor interface.
type noOpAuditor struct{}
// Ensure noOpAuditor satisfies the event.Auditor interface at compile time.
var _ event.Auditor = &noOpAuditor{}
func (e *noOpAuditor) Event(ctx context.Context, eventType string, payload interface{}) error {
return nil
}
func (e *noOpAuditor) Enabled() bool {
return false
}
func (e *noOpAuditor) Reopen() error {
return nil
}
func (e *noOpAuditor) SetEnabled(enabled bool) {}
func (e *noOpAuditor) DeliveryEnforced() bool { return false }
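
// Illustrative sketch, not part of the original source: because noOpAuditor
// satisfies event.Auditor, it can be installed as a default when no audit
// sink is configured, turning every audit call into a no-op.
//
//	if a.auditor == nil {
//		a.auditor = &noOpAuditor{}
//	}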