open-nomad/nomad/config.go

package nomad

import (
	"io"
	"net"
	"os"
	"runtime"
	"time"

	log "github.com/hashicorp/go-hclog"
	"golang.org/x/exp/slices"

	"github.com/hashicorp/memberlist"
	"github.com/hashicorp/nomad/helper/pluginutils/loader"
	"github.com/hashicorp/nomad/helper/pointer"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/deploymentwatcher"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/nomad/structs/config"
	"github.com/hashicorp/nomad/scheduler"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
)

const (
	DefaultRegion   = "global"
	DefaultDC       = "dc1"
	DefaultSerfPort = 4648
)

func DefaultRPCAddr() *net.TCPAddr {
	return &net.TCPAddr{IP: net.ParseIP("127.0.0.1"), Port: 4647}
}

// Config is used to parameterize the server
type Config struct {
	// BootstrapExpect mode is used to automatically bring up a
	// collection of Nomad servers. This can be used to automatically
	// bring up a collection of nodes.
	//
	// The BootstrapExpect can be of any of the following values:
	//  1: Server will form a single node cluster and become a leader immediately
	//  N, larger than 1: Server will wait until it's connected to N servers
	//      before attempting leadership and forming the cluster.  No Raft Log operation
	//      will succeed until then.
	//  0: Server will wait to get a Raft configuration from another node and may not
	//      attempt to form a cluster or establish leadership on its own.
	BootstrapExpect int

	// DataDir is the directory to store our state in
	DataDir string

	// DevMode is used for development purposes only and limits the
	// use of persistence or state.
	DevMode bool

	// EnableDebug is used to enable debugging RPC endpoints
	// in the absence of ACLs
	EnableDebug bool

	// EnableEventBroker is used to enable or disable state store
	// event publishing
	EnableEventBroker bool

	// EventBufferSize is the amount of events to hold in memory.
	EventBufferSize int64

	// LogOutput is the location to write logs to. If this is not set,
	// logs will go to stderr.
	LogOutput io.Writer

	// Logger is the logger used by the server.
	Logger log.InterceptLogger

	// RPCAddr is the RPC address used by Nomad. This should be reachable
	// by the other servers and clients
	RPCAddr *net.TCPAddr

	// ClientRPCAdvertise is the address that is advertised to client nodes for
	// the RPC endpoint. This can differ from the RPC address, if for example
	// the RPCAddr is unspecified "0.0.0.0:4646", but this address must be
	// reachable
	ClientRPCAdvertise *net.TCPAddr

	// ServerRPCAdvertise is the address that is advertised to other servers for
	// the RPC endpoint. This can differ from the RPC address, if for example
	// the RPCAddr is unspecified "0.0.0.0:4646", but this address must be
	// reachable
	ServerRPCAdvertise *net.TCPAddr

	// RaftConfig is the configuration used for Raft in the local DC
	RaftConfig *raft.Config

	// RaftTimeout is applied to any network traffic for raft. Defaults to 10s.
	RaftTimeout time.Duration

	// (Enterprise-only) NonVoter is used to prevent this server from being added
	// as a voting member of the Raft cluster.
	NonVoter bool

	// (Enterprise-only) RedundancyZone is the redundancy zone to use for this server.
	RedundancyZone string

	// (Enterprise-only) UpgradeVersion is the custom upgrade version to use when
	// performing upgrade migrations.
	UpgradeVersion string

	// SerfConfig is the configuration for the serf cluster
	SerfConfig *serf.Config

	// Node name is the name we use to advertise. Defaults to hostname.
	NodeName string

	// NodeID is the uuid of this server.
	NodeID string

	// Region is the region this Nomad server belongs to.
	Region string

	// AuthoritativeRegion is the region which is treated as the authoritative source
	// for ACLs and Policies. This provides a single source of truth to resolve conflicts.
	AuthoritativeRegion string

	// Datacenter is the datacenter this Nomad server belongs to.
	Datacenter string

	// Build is a string that is gossiped around, and can be used to help
	// operators track which versions are actively deployed
	Build string

	// Revision is a string that carries the version.GitCommit of Nomad that
	// was compiled.
	Revision string

	// NumSchedulers is the number of scheduler thread that are run.
	// This can be as many as one per core, or zero to disable this server
	// from doing any scheduling work.
	NumSchedulers int

	// EnabledSchedulers controls the set of sub-schedulers that are
	// enabled for this server to handle. This will restrict the evaluations
	// that the workers dequeue for processing.
	EnabledSchedulers []string

	// ReconcileInterval controls how often we reconcile the strongly
	// consistent store with the Serf info. This is used to handle nodes
	// that are force removed, as well as intermittent unavailability during
	// leader election.
	ReconcileInterval time.Duration

	// EvalGCInterval is how often we dispatch a job to GC evaluations
	EvalGCInterval time.Duration

	// EvalGCThreshold is how "old" an evaluation must be to be eligible
	// for GC. This gives users some time to debug a failed evaluation.
	//
	// Please note that the rules for GC of evaluations which belong to a batch
	// job are separate and controlled by `BatchEvalGCThreshold`
	EvalGCThreshold time.Duration

	// BatchEvalGCThreshold is how "old" an evaluation must be to be eligible
	// for GC if the eval belongs to a batch job.
	BatchEvalGCThreshold time.Duration

	// JobGCInterval is how often we dispatch a job to GC jobs that are
	// available for garbage collection.
	JobGCInterval time.Duration

	// JobGCThreshold is how old a job must be before it eligible for GC. This gives
	// the user time to inspect the job.
	JobGCThreshold time.Duration

	// NodeGCInterval is how often we dispatch a job to GC failed nodes.
	NodeGCInterval time.Duration

	// NodeGCThreshold is how "old" a node must be to be eligible
	// for GC. This gives users some time to view and debug a failed nodes.
	NodeGCThreshold time.Duration

	// DeploymentGCInterval is how often we dispatch a job to GC terminal
	// deployments.
	DeploymentGCInterval time.Duration

	// DeploymentGCThreshold is how "old" a deployment must be to be eligible
	// for GC. This gives users some time to view terminal deployments.
	DeploymentGCThreshold time.Duration

	// CSIPluginGCInterval is how often we dispatch a job to GC unused plugins.
	CSIPluginGCInterval time.Duration

	// CSIPluginGCThreshold is how "old" a plugin must be to be eligible
	// for GC. This gives users some time to debug plugins.
	CSIPluginGCThreshold time.Duration

	// CSIVolumeClaimGCInterval is how often we dispatch a job to GC
	// volume claims.
	CSIVolumeClaimGCInterval time.Duration

	// CSIVolumeClaimGCThreshold is how "old" a volume must be to be
	// eligible for GC. This gives users some time to debug volumes.
	CSIVolumeClaimGCThreshold time.Duration

	// OneTimeTokenGCInterval is how often we dispatch a job to GC
	// one-time tokens.
	OneTimeTokenGCInterval time.Duration

	// ACLTokenExpirationGCInterval is how often we dispatch a job to GC
	// expired ACL tokens.
	ACLTokenExpirationGCInterval time.Duration

	// ACLTokenExpirationGCThreshold controls how "old" an expired ACL token
	// must be to be collected by GC.
	ACLTokenExpirationGCThreshold time.Duration

	// RootKeyGCInterval is how often we dispatch a job to GC
	// encryption key metadata
	RootKeyGCInterval time.Duration

	// RootKeyGCThreshold is how "old" encryption key metadata must be
	// to be eligible for GC.
	RootKeyGCThreshold time.Duration

	// RootKeyRotationThreshold is how "old" an active key can be
	// before it's rotated
	RootKeyRotationThreshold time.Duration

	// VariablesRekeyInterval is how often we dispatch a job to
	// rekey any variables associated with a key in the Rekeying state
	VariablesRekeyInterval time.Duration

	// EvalNackTimeout controls how long we allow a sub-scheduler to
	// work on an evaluation before we consider it failed and Nack it.
	// This allows that evaluation to be handed to another sub-scheduler
	// to work on. Defaults to 60 seconds. This should be long enough that
	// no evaluation hits it unless the sub-scheduler has failed.
	EvalNackTimeout time.Duration

	// EvalDeliveryLimit is the limit of attempts we make to deliver and
	// process an evaluation. This is used so that an eval that will never
	// complete eventually fails out of the system.
	EvalDeliveryLimit int

	// EvalNackInitialReenqueueDelay is the delay applied before reenqueuing a
	// Nacked evaluation for the first time. This value should be small as the
	// initial Nack can be due to a down machine and the eval should be retried
	// quickly for liveliness.
	EvalNackInitialReenqueueDelay time.Duration

	// EvalNackSubsequentReenqueueDelay is the delay applied before reenqueuing
	// an evaluation that has been Nacked more than once. This delay is
	// compounding after the first Nack. This value should be significantly
	// longer than the initial delay as the purpose it severs is to apply
	// back-pressure as evaluations are being Nacked either due to scheduler
	// failures or because they are hitting their Nack timeout, both of which
	// are signs of high server resource usage.
	EvalNackSubsequentReenqueueDelay time.Duration

	// EvalFailedFollowupBaselineDelay is the minimum time waited before
	// retrying a failed evaluation.
	EvalFailedFollowupBaselineDelay time.Duration

	// EvalReapCancelableInterval is the interval for the periodic reaping of
	// cancelable evaluations. Cancelable evaluations are canceled whenever any
	// eval is ack'd but this sweeps up on quiescent clusters. This config value
	// exists only for testing.
	EvalReapCancelableInterval time.Duration

	// EvalFailedFollowupDelayRange defines the range of additional time from
	// the baseline in which to wait before retrying a failed evaluation. The
	// additional delay is selected from this range randomly.
	EvalFailedFollowupDelayRange time.Duration

	// NodePlanRejectionEnabled controls if node rejection tracker is enabled.
	NodePlanRejectionEnabled bool

	// NodePlanRejectionThreshold is the number of times a node must have a
	// plan rejection before it is set as ineligible.
	NodePlanRejectionThreshold int

	// NodePlanRejectionWindow is the time window used to track plan
	// rejections for nodes.
	NodePlanRejectionWindow time.Duration

	// MinHeartbeatTTL is the minimum time between heartbeats.
	// This is used as a floor to prevent excessive updates.
	MinHeartbeatTTL time.Duration

	// MaxHeartbeatsPerSecond is the maximum target rate of heartbeats
	// being processed per second. This allows the TTL to be increased
	// to meet the target rate.
	MaxHeartbeatsPerSecond float64

	// HeartbeatGrace is the additional time given as a grace period
	// beyond the TTL to account for network and processing delays
	// as well as clock skew.
	HeartbeatGrace time.Duration

	// FailoverHeartbeatTTL is the TTL applied to heartbeats after
	// a new leader is elected, since we no longer know the status
	// of all the heartbeats.
	FailoverHeartbeatTTL time.Duration

	// ConsulConfig is this Agent's Consul configuration
	ConsulConfig *config.ConsulConfig

	// VaultConfig is this Agent's Vault configuration
	VaultConfig *config.VaultConfig

	// RPCHoldTimeout is how long an RPC can be "held" before it is errored.
	// This is used to paper over a loss of leadership by instead holding RPCs,
	// so that the caller experiences a slow response rather than an error.
	// This period is meant to be long enough for a leader election to take
	// place, and a small jitter is applied to avoid a thundering herd.
	RPCHoldTimeout time.Duration

	// TLSConfig holds various TLS related configurations
	TLSConfig *config.TLSConfig

	// ACLEnabled controls if ACL enforcement and management is enabled.
	ACLEnabled bool

	// ReplicationBackoff is how much we backoff when replication errors.
	// This is a tunable knob for testing primarily.
	ReplicationBackoff time.Duration

	// ReplicationToken is the ACL Token Secret ID used to fetch from
	// the Authoritative Region.
	ReplicationToken string

	// TokenMinExpirationTTL is used to enforce the lowest acceptable value for
	// ACL token expiration.
	ACLTokenMinExpirationTTL time.Duration

	// TokenMaxExpirationTTL is used to enforce the highest acceptable value
	// for ACL token expiration.
	ACLTokenMaxExpirationTTL time.Duration

	// SentinelGCInterval is the interval that we GC unused policies.
	SentinelGCInterval time.Duration

	// SentinelConfig is this Agent's Sentinel configuration
	SentinelConfig *config.SentinelConfig

	// StatsCollectionInterval is the interval at which the Nomad server
	// publishes metrics which are periodic in nature like updating gauges
	StatsCollectionInterval time.Duration

	// DisableDispatchedJobSummaryMetrics allows for ignore dispatched jobs when
	// publishing Job summary metrics
	DisableDispatchedJobSummaryMetrics bool

	// DisableRPCRateMetricsLabels drops the label for the identity of the
	// requester when publishing metrics on RPC rate on the server. This may be
	// useful to control metrics collection costs in environments where request
	// rate is well-controlled but cardinality of requesters is high.
	DisableRPCRateMetricsLabels bool

	// AutopilotConfig is used to apply the initial autopilot config when
	// bootstrapping.
	AutopilotConfig *structs.AutopilotConfig

	// ServerHealthInterval is the frequency with which the health of the
	// servers in the cluster will be updated.
	ServerHealthInterval time.Duration

	// AutopilotInterval is the frequency with which the leader will perform
	// autopilot tasks, such as promoting eligible non-voters and removing
	// dead servers.
	AutopilotInterval time.Duration

	// DefaultSchedulerConfig configures the initial scheduler config to be persisted in Raft.
	// Once the cluster is bootstrapped, and Raft persists the config (from here or through API)
	// and this value is ignored.
	DefaultSchedulerConfig structs.SchedulerConfiguration `hcl:"default_scheduler_config"`

	// PluginLoader is used to load plugins.
	PluginLoader loader.PluginCatalog

	// PluginSingletonLoader is a plugin loader that will returns singleton
	// instances of the plugins.
	PluginSingletonLoader loader.PluginCatalog

	// RPCHandshakeTimeout is the deadline by which RPC handshakes must
	// complete. The RPC handshake includes the first byte read as well as
	// the TLS handshake and subsequent byte read if TLS is enabled.
	//
	// The deadline is reset after the first byte is read so when TLS is
	// enabled RPC connections may take (timeout * 2) to complete.
	//
	// 0 means no timeout.
	RPCHandshakeTimeout time.Duration

	// RPCMaxConnsPerClient is the maximum number of concurrent RPC
	// connections from a single IP address. nil/0 means no limit.
	RPCMaxConnsPerClient int

	// LicenseConfig is a tunable knob for enterprise license testing.
	LicenseConfig *LicenseConfig
	LicenseEnv    string
	LicensePath   string

	// SearchConfig provides knobs for Search API.
	SearchConfig *structs.SearchConfig

	// RaftBoltNoFreelistSync configures whether freelist syncing is enabled.
	RaftBoltNoFreelistSync bool

	// AgentShutdown is used to call agent.Shutdown from the context of a Server
	// It is used primarily for licensing
	AgentShutdown func() error

	// DeploymentQueryRateLimit is in queries per second and is used by the
	// DeploymentWatcher to throttle the amount of simultaneously deployments
	DeploymentQueryRateLimit float64
}

func (c *Config) Copy() *Config {
	if c == nil {
		return nil
	}

	nc := *c

	// Can't deep copy interfaces
	// LogOutput io.Writer
	// Logger log.InterceptLogger
	// PluginLoader loader.PluginCatalog
	// PluginSingletonLoader loader.PluginCatalog

	nc.RPCAddr = pointer.Copy(c.RPCAddr)
	nc.ClientRPCAdvertise = pointer.Copy(c.ClientRPCAdvertise)
	nc.ServerRPCAdvertise = pointer.Copy(c.ServerRPCAdvertise)
	nc.RaftConfig = pointer.Copy(c.RaftConfig)
	nc.SerfConfig = pointer.Copy(c.SerfConfig)
	nc.EnabledSchedulers = slices.Clone(c.EnabledSchedulers)
	nc.ConsulConfig = c.ConsulConfig.Copy()
	nc.VaultConfig = c.VaultConfig.Copy()
	nc.TLSConfig = c.TLSConfig.Copy()
	nc.SentinelConfig = c.SentinelConfig.Copy()
	nc.AutopilotConfig = c.AutopilotConfig.Copy()
	nc.LicenseConfig = c.LicenseConfig.Copy()
	nc.SearchConfig = c.SearchConfig.Copy()

	return &nc
}

// DefaultConfig returns the default configuration. Only used as the basis for
// merging agent or test parameters.
func DefaultConfig() *Config {
	hostname, err := os.Hostname()
	if err != nil {
		panic(err)
	}

	c := &Config{
		Region:                           DefaultRegion,
		AuthoritativeRegion:              DefaultRegion,
		Datacenter:                       DefaultDC,
		NodeName:                         hostname,
		NodeID:                           uuid.Generate(),
		RaftConfig:                       raft.DefaultConfig(),
		RaftTimeout:                      10 * time.Second,
		LogOutput:                        os.Stderr,
		RPCAddr:                          DefaultRPCAddr(),
		SerfConfig:                       serf.DefaultConfig(),
		NumSchedulers:                    1,
		ReconcileInterval:                60 * time.Second,
		EvalGCInterval:                   5 * time.Minute,
		EvalGCThreshold:                  1 * time.Hour,
		BatchEvalGCThreshold:             24 * time.Hour,
		JobGCInterval:                    5 * time.Minute,
		JobGCThreshold:                   4 * time.Hour,
		NodeGCInterval:                   5 * time.Minute,
		NodeGCThreshold:                  24 * time.Hour,
		DeploymentGCInterval:             5 * time.Minute,
		DeploymentGCThreshold:            1 * time.Hour,
		CSIPluginGCInterval:              5 * time.Minute,
		CSIPluginGCThreshold:             1 * time.Hour,
		CSIVolumeClaimGCInterval:         5 * time.Minute,
		CSIVolumeClaimGCThreshold:        5 * time.Minute,
		OneTimeTokenGCInterval:           10 * time.Minute,
		ACLTokenExpirationGCInterval:     5 * time.Minute,
		ACLTokenExpirationGCThreshold:    1 * time.Hour,
		RootKeyGCInterval:                10 * time.Minute,
		RootKeyGCThreshold:               1 * time.Hour,
		RootKeyRotationThreshold:         720 * time.Hour, // 30 days
		VariablesRekeyInterval:           10 * time.Minute,
		EvalNackTimeout:                  60 * time.Second,
		EvalDeliveryLimit:                3,
		EvalNackInitialReenqueueDelay:    1 * time.Second,
		EvalNackSubsequentReenqueueDelay: 20 * time.Second,
		EvalFailedFollowupBaselineDelay:  1 * time.Minute,
		EvalFailedFollowupDelayRange:     5 * time.Minute,
		EvalReapCancelableInterval:       5 * time.Second,
		MinHeartbeatTTL:                  10 * time.Second,
		MaxHeartbeatsPerSecond:           50.0,
		HeartbeatGrace:                   10 * time.Second,
		FailoverHeartbeatTTL:             300 * time.Second,
		NodePlanRejectionEnabled:         false,
		NodePlanRejectionThreshold:       15,
		NodePlanRejectionWindow:          10 * time.Minute,
		ConsulConfig:                     config.DefaultConsulConfig(),
		VaultConfig:                      config.DefaultVaultConfig(),
		RPCHoldTimeout:                   5 * time.Second,
		StatsCollectionInterval:          1 * time.Minute,
		TLSConfig:                        &config.TLSConfig{},
		ReplicationBackoff:               30 * time.Second,
		SentinelGCInterval:               30 * time.Second,
		LicenseConfig:                    &LicenseConfig{},
		EnableEventBroker:                true,
		EventBufferSize:                  100,
		ACLTokenMinExpirationTTL:         1 * time.Minute,
		ACLTokenMaxExpirationTTL:         24 * time.Hour,
		AutopilotConfig: &structs.AutopilotConfig{
			CleanupDeadServers:      true,
			LastContactThreshold:    200 * time.Millisecond,
			MaxTrailingLogs:         250,
			ServerStabilizationTime: 10 * time.Second,
		},
		ServerHealthInterval: 2 * time.Second,
		AutopilotInterval:    10 * time.Second,
		DefaultSchedulerConfig: structs.SchedulerConfiguration{
			SchedulerAlgorithm: structs.SchedulerAlgorithmBinpack,
			PreemptionConfig: structs.PreemptionConfig{
				SystemSchedulerEnabled:   true,
				SysBatchSchedulerEnabled: false,
				BatchSchedulerEnabled:    false,
				ServiceSchedulerEnabled:  false,
			},
		},
		DeploymentQueryRateLimit: deploymentwatcher.LimitStateQueriesPerSecond,
	}

	// Enable all known schedulers by default
	c.EnabledSchedulers = make([]string, 0, len(scheduler.BuiltinSchedulers))
	for name := range scheduler.BuiltinSchedulers {
		c.EnabledSchedulers = append(c.EnabledSchedulers, name)
	}
	c.EnabledSchedulers = append(c.EnabledSchedulers, structs.JobTypeCore)

	// Default the number of schedulers to match the cores
	c.NumSchedulers = runtime.NumCPU()

	// Increase our reap interval to 3 days instead of 24h.
	c.SerfConfig.ReconnectTimeout = 3 * 24 * time.Hour

	// Serf should use the WAN timing, since we are using it
	// to communicate between DC's
	c.SerfConfig.MemberlistConfig = memberlist.DefaultWANConfig()
	c.SerfConfig.MemberlistConfig.BindPort = DefaultSerfPort

	// Disable shutdown on removal
	c.RaftConfig.ShutdownOnRemove = false

	// Default to Raft v3 since Nomad 1.3
	c.RaftConfig.ProtocolVersion = 3

	return c
}