open-consul/agent/consul/config.go
Dhia Ayachi 9dc5200155
update raft to v1.3.8 (#12844)
* update raft to v1.3.7

* add changelog

* fix compilation error

* fix HeartbeatTimeout

* fix ElectionTimeout to reload only if value is valid

* fix default values for `ElectionTimeout` and `HeartbeatTimeout`

* fix test defaults

* bump raft to v1.3.8
2022-04-25 10:19:26 -04:00

614 lines
22 KiB
Go

package consul
import (
"fmt"
"net"
"os"
"time"
"github.com/hashicorp/memberlist"
"github.com/hashicorp/raft"
"github.com/hashicorp/serf/serf"
"golang.org/x/time/rate"
"github.com/hashicorp/consul/agent/checks"
"github.com/hashicorp/consul/agent/structs"
libserf "github.com/hashicorp/consul/lib/serf"
"github.com/hashicorp/consul/tlsutil"
"github.com/hashicorp/consul/types"
"github.com/hashicorp/consul/version"
)
const (
DefaultDC = "dc1"
DefaultRPCPort = 8300
DefaultLANSerfPort = 8301
DefaultWANSerfPort = 8302
// DefaultRaftMultiplier is used as a baseline Raft configuration that
// will be reliable on a very basic server. See docs/install/performance.html
// for information on how this value was obtained.
DefaultRaftMultiplier uint = 5
// MaxRaftMultiplier is a fairly arbitrary upper bound that limits the
// amount of performance detuning that's possible.
MaxRaftMultiplier uint = 10
)
var (
DefaultRPCAddr = &net.TCPAddr{IP: net.ParseIP("0.0.0.0"), Port: DefaultRPCPort}
// ProtocolVersionMap is the mapping of Consul protocol versions
// to Serf protocol versions. We mask the Serf protocols using
// our own protocol version.
protocolVersionMap map[uint8]uint8
)
func init() {
protocolVersionMap = map[uint8]uint8{
1: 4,
2: 4,
3: 4,
}
}
// (Enterprise-only) NetworkSegment is the address and port configuration
// for a network segment.
type NetworkSegment struct {
Name string
Bind string
Port int
Advertise string
RPCAddr *net.TCPAddr
SerfConfig *serf.Config
}
// Config is used to configure the server
type Config struct {
// Bootstrap mode is used to bring up the first Consul server.
// It is required so that it can elect a leader without any
// other nodes being present
Bootstrap bool
// BootstrapExpect mode is used to automatically bring up a collection of
// Consul servers. This can be used to automatically bring up a collection
// of nodes.
BootstrapExpect int
// Datacenter is the datacenter this Consul server represents.
Datacenter string
// PrimaryDatacenter is the authoritative datacenter for features like ACLs
// and Connect.
PrimaryDatacenter string
// DataDir is the directory to store our state in.
DataDir string
// DefaultQueryTime is the amount of time a blocking query will wait before
// Consul will force a response. This value can be overridden by the 'wait'
// query parameter.
DefaultQueryTime time.Duration
// MaxQueryTime is the maximum amount of time a blocking query can wait
// before Consul will force a response. Consul applies jitter to the wait
// time. The jittered time will be capped to MaxQueryTime.
MaxQueryTime time.Duration
// DevMode is used to enable a development server mode.
DevMode bool
// NodeID is a unique identifier for this node across space and time.
NodeID types.NodeID
// Node name is the name we use to advertise. Defaults to hostname.
NodeName string
// RaftConfig is the configuration used for Raft in the local DC
RaftConfig *raft.Config
// (Enterprise-only) ReadReplica is used to prevent this server from being added
// as a voting member of the Raft cluster.
ReadReplica bool
// NotifyListen is called after the RPC listener has been configured.
// RPCAdvertise will be set to the listener address if it hasn't been
// configured at this point.
NotifyListen func()
// RPCAddr is the RPC address used by Consul. This should be reachable
// by the WAN and LAN
RPCAddr *net.TCPAddr
// RPCAdvertise is the address that is advertised to other nodes for
// the RPC endpoint. This can differ from the RPC address, if for example
// the RPCAddr is unspecified "0.0.0.0:8300", but this address must be
// reachable. If RPCAdvertise is nil then it will be set to the Listener
// address after the listening socket is configured.
RPCAdvertise *net.TCPAddr
// RPCSrcAddr is the source address for outgoing RPC connections.
RPCSrcAddr *net.TCPAddr
// (Enterprise-only) The network segment this agent is part of.
Segment string
// (Enterprise-only) Segments is a list of network segments for a server to
// bind on.
Segments []NetworkSegment
// SerfLANConfig is the configuration for the intra-dc serf
SerfLANConfig *serf.Config
// SerfWANConfig is the configuration for the cross-dc serf
SerfWANConfig *serf.Config
// SerfFloodInterval controls how often we attempt to flood local Serf
// Consul servers into the global areas (WAN and user-defined areas in
// Consul Enterprise).
SerfFloodInterval time.Duration
// ReconcileInterval controls how often we reconcile the strongly
// consistent store with the Serf info. This is used to handle nodes
// that are force removed, as well as intermittent unavailability during
// leader election.
ReconcileInterval time.Duration
// ProtocolVersion is the protocol version to speak. This must be between
// ProtocolVersionMin and ProtocolVersionMax.
ProtocolVersion uint8
TLSConfig tlsutil.Config
// RejoinAfterLeave controls our interaction with Serf.
// When set to false (default), a leave causes a Consul to not rejoin
// the cluster until an explicit join is received. If this is set to
// true, we ignore the leave, and rejoin the cluster on start.
RejoinAfterLeave bool
// AdvertiseReconnectTimeout is the duration after which this node should be
// assumed to not be returning and thus should be reaped within Serf. This
// can only be set for Client agents
AdvertiseReconnectTimeout time.Duration
// Build is a string that is gossiped around, and can be used to help
// operators track which versions are actively deployed
Build string
ACLResolverSettings ACLResolverSettings
// ACLEnabled is used to enable ACLs
ACLsEnabled bool
// ACLInitialManagementToken is used to bootstrap the ACL system. It should be specified
// on the servers in the PrimaryDatacenter. When the leader comes online, it ensures
// that the initial management token is available. This provides the initial token.
ACLInitialManagementToken string
// ACLTokenReplication is used to enabled token replication.
//
// By default policy-only replication is enabled. When token
// replication is off and the primary datacenter is not
// yet upgraded to the new ACLs no replication will be performed
ACLTokenReplication bool
// ACLReplicationRate is the max number of replication rounds that can
// be run per second. Note that either 1 or 2 RPCs are used during each replication
// round
ACLReplicationRate int
// ACLReplicationBurst is how many replication RPCs can be bursted after a
// period of idleness
ACLReplicationBurst int
// ACLReplicationApplyLimit is the max number of replication-related
// apply operations that we allow during a one second period. This is
// used to limit the amount of Raft bandwidth used for replication.
ACLReplicationApplyLimit int
// ACLEnableKeyListPolicy is used to gate enforcement of the new "list" policy that
// protects listing keys by prefix. This behavior is opt-in
// by default in Consul 1.0 and later.
ACLEnableKeyListPolicy bool
AutoConfigEnabled bool
AutoConfigIntroToken string
AutoConfigIntroTokenFile string
AutoConfigServerAddresses []string
AutoConfigDNSSANs []string
AutoConfigIPSANs []net.IP
AutoConfigAuthzEnabled bool
AutoConfigAuthzAuthMethod structs.ACLAuthMethod
AutoConfigAuthzClaimAssertions []string
AutoConfigAuthzAllowReuse bool
// TombstoneTTL is used to control how long KV tombstones are retained.
// This provides a window of time where the X-Consul-Index is monotonic.
// Outside this window, the index may not be monotonic. This is a result
// of a few trade offs:
// 1) The index is defined by the data view and not globally. This is a
// performance optimization that prevents any write from incrementing the
// index for all data views.
// 2) Tombstones are not kept indefinitely, since otherwise storage required
// is also monotonic. This prevents deletes from reducing the disk space
// used.
// In theory, neither of these are intrinsic limitations, however for the
// purposes of building a practical system, they are reasonable trade offs.
//
// It is also possible to set this to an incredibly long time, thereby
// simulating infinite retention. This is not recommended however.
//
TombstoneTTL time.Duration
// TombstoneTTLGranularity is used to control how granular the timers are
// for the Tombstone GC. This is used to batch the GC of many keys together
// to reduce overhead. It is unlikely a user would ever need to tune this.
TombstoneTTLGranularity time.Duration
// Minimum Session TTL
SessionTTLMin time.Duration
// maxTokenExpirationDuration is the maximum difference allowed between
// ACLToken CreateTime and ExpirationTime values if ExpirationTime is set
// on a token.
ACLTokenMaxExpirationTTL time.Duration
// ACLTokenMinExpirationTTL is the minimum difference allowed between
// ACLToken CreateTime and ExpirationTime values if ExpirationTime is set
// on a token.
ACLTokenMinExpirationTTL time.Duration
// ServerUp callback can be used to trigger a notification that
// a Consul server is now up and known about.
ServerUp func()
// UserEventHandler callback can be used to handle incoming
// user events. This function should not block.
UserEventHandler func(serf.UserEvent)
// ConfigReplicationRate is the max number of replication rounds that can
// be run per second. Note that either 1 or 2 RPCs are used during each replication
// round
ConfigReplicationRate int
// ConfigReplicationBurst is how many replication rounds can be bursted after a
// period of idleness
ConfigReplicationBurst int
// ConfigReplicationApply limit is the max number of replication-related
// apply operations that we allow during a one second period. This is
// used to limit the amount of Raft bandwidth used for replication.
ConfigReplicationApplyLimit int
// FederationStateReplicationRate is the max number of replication rounds that can
// be run per second. Note that either 1 or 2 RPCs are used during each replication
// round
FederationStateReplicationRate int
// FederationStateReplicationBurst is how many replication rounds can be bursted after a
// period of idleness
FederationStateReplicationBurst int
// FederationStateReplicationApply limit is the max number of replication-related
// apply operations that we allow during a one second period. This is
// used to limit the amount of Raft bandwidth used for replication.
FederationStateReplicationApplyLimit int
// CoordinateUpdatePeriod controls how long a server batches coordinate
// updates before applying them in a Raft transaction. A larger period
// leads to fewer Raft transactions, but also the stored coordinates
// being more stale.
CoordinateUpdatePeriod time.Duration
// CoordinateUpdateBatchSize controls the maximum number of updates a
// server batches before applying them in a Raft transaction.
CoordinateUpdateBatchSize int
// CoordinateUpdateMaxBatches controls the maximum number of batches we
// are willing to apply in one period. After this limit we will issue a
// warning and discard the remaining updates.
CoordinateUpdateMaxBatches int
// CheckOutputMaxSize control the max size of output of checks
CheckOutputMaxSize int
// RPCHandshakeTimeout limits how long we will wait for the initial magic byte
// on an RPC client connection. It also governs how long we will wait for a
// TLS handshake when TLS is configured however the timout applies separately
// for the initial magic byte and the TLS handshake and inner magic byte.
RPCHandshakeTimeout time.Duration
// RPCHoldTimeout is how long an RPC can be "held" before it is errored.
// This is used to paper over a loss of leadership by instead holding RPCs,
// so that the caller experiences a slow response rather than an error.
// This period is meant to be long enough for a leader election to take
// place, and a small jitter is applied to avoid a thundering herd.
RPCHoldTimeout time.Duration
// RPCRateLimit and RPCMaxBurst control how frequently RPC calls are allowed
// to happen. In any large enough time interval, rate limiter limits the
// rate to RPCRateLimit tokens per second, with a maximum burst size of
// RPCMaxBurst events. As a special case, if RPCRateLimit == Inf (the infinite
// rate), RPCMaxBurst is ignored.
//
// See https://en.wikipedia.org/wiki/Token_bucket for more about token
// buckets.
RPCRateLimit rate.Limit
RPCMaxBurst int
// RPCMaxConnsPerClient is the limit of how many concurrent connections are
// allowed from a single source IP.
RPCMaxConnsPerClient int
// LeaveDrainTime is used to wait after a server has left the LAN Serf
// pool for RPCs to drain and new requests to be sent to other servers.
LeaveDrainTime time.Duration
// AutopilotConfig is used to apply the initial autopilot config when
// bootstrapping.
AutopilotConfig *structs.AutopilotConfig
// ServerHealthInterval is the frequency with which the health of the
// servers in the cluster will be updated.
ServerHealthInterval time.Duration
// AutopilotInterval is the frequency with which the leader will perform
// autopilot tasks, such as promoting eligible non-voters and removing
// dead servers.
AutopilotInterval time.Duration
// MetricsReportingInterval is the frequency with which the server will
// report usage metrics to the configured go-metrics Sinks.
MetricsReportingInterval time.Duration
// ConnectEnabled is whether to enable Connect features such as the CA.
ConnectEnabled bool
// ConnectMeshGatewayWANFederationEnabled determines if wan federation of
// datacenters should exclusively traverse mesh gateways.
ConnectMeshGatewayWANFederationEnabled bool
// DisableFederationStateAntiEntropy solely exists for use in unit tests to
// disable a background routine.
DisableFederationStateAntiEntropy bool
// OverrideInitialSerfTags solely exists for use in unit tests to ensure
// that a serf tag is initially set to a known value, rather than the
// default to test some consul upgrade scenarios with fewer races.
OverrideInitialSerfTags func(tags map[string]string)
// CAConfig is used to apply the initial Connect CA configuration when
// bootstrapping.
CAConfig *structs.CAConfiguration
// ConfigEntryBootstrap contains a list of ConfigEntries to ensure are created
// If entries of the same Kind/Name exist already these will not update them.
ConfigEntryBootstrap []structs.ConfigEntry
// AutoEncryptAllowTLS is whether to enable the server responding to
// AutoEncrypt.Sign requests.
AutoEncryptAllowTLS bool
RPCConfig RPCConfig
RaftBoltDBConfig RaftBoltDBConfig
// Embedded Consul Enterprise specific configuration
*EnterpriseConfig
}
// CheckProtocolVersion validates the protocol version.
func (c *Config) CheckProtocolVersion() error {
if c.ProtocolVersion < ProtocolVersionMin {
return fmt.Errorf("Protocol version '%d' too low. Must be in range: [%d, %d]", c.ProtocolVersion, ProtocolVersionMin, ProtocolVersionMax)
}
if c.ProtocolVersion > ProtocolVersionMax {
return fmt.Errorf("Protocol version '%d' too high. Must be in range: [%d, %d]", c.ProtocolVersion, ProtocolVersionMin, ProtocolVersionMax)
}
return nil
}
// CheckACL validates the ACL configuration.
// TODO: move this to ACLResolverSettings
func (c *Config) CheckACL() error {
switch c.ACLResolverSettings.ACLDefaultPolicy {
case "allow":
case "deny":
default:
return fmt.Errorf("Unsupported default ACL policy: %s", c.ACLResolverSettings.ACLDefaultPolicy)
}
switch c.ACLResolverSettings.ACLDownPolicy {
case "allow":
case "deny":
case "async-cache", "extend-cache":
default:
return fmt.Errorf("Unsupported down ACL policy: %s", c.ACLResolverSettings.ACLDownPolicy)
}
return nil
}
// DefaultConfig returns a default configuration.
func DefaultConfig() *Config {
hostname, err := os.Hostname()
if err != nil {
panic(err)
}
conf := &Config{
Build: version.Version,
Datacenter: DefaultDC,
NodeName: hostname,
RPCAddr: DefaultRPCAddr,
RaftConfig: raft.DefaultConfig(),
SerfLANConfig: libserf.DefaultConfig(),
SerfWANConfig: libserf.DefaultConfig(),
SerfFloodInterval: 60 * time.Second,
ReconcileInterval: 60 * time.Second,
ProtocolVersion: ProtocolVersion2Compatible,
ACLResolverSettings: ACLResolverSettings{
ACLsEnabled: false,
Datacenter: DefaultDC,
NodeName: hostname,
ACLPolicyTTL: 30 * time.Second,
ACLTokenTTL: 30 * time.Second,
ACLRoleTTL: 30 * time.Second,
ACLDownPolicy: "extend-cache",
ACLDefaultPolicy: "allow",
},
ACLReplicationRate: 1,
ACLReplicationBurst: 5,
ACLReplicationApplyLimit: 100, // ops / sec
ConfigReplicationRate: 1,
ConfigReplicationBurst: 5,
ConfigReplicationApplyLimit: 100, // ops / sec
FederationStateReplicationRate: 1,
FederationStateReplicationBurst: 5,
FederationStateReplicationApplyLimit: 100, // ops / sec
TombstoneTTL: 15 * time.Minute,
TombstoneTTLGranularity: 30 * time.Second,
SessionTTLMin: 10 * time.Second,
ACLTokenMinExpirationTTL: 1 * time.Minute,
ACLTokenMaxExpirationTTL: 24 * time.Hour,
// These are tuned to provide a total throughput of 128 updates
// per second. If you update these, you should update the client-
// side SyncCoordinateRateTarget parameter accordingly.
CoordinateUpdatePeriod: 5 * time.Second,
CoordinateUpdateBatchSize: 128,
CoordinateUpdateMaxBatches: 5,
CheckOutputMaxSize: checks.DefaultBufSize,
RPCRateLimit: rate.Inf,
RPCMaxBurst: 1000,
// TODO (slackpad) - Until #3744 is done, we need to keep these
// in sync with agent/config/default.go.
AutopilotConfig: &structs.AutopilotConfig{
CleanupDeadServers: true,
LastContactThreshold: 200 * time.Millisecond,
MaxTrailingLogs: 250,
ServerStabilizationTime: 10 * time.Second,
},
CAConfig: &structs.CAConfiguration{
Provider: "consul",
Config: map[string]interface{}{
"LeafCertTTL": structs.DefaultLeafCertTTL,
"IntermediateCertTTL": structs.DefaultIntermediateCertTTL,
"RootCertTTL": structs.DefaultRootCertTTL,
},
},
// Stay under the 10 second aggregation interval of
// go-metrics. This ensures we always report the
// usage metrics in each cycle.
MetricsReportingInterval: 9 * time.Second,
ServerHealthInterval: 2 * time.Second,
AutopilotInterval: 10 * time.Second,
DefaultQueryTime: 300 * time.Second,
MaxQueryTime: 600 * time.Second,
EnterpriseConfig: DefaultEnterpriseConfig(),
}
// Increase our reap interval to 3 days instead of 24h.
conf.SerfLANConfig.ReconnectTimeout = 3 * 24 * time.Hour
conf.SerfWANConfig.ReconnectTimeout = 3 * 24 * time.Hour
// WAN Serf should use the WAN timing, since we are using it
// to communicate between DC's
conf.SerfWANConfig.MemberlistConfig = memberlist.DefaultWANConfig()
// Ensure we don't have port conflicts
conf.SerfLANConfig.MemberlistConfig.BindPort = DefaultLANSerfPort
conf.SerfWANConfig.MemberlistConfig.BindPort = DefaultWANSerfPort
// Allow dead nodes to be replaced after 30 seconds.
conf.SerfLANConfig.MemberlistConfig.DeadNodeReclaimTime = 30 * time.Second
conf.SerfWANConfig.MemberlistConfig.DeadNodeReclaimTime = 30 * time.Second
// Raft protocol version 3 only works with other Consul servers running
// 0.8.0 or later.
conf.RaftConfig.ProtocolVersion = 3
// Disable shutdown on removal
conf.RaftConfig.ShutdownOnRemove = false
// Check every 5 seconds to see if there are enough new entries for a snapshot, can be overridden
conf.RaftConfig.SnapshotInterval = 30 * time.Second
// Snapshots are created every 16384 entries by default, can be overridden
conf.RaftConfig.SnapshotThreshold = 16384
return conf
}
// CloneSerfLANConfig clones an existing serf.Config used on the LAN by
// reconstructing it from defaults and re-applying changes made in the agent
// configs.
//
// This function is tricky to keep from rotting so we enforce that it MUST work
// by cloning our own serf LAN configuration on startup and only using the
// cloned one so any configs we need to change have to be changed here for them
// to work at all.
func CloneSerfLANConfig(base *serf.Config) *serf.Config {
cfg := DefaultConfig().SerfLANConfig
// from consul.DefaultConfig()
cfg.ReconnectTimeout = base.ReconnectTimeout
cfg.MemberlistConfig.BindPort = base.MemberlistConfig.BindPort
cfg.MemberlistConfig.DeadNodeReclaimTime = base.MemberlistConfig.DeadNodeReclaimTime
// from agent.newConsulConfig()
cfg.MemberlistConfig.BindAddr = base.MemberlistConfig.BindAddr
cfg.MemberlistConfig.BindPort = base.MemberlistConfig.BindPort
cfg.MemberlistConfig.CIDRsAllowed = base.MemberlistConfig.CIDRsAllowed
cfg.MemberlistConfig.AdvertiseAddr = base.MemberlistConfig.AdvertiseAddr
cfg.MemberlistConfig.AdvertisePort = base.MemberlistConfig.AdvertisePort
cfg.MemberlistConfig.GossipVerifyIncoming = base.MemberlistConfig.GossipVerifyIncoming
cfg.MemberlistConfig.GossipVerifyOutgoing = base.MemberlistConfig.GossipVerifyOutgoing
cfg.MemberlistConfig.GossipInterval = base.MemberlistConfig.GossipInterval
cfg.MemberlistConfig.GossipNodes = base.MemberlistConfig.GossipNodes
cfg.MemberlistConfig.ProbeInterval = base.MemberlistConfig.ProbeInterval
cfg.MemberlistConfig.ProbeTimeout = base.MemberlistConfig.ProbeTimeout
cfg.MemberlistConfig.SuspicionMult = base.MemberlistConfig.SuspicionMult
cfg.MemberlistConfig.RetransmitMult = base.MemberlistConfig.RetransmitMult
// agent/keyring.go
cfg.MemberlistConfig.Keyring = base.MemberlistConfig.Keyring
// tests
cfg.KeyringFile = base.KeyringFile
cfg.ReapInterval = base.ReapInterval
cfg.TombstoneTimeout = base.TombstoneTimeout
cfg.MemberlistConfig.SecretKey = base.MemberlistConfig.SecretKey
return cfg
}
// RPCConfig settings for the RPC server
//
// TODO: move many settings to this struct.
type RPCConfig struct {
EnableStreaming bool
}
// ReloadableConfig is the configuration that is passed to ReloadConfig when
// application config is reloaded.
type ReloadableConfig struct {
RPCRateLimit rate.Limit
RPCMaxBurst int
RPCMaxConnsPerClient int
ConfigEntryBootstrap []structs.ConfigEntry
RaftSnapshotThreshold int
RaftSnapshotInterval time.Duration
RaftTrailingLogs int
HeartbeatTimeout time.Duration
ElectionTimeout time.Duration
}
type RaftBoltDBConfig struct {
NoFreelistSync bool
}