Gossip tuneables (#4444)
Expose a few gossip tuneables for both the LAN and WAN interfaces: gossip_nodes, gossip_interval, probe_timeout, probe_interval, retransmit_mult, and suspicion_mult.
This commit is contained in:
parent
567d9eedf6
commit
5c7c58ed26
|
@ -782,10 +782,12 @@ func (a *Agent) consulConfig() (*consul.Config, error) {
|
|||
base.SerfLANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrLAN.Port
|
||||
base.SerfLANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming
|
||||
base.SerfLANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing
|
||||
base.SerfLANConfig.MemberlistConfig.GossipInterval = a.config.ConsulSerfLANGossipInterval
|
||||
base.SerfLANConfig.MemberlistConfig.ProbeInterval = a.config.ConsulSerfLANProbeInterval
|
||||
base.SerfLANConfig.MemberlistConfig.ProbeTimeout = a.config.ConsulSerfLANProbeTimeout
|
||||
base.SerfLANConfig.MemberlistConfig.SuspicionMult = a.config.ConsulSerfLANSuspicionMult
|
||||
base.SerfLANConfig.MemberlistConfig.GossipInterval = a.config.GossipLANGossipInterval
|
||||
base.SerfLANConfig.MemberlistConfig.GossipNodes = a.config.GossipLANGossipNodes
|
||||
base.SerfLANConfig.MemberlistConfig.ProbeInterval = a.config.GossipLANProbeInterval
|
||||
base.SerfLANConfig.MemberlistConfig.ProbeTimeout = a.config.GossipLANProbeTimeout
|
||||
base.SerfLANConfig.MemberlistConfig.SuspicionMult = a.config.GossipLANSuspicionMult
|
||||
base.SerfLANConfig.MemberlistConfig.RetransmitMult = a.config.GossipLANRetransmitMult
|
||||
|
||||
if a.config.SerfBindAddrWAN != nil {
|
||||
base.SerfWANConfig.MemberlistConfig.BindAddr = a.config.SerfBindAddrWAN.IP.String()
|
||||
|
@ -794,10 +796,12 @@ func (a *Agent) consulConfig() (*consul.Config, error) {
|
|||
base.SerfWANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrWAN.Port
|
||||
base.SerfWANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming
|
||||
base.SerfWANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing
|
||||
base.SerfWANConfig.MemberlistConfig.GossipInterval = a.config.ConsulSerfWANGossipInterval
|
||||
base.SerfWANConfig.MemberlistConfig.ProbeInterval = a.config.ConsulSerfWANProbeInterval
|
||||
base.SerfWANConfig.MemberlistConfig.ProbeTimeout = a.config.ConsulSerfWANProbeTimeout
|
||||
base.SerfWANConfig.MemberlistConfig.SuspicionMult = a.config.ConsulSerfWANSuspicionMult
|
||||
base.SerfWANConfig.MemberlistConfig.GossipInterval = a.config.GossipWANGossipInterval
|
||||
base.SerfWANConfig.MemberlistConfig.GossipNodes = a.config.GossipWANGossipNodes
|
||||
base.SerfWANConfig.MemberlistConfig.ProbeInterval = a.config.GossipWANProbeInterval
|
||||
base.SerfWANConfig.MemberlistConfig.ProbeTimeout = a.config.GossipWANProbeTimeout
|
||||
base.SerfWANConfig.MemberlistConfig.SuspicionMult = a.config.GossipWANSuspicionMult
|
||||
base.SerfWANConfig.MemberlistConfig.RetransmitMult = a.config.GossipWANRetransmitMult
|
||||
} else {
|
||||
// Disable serf WAN federation
|
||||
base.SerfWANConfig = nil
|
||||
|
|
|
@ -579,16 +579,22 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) {
|
|||
ConsulRaftElectionTimeout: consulRaftElectionTimeout,
|
||||
ConsulRaftHeartbeatTimeout: consulRaftHeartbeatTimeout,
|
||||
ConsulRaftLeaderLeaseTimeout: consulRaftLeaderLeaseTimeout,
|
||||
ConsulSerfLANGossipInterval: b.durationVal("consul.serf_lan.gossip_interval", c.Consul.SerfLAN.Memberlist.GossipInterval),
|
||||
ConsulSerfLANProbeInterval: b.durationVal("consul.serf_lan.probe_interval", c.Consul.SerfLAN.Memberlist.ProbeInterval),
|
||||
ConsulSerfLANProbeTimeout: b.durationVal("consul.serf_lan.probe_timeout", c.Consul.SerfLAN.Memberlist.ProbeTimeout),
|
||||
ConsulSerfLANSuspicionMult: b.intVal(c.Consul.SerfLAN.Memberlist.SuspicionMult),
|
||||
ConsulSerfWANGossipInterval: b.durationVal("consul.serf_wan.gossip_interval", c.Consul.SerfWAN.Memberlist.GossipInterval),
|
||||
ConsulSerfWANProbeInterval: b.durationVal("consul.serf_wan.probe_interval", c.Consul.SerfWAN.Memberlist.ProbeInterval),
|
||||
ConsulSerfWANProbeTimeout: b.durationVal("consul.serf_wan.probe_timeout", c.Consul.SerfWAN.Memberlist.ProbeTimeout),
|
||||
ConsulSerfWANSuspicionMult: b.intVal(c.Consul.SerfWAN.Memberlist.SuspicionMult),
|
||||
ConsulServerHealthInterval: b.durationVal("consul.server.health_interval", c.Consul.Server.HealthInterval),
|
||||
|
||||
// gossip configuration: LAN and WAN tunables taken from the gossip_lan and
// gossip_wan blocks. The first argument to durationVal is the dotted key name
// of the setting (same naming scheme as the "consul.*" keys above), so it
// must use a single dot between block and key.
|
||||
GossipLANGossipInterval: b.durationVal("gossip_lan.gossip_interval", c.GossipLAN.GossipInterval),
|
||||
GossipLANGossipNodes: b.intVal(c.GossipLAN.GossipNodes),
|
||||
GossipLANProbeInterval: b.durationVal("gossip_lan.probe_interval", c.GossipLAN.ProbeInterval),
|
||||
GossipLANProbeTimeout: b.durationVal("gossip_lan.probe_timeout", c.GossipLAN.ProbeTimeout),
|
||||
GossipLANSuspicionMult: b.intVal(c.GossipLAN.SuspicionMult),
|
||||
GossipLANRetransmitMult: b.intVal(c.GossipLAN.RetransmitMult),
|
||||
GossipWANGossipInterval: b.durationVal("gossip_wan.gossip_interval", c.GossipWAN.GossipInterval),
|
||||
GossipWANGossipNodes: b.intVal(c.GossipWAN.GossipNodes),
|
||||
GossipWANProbeInterval: b.durationVal("gossip_wan.probe_interval", c.GossipWAN.ProbeInterval),
|
||||
GossipWANProbeTimeout: b.durationVal("gossip_wan.probe_timeout", c.GossipWAN.ProbeTimeout),
|
||||
GossipWANSuspicionMult: b.intVal(c.GossipWAN.SuspicionMult),
|
||||
GossipWANRetransmitMult: b.intVal(c.GossipWAN.RetransmitMult),
|
||||
|
||||
// ACL
|
||||
ACLAgentMasterToken: b.stringVal(c.ACLAgentMasterToken),
|
||||
ACLAgentToken: b.stringVal(c.ACLAgentToken),
|
||||
|
|
|
@ -184,6 +184,8 @@ type Config struct {
|
|||
EncryptKey *string `json:"encrypt,omitempty" hcl:"encrypt" mapstructure:"encrypt"`
|
||||
EncryptVerifyIncoming *bool `json:"encrypt_verify_incoming,omitempty" hcl:"encrypt_verify_incoming" mapstructure:"encrypt_verify_incoming"`
|
||||
EncryptVerifyOutgoing *bool `json:"encrypt_verify_outgoing,omitempty" hcl:"encrypt_verify_outgoing" mapstructure:"encrypt_verify_outgoing"`
|
||||
GossipLAN GossipLANConfig `json:"gossip_lan,omitempty" hcl:"gossip_lan" mapstructure:"gossip_lan"`
|
||||
GossipWAN GossipWANConfig `json:"gossip_wan,omitempty" hcl:"gossip_wan" mapstructure:"gossip_wan"`
|
||||
HTTPConfig HTTPConfig `json:"http_config,omitempty" hcl:"http_config" mapstructure:"http_config"`
|
||||
KeyFile *string `json:"key_file,omitempty" hcl:"key_file" mapstructure:"key_file"`
|
||||
LeaveOnTerm *bool `json:"leave_on_terminate,omitempty" hcl:"leave_on_terminate" mapstructure:"leave_on_terminate"`
|
||||
|
@ -259,6 +261,24 @@ type Config struct {
|
|||
VersionPrerelease *string `json:"version_prerelease,omitempty" hcl:"version_prerelease" mapstructure:"version_prerelease"`
|
||||
}
|
||||
|
||||
// GossipLANConfig holds the LAN gossip tunables read from the "gossip_lan"
// configuration block. Every field is a pointer, so a key that is absent
// from the input is left nil. Intervals and timeouts are carried as strings
// at this stage; the counts and multipliers are plain integers.
type GossipLANConfig struct {
|
||||
GossipNodes *int `json:"gossip_nodes,omitempty" hcl:"gossip_nodes" mapstructure:"gossip_nodes"`
|
||||
GossipInterval *string `json:"gossip_interval,omitempty" hcl:"gossip_interval" mapstructure:"gossip_interval"`
|
||||
ProbeInterval *string `json:"probe_interval,omitempty" hcl:"probe_interval" mapstructure:"probe_interval"`
|
||||
ProbeTimeout *string `json:"probe_timeout,omitempty" hcl:"probe_timeout" mapstructure:"probe_timeout"`
|
||||
SuspicionMult *int `json:"suspicion_mult,omitempty" hcl:"suspicion_mult" mapstructure:"suspicion_mult"`
|
||||
RetransmitMult *int `json:"retransmit_mult,omitempty" hcl:"retransmit_mult" mapstructure:"retransmit_mult"`
|
||||
}
|
||||
|
||||
// GossipWANConfig holds the WAN gossip tunables read from the "gossip_wan"
// configuration block. It has the same keys as the LAN variant. Every field
// is a pointer, so a key that is absent from the input is left nil, and
// intervals and timeouts are carried as strings at this stage.
type GossipWANConfig struct {
|
||||
GossipNodes *int `json:"gossip_nodes,omitempty" hcl:"gossip_nodes" mapstructure:"gossip_nodes"`
|
||||
GossipInterval *string `json:"gossip_interval,omitempty" hcl:"gossip_interval" mapstructure:"gossip_interval"`
|
||||
ProbeInterval *string `json:"probe_interval,omitempty" hcl:"probe_interval" mapstructure:"probe_interval"`
|
||||
ProbeTimeout *string `json:"probe_timeout,omitempty" hcl:"probe_timeout" mapstructure:"probe_timeout"`
|
||||
SuspicionMult *int `json:"suspicion_mult,omitempty" hcl:"suspicion_mult" mapstructure:"suspicion_mult"`
|
||||
RetransmitMult *int `json:"retransmit_mult,omitempty" hcl:"retransmit_mult" mapstructure:"retransmit_mult"`
|
||||
}
|
||||
|
||||
type Consul struct {
|
||||
Coordinate struct {
|
||||
UpdateBatchSize *int `json:"update_batch_size,omitempty" hcl:"update_batch_size" mapstructure:"update_batch_size"`
|
||||
|
@ -272,24 +292,6 @@ type Consul struct {
|
|||
LeaderLeaseTimeout *string `json:"leader_lease_timeout,omitempty" hcl:"leader_lease_timeout" mapstructure:"leader_lease_timeout"`
|
||||
} `json:"raft,omitempty" hcl:"raft" mapstructure:"raft"`
|
||||
|
||||
SerfLAN struct {
|
||||
Memberlist struct {
|
||||
GossipInterval *string `json:"gossip_interval,omitempty" hcl:"gossip_interval" mapstructure:"gossip_interval"`
|
||||
ProbeInterval *string `json:"probe_interval,omitempty" hcl:"probe_interval" mapstructure:"probe_interval"`
|
||||
ProbeTimeout *string `json:"probe_timeout,omitempty" hcl:"probe_timeout" mapstructure:"probe_timeout"`
|
||||
SuspicionMult *int `json:"suspicion_mult,omitempty" hcl:"suspicion_mult" mapstructure:"suspicion_mult"`
|
||||
} `json:"memberlist,omitempty" hcl:"memberlist" mapstructure:"memberlist"`
|
||||
} `json:"serf_lan,omitempty" hcl:"serf_lan" mapstructure:"serf_lan"`
|
||||
|
||||
SerfWAN struct {
|
||||
Memberlist struct {
|
||||
GossipInterval *string `json:"gossip_interval,omitempty" hcl:"gossip_interval" mapstructure:"gossip_interval"`
|
||||
ProbeInterval *string `json:"probe_interval,omitempty" hcl:"probe_interval" mapstructure:"probe_interval"`
|
||||
ProbeTimeout *string `json:"probe_timeout,omitempty" hcl:"probe_timeout" mapstructure:"probe_timeout"`
|
||||
SuspicionMult *int `json:"suspicion_mult,omitempty" hcl:"suspicion_mult" mapstructure:"suspicion_mult"`
|
||||
} `json:"memberlist,omitempty" hcl:"memberlist" mapstructure:"memberlist"`
|
||||
} `json:"serf_wan,omitempty" hcl:"serf_wan" mapstructure:"serf_wan"`
|
||||
|
||||
Server struct {
|
||||
HealthInterval *string `json:"health_interval,omitempty" hcl:"health_interval" mapstructure:"health_interval"`
|
||||
} `json:"server,omitempty" hcl:"server" mapstructure:"server"`
|
||||
|
|
|
@ -26,6 +26,10 @@ func DefaultRPCProtocol() (int, error) {
|
|||
// todo(fs): IMO, this should be the definitive default for all configurable values
|
||||
// todo(fs): and whatever is in here should clobber every default value. Hence, no sourcing.
|
||||
func DefaultSource() Source {
|
||||
cfg := consul.DefaultConfig()
|
||||
serfLAN := cfg.SerfLANConfig.MemberlistConfig
|
||||
serfWAN := cfg.SerfWANConfig.MemberlistConfig
|
||||
|
||||
return Source{
|
||||
Name: "default",
|
||||
Format: "hcl",
|
||||
|
@ -62,6 +66,22 @@ func DefaultSource() Source {
|
|||
max_trailing_logs = 250
|
||||
server_stabilization_time = "10s"
|
||||
}
|
||||
gossip_lan = {
|
||||
gossip_interval = "` + serfLAN.GossipInterval.String() + `"
|
||||
gossip_nodes = ` + strconv.Itoa(serfLAN.GossipNodes) + `
|
||||
retransmit_mult = ` + strconv.Itoa(serfLAN.RetransmitMult) + `
|
||||
probe_interval = "` + serfLAN.ProbeInterval.String() + `"
|
||||
probe_timeout = "` + serfLAN.ProbeTimeout.String() + `"
|
||||
suspicion_mult = ` + strconv.Itoa(serfLAN.SuspicionMult) + `
|
||||
}
|
||||
gossip_wan = {
|
||||
gossip_interval = "` + serfWAN.GossipInterval.String() + `"
|
||||
gossip_nodes = ` + strconv.Itoa(serfWAN.GossipNodes) + `
|
||||
retransmit_mult = ` + strconv.Itoa(serfWAN.RetransmitMult) + `
|
||||
probe_interval = "` + serfWAN.ProbeInterval.String() + `"
|
||||
probe_timeout = "` + serfWAN.ProbeTimeout.String() + `"
|
||||
suspicion_mult = ` + strconv.Itoa(serfWAN.SuspicionMult) + `
|
||||
}
|
||||
dns_config = {
|
||||
allow_stale = true
|
||||
a_record_limit = 0
|
||||
|
@ -92,6 +112,7 @@ func DefaultSource() Source {
|
|||
metrics_prefix = "consul"
|
||||
filter_default = true
|
||||
}
|
||||
|
||||
`,
|
||||
}
|
||||
}
|
||||
|
@ -111,6 +132,18 @@ func DevSource() Source {
|
|||
log_level = "DEBUG"
|
||||
server = true
|
||||
|
||||
gossip_lan = {
|
||||
gossip_interval = "100ms"
|
||||
probe_interval = "100ms"
|
||||
probe_timeout = "100ms"
|
||||
suspicion_mult = 3
|
||||
}
|
||||
gossip_wan = {
|
||||
gossip_interval = "100ms"
|
||||
probe_interval = "100ms"
|
||||
probe_timeout = "100ms"
|
||||
suspicion_mult = 3
|
||||
}
|
||||
connect = {
|
||||
enabled = true
|
||||
}
|
||||
|
@ -166,8 +199,6 @@ func DefaultVersionSource() Source {
|
|||
func DefaultConsulSource() Source {
|
||||
cfg := consul.DefaultConfig()
|
||||
raft := cfg.RaftConfig
|
||||
serfLAN := cfg.SerfLANConfig.MemberlistConfig
|
||||
serfWAN := cfg.SerfWANConfig.MemberlistConfig
|
||||
return Source{
|
||||
Name: "consul",
|
||||
Format: "hcl",
|
||||
|
@ -183,22 +214,6 @@ func DefaultConsulSource() Source {
|
|||
heartbeat_timeout = "` + raft.HeartbeatTimeout.String() + `"
|
||||
leader_lease_timeout = "` + raft.LeaderLeaseTimeout.String() + `"
|
||||
}
|
||||
serf_lan = {
|
||||
memberlist = {
|
||||
gossip_interval = "` + serfLAN.GossipInterval.String() + `"
|
||||
probe_interval = "` + serfLAN.ProbeInterval.String() + `"
|
||||
probe_timeout = "` + serfLAN.ProbeTimeout.String() + `"
|
||||
suspicion_mult = ` + strconv.Itoa(serfLAN.SuspicionMult) + `
|
||||
}
|
||||
}
|
||||
serf_wan = {
|
||||
memberlist = {
|
||||
gossip_interval = "` + serfWAN.GossipInterval.String() + `"
|
||||
probe_interval = "` + serfWAN.ProbeInterval.String() + `"
|
||||
probe_timeout = "` + serfWAN.ProbeTimeout.String() + `"
|
||||
suspicion_mult = ` + strconv.Itoa(serfWAN.SuspicionMult) + `
|
||||
}
|
||||
}
|
||||
server = {
|
||||
health_interval = "` + cfg.ServerHealthInterval.String() + `"
|
||||
}
|
||||
|
@ -223,22 +238,6 @@ func DevConsulSource() Source {
|
|||
heartbeat_timeout = "35ms"
|
||||
leader_lease_timeout = "20ms"
|
||||
}
|
||||
serf_lan = {
|
||||
memberlist = {
|
||||
gossip_interval = "100ms"
|
||||
probe_interval = "100ms"
|
||||
probe_timeout = "100ms"
|
||||
suspicion_mult = 3
|
||||
}
|
||||
}
|
||||
serf_wan = {
|
||||
memberlist = {
|
||||
gossip_interval = "100ms"
|
||||
probe_interval = "100ms"
|
||||
probe_timeout = "100ms"
|
||||
suspicion_mult = 3
|
||||
}
|
||||
}
|
||||
server = {
|
||||
health_interval = "10ms"
|
||||
}
|
||||
|
|
|
@ -47,14 +47,6 @@ type RuntimeConfig struct {
|
|||
ConsulRaftElectionTimeout time.Duration
|
||||
ConsulRaftHeartbeatTimeout time.Duration
|
||||
ConsulRaftLeaderLeaseTimeout time.Duration
|
||||
ConsulSerfLANGossipInterval time.Duration
|
||||
ConsulSerfLANProbeInterval time.Duration
|
||||
ConsulSerfLANProbeTimeout time.Duration
|
||||
ConsulSerfLANSuspicionMult int
|
||||
ConsulSerfWANGossipInterval time.Duration
|
||||
ConsulSerfWANProbeInterval time.Duration
|
||||
ConsulSerfWANProbeTimeout time.Duration
|
||||
ConsulSerfWANSuspicionMult int
|
||||
ConsulServerHealthInterval time.Duration
|
||||
|
||||
// ACLAgentMasterToken is a special token that has full read and write
|
||||
|
@ -964,6 +956,160 @@ type RuntimeConfig struct {
|
|||
// hcl: ports { serf_wan = int }
|
||||
SerfPortWAN int
|
||||
|
||||
// GossipLANGossipInterval is the interval between sending messages that need
|
||||
// to be gossiped that haven't been able to piggyback on probing messages.
|
||||
// If this is set to zero, non-piggyback gossip is disabled. By lowering
|
||||
// this value (more frequent) gossip messages are propagated across
|
||||
// the cluster more quickly at the expense of increased bandwidth. This
|
||||
// configuration only applies to LAN gossip communications
|
||||
//
|
||||
// The default is: 200ms
|
||||
//
|
||||
// hcl: gossip_lan { gossip_interval = duration}
|
||||
GossipLANGossipInterval time.Duration
|
||||
|
||||
// GossipLANGossipNodes is the number of random nodes to send gossip messages to
|
||||
// per GossipInterval. Increasing this number causes the gossip messages to
|
||||
// propagate across the cluster more quickly at the expense of increased
|
||||
// bandwidth. This configuration only applies to LAN gossip communications
|
||||
//
|
||||
// The default is: 3
|
||||
//
|
||||
// hcl: gossip_lan { gossip_nodes = int }
|
||||
GossipLANGossipNodes int
|
||||
|
||||
// GossipLANProbeInterval is the interval between random node probes. Setting
|
||||
// this lower (more frequent) will cause the memberlist cluster to detect
|
||||
// failed nodes more quickly at the expense of increased bandwidth usage.
|
||||
// This configuration only applies to LAN gossip communications
|
||||
//
|
||||
// The default is: 1s
|
||||
//
|
||||
// hcl: gossip_lan { probe_interval = duration }
|
||||
GossipLANProbeInterval time.Duration
|
||||
|
||||
// GossipLANProbeTimeout is the timeout to wait for an ack from a probed node
|
||||
// before assuming it is unhealthy. This should be set to 99-percentile
|
||||
// of RTT (round-trip time) on your network. This configuration
|
||||
// only applies to the LAN gossip communications
|
||||
//
|
||||
// The default is: 500ms
|
||||
//
|
||||
// hcl: gossip_lan { probe_timeout = duration }
|
||||
GossipLANProbeTimeout time.Duration
|
||||
|
||||
// GossipLANSuspicionMult is the multiplier for determining the time an
|
||||
// inaccessible node is considered suspect before declaring it dead. This
|
||||
// configuration only applies to LAN gossip communications
|
||||
//
|
||||
// The actual timeout is calculated using the formula:
|
||||
//
|
||||
// SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval
|
||||
//
|
||||
// This allows the timeout to scale properly with expected propagation
|
||||
// delay with a larger cluster size. The higher the multiplier, the longer
|
||||
// an inaccessible node is considered part of the cluster before declaring
|
||||
// it dead, giving that suspect node more time to refute if it is indeed
|
||||
// still alive.
|
||||
//
|
||||
// The default is: 4
|
||||
//
|
||||
// hcl: gossip_lan { suspicion_mult = int }
|
||||
GossipLANSuspicionMult int
|
||||
|
||||
// GossipLANRetransmitMult is the multiplier for the number of retransmissions
|
||||
// that are attempted for messages broadcasted over gossip. This
|
||||
// configuration only applies to LAN gossip communications. The actual
|
||||
// count of retransmissions is calculated using the formula:
|
||||
//
|
||||
// Retransmits = RetransmitMult * log(N+1)
|
||||
//
|
||||
// This allows the retransmits to scale properly with cluster size. The
|
||||
// higher the multiplier, the more likely a failed broadcast is to converge
|
||||
// at the expense of increased bandwidth.
|
||||
//
|
||||
// The default is: 4
|
||||
//
|
||||
// hcl: gossip_lan { retransmit_mult = int }
|
||||
GossipLANRetransmitMult int
|
||||
|
||||
// GossipWANGossipInterval is the interval between sending messages that need
|
||||
// to be gossiped that haven't been able to piggyback on probing messages.
|
||||
// If this is set to zero, non-piggyback gossip is disabled. By lowering
|
||||
// this value (more frequent) gossip messages are propagated across
|
||||
// the cluster more quickly at the expense of increased bandwidth. This
|
||||
// configuration only applies to WAN gossip communications
|
||||
//
|
||||
// The default is: 500ms
|
||||
//
|
||||
// hcl: gossip_wan { gossip_interval = duration}
|
||||
GossipWANGossipInterval time.Duration
|
||||
|
||||
// GossipWANGossipNodes is the number of random nodes to send gossip messages to
|
||||
// per GossipInterval. Increasing this number causes the gossip messages to
|
||||
// propagate across the cluster more quickly at the expense of increased
|
||||
// bandwidth. This configuration only applies to WAN gossip communications
|
||||
//
|
||||
// The default is: 3
|
||||
//
|
||||
// hcl: gossip_wan { gossip_nodes = int }
|
||||
GossipWANGossipNodes int
|
||||
|
||||
// GossipWANProbeInterval is the interval between random node probes. Setting
|
||||
// this lower (more frequent) will cause the memberlist cluster to detect
|
||||
// failed nodes more quickly at the expense of increased bandwidth usage.
|
||||
// This configuration only applies to WAN gossip communications
|
||||
//
|
||||
// The default is: 5s
|
||||
//
|
||||
// hcl: gossip_wan { probe_interval = duration }
|
||||
GossipWANProbeInterval time.Duration
|
||||
|
||||
// GossipWANProbeTimeout is the timeout to wait for an ack from a probed node
|
||||
// before assuming it is unhealthy. This should be set to 99-percentile
|
||||
// of RTT (round-trip time) on your network. This configuration
|
||||
// only applies to the WAN gossip communications
|
||||
//
|
||||
// The default is: 3s
|
||||
//
|
||||
// hcl: gossip_wan { probe_timeout = duration }
|
||||
GossipWANProbeTimeout time.Duration
|
||||
|
||||
// GossipWANSuspicionMult is the multiplier for determining the time an
|
||||
// inaccessible node is considered suspect before declaring it dead. This
|
||||
// configuration only applies to WAN gossip communications
|
||||
//
|
||||
// The actual timeout is calculated using the formula:
|
||||
//
|
||||
// SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval
|
||||
//
|
||||
// This allows the timeout to scale properly with expected propagation
|
||||
// delay with a larger cluster size. The higher the multiplier, the longer
|
||||
// an inaccessible node is considered part of the cluster before declaring
|
||||
// it dead, giving that suspect node more time to refute if it is indeed
|
||||
// still alive.
|
||||
//
|
||||
// The default is: 6
|
||||
//
|
||||
// hcl: gossip_wan { suspicion_mult = int }
|
||||
GossipWANSuspicionMult int
|
||||
|
||||
// GossipWANRetransmitMult is the multiplier for the number of retransmissions
|
||||
// that are attempted for messages broadcasted over gossip. This
|
||||
// configuration only applies to WAN gossip communications. The actual
|
||||
// count of retransmissions is calculated using the formula:
|
||||
//
|
||||
// Retransmits = RetransmitMult * log(N+1)
|
||||
//
|
||||
// This allows the retransmits to scale properly with cluster size. The
|
||||
// higher the multiplier, the more likely a failed broadcast is to converge
|
||||
// at the expense of increased bandwidth.
|
||||
//
|
||||
// The default is: 4
|
||||
//
|
||||
// hcl: gossip_wan { retransmit_mult = int }
|
||||
GossipWANRetransmitMult int
|
||||
|
||||
// ServerMode controls if this agent acts like a Consul server,
|
||||
// or merely as a client. Servers have more state, take part
|
||||
// in leader election, etc.
|
||||
|
|
|
@ -286,14 +286,14 @@ func TestConfigFlagsAndEdgecases(t *testing.T) {
|
|||
rt.ConsulRaftElectionTimeout = 52 * time.Millisecond
|
||||
rt.ConsulRaftHeartbeatTimeout = 35 * time.Millisecond
|
||||
rt.ConsulRaftLeaderLeaseTimeout = 20 * time.Millisecond
|
||||
rt.ConsulSerfLANGossipInterval = 100 * time.Millisecond
|
||||
rt.ConsulSerfLANProbeInterval = 100 * time.Millisecond
|
||||
rt.ConsulSerfLANProbeTimeout = 100 * time.Millisecond
|
||||
rt.ConsulSerfLANSuspicionMult = 3
|
||||
rt.ConsulSerfWANGossipInterval = 100 * time.Millisecond
|
||||
rt.ConsulSerfWANProbeInterval = 100 * time.Millisecond
|
||||
rt.ConsulSerfWANProbeTimeout = 100 * time.Millisecond
|
||||
rt.ConsulSerfWANSuspicionMult = 3
|
||||
rt.GossipLANGossipInterval = 100 * time.Millisecond
|
||||
rt.GossipLANProbeInterval = 100 * time.Millisecond
|
||||
rt.GossipLANProbeTimeout = 100 * time.Millisecond
|
||||
rt.GossipLANSuspicionMult = 3
|
||||
rt.GossipWANGossipInterval = 100 * time.Millisecond
|
||||
rt.GossipWANProbeInterval = 100 * time.Millisecond
|
||||
rt.GossipWANProbeTimeout = 100 * time.Millisecond
|
||||
rt.GossipWANSuspicionMult = 3
|
||||
rt.ConsulServerHealthInterval = 10 * time.Millisecond
|
||||
},
|
||||
},
|
||||
|
@ -2617,6 +2617,22 @@ func TestFullConfig(t *testing.T) {
|
|||
}
|
||||
}
|
||||
},
|
||||
"gossip_lan" : {
|
||||
"gossip_nodes": 6,
|
||||
"gossip_interval" : "25252s",
|
||||
"retransmit_mult" : 1234,
|
||||
"suspicion_mult" : 1235,
|
||||
"probe_interval" : "101ms",
|
||||
"probe_timeout" : "102ms"
|
||||
},
|
||||
"gossip_wan" : {
|
||||
"gossip_nodes" : 2,
|
||||
"gossip_interval" : "6966s",
|
||||
"retransmit_mult" : 16384,
|
||||
"suspicion_mult" : 16385,
|
||||
"probe_interval" : "103ms",
|
||||
"probe_timeout" : "104ms"
|
||||
},
|
||||
"data_dir": "` + dataDir + `",
|
||||
"datacenter": "rzo029wg",
|
||||
"disable_anonymous_signature": true,
|
||||
|
@ -3092,6 +3108,22 @@ func TestFullConfig(t *testing.T) {
|
|||
}
|
||||
}
|
||||
}
|
||||
gossip_lan {
|
||||
gossip_nodes = 6
|
||||
gossip_interval = "25252s"
|
||||
retransmit_mult = 1234
|
||||
suspicion_mult = 1235
|
||||
probe_interval = "101ms"
|
||||
probe_timeout = "102ms"
|
||||
}
|
||||
gossip_wan {
|
||||
gossip_nodes = 2
|
||||
gossip_interval = "6966s"
|
||||
retransmit_mult = 16384
|
||||
suspicion_mult = 16385
|
||||
probe_interval = "103ms"
|
||||
probe_timeout = "104ms"
|
||||
}
|
||||
data_dir = "` + dataDir + `"
|
||||
datacenter = "rzo029wg"
|
||||
disable_anonymous_signature = true
|
||||
|
@ -3473,22 +3505,6 @@ func TestFullConfig(t *testing.T) {
|
|||
"heartbeat_timeout": "25699s",
|
||||
"leader_lease_timeout": "15351s"
|
||||
},
|
||||
"serf_lan": {
|
||||
"memberlist": {
|
||||
"gossip_interval": "25252s",
|
||||
"probe_interval": "5105s",
|
||||
"probe_timeout": "29179s",
|
||||
"suspicion_mult": 8263
|
||||
}
|
||||
},
|
||||
"serf_wan": {
|
||||
"memberlist": {
|
||||
"gossip_interval": "6966s",
|
||||
"probe_interval": "20148s",
|
||||
"probe_timeout": "3007s",
|
||||
"suspicion_mult": 32096
|
||||
}
|
||||
},
|
||||
"server": {
|
||||
"health_interval": "17455s"
|
||||
}
|
||||
|
@ -3527,22 +3543,6 @@ func TestFullConfig(t *testing.T) {
|
|||
heartbeat_timeout = "25699s"
|
||||
leader_lease_timeout = "15351s"
|
||||
}
|
||||
serf_lan = {
|
||||
memberlist = {
|
||||
gossip_interval = "25252s"
|
||||
probe_interval = "5105s"
|
||||
probe_timeout = "29179s"
|
||||
suspicion_mult = 8263
|
||||
}
|
||||
}
|
||||
serf_wan = {
|
||||
memberlist = {
|
||||
gossip_interval = "6966s"
|
||||
probe_interval = "20148s"
|
||||
probe_timeout = "3007s"
|
||||
suspicion_mult = 32096
|
||||
}
|
||||
}
|
||||
server = {
|
||||
health_interval = "17455s"
|
||||
}
|
||||
|
@ -3574,14 +3574,18 @@ func TestFullConfig(t *testing.T) {
|
|||
ConsulRaftElectionTimeout: 5 * 31947 * time.Second,
|
||||
ConsulRaftHeartbeatTimeout: 5 * 25699 * time.Second,
|
||||
ConsulRaftLeaderLeaseTimeout: 5 * 15351 * time.Second,
|
||||
ConsulSerfLANGossipInterval: 25252 * time.Second,
|
||||
ConsulSerfLANProbeInterval: 5105 * time.Second,
|
||||
ConsulSerfLANProbeTimeout: 29179 * time.Second,
|
||||
ConsulSerfLANSuspicionMult: 8263,
|
||||
ConsulSerfWANGossipInterval: 6966 * time.Second,
|
||||
ConsulSerfWANProbeInterval: 20148 * time.Second,
|
||||
ConsulSerfWANProbeTimeout: 3007 * time.Second,
|
||||
ConsulSerfWANSuspicionMult: 32096,
|
||||
GossipLANGossipInterval: 25252 * time.Second,
|
||||
GossipLANGossipNodes: 6,
|
||||
GossipLANProbeInterval: 101 * time.Millisecond,
|
||||
GossipLANProbeTimeout: 102 * time.Millisecond,
|
||||
GossipLANSuspicionMult: 1235,
|
||||
GossipLANRetransmitMult: 1234,
|
||||
GossipWANGossipInterval: 6966 * time.Second,
|
||||
GossipWANGossipNodes: 2,
|
||||
GossipWANProbeInterval: 103 * time.Millisecond,
|
||||
GossipWANProbeTimeout: 104 * time.Millisecond,
|
||||
GossipWANSuspicionMult: 16385,
|
||||
GossipWANRetransmitMult: 16384,
|
||||
ConsulServerHealthInterval: 17455 * time.Second,
|
||||
|
||||
// user configurable values
|
||||
|
@ -4407,14 +4411,18 @@ func TestSanitize(t *testing.T) {
|
|||
"ConsulRaftElectionTimeout": "0s",
|
||||
"ConsulRaftHeartbeatTimeout": "0s",
|
||||
"ConsulRaftLeaderLeaseTimeout": "0s",
|
||||
"ConsulSerfLANGossipInterval": "0s",
|
||||
"ConsulSerfLANProbeInterval": "0s",
|
||||
"ConsulSerfLANProbeTimeout": "0s",
|
||||
"ConsulSerfLANSuspicionMult": 0,
|
||||
"ConsulSerfWANGossipInterval": "0s",
|
||||
"ConsulSerfWANProbeInterval": "0s",
|
||||
"ConsulSerfWANProbeTimeout": "0s",
|
||||
"ConsulSerfWANSuspicionMult": 0,
|
||||
"GossipLANGossipInterval": "0s",
|
||||
"GossipLANGossipNodes": 0,
|
||||
"GossipLANProbeInterval": "0s",
|
||||
"GossipLANProbeTimeout": "0s",
|
||||
"GossipLANRetransmitMult": 0,
|
||||
"GossipLANSuspicionMult": 0,
|
||||
"GossipWANGossipInterval": "0s",
|
||||
"GossipWANGossipNodes": 0,
|
||||
"GossipWANProbeInterval": "0s",
|
||||
"GossipWANProbeTimeout": "0s",
|
||||
"GossipWANRetransmitMult": 0,
|
||||
"GossipWANSuspicionMult": 0,
|
||||
"ConsulServerHealthInterval": "0s",
|
||||
"DNSARecordLimit": 0,
|
||||
"DNSAddrs": [
|
||||
|
|
|
@ -918,6 +918,76 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been ass
|
|||
* <a name="disable_keyring_file"></a><a href="#disable_keyring_file">`disable_keyring_file`</a> - Equivalent to the
|
||||
[`-disable-keyring-file` command-line flag](#_disable_keyring_file).
|
||||
|
||||
* <a name="gossip_lan"></a><a href="#gossip_lan">`gossip_lan`</a> - **(Advanced)** This object contains a number of sub-keys
|
||||
which can be set to tune the LAN gossip communications. These are only provided for users running especially large
|
||||
clusters that need fine tuning and are prepared to spend significant effort correctly tuning them for their
|
||||
environment and workload. **Tuning these improperly can cause Consul to fail in unexpected ways**.
|
||||
The default values are appropriate in almost all deployments.
|
||||
|
||||
* <a name="gossip_nodes"></a><a href="#gossip_nodes">`gossip_nodes`</a> - The number of random nodes to send
|
||||
gossip messages to per gossip_interval. Increasing this number causes the gossip messages to propagate
|
||||
across the cluster more quickly at the expense of increased bandwidth. The default is 3.
|
||||
|
||||
* <a name="gossip_interval"></a><a href="#gossip_interval">`gossip_interval`</a> - The interval between sending
|
||||
messages that need to be gossiped that haven't been able to piggyback on probing messages. If this is set to
|
||||
zero, non-piggyback gossip is disabled. By lowering this value (more frequent) gossip messages are propagated
|
||||
across the cluster more quickly at the expense of increased bandwidth. The default is 200ms.
|
||||
|
||||
* <a name="probe_interval"></a><a href="#probe_interval">`probe_interval`</a> - The interval between random node
|
||||
probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly
|
||||
at the expense of increased bandwidth usage. The default is 1s.
|
||||
|
||||
* <a name="probe_timeout"></a><a href="#probe_timeout">`probe_timeout`</a> - The timeout to wait for an ack from
|
||||
a probed node before assuming it is unhealthy. This should be at least the 99-percentile of RTT (round-trip time) on
|
||||
your network. The default is 500ms and is a conservative value suitable for almost all realistic deployments.
|
||||
|
||||
* <a name="retransmit_mult"></a><a href="#retransmit_mult">`retransmit_mult`</a> - The multiplier for the number
|
||||
of retransmissions that are attempted for messages broadcasted over gossip. The number of retransmits is scaled
|
||||
using this multiplier and the cluster size. The higher the multiplier, the more likely a failed broadcast is to
|
||||
converge at the expense of increased bandwidth. The default is 4.
|
||||
|
||||
* <a name="suspicion_mult"></a><a href="#suspicion_mult">`suspicion_mult`</a> - The multiplier for determining the
|
||||
time an inaccessible node is considered suspect before declaring it dead. The timeout is scaled with the cluster
|
||||
size and the probe_interval. This allows the timeout to scale properly with expected propagation delay with a
|
||||
larger cluster size. The higher the multiplier, the longer an inaccessible node is considered part of the
|
||||
cluster before declaring it dead, giving that suspect node more time to refute if it is indeed still alive. The
|
||||
default is 4.
|
||||
|
||||
* <a name="gossip_wan"></a><a href="#gossip_wan">`gossip_wan`</a> - **(Advanced)** This object contains a number of sub-keys
|
||||
which can be set to tune the WAN gossip communications. These are only provided for users running especially large
|
||||
clusters that need fine tuning and are prepared to spend significant effort correctly tuning them for their
|
||||
environment and workload. **Tuning these improperly can cause Consul to fail in unexpected ways**.
|
||||
The default values are appropriate in almost all deployments.
|
||||
|
||||
* <a name="gossip_nodes"></a><a href="#gossip_nodes">`gossip_nodes`</a> - The number of random nodes to send
|
||||
gossip messages to per gossip_interval. Increasing this number causes the gossip messages to propagate
|
||||
across the cluster more quickly at the expense of increased bandwidth. The default is 3.
|
||||
|
||||
* <a name="gossip_interval"></a><a href="#gossip_interval">`gossip_interval`</a> - The interval between sending
|
||||
messages that need to be gossiped that haven't been able to piggyback on probing messages. If this is set to
|
||||
zero, non-piggyback gossip is disabled. By lowering this value (more frequent) gossip messages are propagated
|
||||
across the cluster more quickly at the expense of increased bandwidth. The default is 500ms.
|
||||
|
||||
* <a name="probe_interval"></a><a href="#probe_interval">`probe_interval`</a> - The interval between random node
|
||||
probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly
|
||||
at the expense of increased bandwidth usage. The default is 5s.
|
||||
|
||||
* <a name="probe_timeout"></a><a href="#probe_timeout">`probe_timeout`</a> - The timeout to wait for an ack from
|
||||
a probed node before assuming it is unhealthy. This should be at least the 99-percentile of RTT (round-trip time) on
|
||||
your network. The default is 3s and is a conservative value suitable for almost all realistic deployments.
|
||||
|
||||
* <a name="retransmit_mult"></a><a href="#retransmit_mult">`retransmit_mult`</a> - The multiplier for the number
|
||||
of retransmissions that are attempted for messages broadcasted over gossip. The number of retransmits is scaled
|
||||
using this multiplier and the cluster size. The higher the multiplier, the more likely a failed broadcast is to
|
||||
converge at the expense of increased bandwidth. The default is 4.
|
||||
|
||||
* <a name="suspicion_mult"></a><a href="#suspicion_mult">`suspicion_mult`</a> - The multiplier for determining the
|
||||
time an inaccessible node is considered suspect before declaring it dead. The timeout is scaled with the cluster
|
||||
size and the probe_interval. This allows the timeout to scale properly with expected propagation delay with a
|
||||
larger cluster size. The higher the multiplier, the longer an inaccessible node is considered part of the
|
||||
cluster before declaring it dead, giving that suspect node more time to refute if it is indeed still alive. The
|
||||
default is 6.
|
||||
|
||||
* <a name="key_file"></a><a href="#key_file">`key_file`</a> This provides the file path to a
|
||||
PEM-encoded private key. The key is used with the certificate to verify the agent's authenticity.
|
||||
This must be provided along with [`cert_file`](#cert_file).
|
||||
|
|
Loading…
Reference in New Issue