Gossip tuneables (#4444)

Expose a few gossip tuneables for both lan and wan interfaces

gossip_nodes
gossip_interval
probe_timeout
probe_interval
retransmit_mult
suspicion_mult
This commit is contained in:
Matt Keeler 2018-07-26 11:39:49 -04:00 committed by GitHub
parent 567d9eedf6
commit 5c7c58ed26
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 367 additions and 132 deletions

View File

@ -782,10 +782,12 @@ func (a *Agent) consulConfig() (*consul.Config, error) {
base.SerfLANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrLAN.Port base.SerfLANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrLAN.Port
base.SerfLANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming base.SerfLANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming
base.SerfLANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing base.SerfLANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing
base.SerfLANConfig.MemberlistConfig.GossipInterval = a.config.ConsulSerfLANGossipInterval base.SerfLANConfig.MemberlistConfig.GossipInterval = a.config.GossipLANGossipInterval
base.SerfLANConfig.MemberlistConfig.ProbeInterval = a.config.ConsulSerfLANProbeInterval base.SerfLANConfig.MemberlistConfig.GossipNodes = a.config.GossipLANGossipNodes
base.SerfLANConfig.MemberlistConfig.ProbeTimeout = a.config.ConsulSerfLANProbeTimeout base.SerfLANConfig.MemberlistConfig.ProbeInterval = a.config.GossipLANProbeInterval
base.SerfLANConfig.MemberlistConfig.SuspicionMult = a.config.ConsulSerfLANSuspicionMult base.SerfLANConfig.MemberlistConfig.ProbeTimeout = a.config.GossipLANProbeTimeout
base.SerfLANConfig.MemberlistConfig.SuspicionMult = a.config.GossipLANSuspicionMult
base.SerfLANConfig.MemberlistConfig.RetransmitMult = a.config.GossipLANRetransmitMult
if a.config.SerfBindAddrWAN != nil { if a.config.SerfBindAddrWAN != nil {
base.SerfWANConfig.MemberlistConfig.BindAddr = a.config.SerfBindAddrWAN.IP.String() base.SerfWANConfig.MemberlistConfig.BindAddr = a.config.SerfBindAddrWAN.IP.String()
@ -794,10 +796,12 @@ func (a *Agent) consulConfig() (*consul.Config, error) {
base.SerfWANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrWAN.Port base.SerfWANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrWAN.Port
base.SerfWANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming base.SerfWANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming
base.SerfWANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing base.SerfWANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing
base.SerfWANConfig.MemberlistConfig.GossipInterval = a.config.ConsulSerfWANGossipInterval base.SerfWANConfig.MemberlistConfig.GossipInterval = a.config.GossipWANGossipInterval
base.SerfWANConfig.MemberlistConfig.ProbeInterval = a.config.ConsulSerfWANProbeInterval base.SerfWANConfig.MemberlistConfig.GossipNodes = a.config.GossipWANGossipNodes
base.SerfWANConfig.MemberlistConfig.ProbeTimeout = a.config.ConsulSerfWANProbeTimeout base.SerfWANConfig.MemberlistConfig.ProbeInterval = a.config.GossipWANProbeInterval
base.SerfWANConfig.MemberlistConfig.SuspicionMult = a.config.ConsulSerfWANSuspicionMult base.SerfWANConfig.MemberlistConfig.ProbeTimeout = a.config.GossipWANProbeTimeout
base.SerfWANConfig.MemberlistConfig.SuspicionMult = a.config.GossipWANSuspicionMult
base.SerfWANConfig.MemberlistConfig.RetransmitMult = a.config.GossipWANRetransmitMult
} else { } else {
// Disable serf WAN federation // Disable serf WAN federation
base.SerfWANConfig = nil base.SerfWANConfig = nil

View File

@ -579,16 +579,22 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) {
ConsulRaftElectionTimeout: consulRaftElectionTimeout, ConsulRaftElectionTimeout: consulRaftElectionTimeout,
ConsulRaftHeartbeatTimeout: consulRaftHeartbeatTimeout, ConsulRaftHeartbeatTimeout: consulRaftHeartbeatTimeout,
ConsulRaftLeaderLeaseTimeout: consulRaftLeaderLeaseTimeout, ConsulRaftLeaderLeaseTimeout: consulRaftLeaderLeaseTimeout,
ConsulSerfLANGossipInterval: b.durationVal("consul.serf_lan.gossip_interval", c.Consul.SerfLAN.Memberlist.GossipInterval),
ConsulSerfLANProbeInterval: b.durationVal("consul.serf_lan.probe_interval", c.Consul.SerfLAN.Memberlist.ProbeInterval),
ConsulSerfLANProbeTimeout: b.durationVal("consul.serf_lan.probe_timeout", c.Consul.SerfLAN.Memberlist.ProbeTimeout),
ConsulSerfLANSuspicionMult: b.intVal(c.Consul.SerfLAN.Memberlist.SuspicionMult),
ConsulSerfWANGossipInterval: b.durationVal("consul.serf_wan.gossip_interval", c.Consul.SerfWAN.Memberlist.GossipInterval),
ConsulSerfWANProbeInterval: b.durationVal("consul.serf_wan.probe_interval", c.Consul.SerfWAN.Memberlist.ProbeInterval),
ConsulSerfWANProbeTimeout: b.durationVal("consul.serf_wan.probe_timeout", c.Consul.SerfWAN.Memberlist.ProbeTimeout),
ConsulSerfWANSuspicionMult: b.intVal(c.Consul.SerfWAN.Memberlist.SuspicionMult),
ConsulServerHealthInterval: b.durationVal("consul.server.health_interval", c.Consul.Server.HealthInterval), ConsulServerHealthInterval: b.durationVal("consul.server.health_interval", c.Consul.Server.HealthInterval),
// gossip configuration
//
// LAN and WAN gossip tuneables, read from the top-level gossip_lan and
// gossip_wan config objects. The name handed to durationVal is the dotted
// path of the config key, in the same single-dot form used for the other
// duration settings in this literal (e.g. "consul.server.health_interval").
GossipLANGossipInterval: b.durationVal("gossip_lan.gossip_interval", c.GossipLAN.GossipInterval),
GossipLANGossipNodes: b.intVal(c.GossipLAN.GossipNodes),
GossipLANProbeInterval: b.durationVal("gossip_lan.probe_interval", c.GossipLAN.ProbeInterval),
GossipLANProbeTimeout: b.durationVal("gossip_lan.probe_timeout", c.GossipLAN.ProbeTimeout),
GossipLANSuspicionMult: b.intVal(c.GossipLAN.SuspicionMult),
GossipLANRetransmitMult: b.intVal(c.GossipLAN.RetransmitMult),
GossipWANGossipInterval: b.durationVal("gossip_wan.gossip_interval", c.GossipWAN.GossipInterval),
GossipWANGossipNodes: b.intVal(c.GossipWAN.GossipNodes),
GossipWANProbeInterval: b.durationVal("gossip_wan.probe_interval", c.GossipWAN.ProbeInterval),
GossipWANProbeTimeout: b.durationVal("gossip_wan.probe_timeout", c.GossipWAN.ProbeTimeout),
GossipWANSuspicionMult: b.intVal(c.GossipWAN.SuspicionMult),
GossipWANRetransmitMult: b.intVal(c.GossipWAN.RetransmitMult),
// ACL // ACL
ACLAgentMasterToken: b.stringVal(c.ACLAgentMasterToken), ACLAgentMasterToken: b.stringVal(c.ACLAgentMasterToken),
ACLAgentToken: b.stringVal(c.ACLAgentToken), ACLAgentToken: b.stringVal(c.ACLAgentToken),

View File

@ -184,6 +184,8 @@ type Config struct {
EncryptKey *string `json:"encrypt,omitempty" hcl:"encrypt" mapstructure:"encrypt"` EncryptKey *string `json:"encrypt,omitempty" hcl:"encrypt" mapstructure:"encrypt"`
EncryptVerifyIncoming *bool `json:"encrypt_verify_incoming,omitempty" hcl:"encrypt_verify_incoming" mapstructure:"encrypt_verify_incoming"` EncryptVerifyIncoming *bool `json:"encrypt_verify_incoming,omitempty" hcl:"encrypt_verify_incoming" mapstructure:"encrypt_verify_incoming"`
EncryptVerifyOutgoing *bool `json:"encrypt_verify_outgoing,omitempty" hcl:"encrypt_verify_outgoing" mapstructure:"encrypt_verify_outgoing"` EncryptVerifyOutgoing *bool `json:"encrypt_verify_outgoing,omitempty" hcl:"encrypt_verify_outgoing" mapstructure:"encrypt_verify_outgoing"`
GossipLAN GossipLANConfig `json:"gossip_lan,omitempty" hcl:"gossip_lan" mapstructure:"gossip_lan"`
GossipWAN GossipWANConfig `json:"gossip_wan,omitempty" hcl:"gossip_wan" mapstructure:"gossip_wan"`
HTTPConfig HTTPConfig `json:"http_config,omitempty" hcl:"http_config" mapstructure:"http_config"` HTTPConfig HTTPConfig `json:"http_config,omitempty" hcl:"http_config" mapstructure:"http_config"`
KeyFile *string `json:"key_file,omitempty" hcl:"key_file" mapstructure:"key_file"` KeyFile *string `json:"key_file,omitempty" hcl:"key_file" mapstructure:"key_file"`
LeaveOnTerm *bool `json:"leave_on_terminate,omitempty" hcl:"leave_on_terminate" mapstructure:"leave_on_terminate"` LeaveOnTerm *bool `json:"leave_on_terminate,omitempty" hcl:"leave_on_terminate" mapstructure:"leave_on_terminate"`
@ -259,6 +261,24 @@ type Config struct {
VersionPrerelease *string `json:"version_prerelease,omitempty" hcl:"version_prerelease" mapstructure:"version_prerelease"` VersionPrerelease *string `json:"version_prerelease,omitempty" hcl:"version_prerelease" mapstructure:"version_prerelease"`
} }
// GossipLANConfig holds the tuneables of the `gossip_lan` configuration
// object as they appear in a config source. Every field is a pointer and is
// tagged omitempty.
// NOTE(review): presumably nil means "not set in this source" — confirm
// against the merge logic.
type GossipLANConfig struct {
// GossipNodes is the `gossip_nodes` key.
GossipNodes *int `json:"gossip_nodes,omitempty" hcl:"gossip_nodes" mapstructure:"gossip_nodes"`
// GossipInterval is the `gossip_interval` key, kept as a string at this
// layer; the config builder turns it into a time.Duration via durationVal.
GossipInterval *string `json:"gossip_interval,omitempty" hcl:"gossip_interval" mapstructure:"gossip_interval"`
// ProbeInterval is the `probe_interval` key (duration string, see above).
ProbeInterval *string `json:"probe_interval,omitempty" hcl:"probe_interval" mapstructure:"probe_interval"`
// ProbeTimeout is the `probe_timeout` key (duration string, see above).
ProbeTimeout *string `json:"probe_timeout,omitempty" hcl:"probe_timeout" mapstructure:"probe_timeout"`
// SuspicionMult is the `suspicion_mult` key.
SuspicionMult *int `json:"suspicion_mult,omitempty" hcl:"suspicion_mult" mapstructure:"suspicion_mult"`
// RetransmitMult is the `retransmit_mult` key.
RetransmitMult *int `json:"retransmit_mult,omitempty" hcl:"retransmit_mult" mapstructure:"retransmit_mult"`
}
// GossipWANConfig holds the tuneables of the `gossip_wan` configuration
// object as they appear in a config source. It has the same keys and field
// types as the LAN variant but is a distinct type. Every field is a pointer
// and is tagged omitempty.
// NOTE(review): presumably nil means "not set in this source" — confirm
// against the merge logic.
type GossipWANConfig struct {
// GossipNodes is the `gossip_nodes` key.
GossipNodes *int `json:"gossip_nodes,omitempty" hcl:"gossip_nodes" mapstructure:"gossip_nodes"`
// GossipInterval is the `gossip_interval` key, kept as a string at this
// layer; the config builder turns it into a time.Duration via durationVal.
GossipInterval *string `json:"gossip_interval,omitempty" hcl:"gossip_interval" mapstructure:"gossip_interval"`
// ProbeInterval is the `probe_interval` key (duration string, see above).
ProbeInterval *string `json:"probe_interval,omitempty" hcl:"probe_interval" mapstructure:"probe_interval"`
// ProbeTimeout is the `probe_timeout` key (duration string, see above).
ProbeTimeout *string `json:"probe_timeout,omitempty" hcl:"probe_timeout" mapstructure:"probe_timeout"`
// SuspicionMult is the `suspicion_mult` key.
SuspicionMult *int `json:"suspicion_mult,omitempty" hcl:"suspicion_mult" mapstructure:"suspicion_mult"`
// RetransmitMult is the `retransmit_mult` key.
RetransmitMult *int `json:"retransmit_mult,omitempty" hcl:"retransmit_mult" mapstructure:"retransmit_mult"`
}
type Consul struct { type Consul struct {
Coordinate struct { Coordinate struct {
UpdateBatchSize *int `json:"update_batch_size,omitempty" hcl:"update_batch_size" mapstructure:"update_batch_size"` UpdateBatchSize *int `json:"update_batch_size,omitempty" hcl:"update_batch_size" mapstructure:"update_batch_size"`
@ -272,24 +292,6 @@ type Consul struct {
LeaderLeaseTimeout *string `json:"leader_lease_timeout,omitempty" hcl:"leader_lease_timeout" mapstructure:"leader_lease_timeout"` LeaderLeaseTimeout *string `json:"leader_lease_timeout,omitempty" hcl:"leader_lease_timeout" mapstructure:"leader_lease_timeout"`
} `json:"raft,omitempty" hcl:"raft" mapstructure:"raft"` } `json:"raft,omitempty" hcl:"raft" mapstructure:"raft"`
SerfLAN struct {
Memberlist struct {
GossipInterval *string `json:"gossip_interval,omitempty" hcl:"gossip_interval" mapstructure:"gossip_interval"`
ProbeInterval *string `json:"probe_interval,omitempty" hcl:"probe_interval" mapstructure:"probe_interval"`
ProbeTimeout *string `json:"probe_timeout,omitempty" hcl:"probe_timeout" mapstructure:"probe_timeout"`
SuspicionMult *int `json:"suspicion_mult,omitempty" hcl:"suspicion_mult" mapstructure:"suspicion_mult"`
} `json:"memberlist,omitempty" hcl:"memberlist" mapstructure:"memberlist"`
} `json:"serf_lan,omitempty" hcl:"serf_lan" mapstructure:"serf_lan"`
SerfWAN struct {
Memberlist struct {
GossipInterval *string `json:"gossip_interval,omitempty" hcl:"gossip_interval" mapstructure:"gossip_interval"`
ProbeInterval *string `json:"probe_interval,omitempty" hcl:"probe_interval" mapstructure:"probe_interval"`
ProbeTimeout *string `json:"probe_timeout,omitempty" hcl:"probe_timeout" mapstructure:"probe_timeout"`
SuspicionMult *int `json:"suspicion_mult,omitempty" hcl:"suspicion_mult" mapstructure:"suspicion_mult"`
} `json:"memberlist,omitempty" hcl:"memberlist" mapstructure:"memberlist"`
} `json:"serf_wan,omitempty" hcl:"serf_wan" mapstructure:"serf_wan"`
Server struct { Server struct {
HealthInterval *string `json:"health_interval,omitempty" hcl:"health_interval" mapstructure:"health_interval"` HealthInterval *string `json:"health_interval,omitempty" hcl:"health_interval" mapstructure:"health_interval"`
} `json:"server,omitempty" hcl:"server" mapstructure:"server"` } `json:"server,omitempty" hcl:"server" mapstructure:"server"`

View File

@ -26,6 +26,10 @@ func DefaultRPCProtocol() (int, error) {
// todo(fs): IMO, this should be the definitive default for all configurable values // todo(fs): IMO, this should be the definitive default for all configurable values
// todo(fs): and whatever is in here should clobber every default value. Hence, no sourcing. // todo(fs): and whatever is in here should clobber every default value. Hence, no sourcing.
func DefaultSource() Source { func DefaultSource() Source {
cfg := consul.DefaultConfig()
serfLAN := cfg.SerfLANConfig.MemberlistConfig
serfWAN := cfg.SerfWANConfig.MemberlistConfig
return Source{ return Source{
Name: "default", Name: "default",
Format: "hcl", Format: "hcl",
@ -62,6 +66,22 @@ func DefaultSource() Source {
max_trailing_logs = 250 max_trailing_logs = 250
server_stabilization_time = "10s" server_stabilization_time = "10s"
} }
gossip_lan = {
gossip_interval = "` + serfLAN.GossipInterval.String() + `"
gossip_nodes = ` + strconv.Itoa(serfLAN.GossipNodes) + `
retransmit_mult = ` + strconv.Itoa(serfLAN.RetransmitMult) + `
probe_interval = "` + serfLAN.ProbeInterval.String() + `"
probe_timeout = "` + serfLAN.ProbeTimeout.String() + `"
suspicion_mult = ` + strconv.Itoa(serfLAN.SuspicionMult) + `
}
gossip_wan = {
gossip_interval = "` + serfWAN.GossipInterval.String() + `"
gossip_nodes = ` + strconv.Itoa(serfWAN.GossipNodes) + `
retransmit_mult = ` + strconv.Itoa(serfWAN.RetransmitMult) + `
probe_interval = "` + serfWAN.ProbeInterval.String() + `"
probe_timeout = "` + serfWAN.ProbeTimeout.String() + `"
suspicion_mult = ` + strconv.Itoa(serfWAN.SuspicionMult) + `
}
dns_config = { dns_config = {
allow_stale = true allow_stale = true
a_record_limit = 0 a_record_limit = 0
@ -92,6 +112,7 @@ func DefaultSource() Source {
metrics_prefix = "consul" metrics_prefix = "consul"
filter_default = true filter_default = true
} }
`, `,
} }
} }
@ -111,6 +132,18 @@ func DevSource() Source {
log_level = "DEBUG" log_level = "DEBUG"
server = true server = true
gossip_lan = {
gossip_interval = "100ms"
probe_interval = "100ms"
probe_timeout = "100ms"
suspicion_mult = 3
}
gossip_wan = {
gossip_interval = "100ms"
probe_interval = "100ms"
probe_timeout = "100ms"
suspicion_mult = 3
}
connect = { connect = {
enabled = true enabled = true
} }
@ -166,8 +199,6 @@ func DefaultVersionSource() Source {
func DefaultConsulSource() Source { func DefaultConsulSource() Source {
cfg := consul.DefaultConfig() cfg := consul.DefaultConfig()
raft := cfg.RaftConfig raft := cfg.RaftConfig
serfLAN := cfg.SerfLANConfig.MemberlistConfig
serfWAN := cfg.SerfWANConfig.MemberlistConfig
return Source{ return Source{
Name: "consul", Name: "consul",
Format: "hcl", Format: "hcl",
@ -183,22 +214,6 @@ func DefaultConsulSource() Source {
heartbeat_timeout = "` + raft.HeartbeatTimeout.String() + `" heartbeat_timeout = "` + raft.HeartbeatTimeout.String() + `"
leader_lease_timeout = "` + raft.LeaderLeaseTimeout.String() + `" leader_lease_timeout = "` + raft.LeaderLeaseTimeout.String() + `"
} }
serf_lan = {
memberlist = {
gossip_interval = "` + serfLAN.GossipInterval.String() + `"
probe_interval = "` + serfLAN.ProbeInterval.String() + `"
probe_timeout = "` + serfLAN.ProbeTimeout.String() + `"
suspicion_mult = ` + strconv.Itoa(serfLAN.SuspicionMult) + `
}
}
serf_wan = {
memberlist = {
gossip_interval = "` + serfWAN.GossipInterval.String() + `"
probe_interval = "` + serfWAN.ProbeInterval.String() + `"
probe_timeout = "` + serfWAN.ProbeTimeout.String() + `"
suspicion_mult = ` + strconv.Itoa(serfWAN.SuspicionMult) + `
}
}
server = { server = {
health_interval = "` + cfg.ServerHealthInterval.String() + `" health_interval = "` + cfg.ServerHealthInterval.String() + `"
} }
@ -223,22 +238,6 @@ func DevConsulSource() Source {
heartbeat_timeout = "35ms" heartbeat_timeout = "35ms"
leader_lease_timeout = "20ms" leader_lease_timeout = "20ms"
} }
serf_lan = {
memberlist = {
gossip_interval = "100ms"
probe_interval = "100ms"
probe_timeout = "100ms"
suspicion_mult = 3
}
}
serf_wan = {
memberlist = {
gossip_interval = "100ms"
probe_interval = "100ms"
probe_timeout = "100ms"
suspicion_mult = 3
}
}
server = { server = {
health_interval = "10ms" health_interval = "10ms"
} }

View File

@ -47,14 +47,6 @@ type RuntimeConfig struct {
ConsulRaftElectionTimeout time.Duration ConsulRaftElectionTimeout time.Duration
ConsulRaftHeartbeatTimeout time.Duration ConsulRaftHeartbeatTimeout time.Duration
ConsulRaftLeaderLeaseTimeout time.Duration ConsulRaftLeaderLeaseTimeout time.Duration
ConsulSerfLANGossipInterval time.Duration
ConsulSerfLANProbeInterval time.Duration
ConsulSerfLANProbeTimeout time.Duration
ConsulSerfLANSuspicionMult int
ConsulSerfWANGossipInterval time.Duration
ConsulSerfWANProbeInterval time.Duration
ConsulSerfWANProbeTimeout time.Duration
ConsulSerfWANSuspicionMult int
ConsulServerHealthInterval time.Duration ConsulServerHealthInterval time.Duration
// ACLAgentMasterToken is a special token that has full read and write // ACLAgentMasterToken is a special token that has full read and write
@ -964,6 +956,160 @@ type RuntimeConfig struct {
// hcl: ports { serf_wan = int } // hcl: ports { serf_wan = int }
SerfPortWAN int SerfPortWAN int
// GossipLANGossipInterval is the interval between sending messages that need
// to be gossiped that haven't been able to piggyback on probing messages.
// If this is set to zero, non-piggyback gossip is disabled. By lowering
// this value (more frequent) gossip messages are propagated across
// the cluster more quickly at the expense of increased bandwidth. This
// configuration only applies to LAN gossip communications
//
// The default is: 200ms
//
// hcl: gossip_lan { gossip_interval = duration}
GossipLANGossipInterval time.Duration
// GossipLANGossipNodes is the number of random nodes to send gossip messages to
// per GossipInterval. Increasing this number causes the gossip messages to
// propagate across the cluster more quickly at the expense of increased
// bandwidth. This configuration only applies to LAN gossip communications
//
// The default is: 3
//
// hcl: gossip_lan { gossip_nodes = int }
GossipLANGossipNodes int
// GossipLANProbeInterval is the interval between random node probes. Setting
// this lower (more frequent) will cause the memberlist cluster to detect
// failed nodes more quickly at the expense of increased bandwidth usage.
// This configuration only applies to LAN gossip communications
//
// The default is: 1s
//
// hcl: gossip_lan { probe_interval = duration }
GossipLANProbeInterval time.Duration
// GossipLANProbeTimeout is the timeout to wait for an ack from a probed node
// before assuming it is unhealthy. This should be set to 99-percentile
// of RTT (round-trip time) on your network. This configuration
// only applies to the LAN gossip communications
//
// The default is: 500ms
//
// hcl: gossip_lan { probe_timeout = duration }
GossipLANProbeTimeout time.Duration
// GossipLANSuspicionMult is the multiplier for determining the time an
// inaccessible node is considered suspect before declaring it dead. This
// configuration only applies to LAN gossip communications
//
// The actual timeout is calculated using the formula:
//
// SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval
//
// This allows the timeout to scale properly with expected propagation
// delay with a larger cluster size. The higher the multiplier, the longer
// an inaccessible node is considered part of the cluster before declaring
// it dead, giving that suspect node more time to refute if it is indeed
// still alive.
//
// The default is: 4
//
// hcl: gossip_lan { suspicion_mult = int }
GossipLANSuspicionMult int
// GossipLANRetransmitMult is the multiplier for the number of retransmissions
// that are attempted for messages broadcasted over gossip. This
// configuration only applies to LAN gossip communications. The actual
// count of retransmissions is calculated using the formula:
//
// Retransmits = RetransmitMult * log(N+1)
//
// This allows the retransmits to scale properly with cluster size. The
// higher the multiplier, the more likely a failed broadcast is to converge
// at the expense of increased bandwidth.
//
// The default is: 4
//
// hcl: gossip_lan { retransmit_mult = int }
GossipLANRetransmitMult int
// GossipWANGossipInterval is the interval between sending messages that need
// to be gossiped that haven't been able to piggyback on probing messages.
// If this is set to zero, non-piggyback gossip is disabled. By lowering
// this value (more frequent) gossip messages are propagated across
// the cluster more quickly at the expense of increased bandwidth. This
// configuration only applies to WAN gossip communications
//
// The default is: 500ms
//
// hcl: gossip_wan { gossip_interval = duration}
GossipWANGossipInterval time.Duration
// GossipWANGossipNodes is the number of random nodes to send gossip messages to
// per GossipInterval. Increasing this number causes the gossip messages to
// propagate across the cluster more quickly at the expense of increased
// bandwidth. This configuration only applies to WAN gossip communications
//
// The default is: 3
//
// hcl: gossip_wan { gossip_nodes = int }
GossipWANGossipNodes int
// GossipWANProbeInterval is the interval between random node probes. Setting
// this lower (more frequent) will cause the memberlist cluster to detect
// failed nodes more quickly at the expense of increased bandwidth usage.
// This configuration only applies to WAN gossip communications
//
// The default is: 5s
//
// hcl: gossip_wan { probe_interval = duration }
GossipWANProbeInterval time.Duration
// GossipWANProbeTimeout is the timeout to wait for an ack from a probed node
// before assuming it is unhealthy. This should be set to 99-percentile
// of RTT (round-trip time) on your network. This configuration
// only applies to the WAN gossip communications
//
// The default is: 3s
//
// hcl: gossip_wan { probe_timeout = duration }
GossipWANProbeTimeout time.Duration
// GossipWANSuspicionMult is the multiplier for determining the time an
// inaccessible node is considered suspect before declaring it dead. This
// configuration only applies to WAN gossip communications
//
// The actual timeout is calculated using the formula:
//
// SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval
//
// This allows the timeout to scale properly with expected propagation
// delay with a larger cluster size. The higher the multiplier, the longer
// an inaccessible node is considered part of the cluster before declaring
// it dead, giving that suspect node more time to refute if it is indeed
// still alive.
//
// The default is: 6
//
// hcl: gossip_wan { suspicion_mult = int }
GossipWANSuspicionMult int
// GossipWANRetransmitMult is the multiplier for the number of retransmissions
// that are attempted for messages broadcasted over gossip. This
// configuration only applies to WAN gossip communications. The actual
// count of retransmissions is calculated using the formula:
//
// Retransmits = RetransmitMult * log(N+1)
//
// This allows the retransmits to scale properly with cluster size. The
// higher the multiplier, the more likely a failed broadcast is to converge
// at the expense of increased bandwidth.
//
// The default is: 4
//
// hcl: gossip_wan { retransmit_mult = int }
GossipWANRetransmitMult int
// ServerMode controls if this agent acts like a Consul server, // ServerMode controls if this agent acts like a Consul server,
// or merely as a client. Servers have more state, take part // or merely as a client. Servers have more state, take part
// in leader election, etc. // in leader election, etc.

View File

@ -286,14 +286,14 @@ func TestConfigFlagsAndEdgecases(t *testing.T) {
rt.ConsulRaftElectionTimeout = 52 * time.Millisecond rt.ConsulRaftElectionTimeout = 52 * time.Millisecond
rt.ConsulRaftHeartbeatTimeout = 35 * time.Millisecond rt.ConsulRaftHeartbeatTimeout = 35 * time.Millisecond
rt.ConsulRaftLeaderLeaseTimeout = 20 * time.Millisecond rt.ConsulRaftLeaderLeaseTimeout = 20 * time.Millisecond
rt.ConsulSerfLANGossipInterval = 100 * time.Millisecond rt.GossipLANGossipInterval = 100 * time.Millisecond
rt.ConsulSerfLANProbeInterval = 100 * time.Millisecond rt.GossipLANProbeInterval = 100 * time.Millisecond
rt.ConsulSerfLANProbeTimeout = 100 * time.Millisecond rt.GossipLANProbeTimeout = 100 * time.Millisecond
rt.ConsulSerfLANSuspicionMult = 3 rt.GossipLANSuspicionMult = 3
rt.ConsulSerfWANGossipInterval = 100 * time.Millisecond rt.GossipWANGossipInterval = 100 * time.Millisecond
rt.ConsulSerfWANProbeInterval = 100 * time.Millisecond rt.GossipWANProbeInterval = 100 * time.Millisecond
rt.ConsulSerfWANProbeTimeout = 100 * time.Millisecond rt.GossipWANProbeTimeout = 100 * time.Millisecond
rt.ConsulSerfWANSuspicionMult = 3 rt.GossipWANSuspicionMult = 3
rt.ConsulServerHealthInterval = 10 * time.Millisecond rt.ConsulServerHealthInterval = 10 * time.Millisecond
}, },
}, },
@ -2617,6 +2617,22 @@ func TestFullConfig(t *testing.T) {
} }
} }
}, },
"gossip_lan" : {
"gossip_nodes": 6,
"gossip_interval" : "25252s",
"retransmit_mult" : 1234,
"suspicion_mult" : 1235,
"probe_interval" : "101ms",
"probe_timeout" : "102ms"
},
"gossip_wan" : {
"gossip_nodes" : 2,
"gossip_interval" : "6966s",
"retransmit_mult" : 16384,
"suspicion_mult" : 16385,
"probe_interval" : "103ms",
"probe_timeout" : "104ms"
},
"data_dir": "` + dataDir + `", "data_dir": "` + dataDir + `",
"datacenter": "rzo029wg", "datacenter": "rzo029wg",
"disable_anonymous_signature": true, "disable_anonymous_signature": true,
@ -3092,6 +3108,22 @@ func TestFullConfig(t *testing.T) {
} }
} }
} }
gossip_lan {
gossip_nodes = 6
gossip_interval = "25252s"
retransmit_mult = 1234
suspicion_mult = 1235
probe_interval = "101ms"
probe_timeout = "102ms"
}
gossip_wan {
gossip_nodes = 2
gossip_interval = "6966s"
retransmit_mult = 16384
suspicion_mult = 16385
probe_interval = "103ms"
probe_timeout = "104ms"
}
data_dir = "` + dataDir + `" data_dir = "` + dataDir + `"
datacenter = "rzo029wg" datacenter = "rzo029wg"
disable_anonymous_signature = true disable_anonymous_signature = true
@ -3473,22 +3505,6 @@ func TestFullConfig(t *testing.T) {
"heartbeat_timeout": "25699s", "heartbeat_timeout": "25699s",
"leader_lease_timeout": "15351s" "leader_lease_timeout": "15351s"
}, },
"serf_lan": {
"memberlist": {
"gossip_interval": "25252s",
"probe_interval": "5105s",
"probe_timeout": "29179s",
"suspicion_mult": 8263
}
},
"serf_wan": {
"memberlist": {
"gossip_interval": "6966s",
"probe_interval": "20148s",
"probe_timeout": "3007s",
"suspicion_mult": 32096
}
},
"server": { "server": {
"health_interval": "17455s" "health_interval": "17455s"
} }
@ -3527,22 +3543,6 @@ func TestFullConfig(t *testing.T) {
heartbeat_timeout = "25699s" heartbeat_timeout = "25699s"
leader_lease_timeout = "15351s" leader_lease_timeout = "15351s"
} }
serf_lan = {
memberlist = {
gossip_interval = "25252s"
probe_interval = "5105s"
probe_timeout = "29179s"
suspicion_mult = 8263
}
}
serf_wan = {
memberlist = {
gossip_interval = "6966s"
probe_interval = "20148s"
probe_timeout = "3007s"
suspicion_mult = 32096
}
}
server = { server = {
health_interval = "17455s" health_interval = "17455s"
} }
@ -3574,14 +3574,18 @@ func TestFullConfig(t *testing.T) {
ConsulRaftElectionTimeout: 5 * 31947 * time.Second, ConsulRaftElectionTimeout: 5 * 31947 * time.Second,
ConsulRaftHeartbeatTimeout: 5 * 25699 * time.Second, ConsulRaftHeartbeatTimeout: 5 * 25699 * time.Second,
ConsulRaftLeaderLeaseTimeout: 5 * 15351 * time.Second, ConsulRaftLeaderLeaseTimeout: 5 * 15351 * time.Second,
ConsulSerfLANGossipInterval: 25252 * time.Second, GossipLANGossipInterval: 25252 * time.Second,
ConsulSerfLANProbeInterval: 5105 * time.Second, GossipLANGossipNodes: 6,
ConsulSerfLANProbeTimeout: 29179 * time.Second, GossipLANProbeInterval: 101 * time.Millisecond,
ConsulSerfLANSuspicionMult: 8263, GossipLANProbeTimeout: 102 * time.Millisecond,
ConsulSerfWANGossipInterval: 6966 * time.Second, GossipLANSuspicionMult: 1235,
ConsulSerfWANProbeInterval: 20148 * time.Second, GossipLANRetransmitMult: 1234,
ConsulSerfWANProbeTimeout: 3007 * time.Second, GossipWANGossipInterval: 6966 * time.Second,
ConsulSerfWANSuspicionMult: 32096, GossipWANGossipNodes: 2,
GossipWANProbeInterval: 103 * time.Millisecond,
GossipWANProbeTimeout: 104 * time.Millisecond,
GossipWANSuspicionMult: 16385,
GossipWANRetransmitMult: 16384,
ConsulServerHealthInterval: 17455 * time.Second, ConsulServerHealthInterval: 17455 * time.Second,
// user configurable values // user configurable values
@ -4407,14 +4411,18 @@ func TestSanitize(t *testing.T) {
"ConsulRaftElectionTimeout": "0s", "ConsulRaftElectionTimeout": "0s",
"ConsulRaftHeartbeatTimeout": "0s", "ConsulRaftHeartbeatTimeout": "0s",
"ConsulRaftLeaderLeaseTimeout": "0s", "ConsulRaftLeaderLeaseTimeout": "0s",
"ConsulSerfLANGossipInterval": "0s", "GossipLANGossipInterval": "0s",
"ConsulSerfLANProbeInterval": "0s", "GossipLANGossipNodes": 0,
"ConsulSerfLANProbeTimeout": "0s", "GossipLANProbeInterval": "0s",
"ConsulSerfLANSuspicionMult": 0, "GossipLANProbeTimeout": "0s",
"ConsulSerfWANGossipInterval": "0s", "GossipLANRetransmitMult": 0,
"ConsulSerfWANProbeInterval": "0s", "GossipLANSuspicionMult": 0,
"ConsulSerfWANProbeTimeout": "0s", "GossipWANGossipInterval": "0s",
"ConsulSerfWANSuspicionMult": 0, "GossipWANGossipNodes": 0,
"GossipWANProbeInterval": "0s",
"GossipWANProbeTimeout": "0s",
"GossipWANRetransmitMult": 0,
"GossipWANSuspicionMult": 0,
"ConsulServerHealthInterval": "0s", "ConsulServerHealthInterval": "0s",
"DNSARecordLimit": 0, "DNSARecordLimit": 0,
"DNSAddrs": [ "DNSAddrs": [

View File

@ -918,6 +918,76 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been ass
* <a name="disable_keyring_file"></a><a href="#disable_keyring_file">`disable_keyring_file`</a> - Equivalent to the * <a name="disable_keyring_file"></a><a href="#disable_keyring_file">`disable_keyring_file`</a> - Equivalent to the
[`-disable-keyring-file` command-line flag](#_disable_keyring_file). [`-disable-keyring-file` command-line flag](#_disable_keyring_file).
* <a name="gossip_lan"></a><a href="#gossip_lan">`gossip_lan`</a> - **(Advanced)** This object contains a number of sub-keys
which can be set to tune the LAN gossip communications. These are only provided for users running especially large
clusters that need fine tuning and are prepared to spend significant effort correctly tuning them for their
environment and workload. **Tuning these improperly can cause Consul to fail in unexpected ways**.
The default values are appropriate in almost all deployments.
* <a name="gossip_nodes"></a><a href="#gossip_nodes">`gossip_nodes`</a> - The number of random nodes to send
gossip messages to per gossip_interval. Increasing this number causes the gossip messages to propagate
across the cluster more quickly at the expense of increased bandwidth. The default is 3.
* <a name="gossip_interval"></a><a href="#gossip_interval">`gossip_interval`</a> - The interval between sending
messages that need to be gossiped that haven't been able to piggyback on probing messages. If this is set to
zero, non-piggyback gossip is disabled. By lowering this value (more frequent) gossip messages are propagated
across the cluster more quickly at the expense of increased bandwidth. The default is 200ms.
* <a name="probe_interval"></a><a href="#probe_interval">`probe_interval`</a> - The interval between random node
probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly
at the expense of increased bandwidth usage. The default is 1s.
* <a name="probe_timeout"></a><a href="#probe_timeout">`probe_timeout`</a> - The timeout to wait for an ack from
a probed node before assuming it is unhealthy. This should be at least the 99-percentile of RTT (round-trip time) on
your network. The default is 500ms and is a conservative value suitable for almost all realistic deployments.
* <a name="retransmit_mult"></a><a href="#retransmit_mult">`retransmit_mult`</a> - The multiplier for the number
of retransmissions that are attempted for messages broadcasted over gossip. The number of retransmits is scaled
using this multiplier and the cluster size. The higher the multiplier, the more likely a failed broadcast is to
converge at the expense of increased bandwidth. The default is 4.
* <a name="suspicion_mult"></a><a href="#suspicion_mult">`suspicion_mult`</a> - The multiplier for determining the
time an inaccessible node is considered suspect before declaring it dead. The timeout is scaled with the cluster
size and the probe_interval. This allows the timeout to scale properly with expected propagation delay with a
larger cluster size. The higher the multiplier, the longer an inaccessible node is considered part of the
cluster before declaring it dead, giving that suspect node more time to refute if it is indeed still alive. The
default is 4.
* <a name="gossip_wan"></a><a href="#gossip_wan">`gossip_wan`</a> - **(Advanced)** This object contains a number of sub-keys
which can be set to tune the WAN gossip communications. These are only provided for users running especially large
clusters that need fine tuning and are prepared to spend significant effort correctly tuning them for their
environment and workload. **Tuning these improperly can cause Consul to fail in unexpected ways**.
The default values are appropriate in almost all deployments.
* <a name="gossip_nodes"></a><a href="#gossip_nodes">`gossip_nodes`</a> - The number of random nodes to send
gossip messages to per gossip_interval. Increasing this number causes the gossip messages to propagate
across the cluster more quickly at the expense of increased bandwidth. The default is 3.
* <a name="gossip_interval"></a><a href="#gossip_interval">`gossip_interval`</a> - The interval between sending
messages that need to be gossiped that haven't been able to piggyback on probing messages. If this is set to
zero, non-piggyback gossip is disabled. By lowering this value (more frequent), gossip messages are propagated
across the cluster more quickly at the expense of increased bandwidth. The default is 200ms.
* <a name="probe_interval"></a><a href="#probe_interval">`probe_interval`</a> - The interval between random node
probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly
at the expense of increased bandwidth usage. The default is 1s.
* <a name="probe_timeout"></a><a href="#probe_timeout">`probe_timeout`</a> - The timeout to wait for an ack from
a probed node before assuming it is unhealthy. This should be at least the 99th percentile of RTT (round-trip time) on
your network. The default is 500ms and is a conservative value suitable for almost all realistic deployments.
* <a name="retransmit_mult"></a><a href="#retransmit_mult">`retransmit_mult`</a> - The multiplier for the number
of retransmissions that are attempted for messages broadcast over gossip. The number of retransmits is scaled
using this multiplier and the cluster size. The higher the multiplier, the more likely a failed broadcast is to
converge at the expense of increased bandwidth. The default is 4.
* <a name="suspicion_mult"></a><a href="#suspicion_mult">`suspicion_mult`</a> - The multiplier for determining the
time an inaccessible node is considered suspect before declaring it dead. The timeout is scaled with the cluster
size and the probe_interval. This allows the timeout to scale properly with expected propagation delay with a
larger cluster size. The higher the multiplier, the longer an inaccessible node is considered part of the
cluster before declaring it dead, giving that suspect node more time to refute if it is indeed still alive. The
default is 4.
* <a name="key_file"></a><a href="#key_file">`key_file`</a> This provides the file path to a
PEM-encoded private key. The key is used with the certificate to verify the agent's authenticity.
This must be provided along with [`cert_file`](#cert_file).