diff --git a/agent/agent.go b/agent/agent.go index 603ac23c7..4bd8e215f 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -782,10 +782,12 @@ func (a *Agent) consulConfig() (*consul.Config, error) { base.SerfLANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrLAN.Port base.SerfLANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming base.SerfLANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing - base.SerfLANConfig.MemberlistConfig.GossipInterval = a.config.ConsulSerfLANGossipInterval - base.SerfLANConfig.MemberlistConfig.ProbeInterval = a.config.ConsulSerfLANProbeInterval - base.SerfLANConfig.MemberlistConfig.ProbeTimeout = a.config.ConsulSerfLANProbeTimeout - base.SerfLANConfig.MemberlistConfig.SuspicionMult = a.config.ConsulSerfLANSuspicionMult + base.SerfLANConfig.MemberlistConfig.GossipInterval = a.config.GossipLANGossipInterval + base.SerfLANConfig.MemberlistConfig.GossipNodes = a.config.GossipLANGossipNodes + base.SerfLANConfig.MemberlistConfig.ProbeInterval = a.config.GossipLANProbeInterval + base.SerfLANConfig.MemberlistConfig.ProbeTimeout = a.config.GossipLANProbeTimeout + base.SerfLANConfig.MemberlistConfig.SuspicionMult = a.config.GossipLANSuspicionMult + base.SerfLANConfig.MemberlistConfig.RetransmitMult = a.config.GossipLANRetransmitMult if a.config.SerfBindAddrWAN != nil { base.SerfWANConfig.MemberlistConfig.BindAddr = a.config.SerfBindAddrWAN.IP.String() @@ -794,10 +796,12 @@ func (a *Agent) consulConfig() (*consul.Config, error) { base.SerfWANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrWAN.Port base.SerfWANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming base.SerfWANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing - base.SerfWANConfig.MemberlistConfig.GossipInterval = a.config.ConsulSerfWANGossipInterval - base.SerfWANConfig.MemberlistConfig.ProbeInterval = a.config.ConsulSerfWANProbeInterval - 
base.SerfWANConfig.MemberlistConfig.ProbeTimeout = a.config.ConsulSerfWANProbeTimeout - base.SerfWANConfig.MemberlistConfig.SuspicionMult = a.config.ConsulSerfWANSuspicionMult + base.SerfWANConfig.MemberlistConfig.GossipInterval = a.config.GossipWANGossipInterval + base.SerfWANConfig.MemberlistConfig.GossipNodes = a.config.GossipWANGossipNodes + base.SerfWANConfig.MemberlistConfig.ProbeInterval = a.config.GossipWANProbeInterval + base.SerfWANConfig.MemberlistConfig.ProbeTimeout = a.config.GossipWANProbeTimeout + base.SerfWANConfig.MemberlistConfig.SuspicionMult = a.config.GossipWANSuspicionMult + base.SerfWANConfig.MemberlistConfig.RetransmitMult = a.config.GossipWANRetransmitMult } else { // Disable serf WAN federation base.SerfWANConfig = nil diff --git a/agent/config/builder.go b/agent/config/builder.go index d76196795..25cb9b9c5 100644 --- a/agent/config/builder.go +++ b/agent/config/builder.go @@ -579,16 +579,22 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) { ConsulRaftElectionTimeout: consulRaftElectionTimeout, ConsulRaftHeartbeatTimeout: consulRaftHeartbeatTimeout, ConsulRaftLeaderLeaseTimeout: consulRaftLeaderLeaseTimeout, - ConsulSerfLANGossipInterval: b.durationVal("consul.serf_lan.gossip_interval", c.Consul.SerfLAN.Memberlist.GossipInterval), - ConsulSerfLANProbeInterval: b.durationVal("consul.serf_lan.probe_interval", c.Consul.SerfLAN.Memberlist.ProbeInterval), - ConsulSerfLANProbeTimeout: b.durationVal("consul.serf_lan.probe_timeout", c.Consul.SerfLAN.Memberlist.ProbeTimeout), - ConsulSerfLANSuspicionMult: b.intVal(c.Consul.SerfLAN.Memberlist.SuspicionMult), - ConsulSerfWANGossipInterval: b.durationVal("consul.serf_wan.gossip_interval", c.Consul.SerfWAN.Memberlist.GossipInterval), - ConsulSerfWANProbeInterval: b.durationVal("consul.serf_wan.probe_interval", c.Consul.SerfWAN.Memberlist.ProbeInterval), - ConsulSerfWANProbeTimeout: b.durationVal("consul.serf_wan.probe_timeout", c.Consul.SerfWAN.Memberlist.ProbeTimeout), - 
ConsulSerfWANSuspicionMult: b.intVal(c.Consul.SerfWAN.Memberlist.SuspicionMult), ConsulServerHealthInterval: b.durationVal("consul.server.health_interval", c.Consul.Server.HealthInterval), + // gossip configuration + GossipLANGossipInterval: b.durationVal("gossip_lan.gossip_interval", c.GossipLAN.GossipInterval), + GossipLANGossipNodes: b.intVal(c.GossipLAN.GossipNodes), + GossipLANProbeInterval: b.durationVal("gossip_lan.probe_interval", c.GossipLAN.ProbeInterval), + GossipLANProbeTimeout: b.durationVal("gossip_lan.probe_timeout", c.GossipLAN.ProbeTimeout), + GossipLANSuspicionMult: b.intVal(c.GossipLAN.SuspicionMult), + GossipLANRetransmitMult: b.intVal(c.GossipLAN.RetransmitMult), + GossipWANGossipInterval: b.durationVal("gossip_wan.gossip_interval", c.GossipWAN.GossipInterval), + GossipWANGossipNodes: b.intVal(c.GossipWAN.GossipNodes), + GossipWANProbeInterval: b.durationVal("gossip_wan.probe_interval", c.GossipWAN.ProbeInterval), + GossipWANProbeTimeout: b.durationVal("gossip_wan.probe_timeout", c.GossipWAN.ProbeTimeout), + GossipWANSuspicionMult: b.intVal(c.GossipWAN.SuspicionMult), + GossipWANRetransmitMult: b.intVal(c.GossipWAN.RetransmitMult), + // ACL ACLAgentMasterToken: b.stringVal(c.ACLAgentMasterToken), ACLAgentToken: b.stringVal(c.ACLAgentToken), diff --git a/agent/config/config.go b/agent/config/config.go index e49b7f9d6..e0468f2d1 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -184,6 +184,8 @@ type Config struct { EncryptKey *string `json:"encrypt,omitempty" hcl:"encrypt" mapstructure:"encrypt"` EncryptVerifyIncoming *bool `json:"encrypt_verify_incoming,omitempty" hcl:"encrypt_verify_incoming" mapstructure:"encrypt_verify_incoming"` EncryptVerifyOutgoing *bool `json:"encrypt_verify_outgoing,omitempty" hcl:"encrypt_verify_outgoing" mapstructure:"encrypt_verify_outgoing"` + GossipLAN GossipLANConfig `json:"gossip_lan,omitempty" hcl:"gossip_lan" mapstructure:"gossip_lan"` + GossipWAN GossipWANConfig 
`json:"gossip_wan,omitempty" hcl:"gossip_wan" mapstructure:"gossip_wan"` HTTPConfig HTTPConfig `json:"http_config,omitempty" hcl:"http_config" mapstructure:"http_config"` KeyFile *string `json:"key_file,omitempty" hcl:"key_file" mapstructure:"key_file"` LeaveOnTerm *bool `json:"leave_on_terminate,omitempty" hcl:"leave_on_terminate" mapstructure:"leave_on_terminate"` @@ -259,6 +261,24 @@ type Config struct { VersionPrerelease *string `json:"version_prerelease,omitempty" hcl:"version_prerelease" mapstructure:"version_prerelease"` } +type GossipLANConfig struct { + GossipNodes *int `json:"gossip_nodes,omitempty" hcl:"gossip_nodes" mapstructure:"gossip_nodes"` + GossipInterval *string `json:"gossip_interval,omitempty" hcl:"gossip_interval" mapstructure:"gossip_interval"` + ProbeInterval *string `json:"probe_interval,omitempty" hcl:"probe_interval" mapstructure:"probe_interval"` + ProbeTimeout *string `json:"probe_timeout,omitempty" hcl:"probe_timeout" mapstructure:"probe_timeout"` + SuspicionMult *int `json:"suspicion_mult,omitempty" hcl:"suspicion_mult" mapstructure:"suspicion_mult"` + RetransmitMult *int `json:"retransmit_mult,omitempty" hcl:"retransmit_mult" mapstructure:"retransmit_mult"` +} + +type GossipWANConfig struct { + GossipNodes *int `json:"gossip_nodes,omitempty" hcl:"gossip_nodes" mapstructure:"gossip_nodes"` + GossipInterval *string `json:"gossip_interval,omitempty" hcl:"gossip_interval" mapstructure:"gossip_interval"` + ProbeInterval *string `json:"probe_interval,omitempty" hcl:"probe_interval" mapstructure:"probe_interval"` + ProbeTimeout *string `json:"probe_timeout,omitempty" hcl:"probe_timeout" mapstructure:"probe_timeout"` + SuspicionMult *int `json:"suspicion_mult,omitempty" hcl:"suspicion_mult" mapstructure:"suspicion_mult"` + RetransmitMult *int `json:"retransmit_mult,omitempty" hcl:"retransmit_mult" mapstructure:"retransmit_mult"` +} + type Consul struct { Coordinate struct { UpdateBatchSize *int `json:"update_batch_size,omitempty" 
hcl:"update_batch_size" mapstructure:"update_batch_size"` @@ -272,24 +292,6 @@ type Consul struct { LeaderLeaseTimeout *string `json:"leader_lease_timeout,omitempty" hcl:"leader_lease_timeout" mapstructure:"leader_lease_timeout"` } `json:"raft,omitempty" hcl:"raft" mapstructure:"raft"` - SerfLAN struct { - Memberlist struct { - GossipInterval *string `json:"gossip_interval,omitempty" hcl:"gossip_interval" mapstructure:"gossip_interval"` - ProbeInterval *string `json:"probe_interval,omitempty" hcl:"probe_interval" mapstructure:"probe_interval"` - ProbeTimeout *string `json:"probe_timeout,omitempty" hcl:"probe_timeout" mapstructure:"probe_timeout"` - SuspicionMult *int `json:"suspicion_mult,omitempty" hcl:"suspicion_mult" mapstructure:"suspicion_mult"` - } `json:"memberlist,omitempty" hcl:"memberlist" mapstructure:"memberlist"` - } `json:"serf_lan,omitempty" hcl:"serf_lan" mapstructure:"serf_lan"` - - SerfWAN struct { - Memberlist struct { - GossipInterval *string `json:"gossip_interval,omitempty" hcl:"gossip_interval" mapstructure:"gossip_interval"` - ProbeInterval *string `json:"probe_interval,omitempty" hcl:"probe_interval" mapstructure:"probe_interval"` - ProbeTimeout *string `json:"probe_timeout,omitempty" hcl:"probe_timeout" mapstructure:"probe_timeout"` - SuspicionMult *int `json:"suspicion_mult,omitempty" hcl:"suspicion_mult" mapstructure:"suspicion_mult"` - } `json:"memberlist,omitempty" hcl:"memberlist" mapstructure:"memberlist"` - } `json:"serf_wan,omitempty" hcl:"serf_wan" mapstructure:"serf_wan"` - Server struct { HealthInterval *string `json:"health_interval,omitempty" hcl:"health_interval" mapstructure:"health_interval"` } `json:"server,omitempty" hcl:"server" mapstructure:"server"` diff --git a/agent/config/default.go b/agent/config/default.go index c95a0c643..017120d8e 100644 --- a/agent/config/default.go +++ b/agent/config/default.go @@ -26,6 +26,10 @@ func DefaultRPCProtocol() (int, error) { // todo(fs): IMO, this should be the definitive default 
for all configurable values // todo(fs): and whatever is in here should clobber every default value. Hence, no sourcing. func DefaultSource() Source { + cfg := consul.DefaultConfig() + serfLAN := cfg.SerfLANConfig.MemberlistConfig + serfWAN := cfg.SerfWANConfig.MemberlistConfig + return Source{ Name: "default", Format: "hcl", @@ -62,6 +66,22 @@ func DefaultSource() Source { max_trailing_logs = 250 server_stabilization_time = "10s" } + gossip_lan = { + gossip_interval = "` + serfLAN.GossipInterval.String() + `" + gossip_nodes = ` + strconv.Itoa(serfLAN.GossipNodes) + ` + retransmit_mult = ` + strconv.Itoa(serfLAN.RetransmitMult) + ` + probe_interval = "` + serfLAN.ProbeInterval.String() + `" + probe_timeout = "` + serfLAN.ProbeTimeout.String() + `" + suspicion_mult = ` + strconv.Itoa(serfLAN.SuspicionMult) + ` + } + gossip_wan = { + gossip_interval = "` + serfWAN.GossipInterval.String() + `" + gossip_nodes = ` + strconv.Itoa(serfWAN.GossipNodes) + ` + retransmit_mult = ` + strconv.Itoa(serfWAN.RetransmitMult) + ` + probe_interval = "` + serfWAN.ProbeInterval.String() + `" + probe_timeout = "` + serfWAN.ProbeTimeout.String() + `" + suspicion_mult = ` + strconv.Itoa(serfWAN.SuspicionMult) + ` + } dns_config = { allow_stale = true a_record_limit = 0 @@ -92,6 +112,7 @@ func DefaultSource() Source { metrics_prefix = "consul" filter_default = true } + `, } } @@ -111,6 +132,18 @@ func DevSource() Source { log_level = "DEBUG" server = true + gossip_lan = { + gossip_interval = "100ms" + probe_interval = "100ms" + probe_timeout = "100ms" + suspicion_mult = 3 + } + gossip_wan = { + gossip_interval = "100ms" + probe_interval = "100ms" + probe_timeout = "100ms" + suspicion_mult = 3 + } connect = { enabled = true } @@ -166,8 +199,6 @@ func DefaultVersionSource() Source { func DefaultConsulSource() Source { cfg := consul.DefaultConfig() raft := cfg.RaftConfig - serfLAN := cfg.SerfLANConfig.MemberlistConfig - serfWAN := cfg.SerfWANConfig.MemberlistConfig return Source{ Name: 
"consul", Format: "hcl", @@ -183,22 +214,6 @@ func DefaultConsulSource() Source { heartbeat_timeout = "` + raft.HeartbeatTimeout.String() + `" leader_lease_timeout = "` + raft.LeaderLeaseTimeout.String() + `" } - serf_lan = { - memberlist = { - gossip_interval = "` + serfLAN.GossipInterval.String() + `" - probe_interval = "` + serfLAN.ProbeInterval.String() + `" - probe_timeout = "` + serfLAN.ProbeTimeout.String() + `" - suspicion_mult = ` + strconv.Itoa(serfLAN.SuspicionMult) + ` - } - } - serf_wan = { - memberlist = { - gossip_interval = "` + serfWAN.GossipInterval.String() + `" - probe_interval = "` + serfWAN.ProbeInterval.String() + `" - probe_timeout = "` + serfWAN.ProbeTimeout.String() + `" - suspicion_mult = ` + strconv.Itoa(serfWAN.SuspicionMult) + ` - } - } server = { health_interval = "` + cfg.ServerHealthInterval.String() + `" } @@ -223,22 +238,6 @@ func DevConsulSource() Source { heartbeat_timeout = "35ms" leader_lease_timeout = "20ms" } - serf_lan = { - memberlist = { - gossip_interval = "100ms" - probe_interval = "100ms" - probe_timeout = "100ms" - suspicion_mult = 3 - } - } - serf_wan = { - memberlist = { - gossip_interval = "100ms" - probe_interval = "100ms" - probe_timeout = "100ms" - suspicion_mult = 3 - } - } server = { health_interval = "10ms" } diff --git a/agent/config/runtime.go b/agent/config/runtime.go index dfb7893e0..4c26ecd4d 100644 --- a/agent/config/runtime.go +++ b/agent/config/runtime.go @@ -47,14 +47,6 @@ type RuntimeConfig struct { ConsulRaftElectionTimeout time.Duration ConsulRaftHeartbeatTimeout time.Duration ConsulRaftLeaderLeaseTimeout time.Duration - ConsulSerfLANGossipInterval time.Duration - ConsulSerfLANProbeInterval time.Duration - ConsulSerfLANProbeTimeout time.Duration - ConsulSerfLANSuspicionMult int - ConsulSerfWANGossipInterval time.Duration - ConsulSerfWANProbeInterval time.Duration - ConsulSerfWANProbeTimeout time.Duration - ConsulSerfWANSuspicionMult int ConsulServerHealthInterval time.Duration // 
ACLAgentMasterToken is a special token that has full read and write @@ -964,6 +956,160 @@ type RuntimeConfig struct { // hcl: ports { serf_wan = int } SerfPortWAN int + // GossipLANGossipInterval is the interval between sending messages that need + // to be gossiped that haven't been able to piggyback on probing messages. + // If this is set to zero, non-piggyback gossip is disabled. By lowering + // this value (more frequent) gossip messages are propagated across + // the cluster more quickly at the expense of increased bandwidth. This + // configuration only applies to LAN gossip communications + // + // The default is: 200ms + // + // hcl: gossip_lan { gossip_interval = duration} + GossipLANGossipInterval time.Duration + + // GossipLANGossipNodes is the number of random nodes to send gossip messages to + // per GossipInterval. Increasing this number causes the gossip messages to + // propagate across the cluster more quickly at the expense of increased + // bandwidth. This configuration only applies to LAN gossip communications + // + // The default is: 3 + // + // hcl: gossip_lan { gossip_nodes = int } + GossipLANGossipNodes int + + // GossipLANProbeInterval is the interval between random node probes. Setting + // this lower (more frequent) will cause the memberlist cluster to detect + // failed nodes more quickly at the expense of increased bandwidth usage. + // This configuration only applies to LAN gossip communications + // + // The default is: 1s + // + // hcl: gossip_lan { probe_interval = duration } + GossipLANProbeInterval time.Duration + + // GossipLANProbeTimeout is the timeout to wait for an ack from a probed node + // before assuming it is unhealthy. This should be set to 99-percentile + // of RTT (round-trip time) on your network. 
This configuration + // only applies to the LAN gossip communications + // + // The default is: 500ms + // + // hcl: gossip_lan { probe_timeout = duration } + GossipLANProbeTimeout time.Duration + + // GossipLANSuspicionMult is the multiplier for determining the time an + // inaccessible node is considered suspect before declaring it dead. This + // configuration only applies to LAN gossip communications + // + // The actual timeout is calculated using the formula: + // + // SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval + // + // This allows the timeout to scale properly with expected propagation + // delay with a larger cluster size. The higher the multiplier, the longer + // an inaccessible node is considered part of the cluster before declaring + // it dead, giving that suspect node more time to refute if it is indeed + // still alive. + // + // The default is: 4 + // + // hcl: gossip_lan { suspicion_mult = int } + GossipLANSuspicionMult int + + // GossipLANRetransmitMult is the multiplier for the number of retransmissions + // that are attempted for messages broadcasted over gossip. This + // configuration only applies to LAN gossip communications. The actual + // count of retransmissions is calculated using the formula: + // + // Retransmits = RetransmitMult * log(N+1) + // + // This allows the retransmits to scale properly with cluster size. The + // higher the multiplier, the more likely a failed broadcast is to converge + // at the expense of increased bandwidth. + // + // The default is: 4 + // + // hcl: gossip_lan { retransmit_mult = int } + GossipLANRetransmitMult int + + // GossipWANGossipInterval is the interval between sending messages that need + // to be gossiped that haven't been able to piggyback on probing messages. + // If this is set to zero, non-piggyback gossip is disabled. By lowering + // this value (more frequent) gossip messages are propagated across + // the cluster more quickly at the expense of increased bandwidth. 
This + // configuration only applies to WAN gossip communications + // + // The default is: 500ms + // + // hcl: gossip_wan { gossip_interval = duration} + GossipWANGossipInterval time.Duration + + // GossipWANGossipNodes is the number of random nodes to send gossip messages to + // per GossipInterval. Increasing this number causes the gossip messages to + // propagate across the cluster more quickly at the expense of increased + // bandwidth. This configuration only applies to WAN gossip communications + // + // The default is: 3 + // + // hcl: gossip_wan { gossip_nodes = int } + GossipWANGossipNodes int + + // GossipWANProbeInterval is the interval between random node probes. Setting + // this lower (more frequent) will cause the memberlist cluster to detect + // failed nodes more quickly at the expense of increased bandwidth usage. + // This configuration only applies to WAN gossip communications + // + // The default is: 5s + // + // hcl: gossip_wan { probe_interval = duration } + GossipWANProbeInterval time.Duration + + // GossipWANProbeTimeout is the timeout to wait for an ack from a probed node + // before assuming it is unhealthy. This should be set to 99-percentile + // of RTT (round-trip time) on your network. This configuration + // only applies to the WAN gossip communications + // + // The default is: 3s + // + // hcl: gossip_wan { probe_timeout = duration } + GossipWANProbeTimeout time.Duration + + // GossipWANSuspicionMult is the multiplier for determining the time an + // inaccessible node is considered suspect before declaring it dead. This + // configuration only applies to WAN gossip communications + // + // The actual timeout is calculated using the formula: + // + // SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval + // + // This allows the timeout to scale properly with expected propagation + // delay with a larger cluster size. 
The higher the multiplier, the longer + // an inaccessible node is considered part of the cluster before declaring + // it dead, giving that suspect node more time to refute if it is indeed + // still alive. + // + // The default is: 6 + // + // hcl: gossip_wan { suspicion_mult = int } + GossipWANSuspicionMult int + + // GossipWANRetransmitMult is the multiplier for the number of retransmissions + // that are attempted for messages broadcasted over gossip. This + // configuration only applies to WAN gossip communications. The actual + // count of retransmissions is calculated using the formula: + // + // Retransmits = RetransmitMult * log(N+1) + // + // This allows the retransmits to scale properly with cluster size. The + // higher the multiplier, the more likely a failed broadcast is to converge + // at the expense of increased bandwidth. + // + // The default is: 4 + // + // hcl: gossip_wan { retransmit_mult = int } + GossipWANRetransmitMult int + // ServerMode controls if this agent acts like a Consul server, // or merely as a client. Servers have more state, take part // in leader election, etc. 
diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go index 04a528b61..75658a250 100644 --- a/agent/config/runtime_test.go +++ b/agent/config/runtime_test.go @@ -286,14 +286,14 @@ func TestConfigFlagsAndEdgecases(t *testing.T) { rt.ConsulRaftElectionTimeout = 52 * time.Millisecond rt.ConsulRaftHeartbeatTimeout = 35 * time.Millisecond rt.ConsulRaftLeaderLeaseTimeout = 20 * time.Millisecond - rt.ConsulSerfLANGossipInterval = 100 * time.Millisecond - rt.ConsulSerfLANProbeInterval = 100 * time.Millisecond - rt.ConsulSerfLANProbeTimeout = 100 * time.Millisecond - rt.ConsulSerfLANSuspicionMult = 3 - rt.ConsulSerfWANGossipInterval = 100 * time.Millisecond - rt.ConsulSerfWANProbeInterval = 100 * time.Millisecond - rt.ConsulSerfWANProbeTimeout = 100 * time.Millisecond - rt.ConsulSerfWANSuspicionMult = 3 + rt.GossipLANGossipInterval = 100 * time.Millisecond + rt.GossipLANProbeInterval = 100 * time.Millisecond + rt.GossipLANProbeTimeout = 100 * time.Millisecond + rt.GossipLANSuspicionMult = 3 + rt.GossipWANGossipInterval = 100 * time.Millisecond + rt.GossipWANProbeInterval = 100 * time.Millisecond + rt.GossipWANProbeTimeout = 100 * time.Millisecond + rt.GossipWANSuspicionMult = 3 rt.ConsulServerHealthInterval = 10 * time.Millisecond }, }, @@ -2617,6 +2617,22 @@ func TestFullConfig(t *testing.T) { } } }, + "gossip_lan" : { + "gossip_nodes": 6, + "gossip_interval" : "25252s", + "retransmit_mult" : 1234, + "suspicion_mult" : 1235, + "probe_interval" : "101ms", + "probe_timeout" : "102ms" + }, + "gossip_wan" : { + "gossip_nodes" : 2, + "gossip_interval" : "6966s", + "retransmit_mult" : 16384, + "suspicion_mult" : 16385, + "probe_interval" : "103ms", + "probe_timeout" : "104ms" + }, "data_dir": "` + dataDir + `", "datacenter": "rzo029wg", "disable_anonymous_signature": true, @@ -3092,6 +3108,22 @@ func TestFullConfig(t *testing.T) { } } } + gossip_lan { + gossip_nodes = 6 + gossip_interval = "25252s" + retransmit_mult = 1234 + suspicion_mult = 1235 + 
probe_interval = "101ms" + probe_timeout = "102ms" + } + gossip_wan { + gossip_nodes = 2 + gossip_interval = "6966s" + retransmit_mult = 16384 + suspicion_mult = 16385 + probe_interval = "103ms" + probe_timeout = "104ms" + } data_dir = "` + dataDir + `" datacenter = "rzo029wg" disable_anonymous_signature = true @@ -3473,22 +3505,6 @@ func TestFullConfig(t *testing.T) { "heartbeat_timeout": "25699s", "leader_lease_timeout": "15351s" }, - "serf_lan": { - "memberlist": { - "gossip_interval": "25252s", - "probe_interval": "5105s", - "probe_timeout": "29179s", - "suspicion_mult": 8263 - } - }, - "serf_wan": { - "memberlist": { - "gossip_interval": "6966s", - "probe_interval": "20148s", - "probe_timeout": "3007s", - "suspicion_mult": 32096 - } - }, "server": { "health_interval": "17455s" } @@ -3527,22 +3543,6 @@ func TestFullConfig(t *testing.T) { heartbeat_timeout = "25699s" leader_lease_timeout = "15351s" } - serf_lan = { - memberlist = { - gossip_interval = "25252s" - probe_interval = "5105s" - probe_timeout = "29179s" - suspicion_mult = 8263 - } - } - serf_wan = { - memberlist = { - gossip_interval = "6966s" - probe_interval = "20148s" - probe_timeout = "3007s" - suspicion_mult = 32096 - } - } server = { health_interval = "17455s" } @@ -3574,14 +3574,18 @@ func TestFullConfig(t *testing.T) { ConsulRaftElectionTimeout: 5 * 31947 * time.Second, ConsulRaftHeartbeatTimeout: 5 * 25699 * time.Second, ConsulRaftLeaderLeaseTimeout: 5 * 15351 * time.Second, - ConsulSerfLANGossipInterval: 25252 * time.Second, - ConsulSerfLANProbeInterval: 5105 * time.Second, - ConsulSerfLANProbeTimeout: 29179 * time.Second, - ConsulSerfLANSuspicionMult: 8263, - ConsulSerfWANGossipInterval: 6966 * time.Second, - ConsulSerfWANProbeInterval: 20148 * time.Second, - ConsulSerfWANProbeTimeout: 3007 * time.Second, - ConsulSerfWANSuspicionMult: 32096, + GossipLANGossipInterval: 25252 * time.Second, + GossipLANGossipNodes: 6, + GossipLANProbeInterval: 101 * time.Millisecond, + GossipLANProbeTimeout: 
102 * time.Millisecond, + GossipLANSuspicionMult: 1235, + GossipLANRetransmitMult: 1234, + GossipWANGossipInterval: 6966 * time.Second, + GossipWANGossipNodes: 2, + GossipWANProbeInterval: 103 * time.Millisecond, + GossipWANProbeTimeout: 104 * time.Millisecond, + GossipWANSuspicionMult: 16385, + GossipWANRetransmitMult: 16384, ConsulServerHealthInterval: 17455 * time.Second, // user configurable values @@ -4407,14 +4411,18 @@ func TestSanitize(t *testing.T) { "ConsulRaftElectionTimeout": "0s", "ConsulRaftHeartbeatTimeout": "0s", "ConsulRaftLeaderLeaseTimeout": "0s", - "ConsulSerfLANGossipInterval": "0s", - "ConsulSerfLANProbeInterval": "0s", - "ConsulSerfLANProbeTimeout": "0s", - "ConsulSerfLANSuspicionMult": 0, - "ConsulSerfWANGossipInterval": "0s", - "ConsulSerfWANProbeInterval": "0s", - "ConsulSerfWANProbeTimeout": "0s", - "ConsulSerfWANSuspicionMult": 0, + "GossipLANGossipInterval": "0s", + "GossipLANGossipNodes": 0, + "GossipLANProbeInterval": "0s", + "GossipLANProbeTimeout": "0s", + "GossipLANRetransmitMult": 0, + "GossipLANSuspicionMult": 0, + "GossipWANGossipInterval": "0s", + "GossipWANGossipNodes": 0, + "GossipWANProbeInterval": "0s", + "GossipWANProbeTimeout": "0s", + "GossipWANRetransmitMult": 0, + "GossipWANSuspicionMult": 0, "ConsulServerHealthInterval": "0s", "DNSARecordLimit": 0, "DNSAddrs": [ diff --git a/website/source/docs/agent/options.html.md b/website/source/docs/agent/options.html.md index 75029f63d..6cd75d605 100644 --- a/website/source/docs/agent/options.html.md +++ b/website/source/docs/agent/options.html.md @@ -918,6 +918,76 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been ass * `disable_keyring_file` - Equivalent to the [`-disable-keyring-file` command-line flag](#_disable_keyring_file). +* `gossip_lan` - **(Advanced)** This object contains a number of sub-keys + which can be set to tune the LAN gossip communications. 
These are only provided for users running especially large + clusters that need fine tuning and are prepared to spend significant effort correctly tuning them for their + environment and workload. **Tuning these improperly can cause Consul to fail in unexpected ways**. + The default values are appropriate in almost all deployments. + + * `gossip_nodes` - The number of random nodes to send + gossip messages to per gossip_interval. Increasing this number causes the gossip messages to propagate + across the cluster more quickly at the expense of increased bandwidth. The default is 3. + + * `gossip_interval` - The interval between sending + messages that need to be gossiped that haven't been able to piggyback on probing messages. If this is set to + zero, non-piggyback gossip is disabled. By lowering this value (more frequent) gossip messages are propagated + across the cluster more quickly at the expense of increased bandwidth. The default is 200ms. + + * `probe_interval` - The interval between random node + probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly + at the expense of increased bandwidth usage. The default is 1s. + + * `probe_timeout` - The timeout to wait for an ack from + a probed node before assuming it is unhealthy. This should be at least the 99-percentile of RTT (round-trip time) on + your network. The default is 500ms and is a conservative value suitable for almost all realistic deployments. + + * `retransmit_mult` - The multiplier for the number + of retransmissions that are attempted for messages broadcasted over gossip. The number of retransmits is scaled + using this multiplier and the cluster size. The higher the multiplier, the more likely a failed broadcast is to + converge at the expense of increased bandwidth. The default is 4. + + * `suspicion_mult` - The multiplier for determining the + time an inaccessible node is considered suspect before declaring it dead. 
The timeout is scaled with the cluster + size and the probe_interval. This allows the timeout to scale properly with expected propagation delay with a + larger cluster size. The higher the multiplier, the longer an inaccessible node is considered part of the + cluster before declaring it dead, giving that suspect node more time to refute if it is indeed still alive. The + default is 4. + +* `gossip_wan` - **(Advanced)** This object contains a number of sub-keys + which can be set to tune the WAN gossip communications. These are only provided for users running especially large + clusters that need fine tuning and are prepared to spend significant effort correctly tuning them for their + environment and workload. **Tuning these improperly can cause Consul to fail in unexpected ways**. + The default values are appropriate in almost all deployments. + + * `gossip_nodes` - The number of random nodes to send + gossip messages to per gossip_interval. Increasing this number causes the gossip messages to propagate + across the cluster more quickly at the expense of increased bandwidth. The default is 3. + + * `gossip_interval` - The interval between sending + messages that need to be gossiped that haven't been able to piggyback on probing messages. If this is set to + zero, non-piggyback gossip is disabled. By lowering this value (more frequent) gossip messages are propagated + across the cluster more quickly at the expense of increased bandwidth. The default is 500ms. + + * `probe_interval` - The interval between random node + probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly + at the expense of increased bandwidth usage. The default is 5s. + + * `probe_timeout` - The timeout to wait for an ack from + a probed node before assuming it is unhealthy. This should be at least the 99-percentile of RTT (round-trip time) on + your network. 
The default is 3s and is a conservative value suitable for almost all realistic deployments. + + * `retransmit_mult` - The multiplier for the number + of retransmissions that are attempted for messages broadcasted over gossip. The number of retransmits is scaled + using this multiplier and the cluster size. The higher the multiplier, the more likely a failed broadcast is to + converge at the expense of increased bandwidth. The default is 4. + + * `suspicion_mult` - The multiplier for determining the + time an inaccessible node is considered suspect before declaring it dead. The timeout is scaled with the cluster + size and the probe_interval. This allows the timeout to scale properly with expected propagation delay with a + larger cluster size. The higher the multiplier, the longer an inaccessible node is considered part of the + cluster before declaring it dead, giving that suspect node more time to refute if it is indeed still alive. The + default is 6. + * `key_file` This provides a the file path to a PEM-encoded private key. The key is used with the certificate to verify the agent's authenticity. This must be provided along with [`cert_file`](#cert_file).