Add FailoverHeartbeatTTL to config (#11127)

FailoverHeartbeatTTL is the amount of time to wait after a server leader failure before considering reallocating client tasks. This TTL should be fairly long as the new server leader needs to rebuild the entire heartbeat map for the cluster. In deployments with a small number of machines, the default TTL (5m) may be unnecessarily long. Let's allow operators to configure this value in their config files.
2021-10-06 15:48:12 -07:00 · 2021-10-06 15:48:12 -07:00 · b56432e645
parent 188be1b5df
commit b56432e645
9 changed files with 38 additions and 0 deletions
--- a/.changelog/11127.txt
+++ b/.changelog/11127.txt
@ -0,0 +1,3 @@
+```release-note:improvement
+server: Allow tuning of node failover heartbeat TTL
+```
--- a/command/agent/agent.go
+++ b/command/agent/agent.go
@ -380,6 +380,9 @@ func convertServerConfig(agentConfig *Config) (*nomad.Config, error) {
 	if maxHPS := agentConfig.Server.MaxHeartbeatsPerSecond; maxHPS != 0 {
 		conf.MaxHeartbeatsPerSecond = maxHPS
 	}
+	if failoverTTL := agentConfig.Server.FailoverHeartbeatTTL; failoverTTL != 0 {
+		conf.FailoverHeartbeatTTL = failoverTTL
+	}

 	if *agentConfig.Consul.AutoAdvertise && agentConfig.Consul.ServerServiceName == "" {
 		return nil, fmt.Errorf("server_service_name must be set when auto_advertise is enabled")
--- a/command/agent/agent_test.go
+++ b/command/agent/agent_test.go
@ -141,6 +141,11 @@ func TestAgent_ServerConfig(t *testing.T) {
 	require.NoError(t, err)
 	require.Equal(t, float64(11.0), out.MaxHeartbeatsPerSecond)

+	conf.Server.FailoverHeartbeatTTL = 337 * time.Second
+	out, err = a.serverConfig()
+	require.NoError(t, err)
+	require.Equal(t, 337*time.Second, out.FailoverHeartbeatTTL)
+
 	// Defaults to the global bind addr
 	conf.Addresses.RPC = ""
 	conf.Addresses.Serf = ""
--- a/command/agent/config.go
+++ b/command/agent/config.go
@ -441,6 +441,12 @@ type ServerConfig struct {
 	// to meet the target rate.
 	MaxHeartbeatsPerSecond float64 `hcl:"max_heartbeats_per_second"`

+	// FailoverHeartbeatTTL is the TTL applied to heartbeats after
+	// a new leader is elected, since we no longer know the status
+	// of all the heartbeats.
+	FailoverHeartbeatTTL    time.Duration
+	FailoverHeartbeatTTLHCL string `hcl:"failover_heartbeat_ttl" json:"-"`
+
 	// StartJoin is a list of addresses to attempt to join when the
 	// agent starts. If Serf is unable to communicate with any of these
 	// addresses, then the agent will error and exit.
@ -1484,6 +1490,12 @@ func (a *ServerConfig) Merge(b *ServerConfig) *ServerConfig {
 	if b.MaxHeartbeatsPerSecond != 0.0 {
 		result.MaxHeartbeatsPerSecond = b.MaxHeartbeatsPerSecond
 	}
+	if b.FailoverHeartbeatTTL != 0 {
+		result.FailoverHeartbeatTTL = b.FailoverHeartbeatTTL
+	}
+	if b.FailoverHeartbeatTTLHCL != "" {
+		result.FailoverHeartbeatTTLHCL = b.FailoverHeartbeatTTLHCL
+	}
 	if b.RetryMaxAttempts != 0 {
 		result.RetryMaxAttempts = b.RetryMaxAttempts
 	}
--- a/command/agent/config_parse.go
+++ b/command/agent/config_parse.go
@ -55,6 +55,7 @@ func ParseConfigFile(path string) (*Config, error) {
 		{"client.server_join.retry_interval", &c.Client.ServerJoin.RetryInterval, &c.Client.ServerJoin.RetryIntervalHCL},
 		{"server.heartbeat_grace", &c.Server.HeartbeatGrace, &c.Server.HeartbeatGraceHCL},
 		{"server.min_heartbeat_ttl", &c.Server.MinHeartbeatTTL, &c.Server.MinHeartbeatTTLHCL},
+		{"server.failover_heartbeat_ttl", &c.Server.FailoverHeartbeatTTL, &c.Server.FailoverHeartbeatTTLHCL},
 		{"server.retry_interval", &c.Server.RetryInterval, &c.Server.RetryIntervalHCL},
 		{"server.server_join.retry_interval", &c.Server.ServerJoin.RetryInterval, &c.Server.ServerJoin.RetryIntervalHCL},
 		{"consul.timeout", &c.Consul.Timeout, &c.Consul.TimeoutHCL},
--- a/command/agent/config_parse_test.go
+++ b/command/agent/config_parse_test.go
@ -112,6 +112,8 @@ var basicConfig = &Config{
 		MinHeartbeatTTL:           33 * time.Second,
 		MinHeartbeatTTLHCL:        "33s",
 		MaxHeartbeatsPerSecond:    11.0,
+		FailoverHeartbeatTTL:      330 * time.Second,
+		FailoverHeartbeatTTLHCL:   "330s",
 		RetryJoin:                 []string{"1.1.1.1", "2.2.2.2"},
 		StartJoin:                 []string{"1.1.1.1", "2.2.2.2"},
 		RetryInterval:             15 * time.Second,
--- a/command/agent/testdata/basic.hcl
+++ b/command/agent/testdata/basic.hcl
@ -120,6 +120,7 @@ server {
  heartbeat_grace               = "30s"
  min_heartbeat_ttl             = "33s"
  max_heartbeats_per_second     = 11.0
+  failover_heartbeat_ttl        = "330s"
  retry_join                    = ["1.1.1.1", "2.2.2.2"]
  start_join                    = ["1.1.1.1", "2.2.2.2"]
  retry_max                     = 3
--- a/command/agent/testdata/basic.json
+++ b/command/agent/testdata/basic.json
@ -273,6 +273,7 @@
      "job_gc_threshold": "12h",
      "max_heartbeats_per_second": 11,
      "min_heartbeat_ttl": "33s",
+      "failover_heartbeat_ttl": "330s",
      "node_gc_threshold": "12h",
      "non_voting_server": true,
      "num_schedulers": 2,
--- a/website/content/docs/configuration/server.mdx
+++ b/website/content/docs/configuration/server.mdx
@ -131,6 +131,16 @@ server {
  a tradeoff as it lowers failure detection time of nodes at the tradeoff of
  false positives and increased load on the leader.

+- `failover_heartbeat_ttl` `(string: "5m")` - Specifies the TTL applied to
+  heartbeats after a new leader is elected, since the new leader does not yet
+  know the status of all the heartbeats. This is specified using a label
+  suffix like "30s" or "1h".
+
+  ~> Lowering the `failover_heartbeat_ttl` is a tradeoff as it lowers failure
+  detection time of nodes at the cost of false positives. False positives
+  could cause all clients to stop their allocations if a leadership transition
+  lasts longer than `heartbeat_grace + failover_heartbeat_ttl`.
+
 - `max_heartbeats_per_second` `(float: 50.0)` - Specifies the maximum target
  rate of heartbeats being processed per second. This allows the TTL to be
  increased to meet the target rate. Increasing the maximum heartbeats per