Merge pull request #2859 from hashicorp/f-heartbeat-tunables
Allow tuning of heartbeat ttls
This commit is contained in:
commit
15912d29d2
|
@ -192,12 +192,14 @@ func convertServerConfig(agentConfig *Config, logOutput io.Writer) (*nomad.Confi
|
|||
conf.DeploymentGCThreshold = dur
|
||||
}
|
||||
|
||||
if heartbeatGrace := agentConfig.Server.HeartbeatGrace; heartbeatGrace != "" {
|
||||
dur, err := time.ParseDuration(heartbeatGrace)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
conf.HeartbeatGrace = dur
|
||||
if heartbeatGrace := agentConfig.Server.HeartbeatGrace; heartbeatGrace != 0 {
|
||||
conf.HeartbeatGrace = heartbeatGrace
|
||||
}
|
||||
if min := agentConfig.Server.MinHeartbeatTTL; min != 0 {
|
||||
conf.MinHeartbeatTTL = min
|
||||
}
|
||||
if maxHPS := agentConfig.Server.MaxHeartbeatsPerSecond; maxHPS != 0 {
|
||||
conf.MaxHeartbeatsPerSecond = maxHPS
|
||||
}
|
||||
|
||||
if *agentConfig.Consul.AutoAdvertise && agentConfig.Consul.ServerServiceName == "" {
|
||||
|
|
|
@ -233,24 +233,24 @@ func TestAgent_ServerConfig(t *testing.T) {
|
|||
t.Fatalf("expect 10s, got: %s", threshold)
|
||||
}
|
||||
|
||||
conf.Server.HeartbeatGrace = "42g"
|
||||
if err := conf.normalizeAddrs(); err != nil {
|
||||
t.Fatalf("error normalizing config: %v", err)
|
||||
}
|
||||
out, err = a.serverConfig()
|
||||
if err == nil || !strings.Contains(err.Error(), "unknown unit") {
|
||||
t.Fatalf("expected unknown unit error, got: %#v", err)
|
||||
}
|
||||
|
||||
conf.Server.HeartbeatGrace = "37s"
|
||||
if err := conf.normalizeAddrs(); err != nil {
|
||||
t.Fatalf("error normalizing config: %v", err)
|
||||
}
|
||||
conf.Server.HeartbeatGrace = 37 * time.Second
|
||||
out, err = a.serverConfig()
|
||||
if threshold := out.HeartbeatGrace; threshold != time.Second*37 {
|
||||
t.Fatalf("expect 37s, got: %s", threshold)
|
||||
}
|
||||
|
||||
conf.Server.MinHeartbeatTTL = 37 * time.Second
|
||||
out, err = a.serverConfig()
|
||||
if min := out.MinHeartbeatTTL; min != time.Second*37 {
|
||||
t.Fatalf("expect 37s, got: %s", min)
|
||||
}
|
||||
|
||||
conf.Server.MaxHeartbeatsPerSecond = 11.0
|
||||
out, err = a.serverConfig()
|
||||
if max := out.MaxHeartbeatsPerSecond; max != 11.0 {
|
||||
t.Fatalf("expect 11, got: %s", max)
|
||||
}
|
||||
|
||||
// Defaults to the global bind addr
|
||||
conf.Addresses.RPC = ""
|
||||
conf.Addresses.Serf = ""
|
||||
|
|
|
@ -73,6 +73,8 @@ server {
|
|||
eval_gc_threshold = "12h"
|
||||
deployment_gc_threshold = "12h"
|
||||
heartbeat_grace = "30s"
|
||||
min_heartbeat_ttl = "33s"
|
||||
max_heartbeats_per_second = 11.0
|
||||
retry_join = [ "1.1.1.1", "2.2.2.2" ]
|
||||
start_join = [ "1.1.1.1", "2.2.2.2" ]
|
||||
retry_max = 3
|
||||
|
|
|
@ -277,7 +277,16 @@ type ServerConfig struct {
|
|||
|
||||
// HeartbeatGrace is the grace period beyond the TTL to account for network,
|
||||
// processing delays and clock skew before marking a node as "down".
|
||||
HeartbeatGrace string `mapstructure:"heartbeat_grace"`
|
||||
HeartbeatGrace time.Duration `mapstructure:"heartbeat_grace"`
|
||||
|
||||
// MinHeartbeatTTL is the minimum time between heartbeats. This is used as
|
||||
// a floor to prevent excessive updates.
|
||||
MinHeartbeatTTL time.Duration `mapstructure:"min_heartbeat_ttl"`
|
||||
|
||||
// MaxHeartbeatsPerSecond is the maximum target rate of heartbeats
|
||||
// being processed per second. This allows the TTL to be increased
|
||||
// to meet the target rate.
|
||||
MaxHeartbeatsPerSecond float64 `mapstructure:"max_heartbeats_per_second"`
|
||||
|
||||
// StartJoin is a list of addresses to attempt to join when the
|
||||
// agent starts. If Serf is unable to communicate with any of these
|
||||
|
@ -924,9 +933,15 @@ func (a *ServerConfig) Merge(b *ServerConfig) *ServerConfig {
|
|||
if b.DeploymentGCThreshold != "" {
|
||||
result.DeploymentGCThreshold = b.DeploymentGCThreshold
|
||||
}
|
||||
if b.HeartbeatGrace != "" {
|
||||
if b.HeartbeatGrace != 0 {
|
||||
result.HeartbeatGrace = b.HeartbeatGrace
|
||||
}
|
||||
if b.MinHeartbeatTTL != 0 {
|
||||
result.MinHeartbeatTTL = b.MinHeartbeatTTL
|
||||
}
|
||||
if b.MaxHeartbeatsPerSecond != 0.0 {
|
||||
result.MaxHeartbeatsPerSecond = b.MaxHeartbeatsPerSecond
|
||||
}
|
||||
if b.RetryMaxAttempts != 0 {
|
||||
result.RetryMaxAttempts = b.RetryMaxAttempts
|
||||
}
|
||||
|
|
|
@ -506,6 +506,8 @@ func parseServer(result **ServerConfig, list *ast.ObjectList) error {
|
|||
"job_gc_threshold",
|
||||
"deployment_gc_threshold",
|
||||
"heartbeat_grace",
|
||||
"min_heartbeat_ttl",
|
||||
"max_heartbeats_per_second",
|
||||
"start_join",
|
||||
"retry_join",
|
||||
"retry_max",
|
||||
|
@ -523,7 +525,15 @@ func parseServer(result **ServerConfig, list *ast.ObjectList) error {
|
|||
}
|
||||
|
||||
var config ServerConfig
|
||||
if err := mapstructure.WeakDecode(m, &config); err != nil {
|
||||
dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{
|
||||
DecodeHook: mapstructure.StringToTimeDurationHookFunc(),
|
||||
WeaklyTypedInput: true,
|
||||
Result: &config,
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := dec.Decode(m); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
|
|
|
@ -82,23 +82,25 @@ func TestConfig_Parse(t *testing.T) {
|
|||
NoHostUUID: helper.BoolToPtr(false),
|
||||
},
|
||||
Server: &ServerConfig{
|
||||
Enabled: true,
|
||||
BootstrapExpect: 5,
|
||||
DataDir: "/tmp/data",
|
||||
ProtocolVersion: 3,
|
||||
NumSchedulers: 2,
|
||||
EnabledSchedulers: []string{"test"},
|
||||
NodeGCThreshold: "12h",
|
||||
EvalGCThreshold: "12h",
|
||||
JobGCThreshold: "12h",
|
||||
DeploymentGCThreshold: "12h",
|
||||
HeartbeatGrace: "30s",
|
||||
RetryJoin: []string{"1.1.1.1", "2.2.2.2"},
|
||||
StartJoin: []string{"1.1.1.1", "2.2.2.2"},
|
||||
RetryInterval: "15s",
|
||||
RejoinAfterLeave: true,
|
||||
RetryMaxAttempts: 3,
|
||||
EncryptKey: "abc",
|
||||
Enabled: true,
|
||||
BootstrapExpect: 5,
|
||||
DataDir: "/tmp/data",
|
||||
ProtocolVersion: 3,
|
||||
NumSchedulers: 2,
|
||||
EnabledSchedulers: []string{"test"},
|
||||
NodeGCThreshold: "12h",
|
||||
EvalGCThreshold: "12h",
|
||||
JobGCThreshold: "12h",
|
||||
DeploymentGCThreshold: "12h",
|
||||
HeartbeatGrace: 30 * time.Second,
|
||||
MinHeartbeatTTL: 33 * time.Second,
|
||||
MaxHeartbeatsPerSecond: 11.0,
|
||||
RetryJoin: []string{"1.1.1.1", "2.2.2.2"},
|
||||
StartJoin: []string{"1.1.1.1", "2.2.2.2"},
|
||||
RetryInterval: "15s",
|
||||
RejoinAfterLeave: true,
|
||||
RetryMaxAttempts: 3,
|
||||
EncryptKey: "abc",
|
||||
},
|
||||
Telemetry: &Telemetry{
|
||||
StatsiteAddr: "127.0.0.1:1234",
|
||||
|
|
|
@ -90,13 +90,15 @@ func TestConfig_Merge(t *testing.T) {
|
|||
},
|
||||
},
|
||||
Server: &ServerConfig{
|
||||
Enabled: false,
|
||||
BootstrapExpect: 1,
|
||||
DataDir: "/tmp/data1",
|
||||
ProtocolVersion: 1,
|
||||
NumSchedulers: 1,
|
||||
NodeGCThreshold: "1h",
|
||||
HeartbeatGrace: "30s",
|
||||
Enabled: false,
|
||||
BootstrapExpect: 1,
|
||||
DataDir: "/tmp/data1",
|
||||
ProtocolVersion: 1,
|
||||
NumSchedulers: 1,
|
||||
NodeGCThreshold: "1h",
|
||||
HeartbeatGrace: 30 * time.Second,
|
||||
MinHeartbeatTTL: 30 * time.Second,
|
||||
MaxHeartbeatsPerSecond: 30.0,
|
||||
},
|
||||
Ports: &Ports{
|
||||
HTTP: 4646,
|
||||
|
@ -220,19 +222,21 @@ func TestConfig_Merge(t *testing.T) {
|
|||
GCInodeUsageThreshold: 86,
|
||||
},
|
||||
Server: &ServerConfig{
|
||||
Enabled: true,
|
||||
BootstrapExpect: 2,
|
||||
DataDir: "/tmp/data2",
|
||||
ProtocolVersion: 2,
|
||||
NumSchedulers: 2,
|
||||
EnabledSchedulers: []string{structs.JobTypeBatch},
|
||||
NodeGCThreshold: "12h",
|
||||
HeartbeatGrace: "2m",
|
||||
RejoinAfterLeave: true,
|
||||
StartJoin: []string{"1.1.1.1"},
|
||||
RetryJoin: []string{"1.1.1.1"},
|
||||
RetryInterval: "10s",
|
||||
retryInterval: time.Second * 10,
|
||||
Enabled: true,
|
||||
BootstrapExpect: 2,
|
||||
DataDir: "/tmp/data2",
|
||||
ProtocolVersion: 2,
|
||||
NumSchedulers: 2,
|
||||
EnabledSchedulers: []string{structs.JobTypeBatch},
|
||||
NodeGCThreshold: "12h",
|
||||
HeartbeatGrace: 2 * time.Minute,
|
||||
MinHeartbeatTTL: 2 * time.Minute,
|
||||
MaxHeartbeatsPerSecond: 200.0,
|
||||
RejoinAfterLeave: true,
|
||||
StartJoin: []string{"1.1.1.1"},
|
||||
RetryJoin: []string{"1.1.1.1"},
|
||||
RetryInterval: "10s",
|
||||
retryInterval: time.Second * 10,
|
||||
},
|
||||
Ports: &Ports{
|
||||
HTTP: 20000,
|
||||
|
|
|
@ -80,6 +80,23 @@ server {
|
|||
deployment must be in the terminal state before it is eligible for garbage
|
||||
collection. This is specified using a label suffix like "30s" or "1h".
|
||||
|
||||
- `heartbeat_grace` `(string: "10s")` - Specifies the additional time given as a
|
||||
grace period beyond the heartbeat TTL of nodes to account for network and
|
||||
processing delays as well as clock skew. This is specified using a label
|
||||
suffix like "30s" or "1h".
|
||||
|
||||
- `min_heartbeat_ttl` `(string: "10s")` - Specifies the minimum time between
|
||||
node heartbeats. This is used as a floor to prevent excessive updates. This is
|
||||
specified using a label suffix like "30s" or "1h". Lowering the minimum TTL is
|
||||
a tradeoff as it lowers failure detection time of nodes at the cost of
|
||||
false positives and increased load on the leader.
|
||||
|
||||
- `max_heartbeats_per_second` `(float: 50.0)` - Specifies the maximum target
|
||||
rate of heartbeats being processed per second. This allows the TTL to be
|
||||
increased to meet the target rate. Increasing the maximum heartbeats per
|
||||
second is a tradeoff as it lowers failure detection time of nodes at the
|
||||
cost of false positives and increased load on the leader.
|
||||
|
||||
- `num_schedulers` `(int: [num-cores])` - Specifies the number of parallel
|
||||
scheduler threads to run. This can be as many as one per core, or `0` to
|
||||
disallow this server from making any scheduling decisions. This defaults to
|
||||
|
|
Loading…
Reference in a new issue