Merge pull request #8082 from hashicorp/f-raft-multipler
Implement raft multiplier flag
Commit 963b1251ff
@@ -43,6 +43,14 @@ const (
 	// roles used in identifying Consul entries for Nomad agents
 	consulRoleServer = "server"
 	consulRoleClient = "client"
+
+	// DefaultRaftMultiplier is used as a baseline Raft configuration that
+	// will be reliable on a very basic server.
+	DefaultRaftMultiplier = 1
+
+	// MaxRaftMultiplier is a fairly arbitrary upper bound that limits the
+	// amount of performance detuning that's possible.
+	MaxRaftMultiplier = 10
 )

 // Agent is a long running daemon that is used to run both
@@ -180,6 +188,18 @@ func convertServerConfig(agentConfig *Config) (*nomad.Config, error) {
 	if agentConfig.Server.RaftProtocol != 0 {
 		conf.RaftConfig.ProtocolVersion = raft.ProtocolVersion(agentConfig.Server.RaftProtocol)
 	}
+	raftMultiplier := int(DefaultRaftMultiplier)
+	if agentConfig.Server.RaftMultiplier != nil && *agentConfig.Server.RaftMultiplier != 0 {
+		raftMultiplier = *agentConfig.Server.RaftMultiplier
+		if raftMultiplier < 1 || raftMultiplier > MaxRaftMultiplier {
+			return nil, fmt.Errorf("raft_multiplier cannot be %d. Must be between 1 and %d", *agentConfig.Server.RaftMultiplier, MaxRaftMultiplier)
+		}
+	}
+	conf.RaftConfig.ElectionTimeout *= time.Duration(raftMultiplier)
+	conf.RaftConfig.HeartbeatTimeout *= time.Duration(raftMultiplier)
+	conf.RaftConfig.LeaderLeaseTimeout *= time.Duration(raftMultiplier)
+	conf.RaftConfig.CommitTimeout *= time.Duration(raftMultiplier)
+
 	if agentConfig.Server.NumSchedulers != nil {
 		conf.NumSchedulers = *agentConfig.Server.NumSchedulers
 	}
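To see what this scaling amounts to in isolation, here is a minimal, self-contained sketch (not part of the change) against the stock hashicorp/raft defaults, which are the same baseline the tests below hard-code; `scaleRaftTimings` is a hypothetical helper name:

```go
package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/raft"
)

// scaleRaftTimings applies the same linear scaling that
// convertServerConfig performs above.
func scaleRaftTimings(conf *raft.Config, multiplier int) {
	conf.ElectionTimeout *= time.Duration(multiplier)
	conf.HeartbeatTimeout *= time.Duration(multiplier)
	conf.LeaderLeaseTimeout *= time.Duration(multiplier)
	conf.CommitTimeout *= time.Duration(multiplier)
}

func main() {
	// hashicorp/raft defaults: 1s election/heartbeat, 500ms lease, 50ms commit.
	conf := raft.DefaultConfig()
	scaleRaftTimings(conf, 5)
	fmt.Println(conf.ElectionTimeout)    // 5s
	fmt.Println(conf.LeaderLeaseTimeout) // 2.5s
	fmt.Println(conf.CommitTimeout)      // 250ms
}
```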
@@ -1,6 +1,7 @@
 package agent

 import (
+	"fmt"
 	"io/ioutil"
 	"os"
 	"path/filepath"
@@ -362,6 +363,113 @@ func TestAgent_ServerConfig_Limits_OK(t *testing.T) {
 	}
 }
+
+func TestAgent_ServerConfig_RaftMultiplier_Ok(t *testing.T) {
+	t.Parallel()
+
+	cases := []struct {
+		multiplier         *int
+		electionTimout     time.Duration
+		heartbeatTimeout   time.Duration
+		leaderLeaseTimeout time.Duration
+		commitTimeout      time.Duration
+	}{
+		// nil, 0 are the defaults of the Raft library.
+		// Expected values are hardcoded to detect changes from raft.
+		{
+			multiplier: nil,
+
+			electionTimout:     1 * time.Second,
+			heartbeatTimeout:   1 * time.Second,
+			leaderLeaseTimeout: 500 * time.Millisecond,
+			commitTimeout:      50 * time.Millisecond,
+		},
+
+		{
+			multiplier: helper.IntToPtr(0),
+
+			electionTimout:     1 * time.Second,
+			heartbeatTimeout:   1 * time.Second,
+			leaderLeaseTimeout: 500 * time.Millisecond,
+			commitTimeout:      50 * time.Millisecond,
+		},
+		{
+			multiplier: helper.IntToPtr(1),
+
+			electionTimout:     1 * time.Second,
+			heartbeatTimeout:   1 * time.Second,
+			leaderLeaseTimeout: 500 * time.Millisecond,
+			commitTimeout:      50 * time.Millisecond,
+		},
+		{
+			multiplier: helper.IntToPtr(5),
+
+			electionTimout:     5 * time.Second,
+			heartbeatTimeout:   5 * time.Second,
+			leaderLeaseTimeout: 2500 * time.Millisecond,
+			commitTimeout:      250 * time.Millisecond,
+		},
+		{
+			multiplier: helper.IntToPtr(6),
+
+			electionTimout:     6 * time.Second,
+			heartbeatTimeout:   6 * time.Second,
+			leaderLeaseTimeout: 3000 * time.Millisecond,
+			commitTimeout:      300 * time.Millisecond,
+		},
+		{
+			multiplier: helper.IntToPtr(10),
+
+			electionTimout:     10 * time.Second,
+			heartbeatTimeout:   10 * time.Second,
+			leaderLeaseTimeout: 5000 * time.Millisecond,
+			commitTimeout:      500 * time.Millisecond,
+		},
+	}
+
+	for _, tc := range cases {
+		v := "default"
+		if tc.multiplier != nil {
+			v = fmt.Sprintf("%v", *tc.multiplier)
+		}
+		t.Run(v, func(t *testing.T) {
+			conf := DevConfig(nil)
+			require.NoError(t, conf.normalizeAddrs())
+
+			conf.Server.RaftMultiplier = tc.multiplier
+
+			serverConf, err := convertServerConfig(conf)
+			require.NoError(t, err)
+
+			assert.Equal(t, tc.electionTimout, serverConf.RaftConfig.ElectionTimeout, "election timeout")
+			assert.Equal(t, tc.heartbeatTimeout, serverConf.RaftConfig.HeartbeatTimeout, "heartbeat timeout")
+			assert.Equal(t, tc.leaderLeaseTimeout, serverConf.RaftConfig.LeaderLeaseTimeout, "leader lease timeout")
+			assert.Equal(t, tc.commitTimeout, serverConf.RaftConfig.CommitTimeout, "commit timeout")
+		})
+	}
+}
+
+func TestAgent_ServerConfig_RaftMultiplier_Bad(t *testing.T) {
+	t.Parallel()
+
+	cases := []int{
+		-1,
+		100,
+	}
+
+	for _, tc := range cases {
+		t.Run(fmt.Sprintf("%v", tc), func(t *testing.T) {
+			conf := DevConfig(nil)
+			require.NoError(t, conf.normalizeAddrs())
+
+			conf.Server.RaftMultiplier = &tc
+
+			_, err := convertServerConfig(conf)
+			require.Error(t, err)
+			require.Contains(t, err.Error(), "raft_multiplier cannot be")
+		})
+	}
+}

 func TestAgent_ClientConfig(t *testing.T) {
 	t.Parallel()
 	conf := DefaultConfig()
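If you want to exercise just these new cases, `go test -run TestAgent_ServerConfig_RaftMultiplier ./command/agent` runs both the happy-path table and the out-of-range error cases (package path inferred from the testdata paths in this diff).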
@@ -355,6 +355,9 @@ type ServerConfig struct {
 	// RaftProtocol is the Raft protocol version to speak. This must be from [1-3].
 	RaftProtocol int `hcl:"raft_protocol"`

+	// RaftMultiplier scales the Raft timing parameters
+	RaftMultiplier *int `hcl:"raft_multiplier"`
+
 	// NumSchedulers is the number of scheduler thread that are run.
 	// This can be as many as one per core, or zero to disable this server
 	// from doing any scheduling work.
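Worth noting in this struct change: the field is a `*int` rather than an `int` so that an omitted `raft_multiplier` can be told apart from an explicit value after HCL decoding zero-fills the struct. A tiny sketch of that semantics (`serverConfig` and `effectiveMultiplier` are hypothetical names, not part of the change):

```go
package main

import "fmt"

// serverConfig stands in for the real ServerConfig; the pointer lets us
// distinguish "raft_multiplier was never set" (nil) from "set to 0".
type serverConfig struct {
	RaftMultiplier *int
}

// effectiveMultiplier mirrors the nil/zero handling in convertServerConfig:
// both unset and 0 fall back to the default of 1.
func effectiveMultiplier(c serverConfig) int {
	if c.RaftMultiplier != nil && *c.RaftMultiplier != 0 {
		return *c.RaftMultiplier
	}
	return 1
}

func main() {
	five := 5
	fmt.Println(effectiveMultiplier(serverConfig{}))                      // 1
	fmt.Println(effectiveMultiplier(serverConfig{RaftMultiplier: &five})) // 5
}
```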
@@ -1315,6 +1318,10 @@ func (a *ServerConfig) Merge(b *ServerConfig) *ServerConfig {
 	if b.RaftProtocol != 0 {
 		result.RaftProtocol = b.RaftProtocol
 	}
+	if b.RaftMultiplier != nil {
+		c := *b.RaftMultiplier
+		result.RaftMultiplier = &c
+	}
 	if b.NumSchedulers != nil {
 		result.NumSchedulers = helper.IntToPtr(*b.NumSchedulers)
 	}
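Note the copy in the merge: assigning `b.RaftMultiplier` directly would make the merged config alias its input, so a later mutation through one pointer would show up in both. A small illustrative sketch of the difference (not part of the change):

```go
package main

import "fmt"

func main() {
	v := 5
	src := &v

	// Aliasing: result and source share the same int, so mutating the
	// source after the merge silently changes the merged config too.
	aliased := src
	v = 99
	fmt.Println(*aliased) // 99

	// Copying, as Merge does above: the result owns an independent int.
	w := 5
	src = &w
	c := *src
	copied := &c
	w = 99
	fmt.Println(*copied) // 5
}
```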
@@ -97,6 +97,7 @@ var basicConfig = &Config{
 	DataDir:           "/tmp/data",
 	ProtocolVersion:   3,
 	RaftProtocol:      3,
+	RaftMultiplier:    helper.IntToPtr(4),
 	NumSchedulers:     helper.IntToPtr(2),
 	EnabledSchedulers: []string{"test"},
 	NodeGCThreshold:   "12h",
@@ -132,6 +132,7 @@ func TestConfig_Merge(t *testing.T) {
 	DataDir:         "/tmp/data1",
 	ProtocolVersion: 1,
 	RaftProtocol:    1,
+	RaftMultiplier:  helper.IntToPtr(5),
 	NumSchedulers:   helper.IntToPtr(1),
 	NodeGCThreshold: "1h",
 	HeartbeatGrace:  30 * time.Second,
@@ -317,6 +318,7 @@ func TestConfig_Merge(t *testing.T) {
 	DataDir:           "/tmp/data2",
 	ProtocolVersion:   2,
 	RaftProtocol:      2,
+	RaftMultiplier:    helper.IntToPtr(6),
 	NumSchedulers:     helper.IntToPtr(2),
 	EnabledSchedulers: []string{structs.JobTypeBatch},
 	NodeGCThreshold:   "12h",
@@ -425,9 +427,7 @@ func TestConfig_Merge(t *testing.T) {
 	result := c0.Merge(c1)
 	result = result.Merge(c2)
 	result = result.Merge(c3)
-	if !reflect.DeepEqual(result, c3) {
-		t.Fatalf("bad:\n%#v\n%#v", result, c3)
-	}
+	require.Equal(t, c3, result)
 }

 func TestConfig_ParseConfigFile(t *testing.T) {
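Replacing the `reflect.DeepEqual`/`t.Fatalf` pair with `require.Equal` keeps the same equality check but produces a readable field-level diff on failure instead of two opaque `%#v` dumps.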
command/agent/testdata/basic.hcl (vendored)
@@ -129,6 +129,7 @@ server {
 	redundancy_zone = "foo"
 	upgrade_version = "0.8.0"
 	encrypt         = "abc"
+	raft_multiplier = 4

 	server_join {
 		retry_join = ["1.1.1.1", "2.2.2.2"]
command/agent/testdata/basic.json (vendored)
@@ -276,6 +276,7 @@
 	"num_schedulers": 2,
 	"protocol_version": 3,
 	"raft_protocol": 3,
+	"raft_multiplier": 4,
 	"redundancy_zone": "foo",
 	"rejoin_after_leave": true,
 	"retry_interval": "15s",
@@ -141,6 +141,25 @@ server {
   features and is typically not required as the agent internally knows the
   latest version, but may be useful in some upgrade scenarios.

+- `raft_multiplier` `(int: 1)` - An integer multiplier used by Nomad servers to
+  scale key Raft timing parameters. Omitting this value or setting it to 0 uses
+  the default timing described below. Lower values are used to tighten timing
+  and increase sensitivity, while higher values relax timings and reduce
+  sensitivity. Tuning this affects the time it takes Nomad to detect leader
+  failures and to perform leader elections, at the expense of requiring more
+  network and CPU resources for better performance. The maximum allowed value is 10.
+
+  By default, Nomad uses the highest-performance timing, currently equivalent
+  to setting this to a value of 1. Increasing the timings makes leader elections
+  less likely during periods of networking issues or resource starvation. Since
+  leader elections pause Nomad's normal work, it may be beneficial on slow or
+  unreliable networks to wait longer before electing a new leader. The tradeoff
+  when raising this value is that during network partitions or other events
+  (such as a server crash) where a leader is lost, Nomad will not elect a new
+  leader for a longer period of time than the default. The
+  [`nomad.nomad.leader.barrier` and `nomad.raft.leader.lastContact` metrics](/docs/telemetry/metrics)
+  are good indicators of how often leader elections occur and of Raft latency.
+
 - `redundancy_zone` `(string: "")` - (Enterprise-only) Specifies the redundancy
   zone that this server will be a part of for Autopilot management. For more
   information, see the [Autopilot Guide](https://learn.hashicorp.com/nomad/operating-nomad/autopilot).
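For a concrete sense of scale: per the agent tests in this change, `raft_multiplier = 5` stretches the election and heartbeat timeouts from 1s to 5s, the leader lease timeout from 500ms to 2.5s, and the commit timeout from 50ms to 250ms.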
@@ -275,4 +294,4 @@ server {
 [encryption]: https://learn.hashicorp.com/nomad/transport-security/gossip-encryption 'Nomad Encryption Overview'
 [server-join]: /docs/configuration/server_join 'Server Join'
 [update-scheduler-config]: /api-docs/operator#update-scheduler-configuration 'Scheduler Config'
-[bootstrapping a cluster]: /docs/faq#bootstrapping
+[bootstrapping a cluster]: /docs/faq#bootstrapping