diff --git a/changelog/22040.txt b/changelog/22040.txt new file mode 100644 index 000000000..e96a428b9 --- /dev/null +++ b/changelog/22040.txt @@ -0,0 +1,3 @@ +```release-note:improvement +storage/raft: Enforce a minimum dead_server_last_contact_threshold of 1m. +``` diff --git a/vault/external_tests/raft/raft_autopilot_test.go b/vault/external_tests/raft/raft_autopilot_test.go index 9b2f83c18..9185d5b94 100644 --- a/vault/external_tests/raft/raft_autopilot_test.go +++ b/vault/external_tests/raft/raft_autopilot_test.go @@ -190,6 +190,14 @@ func TestRaft_Autopilot_Configuration(t *testing.T) { writeConfigFunc(writableConfig, true) configCheckFunc(config) + // Check dead server last contact threshold minimum + writableConfig = map[string]interface{}{ + "cleanup_dead_servers": true, + "dead_server_last_contact_threshold": "5s", + } + writeConfigFunc(writableConfig, true) + configCheckFunc(config) + // Ensure that the configuration stays across reboots leaderCore := cluster.Cores[0] testhelpers.EnsureCoreSealed(t, cluster.Cores[0]) @@ -442,7 +450,7 @@ func TestRaft_Autopilot_DeadServerCleanup(t *testing.T) { // Ensure Autopilot has the aggressive settings config.CleanupDeadServers = true config.ServerStabilizationTime = 5 * time.Second - config.DeadServerLastContactThreshold = 10 * time.Second + config.DeadServerLastContactThreshold = 1 * time.Minute config.MaxTrailingLogs = 10 config.LastContactThreshold = 10 * time.Second config.MinQuorum = 3 diff --git a/vault/logical_system_raft.go b/vault/logical_system_raft.go index ca475eddc..483c24fc5 100644 --- a/vault/logical_system_raft.go +++ b/vault/logical_system_raft.go @@ -533,6 +533,10 @@ func (b *SystemBackend) handleStorageRaftAutopilotConfigUpdate() framework.Opera return logical.ErrorResponse(fmt.Sprintf("min_quorum must be set when cleanup_dead_servers is set and it should at least be 3; cleanup_dead_servers: %#v, min_quorum: %#v", effectiveConf.CleanupDeadServers, effectiveConf.MinQuorum)), 
logical.ErrInvalidRequest } + if effectiveConf.CleanupDeadServers && effectiveConf.DeadServerLastContactThreshold.Seconds() < 60 { + return logical.ErrorResponse(fmt.Sprintf("dead_server_last_contact_threshold should not be set to less than 1m; received: %v", deadServerLastContactThreshold)), logical.ErrInvalidRequest + } + // Persist only the user supplied fields if persist { entry, err := logical.StorageEntryJSON(raftAutopilotConfigurationStoragePath, config) diff --git a/website/content/api-docs/system/storage/raftautopilot.mdx b/website/content/api-docs/system/storage/raftautopilot.mdx index 18eff4c8d..6c8e7c9d5 100644 --- a/website/content/api-docs/system/storage/raftautopilot.mdx +++ b/website/content/api-docs/system/storage/raftautopilot.mdx @@ -210,7 +210,8 @@ This endpoint is used to modify the configuration of the autopilot subsystem of - `dead_server_last_contact_threshold` `(string: "24h")` - Limit on the amount of time a server can go without leader contact before being considered failed. This - takes effect only when `cleanup_dead_servers` is `true`. + takes effect only when `cleanup_dead_servers` is `true`. This cannot be set to a value + smaller than 1m. - `max_trailing_logs` `(int: 1000)` - Amount of entries in the Raft Log that a server can be behind before being considered unhealthy.