package rafttests

import (
	"context"
	"math"
	"testing"
	"time"

	"github.com/hashicorp/vault/api"
	"github.com/kr/pretty"

	autopilot "github.com/hashicorp/raft-autopilot"

	"github.com/stretchr/testify/require"

	"github.com/hashicorp/vault/helper/namespace"
	"github.com/hashicorp/vault/helper/testhelpers"
	"github.com/hashicorp/vault/physical/raft"
	"github.com/hashicorp/vault/vault"
)

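// TestRaft_Autopilot_Disable verifies that the autopilot state endpoint
// reports nothing when autopilot has not been enabled for the cluster.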
func TestRaft_Autopilot_Disable(t *testing.T) {
	cluster := raftCluster(t, &RaftClusterOpts{
		DisableFollowerJoins: true,
		InmemCluster:         true,
		// Not setting EnableAutopilot here.
	})
	defer cluster.Cleanup()

	client := cluster.Cores[0].Client

	state, err := client.Sys().RaftAutopilotState()
	require.NoError(t, err)
	require.Nil(t, state)
}

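// TestRaft_Autopilot_Stabilization_And_State verifies that nodes joining an
// autopilot-managed cluster start out as non-voters and are promoted to
// voters once the server stabilization period has elapsed.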
func TestRaft_Autopilot_Stabilization_And_State(t *testing.T) {
	cluster := raftCluster(t, &RaftClusterOpts{
		DisableFollowerJoins: true,
		InmemCluster:         true,
		EnableAutopilot:      true,
	})
	defer cluster.Cleanup()

	// Check that autopilot execution state is running
	client := cluster.Cores[0].Client
	state, err := client.Sys().RaftAutopilotState()
	require.NoError(t, err)
	require.Equal(t, true, state.Healthy)
	require.Len(t, state.Servers, 1)
	require.Equal(t, "core-0", state.Servers["core-0"].ID)
	require.Equal(t, "alive", state.Servers["core-0"].NodeStatus)
	require.Equal(t, "leader", state.Servers["core-0"].Status)

	config, err := client.Sys().RaftAutopilotConfiguration()
	require.NoError(t, err)

	// Wait for 110% of the stabilization time to add nodes
	stabilizationKickOffWaitDuration := time.Duration(math.Ceil(1.1 * float64(config.ServerStabilizationTime)))
	time.Sleep(stabilizationKickOffWaitDuration)

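	// Each joinAndStabilizeFunc call below takes a node through the expected
	// autopilot lifecycle: it joins as a non-voter, the cluster reports
	// unhealthy until the stabilization window passes, and autopilot's next
	// reconcile pass promotes the node to a voter.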
	joinAndStabilizeFunc := func(core *vault.TestClusterCore, nodeID string, numServers int) {
		joinFunc := func(core *vault.TestClusterCore) {
			_, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), []*raft.LeaderJoinInfo{
				{
					LeaderAPIAddr: client.Address(),
					TLSConfig:     cluster.Cores[0].TLSConfig,
					Retry:         true,
				},
			}, false)
			require.NoError(t, err)
			time.Sleep(1 * time.Second)
			cluster.UnsealCore(t, core)
		}
		joinFunc(core)

		// A freshly joined node starts out as an unhealthy non-voter.
		state, err = client.Sys().RaftAutopilotState()
		require.NoError(t, err)
		require.Equal(t, false, state.Healthy)
		require.Len(t, state.Servers, numServers)
		require.Equal(t, false, state.Servers[nodeID].Healthy)
		require.Equal(t, "alive", state.Servers[nodeID].NodeStatus)
		require.Equal(t, "non-voter", state.Servers[nodeID].Status)

		// Wait till the stabilization period is over
		deadline := time.Now().Add(config.ServerStabilizationTime)
		healthy := false
		for time.Now().Before(deadline) {
			state, err := client.Sys().RaftAutopilotState()
			require.NoError(t, err)
			if state.Healthy {
				healthy = true
				break
			}
			time.Sleep(1 * time.Second)
		}
		if !healthy {
			t.Fatalf("cluster failed to stabilize")
		}

		// Now that the server is stable, wait for autopilot to reconcile and
		// promotion to happen. Reconcile interval is 10 seconds. Bound it by
		// doubling.
		deadline = time.Now().Add(2 * autopilot.DefaultReconcileInterval)
		failed := true
		for time.Now().Before(deadline) {
			state, err = client.Sys().RaftAutopilotState()
			require.NoError(t, err)
			if state.Servers[nodeID].Status == "voter" {
				failed = false
				break
			}
			time.Sleep(1 * time.Second)
		}

		if failed {
			t.Fatalf("autopilot failed to promote node: id: %#v: state: %# v\n", nodeID, pretty.Formatter(state))
		}
	}
	joinAndStabilizeFunc(cluster.Cores[1], "core-1", 2)
	joinAndStabilizeFunc(cluster.Cores[2], "core-2", 3)

	// Both followers should have been promoted to voters by now.
	state, err = client.Sys().RaftAutopilotState()
	require.NoError(t, err)
	require.Equal(t, []string{"core-0", "core-1", "core-2"}, state.Voters)
}

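// TestRaft_Autopilot_Configuration exercises the autopilot configuration API:
// defaults, full and partial updates, a rejected update, and persistence
// across a seal/unseal cycle.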
func TestRaft_Autopilot_Configuration(t *testing.T) {
	cluster := raftCluster(t, &RaftClusterOpts{
		DisableFollowerJoins: true,
		InmemCluster:         true,
		EnableAutopilot:      true,
	})
	defer cluster.Cleanup()

	client := cluster.Cores[0].Client

	configCheckFunc := func(config *api.AutopilotConfig) {
		conf, err := client.Sys().RaftAutopilotConfiguration()
		require.NoError(t, err)
		require.Equal(t, config, conf)
	}

	writeConfigFunc := func(config map[string]interface{}, expectError bool) {
		resp, err := client.Logical().Write("sys/storage/raft/autopilot/configuration", config)
		if expectError {
			require.Error(t, err)
			return
		}
		require.NoError(t, err)
		require.Nil(t, resp)
	}

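	// For reference, these helpers drive the same
	// sys/storage/raft/autopilot/configuration endpoint that the
	// `vault operator raft autopilot get-config` and `set-config` CLI
	// commands wrap.
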
	// Ensure autopilot's default config has taken effect
	config := &api.AutopilotConfig{
		CleanupDeadServers:             false,
		DeadServerLastContactThreshold: 24 * time.Hour,
		LastContactThreshold:           10 * time.Second,
		MaxTrailingLogs:                1000,
		ServerStabilizationTime:        10 * time.Second,
	}
	configCheckFunc(config)

	// Update config
	writableConfig := map[string]interface{}{
		"cleanup_dead_servers":               true,
		"dead_server_last_contact_threshold": "100h",
		"last_contact_threshold":             "100s",
		"max_trailing_logs":                  100,
		"min_quorum":                         100,
		"server_stabilization_time":          "100s",
	}
	writeConfigFunc(writableConfig, false)

	// Ensure update has taken effect
	config.CleanupDeadServers = true
	config.DeadServerLastContactThreshold = 100 * time.Hour
	config.LastContactThreshold = 100 * time.Second
	config.MaxTrailingLogs = 100
	config.MinQuorum = 100
	config.ServerStabilizationTime = 100 * time.Second
	configCheckFunc(config)

	// Update some fields and leave the rest as they are.
	writableConfig = map[string]interface{}{
		"dead_server_last_contact_threshold": "50h",
		"max_trailing_logs":                  50,
		"server_stabilization_time":          "50s",
	}
	writeConfigFunc(writableConfig, false)

	// Check that the updated fields changed and the rest were preserved
	config.DeadServerLastContactThreshold = 50 * time.Hour
	config.MaxTrailingLogs = 50
	config.ServerStabilizationTime = 50 * time.Second
	configCheckFunc(config)

	// Check the error case: a min_quorum below 3 is expected to be rejected
	// by the server, leaving the stored configuration unchanged.
	writableConfig = map[string]interface{}{
		"min_quorum":                         2,
		"dead_server_last_contact_threshold": "48h",
	}
	writeConfigFunc(writableConfig, true)
	configCheckFunc(config)

	// Ensure that the configuration persists across restarts, simulated here
	// by a seal/unseal cycle of the leader
	leaderCore := cluster.Cores[0]
	testhelpers.EnsureCoreSealed(t, leaderCore)
	cluster.UnsealCore(t, leaderCore)
	vault.TestWaitActive(t, leaderCore.Core)
	configCheckFunc(config)
}