package rafttests

import (
	"context"
	"math"
	"testing"
	"time"

	"github.com/hashicorp/vault/api"
	"github.com/kr/pretty"

	autopilot "github.com/hashicorp/raft-autopilot"

	"github.com/stretchr/testify/require"

	"github.com/hashicorp/vault/helper/namespace"
	"github.com/hashicorp/vault/helper/testhelpers"
	"github.com/hashicorp/vault/physical/raft"
	"github.com/hashicorp/vault/vault"
)

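// TestRaft_Autopilot_Disable verifies that the autopilot state endpoint
// reports nothing when autopilot has not been enabled for the cluster.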
func TestRaft_Autopilot_Disable(t *testing.T) {
	cluster := raftCluster(t, &RaftClusterOpts{
		DisableFollowerJoins: true,
		InmemCluster:         true,
		// Not setting EnableAutopilot here.
	})
	defer cluster.Cleanup()

	client := cluster.Cores[0].Client

	state, err := client.Sys().RaftAutopilotState()
	require.NoError(t, err)
	require.Nil(t, state)
}

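// TestRaft_Autopilot_Stabilization_And_State verifies that nodes joining an
// autopilot-managed cluster start out as non-voters and are promoted to
// voters once the server stabilization period has elapsed.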
func TestRaft_Autopilot_Stabilization_And_State(t *testing.T) {
	cluster := raftCluster(t, &RaftClusterOpts{
		DisableFollowerJoins: true,
		InmemCluster:         true,
		EnableAutopilot:      true,
	})
	defer cluster.Cleanup()

	// Check that autopilot execution state is running
	client := cluster.Cores[0].Client
	state, err := client.Sys().RaftAutopilotState()
	require.NoError(t, err)
	require.Equal(t, true, state.Healthy)
	require.Len(t, state.Servers, 1)
	require.Equal(t, "core-0", state.Servers["core-0"].ID)
	require.Equal(t, "alive", state.Servers["core-0"].NodeStatus)
	require.Equal(t, "leader", state.Servers["core-0"].Status)

	config, err := client.Sys().RaftAutopilotConfiguration()
	require.NoError(t, err)

	// Wait for 110% of the stabilization time to add nodes
	stabilizationKickOffWaitDuration := time.Duration(math.Ceil(1.1 * float64(config.ServerStabilizationTime)))
	time.Sleep(stabilizationKickOffWaitDuration)

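	// Each joinAndStabilizeFunc call below takes a node through the expected
	// autopilot lifecycle: it joins as a non-voter, the cluster reports
	// unhealthy until the stabilization window passes, and autopilot's next
	// reconcile pass promotes the node to a voter.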
	joinAndStabilizeFunc := func(core *vault.TestClusterCore, nodeID string, numServers int) {
		joinFunc := func(core *vault.TestClusterCore) {
			_, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), []*raft.LeaderJoinInfo{
				{
					LeaderAPIAddr: client.Address(),
					TLSConfig:     cluster.Cores[0].TLSConfig,
					Retry:         true,
				},
			}, false)
			require.NoError(t, err)
			time.Sleep(1 * time.Second)
			cluster.UnsealCore(t, core)
		}
		joinFunc(core)

		// A freshly joined node starts out as an unhealthy non-voter.
		state, err = client.Sys().RaftAutopilotState()
		require.NoError(t, err)
		require.Equal(t, false, state.Healthy)
		require.Len(t, state.Servers, numServers)
		require.Equal(t, false, state.Servers[nodeID].Healthy)
		require.Equal(t, "alive", state.Servers[nodeID].NodeStatus)
		require.Equal(t, "non-voter", state.Servers[nodeID].Status)

		// Wait till the stabilization period is over
		deadline := time.Now().Add(config.ServerStabilizationTime)
		healthy := false
		for time.Now().Before(deadline) {
			state, err := client.Sys().RaftAutopilotState()
			require.NoError(t, err)
			if state.Healthy {
				healthy = true
				break
			}
			time.Sleep(1 * time.Second)
		}
		if !healthy {
			t.Fatalf("cluster failed to stabilize")
		}

		// Now that the server is stable, wait for autopilot to reconcile and
		// promotion to happen. Reconcile interval is 10 seconds. Bound it by
		// doubling.
		deadline = time.Now().Add(2 * autopilot.DefaultReconcileInterval)
		failed := true
		for time.Now().Before(deadline) {
			state, err = client.Sys().RaftAutopilotState()
			require.NoError(t, err)
			if state.Servers[nodeID].Status == "voter" {
				failed = false
				break
			}
			time.Sleep(1 * time.Second)
		}

		if failed {
			t.Fatalf("autopilot failed to promote node: id: %#v: state: %# v\n", nodeID, pretty.Formatter(state))
		}
	}
	joinAndStabilizeFunc(cluster.Cores[1], "core-1", 2)
	joinAndStabilizeFunc(cluster.Cores[2], "core-2", 3)

	// Both followers should have been promoted to voters by now.
	state, err = client.Sys().RaftAutopilotState()
	require.NoError(t, err)
	require.Equal(t, []string{"core-0", "core-1", "core-2"}, state.Voters)
}

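// TestRaft_Autopilot_Configuration exercises the autopilot configuration API:
// defaults, full and partial updates, a rejected update, and persistence
// across a seal/unseal cycle.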
func TestRaft_Autopilot_Configuration(t *testing.T) {
	cluster := raftCluster(t, &RaftClusterOpts{
		DisableFollowerJoins: true,
		InmemCluster:         true,
		EnableAutopilot:      true,
	})
	defer cluster.Cleanup()

	client := cluster.Cores[0].Client

	configCheckFunc := func(config *api.AutopilotConfig) {
		conf, err := client.Sys().RaftAutopilotConfiguration()
		require.NoError(t, err)
		require.Equal(t, config, conf)
	}

	writeConfigFunc := func(config map[string]interface{}, expectError bool) {
		resp, err := client.Logical().Write("sys/storage/raft/autopilot/configuration", config)
		if expectError {
			require.Error(t, err)
			return
		}
		require.NoError(t, err)
		require.Nil(t, resp)
	}

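	// For reference, these helpers drive the same
	// sys/storage/raft/autopilot/configuration endpoint that the
	// `vault operator raft autopilot get-config` and `set-config` CLI
	// commands wrap.
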
	// Ensure autopilot's default config has taken effect
	config := &api.AutopilotConfig{
		CleanupDeadServers:             false,
		DeadServerLastContactThreshold: 24 * time.Hour,
		LastContactThreshold:           10 * time.Second,
		MaxTrailingLogs:                1000,
		ServerStabilizationTime:        10 * time.Second,
	}
	configCheckFunc(config)

	// Update config
	writableConfig := map[string]interface{}{
		"cleanup_dead_servers":               true,
		"dead_server_last_contact_threshold": "100h",
		"last_contact_threshold":             "100s",
		"max_trailing_logs":                  100,
		"min_quorum":                         100,
		"server_stabilization_time":          "100s",
	}
	writeConfigFunc(writableConfig, false)

	// Ensure update has taken effect
	config.CleanupDeadServers = true
	config.DeadServerLastContactThreshold = 100 * time.Hour
	config.LastContactThreshold = 100 * time.Second
	config.MaxTrailingLogs = 100
	config.MinQuorum = 100
	config.ServerStabilizationTime = 100 * time.Second
	configCheckFunc(config)

	// Update some fields and leave the rest as they are.
	writableConfig = map[string]interface{}{
		"dead_server_last_contact_threshold": "50h",
		"max_trailing_logs":                  50,
		"server_stabilization_time":          "50s",
	}
	writeConfigFunc(writableConfig, false)

	// Check that the updated fields changed and the rest were preserved
	config.DeadServerLastContactThreshold = 50 * time.Hour
	config.MaxTrailingLogs = 50
	config.ServerStabilizationTime = 50 * time.Second
	configCheckFunc(config)

	// Check the error case: a min_quorum below 3 is expected to be rejected
	// by the server, leaving the stored configuration unchanged.
	writableConfig = map[string]interface{}{
		"min_quorum":                         2,
		"dead_server_last_contact_threshold": "48h",
	}
	writeConfigFunc(writableConfig, true)
	configCheckFunc(config)

	// Ensure that the configuration persists across restarts, simulated here
	// by a seal/unseal cycle of the leader
	leaderCore := cluster.Cores[0]
	testhelpers.EnsureCoreSealed(t, leaderCore)
	cluster.UnsealCore(t, leaderCore)
	vault.TestWaitActive(t, leaderCore.Core)
	configCheckFunc(config)
}