Vault 6773/raft rejoin nonvoter (#16324)
* raft: Ensure init before setting suffrage

  As reported in https://hashicorp.atlassian.net/browse/VAULT-6773: the
  /sys/storage/raft/join endpoint is intended to be unauthenticated; we rely
  on the seal to manage trust. This made it possible to use repeated join
  requests to switch nodes from voter to non-voter. The screenshot in the
  ticket shows a 3-node cluster where vault_2 is the leader, and vault_3 and
  vault_4 are followers with non-voters set to false; we then sent two
  requests to the raft join endpoint to have vault_3 and vault_4 rejoin the
  cluster with non_voters:true (a sketch of such a request follows this
  message). This commit fixes the issue by delaying the call to
  SetDesiredSuffrage until after the initialization check, preventing
  unauthenticated mangling of voter status. Tested locally using
  https://github.com/hashicorp/vault-tools/blob/main/users/ncabatoff/cluster/raft.sh
  and the reproducer outlined in VAULT-6773.

* raft: Return join err on failure

  This is necessary to correctly distinguish errors returned from the join
  workflow. Previously, errors were being masked as timeouts.

* raft: Default autopilot parameters in teststorage

  Change some defaults so we don't have to pass in parameters or set them in
  the originating tests. These storage types are only used in two places:

  1) Raft HA testing
  2) Seal migration testing

  Both consumers have been tested and pass with this change.

* changelog: Unauthn voter status change bugfix
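The pre-fix issue boils down to re-posting a join request with the non-voter flag set against an already-joined follower. Below is a minimal, hypothetical reproducer sketch: the addresses and ports are assumed local test values, and the request shape follows the documented `non_voter` parameter of the join API; it is not taken from the ticket.

```go
// Hypothetical reproducer sketch for the pre-fix behavior: re-join an
// already-joined follower with non_voter set. Addresses are assumptions.
package main

import (
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	// Assumed layout: vault_3 (a follower) listens on :8202,
	// vault_2 (the leader) on :8200.
	body := []byte(`{"leader_api_addr": "http://127.0.0.1:8200", "non_voter": true}`)
	resp, err := http.Post(
		"http://127.0.0.1:8202/v1/sys/storage/raft/join",
		"application/json",
		bytes.NewReader(body),
	)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	// Before this fix, the node's desired suffrage could be flipped here
	// without any authentication; after the fix, the initialization check
	// short-circuits the join before SetDesiredSuffrage runs.
	fmt.Println("status:", resp.Status)
}
```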
parent 8169940284 · commit 439e35f50f
New changelog entry:

````diff
@@ -0,0 +1,3 @@
+```release-note:bug
+storage/raft (enterprise): Prevent unauthenticated voter status change with rejoin
+```
````
```diff
@@ -137,9 +137,11 @@ func RaftHAFactory(f PhysicalBackendBundler) func(t testing.T, coreIdx int, logg
 	nodeID := fmt.Sprintf("core-%d", coreIdx)
 	backendConf := map[string]string{
-		"path":                   raftDir,
-		"node_id":                nodeID,
-		"performance_multiplier": "8",
+		"path":                         raftDir,
+		"node_id":                      nodeID,
+		"performance_multiplier":       "8",
+		"autopilot_reconcile_interval": "300ms",
+		"autopilot_update_interval":    "100ms",
 	}
 
 	// Create and set the HA Backend
```
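The two autopilot intervals above are plain strings in the backend conf map and are parsed as durations when the backend is constructed. A standalone sketch of that kind of validation, assuming only the standard library (`validateConf` is an illustrative helper, not part of this change):

```go
package main

import (
	"fmt"
	"time"
)

// validateConf checks that duration-style conf values, like the autopilot
// intervals above, parse as Go durations before the backend uses them.
func validateConf(conf map[string]string, durationKeys ...string) error {
	for _, k := range durationKeys {
		if v, ok := conf[k]; ok {
			if _, err := time.ParseDuration(v); err != nil {
				return fmt.Errorf("invalid %s %q: %w", k, v, err)
			}
		}
	}
	return nil
}

func main() {
	conf := map[string]string{
		"autopilot_reconcile_interval": "300ms",
		"autopilot_update_interval":    "100ms",
	}
	if err := validateConf(conf, "autopilot_reconcile_interval", "autopilot_update_interval"); err != nil {
		panic(err)
	}
	fmt.Println("autopilot intervals parse cleanly")
}
```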
```diff
@@ -18,7 +18,6 @@ import (
 // seal migration, wherein a given physical backend must be re-used as several
 // test clusters are sequentially created, tested, and discarded.
 type ReusableStorage struct {
-
 	// IsRaft specifies whether the storage is using a raft backend.
 	IsRaft bool
```
```diff
@@ -169,9 +168,11 @@ func makeRaftDir(t testing.T) string {
 func makeReusableRaftBackend(t testing.T, coreIdx int, logger hclog.Logger, raftDir string, addressProvider raftlib.ServerAddressProvider, ha bool) *vault.PhysicalBackendBundle {
 	nodeID := fmt.Sprintf("core-%d", coreIdx)
 	conf := map[string]string{
-		"path":                   raftDir,
-		"node_id":                nodeID,
-		"performance_multiplier": "8",
+		"path":                         raftDir,
+		"node_id":                      nodeID,
+		"performance_multiplier":       "8",
+		"autopilot_reconcile_interval": "300ms",
+		"autopilot_update_interval":    "100ms",
 	}
 
 	backend, err := raft.NewRaftBackend(conf, logger)
```
```diff
@@ -835,11 +835,6 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
 		return false, errors.New("raft backend not in use")
 	}
 
-	if err := raftBackend.SetDesiredSuffrage(nonVoter); err != nil {
-		c.logger.Error("failed to set desired suffrage for this node", "error", err)
-		return false, nil
-	}
-
 	init, err := c.InitializedLocally(ctx)
 	if err != nil {
 		return false, fmt.Errorf("failed to check if core is initialized: %w", err)
```
```diff
@@ -963,10 +958,21 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
 		if err != nil {
 			return fmt.Errorf("failed to check if core is initialized: %w", err)
 		}
-		if init && !isRaftHAOnly {
+
+		// InitializedLocally will return non-nil before HA backends are
+		// initialized. c.Initialized(ctx) checks InitializedLocally first, so
+		// we can't use that generically for both cases. Instead check
+		// raftBackend.Initialized() directly for the HA-Only case.
+		if (!isRaftHAOnly && init) || (isRaftHAOnly && raftBackend.Initialized()) {
 			c.logger.Info("returning from raft join as the node is initialized")
 			return nil
 		}
+
+		if err := raftBackend.SetDesiredSuffrage(nonVoter); err != nil {
+			c.logger.Error("failed to set desired suffrage for this node", "error", err)
+			return nil
+		}
+
 		challengeCh := make(chan *raftInformation)
 		var expandedJoinInfos []*raft.LeaderJoinInfo
 		for _, leaderInfo := range leaderInfos {
```
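The combined condition above is the heart of the fix: only an uninitialized node may adjust its desired suffrage during join. Restated as a standalone predicate for readability (the helper name and this file are hypothetical, not part of the change):

```go
package main

import "fmt"

// nodeInitialized restates the join guard above: for combined storage+HA
// raft, trust the core's local init flag; for HA-only raft that flag isn't
// meaningful yet, so consult the raft backend directly.
func nodeInitialized(isRaftHAOnly, coreInit, raftBackendInit bool) bool {
	return (!isRaftHAOnly && coreInit) || (isRaftHAOnly && raftBackendInit)
}

func main() {
	// An HA-only node whose raft backend is already initialized must not
	// re-run the join, and thus cannot have its suffrage changed.
	fmt.Println(nodeInitialized(true, false, true)) // true: join returns early
}
```

When this predicate holds, the join exits before SetDesiredSuffrage is ever reached, which is what closes the unauthenticated suffrage-change hole.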
```diff
@@ -1001,6 +1007,7 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
 
 		select {
 		case <-ctx.Done():
+			err = ctx.Err()
 		case raftInfo := <-challengeCh:
 			if raftInfo != nil {
 				err = answerChallenge(ctx, raftInfo)
```
```diff
@@ -1009,7 +1016,7 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
 				}
 			}
 		}
-		return fmt.Errorf("timed out on raft join: %w", ctx.Err())
+		return err
 	}
 
 	switch retryFailures {
```
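The last two hunks work together: the select now records ctx.Err() in the cancellation branch, and the function returns whatever error actually occurred instead of unconditionally wrapping everything as a timeout. A reduced sketch of the pattern, with names and types simplified from the real join workflow:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// joinOnce mimics the select in the join loop: it returns the challenge
// error as-is, or ctx.Err() on cancellation, rather than masking both as
// a generic "timed out on raft join" error.
func joinOnce(ctx context.Context, challengeCh <-chan error) error {
	var err error
	select {
	case <-ctx.Done():
		err = ctx.Err() // genuine timeout or cancellation
	case err = <-challengeCh: // real join outcome, possibly nil
	}
	return err
}

func main() {
	ch := make(chan error, 1)
	ch <- errors.New("challenge rejected by leader")

	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	// Pre-fix this surfaced as a timeout; now the real cause is reported.
	fmt.Println(joinOnce(ctx, ch))
}
```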