Vault 6773/raft rejoin nonvoter (#16324)

* raft: Ensure init before setting suffrage

As reported in https://hashicorp.atlassian.net/browse/VAULT-6773:

	The /sys/storage/raft/join endpoint is intended to be unauthenticated. We rely
	on the seal to manage trust.

	It’s possible to use multiple join requests to switch nodes from voter to
	non-voter. The screenshot shows a 3 node cluster where vault_2 is the leader,
	and vault_3 and vault_4 are followers with non-voters set to false.  sent two
	requests to the raft join endpoint to have vault_3 and vault_4 join the cluster
	with non_voters:true.

This commit fixes the issue by delaying the call to SetDesiredSuffrage until after
the initialization check, preventing unauthenticated mangling of voter status.

Tested locally using
https://github.com/hashicorp/vault-tools/blob/main/users/ncabatoff/cluster/raft.sh
and the reproducer outlined in VAULT-6773.

* raft: Return join err on failure

This is necessary to correctly distinguish errors returned from the Join
workflow. Previously, errors were being masked as timeouts.

* raft: Default autopilot parameters in teststorage

Change some defaults so we don't have to pass in parameters or set them
in the originating tests. These storage types are only used in two
places:

1) Raft HA testing
2) Seal migration testing

Both consumers have been tested and pass with this change.

* changelog: Unauthn voter status change bugfix
This commit is contained in:
Mike Palmiotto 2022-07-18 12:37:12 -06:00 committed by GitHub
parent 8169940284
commit 439e35f50f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 27 additions and 14 deletions

3
changelog/16324.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
storage/raft (enterprise): Prevent unauthenticated voter status change with rejoin
```

View File

@ -137,9 +137,11 @@ func RaftHAFactory(f PhysicalBackendBundler) func(t testing.T, coreIdx int, logg
nodeID := fmt.Sprintf("core-%d", coreIdx) nodeID := fmt.Sprintf("core-%d", coreIdx)
backendConf := map[string]string{ backendConf := map[string]string{
"path": raftDir, "path": raftDir,
"node_id": nodeID, "node_id": nodeID,
"performance_multiplier": "8", "performance_multiplier": "8",
"autopilot_reconcile_interval": "300ms",
"autopilot_update_interval": "100ms",
} }
// Create and set the HA Backend // Create and set the HA Backend

View File

@ -18,7 +18,6 @@ import (
// seal migration, wherein a given physical backend must be re-used as several // seal migration, wherein a given physical backend must be re-used as several
// test clusters are sequentially created, tested, and discarded. // test clusters are sequentially created, tested, and discarded.
type ReusableStorage struct { type ReusableStorage struct {
// IsRaft specifies whether the storage is using a raft backend. // IsRaft specifies whether the storage is using a raft backend.
IsRaft bool IsRaft bool
@ -169,9 +168,11 @@ func makeRaftDir(t testing.T) string {
func makeReusableRaftBackend(t testing.T, coreIdx int, logger hclog.Logger, raftDir string, addressProvider raftlib.ServerAddressProvider, ha bool) *vault.PhysicalBackendBundle { func makeReusableRaftBackend(t testing.T, coreIdx int, logger hclog.Logger, raftDir string, addressProvider raftlib.ServerAddressProvider, ha bool) *vault.PhysicalBackendBundle {
nodeID := fmt.Sprintf("core-%d", coreIdx) nodeID := fmt.Sprintf("core-%d", coreIdx)
conf := map[string]string{ conf := map[string]string{
"path": raftDir, "path": raftDir,
"node_id": nodeID, "node_id": nodeID,
"performance_multiplier": "8", "performance_multiplier": "8",
"autopilot_reconcile_interval": "300ms",
"autopilot_update_interval": "100ms",
} }
backend, err := raft.NewRaftBackend(conf, logger) backend, err := raft.NewRaftBackend(conf, logger)

View File

@ -835,11 +835,6 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
return false, errors.New("raft backend not in use") return false, errors.New("raft backend not in use")
} }
if err := raftBackend.SetDesiredSuffrage(nonVoter); err != nil {
c.logger.Error("failed to set desired suffrage for this node", "error", err)
return false, nil
}
init, err := c.InitializedLocally(ctx) init, err := c.InitializedLocally(ctx)
if err != nil { if err != nil {
return false, fmt.Errorf("failed to check if core is initialized: %w", err) return false, fmt.Errorf("failed to check if core is initialized: %w", err)
@ -963,10 +958,21 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
if err != nil { if err != nil {
return fmt.Errorf("failed to check if core is initialized: %w", err) return fmt.Errorf("failed to check if core is initialized: %w", err)
} }
if init && !isRaftHAOnly {
// InitializedLocally will return non-nil before HA backends are
// initialized. c.Initialized(ctx) checks InitializedLocally first, so
// we can't use that generically for both cases. Instead check
// raftBackend.Initialized() directly for the HA-Only case.
if (!isRaftHAOnly && init) || (isRaftHAOnly && raftBackend.Initialized()) {
c.logger.Info("returning from raft join as the node is initialized") c.logger.Info("returning from raft join as the node is initialized")
return nil return nil
} }
if err := raftBackend.SetDesiredSuffrage(nonVoter); err != nil {
c.logger.Error("failed to set desired suffrage for this node", "error", err)
return nil
}
challengeCh := make(chan *raftInformation) challengeCh := make(chan *raftInformation)
var expandedJoinInfos []*raft.LeaderJoinInfo var expandedJoinInfos []*raft.LeaderJoinInfo
for _, leaderInfo := range leaderInfos { for _, leaderInfo := range leaderInfos {
@ -1001,6 +1007,7 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
select { select {
case <-ctx.Done(): case <-ctx.Done():
err = ctx.Err()
case raftInfo := <-challengeCh: case raftInfo := <-challengeCh:
if raftInfo != nil { if raftInfo != nil {
err = answerChallenge(ctx, raftInfo) err = answerChallenge(ctx, raftInfo)
@ -1009,7 +1016,7 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
} }
} }
} }
return fmt.Errorf("timed out on raft join: %w", ctx.Err()) return err
} }
switch retryFailures { switch retryFailures {