Vault 6773/raft rejoin nonvoter (#16324)
* raft: Ensure init before setting suffrage. As reported in https://hashicorp.atlassian.net/browse/VAULT-6773: The /sys/storage/raft/join endpoint is intended to be unauthenticated. We rely on the seal to manage trust. It’s possible to use multiple join requests to switch nodes from voter to non-voter. The screenshot shows a 3 node cluster where vault_2 is the leader, and vault_3 and vault_4 are followers with non-voters set to false. We sent two requests to the raft join endpoint to have vault_3 and vault_4 join the cluster with non_voters:true. This commit fixes the issue by delaying the call to SetDesiredSuffrage until after the initialization check, preventing unauthenticated mangling of voter status. Tested locally using https://github.com/hashicorp/vault-tools/blob/main/users/ncabatoff/cluster/raft.sh and the reproducer outlined in VAULT-6773. * raft: Return join err on failure. This is necessary to correctly distinguish errors returned from the Join workflow. Previously, errors were being masked as timeouts. * raft: Default autopilot parameters in teststorage. Change some defaults so we don't have to pass in parameters or set them in the originating tests. These storage types are only used in two places: 1) Raft HA testing 2) Seal migration testing. Both consumers have been tested and pass with this change. * changelog: Unauthn voter status change bugfix
This commit is contained in:
parent
8169940284
commit
439e35f50f
|
@ -0,0 +1,3 @@
|
||||||
|
```release-note:bug
|
||||||
|
storage/raft (enterprise): Prevent unauthenticated voter status change with rejoin
|
||||||
|
```
|
|
@ -137,9 +137,11 @@ func RaftHAFactory(f PhysicalBackendBundler) func(t testing.T, coreIdx int, logg
|
||||||
|
|
||||||
nodeID := fmt.Sprintf("core-%d", coreIdx)
|
nodeID := fmt.Sprintf("core-%d", coreIdx)
|
||||||
backendConf := map[string]string{
|
backendConf := map[string]string{
|
||||||
"path": raftDir,
|
"path": raftDir,
|
||||||
"node_id": nodeID,
|
"node_id": nodeID,
|
||||||
"performance_multiplier": "8",
|
"performance_multiplier": "8",
|
||||||
|
"autopilot_reconcile_interval": "300ms",
|
||||||
|
"autopilot_update_interval": "100ms",
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create and set the HA Backend
|
// Create and set the HA Backend
|
||||||
|
|
|
@ -18,7 +18,6 @@ import (
|
||||||
// seal migration, wherein a given physical backend must be re-used as several
|
// seal migration, wherein a given physical backend must be re-used as several
|
||||||
// test clusters are sequentially created, tested, and discarded.
|
// test clusters are sequentially created, tested, and discarded.
|
||||||
type ReusableStorage struct {
|
type ReusableStorage struct {
|
||||||
|
|
||||||
// IsRaft specifies whether the storage is using a raft backend.
|
// IsRaft specifies whether the storage is using a raft backend.
|
||||||
IsRaft bool
|
IsRaft bool
|
||||||
|
|
||||||
|
@ -169,9 +168,11 @@ func makeRaftDir(t testing.T) string {
|
||||||
func makeReusableRaftBackend(t testing.T, coreIdx int, logger hclog.Logger, raftDir string, addressProvider raftlib.ServerAddressProvider, ha bool) *vault.PhysicalBackendBundle {
|
func makeReusableRaftBackend(t testing.T, coreIdx int, logger hclog.Logger, raftDir string, addressProvider raftlib.ServerAddressProvider, ha bool) *vault.PhysicalBackendBundle {
|
||||||
nodeID := fmt.Sprintf("core-%d", coreIdx)
|
nodeID := fmt.Sprintf("core-%d", coreIdx)
|
||||||
conf := map[string]string{
|
conf := map[string]string{
|
||||||
"path": raftDir,
|
"path": raftDir,
|
||||||
"node_id": nodeID,
|
"node_id": nodeID,
|
||||||
"performance_multiplier": "8",
|
"performance_multiplier": "8",
|
||||||
|
"autopilot_reconcile_interval": "300ms",
|
||||||
|
"autopilot_update_interval": "100ms",
|
||||||
}
|
}
|
||||||
|
|
||||||
backend, err := raft.NewRaftBackend(conf, logger)
|
backend, err := raft.NewRaftBackend(conf, logger)
|
||||||
|
|
|
@ -835,11 +835,6 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
|
||||||
return false, errors.New("raft backend not in use")
|
return false, errors.New("raft backend not in use")
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := raftBackend.SetDesiredSuffrage(nonVoter); err != nil {
|
|
||||||
c.logger.Error("failed to set desired suffrage for this node", "error", err)
|
|
||||||
return false, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
init, err := c.InitializedLocally(ctx)
|
init, err := c.InitializedLocally(ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false, fmt.Errorf("failed to check if core is initialized: %w", err)
|
return false, fmt.Errorf("failed to check if core is initialized: %w", err)
|
||||||
|
@ -963,10 +958,21 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("failed to check if core is initialized: %w", err)
|
return fmt.Errorf("failed to check if core is initialized: %w", err)
|
||||||
}
|
}
|
||||||
if init && !isRaftHAOnly {
|
|
||||||
|
// InitializedLocally will return non-nil before HA backends are
|
||||||
|
// initialized. c.Initialized(ctx) checks InitializedLocally first, so
|
||||||
|
// we can't use that generically for both cases. Instead check
|
||||||
|
// raftBackend.Initialized() directly for the HA-Only case.
|
||||||
|
if (!isRaftHAOnly && init) || (isRaftHAOnly && raftBackend.Initialized()) {
|
||||||
c.logger.Info("returning from raft join as the node is initialized")
|
c.logger.Info("returning from raft join as the node is initialized")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if err := raftBackend.SetDesiredSuffrage(nonVoter); err != nil {
|
||||||
|
c.logger.Error("failed to set desired suffrage for this node", "error", err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
challengeCh := make(chan *raftInformation)
|
challengeCh := make(chan *raftInformation)
|
||||||
var expandedJoinInfos []*raft.LeaderJoinInfo
|
var expandedJoinInfos []*raft.LeaderJoinInfo
|
||||||
for _, leaderInfo := range leaderInfos {
|
for _, leaderInfo := range leaderInfos {
|
||||||
|
@ -1001,6 +1007,7 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
|
||||||
|
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
|
err = ctx.Err()
|
||||||
case raftInfo := <-challengeCh:
|
case raftInfo := <-challengeCh:
|
||||||
if raftInfo != nil {
|
if raftInfo != nil {
|
||||||
err = answerChallenge(ctx, raftInfo)
|
err = answerChallenge(ctx, raftInfo)
|
||||||
|
@ -1009,7 +1016,7 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return fmt.Errorf("timed out on raft join: %w", ctx.Err())
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
switch retryFailures {
|
switch retryFailures {
|
||||||
|
|
Loading…
Reference in New Issue