diff --git a/changelog/20986.txt b/changelog/20986.txt new file mode 100644 index 000000000..c0615f9a3 --- /dev/null +++ b/changelog/20986.txt @@ -0,0 +1,3 @@ +```release-note:bug +storage/raft: Fix race where new follower joining can get pruned by dead server cleanup. +``` \ No newline at end of file diff --git a/physical/raft/raft_autopilot.go b/physical/raft/raft_autopilot.go index 41fc321c9..3d8878a95 100644 --- a/physical/raft/raft_autopilot.go +++ b/physical/raft/raft_autopilot.go @@ -215,13 +215,15 @@ func NewFollowerStates() *FollowerStates { } } -// Update the peer information in the follower states. Note that this function runs on the active node. -func (s *FollowerStates) Update(req *EchoRequestUpdate) { +// Update the peer information in the follower states. Note that this function +// runs on the active node. Returns true if a new entry was added, as opposed +// to modifying one already present. +func (s *FollowerStates) Update(req *EchoRequestUpdate) bool { s.l.Lock() defer s.l.Unlock() - state, ok := s.followers[req.NodeID] - if !ok { + state, present := s.followers[req.NodeID] + if !present { state = &FollowerState{ IsDead: atomic.NewBool(false), } @@ -236,6 +238,8 @@ func (s *FollowerStates) Update(req *EchoRequestUpdate) { state.Version = req.SDKVersion state.UpgradeVersion = req.UpgradeVersion state.RedundancyZone = req.RedundancyZone + + return !present } // Clear wipes all the information regarding peers in the follower states. diff --git a/vault/logical_system_raft.go b/vault/logical_system_raft.go index 6faafacec..ca475eddc 100644 --- a/vault/logical_system_raft.go +++ b/vault/logical_system_raft.go @@ -248,9 +248,8 @@ func (b *SystemBackend) handleRaftRemovePeerUpdate() framework.OperationFunc { if err := raftBackend.RemovePeer(ctx, serverID); err != nil { return nil, err } - if b.Core.raftFollowerStates != nil { - b.Core.raftFollowerStates.Delete(serverID) - } + + b.Core.raftFollowerStates.Delete(serverID) return nil, nil } @@ -351,16 +350,6 @@ func (b *SystemBackend) handleRaftBootstrapAnswerWrite() framework.OperationFunc return nil, errors.New("could not decode raft TLS configuration") } - switch nonVoter { - case true: - err = raftBackend.AddNonVotingPeer(ctx, serverID, clusterAddr) - default: - err = raftBackend.AddPeer(ctx, serverID, clusterAddr) - } - if err != nil { - return nil, err - } - var desiredSuffrage string switch nonVoter { case true: @@ -369,11 +358,22 @@ func (b *SystemBackend) handleRaftBootstrapAnswerWrite() framework.OperationFunc desiredSuffrage = "voter" } - if b.Core.raftFollowerStates != nil { - b.Core.raftFollowerStates.Update(&raft.EchoRequestUpdate{ - NodeID: serverID, - DesiredSuffrage: desiredSuffrage, - }) + added := b.Core.raftFollowerStates.Update(&raft.EchoRequestUpdate{ + NodeID: serverID, + DesiredSuffrage: desiredSuffrage, + }) + + switch nonVoter { + case true: + err = raftBackend.AddNonVotingPeer(ctx, serverID, clusterAddr) + default: + err = raftBackend.AddPeer(ctx, serverID, clusterAddr) + } + if err != nil { + if added { + b.Core.raftFollowerStates.Delete(serverID) + } + return nil, err } peers, err := raftBackend.Peers(ctx)