From b45dd03b8f96f33a47beb5ad24eed42b674339b6 Mon Sep 17 00:00:00 2001 From: Matt Keeler Date: Thu, 27 May 2021 12:59:14 -0400 Subject: [PATCH] Bump raft-autopilot version to the latest. (#10306) --- .changelog/10306.txt | 4 + go.mod | 2 +- go.sum | 4 +- .../hashicorp/raft-autopilot/autopilot.go | 10 --- .../hashicorp/raft-autopilot/run.go | 16 +++- .../hashicorp/raft-autopilot/state.go | 83 ++++++++++++++----- .../hashicorp/raft-autopilot/types.go | 16 ++-- vendor/modules.txt | 2 +- 8 files changed, 91 insertions(+), 46 deletions(-) create mode 100644 .changelog/10306.txt diff --git a/.changelog/10306.txt b/.changelog/10306.txt new file mode 100644 index 000000000..5e69a06c8 --- /dev/null +++ b/.changelog/10306.txt @@ -0,0 +1,4 @@ +```release-note:bug +autopilot: **(Enterprise only)** Fixed an issue where autopilot could cause a new leader to demote the wrong voter when redundancy zones are in use and the previous leader failed. +``` + diff --git a/go.mod b/go.mod index 824f105df..0792d835d 100644 --- a/go.mod +++ b/go.mod @@ -52,7 +52,7 @@ require ( github.com/hashicorp/memberlist v0.2.4 github.com/hashicorp/net-rpc-msgpackrpc v0.0.0-20151116020338-a14192a58a69 github.com/hashicorp/raft v1.3.1 - github.com/hashicorp/raft-autopilot v0.1.2 + github.com/hashicorp/raft-autopilot v0.1.5 github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea github.com/hashicorp/serf v0.9.5 github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086 diff --git a/go.sum b/go.sum index d5a0cc05a..6e057b4eb 100644 --- a/go.sum +++ b/go.sum @@ -282,8 +282,8 @@ github.com/hashicorp/raft v1.1.1/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7 github.com/hashicorp/raft v1.2.0/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8= github.com/hashicorp/raft v1.3.1 h1:zDT8ke8y2aP4wf9zPTB2uSIeavJ3Hx/ceY4jxI2JxuY= github.com/hashicorp/raft v1.3.1/go.mod h1:4Ak7FSPnuvmb0GV6vgIAJ4vYT4bek9bb6Q+7HVbyzqM= -github.com/hashicorp/raft-autopilot v0.1.2 h1:yeqdUjWLjVJkBM+mcVxqwxi+w+aHsb9cEON2dz69OCs= -github.com/hashicorp/raft-autopilot v0.1.2/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw= +github.com/hashicorp/raft-autopilot v0.1.5 h1:onEfMH5uHVdXQqtas36zXUHEZxLdsJVu/nXHLcLdL1I= +github.com/hashicorp/raft-autopilot v0.1.5/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw= github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea h1:xykPFhrBAS2J0VBzVa5e80b5ZtYuNQtgXjN40qBZlD4= github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk= github.com/hashicorp/serf v0.9.5 h1:EBWvyu9tcRszt3Bxp3KNssBMP1KuHWyO51lz9+786iM= diff --git a/vendor/github.com/hashicorp/raft-autopilot/autopilot.go b/vendor/github.com/hashicorp/raft-autopilot/autopilot.go index d0640fc85..14c3f8141 100644 --- a/vendor/github.com/hashicorp/raft-autopilot/autopilot.go +++ b/vendor/github.com/hashicorp/raft-autopilot/autopilot.go @@ -147,16 +147,6 @@ type Autopilot struct { // racing. stateLock sync.RWMutex - // startTime is recorded so that we can make better determinations about server - // stability during the initial period of time after autopilot first starts. - // If autopilot has just started the default behavior to check if a server is - // stable will not work as it will ensure the server has been healthy for - // the configured server stabilization time. If that configure time is longer - // than the amount of time autopilot has been running you can run into issues - // with leadership flapping during some scenarios where a cluster is being - // brought up. - startTime time.Time - // removeDeadCh is used to trigger the running autopilot go routines to // find and remove any dead/failed servers removeDeadCh chan struct{} diff --git a/vendor/github.com/hashicorp/raft-autopilot/run.go b/vendor/github.com/hashicorp/raft-autopilot/run.go index 382fd1146..970f61690 100644 --- a/vendor/github.com/hashicorp/raft-autopilot/run.go +++ b/vendor/github.com/hashicorp/raft-autopilot/run.go @@ -18,7 +18,6 @@ func (a *Autopilot) Start(ctx context.Context) { } ctx, shutdown := context.WithCancel(ctx) - a.startTime = a.time.Now() exec := &execInfo{ status: Running, @@ -128,6 +127,21 @@ func (a *Autopilot) beginExecution(ctx context.Context, exec *execInfo) { a.logger.Debug("autopilot is now stopped") + // We need to gain this lock so that we can zero out the previous state. + // This prevents us from accidentally tracking stale state in the event + // that we used to be the leader at some point in time, then weren't + // and now are again. In particular this will ensure that that we forget + // about our tracking of the firstStateTime so that once restarted, we + // will ignore server stabilization time just like we do the very + // first time this process ever was the leader. + // + // This isn't included in finishExecution so that we don't perform it + // if we fail to gain the leaderLock before the context gets cancelled + // back at the beginning of this function. + a.stateLock.Lock() + defer a.stateLock.Unlock() + a.state = &State{} + a.finishExecution(exec) a.leaderLock.Unlock() }() diff --git a/vendor/github.com/hashicorp/raft-autopilot/state.go b/vendor/github.com/hashicorp/raft-autopilot/state.go index 035357bea..d44cdbc26 100644 --- a/vendor/github.com/hashicorp/raft-autopilot/state.go +++ b/vendor/github.com/hashicorp/raft-autopilot/state.go @@ -27,15 +27,15 @@ func aliveServers(servers map[raft.ServerID]*Server) map[raft.ServerID]*Server { // nextStateInputs is the collection of values that can influence // creation of the next State. type nextStateInputs struct { - Now time.Time - StartTime time.Time - Config *Config - RaftConfig *raft.Configuration - KnownServers map[raft.ServerID]*Server - LatestIndex uint64 - LastTerm uint64 - FetchedStats map[raft.ServerID]*ServerStats - LeaderID raft.ServerID + Now time.Time + FirstStateTime time.Time + Config *Config + RaftConfig *raft.Configuration + KnownServers map[raft.ServerID]*Server + LatestIndex uint64 + LastTerm uint64 + FetchedStats map[raft.ServerID]*ServerStats + LeaderID raft.ServerID } // gatherNextStateInputs gathers all the information that would be used to @@ -52,9 +52,34 @@ type nextStateInputs struct { func (a *Autopilot) gatherNextStateInputs(ctx context.Context) (*nextStateInputs, error) { // there are a lot of inputs to computing the next state so they get put into a // struct so that we don't have to return 8 values. + + now := a.time.Now() + + // We need to pull the previous states knowledge of the first time a state was generated. + // This is really only important for when autopilot is first started. We will use the + // first state's time when determining if a server is stable. Under normal circumstances + // we need to just check that the current time - the servers StableSince time is greater + // than the configured stabilization time. However while autopilot has been running for + // less time than the stabilization time we need to consider all servers as stable + // to prevent unnecessary leader elections. Therefore its important to track the first + // time a state was generated so we know if we have a state old enough where there is + // any chance of seeing servers as stable based off that configured threshold. + var firstStateTime time.Time + a.stateLock.Lock() + if a.state != nil { + firstStateTime = a.state.firstStateTime + } + a.stateLock.Unlock() + + // firstStateTime will be the zero value if we are in the process of generating + // the first state. In that case we set it to the now time. + if firstStateTime.IsZero() { + firstStateTime = now + } + inputs := &nextStateInputs{ - Now: a.time.Now(), - StartTime: a.startTime, + Now: now, + FirstStateTime: firstStateTime, } // grab the latest autopilot configuration @@ -71,16 +96,30 @@ func (a *Autopilot) gatherNextStateInputs(ctx context.Context) (*nextStateInputs } inputs.RaftConfig = raftConfig - leader := a.raft.Leader() - for _, s := range inputs.RaftConfig.Servers { - if s.Address == leader { - inputs.LeaderID = s.ID + // get the known servers which may include left/failed ones + inputs.KnownServers = a.delegate.KnownServers() + + // Try to retrieve leader id from the delegate. + for id, srv := range inputs.KnownServers { + if srv.IsLeader { + inputs.LeaderID = id break } } + // Delegate setting the leader information is optional. If leader detection is + // not successful, fallback on raft config to do the same. if inputs.LeaderID == "" { - return nil, fmt.Errorf("cannot detect the current leader server id from its address: %s", leader) + leader := a.raft.Leader() + for _, s := range inputs.RaftConfig.Servers { + if s.Address == leader { + inputs.LeaderID = s.ID + break + } + } + if inputs.LeaderID == "" { + return nil, fmt.Errorf("cannot detect the current leader server id from its address: %s", leader) + } } // get the latest Raft index - this should be kept close to the call to @@ -101,9 +140,6 @@ func (a *Autopilot) gatherNextStateInputs(ctx context.Context) (*nextStateInputs return nil, ctx.Err() } - // get the known servers which may include left/failed ones - inputs.KnownServers = a.delegate.KnownServers() - // in most cases getting the known servers should be quick but as we cannot // account for every potential delegate and prevent them from making // blocking network requests we should probably check the context again. @@ -146,10 +182,13 @@ func (a *Autopilot) nextState(ctx context.Context) (*State, error) { func (a *Autopilot) nextStateWithInputs(inputs *nextStateInputs) *State { nextServers := a.nextServers(inputs) + // we record the firstStateTime so that we can ignore the server stabilization + // time up until the time we generated the first state becomes far enough + // in the past. Until that point in time all servers are considered stable. newState := &State{ - startTime: inputs.StartTime, - Healthy: true, - Servers: nextServers, + firstStateTime: inputs.FirstStateTime, + Healthy: true, + Servers: nextServers, } voterCount := 0 diff --git a/vendor/github.com/hashicorp/raft-autopilot/types.go b/vendor/github.com/hashicorp/raft-autopilot/types.go index c96fb9a31..021fd8b06 100644 --- a/vendor/github.com/hashicorp/raft-autopilot/types.go +++ b/vendor/github.com/hashicorp/raft-autopilot/types.go @@ -85,6 +85,7 @@ type Server struct { Version string Meta map[string]string RaftVersion int + IsLeader bool // The remaining fields are those that the promoter // will fill in @@ -166,7 +167,7 @@ type ServerStats struct { } type State struct { - startTime time.Time + firstStateTime time.Time Healthy bool FailureTolerance int Servers map[raft.ServerID]*ServerState @@ -177,14 +178,11 @@ type State struct { func (s *State) ServerStabilizationTime(c *Config) time.Duration { // Only use the configured stabilization time when autopilot has - // been running for 110% of the configured stabilization time. - // Before that time we haven't been running long enough to - // be able to take these values into account. 110% is pretty - // arbitrary but with the default config would prevent the - // stabilization time from mattering for an extra second. This - // allows for leeway in how quickly we get the healthy RPC responses - // after autopilot is started. - if time.Since(s.startTime) > (c.ServerStabilizationTime*110)/100 { + // been running for at least as long as when the first state was + // generated. If it hasn't been running that long then we would + // guarantee that all checks against the stabilization time will + // fail which will result in excessive leader elections. + if time.Since(s.firstStateTime) > c.ServerStabilizationTime { return c.ServerStabilizationTime } diff --git a/vendor/modules.txt b/vendor/modules.txt index ad41cf6d5..a02aa4aee 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -483,7 +483,7 @@ github.com/hashicorp/memberlist github.com/hashicorp/net-rpc-msgpackrpc # github.com/hashicorp/raft v1.3.1 github.com/hashicorp/raft -# github.com/hashicorp/raft-autopilot v0.1.2 +# github.com/hashicorp/raft-autopilot v0.1.5 github.com/hashicorp/raft-autopilot # github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea github.com/hashicorp/raft-boltdb