Bump raft-autopilot version to the latest. (#10306)
This commit is contained in:
parent
94dcc3481a
commit
b45dd03b8f
|
@ -0,0 +1,4 @@
|
||||||
|
```release-note:bug
|
||||||
|
autopilot: **(Enterprise only)** Fixed an issue where autopilot could cause a new leader to demote the wrong voter when redundancy zones are in use and the previous leader failed.
|
||||||
|
```
|
||||||
|
|
2
go.mod
2
go.mod
|
@ -52,7 +52,7 @@ require (
|
||||||
github.com/hashicorp/memberlist v0.2.4
|
github.com/hashicorp/memberlist v0.2.4
|
||||||
github.com/hashicorp/net-rpc-msgpackrpc v0.0.0-20151116020338-a14192a58a69
|
github.com/hashicorp/net-rpc-msgpackrpc v0.0.0-20151116020338-a14192a58a69
|
||||||
github.com/hashicorp/raft v1.3.1
|
github.com/hashicorp/raft v1.3.1
|
||||||
github.com/hashicorp/raft-autopilot v0.1.2
|
github.com/hashicorp/raft-autopilot v0.1.5
|
||||||
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea
|
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea
|
||||||
github.com/hashicorp/serf v0.9.5
|
github.com/hashicorp/serf v0.9.5
|
||||||
github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086
|
github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086
|
||||||
|
|
4
go.sum
4
go.sum
|
@ -282,8 +282,8 @@ github.com/hashicorp/raft v1.1.1/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7
|
||||||
github.com/hashicorp/raft v1.2.0/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
|
github.com/hashicorp/raft v1.2.0/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
|
||||||
github.com/hashicorp/raft v1.3.1 h1:zDT8ke8y2aP4wf9zPTB2uSIeavJ3Hx/ceY4jxI2JxuY=
|
github.com/hashicorp/raft v1.3.1 h1:zDT8ke8y2aP4wf9zPTB2uSIeavJ3Hx/ceY4jxI2JxuY=
|
||||||
github.com/hashicorp/raft v1.3.1/go.mod h1:4Ak7FSPnuvmb0GV6vgIAJ4vYT4bek9bb6Q+7HVbyzqM=
|
github.com/hashicorp/raft v1.3.1/go.mod h1:4Ak7FSPnuvmb0GV6vgIAJ4vYT4bek9bb6Q+7HVbyzqM=
|
||||||
github.com/hashicorp/raft-autopilot v0.1.2 h1:yeqdUjWLjVJkBM+mcVxqwxi+w+aHsb9cEON2dz69OCs=
|
github.com/hashicorp/raft-autopilot v0.1.5 h1:onEfMH5uHVdXQqtas36zXUHEZxLdsJVu/nXHLcLdL1I=
|
||||||
github.com/hashicorp/raft-autopilot v0.1.2/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw=
|
github.com/hashicorp/raft-autopilot v0.1.5/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw=
|
||||||
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea h1:xykPFhrBAS2J0VBzVa5e80b5ZtYuNQtgXjN40qBZlD4=
|
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea h1:xykPFhrBAS2J0VBzVa5e80b5ZtYuNQtgXjN40qBZlD4=
|
||||||
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk=
|
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk=
|
||||||
github.com/hashicorp/serf v0.9.5 h1:EBWvyu9tcRszt3Bxp3KNssBMP1KuHWyO51lz9+786iM=
|
github.com/hashicorp/serf v0.9.5 h1:EBWvyu9tcRszt3Bxp3KNssBMP1KuHWyO51lz9+786iM=
|
||||||
|
|
|
@ -147,16 +147,6 @@ type Autopilot struct {
|
||||||
// racing.
|
// racing.
|
||||||
stateLock sync.RWMutex
|
stateLock sync.RWMutex
|
||||||
|
|
||||||
// startTime is recorded so that we can make better determinations about server
|
|
||||||
// stability during the initial period of time after autopilot first starts.
|
|
||||||
// If autopilot has just started the default behavior to check if a server is
|
|
||||||
// stable will not work as it will ensure the server has been healthy for
|
|
||||||
// the configured server stabilization time. If that configure time is longer
|
|
||||||
// than the amount of time autopilot has been running you can run into issues
|
|
||||||
// with leadership flapping during some scenarios where a cluster is being
|
|
||||||
// brought up.
|
|
||||||
startTime time.Time
|
|
||||||
|
|
||||||
// removeDeadCh is used to trigger the running autopilot go routines to
|
// removeDeadCh is used to trigger the running autopilot go routines to
|
||||||
// find and remove any dead/failed servers
|
// find and remove any dead/failed servers
|
||||||
removeDeadCh chan struct{}
|
removeDeadCh chan struct{}
|
||||||
|
|
|
@ -18,7 +18,6 @@ func (a *Autopilot) Start(ctx context.Context) {
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx, shutdown := context.WithCancel(ctx)
|
ctx, shutdown := context.WithCancel(ctx)
|
||||||
a.startTime = a.time.Now()
|
|
||||||
|
|
||||||
exec := &execInfo{
|
exec := &execInfo{
|
||||||
status: Running,
|
status: Running,
|
||||||
|
@ -128,6 +127,21 @@ func (a *Autopilot) beginExecution(ctx context.Context, exec *execInfo) {
|
||||||
|
|
||||||
a.logger.Debug("autopilot is now stopped")
|
a.logger.Debug("autopilot is now stopped")
|
||||||
|
|
||||||
|
// We need to gain this lock so that we can zero out the previous state.
|
||||||
|
// This prevents us from accidentally tracking stale state in the event
|
||||||
|
// that we used to be the leader at some point in time, then weren't
|
||||||
|
// and now are again. In particular this will ensure that we forget
|
||||||
|
// about our tracking of the firstStateTime so that once restarted, we
|
||||||
|
// will ignore server stabilization time just like we do the very
|
||||||
|
// first time this process ever was the leader.
|
||||||
|
//
|
||||||
|
// This isn't included in finishExecution so that we don't perform it
|
||||||
|
// if we fail to gain the leaderLock before the context gets cancelled
|
||||||
|
// back at the beginning of this function.
|
||||||
|
a.stateLock.Lock()
|
||||||
|
defer a.stateLock.Unlock()
|
||||||
|
a.state = &State{}
|
||||||
|
|
||||||
a.finishExecution(exec)
|
a.finishExecution(exec)
|
||||||
a.leaderLock.Unlock()
|
a.leaderLock.Unlock()
|
||||||
}()
|
}()
|
||||||
|
|
|
@ -27,15 +27,15 @@ func aliveServers(servers map[raft.ServerID]*Server) map[raft.ServerID]*Server {
|
||||||
// nextStateInputs is the collection of values that can influence
|
// nextStateInputs is the collection of values that can influence
|
||||||
// creation of the next State.
|
// creation of the next State.
|
||||||
type nextStateInputs struct {
|
type nextStateInputs struct {
|
||||||
Now time.Time
|
Now time.Time
|
||||||
StartTime time.Time
|
FirstStateTime time.Time
|
||||||
Config *Config
|
Config *Config
|
||||||
RaftConfig *raft.Configuration
|
RaftConfig *raft.Configuration
|
||||||
KnownServers map[raft.ServerID]*Server
|
KnownServers map[raft.ServerID]*Server
|
||||||
LatestIndex uint64
|
LatestIndex uint64
|
||||||
LastTerm uint64
|
LastTerm uint64
|
||||||
FetchedStats map[raft.ServerID]*ServerStats
|
FetchedStats map[raft.ServerID]*ServerStats
|
||||||
LeaderID raft.ServerID
|
LeaderID raft.ServerID
|
||||||
}
|
}
|
||||||
|
|
||||||
// gatherNextStateInputs gathers all the information that would be used to
|
// gatherNextStateInputs gathers all the information that would be used to
|
||||||
|
@ -52,9 +52,34 @@ type nextStateInputs struct {
|
||||||
func (a *Autopilot) gatherNextStateInputs(ctx context.Context) (*nextStateInputs, error) {
|
func (a *Autopilot) gatherNextStateInputs(ctx context.Context) (*nextStateInputs, error) {
|
||||||
// there are a lot of inputs to computing the next state so they get put into a
|
// there are a lot of inputs to computing the next state so they get put into a
|
||||||
// struct so that we don't have to return 8 values.
|
// struct so that we don't have to return 8 values.
|
||||||
|
|
||||||
|
now := a.time.Now()
|
||||||
|
|
||||||
|
// We need to pull the previous state's knowledge of the first time a state was generated.
|
||||||
|
// This is really only important for when autopilot is first started. We will use the
|
||||||
|
// first state's time when determining if a server is stable. Under normal circumstances
|
||||||
|
// we need to just check that the current time - the server's StableSince time is greater
|
||||||
|
// than the configured stabilization time. However while autopilot has been running for
|
||||||
|
// less time than the stabilization time we need to consider all servers as stable
|
||||||
|
// to prevent unnecessary leader elections. Therefore it's important to track the first
|
||||||
|
// time a state was generated so we know if we have a state old enough where there is
|
||||||
|
// any chance of seeing servers as stable based off that configured threshold.
|
||||||
|
var firstStateTime time.Time
|
||||||
|
a.stateLock.Lock()
|
||||||
|
if a.state != nil {
|
||||||
|
firstStateTime = a.state.firstStateTime
|
||||||
|
}
|
||||||
|
a.stateLock.Unlock()
|
||||||
|
|
||||||
|
// firstStateTime will be the zero value if we are in the process of generating
|
||||||
|
// the first state. In that case we set it to the now time.
|
||||||
|
if firstStateTime.IsZero() {
|
||||||
|
firstStateTime = now
|
||||||
|
}
|
||||||
|
|
||||||
inputs := &nextStateInputs{
|
inputs := &nextStateInputs{
|
||||||
Now: a.time.Now(),
|
Now: now,
|
||||||
StartTime: a.startTime,
|
FirstStateTime: firstStateTime,
|
||||||
}
|
}
|
||||||
|
|
||||||
// grab the latest autopilot configuration
|
// grab the latest autopilot configuration
|
||||||
|
@ -71,16 +96,30 @@ func (a *Autopilot) gatherNextStateInputs(ctx context.Context) (*nextStateInputs
|
||||||
}
|
}
|
||||||
inputs.RaftConfig = raftConfig
|
inputs.RaftConfig = raftConfig
|
||||||
|
|
||||||
leader := a.raft.Leader()
|
// get the known servers which may include left/failed ones
|
||||||
for _, s := range inputs.RaftConfig.Servers {
|
inputs.KnownServers = a.delegate.KnownServers()
|
||||||
if s.Address == leader {
|
|
||||||
inputs.LeaderID = s.ID
|
// Try to retrieve leader id from the delegate.
|
||||||
|
for id, srv := range inputs.KnownServers {
|
||||||
|
if srv.IsLeader {
|
||||||
|
inputs.LeaderID = id
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Delegate setting the leader information is optional. If leader detection is
|
||||||
|
// not successful, fallback on raft config to do the same.
|
||||||
if inputs.LeaderID == "" {
|
if inputs.LeaderID == "" {
|
||||||
return nil, fmt.Errorf("cannot detect the current leader server id from its address: %s", leader)
|
leader := a.raft.Leader()
|
||||||
|
for _, s := range inputs.RaftConfig.Servers {
|
||||||
|
if s.Address == leader {
|
||||||
|
inputs.LeaderID = s.ID
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if inputs.LeaderID == "" {
|
||||||
|
return nil, fmt.Errorf("cannot detect the current leader server id from its address: %s", leader)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// get the latest Raft index - this should be kept close to the call to
|
// get the latest Raft index - this should be kept close to the call to
|
||||||
|
@ -101,9 +140,6 @@ func (a *Autopilot) gatherNextStateInputs(ctx context.Context) (*nextStateInputs
|
||||||
return nil, ctx.Err()
|
return nil, ctx.Err()
|
||||||
}
|
}
|
||||||
|
|
||||||
// get the known servers which may include left/failed ones
|
|
||||||
inputs.KnownServers = a.delegate.KnownServers()
|
|
||||||
|
|
||||||
// in most cases getting the known servers should be quick but as we cannot
|
// in most cases getting the known servers should be quick but as we cannot
|
||||||
// account for every potential delegate and prevent them from making
|
// account for every potential delegate and prevent them from making
|
||||||
// blocking network requests we should probably check the context again.
|
// blocking network requests we should probably check the context again.
|
||||||
|
@ -146,10 +182,13 @@ func (a *Autopilot) nextState(ctx context.Context) (*State, error) {
|
||||||
func (a *Autopilot) nextStateWithInputs(inputs *nextStateInputs) *State {
|
func (a *Autopilot) nextStateWithInputs(inputs *nextStateInputs) *State {
|
||||||
nextServers := a.nextServers(inputs)
|
nextServers := a.nextServers(inputs)
|
||||||
|
|
||||||
|
// we record the firstStateTime so that we can ignore the server stabilization
|
||||||
|
// time up until the time we generated the first state becomes far enough
|
||||||
|
// in the past. Until that point in time all servers are considered stable.
|
||||||
newState := &State{
|
newState := &State{
|
||||||
startTime: inputs.StartTime,
|
firstStateTime: inputs.FirstStateTime,
|
||||||
Healthy: true,
|
Healthy: true,
|
||||||
Servers: nextServers,
|
Servers: nextServers,
|
||||||
}
|
}
|
||||||
|
|
||||||
voterCount := 0
|
voterCount := 0
|
||||||
|
|
|
@ -85,6 +85,7 @@ type Server struct {
|
||||||
Version string
|
Version string
|
||||||
Meta map[string]string
|
Meta map[string]string
|
||||||
RaftVersion int
|
RaftVersion int
|
||||||
|
IsLeader bool
|
||||||
|
|
||||||
// The remaining fields are those that the promoter
|
// The remaining fields are those that the promoter
|
||||||
// will fill in
|
// will fill in
|
||||||
|
@ -166,7 +167,7 @@ type ServerStats struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
type State struct {
|
type State struct {
|
||||||
startTime time.Time
|
firstStateTime time.Time
|
||||||
Healthy bool
|
Healthy bool
|
||||||
FailureTolerance int
|
FailureTolerance int
|
||||||
Servers map[raft.ServerID]*ServerState
|
Servers map[raft.ServerID]*ServerState
|
||||||
|
@ -177,14 +178,11 @@ type State struct {
|
||||||
|
|
||||||
func (s *State) ServerStabilizationTime(c *Config) time.Duration {
|
func (s *State) ServerStabilizationTime(c *Config) time.Duration {
|
||||||
// Only use the configured stabilization time when autopilot has
|
// Only use the configured stabilization time when autopilot has
|
||||||
// been running for 110% of the configured stabilization time.
|
// been running for at least as long as when the first state was
|
||||||
// Before that time we haven't been running long enough to
|
// generated. If it hasn't been running that long then we would
|
||||||
// be able to take these values into account. 110% is pretty
|
// guarantee that all checks against the stabilization time will
|
||||||
// arbitrary but with the default config would prevent the
|
// fail which will result in excessive leader elections.
|
||||||
// stabilization time from mattering for an extra second. This
|
if time.Since(s.firstStateTime) > c.ServerStabilizationTime {
|
||||||
// allows for leeway in how quickly we get the healthy RPC responses
|
|
||||||
// after autopilot is started.
|
|
||||||
if time.Since(s.startTime) > (c.ServerStabilizationTime*110)/100 {
|
|
||||||
return c.ServerStabilizationTime
|
return c.ServerStabilizationTime
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -483,7 +483,7 @@ github.com/hashicorp/memberlist
|
||||||
github.com/hashicorp/net-rpc-msgpackrpc
|
github.com/hashicorp/net-rpc-msgpackrpc
|
||||||
# github.com/hashicorp/raft v1.3.1
|
# github.com/hashicorp/raft v1.3.1
|
||||||
github.com/hashicorp/raft
|
github.com/hashicorp/raft
|
||||||
# github.com/hashicorp/raft-autopilot v0.1.2
|
# github.com/hashicorp/raft-autopilot v0.1.5
|
||||||
github.com/hashicorp/raft-autopilot
|
github.com/hashicorp/raft-autopilot
|
||||||
# github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea
|
# github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea
|
||||||
github.com/hashicorp/raft-boltdb
|
github.com/hashicorp/raft-boltdb
|
||||||
|
|
Loading…
Reference in New Issue