package autopilot import ( "context" "sync" "time" hclog "github.com/hashicorp/go-hclog" "github.com/hashicorp/raft" ) const ( // These constants were take from what exists in Consul at the time of module extraction. DefaultUpdateInterval = 2 * time.Second DefaultReconcileInterval = 10 * time.Second ) // Option is an option to be used when creating a new Autopilot instance type Option func(*Autopilot) // WithUpdateInterval returns an Option to set the Autopilot instance's // update interval. func WithUpdateInterval(t time.Duration) Option { if t == 0 { t = DefaultUpdateInterval } return func(a *Autopilot) { a.updateInterval = t } } // WithReconcileInterval returns an Option to set the Autopilot instance's // reconcile interval. func WithReconcileInterval(t time.Duration) Option { if t == 0 { t = DefaultReconcileInterval } return func(a *Autopilot) { a.reconcileInterval = t } } // WithLogger returns an Option to set the Autopilot instance's logger func WithLogger(logger hclog.Logger) Option { if logger == nil { logger = hclog.Default() } return func(a *Autopilot) { a.logger = logger.Named("autopilot") } } // withTimeProvider returns an Option which overrides and Autopilot instance's // time provider with the given one. This should only be used in tests // as a means of making some time.Time values in an autopilot state deterministic. // For real uses the default runtimeTimeProvider should be used. func withTimeProvider(provider timeProvider) Option { return func(a *Autopilot) { a.time = provider } } // WithPromoter returns an option to set the Promoter type that Autpilot will // use. When the option is not given the default StablePromoter from this package // will be used. func WithPromoter(promoter Promoter) Option { if promoter == nil { promoter = DefaultPromoter() } return func(a *Autopilot) { a.promoter = promoter } } // ExecutionStatus represents the current status of the autopilot background go routines type ExecutionStatus string const ( NotRunning ExecutionStatus = "not-running" Running ExecutionStatus = "running" ShuttingDown ExecutionStatus = "shutting-down" ) type execInfo struct { // status is the current state of autopilot executation status ExecutionStatus // shutdown is a function that can be execute to shutdown a running // autopilot's go routines. shutdown context.CancelFunc // done is a chan that will be closed when the running autopilot go // routines have exited. Technically closing it is the very last // thing done in the go routine but at that point enough state has // been cleaned up that we would then allow it to be started // immediately afterward done chan struct{} } // Autopilot is the type to manage a running Raft instance. // // Each Raft node in the cluster will have a corresponding Autopilot instance but // only 1 Autopilot instance should run at a time in the cluster. So when a node // gains Raft leadership the corresponding Autopilot instance should have it's // Start method called. Then if leadership is lost that node should call the // Stop method on the Autopilot instance. type Autopilot struct { logger hclog.Logger // delegate is used to get information about the system such as Raft server // states, known servers etc. delegate ApplicationIntegration // promoter is used to calculate promotions, demotions and leadership transfers // given a particular autopilot State. The interface also contains methods // for filling in parts of the autopilot state that the core module doesn't // control such as the Ext fields on the Server and State types. promoter Promoter // raft is an interface that implements all the parts of the Raft library interface // that we use. It is an interface to allow for mocking raft during testing. raft Raft // time is an interface with a single method for getting the current time - `Now`. // In some tests this will be the MockTimeProvider which allows tests to be more // deterministic but for running systems this should not be overrided from the // default which is the runtimeTimeProvider and is a small shim around calling // time.Now. time timeProvider // reconcileInterval is how long between rounds of performing promotions, demotions // and leadership transfers. reconcileInterval time.Duration // updateInterval is the time between the periodic state updates. These periodic // state updates take in known servers from the delegate, request Raft stats be // fetched and pull in other inputs such as the Raft configuration to create // an updated view of the Autopilot State. updateInterval time.Duration // state is the structure that autopilot uses to make decisions about what to do. // This field should be considered immutable and no modifications to an existing // state should be made but instead a new state is created and set to this field // while holding the stateLock. state *State // stateLock is meant to only protect the state field. This just prevents // the periodic state update and consumers requesting the autopilot state from // racing. stateLock sync.RWMutex // startTime is recorded so that we can make better determinations about server // stability during the initial period of time after autopilot first starts. // If autopilot has just started the default behavior to check if a server is // stable will not work as it will ensure the server has been healthy for // the configured server stabilization time. If that configure time is longer // than the amount of time autopilot has been running you can run into issues // with leadership flapping during some scenarios where a cluster is being // brought up. startTime time.Time // removeDeadCh is used to trigger the running autopilot go routines to // find and remove any dead/failed servers removeDeadCh chan struct{} // reconcileCh is used to trigger an immediate round of reconciliation. reconcileCh chan struct{} // leaderLock implements a cancellable mutex that will be used to ensure // that only one autopilot go routine is the "leader". The leader is // the go routine that is currently responsible for updating the // autopilot state and performing raft promotions/demotions. leaderLock *mutex // execution is the information about the most recent autopilot execution. // Start will initialize this with the most recent execution and it will // be updated by Stop and by the go routines being executed when they are // finished. execution *execInfo // execLock protects access to the execution field execLock sync.Mutex } // New will create a new Autopilot instance utilizing the given Raft and Delegate. // If the WithPromoter option is not provided the default StablePromoter will // be used. func New(raft Raft, delegate ApplicationIntegration, options ...Option) *Autopilot { a := &Autopilot{ raft: raft, delegate: delegate, state: &State{}, promoter: DefaultPromoter(), logger: hclog.Default().Named("autopilot"), // should this be buffered? removeDeadCh: make(chan struct{}, 1), reconcileInterval: DefaultReconcileInterval, updateInterval: DefaultUpdateInterval, time: &runtimeTimeProvider{}, leaderLock: newMutex(), } for _, opt := range options { opt(a) } return a } // RemoveDeadServers will trigger an immediate removal of dead/failed servers. func (a *Autopilot) RemoveDeadServers() { select { case a.removeDeadCh <- struct{}{}: default: } } // GetState retrieves the current autopilot State func (a *Autopilot) GetState() *State { a.stateLock.Lock() defer a.stateLock.Unlock() return a.state } // GetServerHealth returns the latest ServerHealth for a given server. // The returned struct should not be modified or else it will im func (a *Autopilot) GetServerHealth(id raft.ServerID) *ServerHealth { state := a.GetState() srv, ok := state.Servers[id] if ok { return &srv.Health } return nil }