open-nomad/nomad/leader.go

341 lines
9.6 KiB
Go

package nomad
import (
"fmt"
"time"
"github.com/armon/go-metrics"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/raft"
"github.com/hashicorp/serf/serf"
)
// monitorLeadership is used to monitor if we acquire or lose our role
// as the leader in the Raft cluster. There is some work the leader is
// expected to do, so we must react to changes
func (s *Server) monitorLeadership() {
leaderCh := s.raft.LeaderCh()
var stopCh chan struct{}
for {
select {
case isLeader := <-leaderCh:
if isLeader {
stopCh = make(chan struct{})
go s.leaderLoop(stopCh)
s.logger.Printf("[INFO] nomad: cluster leadership acquired")
} else if stopCh != nil {
close(stopCh)
stopCh = nil
s.logger.Printf("[INFO] nomad: cluster leadership lost")
}
case <-s.shutdownCh:
return
}
}
}
// leaderLoop runs as long as we are the leader to run various
// maintence activities
func (s *Server) leaderLoop(stopCh chan struct{}) {
// Ensure we revoke leadership on stepdown
defer s.revokeLeadership()
var reconcileCh chan serf.Member
establishedLeader := false
RECONCILE:
// Setup a reconciliation timer
reconcileCh = nil
interval := time.After(s.config.ReconcileInterval)
// Apply a raft barrier to ensure our FSM is caught up
start := time.Now()
barrier := s.raft.Barrier(0)
if err := barrier.Error(); err != nil {
s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err)
goto WAIT
}
metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)
// Check if we need to handle initial leadership actions
if !establishedLeader {
if err := s.establishLeadership(stopCh); err != nil {
s.logger.Printf("[ERR] nomad: failed to establish leadership: %v",
err)
goto WAIT
}
establishedLeader = true
}
// Reconcile any missing data
if err := s.reconcile(); err != nil {
s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err)
goto WAIT
}
// Initial reconcile worked, now we can process the channel
// updates
reconcileCh = s.reconcileCh
WAIT:
// Wait until leadership is lost
for {
select {
case <-stopCh:
return
case <-s.shutdownCh:
return
case <-interval:
goto RECONCILE
case member := <-reconcileCh:
s.reconcileMember(member)
}
}
}
// establishLeadership is invoked once we become leader and are able
// to invoke an initial barrier. The barrier is used to ensure any
// previously inflight transactions have been commited and that our
// state is up-to-date.
func (s *Server) establishLeadership(stopCh chan struct{}) error {
// If we have multiple workers, disable one to free processing
// for the plan queue and evaluation broker
if len(s.workers) > 1 {
s.workers[0].SetPause(true)
}
// Enable the plan queue, since we are now the leader
s.planQueue.SetEnabled(true)
// Start the plan evaluator
go s.planApply()
// Enable the eval broker, since we are now the leader
s.evalBroker.SetEnabled(true)
// Restore the eval broker state
if err := s.restoreEvalBroker(); err != nil {
return err
}
// Scheduler periodic jobs
go s.schedulePeriodic(stopCh)
// Reap any failed evaluations
go s.reapFailedEvaluations(stopCh)
// Setup the heartbeat timers. This is done both when starting up or when
// a leader fail over happens. Since the timers are maintained by the leader
// node, effectively this means all the timers are renewed at the time of failover.
// The TTL contract is that the session will not be expired before the TTL,
// so expiring it later is allowable.
//
// This MUST be done after the initial barrier to ensure the latest Nodes
// are available to be initialized. Otherwise initialization may use stale
// data.
if err := s.initializeHeartbeatTimers(); err != nil {
s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err)
return err
}
return nil
}
// restoreEvalBroker is used to restore all pending evaluations
// into the eval broker. The broker is maintained only by the leader,
// so it must be restored anytime a leadership transition takes place.
func (s *Server) restoreEvalBroker() error {
// Get an iterator over every evaluation
iter, err := s.fsm.State().Evals()
if err != nil {
return fmt.Errorf("failed to get evaluations: %v", err)
}
for {
raw := iter.Next()
if raw == nil {
break
}
eval := raw.(*structs.Evaluation)
if !eval.ShouldEnqueue() {
continue
}
if err := s.evalBroker.Enqueue(eval); err != nil {
return fmt.Errorf("failed to enqueue evaluation %s: %v", eval.ID, err)
}
}
return nil
}
// schedulePeriodic is used to do periodic job dispatch while we are leader
func (s *Server) schedulePeriodic(stopCh chan struct{}) {
evalGC := time.NewTicker(s.config.EvalGCInterval)
defer evalGC.Stop()
for {
select {
case <-evalGC.C:
s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC))
case <-stopCh:
return
}
}
}
// coreJobEval returns an evaluation for a core job
func (s *Server) coreJobEval(job string) *structs.Evaluation {
return &structs.Evaluation{
ID: generateUUID(),
Priority: structs.CoreJobPriority,
Type: structs.JobTypeCore,
TriggeredBy: structs.EvalTriggerScheduled,
JobID: job,
Status: structs.EvalStatusPending,
ModifyIndex: s.raft.AppliedIndex(),
}
}
// reapFailedEvaluations is used to reap evaluations that
// have reached their delivery limit and should be failed
func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
for {
select {
case <-stopCh:
return
default:
// Scan for a failed evaluation
eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
if err != nil {
return
}
if eval == nil {
continue
}
// Update the status to failed
newEval := eval.Copy()
newEval.Status = structs.EvalStatusFailed
newEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", newEval)
// Update via Raft
req := structs.EvalUpdateRequest{
Evals: []*structs.Evaluation{newEval},
}
if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
s.logger.Printf("[ERR] nomad: failed to update failed eval %#v: %v", newEval, err)
continue
}
// Ack completion
s.evalBroker.Ack(eval.ID, token)
}
}
}
// revokeLeadership is invoked once we step down as leader.
// This is used to cleanup any state that may be specific to a leader.
func (s *Server) revokeLeadership() error {
// Disable the plan queue, since we are no longer leader
s.planQueue.SetEnabled(false)
// Disable the eval broker, since it is only useful as a leader
s.evalBroker.SetEnabled(false)
// Clear the heartbeat timers on either shutdown or step down,
// since we are no longer responsible for TTL expirations.
if err := s.clearAllHeartbeatTimers(); err != nil {
s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err)
return err
}
// Unpause our worker if we paused previously
if len(s.workers) > 1 {
s.workers[0].SetPause(false)
}
return nil
}
// reconcile is used to reconcile the differences between Serf
// membership and what is reflected in our strongly consistent store.
func (s *Server) reconcile() error {
defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
members := s.serf.Members()
for _, member := range members {
if err := s.reconcileMember(member); err != nil {
return err
}
}
return nil
}
// reconcileMember is used to do an async reconcile of a single serf member
func (s *Server) reconcileMember(member serf.Member) error {
// Check if this is a member we should handle
valid, parts := isNomadServer(member)
if !valid || parts.Region != s.config.Region {
return nil
}
defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())
// Do not reconcile ourself
if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) {
return nil
}
var err error
switch member.Status {
case serf.StatusAlive:
err = s.addRaftPeer(member, parts)
case serf.StatusLeft, StatusReap:
err = s.removeRaftPeer(member, parts)
}
if err != nil {
s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v",
member, err)
return err
}
return nil
}
// addRaftPeer is used to add a new Raft peer when a Nomad server joins
func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
// Check for possibility of multiple bootstrap nodes
if parts.Bootstrap {
members := s.serf.Members()
for _, member := range members {
valid, p := isNomadServer(member)
if valid && member.Name != m.Name && p.Bootstrap {
s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
return nil
}
}
}
// Attempt to add as a peer
future := s.raft.AddPeer(parts.Addr.String())
if err := future.Error(); err != nil && err != raft.ErrKnownPeer {
s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
return err
} else if err == nil {
s.logger.Printf("[INFO] nomad: added raft peer: %v", parts)
}
return nil
}
// removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
// or is reaped
func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
// Attempt to remove as peer
future := s.raft.RemovePeer(parts.Addr.String())
if err := future.Error(); err != nil && err != raft.ErrUnknownPeer {
s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
parts, err)
return err
} else if err == nil {
s.logger.Printf("[INFO] nomad: removed server '%s' as peer", m.Name)
}
return nil
}