open-nomad/nomad/heartbeat.go

235 lines
6.6 KiB
Go
Raw Normal View History

package nomad
import (
2018-02-20 18:22:15 +00:00
"errors"
2018-09-15 23:23:13 +00:00
"sync"
"time"
2019-01-15 19:46:12 +00:00
metrics "github.com/armon/go-metrics"
2018-09-15 23:23:13 +00:00
log "github.com/hashicorp/go-hclog"
memdb "github.com/hashicorp/go-memdb"
2016-05-03 07:06:59 +00:00
"github.com/hashicorp/consul/lib"
"github.com/hashicorp/nomad/nomad/structs"
)
2018-02-20 18:22:15 +00:00
const (
	// heartbeatNotLeader is the message text of the error handed back when
	// a heartbeat reset is refused because this server is not the leader.
	heartbeatNotLeader = "failed to reset heartbeat since server is not leader"

	// NodeHeartbeatEventMissed is the message recorded on the node event
	// that is emitted when a node's heartbeat TTL expires.
	NodeHeartbeatEventMissed = "Node heartbeat missed"
)
// heartbeatNotLeaderErr is the sentinel error returned when a heartbeat
// request cannot be honoured because this server is not the leader. Its
// message is the heartbeatNotLeader constant.
var heartbeatNotLeaderErr = errors.New(heartbeatNotLeader)
2018-09-15 23:23:13 +00:00
// nodeHeartbeater keeps one expiration timer per node heartbeat. When a
// timer fires without having been renewed, invalidateHeartbeat runs and
// issues a status update for that node.
type nodeHeartbeater struct {
	// Server is embedded so that the heartbeater can reach the server's
	// configuration, state and leadership checks directly.
	*Server

	// logger is the heartbeat-scoped logger.
	logger log.Logger

	// heartbeatTimers maps a node ID to the timer tracking the expiration
	// of that node's heartbeat TTL. It may be nil when nothing is tracked.
	heartbeatTimers map[string]*time.Timer

	// heartbeatTimersLock must be held for every access to heartbeatTimers.
	heartbeatTimersLock sync.Mutex
}
// newNodeHeartbeater builds the heartbeater for server s. The returned value
// embeds s and logs through a sub-logger of s.logger named "heartbeat". The
// timer map is left nil; it is allocated lazily on the first timer reset.
func newNodeHeartbeater(s *Server) *nodeHeartbeater {
	hb := new(nodeHeartbeater)
	hb.Server = s
	hb.logger = s.logger.Named("heartbeat")
	return hb
}
// initializeHeartbeatTimers is used when a leader is newly elected to create
// a new map to track heartbeat expiration and to reset all the timers from
// the previously known set of timers.
2018-09-15 23:23:13 +00:00
func (h *nodeHeartbeater) initializeHeartbeatTimers() error {
// Scan all nodes and reset their timer
2018-09-15 23:23:13 +00:00
snap, err := h.fsm.State().Snapshot()
if err != nil {
return err
}
// Get an iterator over nodes
2017-02-08 04:31:23 +00:00
ws := memdb.NewWatchSet()
iter, err := snap.Nodes(ws)
if err != nil {
return err
}
2018-09-15 23:23:13 +00:00
h.heartbeatTimersLock.Lock()
defer h.heartbeatTimersLock.Unlock()
// Handle each node
for {
raw := iter.Next()
if raw == nil {
break
}
node := raw.(*structs.Node)
if node.TerminalStatus() {
continue
}
2018-09-15 23:23:13 +00:00
h.resetHeartbeatTimerLocked(node.ID, h.config.FailoverHeartbeatTTL)
}
return nil
}
// resetHeartbeatTimer is used to reset the TTL of a heartbeat.
// This can be used for new heartbeats and existing ones.
2018-09-15 23:23:13 +00:00
func (h *nodeHeartbeater) resetHeartbeatTimer(id string) (time.Duration, error) {
h.heartbeatTimersLock.Lock()
defer h.heartbeatTimersLock.Unlock()
2018-02-20 18:22:15 +00:00
// Do not create a timer for the node since we are not the leader. This
// check avoids the race in which leadership is lost but a timer is created
// on this server since it was servicing an RPC during a leadership loss.
2018-09-15 23:23:13 +00:00
if !h.IsLeader() {
h.logger.Debug("ignoring resetting node TTL since this server is not the leader", "node_id", id)
2018-02-20 18:22:15 +00:00
return 0, heartbeatNotLeaderErr
}
// Compute the target TTL value
2018-09-15 23:23:13 +00:00
n := len(h.heartbeatTimers)
ttl := lib.RateScaledInterval(h.config.MaxHeartbeatsPerSecond, h.config.MinHeartbeatTTL, n)
ttl += lib.RandomStagger(ttl)
// Reset the TTL
2018-09-15 23:23:13 +00:00
h.resetHeartbeatTimerLocked(id, ttl+h.config.HeartbeatGrace)
return ttl, nil
}
// resetHeartbeatTimerLocked is used to reset a heartbeat timer
// assuming the heartbeatTimerLock is already held
2018-09-15 23:23:13 +00:00
func (h *nodeHeartbeater) resetHeartbeatTimerLocked(id string, ttl time.Duration) {
// Ensure a timer map exists
2018-09-15 23:23:13 +00:00
if h.heartbeatTimers == nil {
h.heartbeatTimers = make(map[string]*time.Timer)
}
// Renew the heartbeat timer if it exists
2018-09-15 23:23:13 +00:00
if timer, ok := h.heartbeatTimers[id]; ok {
timer.Reset(ttl)
return
}
2016-05-03 07:18:48 +00:00
// Create a new timer to track expiration of this heartbeat
timer := time.AfterFunc(ttl, func() {
2018-09-15 23:23:13 +00:00
h.invalidateHeartbeat(id)
})
2018-09-15 23:23:13 +00:00
h.heartbeatTimers[id] = timer
}
// invalidateHeartbeat is invoked when a heartbeat TTL is reached and we
// need to invalidate the heartbeat.
2018-09-15 23:23:13 +00:00
func (h *nodeHeartbeater) invalidateHeartbeat(id string) {
defer metrics.MeasureSince([]string{"nomad", "heartbeat", "invalidate"}, time.Now())
// Clear the heartbeat timer
2018-09-15 23:23:13 +00:00
h.heartbeatTimersLock.Lock()
if timer, ok := h.heartbeatTimers[id]; ok {
timer.Stop()
2018-09-15 23:23:13 +00:00
delete(h.heartbeatTimers, id)
}
2018-09-15 23:23:13 +00:00
h.heartbeatTimersLock.Unlock()
2018-02-20 18:22:15 +00:00
// Do not invalidate the node since we are not the leader. This check avoids
// the race in which leadership is lost but a timer is created on this
// server since it was servicing an RPC during a leadership loss.
2018-09-15 23:23:13 +00:00
if !h.IsLeader() {
h.logger.Debug("ignoring node TTL since this server is not the leader", "node_id", id)
2018-02-20 18:22:15 +00:00
return
}
2018-09-15 23:23:13 +00:00
h.logger.Warn("node TTL expired", "node_id", id)
// Make a request to update the node status
req := structs.NodeUpdateStatusRequest{
2018-05-11 21:53:41 +00:00
NodeID: id,
Status: structs.NodeStatusDown,
NodeEvent: structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).SetMessage(NodeHeartbeatEventMissed),
WriteRequest: structs.WriteRequest{
2018-09-15 23:23:13 +00:00
Region: h.config.Region,
},
}
if h.shouldDisconnect(id) {
req.Status = structs.NodeStatusDisconnected
}
var resp structs.NodeUpdateResponse
2018-09-15 23:23:13 +00:00
if err := h.staticEndpoints.Node.UpdateStatus(&req, &resp); err != nil {
h.logger.Error("update node status failed", "error", err)
}
}
// shouldDisconnect reports whether the node with the given id should be
// marked disconnected rather than down. It returns true as soon as one of
// the node's allocations has a DisconnectTimeout, evaluated at the current
// UTC time, that lies after that time.
//
// If the node's allocations cannot be read from state, the error is logged
// and false is returned.
func (h *nodeHeartbeater) shouldDisconnect(id string) bool {
	allocs, err := h.State().AllocsByNode(nil, id)
	if err != nil {
		// Include the node ID so the failure can be tied to a node.
		h.logger.Error("error retrieving allocs by node", "node_id", id, "error", err)
		return false
	}

	// Use one timestamp for every allocation so they are all compared
	// against the same instant.
	now := time.Now().UTC()
	for _, alloc := range allocs {
		if alloc.DisconnectTimeout(now).After(now) {
			return true
		}
	}
	return false
}
// clearHeartbeatTimer is used to clear the heartbeat time for
// a single heartbeat. This is used when a heartbeat is destroyed
// explicitly and no longer needed.
2018-09-15 23:23:13 +00:00
func (h *nodeHeartbeater) clearHeartbeatTimer(id string) error {
h.heartbeatTimersLock.Lock()
defer h.heartbeatTimersLock.Unlock()
2018-09-15 23:23:13 +00:00
if timer, ok := h.heartbeatTimers[id]; ok {
timer.Stop()
2018-09-15 23:23:13 +00:00
delete(h.heartbeatTimers, id)
}
return nil
}
// clearAllHeartbeatTimers is used when a leader is stepping
// down and we no longer need to track any heartbeat timers.
2018-09-15 23:23:13 +00:00
func (h *nodeHeartbeater) clearAllHeartbeatTimers() error {
h.heartbeatTimersLock.Lock()
defer h.heartbeatTimersLock.Unlock()
2018-09-15 23:23:13 +00:00
for _, t := range h.heartbeatTimers {
t.Stop()
}
2018-09-15 23:23:13 +00:00
h.heartbeatTimers = nil
return nil
}
// heartbeatStats is a long running routine used to capture
// the number of active heartbeats being tracked
2018-09-15 23:23:13 +00:00
func (h *nodeHeartbeater) heartbeatStats() {
for {
select {
case <-time.After(5 * time.Second):
2018-09-15 23:23:13 +00:00
h.heartbeatTimersLock.Lock()
num := len(h.heartbeatTimers)
h.heartbeatTimersLock.Unlock()
metrics.SetGauge([]string{"nomad", "heartbeat", "active"}, float32(num))
2018-09-15 23:23:13 +00:00
case <-h.shutdownCh:
return
}
}
}