open-nomad/nomad/heartbeat.go

package nomad

import (
	"errors"
	"sync"
	"time"

	metrics "github.com/armon/go-metrics"
	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"

	"github.com/hashicorp/consul/lib"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// heartbeatNotLeader is the error string returned when the heartbeat request
	// couldn't be completed since the server is not the leader. It is the
	// message carried by heartbeatNotLeaderErr.
	heartbeatNotLeader = "failed to reset heartbeat since server is not leader"

	// NodeHeartbeatEventMissed is the node event message attached to the
	// status update that is issued when a node's heartbeat is missed.
	NodeHeartbeatEventMissed = "Node heartbeat missed"
)

var (
	// heartbeatNotLeaderErr is the error returned when the heartbeat request
	// couldn't be completed since the server is not the leader.
	// resetHeartbeatTimer returns it when IsLeader reports false.
	heartbeatNotLeaderErr = errors.New(heartbeatNotLeader)
)

// nodeHeartbeater is used to track expiration times of node heartbeats. If it
// detects an expired node, the node status is updated to be 'down', or to
// 'disconnected' when shouldDisconnect reports true for that node.
type nodeHeartbeater struct {
	// Server is embedded so the heartbeater can reach the server's state,
	// configuration, leadership check and endpoints directly.
	*Server
	// logger is the server logger scoped with the name "heartbeat".
	logger log.Logger

	// heartbeatTimers track the expiration time of each heartbeat that has
	// a TTL, keyed by node ID. On expiration, invalidateHeartbeat is run for
	// the node. The map is created lazily and set back to nil when all
	// timers are cleared.
	heartbeatTimers     map[string]*time.Timer
	// heartbeatTimersLock guards heartbeatTimers.
	heartbeatTimersLock sync.Mutex
}

// newNodeHeartbeater builds the heartbeater for server s. The returned value
// embeds s and logs through s's logger under the "heartbeat" name. The timer
// map is left nil; it is created on first use.
func newNodeHeartbeater(s *Server) *nodeHeartbeater {
	hb := new(nodeHeartbeater)
	hb.Server = s
	hb.logger = s.logger.Named("heartbeat")
	return hb
}

// initializeHeartbeatTimers is used when a leader is newly elected. It takes
// a snapshot of the state store, walks every node in it and (re)arms a
// heartbeat timer with the failover TTL for each node that is not in a
// terminal status. An error is returned only if the snapshot or the node
// iterator cannot be obtained.
func (h *nodeHeartbeater) initializeHeartbeatTimers() error {
	// Work from a point-in-time view of the state store
	snapshot, err := h.fsm.State().Snapshot()
	if err != nil {
		return err
	}

	// Iterate over all known nodes
	nodes, err := snapshot.Nodes(memdb.NewWatchSet())
	if err != nil {
		return err
	}

	// Hold the lock for the whole scan so the locked helper can be used
	h.heartbeatTimersLock.Lock()
	defer h.heartbeatTimersLock.Unlock()

	for raw := nodes.Next(); raw != nil; raw = nodes.Next() {
		// Nodes in a terminal status do not get a heartbeat timer
		if node := raw.(*structs.Node); !node.TerminalStatus() {
			h.resetHeartbeatTimerLocked(node.ID, h.config.FailoverHeartbeatTTL)
		}
	}
	return nil
}

// resetHeartbeatTimer (re)arms the heartbeat timer for node id, for both new
// and existing heartbeats. It returns the TTL computed for the node; the
// timer itself is armed for that TTL plus the configured heartbeat grace.
// If this server is not the leader no timer is touched and
// heartbeatNotLeaderErr is returned.
func (h *nodeHeartbeater) resetHeartbeatTimer(id string) (time.Duration, error) {
	h.heartbeatTimersLock.Lock()
	defer h.heartbeatTimersLock.Unlock()

	// Leadership may have been lost while the RPC that led here was being
	// serviced; refuse to create a timer on a non-leader in that case.
	if !h.IsLeader() {
		h.logger.Debug("ignoring resetting node TTL since this server is not the leader", "node_id", id)
		return 0, heartbeatNotLeaderErr
	}

	// Scale the base interval by the number of tracked timers, then add a
	// random stagger on top of it
	base := lib.RateScaledInterval(h.config.MaxHeartbeatsPerSecond, h.config.MinHeartbeatTTL, len(h.heartbeatTimers))
	ttl := base + lib.RandomStagger(base)

	// Arm the timer with the grace period included; report the bare TTL
	h.resetHeartbeatTimerLocked(id, ttl+h.config.HeartbeatGrace)
	return ttl, nil
}

// resetHeartbeatTimerLocked arms the heartbeat timer for node id so that it
// fires after ttl. The caller must already hold heartbeatTimersLock.
func (h *nodeHeartbeater) resetHeartbeatTimerLocked(id string, ttl time.Duration) {
	// The map is allocated lazily
	if h.heartbeatTimers == nil {
		h.heartbeatTimers = make(map[string]*time.Timer)
	}

	// An already tracked node only needs its timer pushed out
	existing, tracked := h.heartbeatTimers[id]
	if tracked {
		existing.Reset(ttl)
		return
	}

	// First heartbeat for this node: invalidate it once the TTL elapses
	h.heartbeatTimers[id] = time.AfterFunc(ttl, func() {
		h.invalidateHeartbeat(id)
	})
}

// invalidateHeartbeat runs when the heartbeat timer of node id fires. It
// removes the node's timer and, if this server is the leader, submits a
// status update marking the node as down, or as disconnected when
// shouldDisconnect reports true. A failed status update is logged, not
// returned.
func (h *nodeHeartbeater) invalidateHeartbeat(id string) {
	start := time.Now()
	defer metrics.MeasureSince([]string{"nomad", "heartbeat", "invalidate"}, start)

	// Stop tracking the node's timer
	h.heartbeatTimersLock.Lock()
	if t, tracked := h.heartbeatTimers[id]; tracked {
		t.Stop()
		delete(h.heartbeatTimers, id)
	}
	h.heartbeatTimersLock.Unlock()

	// A timer may have been created while leadership was being lost (the
	// server was still servicing an RPC); a non-leader must not invalidate
	// the node.
	if !h.IsLeader() {
		h.logger.Debug("ignoring node TTL since this server is not the leader", "node_id", id)
		return
	}

	h.logger.Warn("node TTL expired", "node_id", id)

	// Build the status update, defaulting to 'down'
	missed := structs.NewNodeEvent().
		SetSubsystem(structs.NodeEventSubsystemCluster).
		SetMessage(NodeHeartbeatEventMissed)
	req := structs.NodeUpdateStatusRequest{
		NodeID:       id,
		Status:       structs.NodeStatusDown,
		NodeEvent:    missed,
		WriteRequest: structs.WriteRequest{Region: h.config.Region},
	}

	// Prefer 'disconnected' when the node qualifies for it
	if h.shouldDisconnect(id) {
		req.Status = structs.NodeStatusDisconnected
	}

	var resp structs.NodeUpdateResponse
	err := h.staticEndpoints.Node.UpdateStatus(&req, &resp)
	if err != nil {
		h.logger.Error("update node status failed", "error", err)
	}
}

// shouldDisconnect reports whether node id should be marked as disconnected
// rather than down: it returns true as soon as one of the node's allocations
// has a disconnect timeout, computed from the current UTC time, that lies
// after that time. If the allocations cannot be looked up the error is
// logged and false is returned.
func (h *nodeHeartbeater) shouldDisconnect(id string) bool {
	nodeAllocs, err := h.State().AllocsByNode(nil, id)
	if err != nil {
		h.logger.Error("error retrieving allocs by node", "error", err)
		return false
	}

	// Use one reference instant for every allocation
	now := time.Now().UTC()
	for _, a := range nodeAllocs {
		deadline := a.DisconnectTimeout(now)
		if deadline.After(now) {
			return true
		}
	}

	return false
}

// clearHeartbeatTimer stops and forgets the heartbeat timer of node id, for
// use when a heartbeat is destroyed explicitly and no longer needed. It is a
// no-op for a node without a timer and always returns nil.
func (h *nodeHeartbeater) clearHeartbeatTimer(id string) error {
	h.heartbeatTimersLock.Lock()
	defer h.heartbeatTimersLock.Unlock()

	t, tracked := h.heartbeatTimers[id]
	if !tracked {
		return nil
	}

	t.Stop()
	delete(h.heartbeatTimers, id)
	return nil
}

// clearAllHeartbeatTimers stops every tracked heartbeat timer and drops the
// timer map. It is used when a leader is stepping down and heartbeats no
// longer need to be tracked. It always returns nil.
func (h *nodeHeartbeater) clearAllHeartbeatTimers() error {
	h.heartbeatTimersLock.Lock()
	defer h.heartbeatTimersLock.Unlock()

	for id := range h.heartbeatTimers {
		h.heartbeatTimers[id].Stop()
	}

	// Release the map itself; it is re-created lazily on the next reset
	h.heartbeatTimers = nil
	return nil
}

// heartbeatStats is a long running routine used to capture the number of
// active heartbeats being tracked. Every five seconds it publishes the
// current size of the timer map as the "nomad.heartbeat.active" gauge, and
// it returns once the server's shutdown channel is closed.
func (h *nodeHeartbeater) heartbeatStats() {
	// A single ticker is reused for the lifetime of the routine instead of
	// allocating a fresh timer on every loop iteration; it is released as
	// soon as the routine exits.
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			// Only the length is read under the lock; the gauge is
			// emitted after releasing it.
			h.heartbeatTimersLock.Lock()
			num := len(h.heartbeatTimers)
			h.heartbeatTimersLock.Unlock()
			metrics.SetGauge([]string{"nomad", "heartbeat", "active"}, float32(num))

		case <-h.shutdownCh:
			return
		}
	}
}
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`package nomad`

			`import (`
Add escape hatches when non-leader 2018-02-20 18:22:15 +00:00			`"errors"`
server 2018-09-15 23:23:13 +00:00			`"sync"`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`"time"`

goimports 2019-01-15 19:46:12 +00:00			`metrics "github.com/armon/go-metrics"`
server 2018-09-15 23:23:13 +00:00			`log "github.com/hashicorp/go-hclog"`
			`memdb "github.com/hashicorp/go-memdb"`

Use consul/lib's RateScaledInterval 2016-05-03 07:06:59 +00:00			`"github.com/hashicorp/consul/lib"`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`"github.com/hashicorp/nomad/nomad/structs"`
			`)`

Add escape hatches when non-leader 2018-02-20 18:22:15 +00:00			`const (`
			`// heartbeatNotLeader is the error string returned when the heartbeat request`
			`// couldn't be completed since the server is not the leader.`
			`heartbeatNotLeader = "failed to reset heartbeat since server is not leader"`
node heartbeat missed event 2018-05-11 21:53:41 +00:00
			`// NodeHeartbeatEventMissed is the event used when the Nodes heartbeat is`
			`// missed.`
			`NodeHeartbeatEventMissed = "Node heartbeat missed"`
Add escape hatches when non-leader 2018-02-20 18:22:15 +00:00			`)`

			`var (`
			`// heartbeatNotLeaderErr is the error returned when the heartbeat request`
			`// couldn't be completed since the server is not the leader.`
			`heartbeatNotLeaderErr = errors.New(heartbeatNotLeader)`
			`)`

server 2018-09-15 23:23:13 +00:00			`// nodeHeartbeater is used to track expiration times of node heartbeats. If it`
			`// detects an expired node, the node status is updated to be 'down'.`
			`type nodeHeartbeater struct {`
			`*Server`
			`logger log.Logger`

			`// heartbeatTimers track the expiration time of each heartbeat that has`
			`// a TTL. On expiration, the node status is updated to be 'down'.`
			`heartbeatTimers map[string]*time.Timer`
			`heartbeatTimersLock sync.Mutex`
			`}`

			`// newNodeHeartbeater returns a new node heartbeater used to detect and act on`
			`// failed node heartbeats.`
			`func newNodeHeartbeater(s *Server) *nodeHeartbeater {`
			`return &nodeHeartbeater{`
			`Server: s,`
			`logger: s.logger.Named("heartbeat"),`
			`}`
			`}`

nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`// initializeHeartbeatTimers is used when a leader is newly elected to create`
			`// a new map to track heartbeat expiration and to reset all the timers from`
			`// the previously known set of timers.`
server 2018-09-15 23:23:13 +00:00			`func (h *nodeHeartbeater) initializeHeartbeatTimers() error {`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`// Scan all nodes and reset their timer`
server 2018-09-15 23:23:13 +00:00			`snap, err := h.fsm.State().Snapshot()`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`if err != nil {`
			`return err`
			`}`

			`// Get an iterator over nodes`
Nomad builds 2017-02-08 04:31:23 +00:00			`ws := memdb.NewWatchSet()`
			`iter, err := snap.Nodes(ws)`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`if err != nil {`
			`return err`
			`}`

server 2018-09-15 23:23:13 +00:00			`h.heartbeatTimersLock.Lock()`
			`defer h.heartbeatTimersLock.Unlock()`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00
			`// Handle each node`
			`for {`
			`raw := iter.Next()`
			`if raw == nil {`
			`break`
			`}`
			`node := raw.(*structs.Node)`
			`if node.TerminalStatus() {`
			`continue`
			`}`
server 2018-09-15 23:23:13 +00:00			`h.resetHeartbeatTimerLocked(node.ID, h.config.FailoverHeartbeatTTL)`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`}`
			`return nil`
			`}`

			`// resetHeartbeatTimer is used to reset the TTL of a heartbeat.`
			`// This can be used for new heartbeats and existing ones.`
server 2018-09-15 23:23:13 +00:00			`func (h *nodeHeartbeater) resetHeartbeatTimer(id string) (time.Duration, error) {`
			`h.heartbeatTimersLock.Lock()`
			`defer h.heartbeatTimersLock.Unlock()`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00
Add escape hatches when non-leader 2018-02-20 18:22:15 +00:00			`// Do not create a timer for the node since we are not the leader. This`
			`// check avoids the race in which leadership is lost but a timer is created`
			`// on this server since it was servicing an RPC during a leadership loss.`
server 2018-09-15 23:23:13 +00:00			`if !h.IsLeader() {`
			`h.logger.Debug("ignoring resetting node TTL since this server is not the leader", "node_id", id)`
Add escape hatches when non-leader 2018-02-20 18:22:15 +00:00			`return 0, heartbeatNotLeaderErr`
			`}`

nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`// Compute the target TTL value`
server 2018-09-15 23:23:13 +00:00			`n := len(h.heartbeatTimers)`
			`ttl := lib.RateScaledInterval(h.config.MaxHeartbeatsPerSecond, h.config.MinHeartbeatTTL, n)`
Use consul/lib's RandomStagger Removes four redundant copies of the method in the process. 2016-05-03 07:15:29 +00:00			`ttl += lib.RandomStagger(ttl)`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00
			`// Reset the TTL`
server 2018-09-15 23:23:13 +00:00			`h.resetHeartbeatTimerLocked(id, ttl+h.config.HeartbeatGrace)`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`return ttl, nil`
			`}`

			`// resetHeartbeatTimerLocked is used to reset a heartbeat timer`
			`// assuming the heartbeatTimerLock is already held`
server 2018-09-15 23:23:13 +00:00			`func (h *nodeHeartbeater) resetHeartbeatTimerLocked(id string, ttl time.Duration) {`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`// Ensure a timer map exists`
server 2018-09-15 23:23:13 +00:00			`if h.heartbeatTimers == nil {`
			`h.heartbeatTimers = make(map[string]*time.Timer)`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`}`

			`// Renew the heartbeat timer if it exists`
server 2018-09-15 23:23:13 +00:00			`if timer, ok := h.heartbeatTimers[id]; ok {`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`timer.Reset(ttl)`
			`return`
			`}`

Fix small typo 2016-05-03 07:18:48 +00:00			`// Create a new timer to track expiration of this heartbeat`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`timer := time.AfterFunc(ttl, func() {`
server 2018-09-15 23:23:13 +00:00			`h.invalidateHeartbeat(id)`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`})`
server 2018-09-15 23:23:13 +00:00			`h.heartbeatTimers[id] = timer`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`}`

			`// invalidateHeartbeat is invoked when a heartbeat TTL is reached and we`
			`// need to invalidate the heartbeat.`
server 2018-09-15 23:23:13 +00:00			`func (h *nodeHeartbeater) invalidateHeartbeat(id string) {`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`defer metrics.MeasureSince([]string{"nomad", "heartbeat", "invalidate"}, time.Now())`
			`// Clear the heartbeat timer`
server 2018-09-15 23:23:13 +00:00			`h.heartbeatTimersLock.Lock()`
			`if timer, ok := h.heartbeatTimers[id]; ok {`
Fixed #3595 (https://github.com/hashicorp/nomad/issues/3595) Stopping heartbeat timer before remove 2018-05-24 13:15:06 +00:00			`timer.Stop()`
server 2018-09-15 23:23:13 +00:00			`delete(h.heartbeatTimers, id)`
Fixed #3595 (https://github.com/hashicorp/nomad/issues/3595) Stopping heartbeat timer before remove 2018-05-24 13:15:06 +00:00			`}`
server 2018-09-15 23:23:13 +00:00			`h.heartbeatTimersLock.Unlock()`
Add escape hatches when non-leader 2018-02-20 18:22:15 +00:00
			`// Do not invalidate the node since we are not the leader. This check avoids`
			`// the race in which leadership is lost but a timer is created on this`
			`// server since it was servicing an RPC during a leadership loss.`
server 2018-09-15 23:23:13 +00:00			`if !h.IsLeader() {`
			`h.logger.Debug("ignoring node TTL since this server is not the leader", "node_id", id)`
Add escape hatches when non-leader 2018-02-20 18:22:15 +00:00			`return`
			`}`

server 2018-09-15 23:23:13 +00:00			`h.logger.Warn("node TTL expired", "node_id", id)`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00
			`// Make a request to update the node status`
			`req := structs.NodeUpdateStatusRequest{`
node heartbeat missed event 2018-05-11 21:53:41 +00:00			`NodeID: id,`
			`Status: structs.NodeStatusDown,`
			`NodeEvent: structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).SetMessage(NodeHeartbeatEventMissed),`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`WriteRequest: structs.WriteRequest{`
server 2018-09-15 23:23:13 +00:00			`Region: h.config.Region,`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`},`
			`}`
NodeStatusDisconnected: support state transitions for new node status 2022-02-22 11:13:14 +00:00
			`if h.shouldDisconnect(id) {`
			`req.Status = structs.NodeStatusDisconnected`
			`}`

nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`var resp structs.NodeUpdateResponse`
server 2018-09-15 23:23:13 +00:00			`if err := h.staticEndpoints.Node.UpdateStatus(&req, &resp); err != nil {`
			`h.logger.Error("update node status failed", "error", err)`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`}`
			`}`

NodeStatusDisconnected: support state transitions for new node status 2022-02-22 11:13:14 +00:00			`func (h *nodeHeartbeater) shouldDisconnect(id string) bool {`
			`allocs, err := h.State().AllocsByNode(nil, id)`
			`if err != nil {`
			`h.logger.Error("error retrieving allocs by node", "error", err)`
			`return false`
			`}`

			`now := time.Now().UTC()`
			`for _, alloc := range allocs {`
			`if alloc.DisconnectTimeout(now).After(now) {`
			`return true`
			`}`
			`}`

			`return false`
			`}`

nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`// clearHeartbeatTimer is used to clear the heartbeat time for`
			`// a single heartbeat. This is used when a heartbeat is destroyed`
			`// explicitly and no longer needed.`
server 2018-09-15 23:23:13 +00:00			`func (h *nodeHeartbeater) clearHeartbeatTimer(id string) error {`
			`h.heartbeatTimersLock.Lock()`
			`defer h.heartbeatTimersLock.Unlock()`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00
server 2018-09-15 23:23:13 +00:00			`if timer, ok := h.heartbeatTimers[id]; ok {`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`timer.Stop()`
server 2018-09-15 23:23:13 +00:00			`delete(h.heartbeatTimers, id)`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`}`
			`return nil`
			`}`

			`// clearAllHeartbeatTimers is used when a leader is stepping`
			`// down and we no longer need to track any heartbeat timers.`
server 2018-09-15 23:23:13 +00:00			`func (h *nodeHeartbeater) clearAllHeartbeatTimers() error {`
			`h.heartbeatTimersLock.Lock()`
			`defer h.heartbeatTimersLock.Unlock()`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00
server 2018-09-15 23:23:13 +00:00			`for _, t := range h.heartbeatTimers {`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`t.Stop()`
			`}`
server 2018-09-15 23:23:13 +00:00			`h.heartbeatTimers = nil`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`return nil`
			`}`

			`// heartbeatStats is a long running routine used to capture`
			`// the number of active heartbeats being tracked`
server 2018-09-15 23:23:13 +00:00			`func (h *nodeHeartbeater) heartbeatStats() {`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`for {`
			`select {`
			`case <-time.After(5 * time.Second):`
server 2018-09-15 23:23:13 +00:00			`h.heartbeatTimersLock.Lock()`
			`num := len(h.heartbeatTimers)`
			`h.heartbeatTimersLock.Unlock()`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`metrics.SetGauge([]string{"nomad", "heartbeat", "active"}, float32(num))`

server 2018-09-15 23:23:13 +00:00			`case <-h.shutdownCh:`
nomad: no heartbeat for nodes in terminal status 2015-08-23 00:17:13 +00:00			`return`
			`}`
			`}`
			`}`