2018-03-08 23:08:23 +00:00
|
|
|
package drainer
|
2018-03-02 00:37:19 +00:00
|
|
|
|
2018-03-03 01:15:38 +00:00
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"log"
|
|
|
|
"time"
|
2018-03-02 00:37:19 +00:00
|
|
|
|
2018-03-03 01:15:38 +00:00
|
|
|
memdb "github.com/hashicorp/go-memdb"
|
|
|
|
"github.com/hashicorp/nomad/nomad/state"
|
|
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
|
|
"golang.org/x/time/rate"
|
|
|
|
)
|
|
|
|
|
|
|
|
// DrainingNodeWatcher is the interface for watching for draining nodes. It
// currently declares no methods, so any value satisfies it.
type DrainingNodeWatcher interface{}
|
|
|
|
|
2018-03-06 18:12:17 +00:00
|
|
|
// TrackedNodes returns the set of tracked nodes
|
|
|
|
func (n *NodeDrainer) TrackedNodes() map[string]*structs.Node {
|
2018-03-03 01:15:38 +00:00
|
|
|
n.l.RLock()
|
|
|
|
defer n.l.RUnlock()
|
|
|
|
|
2018-03-06 18:12:17 +00:00
|
|
|
t := make(map[string]*structs.Node, len(n.nodes))
|
|
|
|
for n, d := range n.nodes {
|
|
|
|
t[n] = d.GetNode()
|
2018-03-03 01:15:38 +00:00
|
|
|
}
|
|
|
|
|
2018-03-06 18:12:17 +00:00
|
|
|
return t
|
2018-03-03 01:15:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Remove removes the given node from being tracked
|
|
|
|
func (n *NodeDrainer) Remove(nodeID string) {
|
|
|
|
n.l.Lock()
|
|
|
|
defer n.l.Unlock()
|
2018-03-03 01:24:48 +00:00
|
|
|
|
|
|
|
// TODO test the notifier is updated
|
|
|
|
// Remove it from being tracked and remove it from the dealiner
|
2018-03-03 01:15:38 +00:00
|
|
|
delete(n.nodes, nodeID)
|
2018-03-03 01:24:48 +00:00
|
|
|
n.deadlineNotifier.Remove(nodeID)
|
2018-03-03 01:15:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Update updates the node, either updating the tracked version or starting to
|
|
|
|
// track the node.
|
|
|
|
func (n *NodeDrainer) Update(node *structs.Node) {
|
|
|
|
n.l.Lock()
|
|
|
|
defer n.l.Unlock()
|
|
|
|
|
|
|
|
if node == nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
draining, ok := n.nodes[node.ID]
|
|
|
|
if !ok {
|
2018-03-07 22:57:35 +00:00
|
|
|
draining = NewDrainingNode(node, n.state)
|
|
|
|
n.nodes[node.ID] = draining
|
|
|
|
} else {
|
|
|
|
// Update it
|
|
|
|
draining.Update(node)
|
2018-03-03 01:15:38 +00:00
|
|
|
}
|
|
|
|
|
2018-03-03 01:24:48 +00:00
|
|
|
// TODO test the notifier is updated
|
|
|
|
if inf, deadline := node.DrainStrategy.DeadlineTime(); !inf {
|
|
|
|
n.deadlineNotifier.Watch(node.ID, deadline)
|
|
|
|
} else {
|
|
|
|
// There is an infinite deadline so it shouldn't be tracked for
|
|
|
|
// deadlining
|
|
|
|
n.deadlineNotifier.Remove(node.ID)
|
|
|
|
}
|
|
|
|
|
2018-03-07 22:57:35 +00:00
|
|
|
// TODO Test this
|
|
|
|
// Register interest in the draining jobs.
|
|
|
|
jobs, err := draining.RunningServices()
|
|
|
|
if err != nil {
|
|
|
|
n.logger.Printf("[ERR] nomad.drain: error retrieving services on node %q: %v", node.ID, err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
n.logger.Printf("[TRACE] nomad.drain: node %q has %d services on it", node.ID, len(jobs))
|
|
|
|
for _, job := range jobs {
|
|
|
|
n.jobWatcher.RegisterJob(job)
|
|
|
|
}
|
|
|
|
|
2018-03-07 23:42:17 +00:00
|
|
|
// TODO Test at this layer as well that a node drain on a node without
|
|
|
|
// allocs immediately gets unmarked as draining
|
2018-03-07 23:16:45 +00:00
|
|
|
// Check if the node is done such that if an operator drains a node with
|
|
|
|
// nothing on it we unset drain
|
|
|
|
done, err := draining.IsDone()
|
|
|
|
if err != nil {
|
|
|
|
n.logger.Printf("[ERR] nomad.drain: failed to check if node %q is done draining: %v", node.ID, err)
|
|
|
|
return
|
|
|
|
}
|
2018-03-07 22:57:35 +00:00
|
|
|
|
2018-03-07 23:16:45 +00:00
|
|
|
if done {
|
|
|
|
index, err := n.raft.NodeDrainComplete(node.ID)
|
|
|
|
if err != nil {
|
|
|
|
n.logger.Printf("[ERR] nomad.drain: failed to unset drain for node %q: %v", node.ID, err)
|
|
|
|
} else {
|
|
|
|
n.logger.Printf("[INFO] nomad.drain: node %q completed draining at index %d", node.ID, index)
|
|
|
|
}
|
|
|
|
}
|
2018-03-03 01:15:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// nodeDrainWatcher is used to watch nodes that are entering, leaving or
|
|
|
|
// changing their drain strategy.
|
|
|
|
type nodeDrainWatcher struct {
|
|
|
|
ctx context.Context
|
|
|
|
logger *log.Logger
|
|
|
|
|
|
|
|
// state is the state that is watched for state changes.
|
|
|
|
state *state.StateStore
|
|
|
|
|
|
|
|
// limiter is used to limit the rate of blocking queries
|
|
|
|
limiter *rate.Limiter
|
|
|
|
|
|
|
|
// tracker is the object that is tracking the nodes and provides us with the
|
|
|
|
// needed callbacks
|
|
|
|
tracker NodeTracker
|
|
|
|
}
|
|
|
|
|
|
|
|
// NewNodeDrainWatcher returns a new node drain watcher.
|
|
|
|
func NewNodeDrainWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger, tracker NodeTracker) *nodeDrainWatcher {
|
|
|
|
w := &nodeDrainWatcher{
|
|
|
|
ctx: ctx,
|
|
|
|
limiter: limiter,
|
|
|
|
logger: logger,
|
|
|
|
tracker: tracker,
|
|
|
|
state: state,
|
|
|
|
}
|
|
|
|
|
|
|
|
go w.watch()
|
|
|
|
return w
|
|
|
|
}
|
|
|
|
|
|
|
|
// watch is the long lived watching routine that detects node changes.
|
|
|
|
func (w *nodeDrainWatcher) watch() {
|
|
|
|
nindex := uint64(1)
|
|
|
|
for {
|
|
|
|
w.logger.Printf("[TRACE] nomad.drain.node_watcher: getting nodes at index %d", nindex)
|
|
|
|
nodes, index, err := w.getNodes(nindex)
|
2018-03-07 22:57:35 +00:00
|
|
|
w.logger.Printf("[TRACE] nomad.drain.node_watcher: got nodes %d at index %d: %v", len(nodes), nindex, err)
|
2018-03-03 01:15:38 +00:00
|
|
|
if err != nil {
|
|
|
|
if err == context.Canceled {
|
|
|
|
w.logger.Printf("[TRACE] nomad.drain.node_watcher: shutting down")
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
w.logger.Printf("[ERR] nomad.drain.node_watcher: error watching node updates at index %d: %v", nindex, err)
|
|
|
|
select {
|
|
|
|
case <-w.ctx.Done():
|
|
|
|
w.logger.Printf("[TRACE] nomad.drain.node_watcher: shutting down")
|
|
|
|
return
|
|
|
|
case <-time.After(stateReadErrorDelay):
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// update index for next run
|
|
|
|
nindex = index
|
|
|
|
|
2018-03-06 18:12:17 +00:00
|
|
|
tracked := w.tracker.TrackedNodes()
|
|
|
|
for nodeID, node := range nodes {
|
2018-03-03 01:15:38 +00:00
|
|
|
newDraining := node.DrainStrategy != nil
|
2018-03-06 18:12:17 +00:00
|
|
|
currentNode, tracked := tracked[nodeID]
|
2018-03-03 01:15:38 +00:00
|
|
|
|
|
|
|
switch {
|
|
|
|
// If the node is tracked but not draining, untrack
|
|
|
|
case tracked && !newDraining:
|
2018-03-06 18:12:17 +00:00
|
|
|
w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q is no longer draining", nodeID)
|
|
|
|
w.tracker.Remove(nodeID)
|
2018-03-03 01:15:38 +00:00
|
|
|
|
|
|
|
// If the node is not being tracked but is draining, track
|
|
|
|
case !tracked && newDraining:
|
2018-03-06 18:12:17 +00:00
|
|
|
w.logger.Printf("[TRACE] nomad.drain.node_watcher: untracked node %q is draining", nodeID)
|
2018-03-03 01:15:38 +00:00
|
|
|
w.tracker.Update(node)
|
|
|
|
|
|
|
|
// If the node is being tracked but has changed, update:
|
|
|
|
case tracked && newDraining && !currentNode.DrainStrategy.Equal(node.DrainStrategy):
|
2018-03-06 18:12:17 +00:00
|
|
|
w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q has updated drain", nodeID)
|
2018-03-03 01:15:38 +00:00
|
|
|
w.tracker.Update(node)
|
|
|
|
default:
|
2018-03-06 18:12:17 +00:00
|
|
|
w.logger.Printf("[TRACE] nomad.drain.node_watcher: node %q at index %v: tracked %v, draining %v", nodeID, node.ModifyIndex, tracked, newDraining)
|
|
|
|
}
|
2018-03-07 23:42:17 +00:00
|
|
|
|
|
|
|
// TODO(schmichael) handle the case of a lost node
|
2018-03-06 18:12:17 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
for nodeID := range tracked {
|
|
|
|
if _, ok := nodes[nodeID]; !ok {
|
|
|
|
w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q is no longer exists", nodeID)
|
|
|
|
w.tracker.Remove(nodeID)
|
2018-03-03 01:15:38 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// getNodes returns all nodes blocking until the nodes are after the given index.
|
2018-03-06 18:12:17 +00:00
|
|
|
func (w *nodeDrainWatcher) getNodes(minIndex uint64) (map[string]*structs.Node, uint64, error) {
|
2018-03-03 01:15:38 +00:00
|
|
|
if err := w.limiter.Wait(w.ctx); err != nil {
|
|
|
|
return nil, 0, err
|
|
|
|
}
|
|
|
|
|
|
|
|
resp, index, err := w.state.BlockingQuery(w.getNodesImpl, minIndex, w.ctx)
|
|
|
|
if err != nil {
|
|
|
|
return nil, 0, err
|
|
|
|
}
|
|
|
|
|
2018-03-06 18:12:17 +00:00
|
|
|
return resp.(map[string]*structs.Node), index, nil
|
2018-03-03 01:15:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// getNodesImpl is used to get nodes from the state store, returning the set of
|
|
|
|
// nodes and the given index.
|
|
|
|
func (w *nodeDrainWatcher) getNodesImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
|
|
|
|
iter, err := state.Nodes(ws)
|
|
|
|
if err != nil {
|
|
|
|
return nil, 0, err
|
|
|
|
}
|
|
|
|
|
|
|
|
index, err := state.Index("nodes")
|
|
|
|
if err != nil {
|
|
|
|
return nil, 0, err
|
|
|
|
}
|
|
|
|
|
2018-03-06 18:12:17 +00:00
|
|
|
resp := make(map[string]*structs.Node, 64)
|
2018-03-03 01:15:38 +00:00
|
|
|
for {
|
|
|
|
raw := iter.Next()
|
|
|
|
if raw == nil {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
|
|
|
|
node := raw.(*structs.Node)
|
2018-03-06 18:12:17 +00:00
|
|
|
resp[node.ID] = node
|
2018-03-03 01:15:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return resp, index, nil
|
2018-03-02 00:37:19 +00:00
|
|
|
}
|