open-nomad/api/nodes.go

612 lines
15 KiB
Go
Raw Normal View History

2015-09-11 23:51:18 +00:00
package api
2015-09-12 00:20:33 +00:00
import (
"context"
"fmt"
2015-09-17 19:40:51 +00:00
"sort"
"time"
2018-02-27 20:59:27 +00:00
"github.com/hashicorp/nomad/nomad/structs"
2015-09-12 00:20:33 +00:00
)
2015-09-11 23:51:18 +00:00
// Nodes is used to query node-related API endpoints
type Nodes struct {
client *Client
}
// Nodes returns a handle on the node endpoints.
func (c *Client) Nodes() *Nodes {
return &Nodes{client: c}
}
// List is used to list out all of the nodes
2015-09-14 02:55:47 +00:00
func (n *Nodes) List(q *QueryOptions) ([]*NodeListStub, *QueryMeta, error) {
2015-09-17 19:40:51 +00:00
var resp NodeIndexSort
2015-09-11 23:51:18 +00:00
qm, err := n.client.query("/v1/nodes", &resp, q)
if err != nil {
return nil, nil, err
}
2017-09-26 22:26:33 +00:00
sort.Sort(resp)
2015-09-11 23:51:18 +00:00
return resp, qm, nil
}
func (n *Nodes) PrefixList(prefix string) ([]*NodeListStub, *QueryMeta, error) {
return n.List(&QueryOptions{Prefix: prefix})
}
2015-09-11 23:51:18 +00:00
// Info is used to query a specific node by its ID.
func (n *Nodes) Info(nodeID string, q *QueryOptions) (*Node, *QueryMeta, error) {
var resp Node
qm, err := n.client.query("/v1/node/"+nodeID, &resp, q)
if err != nil {
return nil, nil, err
}
return &resp, qm, nil
}
2018-02-23 23:56:36 +00:00
// NodeUpdateDrainRequest is used to update the drain specification for a node.
type NodeUpdateDrainRequest struct {
// NodeID is the node to update the drain specification for.
NodeID string
// DrainSpec is the drain specification to set for the node. A nil DrainSpec
// will disable draining.
DrainSpec *DrainSpec
// MarkEligible marks the node as eligible for scheduling if removing
// the drain strategy.
MarkEligible bool
2018-02-23 23:56:36 +00:00
}
// NodeDrainUpdateResponse is used to respond to a node drain update
type NodeDrainUpdateResponse struct {
NodeModifyIndex uint64
EvalIDs []string
EvalCreateIndex uint64
WriteMeta
}
// UpdateDrain is used to update the drain strategy for a given node. If
// markEligible is true and the drain is being removed, the node will be marked
// as having its scheduling being eligible
func (n *Nodes) UpdateDrain(nodeID string, spec *DrainSpec, markEligible bool, q *WriteOptions) (*NodeDrainUpdateResponse, error) {
2018-02-23 23:56:36 +00:00
req := &NodeUpdateDrainRequest{
NodeID: nodeID,
DrainSpec: spec,
MarkEligible: markEligible,
2018-02-23 23:56:36 +00:00
}
var resp NodeDrainUpdateResponse
wm, err := n.client.write("/v1/node/"+nodeID+"/drain", req, &resp, q)
2015-09-11 23:51:18 +00:00
if err != nil {
return nil, err
}
resp.WriteMeta = *wm
return &resp, nil
2015-09-11 23:51:18 +00:00
}
2018-03-30 18:07:40 +00:00
// MonitorMsgLevels represents the severity log level of a MonitorMessage.
type MonitorMsgLevel int
const (
MonitorMsgLevelNormal MonitorMsgLevel = 0
MonitorMsgLevelInfo MonitorMsgLevel = 1
MonitorMsgLevelWarn MonitorMsgLevel = 2
MonitorMsgLevelError MonitorMsgLevel = 3
2018-03-30 18:07:40 +00:00
)
// MonitorMessage contains a message and log level.
type MonitorMessage struct {
Level MonitorMsgLevel
Message string
}
// Messagef formats a new MonitorMessage.
func Messagef(lvl MonitorMsgLevel, msg string, args ...interface{}) *MonitorMessage {
return &MonitorMessage{
Level: lvl,
Message: fmt.Sprintf(msg, args...),
}
}
func (m *MonitorMessage) String() string {
return m.Message
}
// MonitorDrain emits drain related events on the returned string channel. The
// channel will be closed when all allocations on the draining node have
// stopped or the context is canceled.
2018-03-30 18:07:40 +00:00
func (n *Nodes) MonitorDrain(ctx context.Context, nodeID string, index uint64, ignoreSys bool) <-chan *MonitorMessage {
outCh := make(chan *MonitorMessage, 8)
nodeCh := make(chan *MonitorMessage, 1)
allocCh := make(chan *MonitorMessage, 8)
// Multiplex node and alloc chans onto outCh. This goroutine closes
// outCh when other chans have been closed or context canceled.
multiplexCtx, cancel := context.WithCancel(ctx)
2018-06-06 00:58:44 +00:00
go n.monitorDrainMultiplex(multiplexCtx, cancel, outCh, nodeCh, allocCh)
// Monitor node for updates
2018-06-06 00:58:44 +00:00
go n.monitorDrainNode(multiplexCtx, cancel, nodeID, index, nodeCh)
// Monitor allocs on node for updates
2018-03-30 18:07:40 +00:00
go n.monitorDrainAllocs(multiplexCtx, nodeID, ignoreSys, allocCh)
return outCh
}
// monitorDrainMultiplex multiplexes node and alloc updates onto the out chan.
// Closes out chan when either the context is canceled, both update chans are
// closed, or an error occurs.
func (n *Nodes) monitorDrainMultiplex(ctx context.Context, cancel func(),
2018-03-30 18:07:40 +00:00
outCh chan<- *MonitorMessage, nodeCh, allocCh <-chan *MonitorMessage) {
defer cancel()
defer close(outCh)
2018-03-30 18:07:40 +00:00
nodeOk := true
allocOk := true
2018-03-30 18:07:40 +00:00
var msg *MonitorMessage
for {
// If both chans have been closed, close the output chan
if !nodeOk && !allocOk {
return
}
select {
case msg, nodeOk = <-nodeCh:
if !nodeOk {
// nil chan to prevent further recvs
nodeCh = nil
}
case msg, allocOk = <-allocCh:
if !allocOk {
// nil chan to prevent further recvs
allocCh = nil
}
case <-ctx.Done():
return
}
2018-03-30 18:07:40 +00:00
if msg == nil {
continue
}
select {
case outCh <- msg:
case <-ctx.Done():
2018-06-06 00:58:44 +00:00
// If we are exiting but we have a message, attempt to send it
// so we don't loose a message but do not block.
select {
case outCh <- msg:
default:
}
return
}
2018-03-30 18:07:40 +00:00
// Abort on error messages
if msg.Level == MonitorMsgLevelError {
return
}
}
}
// monitorDrainNode emits node updates on nodeCh and closes the channel when
// the node has finished draining.
2018-06-06 00:58:44 +00:00
func (n *Nodes) monitorDrainNode(ctx context.Context, cancel func(),
nodeID string, index uint64, nodeCh chan<- *MonitorMessage) {
defer close(nodeCh)
var lastStrategy *DrainStrategy
var strategyChanged bool
2018-03-30 18:07:40 +00:00
q := QueryOptions{
AllowStale: true,
WaitIndex: index,
}
for {
node, meta, err := n.Info(nodeID, &q)
if err != nil {
2018-03-30 18:07:40 +00:00
msg := Messagef(MonitorMsgLevelError, "Error monitoring node: %v", err)
select {
2018-03-30 18:07:40 +00:00
case nodeCh <- msg:
case <-ctx.Done():
}
return
}
if node.DrainStrategy == nil {
var msg *MonitorMessage
if strategyChanged {
msg = Messagef(MonitorMsgLevelInfo, "Node %q has marked all allocations for migration", nodeID)
} else {
msg = Messagef(MonitorMsgLevelInfo, "No drain strategy set for node %s", nodeID)
2018-06-06 00:58:44 +00:00
defer cancel()
}
select {
case nodeCh <- msg:
case <-ctx.Done():
}
return
}
if node.Status == structs.NodeStatusDown {
msg := Messagef(MonitorMsgLevelWarn, "Node %q down", nodeID)
select {
case nodeCh <- msg:
case <-ctx.Done():
}
}
// DrainStrategy changed
if lastStrategy != nil && !node.DrainStrategy.Equal(lastStrategy) {
2018-03-30 18:07:40 +00:00
msg := Messagef(MonitorMsgLevelInfo, "Node %q drain updated: %s", nodeID, node.DrainStrategy)
select {
case nodeCh <- msg:
case <-ctx.Done():
return
}
}
lastStrategy = node.DrainStrategy
strategyChanged = true
// Drain still ongoing, update index and block for updates
2018-03-30 19:43:53 +00:00
q.WaitIndex = meta.LastIndex
}
}
// monitorDrainAllocs emits alloc updates on allocCh and closes the channel
// when the node has finished draining.
2018-03-30 18:07:40 +00:00
func (n *Nodes) monitorDrainAllocs(ctx context.Context, nodeID string, ignoreSys bool, allocCh chan<- *MonitorMessage) {
defer close(allocCh)
q := QueryOptions{AllowStale: true}
initial := make(map[string]*Allocation, 4)
for {
allocs, meta, err := n.Allocations(nodeID, &q)
if err != nil {
2018-03-30 18:07:40 +00:00
msg := Messagef(MonitorMsgLevelError, "Error monitoring allocations: %v", err)
select {
2018-03-30 18:07:40 +00:00
case allocCh <- msg:
case <-ctx.Done():
}
return
}
q.WaitIndex = meta.LastIndex
runningAllocs := 0
for _, a := range allocs {
// Get previous version of alloc
orig, existing := initial[a.ID]
// Update local alloc state
initial[a.ID] = a
migrating := a.DesiredTransition.ShouldMigrate()
var msg string
switch {
case !existing:
// Should only be possible if response
// from initial Allocations call was
// stale. No need to output
case orig.ClientStatus != a.ClientStatus:
// Alloc status has changed; output
msg = fmt.Sprintf("status %s -> %s", orig.ClientStatus, a.ClientStatus)
case migrating && !orig.DesiredTransition.ShouldMigrate():
// Alloc was marked for migration
msg = "marked for migration"
case migrating && (orig.DesiredStatus != a.DesiredStatus) && a.DesiredStatus == structs.AllocDesiredStatusStop:
// Alloc has already been marked for migration and is now being stopped
msg = "draining"
}
if msg != "" {
select {
case allocCh <- Messagef(MonitorMsgLevelNormal, "Alloc %q %s", a.ID, msg):
case <-ctx.Done():
return
}
}
// Ignore malformed allocs
if a.Job == nil || a.Job.Type == nil {
continue
}
// Track how many allocs are still running
if ignoreSys && a.Job.Type != nil && *a.Job.Type == structs.JobTypeSystem {
continue
}
switch a.ClientStatus {
case structs.AllocClientStatusPending, structs.AllocClientStatusRunning:
runningAllocs++
}
}
// Exit if all allocs are terminal
if runningAllocs == 0 {
msg := Messagef(MonitorMsgLevelInfo, "All allocations on node %q have stopped.", nodeID)
select {
case allocCh <- msg:
case <-ctx.Done():
}
return
}
}
}
2018-02-27 20:59:27 +00:00
// NodeUpdateEligibilityRequest is used to update the drain specification for a node.
type NodeUpdateEligibilityRequest struct {
// NodeID is the node to update the drain specification for.
NodeID string
Eligibility string
}
// NodeEligibilityUpdateResponse is used to respond to a node eligibility update
type NodeEligibilityUpdateResponse struct {
NodeModifyIndex uint64
EvalIDs []string
EvalCreateIndex uint64
WriteMeta
}
2018-02-27 20:59:27 +00:00
// ToggleEligibility is used to update the scheduling eligibility of the node
func (n *Nodes) ToggleEligibility(nodeID string, eligible bool, q *WriteOptions) (*NodeEligibilityUpdateResponse, error) {
2018-02-27 20:59:27 +00:00
e := structs.NodeSchedulingEligible
if !eligible {
e = structs.NodeSchedulingIneligible
}
req := &NodeUpdateEligibilityRequest{
NodeID: nodeID,
Eligibility: e,
}
var resp NodeEligibilityUpdateResponse
wm, err := n.client.write("/v1/node/"+nodeID+"/eligibility", req, &resp, q)
2018-02-27 20:59:27 +00:00
if err != nil {
return nil, err
}
resp.WriteMeta = *wm
return &resp, nil
2018-02-27 20:59:27 +00:00
}
2015-09-11 23:51:18 +00:00
// Allocations is used to return the allocations associated with a node.
func (n *Nodes) Allocations(nodeID string, q *QueryOptions) ([]*Allocation, *QueryMeta, error) {
var resp []*Allocation
2015-09-11 23:51:18 +00:00
qm, err := n.client.query("/v1/node/"+nodeID+"/allocations", &resp, q)
if err != nil {
return nil, nil, err
}
sort.Sort(AllocationSort(resp))
2015-09-11 23:51:18 +00:00
return resp, qm, nil
}
// ForceEvaluate is used to force-evaluate an existing node.
func (n *Nodes) ForceEvaluate(nodeID string, q *WriteOptions) (string, *WriteMeta, error) {
var resp nodeEvalResponse
wm, err := n.client.write("/v1/node/"+nodeID+"/evaluate", nil, &resp, q)
if err != nil {
return "", nil, err
}
return resp.EvalID, wm, nil
}
func (n *Nodes) Stats(nodeID string, q *QueryOptions) (*HostStats, error) {
var resp HostStats
path := fmt.Sprintf("/v1/client/stats?node_id=%s", nodeID)
if _, err := n.client.query(path, &resp, q); err != nil {
return nil, err
}
return &resp, nil
}
func (n *Nodes) GC(nodeID string, q *QueryOptions) error {
var resp struct{}
path := fmt.Sprintf("/v1/client/gc?node_id=%s", nodeID)
_, err := n.client.query(path, &resp, q)
return err
}
// TODO Add tests
func (n *Nodes) GcAlloc(allocID string, q *QueryOptions) error {
var resp struct{}
path := fmt.Sprintf("/v1/client/allocation/%s/gc", allocID)
_, err := n.client.query(path, &resp, q)
return err
}
// DriverInfo is used to deserialize a DriverInfo entry
type DriverInfo struct {
Attributes map[string]string
Detected bool
Healthy bool
HealthDescription string
UpdateTime time.Time
}
2015-09-11 23:51:18 +00:00
// Node is used to deserialize a node entry.
type Node struct {
2018-02-23 23:56:36 +00:00
ID string
Datacenter string
Name string
HTTPAddr string
TLSEnabled bool
Attributes map[string]string
Resources *Resources
Reserved *Resources
Links map[string]string
Meta map[string]string
NodeClass string
Drain bool
DrainStrategy *DrainStrategy
SchedulingEligibility string
Status string
StatusDescription string
StatusUpdatedAt int64
Events []*NodeEvent
Drivers map[string]*DriverInfo
CreateIndex uint64
ModifyIndex uint64
}
// DrainStrategy describes a Node's drain behavior.
type DrainStrategy struct {
// DrainSpec is the user declared drain specification
DrainSpec
2018-03-01 00:25:56 +00:00
2018-03-12 20:44:33 +00:00
// ForceDeadline is the deadline time for the drain after which drains will
// be forced
ForceDeadline time.Time
2018-02-23 23:56:36 +00:00
}
// DrainSpec describes a Node's drain behavior.
type DrainSpec struct {
// Deadline is the duration after StartTime when the remaining
// allocations on a draining Node should be told to stop.
Deadline time.Duration
// IgnoreSystemJobs allows systems jobs to remain on the node even though it
// has been marked for draining.
IgnoreSystemJobs bool
2015-09-14 02:55:47 +00:00
}
func (d *DrainStrategy) Equal(o *DrainStrategy) bool {
if d == nil || o == nil {
return d == o
}
if d.ForceDeadline != o.ForceDeadline {
return false
}
if d.Deadline != o.Deadline {
return false
}
if d.IgnoreSystemJobs != o.IgnoreSystemJobs {
return false
}
return true
}
// String returns a human readable version of the drain strategy.
func (d *DrainStrategy) String() string {
if d.IgnoreSystemJobs {
return fmt.Sprintf("drain ignoring system jobs and deadline at %s", d.ForceDeadline)
}
return fmt.Sprintf("drain with deadline at %s", d.ForceDeadline)
}
const (
2018-03-14 00:59:37 +00:00
NodeEventSubsystemDrain = "Drain"
NodeEventSubsystemDriver = "Driver"
NodeEventSubsystemHeartbeat = "Heartbeat"
NodeEventSubsystemCluster = "Cluster"
)
// NodeEvent is a single unit representing a nodes state change
type NodeEvent struct {
2018-03-14 01:04:55 +00:00
Message string
Subsystem string
Details map[string]string
Timestamp time.Time
CreateIndex uint64
}
// HostStats represents resource usage stats of the host running a Nomad client
type HostStats struct {
Memory *HostMemoryStats
CPU []*HostCPUStats
DiskStats []*HostDiskStats
Uptime uint64
CPUTicksConsumed float64
}
type HostMemoryStats struct {
Total uint64
Available uint64
Used uint64
Free uint64
}
type HostCPUStats struct {
CPU string
User float64
System float64
Idle float64
}
2016-05-22 10:46:49 +00:00
type HostDiskStats struct {
Device string
Mountpoint string
Size uint64
Used uint64
Available uint64
UsedPercent float64
InodesUsedPercent float64
}
2015-09-14 02:55:47 +00:00
// NodeListStub is a subset of information returned during
// node list operations.
type NodeListStub struct {
2018-02-27 22:00:55 +00:00
Address string
ID string
Datacenter string
Name string
NodeClass string
Version string
Drain bool
SchedulingEligibility string
Status string
StatusDescription string
Drivers map[string]*DriverInfo
2018-02-27 22:00:55 +00:00
CreateIndex uint64
ModifyIndex uint64
2015-09-11 23:51:18 +00:00
}
2015-09-17 19:40:51 +00:00
// NodeIndexSort reverse sorts nodes by CreateIndex
type NodeIndexSort []*NodeListStub
func (n NodeIndexSort) Len() int {
return len(n)
}
func (n NodeIndexSort) Less(i, j int) bool {
return n[i].CreateIndex > n[j].CreateIndex
}
func (n NodeIndexSort) Swap(i, j int) {
n[i], n[j] = n[j], n[i]
}
2015-09-11 23:51:18 +00:00
// nodeEvalResponse is used to decode a force-eval.
type nodeEvalResponse struct {
EvalID string
}
// AllocationSort reverse sorts allocs by CreateIndex.
type AllocationSort []*Allocation
func (a AllocationSort) Len() int {
return len(a)
}
func (a AllocationSort) Less(i, j int) bool {
return a[i].CreateIndex > a[j].CreateIndex
}
func (a AllocationSort) Swap(i, j int) {
a[i], a[j] = a[j], a[i]
}