open-nomad/command/monitor.go

package command

import (
	"fmt"
	"sync"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/mitchellh/cli"
)

// updateWait is the pause inserted between consecutive status polls.
// The monitor has no push notifications and must poll, so this delay
// keeps it from overwhelming the API server.
const updateWait = time.Second

// evalState is used to store the current "state of the world"
// in the context of monitoring an evaluation. monitor() builds a
// fresh evalState on every poll and update() compares it against the
// previously stored one to decide which messages to print.
type evalState struct {
	status string                 // evaluation status (eval.Status)
	desc   string                 // evaluation status description (eval.StatusDescription)
	node   string                 // node ID reported on the evaluation (eval.NodeID)
	allocs map[string]*allocState // allocations returned for the evaluation, keyed by allocation ID
	wait   time.Duration          // wait duration reported on the evaluation (eval.Wait)
	index  uint64                 // create index of the evaluation (eval.CreateIndex)
}

// allocState is used to track the state of an allocation. One entry
// is created per allocation returned for the monitored evaluation.
type allocState struct {
	id          string // allocation ID
	group       string // task group of the allocation (alloc.TaskGroup)
	node        string // node ID recorded on the allocation (alloc.NodeID)
	desired     string // desired status (alloc.DesiredStatus)
	desiredDesc string // description accompanying the desired status (alloc.DesiredDescription)
	client      string // client status (alloc.ClientStatus)
	index       uint64 // create index of the allocation (alloc.CreateIndex)

	// full is the allocation struct with full details. This
	// must be queried for explicitly so it is only included
	// if there is important error information inside. monitor()
	// populates it only for allocations whose desired status is
	// failed; it is nil otherwise.
	full *api.Allocation
}

// monitor wraps an evaluation monitor and holds metadata and
// state information.
type monitor struct {
	ui     cli.Ui      // destination for all monitor output
	client *api.Client // API client used to poll evaluations and allocations
	state  *evalState  // most recently recorded state; replaced by update() and reset by init()

	// The embedded mutex is held by update() while it compares the
	// incoming state against the stored one and swaps it in.
	sync.Mutex
}

// newMonitor builds a monitor that polls through the given API client.
// Everything the monitor prints goes to the supplied ui, wrapped so
// that info and error lines are prefixed with "==> " and ordinary
// output lines are indented by four spaces. The returned monitor
// already has an empty initial state.
func newMonitor(ui cli.Ui, client *api.Client) *monitor {
	// Decorate the caller's UI with the monitor's line prefixes.
	prefixed := &cli.PrefixedUi{
		Ui:           ui,
		InfoPrefix:   "==> ",
		ErrorPrefix:  "==> ",
		OutputPrefix: "    ",
	}

	m := &monitor{ui: prefixed, client: client}
	m.init()
	return m
}

// init resets the monitor to an empty evaluation state whose
// allocation map is allocated and ready for lookups.
func (m *monitor) init() {
	blank := new(evalState)
	blank.allocs = make(map[string]*allocState)
	m.state = blank
}

// update folds a freshly polled evalState into the monitor and prints
// one line for every difference from the previously stored state. It
// may be called with unchanged information; nothing is printed unless
// something differs. The passed state becomes the stored state when
// the call returns.
func (m *monitor) update(update *evalState) {
	m.Lock()

	// On the way out, install the new state first and only then
	// release the lock.
	defer func() {
		m.state = update
		m.Unlock()
	}()

	prev := m.state

	// Walk the allocations present in the new state.
	for id, cur := range update.allocs {
		old, known := prev.allocs[id]
		if known {
			// Already tracked: only a client status change is reported.
			if old.client != cur.client {
				m.ui.Output(fmt.Sprintf(
					"Allocation %q status changed: %q -> %q",
					cur.id, old.client, cur.client))
			}
			continue
		}

		// First sighting of this allocation.
		if cur.desired == structs.AllocDesiredStatusFailed {
			// A new allocation whose desired status is failed
			// means scheduling failed.
			m.ui.Output(fmt.Sprintf("Scheduling error for group %q (%s)",
				cur.group, cur.desiredDesc))

			// When the full allocation was fetched, dump the
			// detailed explanation of the failure as well.
			if cur.full != nil {
				dumpAllocStatus(m.ui, cur.full)
			}
		} else if cur.index < update.index {
			// Created before the evaluation itself, so the
			// evaluation modified an existing allocation.
			m.ui.Output(fmt.Sprintf(
				"Allocation %q modified: node %q, group %q",
				cur.id, cur.node, cur.group))
		} else if cur.desired == structs.AllocDesiredStatusRun {
			// Brand new allocation that is meant to run.
			m.ui.Output(fmt.Sprintf(
				"Allocation %q created: node %q, group %q",
				cur.id, cur.node, cur.group))
		}
	}

	// Report a change of the evaluation status.
	if prev.status != update.status {
		m.ui.Output(fmt.Sprintf("Evaluation status changed: %q -> %q",
			prev.status, update.status))
	}

	// Report the wait time the first time it becomes non-zero.
	if prev.wait == 0 && update.wait != 0 {
		m.ui.Output(fmt.Sprintf("Waiting %s before running eval",
			update.wait))
	}

	// Report the node ID the first time it becomes non-empty.
	if prev.node == "" && update.node != "" {
		m.ui.Output(fmt.Sprintf("Evaluation was assigned node ID %q",
			update.node))
	}
}

// monitor polls the evaluation with the given ID until it reaches a
// finished status, printing progress to the monitor's ui, and returns
// the exit code for the command. If the finished evaluation names a
// follow-up evaluation, the monitor state is reset and that evaluation
// is monitored in turn; its exit code is the one returned.
//
// Exit codes:
//
//	0 - the evaluation finished and no scheduling failure was seen
//	1 - any other failure (API connectivity, internal errors, etc)
//	2 - a scheduling problem was seen (impossible constraints,
//	    resources exhausted, etc)
func (m *monitor) monitor(evalID string) int {
	// Code returned once this evaluation finishes without a follow-up.
	// A scheduling failure is only visible while listing allocations,
	// so it is recorded here when one is seen.
	exitCode := 0

	m.ui.Info(fmt.Sprintf("Monitoring evaluation %q", evalID))
	for {
		// Fetch the evaluation itself.
		eval, _, err := m.client.Evaluations().Info(evalID, nil)
		if err != nil {
			m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err))
			return 1
		}

		// Start a new state record from the evaluation's fields.
		snapshot := &evalState{
			status: eval.Status,
			desc:   eval.StatusDescription,
			node:   eval.NodeID,
			allocs: make(map[string]*allocState),
			wait:   eval.Wait,
			index:  eval.CreateIndex,
		}

		// Fetch the allocations tied to the evaluation.
		allocs, _, err := m.client.Evaluations().Allocations(evalID, nil)
		if err != nil {
			m.ui.Error(fmt.Sprintf("Error reading allocations: %s", err))
			return 1
		}

		// Record every allocation in the new state.
		for _, a := range allocs {
			entry := &allocState{
				id:          a.ID,
				group:       a.TaskGroup,
				node:        a.NodeID,
				desired:     a.DesiredStatus,
				desiredDesc: a.DesiredDescription,
				client:      a.ClientStatus,
				index:       a.CreateIndex,
			}
			snapshot.allocs[a.ID] = entry

			if a.DesiredStatus != structs.AllocDesiredStatusFailed {
				continue
			}

			// Scheduling failed for this allocation: remember it for
			// the exit code and pull the full allocation for details.
			exitCode = 2
			full, _, err := m.client.Allocations().Info(a.ID, nil)
			if err != nil {
				m.ui.Error(fmt.Sprintf("Error querying allocation: %s", err))
				return 1
			}
			entry.full = full
		}

		// Print whatever changed since the previous poll.
		m.update(snapshot)

		finished := eval.Status == structs.EvalStatusComplete ||
			eval.Status == structs.EvalStatusFailed
		if !finished {
			// Not done yet; pause before polling again.
			time.Sleep(updateWait)
			continue
		}

		m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q",
			eval.ID, eval.Status))

		// Follow the chain to the next evaluation, if there is one.
		if eval.NextEval != "" {
			m.init()
			return m.monitor(eval.NextEval)
		}

		// Scheduling failures get their own exit code so they are
		// easy to detect from the CLI.
		return exitCode
	}
}
command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00			`package command`

			`import (`
			`"fmt"`
			`"sync"`
			`"time"`

			`"github.com/hashicorp/nomad/api"`
			`"github.com/hashicorp/nomad/nomad/structs"`
			`"github.com/mitchellh/cli"`
			`)`

			`const (`
command/monitor: cleanup 2015-09-18 17:03:23 +00:00			`// updateWait is the amount of time to wait between status`
			`// updates. Because the monitor is poll-based, we use this`
			`// delay to avoid overwhelming the API server.`
			`updateWait = time.Second`
command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00			`)`

command/monitor: cleanup 2015-09-18 17:03:23 +00:00			`// evalState is used to store the current "state of the world"`
			`// in the context of monitoring an evaluation.`
			`type evalState struct {`
			`status string`
			`desc string`
command: allocation monitor adjustments 2015-09-21 18:25:22 +00:00			`node string`
command/monitor: cleanup 2015-09-18 17:03:23 +00:00			`allocs map[string]*allocState`
			`wait time.Duration`
			`index uint64`
			`}`

			`// allocState is used to track the state of an allocation`
			`type allocState struct {`
			`id string`
			`group string`
			`node string`
			`desired string`
			`desiredDesc string`
			`client string`
			`index uint64`
command: allocation monitor adjustments 2015-09-21 18:25:22 +00:00
			`// full is the allocation struct with full details. This`
			`// must be queried for explicitly so it is only included`
			`// if there is important error information inside.`
			`full *api.Allocation`
command/monitor: cleanup 2015-09-18 17:03:23 +00:00			`}`

command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00			`// monitor wraps an evaluation monitor and holds metadata and`
			`// state information.`
			`type monitor struct {`
			`ui cli.Ui`
			`client *api.Client`
command/monitor: cleanup 2015-09-16 23:27:55 +00:00			`state *evalState`
command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00
			`sync.Mutex`
			`}`

			`// newMonitor returns a new monitor. The returned monitor will`
			`// write output information to the provided ui.`
			`func newMonitor(ui cli.Ui, client api.Client) monitor {`
command/monitor: cleanup 2015-09-18 17:03:23 +00:00			`mon := &monitor{`
command: only use prefixed UI during monitoring 2015-09-17 00:36:14 +00:00			`ui: &cli.PrefixedUi{`
			`InfoPrefix: "==> ",`
			`OutputPrefix: " ",`
			`ErrorPrefix: "==> ",`
			`Ui: ui,`
			`},`
command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00			`client: client,`
			`}`
command/monitor: cleanup 2015-09-18 17:03:23 +00:00			`mon.init()`
			`return mon`
command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00			`}`

command/monitor: cleanup 2015-09-18 17:03:23 +00:00			`// init allocates substructures`
			`func (m *monitor) init() {`
			`m.state = &evalState{`
			`allocs: make(map[string]*allocState),`
			`}`
command: working on monitor 2015-09-16 21:45:21 +00:00			`}`

command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00			`// update is used to update our monitor with new state. It can be`
			`// called whether the passed information is new or not, and will`
			`// only dump update messages when state changes.`
command: allocation monitor adjustments 2015-09-21 18:25:22 +00:00			`func (m monitor) update(update evalState) {`
command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00			`m.Lock()`
			`defer m.Unlock()`

			`existing := m.state`

command: allocation monitor adjustments 2015-09-21 18:25:22 +00:00			`// Swap in the new state at the end`
			`defer func() {`
			`m.state = update`
			`}()`
command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00
command: working on monitor 2015-09-16 21:45:21 +00:00			`// Check the allocations`
			`for allocID, alloc := range update.allocs {`
command: monitor allocation client status updates 2015-09-16 22:37:08 +00:00			`if existing, ok := existing.allocs[allocID]; !ok {`
command/monitor: handle more alloc state changes 2015-09-18 03:18:33 +00:00			`switch {`
			`case alloc.desired == structs.AllocDesiredStatusFailed:`
			`// New allocs with desired state failed indicate`
			`// scheduling failure.`
command/monitor: cleanup 2015-09-18 17:03:23 +00:00			`m.ui.Output(fmt.Sprintf("Scheduling error for group %q (%s)",`
command/monitor: display scheduling errors 2015-09-18 16:37:33 +00:00			`alloc.group, alloc.desiredDesc))`
command/monitor: handle more alloc state changes 2015-09-18 03:18:33 +00:00
command: simplify alloc status, dump during job run 2015-09-21 00:38:25 +00:00			`// Generate a more descriptive error for why the allocation`
			`// failed and dump it to the screen`
command: allocation monitor adjustments 2015-09-21 18:25:22 +00:00			`if alloc.full != nil {`
			`dumpAllocStatus(m.ui, alloc.full)`
command: simplify alloc status, dump during job run 2015-09-21 00:38:25 +00:00			`}`

command/monitor: handle more alloc state changes 2015-09-18 03:18:33 +00:00			`case alloc.index < update.index:`
			`// New alloc with create index lower than the eval`
			`// create index indicates modification`
command/monitor: cleanup 2015-09-18 17:03:23 +00:00			`m.ui.Output(fmt.Sprintf(`
command/monitor: handle more alloc state changes 2015-09-18 03:18:33 +00:00			`"Allocation %q modified: node %q, group %q",`
			`alloc.id, alloc.node, alloc.group))`

			`case alloc.desired == structs.AllocDesiredStatusRun:`
			`// New allocation with desired status running`
command/monitor: cleanup 2015-09-18 17:03:23 +00:00			`m.ui.Output(fmt.Sprintf(`
command/monitor: handle more alloc state changes 2015-09-18 03:18:33 +00:00			`"Allocation %q created: node %q, group %q",`
			`alloc.id, alloc.node, alloc.group))`
command: monitor allocation client status updates 2015-09-16 22:37:08 +00:00			`}`
			`} else {`
command/monitor: handle more alloc state changes 2015-09-18 03:18:33 +00:00			`switch {`
			`case existing.client != alloc.client:`
			`// Allocation status has changed`
command/monitor: cleanup 2015-09-18 17:03:23 +00:00			`m.ui.Output(fmt.Sprintf(`
command/monitor: handle more alloc state changes 2015-09-18 03:18:33 +00:00			`"Allocation %q status changed: %q -> %q",`
command: monitor allocation client status updates 2015-09-16 22:37:08 +00:00			`alloc.id, existing.client, alloc.client))`
command: working on monitor 2015-09-16 21:45:21 +00:00			`}`
			`}`
			`}`

command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00			`// Check if the status changed`
			`if existing.status != update.status {`
command/monitor: cleanup 2015-09-18 17:03:23 +00:00			`m.ui.Output(fmt.Sprintf("Evaluation status changed: %q -> %q",`
command: allocation monitor adjustments 2015-09-21 18:25:22 +00:00			`existing.status, update.status))`
command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00			`}`

			`// Check if the wait time is different`
			`if existing.wait == 0 && update.wait != 0 {`
command/monitor: cleanup 2015-09-18 17:03:23 +00:00			`m.ui.Output(fmt.Sprintf("Waiting %s before running eval",`
command: allocation monitor adjustments 2015-09-21 18:25:22 +00:00			`update.wait))`
command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00			`}`

command: allocation monitor adjustments 2015-09-21 18:25:22 +00:00			`// Check if the node changed`
			`if existing.node == "" && update.node != "" {`
command/monitor: cleanup 2015-09-18 17:03:23 +00:00			`m.ui.Output(fmt.Sprintf("Evaluation was assigned node ID %q",`
command: allocation monitor adjustments 2015-09-21 18:25:22 +00:00			`update.node))`
command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00			`}`
			`}`

			`// monitor is used to start monitoring the given evaluation ID. It`
			`// writes output directly to the monitor's ui, and returns the`
command: return 2 for scheduling failures when using monitor 2015-09-21 19:19:34 +00:00			`// exit code for the command.`
			`//`
			`// The return code will be 0 on successful evaluation. If there are`
			`// problems scheduling the job (impossible constraints, resources`
			`// exhausted, etc), then the return code will be 2. For any other`
			`// failures (API connectivity, internal errors, etc), the return code`
			`// will be 1.`
command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00			`func (m *monitor) monitor(evalID string) int {`
command: return 2 for scheduling failures when using monitor 2015-09-21 19:19:34 +00:00			`// Track if we encounter a scheduling failure. This can only be`
			`// detected while querying allocations, so we use this bool to`
			`// carry that status into the return code.`
			`var schedFailure bool`

command: working on monitor 2015-09-16 21:45:21 +00:00			`m.ui.Info(fmt.Sprintf("Monitoring evaluation %q", evalID))`
command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00			`for {`
command/monitor: cleanup 2015-09-16 23:27:55 +00:00			`// Query the evaluation`
command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00			`eval, _, err := m.client.Evaluations().Info(evalID, nil)`
			`if err != nil {`
			`m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err))`
			`return 1`
			`}`

command: allocation monitor adjustments 2015-09-21 18:25:22 +00:00			`// Create the new eval state.`
			`state := &evalState{`
			`status: eval.Status,`
			`desc: eval.StatusDescription,`
			`node: eval.NodeID,`
			`allocs: make(map[string]*allocState),`
			`wait: eval.Wait,`
			`index: eval.CreateIndex,`
			`}`

command/monitor: cleanup 2015-09-16 23:27:55 +00:00			`// Query the allocations associated with the evaluation`
command: working on monitor 2015-09-16 21:45:21 +00:00			`allocs, _, err := m.client.Evaluations().Allocations(evalID, nil)`
			`if err != nil {`
			`m.ui.Error(fmt.Sprintf("Error reading allocations: %s", err))`
			`return 1`
			`}`

command: allocation monitor adjustments 2015-09-21 18:25:22 +00:00			`// Add the allocs to the state`
			`for _, alloc := range allocs {`
			`state.allocs[alloc.ID] = &allocState{`
			`id: alloc.ID,`
			`group: alloc.TaskGroup,`
			`node: alloc.NodeID,`
			`desired: alloc.DesiredStatus,`
			`desiredDesc: alloc.DesiredDescription,`
			`client: alloc.ClientStatus,`
			`index: alloc.CreateIndex,`
			`}`

			`// If we have a scheduling error, query the full allocation`
			`// to get the details.`
			`if alloc.DesiredStatus == structs.AllocDesiredStatusFailed {`
command: return 2 for scheduling failures when using monitor 2015-09-21 19:19:34 +00:00			`schedFailure = true`
command: allocation monitor adjustments 2015-09-21 18:25:22 +00:00			`failed, _, err := m.client.Allocations().Info(alloc.ID, nil)`
			`if err != nil {`
			`m.ui.Error(fmt.Sprintf("Error querying allocation: %s", err))`
			`return 1`
			`}`
			`state.allocs[alloc.ID].full = failed`
			`}`
			`}`

command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00			`// Update the state`
command: allocation monitor adjustments 2015-09-21 18:25:22 +00:00			`m.update(state)`
command: working on monitor 2015-09-16 21:45:21 +00:00
			`switch eval.Status {`
			`case structs.EvalStatusComplete, structs.EvalStatusFailed:`
			`m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q",`
			`eval.ID, eval.Status))`
			`default:`
			`// Wait for the next update`
command/monitor: cleanup 2015-09-18 17:03:23 +00:00			`time.Sleep(updateWait)`
command: working on monitor 2015-09-16 21:45:21 +00:00			`continue`
			`}`

			`// Monitor the next eval, if it exists.`
			`if eval.NextEval != "" {`
command/monitor: cleanup 2015-09-18 17:03:23 +00:00			`m.init()`
			`return m.monitor(eval.NextEval)`
command: working on monitor 2015-09-16 21:45:21 +00:00			`}`
command: easier exit codes, check if eval is already finished 2015-09-16 23:20:19 +00:00			`break`
command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00			`}`

command: return 2 for scheduling failures when using monitor 2015-09-21 19:19:34 +00:00			`// Treat scheduling failures specially using a dedicated exit code.`
			`// This makes it easier to detect failures from the CLI.`
			`if schedFailure {`
			`return 2`
			`}`

command: start implementing eval monitoring for run 2015-09-16 20:58:33 +00:00			`return 0`
			`}`