401 lines
12 KiB
Go
401 lines
12 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: MPL-2.0
|
|
|
|
package command
|
|
|
|
import (
|
|
"fmt"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/hashicorp/nomad/api"
|
|
"github.com/mitchellh/cli"
|
|
)
|
|
|
|
const (
|
|
// updateWait is the amount of time to wait between status
|
|
// updates. Because the monitor is poll-based, we use this
|
|
// delay to avoid overwhelming the API server.
|
|
updateWait = time.Second
|
|
)
|
|
|
|
// evalState is used to store the current "state of the world"
|
|
// in the context of monitoring an evaluation.
|
|
type evalState struct {
|
|
status string
|
|
desc string
|
|
node string
|
|
deployment string
|
|
job string
|
|
allocs map[string]*allocState
|
|
wait time.Duration
|
|
index uint64
|
|
}
|
|
|
|
// newEvalState creates and initializes a new monitorState
|
|
func newEvalState() *evalState {
|
|
return &evalState{
|
|
status: api.EvalStatusPending,
|
|
allocs: make(map[string]*allocState),
|
|
}
|
|
}
|
|
|
|
// allocState is used to track the state of an allocation
|
|
type allocState struct {
|
|
id string
|
|
group string
|
|
node string
|
|
desired string
|
|
desiredDesc string
|
|
client string
|
|
clientDesc string
|
|
index uint64
|
|
}
|
|
|
|
// monitor wraps an evaluation monitor and holds metadata and
|
|
// state information.
|
|
type monitor struct {
|
|
ui cli.Ui
|
|
client *api.Client
|
|
state *evalState
|
|
|
|
// length determines the number of characters for identifiers in the ui.
|
|
length int
|
|
|
|
sync.Mutex
|
|
}
|
|
|
|
// newMonitor returns a new monitor. The returned monitor will
|
|
// write output information to the provided ui. The length parameter determines
|
|
// the number of characters for identifiers in the ui.
|
|
func newMonitor(ui cli.Ui, client *api.Client, length int) *monitor {
|
|
if colorUi, ok := ui.(*cli.ColoredUi); ok {
|
|
// Disable Info color for monitored output
|
|
ui = &cli.ColoredUi{
|
|
ErrorColor: colorUi.ErrorColor,
|
|
WarnColor: colorUi.WarnColor,
|
|
InfoColor: cli.UiColorNone,
|
|
Ui: colorUi.Ui,
|
|
}
|
|
}
|
|
mon := &monitor{
|
|
ui: &cli.PrefixedUi{
|
|
InfoPrefix: "==> ",
|
|
OutputPrefix: " ",
|
|
ErrorPrefix: "==> ",
|
|
Ui: ui,
|
|
},
|
|
client: client,
|
|
state: newEvalState(),
|
|
length: length,
|
|
}
|
|
return mon
|
|
}
|
|
|
|
// update is used to update our monitor with new state. It can be
|
|
// called whether the passed information is new or not, and will
|
|
// only dump update messages when state changes.
|
|
func (m *monitor) update(update *evalState) {
|
|
m.Lock()
|
|
defer m.Unlock()
|
|
|
|
existing := m.state
|
|
|
|
// Swap in the new state at the end
|
|
defer func() {
|
|
m.state = update
|
|
}()
|
|
|
|
// Check if the evaluation was triggered by a node
|
|
if existing.node == "" && update.node != "" {
|
|
m.ui.Output(fmt.Sprintf("%s: Evaluation triggered by node %q",
|
|
formatTime(time.Now()), limit(update.node, m.length)))
|
|
}
|
|
|
|
// Check if the evaluation was triggered by a job
|
|
if existing.job == "" && update.job != "" {
|
|
m.ui.Output(fmt.Sprintf("%s: Evaluation triggered by job %q",
|
|
formatTime(time.Now()), update.job))
|
|
}
|
|
|
|
// Check if the evaluation was triggered by a deployment
|
|
if existing.deployment == "" && update.deployment != "" {
|
|
m.ui.Output(fmt.Sprintf("%s: Evaluation within deployment: %q",
|
|
formatTime(time.Now()), limit(update.deployment, m.length)))
|
|
}
|
|
|
|
// Check the allocations
|
|
for allocID, alloc := range update.allocs {
|
|
if existing, ok := existing.allocs[allocID]; !ok {
|
|
switch {
|
|
case alloc.index < update.index:
|
|
// New alloc with create index lower than the eval
|
|
// create index indicates modification
|
|
m.ui.Output(fmt.Sprintf(
|
|
"%s: Allocation %q modified: node %q, group %q",
|
|
formatTime(time.Now()), limit(alloc.id, m.length),
|
|
limit(alloc.node, m.length), alloc.group))
|
|
|
|
case alloc.desired == api.AllocDesiredStatusRun:
|
|
// New allocation with desired status running
|
|
m.ui.Output(fmt.Sprintf(
|
|
"%s: Allocation %q created: node %q, group %q",
|
|
formatTime(time.Now()), limit(alloc.id, m.length),
|
|
limit(alloc.node, m.length), alloc.group))
|
|
}
|
|
} else {
|
|
switch {
|
|
case existing.client != alloc.client:
|
|
description := ""
|
|
if alloc.clientDesc != "" {
|
|
description = fmt.Sprintf(" (%s)", alloc.clientDesc)
|
|
}
|
|
// Allocation status has changed
|
|
m.ui.Output(fmt.Sprintf(
|
|
"%s: Allocation %q status changed: %q -> %q%s",
|
|
formatTime(time.Now()), limit(alloc.id, m.length),
|
|
existing.client, alloc.client, description))
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check if the status changed. We skip any transitions to pending status.
|
|
if existing.status != "" &&
|
|
update.status != api.AllocClientStatusPending &&
|
|
existing.status != update.status {
|
|
m.ui.Output(fmt.Sprintf("%s: Evaluation status changed: %q -> %q",
|
|
formatTime(time.Now()), existing.status, update.status))
|
|
}
|
|
}
|
|
|
|
// monitor is used to start monitoring the given evaluation ID. It
|
|
// writes output directly to the monitor's ui, and returns the
|
|
// exit code for the command.
|
|
//
|
|
// The return code will be 0 on successful evaluation. If there are
|
|
// problems scheduling the job (impossible constraints, resources
|
|
// exhausted, etc), then the return code will be 2. For any other
|
|
// failures (API connectivity, internal errors, etc), the return code
|
|
// will be 1.
|
|
func (m *monitor) monitor(evalID string) int {
|
|
// Track if we encounter a scheduling failure. This can only be
|
|
// detected while querying allocations, so we use this bool to
|
|
// carry that status into the return code.
|
|
var schedFailure bool
|
|
|
|
// Add the initial pending state
|
|
m.update(newEvalState())
|
|
|
|
m.ui.Info(fmt.Sprintf("%s: Monitoring evaluation %q",
|
|
formatTime(time.Now()), limit(evalID, m.length)))
|
|
|
|
for {
|
|
// Query the evaluation
|
|
eval, _, err := m.client.Evaluations().Info(evalID, nil)
|
|
if err != nil {
|
|
m.ui.Error(fmt.Sprintf("No evaluation with id %q found", evalID))
|
|
return 1
|
|
}
|
|
|
|
// Create the new eval state.
|
|
state := newEvalState()
|
|
state.status = eval.Status
|
|
state.desc = eval.StatusDescription
|
|
state.node = eval.NodeID
|
|
state.job = eval.JobID
|
|
state.deployment = eval.DeploymentID
|
|
state.wait = eval.Wait
|
|
state.index = eval.CreateIndex
|
|
|
|
// Query the allocations associated with the evaluation
|
|
allocs, _, err := m.client.Evaluations().Allocations(eval.ID, nil)
|
|
if err != nil {
|
|
m.ui.Error(fmt.Sprintf("%s: Error reading allocations: %s", formatTime(time.Now()), err))
|
|
return 1
|
|
}
|
|
|
|
// Add the allocs to the state
|
|
for _, alloc := range allocs {
|
|
state.allocs[alloc.ID] = &allocState{
|
|
id: alloc.ID,
|
|
group: alloc.TaskGroup,
|
|
node: alloc.NodeID,
|
|
desired: alloc.DesiredStatus,
|
|
desiredDesc: alloc.DesiredDescription,
|
|
client: alloc.ClientStatus,
|
|
clientDesc: alloc.ClientDescription,
|
|
index: alloc.CreateIndex,
|
|
}
|
|
}
|
|
|
|
// Update the state
|
|
m.update(state)
|
|
|
|
switch eval.Status {
|
|
case api.EvalStatusComplete, api.EvalStatusFailed, api.EvalStatusCancelled:
|
|
if len(eval.FailedTGAllocs) == 0 {
|
|
m.ui.Info(fmt.Sprintf("%s: Evaluation %q finished with status %q",
|
|
formatTime(time.Now()), limit(eval.ID, m.length), eval.Status))
|
|
} else {
|
|
// There were failures making the allocations
|
|
schedFailure = true
|
|
m.ui.Info(fmt.Sprintf("%s: Evaluation %q finished with status %q but failed to place all allocations:",
|
|
formatTime(time.Now()), limit(eval.ID, m.length), eval.Status))
|
|
|
|
// Print the failures per task group
|
|
for tg, metrics := range eval.FailedTGAllocs {
|
|
noun := "allocation"
|
|
if metrics.CoalescedFailures > 0 {
|
|
noun += "s"
|
|
}
|
|
m.ui.Output(fmt.Sprintf("%s: Task Group %q (failed to place %d %s):",
|
|
formatTime(time.Now()), tg, metrics.CoalescedFailures+1, noun))
|
|
metrics := formatAllocMetrics(metrics, false, " ")
|
|
for _, line := range strings.Split(metrics, "\n") {
|
|
m.ui.Output(line)
|
|
}
|
|
}
|
|
|
|
if eval.BlockedEval != "" {
|
|
m.ui.Output(fmt.Sprintf("%s: Evaluation %q waiting for additional capacity to place remainder",
|
|
formatTime(time.Now()), limit(eval.BlockedEval, m.length)))
|
|
}
|
|
}
|
|
default:
|
|
// Wait for the next update
|
|
time.Sleep(updateWait)
|
|
continue
|
|
}
|
|
|
|
// Monitor the next eval in the chain, if present
|
|
if eval.NextEval != "" {
|
|
if eval.Wait.Nanoseconds() != 0 {
|
|
m.ui.Info(fmt.Sprintf(
|
|
"%s: Monitoring next evaluation %q in %s",
|
|
formatTime(time.Now()), limit(eval.NextEval, m.length), eval.Wait))
|
|
|
|
// Skip some unnecessary polling
|
|
time.Sleep(eval.Wait)
|
|
}
|
|
|
|
// Reset the state and monitor the new eval
|
|
m.state = newEvalState()
|
|
return m.monitor(eval.NextEval)
|
|
}
|
|
break
|
|
}
|
|
|
|
// Monitor the deployment if it exists
|
|
dID := m.state.deployment
|
|
if dID != "" {
|
|
m.ui.Info(fmt.Sprintf("%s: Monitoring deployment %q", formatTime(time.Now()), limit(dID, m.length)))
|
|
|
|
var verbose bool
|
|
if m.length == fullId {
|
|
verbose = true
|
|
} else {
|
|
verbose = false
|
|
}
|
|
|
|
meta := new(Meta)
|
|
meta.Ui = m.ui
|
|
cmd := &DeploymentStatusCommand{Meta: *meta}
|
|
status, err := cmd.monitor(m.client, dID, 0, m.state.wait, verbose)
|
|
if err != nil || status != api.DeploymentStatusSuccessful {
|
|
return 1
|
|
}
|
|
}
|
|
|
|
// Treat scheduling failures specially using a dedicated exit code.
|
|
// This makes it easier to detect failures from the CLI.
|
|
if schedFailure {
|
|
return 2
|
|
}
|
|
|
|
return 0
|
|
}
|
|
|
|
func formatAllocMetrics(metrics *api.AllocationMetric, scores bool, prefix string) string {
|
|
// Print a helpful message if we have an eligibility problem
|
|
var out string
|
|
if metrics.NodesEvaluated == 0 {
|
|
out += fmt.Sprintf("%s* No nodes were eligible for evaluation\n", prefix)
|
|
}
|
|
|
|
// Print a helpful message if the user has asked for a DC that has no
|
|
// available nodes.
|
|
for dc, available := range metrics.NodesAvailable {
|
|
if available == 0 {
|
|
out += fmt.Sprintf("%s* No nodes are available in datacenter %q\n", prefix, dc)
|
|
}
|
|
}
|
|
|
|
// Print filter info
|
|
for class, num := range metrics.ClassFiltered {
|
|
out += fmt.Sprintf("%s* Class %q: %d nodes excluded by filter\n", prefix, class, num)
|
|
}
|
|
for cs, num := range metrics.ConstraintFiltered {
|
|
out += fmt.Sprintf("%s* Constraint %q: %d nodes excluded by filter\n", prefix, cs, num)
|
|
}
|
|
|
|
// Print exhaustion info
|
|
if ne := metrics.NodesExhausted; ne > 0 {
|
|
out += fmt.Sprintf("%s* Resources exhausted on %d nodes\n", prefix, ne)
|
|
}
|
|
for class, num := range metrics.ClassExhausted {
|
|
out += fmt.Sprintf("%s* Class %q exhausted on %d nodes\n", prefix, class, num)
|
|
}
|
|
for dim, num := range metrics.DimensionExhausted {
|
|
out += fmt.Sprintf("%s* Dimension %q exhausted on %d nodes\n", prefix, dim, num)
|
|
}
|
|
|
|
// Print quota info
|
|
for _, dim := range metrics.QuotaExhausted {
|
|
out += fmt.Sprintf("%s* Quota limit hit %q\n", prefix, dim)
|
|
}
|
|
|
|
// Print scores
|
|
if scores {
|
|
if len(metrics.ScoreMetaData) > 0 {
|
|
scoreOutput := make([]string, len(metrics.ScoreMetaData)+1)
|
|
|
|
// Find all possible scores and build header row.
|
|
allScores := make(map[string]struct{})
|
|
for _, scoreMeta := range metrics.ScoreMetaData {
|
|
for score := range scoreMeta.Scores {
|
|
allScores[score] = struct{}{}
|
|
}
|
|
}
|
|
// Sort scores alphabetically.
|
|
scores := make([]string, 0, len(allScores))
|
|
for score := range allScores {
|
|
scores = append(scores, score)
|
|
}
|
|
sort.Strings(scores)
|
|
scoreOutput[0] = fmt.Sprintf("Node|%s|final score", strings.Join(scores, "|"))
|
|
|
|
// Build row for each score.
|
|
for i, scoreMeta := range metrics.ScoreMetaData {
|
|
scoreOutput[i+1] = fmt.Sprintf("%v|", scoreMeta.NodeID)
|
|
for _, scorerName := range scores {
|
|
scoreVal := scoreMeta.Scores[scorerName]
|
|
scoreOutput[i+1] += fmt.Sprintf("%.3g|", scoreVal)
|
|
}
|
|
scoreOutput[i+1] += fmt.Sprintf("%.3g", scoreMeta.NormScore)
|
|
}
|
|
|
|
out += formatList(scoreOutput)
|
|
} else {
|
|
// Backwards compatibility for old allocs
|
|
for name, score := range metrics.Scores {
|
|
out += fmt.Sprintf("%s* Score %q = %f\n", prefix, name, score)
|
|
}
|
|
}
|
|
}
|
|
|
|
out = strings.TrimSuffix(out, "\n")
|
|
return out
|
|
}
|