open-nomad/client/allochealth/tracker.go
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package allochealth
import (
"context"
"fmt"
"strings"
"sync"
"time"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/client/serviceregistration"
"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/client/taskenv"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
"golang.org/x/exp/maps"
)
const (
// AllocHealthEventSource is the source used for emitting task events
AllocHealthEventSource = "Alloc Unhealthy"
// checkLookupInterval is the pace at which we check if the Consul or Nomad
// checks for an allocation are healthy or unhealthy.
checkLookupInterval = 500 * time.Millisecond
)
// Tracker tracks the health of an allocation and makes health events watchable
// via channels.
type Tracker struct {
// ctx and cancelFn are used to shut down the tracker
ctx context.Context
cancelFn context.CancelFunc
// alloc is the alloc we are tracking
alloc *structs.Allocation
// tg is the task group we are tracking
tg *structs.TaskGroup
// minHealthyTime is the duration an alloc must remain healthy to be
// considered healthy
minHealthyTime time.Duration
// checkLookupInterval is the repeated interval after which we check
// whether the Consul or Nomad checks are healthy or unhealthy.
checkLookupInterval time.Duration
// useChecks specifies whether to consider Consul and Nomad service checks.
useChecks bool
// consulCheckCount is the total number of Consul service checks in the task
// group including task level checks.
consulCheckCount int
// nomadCheckCount is the total number of Nomad service checks in the task
// group including task level checks.
nomadCheckCount int
// allocUpdates is a listener for retrieving new alloc updates
allocUpdates *cstructs.AllocListener
// consulClient is used to look up the status of Consul service checks
consulClient serviceregistration.Handler
// checkStore is used to lookup the status of Nomad service checks
checkStore checkstore.Shim
// healthy is used to signal whether we have determined the allocation to be
// healthy or unhealthy
healthy chan bool
// allocStopped is triggered when the allocation is stopped and tracking is
// not needed
allocStopped chan struct{}
// lifecycleTasks is a map of ephemeral tasks and their lifecycle hooks.
// These tasks may terminate without affecting alloc health
lifecycleTasks map[string]string
// lock is used to lock shared fields listed below
lock sync.Mutex
// tasksHealthy marks whether all the tasks have started and met their minimum
// healthy time requirement (disregards Consul and Nomad checks)
tasksHealthy bool
// allocFailed marks whether the allocation failed
allocFailed bool
// checksHealthy marks whether all the tasks' Consul or Nomad checks are healthy
checksHealthy bool
// taskHealth contains the health state for each task in the allocation
// name -> state
taskHealth map[string]*taskHealthState
// taskEnvs maps each task in the allocation to a *taskenv.TaskEnv that is
// used to interpolate runtime variables used in service definitions.
taskEnvs map[string]*taskenv.TaskEnv
// logger is for logging things
logger hclog.Logger
}
// NewTracker returns a health tracker for the given allocation.
//
// Depending on job configuration, an allocation's health takes into consideration
// - An alloc listener
// - Consul checks (via consul API)
// - Nomad checks (via client state)
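//
// An illustrative sketch of how a caller might wire the tracker up (the
// caller-side values shown here are assumptions, not part of this package):
//
//	tracker := NewTracker(ctx, logger, alloc, allocUpdates, taskEnvBuilder,
//		consulClient, checkStore, tg.Update.MinHealthyTime, useChecks)
//	tracker.Start()
//	select {
//	case healthy := <-tracker.HealthyCh():
//		logger.Info("allocation health determined", "healthy", healthy)
//	case <-tracker.AllocStoppedCh():
//		// health will never be reported for a stopped allocation
//	}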
func NewTracker(
parentCtx context.Context,
logger hclog.Logger,
alloc *structs.Allocation,
allocUpdates *cstructs.AllocListener,
taskEnvBuilder *taskenv.Builder,
consulClient serviceregistration.Handler,
checkStore checkstore.Shim,
minHealthyTime time.Duration,
useChecks bool,
) *Tracker {
t := &Tracker{
healthy: make(chan bool, 1),
allocStopped: make(chan struct{}),
alloc: alloc,
tg: alloc.Job.LookupTaskGroup(alloc.TaskGroup),
minHealthyTime: minHealthyTime,
useChecks: useChecks,
allocUpdates: allocUpdates,
consulClient: consulClient,
checkStore: checkStore,
checkLookupInterval: checkLookupInterval,
logger: logger,
lifecycleTasks: map[string]string{},
}
// Build the map of TaskEnv for each task. Create the group-level TaskEnv
// first because taskEnvBuilder is mutated in every loop and we can't undo
// a call to UpdateTask().
t.taskEnvs = make(map[string]*taskenv.TaskEnv, len(t.tg.Tasks)+1)
t.taskEnvs[""] = taskEnvBuilder.Build()
t.taskHealth = make(map[string]*taskHealthState, len(t.tg.Tasks))
for _, task := range t.tg.Tasks {
t.taskHealth[task.Name] = &taskHealthState{task: task}
if task.Lifecycle != nil && !task.Lifecycle.Sidecar {
t.lifecycleTasks[task.Name] = task.Lifecycle.Hook
}
t.taskEnvs[task.Name] = taskEnvBuilder.UpdateTask(alloc, task).Build()
c, n := countChecks(task.Services)
t.consulCheckCount += c
t.nomadCheckCount += n
}
c, n := countChecks(t.tg.Services)
t.consulCheckCount += c
t.nomadCheckCount += n
t.ctx, t.cancelFn = context.WithCancel(parentCtx)
return t
}
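// countChecks returns the number of Consul and Nomad service checks defined
// across the given services. Services that do not set an explicit provider fall
// into the default case and are counted as Consul checks.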
func countChecks(services []*structs.Service) (consul, nomad int) {
for _, service := range services {
switch service.Provider {
case structs.ServiceProviderNomad:
nomad += len(service.Checks)
default:
consul += len(service.Checks)
}
}
return
}
// Start starts the watchers. Task events are always watched. If useChecks is
// set, at most one check watcher is also started, with Consul checks taking
// precedence over Nomad checks when both kinds are present.
func (t *Tracker) Start() {
go t.watchTaskEvents()
switch {
case !t.useChecks:
return
case t.consulCheckCount > 0:
go t.watchConsulEvents()
case t.nomadCheckCount > 0:
go t.watchNomadEvents()
}
}
// HealthyCh returns a channel that will emit a boolean indicating the health of
// the allocation.
func (t *Tracker) HealthyCh() <-chan bool {
return t.healthy
}
// AllocStoppedCh returns a channel that is closed if the allocation is
// stopped, in which case health will not be set.
func (t *Tracker) AllocStoppedCh() <-chan struct{} {
return t.allocStopped
}
// TaskEvents returns a map of events by task. This should only be called after
// health has been determined. Only tasks that have contributed to the
// allocation being unhealthy will have an event.
func (t *Tracker) TaskEvents() map[string]*structs.TaskEvent {
t.lock.Lock()
defer t.lock.Unlock()
// Nothing to do since the failure wasn't task related
if t.allocFailed {
return nil
}
deadline, _ := t.ctx.Deadline()
events := make(map[string]*structs.TaskEvent, len(t.tg.Tasks))
// Go through our task information and build the event map
for task, state := range t.taskHealth {
useChecks := t.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
if e, ok := state.event(deadline, t.tg.Update.HealthyDeadline, t.tg.Update.MinHealthyTime, useChecks); ok {
events[task] = structs.NewTaskEvent(AllocHealthEventSource).SetMessage(e)
}
}
return events
}
// setTaskHealth is used to set the tasks' health as healthy or unhealthy. If the
// allocation is terminal, health is immediately broadcast.
func (t *Tracker) setTaskHealth(healthy, terminal bool) {
t.lock.Lock()
defer t.lock.Unlock()
t.tasksHealthy = healthy
// if unhealthy, force waiting for new checks health status
if !terminal && !healthy {
t.checksHealthy = false
return
}
// If we are marked healthy but we also require Consul checks to be healthy
// and they are not yet, return, unless the task is terminal.
usesConsulChecks := t.useChecks && t.consulCheckCount > 0
if !terminal && healthy && usesConsulChecks && !t.checksHealthy {
return
}
// If we are marked healthy but also require Nomad checks to be healthy and
// they are not yet, return, unless the task is terminal.
usesNomadChecks := t.useChecks && t.nomadCheckCount > 0
if !terminal && healthy && usesNomadChecks && !t.checksHealthy {
return
}
select {
case t.healthy <- healthy:
// nothing
default:
}
// Shutdown the tracker
t.cancelFn()
}
// setCheckHealth is used to mark the checks as either healthy or unhealthy.
// It returns true if health is propagated and no more health monitoring is needed.
//
// todo: this is currently being shared by watchConsulEvents and watchNomadEvents
// and must be split up if/when we support registering services (and thus checks)
// of different providers.
func (t *Tracker) setCheckHealth(healthy bool) bool {
t.lock.Lock()
defer t.lock.Unlock()
// check health should always be false if tasks are unhealthy
// as checks might be missing from unhealthy tasks
t.checksHealthy = healthy && t.tasksHealthy
// Only signal if we are healthy and so are the tasks
if !t.checksHealthy {
return false
}
select {
case t.healthy <- healthy:
// nothing
default:
}
// Shutdown the tracker, things are healthy so nothing to do
t.cancelFn()
return true
}
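// Taken together, setTaskHealth and setCheckHealth gate the overall health
// signal: barring a terminal allocation, true is sent on the healthy channel
// only once tasksHealthy is true and, when checks are in use, checksHealthy is
// true as well. Until both conditions hold each setter returns early and the
// watchers keep running.
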
// markAllocStopped is used to mark the allocation as having stopped.
func (t *Tracker) markAllocStopped() {
close(t.allocStopped)
t.cancelFn()
}
// watchTaskEvents is a long-lived watcher that watches the health of the
// allocation's tasks.
func (t *Tracker) watchTaskEvents() {
alloc := t.alloc
allStartedTime := time.Time{}
waiter := newHealthyFuture()
for {
// If the alloc is being stopped by the server just exit
switch alloc.DesiredStatus {
case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
t.logger.Trace("desired status is terminal for alloc", "alloc_id", alloc.ID, "desired_status", alloc.DesiredStatus)
t.markAllocStopped()
return
}
// Store the task states
t.lock.Lock()
for task, state := range alloc.TaskStates {
//TODO(schmichael) for now skip unknown tasks as
//they're task group services which don't currently
//support checks anyway
if v, ok := t.taskHealth[task]; ok {
v.state = state
}
}
t.lock.Unlock()
// Detect if the alloc is unhealthy and whether all tasks have started yet
latestStartTime := time.Time{}
for taskName, state := range alloc.TaskStates {
// If the task is a poststop task we do not want to evaluate it
// since it will remain pending until the main task has finished
// or exited.
if t.lifecycleTasks[taskName] == structs.TaskLifecycleHookPoststop {
continue
}
// If this is a poststart task which has already succeeded, we
// should skip evaluation.
if t.lifecycleTasks[taskName] == structs.TaskLifecycleHookPoststart && state.Successful() {
continue
}
// One of the tasks has failed so we can exit watching
if state.Failed || (!state.FinishedAt.IsZero() && t.lifecycleTasks[taskName] != structs.TaskLifecycleHookPrestart) {
t.setTaskHealth(false, true)
return
}
if state.State == structs.TaskStatePending {
latestStartTime = time.Time{}
break
} else if state.StartedAt.After(latestStartTime) {
// task is either running or exited successfully
latestStartTime = state.StartedAt
}
}
// If the alloc is marked as failed by the client but none of the
// individual tasks failed, that means something failed at the alloc
// level.
if alloc.ClientStatus == structs.AllocClientStatusFailed {
t.lock.Lock()
t.allocFailed = true
t.lock.Unlock()
t.setTaskHealth(false, true)
return
}
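// If the latest task start time has changed (for example because a task
// restarted), the min_healthy_time clock must restart from the new start time.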
if !latestStartTime.Equal(allStartedTime) {
// reset task health
t.setTaskHealth(false, false)
// Prevent the timer from firing at the old start time
waiter.disable()
// Set the timer since all tasks are started
if !latestStartTime.IsZero() {
allStartedTime = latestStartTime
waiter.wait(t.minHealthyTime)
}
}
select {
case <-t.ctx.Done():
return
case newAlloc, ok := <-t.allocUpdates.Ch():
if !ok {
return
}
alloc = newAlloc
case <-waiter.C():
t.setTaskHealth(true, false)
}
}
}
// healthyFuture is used to fire after tasks or checks have been healthy for MinHealthyTime
type healthyFuture struct {
timer *time.Timer
}
// newHealthyFuture will create a healthyFuture in a disabled state, and
// will do nothing until a call to wait takes place
func newHealthyFuture() *healthyFuture {
timer := time.NewTimer(0)
ht := &healthyFuture{timer: timer}
ht.disable()
return ht
}
// disable stops the healthyFuture from triggering
func (h *healthyFuture) disable() {
if !h.timer.Stop() {
// must ensure channel is clear
// https://pkg.go.dev/time#Timer.Stop
select {
case <-h.timer.C:
default:
}
}
}
// wait will reset the healthyFuture to trigger after dur passes.
func (h *healthyFuture) wait(dur time.Duration) {
// must ensure timer is stopped
// https://pkg.go.dev/time#Timer.Reset
h.disable()
h.timer.Reset(dur)
}
// C returns a channel on which the future will send when ready.
func (h *healthyFuture) C() <-chan time.Time {
return h.timer.C
}
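// Every watcher in this file uses the healthyFuture the same way: wait() is
// called with minHealthyTime once everything currently looks healthy, disable()
// is called on any transition back to unhealthy, and the watcher selects on C()
// to learn that the healthy period has elapsed.
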
// watchConsulEvents is a watcher for the health of the allocation's Consul
// checks. If all checks report healthy the watcher will exit after the
// MinHealthyTime has been reached, otherwise the watcher will continue to
// check unhealthy checks until the ctx is cancelled.
//
// Does not watch Nomad service checks; see watchNomadEvents for those.
func (t *Tracker) watchConsulEvents() {
// checkTicker is the ticker that triggers us to look at the checks in Consul
checkTicker := time.NewTicker(t.checkLookupInterval)
defer checkTicker.Stop()
// waiter is used to fire when the checks have been healthy for the MinHealthyTime
waiter := newHealthyFuture()
// primed marks whether the healthy waiter has been set
primed := false
// Store whether the last Consul checks call was successful or not
consulChecksErr := false
// allocReg holds the objects registered in Consul for the allocation
var allocReg *serviceregistration.AllocRegistration
OUTER:
for {
select {
// we are shutting down
case <-t.ctx.Done():
return
// it is time to check the checks
case <-checkTicker.C:
newAllocReg, err := t.consulClient.AllocRegistrations(t.alloc.ID)
if err != nil {
if !consulChecksErr {
consulChecksErr = true
t.logger.Warn("error looking up Consul registrations for allocation", "error", err, "alloc_id", t.alloc.ID)
}
continue OUTER
} else {
consulChecksErr = false
allocReg = newAllocReg
}
// enough time has passed with healthy checks
case <-waiter.C():
if t.setCheckHealth(true) {
// final health set and propagated
return
}
// checks are healthy but tasks are unhealthy,
// reset and wait until all is healthy
primed = false
}
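// allocReg stays nil until the first successful registration lookup, so there
// is nothing to evaluate yet.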
if allocReg == nil {
continue
}
// Store the task registrations
t.lock.Lock()
for task, reg := range allocReg.Tasks {
if v, ok := t.taskHealth[task]; ok {
v.taskRegistrations = reg
}
}
t.lock.Unlock()
// Detect if all the checks are passing
passed := true
// interpolate services to replace runtime variables
consulServices := t.tg.ConsulServices()
interpolatedServices := make([]*structs.Service, 0, len(consulServices))
for _, service := range consulServices {
env := t.taskEnvs[service.TaskName]
if env == nil {
// This is not expected to happen, but guard against a nil
// task environment that could cause a panic.
t.logger.Error("failed to interpolate service runtime variables: task environment not found",
"alloc_id", t.alloc.ID, "task", service.TaskName)
continue
}
interpolatedService := taskenv.InterpolateService(env, service)
interpolatedServices = append(interpolatedServices, interpolatedService)
}
// scan for missing or unhealthy Consul checks
if !evaluateConsulChecks(interpolatedServices, allocReg) {
t.setCheckHealth(false)
passed = false
}
if !passed {
// Reset the timer since we have transitioned back to unhealthy
if primed {
primed = false
waiter.disable()
}
} else if !primed {
// Reset the timer to fire after MinHealthyTime
primed = true
waiter.wait(t.minHealthyTime)
}
}
}
func evaluateConsulChecks(services []*structs.Service, registrations *serviceregistration.AllocRegistration) bool {
// First, identify any case where a check definition is missing or outdated
// on the Consul side. Note that because check names are not unique, we must
// also keep track of the counts on each side and make sure those also match.
expChecks := make(map[string]int)
regChecks := make(map[string]int)
for _, service := range services {
for _, check := range service.Checks {
expChecks[check.Name]++
}
}
for _, task := range registrations.Tasks {
for _, service := range task.Services {
for _, check := range service.Checks {
regChecks[check.Name]++
}
}
}
// The sidecar service and checks are created when the service is
// registered, not on job registration, so they won't appear in the jobspec.
if !maps.Equal(expChecks, regChecks) {
return false
}
// Now we can simply scan the status of each Check reported by Consul.
for _, task := range registrations.Tasks {
for _, service := range task.Services {
for _, check := range service.Checks {
onUpdate := service.CheckOnUpdate[check.CheckID]
switch check.Status {
case api.HealthWarning:
if onUpdate != structs.OnUpdateIgnoreWarn && onUpdate != structs.OnUpdateIgnore {
return false
}
case api.HealthCritical:
if onUpdate != structs.OnUpdateIgnore {
return false
}
}
}
// Check the health of Connect sidecars, so we don't accidentally
// mark an allocation healthy before min_healthy_time. These don't
// currently support on_update.
if service.SidecarService != nil {
for _, check := range service.SidecarChecks {
switch check.Status {
case api.HealthWarning:
return false
case api.HealthCritical:
return false
}
}
}
}
}
// All checks are present and healthy.
return true
}
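// As an illustrative example of the count comparison above: if the job spec
// defines two services that each carry a check named "ready", expChecks is
// {"ready": 2}; if Consul has so far registered only one of them, regChecks is
// {"ready": 1}, the maps differ, and the allocation is not yet considered
// healthy even though every registered check may be passing.
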
// watchNomadEvents is a watcher for the health of the allocation's Nomad checks.
// If all checks report healthy the watcher will exit after the MinHealthyTime has
// been reached, otherwise the watcher will continue to check unhealthy checks until
// the ctx is cancelled.
//
// Does not watch Consul service checks; see watchConsulEvents for those.
func (t *Tracker) watchNomadEvents() {
// checkTicker is the timer (reset each iteration) that triggers us to look at the checks in Nomad
checkTicker, cancel := helper.NewSafeTimer(t.checkLookupInterval)
defer cancel()
// waiter is used to fire when the checks have been healthy for the MinHealthyTime
waiter := newHealthyFuture()
// allocID of the allocation we are watching checks for
allocID := t.alloc.ID
// primed marks whether the healthy waiter has been set
primed := false
// latest set of Nomad check results
var results map[structs.CheckID]*structs.CheckQueryResult
for {
select {
// tracker has been canceled, so stop waiting
case <-t.ctx.Done():
return
// it is time to check the checks
case <-checkTicker.C:
results = t.checkStore.List(allocID)
checkTicker.Reset(t.checkLookupInterval)
// enough time has passed with healthy checks
case <-waiter.C():
if t.setCheckHealth(true) { // todo(shoenig) this needs to be split between Consul and Nomad
return // final health set and propagated
}
// checks are healthy but tasks are unhealthy, reset and wait
primed = false
}
// scan to see if any checks are failing
passing := true
for _, result := range results {
switch result.Status {
case structs.CheckSuccess:
continue
case structs.CheckFailure:
if result.Mode == structs.Readiness {
continue
}
passing = false
default:
// i.e. pending check; do not consider healthy or ready
passing = false
}
if !passing {
break // 1+ check is failing; no need to continue
}
}
if !passing {
// at least one check is failing, transition to unhealthy
t.setCheckHealth(false)
primed = false
waiter.disable()
}
if passing && !primed {
// healthy but not yet primed, set timer to wait
primed = true
waiter.wait(t.minHealthyTime)
}
}
}
// taskHealthState captures all known health information about a task. It is
// largely used to determine if the task has contributed to the allocation being
// unhealthy.
type taskHealthState struct {
task *structs.Task
state *structs.TaskState
taskRegistrations *serviceregistration.ServiceRegistrations
}
// event takes the deadline time for the allocation to be healthy and the update
// strategy of the group. It returns true if the task has contributed to the
// allocation being unhealthy and if so, an event description of why.
func (t *taskHealthState) event(deadline time.Time, healthyDeadline, minHealthyTime time.Duration, useChecks bool) (string, bool) {
desiredChecks := 0
for _, s := range t.task.Services {
if nc := len(s.Checks); nc > 0 {
desiredChecks += nc
}
}
requireChecks := (desiredChecks > 0) && useChecks
if t.state != nil {
if t.state.Failed {
return "Unhealthy because of failed task", true
}
switch t.state.State {
case structs.TaskStatePending:
return fmt.Sprintf("Task not running by healthy_deadline of %v", healthyDeadline), true
case structs.TaskStateDead:
// non-sidecar hook lifecycle tasks are healthy if they exit with success
if t.task.Lifecycle == nil || t.task.Lifecycle.Sidecar {
return "Unhealthy because of dead task", true
}
case structs.TaskStateRunning:
// We are running so check if we have been running long enough
if t.state.StartedAt.Add(minHealthyTime).After(deadline) {
return fmt.Sprintf("Task not running for min_healthy_time of %v by healthy_deadline of %v", minHealthyTime, healthyDeadline), true
}
}
}
if t.taskRegistrations != nil {
var notPassing []string
passing := 0
OUTER:
for _, sreg := range t.taskRegistrations.Services {
for _, check := range sreg.Checks {
if check.Status != api.HealthPassing {
notPassing = append(notPassing, sreg.Service.Service)
continue OUTER
} else {
passing++
}
}
}
if len(notPassing) != 0 {
return fmt.Sprintf("Services not healthy by deadline: %s", strings.Join(notPassing, ", ")), true
}
if passing != desiredChecks {
return fmt.Sprintf("Only %d out of %d checks registered and passing", passing, desiredChecks), true
}
} else if requireChecks {
return "Service checks not registered", true
}
return "", false
}