open-nomad/client/allocrunner/alloc_runner.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package allocrunner

import (
	"context"
	"fmt"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	multierror "github.com/hashicorp/go-multierror"
	"golang.org/x/exp/maps"

	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	"github.com/hashicorp/nomad/client/allocrunner/state"
	"github.com/hashicorp/nomad/client/allocrunner/tasklifecycle"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/devicemanager"
	"github.com/hashicorp/nomad/client/dynamicplugins"
	cinterfaces "github.com/hashicorp/nomad/client/interfaces"
	"github.com/hashicorp/nomad/client/lib/cgutil"
	"github.com/hashicorp/nomad/client/pluginmanager/csimanager"
	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
	"github.com/hashicorp/nomad/client/serviceregistration"
	"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
	"github.com/hashicorp/nomad/client/serviceregistration/wrapper"
	cstate "github.com/hashicorp/nomad/client/state"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/client/vaultclient"
	"github.com/hashicorp/nomad/helper/pointer"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/plugins/device"
	"github.com/hashicorp/nomad/plugins/drivers"
)

// allocRunner is used to run all the tasks in a given allocation
type allocRunner struct {
	// id is the ID of the allocation. Can be accessed without a lock
	id string

	// Logger is the logger for the alloc runner.
	logger log.Logger

	// clientConfig is the client configuration block.
	clientConfig *config.Config

	// stateUpdater is used to emit updated alloc state
	stateUpdater cinterfaces.AllocStateHandler

	// taskStateUpdatedCh is ticked whenever task state as changed. Must
	// have len==1 to allow nonblocking notification of state updates while
	// the goroutine is already processing a previous update.
	taskStateUpdatedCh chan struct{}

	// taskStateUpdateHandlerCh is closed when the task state handling
	// goroutine exits. It is unsafe to destroy the local allocation state
	// before this goroutine exits.
	taskStateUpdateHandlerCh chan struct{}

	// allocUpdatedCh is a channel that is used to stream allocation updates into
	// the allocUpdate handler. Must have len==1 to allow nonblocking notification
	// of new allocation updates while the goroutine is processing a previous
	// update.
	allocUpdatedCh chan *structs.Allocation

	// consulClient is the client used by the consul service hook for
	// registering services and checks
	consulClient serviceregistration.Handler

	// consulProxiesClient is the client used by the envoy version hook for
	// looking up supported envoy versions of the consul agent.
	consulProxiesClient consul.SupportedProxiesAPI

	// sidsClient is the client used by the service identity hook for
	// managing SI tokens
	sidsClient consul.ServiceIdentityAPI

	// vaultClient is the used to manage Vault tokens
	vaultClient vaultclient.VaultClient

	// waitCh is closed when the Run loop has exited
	waitCh chan struct{}

	// destroyed is true when the Run loop has exited, postrun hooks have
	// run, and alloc runner has been destroyed. Must acquire destroyedLock
	// to access.
	destroyed bool

	// destroyCh is closed when the Run loop has exited, postrun hooks have
	// run, and alloc runner has been destroyed.
	destroyCh chan struct{}

	// shutdown is true when the Run loop has exited, and shutdown hooks have
	// run. Must acquire destroyedLock to access.
	shutdown bool

	// shutdownCh is closed when the Run loop has exited, and shutdown hooks
	// have run.
	shutdownCh chan struct{}

	// destroyLaunched is true if Destroy has been called. Must acquire
	// destroyedLock to access.
	destroyLaunched bool

	// shutdownLaunched is true if Shutdown has been called. Must acquire
	// destroyedLock to access.
	shutdownLaunched bool

	// destroyedLock guards destroyed, destroyLaunched, shutdownLaunched,
	// and serializes Shutdown/Destroy calls.
	destroyedLock sync.Mutex

	// Alloc captures the allocation being run.
	alloc     *structs.Allocation
	allocLock sync.RWMutex

	// state is the alloc runner's state
	state     *state.State
	stateLock sync.RWMutex

	// lastAcknowledgedState is the alloc runner state that was last
	// acknowledged by the server (may lag behind ar.state)
	lastAcknowledgedState *state.State

	stateDB cstate.StateDB

	// allocDir is used to build the allocations directory structure.
	allocDir *allocdir.AllocDir

	// runnerHooks are alloc runner lifecycle hooks that should be run on state
	// transitions.
	runnerHooks []interfaces.RunnerHook

	// hookResources holds the output from allocrunner hooks so that later
	// allocrunner hooks or task runner hooks can read them
	hookResources *cstructs.AllocHookResources

	// tasks are the set of task runners
	tasks map[string]*taskrunner.TaskRunner

	// deviceStatsReporter is used to lookup resource usage for alloc devices
	deviceStatsReporter cinterfaces.DeviceStatsReporter

	// allocBroadcaster sends client allocation updates to all listeners
	allocBroadcaster *cstructs.AllocBroadcaster

	// prevAllocWatcher allows waiting for any previous or preempted allocations
	// to exit
	prevAllocWatcher config.PrevAllocWatcher

	// prevAllocMigrator allows the migration of a previous allocations alloc dir.
	prevAllocMigrator config.PrevAllocMigrator

	// dynamicRegistry contains all locally registered dynamic plugins (e.g csi
	// plugins).
	dynamicRegistry dynamicplugins.Registry

	// csiManager is used to wait for CSI Volumes to be attached, and by the task
	// runner to manage their mounting
	csiManager csimanager.Manager

	// cpusetManager is responsible for configuring task cgroups if supported by the platform
	cpusetManager cgutil.CpusetManager

	// devicemanager is used to mount devices as well as lookup device
	// statistics
	devicemanager devicemanager.Manager

	// driverManager is responsible for dispensing driver plugins and registering
	// event handlers
	driverManager drivermanager.Manager

	// serversContactedCh is passed to TaskRunners so they can detect when
	// servers have been contacted for the first time in case of a failed
	// restore.
	serversContactedCh chan struct{}

	// taskCoordinator is used to controlled when tasks are allowed to run
	// depending on their lifecycle configuration.
	taskCoordinator *tasklifecycle.Coordinator

	shutdownDelayCtx      context.Context
	shutdownDelayCancelFn context.CancelFunc

	// rpcClient is the RPC Client that should be used by the allocrunner and its
	// hooks to communicate with Nomad Servers.
	rpcClient config.RPCer

	// serviceRegWrapper is the handler wrapper that is used by service hooks
	// to perform service and check registration and deregistration.
	serviceRegWrapper *wrapper.HandlerWrapper

	// checkStore contains check status information
	checkStore checkstore.Shim

	// getter is an interface for retrieving artifacts.
	getter cinterfaces.ArtifactGetter
}

// NewAllocRunner returns a new allocation runner.
func NewAllocRunner(config *config.AllocRunnerConfig) (interfaces.AllocRunner, error) {
	alloc := config.Alloc
	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		return nil, fmt.Errorf("failed to lookup task group %q", alloc.TaskGroup)
	}

	ar := &allocRunner{
		id:                       alloc.ID,
		alloc:                    alloc,
		clientConfig:             config.ClientConfig,
		consulClient:             config.Consul,
		consulProxiesClient:      config.ConsulProxies,
		sidsClient:               config.ConsulSI,
		vaultClient:              config.Vault,
		tasks:                    make(map[string]*taskrunner.TaskRunner, len(tg.Tasks)),
		waitCh:                   make(chan struct{}),
		destroyCh:                make(chan struct{}),
		shutdownCh:               make(chan struct{}),
		state:                    &state.State{},
		stateDB:                  config.StateDB,
		stateUpdater:             config.StateUpdater,
		taskStateUpdatedCh:       make(chan struct{}, 1),
		taskStateUpdateHandlerCh: make(chan struct{}),
		allocUpdatedCh:           make(chan *structs.Allocation, 1),
		deviceStatsReporter:      config.DeviceStatsReporter,
		prevAllocWatcher:         config.PrevAllocWatcher,
		prevAllocMigrator:        config.PrevAllocMigrator,
		dynamicRegistry:          config.DynamicRegistry,
		csiManager:               config.CSIManager,
		cpusetManager:            config.CpusetManager,
		devicemanager:            config.DeviceManager,
		driverManager:            config.DriverManager,
		serversContactedCh:       config.ServersContactedCh,
		rpcClient:                config.RPCClient,
		serviceRegWrapper:        config.ServiceRegWrapper,
		checkStore:               config.CheckStore,
		getter:                   config.Getter,
		hookResources:            cstructs.NewAllocHookResources(),
	}

	// Create the logger based on the allocation ID
	ar.logger = config.Logger.Named("alloc_runner").With("alloc_id", alloc.ID)

	// Create alloc broadcaster
	ar.allocBroadcaster = cstructs.NewAllocBroadcaster(ar.logger)

	// Create alloc dir
	ar.allocDir = allocdir.NewAllocDir(ar.logger, config.ClientConfig.AllocDir, alloc.ID)

	ar.taskCoordinator = tasklifecycle.NewCoordinator(ar.logger, tg.Tasks, ar.waitCh)

	shutdownDelayCtx, shutdownDelayCancel := context.WithCancel(context.Background())
	ar.shutdownDelayCtx = shutdownDelayCtx
	ar.shutdownDelayCancelFn = shutdownDelayCancel

	// Initialize the runners hooks.
	if err := ar.initRunnerHooks(config.ClientConfig); err != nil {
		return nil, err
	}

	// Create the TaskRunners
	if err := ar.initTaskRunners(tg.Tasks); err != nil {
		return nil, err
	}

	return ar, nil
}

// initTaskRunners creates task runners but does *not* run them.
func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error {
	for _, task := range tasks {
		trConfig := &taskrunner.Config{
			Alloc:               ar.alloc,
			ClientConfig:        ar.clientConfig,
			Task:                task,
			TaskDir:             ar.allocDir.NewTaskDir(task.Name),
			Logger:              ar.logger,
			StateDB:             ar.stateDB,
			StateUpdater:        ar,
			DynamicRegistry:     ar.dynamicRegistry,
			Consul:              ar.consulClient,
			ConsulProxies:       ar.consulProxiesClient,
			ConsulSI:            ar.sidsClient,
			Vault:               ar.vaultClient,
			DeviceStatsReporter: ar.deviceStatsReporter,
			CSIManager:          ar.csiManager,
			DeviceManager:       ar.devicemanager,
			DriverManager:       ar.driverManager,
			ServersContactedCh:  ar.serversContactedCh,
			StartConditionMetCh: ar.taskCoordinator.StartConditionForTask(task),
			ShutdownDelayCtx:    ar.shutdownDelayCtx,
			ServiceRegWrapper:   ar.serviceRegWrapper,
			Getter:              ar.getter,
			AllocHookResources:  ar.hookResources,
		}

		if ar.cpusetManager != nil {
			trConfig.CpusetCgroupPathGetter = ar.cpusetManager.CgroupPathFor(ar.id, task.Name)
		}

		// Create, but do not Run, the task runner
		tr, err := taskrunner.NewTaskRunner(trConfig)
		if err != nil {
			return fmt.Errorf("failed creating runner for task %q: %v", task.Name, err)
		}

		ar.tasks[task.Name] = tr
	}
	return nil
}

func (ar *allocRunner) WaitCh() <-chan struct{} {
	return ar.waitCh
}

// Run the AllocRunner. Starts tasks if the alloc is non-terminal and closes
// WaitCh when it exits. Should be started in a goroutine.
func (ar *allocRunner) Run() {
	// Close the wait channel on return
	defer close(ar.waitCh)

	// Start the task state update handler
	go ar.handleTaskStateUpdates()

	// Start the alloc update handler
	go ar.handleAllocUpdates()

	// If task update chan has been closed, that means we've been shutdown.
	select {
	case <-ar.taskStateUpdateHandlerCh:
		return
	default:
	}

	// When handling (potentially restored) terminal alloc, ensure tasks and post-run hooks are run
	// to perform any cleanup that's necessary, potentially not done prior to earlier termination

	// Run the prestart hooks if non-terminal
	if ar.shouldRun() {
		if err := ar.prerun(); err != nil {
			ar.logger.Error("prerun failed", "error", err)

			for _, tr := range ar.tasks {
				// emit event and mark task to be cleaned up during runTasks()
				tr.MarkFailedKill(fmt.Sprintf("failed to setup alloc: %v", err))
			}
		}
	}

	// Run the runners (blocks until they exit)
	ar.runTasks()

	if ar.isShuttingDown() {
		return
	}

	// Run the postrun hooks
	if err := ar.postrun(); err != nil {
		ar.logger.Error("postrun failed", "error", err)
	}

}

// shouldRun returns true if the alloc is in a state that the alloc runner
// should run it.
func (ar *allocRunner) shouldRun() bool {
	// Do not run allocs that are terminal
	if ar.Alloc().TerminalStatus() {
		ar.logger.Trace("alloc terminal; not running",
			"desired_status", ar.Alloc().DesiredStatus,
			"client_status", ar.Alloc().ClientStatus,
		)
		return false
	}

	// It's possible that the alloc local state was marked terminal before
	// the server copy of the alloc (checked above) was marked as terminal,
	// so check the local state as well.
	switch clientStatus := ar.AllocState().ClientStatus; clientStatus {
	case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed, structs.AllocClientStatusLost:
		ar.logger.Trace("alloc terminal; updating server and not running", "status", clientStatus)
		return false
	}

	return true
}

// runTasks is used to run the task runners and block until they exit.
func (ar *allocRunner) runTasks() {
	// Start and wait for all tasks.
	for _, task := range ar.tasks {
		go task.Run()
	}
	for _, task := range ar.tasks {
		<-task.WaitCh()
	}
}

// Alloc returns the current allocation being run by this runner as sent by the
// server. This view of the allocation does not have updated task states.
func (ar *allocRunner) Alloc() *structs.Allocation {
	ar.allocLock.RLock()
	defer ar.allocLock.RUnlock()
	return ar.alloc
}

func (ar *allocRunner) setAlloc(updated *structs.Allocation) {
	ar.allocLock.Lock()
	ar.alloc = updated
	ar.allocLock.Unlock()
}

// GetAllocDir returns the alloc dir which is safe for concurrent use.
func (ar *allocRunner) GetAllocDir() *allocdir.AllocDir {
	return ar.allocDir
}

// Restore state from database. Must be called after NewAllocRunner but before
// Run.
func (ar *allocRunner) Restore() error {
	// Retrieve deployment status to avoid reseting it across agent
	// restarts. Once a deployment status is set Nomad no longer monitors
	// alloc health, so we must persist deployment state across restarts.
	ds, err := ar.stateDB.GetDeploymentStatus(ar.id)
	if err != nil {
		return err
	}

	ns, err := ar.stateDB.GetNetworkStatus(ar.id)
	if err != nil {
		return err
	}

	ar.stateLock.Lock()
	ar.state.DeploymentStatus = ds
	ar.state.NetworkStatus = ns
	ar.stateLock.Unlock()

	states := make(map[string]*structs.TaskState)

	// Restore task runners
	for _, tr := range ar.tasks {
		if err := tr.Restore(); err != nil {
			return err
		}
		states[tr.Task().Name] = tr.TaskState()
	}

	ar.taskCoordinator.Restore(states)

	return nil
}

// persistDeploymentStatus stores AllocDeploymentStatus.
func (ar *allocRunner) persistDeploymentStatus(ds *structs.AllocDeploymentStatus) {
	if err := ar.stateDB.PutDeploymentStatus(ar.id, ds); err != nil {
		// While any persistence errors are very bad, the worst case
		// scenario for failing to persist deployment status is that if
		// the agent is restarted it will monitor the deployment status
		// again. This could cause a deployment's status to change when
		// that shouldn't happen. However, allowing that seems better
		// than failing the entire allocation.
		ar.logger.Error("error storing deployment status", "error", err)
	}
}

// TaskStateUpdated is called by TaskRunner when a task's state has been
// updated. It does not process the update synchronously but instead notifies a
// goroutine the state has change. Since processing the state change may cause
// the task to be killed (thus change its state again) it cannot be done
// synchronously as it would cause a deadlock due to reentrancy.
//
// The goroutine is used to compute changes to the alloc's ClientStatus and to
// update the server with the new state.
func (ar *allocRunner) TaskStateUpdated() {
	select {
	case ar.taskStateUpdatedCh <- struct{}{}:
	default:
		// already pending updates
	}
}

// handleTaskStateUpdates must be run in goroutine as it monitors
// taskStateUpdatedCh for task state update notifications and processes task
// states.
//
// Processing task state updates must be done in a goroutine as it may have to
// kill tasks which causes further task state updates.
func (ar *allocRunner) handleTaskStateUpdates() {
	defer close(ar.taskStateUpdateHandlerCh)

	hasSidecars := hasSidecarTasks(ar.tasks)

	for done := false; !done; {
		select {
		case <-ar.taskStateUpdatedCh:
		case <-ar.waitCh:
			// Run has exited, sync once more to ensure final
			// states are collected.
			done = true
		}

		ar.logger.Trace("handling task state update", "done", done)

		// Set with the appropriate event if task runners should be
		// killed.
		var killEvent *structs.TaskEvent

		// If task runners should be killed, this is set to the task
		// name whose fault it is.
		killTask := ""

		// Task state has been updated; gather the state of the other tasks
		trNum := len(ar.tasks)
		liveRunners := make([]*taskrunner.TaskRunner, 0, trNum)
		states := make(map[string]*structs.TaskState, trNum)

		for name, tr := range ar.tasks {
			taskState := tr.TaskState()
			states[name] = taskState

			if tr.IsPoststopTask() {
				continue
			}

			// Capture live task runners in case we need to kill them
			if taskState.State != structs.TaskStateDead {
				liveRunners = append(liveRunners, tr)
				continue
			}

			// Task is dead, determine if other tasks should be killed
			if taskState.Failed {
				// Only set failed event if no event has been
				// set yet to give dead leaders priority.
				if killEvent == nil {
					killTask = name
					killEvent = structs.NewTaskEvent(structs.TaskSiblingFailed).
						SetFailedSibling(name)
				}
			} else if tr.IsLeader() {
				killEvent = structs.NewTaskEvent(structs.TaskLeaderDead)
			}
		}

		// kill remaining live tasks
		if len(liveRunners) > 0 {

			// if all live runners are sidecars - kill alloc
			onlySidecarsRemaining := hasSidecars && !hasNonSidecarTasks(liveRunners)
			if killEvent == nil && onlySidecarsRemaining {
				killEvent = structs.NewTaskEvent(structs.TaskMainDead)
			}

			// If there's a kill event set and live runners, kill them
			if killEvent != nil {

				// Log kill reason
				switch killEvent.Type {
				case structs.TaskLeaderDead:
					ar.logger.Debug("leader task dead, destroying all tasks", "leader_task", killTask)
				case structs.TaskMainDead:
					ar.logger.Debug("main tasks dead, destroying all sidecar tasks")
				default:
					ar.logger.Debug("task failure, destroying all tasks", "failed_task", killTask)
				}

				// Emit kill event for live runners
				for _, tr := range liveRunners {
					tr.EmitEvent(killEvent)
				}

				// Kill 'em all
				states = ar.killTasks()

				// Wait for TaskRunners to exit before continuing. This will
				// prevent looping before TaskRunners have transitioned to
				// Dead.
				for _, tr := range liveRunners {
					ar.logger.Info("waiting for task to exit", "task", tr.Task().Name)
					select {
					case <-tr.WaitCh():
					case <-ar.waitCh:
					}
				}
			}
		} else {
			// there are no live runners left

			// run AR pre-kill hooks if this alloc is done, but not if it's because
			// the agent is shutting down.
			if !ar.isShuttingDown() && done {
				ar.preKillHooks()
			}

			// If there are no live runners left kill all non-poststop task
			// runners to unblock them from the alloc restart loop.
			for _, tr := range ar.tasks {
				if tr.IsPoststopTask() {
					continue
				}

				select {
				case <-tr.WaitCh():
				case <-ar.waitCh:
				default:
					// Kill task runner without setting an event because the
					// task is already dead, it's just waiting in the alloc
					// restart loop.
					err := tr.Kill(context.TODO(), nil)
					if err != nil {
						ar.logger.Warn("failed to kill task", "task", tr.Task().Name, "error", err)
					}
				}
			}
		}

		ar.taskCoordinator.TaskStateUpdated(states)

		// Get the client allocation
		calloc := ar.clientAlloc(states)

		// Update the server
		ar.stateUpdater.AllocStateUpdated(calloc)

		// Broadcast client alloc to listeners
		ar.allocBroadcaster.Send(calloc)
	}
}

// hasNonSidecarTasks returns false if all the passed tasks are sidecar tasks
func hasNonSidecarTasks(tasks []*taskrunner.TaskRunner) bool {
	for _, tr := range tasks {
		if !tr.IsSidecarTask() {
			return true
		}
	}

	return false
}

// hasSidecarTasks returns true if any of the passed tasks are sidecar tasks
func hasSidecarTasks(tasks map[string]*taskrunner.TaskRunner) bool {
	for _, tr := range tasks {
		if tr.IsSidecarTask() {
			return true
		}
	}

	return false
}

// killTasks kills all task runners, leader (if there is one) first. Errors are
// logged except taskrunner.ErrTaskNotRunning which is ignored. Task states
// after Kill has been called are returned.
func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
	var mu sync.Mutex
	states := make(map[string]*structs.TaskState, len(ar.tasks))

	// run alloc prekill hooks
	ar.preKillHooks()

	// Kill leader first, synchronously
	for name, tr := range ar.tasks {
		if !tr.IsLeader() {
			continue
		}

		taskEvent := structs.NewTaskEvent(structs.TaskKilling)
		taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
		err := tr.Kill(context.TODO(), taskEvent)
		if err != nil && err != taskrunner.ErrTaskNotRunning {
			ar.logger.Warn("error stopping leader task", "error", err, "task_name", name)
		}

		taskState := tr.TaskState()
		states[name] = taskState
		break
	}

	// Kill the rest non-sidecar and non-poststop tasks concurrently
	wg := sync.WaitGroup{}
	for name, tr := range ar.tasks {
		// Filter out poststop and sidecar tasks so that they stop after all the other tasks are killed
		if tr.IsLeader() || tr.IsPoststopTask() || tr.IsSidecarTask() {
			continue
		}

		wg.Add(1)
		go func(name string, tr *taskrunner.TaskRunner) {
			defer wg.Done()
			taskEvent := structs.NewTaskEvent(structs.TaskKilling)
			taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
			err := tr.Kill(context.TODO(), taskEvent)
			if err != nil && err != taskrunner.ErrTaskNotRunning {
				ar.logger.Warn("error stopping task", "error", err, "task_name", name)
			}

			taskState := tr.TaskState()
			mu.Lock()
			states[name] = taskState
			mu.Unlock()
		}(name, tr)
	}
	wg.Wait()

	// Kill the sidecar tasks last.
	for name, tr := range ar.tasks {
		if !tr.IsSidecarTask() || tr.IsLeader() || tr.IsPoststopTask() {
			continue
		}

		wg.Add(1)
		go func(name string, tr *taskrunner.TaskRunner) {
			defer wg.Done()
			taskEvent := structs.NewTaskEvent(structs.TaskKilling)
			taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
			err := tr.Kill(context.TODO(), taskEvent)
			if err != nil && err != taskrunner.ErrTaskNotRunning {
				ar.logger.Warn("error stopping sidecar task", "error", err, "task_name", name)
			}

			taskState := tr.TaskState()
			mu.Lock()
			states[name] = taskState
			mu.Unlock()
		}(name, tr)
	}
	wg.Wait()

	// Perform no action on post stop tasks, but retain their states if they exist. This
	// commonly happens at the time of alloc GC from the client node.
	for name, tr := range ar.tasks {
		if !tr.IsPoststopTask() {
			continue
		}

		state := tr.TaskState()
		if state != nil {
			states[name] = state
		}
	}

	return states
}

// clientAlloc takes in the task states and returns an Allocation populated with
// Client specific fields. Note: this mutates the allocRunner's state to store
// the taskStates!
func (ar *allocRunner) clientAlloc(taskStates map[string]*structs.TaskState) *structs.Allocation {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()

	// store task states for AllocState to expose
	ar.state.TaskStates = taskStates

	a := &structs.Allocation{
		ID:         ar.id,
		TaskStates: taskStates,
	}

	if d := ar.state.DeploymentStatus; d != nil {
		a.DeploymentStatus = d.Copy()
	}

	// Compute the ClientStatus
	if ar.state.ClientStatus != "" {
		// The client status is being forced
		a.ClientStatus, a.ClientDescription = ar.state.ClientStatus, ar.state.ClientDescription
	} else {
		a.ClientStatus, a.ClientDescription = getClientStatus(taskStates)
	}

	// If the allocation is terminal, make sure all required fields are properly
	// set.
	if a.ClientTerminalStatus() {
		alloc := ar.Alloc()

		// If we are part of a deployment and the alloc has failed, mark the
		// alloc as unhealthy. This guards against the watcher not be started.
		// If the health status is already set then terminal allocations should not
		if a.ClientStatus == structs.AllocClientStatusFailed &&
			alloc.DeploymentID != "" && !a.DeploymentStatus.HasHealth() {
			a.DeploymentStatus = &structs.AllocDeploymentStatus{
				Healthy: pointer.Of(false),
			}
		}

		// Make sure we have marked the finished at for every task. This is used
		// to calculate the reschedule time for failed allocations.
		now := time.Now()
		for taskName := range ar.tasks {
			ts, ok := a.TaskStates[taskName]
			if !ok {
				ts = &structs.TaskState{}
				a.TaskStates[taskName] = ts
			}
			if ts.FinishedAt.IsZero() {
				ts.FinishedAt = now
			}
		}
	}

	// Set the NetworkStatus and default DNSConfig if one is not returned from the client
	netStatus := ar.state.NetworkStatus
	if netStatus != nil {
		a.NetworkStatus = netStatus
	} else {
		a.NetworkStatus = new(structs.AllocNetworkStatus)
	}

	if a.NetworkStatus.DNS == nil {
		alloc := ar.Alloc()
		nws := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Networks
		if len(nws) > 0 {
			a.NetworkStatus.DNS = nws[0].DNS.Copy()
		}
	}

	return a
}

// getClientStatus takes in the task states for a given allocation and computes
// the client status and description
func getClientStatus(taskStates map[string]*structs.TaskState) (status, description string) {
	var pending, running, dead, failed bool
	for _, state := range taskStates {
		switch state.State {
		case structs.TaskStateRunning:
			running = true
		case structs.TaskStatePending:
			pending = true
		case structs.TaskStateDead:
			if state.Failed {
				failed = true
			} else {
				dead = true
			}
		}
	}

	// Determine the alloc status
	if failed {
		return structs.AllocClientStatusFailed, "Failed tasks"
	} else if running {
		return structs.AllocClientStatusRunning, "Tasks are running"
	} else if pending {
		return structs.AllocClientStatusPending, "No tasks have started"
	} else if dead {
		return structs.AllocClientStatusComplete, "All tasks have completed"
	}

	return "", ""
}

// SetClientStatus is a helper for forcing a specific client
// status on the alloc runner. This is used during restore errors
// when the task state can't be restored.
func (ar *allocRunner) SetClientStatus(clientStatus string) {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()
	ar.state.ClientStatus = clientStatus
}

func (ar *allocRunner) SetNetworkStatus(s *structs.AllocNetworkStatus) {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()
	ar.state.NetworkStatus = s.Copy()
}

func (ar *allocRunner) NetworkStatus() *structs.AllocNetworkStatus {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()
	return ar.state.NetworkStatus.Copy()
}

// setIndexes is a helper for forcing alloc state on the alloc runner. This is
// used during reconnect when the task has been marked unknown by the server.
func (ar *allocRunner) setIndexes(update *structs.Allocation) {
	ar.allocLock.Lock()
	defer ar.allocLock.Unlock()
	ar.alloc.AllocModifyIndex = update.AllocModifyIndex
	ar.alloc.ModifyIndex = update.ModifyIndex
	ar.alloc.ModifyTime = update.ModifyTime
}

// AllocState returns a copy of allocation state including a snapshot of task
// states.
func (ar *allocRunner) AllocState() *state.State {
	ar.stateLock.RLock()
	state := ar.state.Copy()
	ar.stateLock.RUnlock()

	// If TaskStateUpdated has not been called yet, ar.state.TaskStates
	// won't be set as it is not the canonical source of TaskStates.
	if len(state.TaskStates) == 0 {
		ar.state.TaskStates = make(map[string]*structs.TaskState, len(ar.tasks))
		for k, tr := range ar.tasks {
			state.TaskStates[k] = tr.TaskState()
		}
	}

	// Generate alloc to get other state fields
	alloc := ar.clientAlloc(state.TaskStates)
	state.ClientStatus = alloc.ClientStatus
	state.ClientDescription = alloc.ClientDescription
	state.DeploymentStatus = alloc.DeploymentStatus

	return state
}

// Update asyncronously updates the running allocation with a new version
// received from the server.
// When processing a new update, we will first attempt to drain stale updates
// from the queue, before appending the new one.
func (ar *allocRunner) Update(update *structs.Allocation) {
	select {
	// Drain queued update from the channel if possible, and check the modify
	// index
	case oldUpdate := <-ar.allocUpdatedCh:
		// If the old update is newer than the replacement, then skip the new one
		// and return. This case shouldn't happen, but may in the case of a bug
		// elsewhere inside the system.
		if oldUpdate.AllocModifyIndex > update.AllocModifyIndex {
			ar.logger.Debug("Discarding allocation update due to newer alloc revision in queue",
				"old_modify_index", oldUpdate.AllocModifyIndex,
				"new_modify_index", update.AllocModifyIndex)
			ar.allocUpdatedCh <- oldUpdate
			return
		} else {
			ar.logger.Debug("Discarding allocation update",
				"skipped_modify_index", oldUpdate.AllocModifyIndex,
				"new_modify_index", update.AllocModifyIndex)
		}
	case <-ar.waitCh:
		ar.logger.Trace("AllocRunner has terminated, skipping alloc update",
			"modify_index", update.AllocModifyIndex)
		return
	default:
	}

	if update.DesiredTransition.ShouldIgnoreShutdownDelay() {
		ar.shutdownDelayCancelFn()
	}

	// Queue the new update
	ar.allocUpdatedCh <- update
}

func (ar *allocRunner) handleAllocUpdates() {
	for {
		select {
		case update := <-ar.allocUpdatedCh:
			ar.handleAllocUpdate(update)
		case <-ar.waitCh:
			return
		}
	}
}

// This method sends the updated alloc to Run for serially processing updates.
// If there is already a pending update it will be discarded and replaced by
// the latest update.
func (ar *allocRunner) handleAllocUpdate(update *structs.Allocation) {
	// Detect Stop updates
	stopping := !ar.Alloc().TerminalStatus() && update.TerminalStatus()

	// Update ar.alloc
	ar.setAlloc(update)

	// Run update hooks if not stopping or dead
	if !update.TerminalStatus() {
		if err := ar.update(update); err != nil {
			ar.logger.Error("error running update hooks", "error", err)
		}

	}

	// Update task runners
	for _, tr := range ar.tasks {
		tr.Update(update)
	}

	// If alloc is being terminated, kill all tasks, leader first
	if stopping {
		ar.killTasks()
	}

}

func (ar *allocRunner) Listener() *cstructs.AllocListener {
	return ar.allocBroadcaster.Listen()
}

func (ar *allocRunner) destroyImpl() {
	// Stop any running tasks and persist states in case the client is
	// shutdown before Destroy finishes.
	states := ar.killTasks()
	calloc := ar.clientAlloc(states)
	ar.stateUpdater.AllocStateUpdated(calloc)

	// Wait for tasks to exit and postrun hooks to finish
	<-ar.waitCh

	// Run destroy hooks
	if err := ar.destroy(); err != nil {
		ar.logger.Warn("error running destroy hooks", "error", err)
	}

	// Wait for task state update handler to exit before removing local
	// state if Run() ran at all.
	<-ar.taskStateUpdateHandlerCh

	// Mark alloc as destroyed
	ar.destroyedLock.Lock()

	// Cleanup state db; while holding the lock to avoid
	// a race periodic PersistState that may resurrect the alloc
	if err := ar.stateDB.DeleteAllocationBucket(ar.id); err != nil {
		ar.logger.Warn("failed to delete allocation state", "error", err)
	}

	if !ar.shutdown {
		ar.shutdown = true
		close(ar.shutdownCh)
	}

	ar.destroyed = true
	close(ar.destroyCh)

	ar.destroyedLock.Unlock()
}

func (ar *allocRunner) PersistState() error {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()

	if ar.destroyed {
		err := ar.stateDB.DeleteAllocationBucket(ar.id, cstate.WithBatchMode())
		if err != nil {
			ar.logger.Warn("failed to delete allocation bucket", "error", err)
		}
		return nil
	}

	// persist network status, wrapping in a func to release state lock as early as possible
	err := func() error {
		ar.stateLock.Lock()
		defer ar.stateLock.Unlock()
		if ar.state.NetworkStatus != nil {
			err := ar.stateDB.PutNetworkStatus(ar.id, ar.state.NetworkStatus, cstate.WithBatchMode())
			if err != nil {
				return err
			}
		}
		return nil
	}()
	if err != nil {
		return err
	}

	// TODO: consider persisting deployment state along with task status.
	// While we study why only the alloc is persisted, I opted to maintain current
	// behavior and not risk adding yet more IO calls unnecessarily.
	return ar.stateDB.PutAllocation(ar.Alloc(), cstate.WithBatchMode())
}

// Destroy the alloc runner by stopping it if it is still running and cleaning
// up all of its resources.
//
// This method is safe for calling concurrently with Run() and will cause it to
// exit (thus closing WaitCh).
// When the destroy action is completed, it will close DestroyCh().
func (ar *allocRunner) Destroy() {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()

	if ar.destroyed {
		// Only destroy once
		return
	}

	if ar.destroyLaunched {
		// Only dispatch a destroy once
		return
	}

	ar.destroyLaunched = true

	// Synchronize calls to shutdown/destroy
	if ar.shutdownLaunched {
		go func() {
			ar.logger.Debug("Waiting for shutdown before destroying runner")
			<-ar.shutdownCh
			ar.destroyImpl()
		}()

		return
	}

	go ar.destroyImpl()
}

// IsDestroyed returns true if the alloc runner has been destroyed (stopped and
// garbage collected).
//
// This method is safe for calling concurrently with Run(). Callers must
// receive on WaitCh() to block until alloc runner has stopped and been
// destroyed.
func (ar *allocRunner) IsDestroyed() bool {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()
	return ar.destroyed
}

// IsWaiting returns true if the alloc runner is waiting for its previous
// allocation to terminate.
//
// This method is safe for calling concurrently with Run().
func (ar *allocRunner) IsWaiting() bool {
	return ar.prevAllocWatcher.IsWaiting()
}

// isShuttingDown returns true if the alloc runner is in a shutdown state
// due to a call to Shutdown() or Destroy()
func (ar *allocRunner) isShuttingDown() bool {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()
	return ar.shutdownLaunched
}

// DestroyCh is a channel that is closed when an allocrunner is closed due to
// an explicit call to Destroy().
func (ar *allocRunner) DestroyCh() <-chan struct{} {
	return ar.destroyCh
}

// ShutdownCh is a channel that is closed when an allocrunner is closed due to
// either an explicit call to Shutdown(), or Destroy().
func (ar *allocRunner) ShutdownCh() <-chan struct{} {
	return ar.shutdownCh
}

// Shutdown AllocRunner gracefully. Asynchronously shuts down all TaskRunners.
// Tasks are unaffected and may be restored.
// When the destroy action is completed, it will close ShutdownCh().
func (ar *allocRunner) Shutdown() {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()

	// Destroy is a superset of Shutdown so there's nothing to do if this
	// has already been destroyed.
	if ar.destroyed {
		return
	}

	// Destroy is a superset of Shutdown so if it's been marked for destruction,
	// don't try and shutdown in parallel. If shutdown has been launched, don't
	// try again.
	if ar.destroyLaunched || ar.shutdownLaunched {
		return
	}

	ar.shutdownLaunched = true

	go func() {
		ar.logger.Trace("shutting down")

		// Shutdown tasks gracefully if they were run
		wg := sync.WaitGroup{}
		for _, tr := range ar.tasks {
			wg.Add(1)
			go func(tr *taskrunner.TaskRunner) {
				tr.Shutdown()
				wg.Done()
			}(tr)
		}
		wg.Wait()

		// Wait for Run to exit
		<-ar.waitCh

		// Run shutdown hooks
		ar.shutdownHooks()

		// Wait for updater to finish its final run
		<-ar.taskStateUpdateHandlerCh

		ar.destroyedLock.Lock()
		ar.shutdown = true
		close(ar.shutdownCh)
		ar.destroyedLock.Unlock()
	}()
}

// IsMigrating returns true if the alloc runner is migrating data from its
// previous allocation.
//
// This method is safe for calling concurrently with Run().
func (ar *allocRunner) IsMigrating() bool {
	return ar.prevAllocMigrator.IsMigrating()
}

func (ar *allocRunner) StatsReporter() interfaces.AllocStatsReporter {
	return ar
}

// LatestAllocStats returns the latest stats for an allocation. If taskFilter
// is set, only stats for that task -- if it exists -- are returned.
func (ar *allocRunner) LatestAllocStats(taskFilter string) (*cstructs.AllocResourceUsage, error) {
	astat := &cstructs.AllocResourceUsage{
		Tasks: make(map[string]*cstructs.TaskResourceUsage, len(ar.tasks)),
		ResourceUsage: &cstructs.ResourceUsage{
			MemoryStats: &cstructs.MemoryStats{},
			CpuStats:    &cstructs.CpuStats{},
			DeviceStats: []*device.DeviceGroupStats{},
		},
	}

	for name, tr := range ar.tasks {
		if taskFilter != "" && taskFilter != name {
			// Getting stats for a particular task and its not this one!
			continue
		}

		if usage := tr.LatestResourceUsage(); usage != nil {
			astat.Tasks[name] = usage
			astat.ResourceUsage.Add(usage.ResourceUsage)
			if usage.Timestamp > astat.Timestamp {
				astat.Timestamp = usage.Timestamp
			}
		}
	}

	return astat, nil
}

func (ar *allocRunner) GetTaskEventHandler(taskName string) drivermanager.EventHandler {
	if tr, ok := ar.tasks[taskName]; ok {
		return func(ev *drivers.TaskEvent) {
			tr.EmitEvent(&structs.TaskEvent{
				Type:          structs.TaskDriverMessage,
				Time:          ev.Timestamp.UnixNano(),
				Details:       ev.Annotations,
				DriverMessage: ev.Message,
			})
		}
	}
	return nil
}

// Restart satisfies the WorkloadRestarter interface and restarts all tasks
// that are currently running.
func (ar *allocRunner) Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error {
	return ar.restartTasks(ctx, event, failure, false)
}

// RestartTask restarts the provided task.
func (ar *allocRunner) RestartTask(taskName string, event *structs.TaskEvent) error {
	tr, ok := ar.tasks[taskName]
	if !ok {
		return fmt.Errorf("Could not find task runner for task: %s", taskName)
	}

	return tr.Restart(context.TODO(), event, false)
}

// RestartRunning restarts all tasks that are currently running.
func (ar *allocRunner) RestartRunning(event *structs.TaskEvent) error {
	return ar.restartTasks(context.TODO(), event, false, false)
}

// RestartAll restarts all tasks in the allocation, including dead ones. They
// will restart following their lifecycle order.
func (ar *allocRunner) RestartAll(event *structs.TaskEvent) error {
	// Restart the taskCoordinator to allow dead tasks to run again.
	ar.taskCoordinator.Restart()
	return ar.restartTasks(context.TODO(), event, false, true)
}

// restartTasks restarts all task runners concurrently.
func (ar *allocRunner) restartTasks(ctx context.Context, event *structs.TaskEvent, failure bool, force bool) error {

	// ensure we are not trying to restart an alloc that is terminal
	if !ar.shouldRun() {
		return fmt.Errorf("restart of an alloc that should not run")
	}

	waitCh := make(chan struct{})
	var err *multierror.Error
	var errMutex sync.Mutex

	// run alloc task restart hooks
	ar.taskRestartHooks()

	go func() {
		var wg sync.WaitGroup
		defer close(waitCh)
		for tn, tr := range ar.tasks {
			wg.Add(1)
			go func(taskName string, taskRunner *taskrunner.TaskRunner) {
				defer wg.Done()

				var e error
				if force {
					e = taskRunner.ForceRestart(ctx, event.Copy(), failure)
				} else {
					e = taskRunner.Restart(ctx, event.Copy(), failure)
				}

				// Ignore ErrTaskNotRunning errors since tasks that are not
				// running are expected to not be restarted.
				if e != nil && e != taskrunner.ErrTaskNotRunning {
					errMutex.Lock()
					defer errMutex.Unlock()
					err = multierror.Append(err, fmt.Errorf("failed to restart task %s: %v", taskName, e))
				}
			}(tn, tr)
		}
		wg.Wait()
	}()

	select {
	case <-waitCh:
	case <-ctx.Done():
	}

	return err.ErrorOrNil()
}

// Signal sends a signal request to task runners inside an allocation. If the
// taskName is empty, then it is sent to all tasks.
func (ar *allocRunner) Signal(taskName, signal string) error {
	event := structs.NewTaskEvent(structs.TaskSignaling).SetSignalText(signal)

	if taskName != "" {
		tr, ok := ar.tasks[taskName]
		if !ok {
			return fmt.Errorf("Task not found")
		}

		return tr.Signal(event, signal)
	}

	var err *multierror.Error

	for tn, tr := range ar.tasks {
		rerr := tr.Signal(event.Copy(), signal)
		if rerr != nil {
			err = multierror.Append(err, fmt.Errorf("Failed to signal task: %s, err: %v", tn, rerr))
		}
	}

	return err.ErrorOrNil()
}

// Reconnect logs a reconnect event for each task in the allocation and syncs the current alloc state with the server.
func (ar *allocRunner) Reconnect(update *structs.Allocation) (err error) {
	event := structs.NewTaskEvent(structs.TaskClientReconnected)
	event.Time = time.Now().UnixNano()
	for _, tr := range ar.tasks {
		tr.AppendEvent(event)
	}

	// Update the client alloc with the server side indexes.
	ar.setIndexes(update)

	// Calculate alloc state to get the final state with the new events.
	// Cannot rely on AllocStates as it won't recompute TaskStates once they are set.
	states := make(map[string]*structs.TaskState, len(ar.tasks))
	for name, tr := range ar.tasks {
		states[name] = tr.TaskState()
	}

	// Build the client allocation
	alloc := ar.clientAlloc(states)

	// Update the client state store.
	err = ar.stateUpdater.PutAllocation(alloc)
	if err != nil {
		return
	}

	// Update the server.
	ar.stateUpdater.AllocStateUpdated(alloc)

	// Broadcast client alloc to listeners.
	err = ar.allocBroadcaster.Send(alloc)

	return
}

func (ar *allocRunner) GetTaskExecHandler(taskName string) drivermanager.TaskExecHandler {
	tr, ok := ar.tasks[taskName]
	if !ok {
		return nil
	}

	return tr.TaskExecHandler()
}

func (ar *allocRunner) GetTaskDriverCapabilities(taskName string) (*drivers.Capabilities, error) {
	tr, ok := ar.tasks[taskName]
	if !ok {
		return nil, fmt.Errorf("task not found")
	}

	return tr.DriverCapabilities()
}

// AcknowledgeState is called by the client's alloc sync when a given client
// state has been acknowledged by the server
func (ar *allocRunner) AcknowledgeState(a *state.State) {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()
	ar.lastAcknowledgedState = a
	ar.persistLastAcknowledgedState(a)
}

// persistLastAcknowledgedState stores the last client state acknowledged by the server
func (ar *allocRunner) persistLastAcknowledgedState(a *state.State) {
	if err := ar.stateDB.PutAcknowledgedState(ar.id, a); err != nil {
		// While any persistence errors are very bad, the worst case scenario
		// for failing to persist last acknowledged state is that if the agent
		// is restarted it will send the update again.
		ar.logger.Error("error storing acknowledged allocation status", "error", err)
	}
}

// GetUpdatePriority returns the update priority based the difference between
// the current state and the state that was last acknowledged from a server
// update, returning urgent priority when the update is critical to marking
// allocations for rescheduling. This is called from the client in the same
// goroutine that called AcknowledgeState so that we can't get a TOCTOU error.
func (ar *allocRunner) GetUpdatePriority(a *structs.Allocation) cstructs.AllocUpdatePriority {
	ar.stateLock.RLock()
	defer ar.stateLock.RUnlock()

	last := ar.lastAcknowledgedState
	if last == nil {
		if a.ClientStatus == structs.AllocClientStatusFailed {
			return cstructs.AllocUpdatePriorityUrgent
		}
		return cstructs.AllocUpdatePriorityTypical
	}

	switch {
	case last.ClientStatus != a.ClientStatus:
		return cstructs.AllocUpdatePriorityUrgent
	case last.ClientDescription != a.ClientDescription:
		return cstructs.AllocUpdatePriorityTypical
	case !last.DeploymentStatus.Equal(a.DeploymentStatus):
		// TODO: this field gates deployment progress, so we may consider
		// returning urgent here; right now urgent updates are primarily focused
		// on recovering from failure
		return cstructs.AllocUpdatePriorityTypical
	case !last.NetworkStatus.Equal(a.NetworkStatus):
		return cstructs.AllocUpdatePriorityTypical
	}

	if !maps.EqualFunc(last.TaskStates, a.TaskStates, func(st, o *structs.TaskState) bool {
		return st.Equal(o)

	}) {
		return cstructs.AllocUpdatePriorityTypical
	}

	return cstructs.AllocUpdatePriorityNone
}

func (ar *allocRunner) SetCSIVolumes(vols map[string]*state.CSIVolumeStub) error {
	return ar.stateDB.PutAllocVolumes(ar.id, &state.AllocVolumes{
		CSIVolumes: vols,
	})
}

func (ar *allocRunner) GetCSIVolumes() (map[string]*state.CSIVolumeStub, error) {
	allocVols, err := ar.stateDB.GetAllocVolumes(ar.id)
	if err != nil {
		return nil, err
	}
	if allocVols == nil {
		return nil, nil
	}
	return allocVols.CSIVolumes, nil
}