open-nomad/client/allocrunner/alloc_runner.go
Mahmood Ali f72e599ee7 Populate alloc stats API with device stats
This change makes few compromises:

* Looks up the devices associated with tasks at look up time.  Given
that `nomad alloc status` is called rarely generally (compared to stats
telemetry and general job reporting), it seems fine.  However, the
lookup overhead grows bounded by number of `tasks x total-host-devices`,
which can be significant.

* `client.Client` performs the task devices->statistics lookup.  It
passes self to alloc/task runners so they can look up the device statistics
allocated to them.
  * Currently alloc/task runners are responsible for constructing the
entire RPC response for stats
  * The alternatives for making task runners device statistics aware
don't seem appealing (e.g. having task runners contain reference to hostStats)

* On the alloc aggregation resource usage, I did a naive merging of task device statistics.
  * Personally, I question the value of such aggregation, compared to
costs of struct duplication and bloating the response - but opted to be
consistent in the API.
  * With naive concatination, device instances from a single device group used by separate tasks in the alloc, would be aggregated in two separate device group statistics.
2018-11-16 10:26:32 -05:00

728 lines
21 KiB
Go

package allocrunner
import (
"context"
"fmt"
"path/filepath"
"sync"
"time"
log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/client/allocdir"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
"github.com/hashicorp/nomad/client/allocrunner/state"
"github.com/hashicorp/nomad/client/allocrunner/taskrunner"
"github.com/hashicorp/nomad/client/allocwatcher"
"github.com/hashicorp/nomad/client/config"
"github.com/hashicorp/nomad/client/consul"
cinterfaces "github.com/hashicorp/nomad/client/interfaces"
cstate "github.com/hashicorp/nomad/client/state"
cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/client/vaultclient"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/plugins/device"
"github.com/hashicorp/nomad/plugins/shared/loader"
)
// allocRunner is used to run all the tasks in a given allocation
type allocRunner struct {
// id is the ID of the allocation. Can be accessed without a lock
id string
// Logger is the logger for the alloc runner.
logger log.Logger
clientConfig *config.Config
// stateUpdater is used to emit updated alloc state
stateUpdater cinterfaces.AllocStateHandler
// taskStateUpdateCh is ticked whenever task state as changed. Must
// have len==1 to allow nonblocking notification of state updates while
// the goroutine is already processing a previous update.
taskStateUpdatedCh chan struct{}
// taskStateUpdateHandlerCh is closed when the task state handling
// goroutine exits. It is unsafe to destroy the local allocation state
// before this goroutine exits.
taskStateUpdateHandlerCh chan struct{}
// consulClient is the client used by the consul service hook for
// registering services and checks
consulClient consul.ConsulServiceAPI
// vaultClient is the used to manage Vault tokens
vaultClient vaultclient.VaultClient
// waitCh is closed when the Run() loop has exited
waitCh chan struct{}
// destroyed is true when the Run() loop has exited, postrun hooks have
// run, and alloc runner has been destroyed. Must acquire destroyedLock
// to access.
destroyed bool
// runLaunched is true if Run() has been called. If this is false
// Destroy() does not wait on tasks to shutdown as they are not
// running. Must acquire destroyedLock to access.
runLaunched bool
// destroyedLock guards destroyed, ran, and serializes Destroy() calls.
destroyedLock sync.Mutex
// Alloc captures the allocation being run.
alloc *structs.Allocation
allocLock sync.RWMutex
// state is the alloc runner's state
state *state.State
stateLock sync.RWMutex
stateDB cstate.StateDB
// allocDir is used to build the allocations directory structure.
allocDir *allocdir.AllocDir
// runnerHooks are alloc runner lifecycle hooks that should be run on state
// transistions.
runnerHooks []interfaces.RunnerHook
// tasks are the set of task runners
tasks map[string]*taskrunner.TaskRunner
// deviceStatsReporter is used to lookup resource usage for alloc devices
deviceStatsReporter cinterfaces.DeviceStatsReporter
// allocBroadcaster sends client allocation updates to all listeners
allocBroadcaster *cstructs.AllocBroadcaster
// prevAllocWatcher allows waiting for a previous allocation to exit
// and if necessary migrate its alloc dir.
prevAllocWatcher allocwatcher.PrevAllocWatcher
// pluginSingletonLoader is a plugin loader that will returns singleton
// instances of the plugins.
pluginSingletonLoader loader.PluginCatalog
}
// NewAllocRunner returns a new allocation runner.
func NewAllocRunner(config *Config) (*allocRunner, error) {
alloc := config.Alloc
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
if tg == nil {
return nil, fmt.Errorf("failed to lookup task group %q", alloc.TaskGroup)
}
ar := &allocRunner{
id: alloc.ID,
alloc: alloc,
clientConfig: config.ClientConfig,
consulClient: config.Consul,
vaultClient: config.Vault,
tasks: make(map[string]*taskrunner.TaskRunner, len(tg.Tasks)),
waitCh: make(chan struct{}),
state: &state.State{},
stateDB: config.StateDB,
stateUpdater: config.StateUpdater,
taskStateUpdatedCh: make(chan struct{}, 1),
taskStateUpdateHandlerCh: make(chan struct{}),
deviceStatsReporter: config.DeviceStatsReporter,
allocBroadcaster: cstructs.NewAllocBroadcaster(),
prevAllocWatcher: config.PrevAllocWatcher,
pluginSingletonLoader: config.PluginSingletonLoader,
}
// Create the logger based on the allocation ID
ar.logger = config.Logger.Named("alloc_runner").With("alloc_id", alloc.ID)
// Create alloc dir
ar.allocDir = allocdir.NewAllocDir(ar.logger, filepath.Join(config.ClientConfig.AllocDir, alloc.ID))
// Initialize the runners hooks.
ar.initRunnerHooks()
// Create the TaskRunners
if err := ar.initTaskRunners(tg.Tasks); err != nil {
return nil, err
}
return ar, nil
}
// initTaskRunners creates task runners but does *not* run them.
func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error {
for _, task := range tasks {
config := &taskrunner.Config{
Alloc: ar.alloc,
ClientConfig: ar.clientConfig,
Task: task,
TaskDir: ar.allocDir.NewTaskDir(task.Name),
Logger: ar.logger,
StateDB: ar.stateDB,
StateUpdater: ar,
Consul: ar.consulClient,
Vault: ar.vaultClient,
PluginSingletonLoader: ar.pluginSingletonLoader,
DeviceStatsReporter: ar.deviceStatsReporter,
}
// Create, but do not Run, the task runner
tr, err := taskrunner.NewTaskRunner(config)
if err != nil {
return fmt.Errorf("failed creating runner for task %q: %v", task.Name, err)
}
ar.tasks[task.Name] = tr
}
return nil
}
func (ar *allocRunner) WaitCh() <-chan struct{} {
return ar.waitCh
}
// Run is the main goroutine that executes all the tasks.
func (ar *allocRunner) Run() {
ar.destroyedLock.Lock()
defer ar.destroyedLock.Unlock()
// Run should not be called after Destroy is called. This is a
// programming error.
if ar.destroyed {
ar.logger.Error("alloc destroyed; cannot run")
return
}
// If an alloc should not be run, ensure any restored task handles are
// destroyed and exit to wait for the AR to be GC'd by the client.
if !ar.shouldRun() {
ar.logger.Debug("not running terminal alloc")
// Cleanup and sync state
states := ar.killTasks()
// Get the client allocation
calloc := ar.clientAlloc(states)
// Update the server
ar.stateUpdater.AllocStateUpdated(calloc)
// Broadcast client alloc to listeners
ar.allocBroadcaster.Send(calloc)
return
}
// Run! (and mark as having been run to ensure Destroy cleans up properly)
ar.runLaunched = true
go ar.runImpl()
}
// shouldRun returns true if the alloc is in a state that the alloc runner
// should run it.
func (ar *allocRunner) shouldRun() bool {
// Do not run allocs that are terminal
if ar.Alloc().TerminalStatus() {
ar.logger.Trace("alloc terminal; not running",
"desired_status", ar.Alloc().DesiredStatus,
"client_status", ar.Alloc().ClientStatus,
)
return false
}
// It's possible that the alloc local state was marked terminal before
// the server copy of the alloc (checked above) was marked as terminal,
// so check the local state as well.
switch clientStatus := ar.AllocState().ClientStatus; clientStatus {
case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed, structs.AllocClientStatusLost:
ar.logger.Trace("alloc terminal; updating server and not running", "status", clientStatus)
return false
}
return true
}
func (ar *allocRunner) runImpl() {
// Close the wait channel on return
defer close(ar.waitCh)
// Start the task state update handler
go ar.handleTaskStateUpdates()
// Run the prestart hooks
if err := ar.prerun(); err != nil {
ar.logger.Error("prerun failed", "error", err)
goto POST
}
// Run the runners and block until they exit
<-ar.runTasks()
POST:
// Run the postrun hooks
// XXX Equivalent to TR.Poststop hook
if err := ar.postrun(); err != nil {
ar.logger.Error("postrun failed", "error", err)
}
}
// runTasks is used to run the task runners.
func (ar *allocRunner) runTasks() <-chan struct{} {
for _, task := range ar.tasks {
go task.Run()
}
// Return a combined WaitCh that is closed when all task runners have
// exited.
waitCh := make(chan struct{})
go func() {
defer close(waitCh)
for _, task := range ar.tasks {
<-task.WaitCh()
}
}()
return waitCh
}
// Alloc returns the current allocation being run by this runner as sent by the
// server. This view of the allocation does not have updated task states.
func (ar *allocRunner) Alloc() *structs.Allocation {
ar.allocLock.RLock()
defer ar.allocLock.RUnlock()
return ar.alloc
}
func (ar *allocRunner) setAlloc(updated *structs.Allocation) {
ar.allocLock.Lock()
ar.alloc = updated
ar.allocLock.Unlock()
}
// GetAllocDir returns the alloc dir which is safe for concurrent use.
func (ar *allocRunner) GetAllocDir() *allocdir.AllocDir {
return ar.allocDir
}
// Restore state from database. Must be called after NewAllocRunner but before
// Run.
func (ar *allocRunner) Restore() error {
// Restore task runners
for _, tr := range ar.tasks {
if err := tr.Restore(); err != nil {
return err
}
}
return nil
}
// TaskStateUpdated is called by TaskRunner when a task's state has been
// updated. It does not process the update synchronously but instead notifies a
// goroutine the state has change. Since processing the state change may cause
// the task to be killed (thus change its state again) it cannot be done
// synchronously as it would cause a deadlock due to reentrancy.
//
// The goroutine is used to compute changes to the alloc's ClientStatus and to
// update the server with the new state.
func (ar *allocRunner) TaskStateUpdated() {
select {
case ar.taskStateUpdatedCh <- struct{}{}:
default:
// already pending updates
}
}
// handleTaskStateUpdates must be run in goroutine as it monitors
// taskStateUpdateCh for task state update notifications and processes task
// states.
//
// Processing task state updates must be done in a goroutine as it may have to
// kill tasks which causes further task state updates.
func (ar *allocRunner) handleTaskStateUpdates() {
defer close(ar.taskStateUpdateHandlerCh)
for done := false; !done; {
select {
case <-ar.taskStateUpdatedCh:
case <-ar.waitCh:
// Tasks have exited, run once more to ensure final
// states are collected.
done = true
}
// Set with the appropriate event if task runners should be
// killed.
var killEvent *structs.TaskEvent
// If task runners should be killed, this is set to the task
// name whose fault it is.
killTask := ""
// True if task runners should be killed because a leader
// failed (informational).
leaderFailed := false
// Task state has been updated; gather the state of the other tasks
trNum := len(ar.tasks)
liveRunners := make([]*taskrunner.TaskRunner, 0, trNum)
states := make(map[string]*structs.TaskState, trNum)
for name, tr := range ar.tasks {
state := tr.TaskState()
states[name] = state
// Capture live task runners in case we need to kill them
if state.State != structs.TaskStateDead {
liveRunners = append(liveRunners, tr)
continue
}
// Task is dead, determine if other tasks should be killed
if state.Failed {
// Only set failed event if no event has been
// set yet to give dead leaders priority.
if killEvent == nil {
killTask = name
killEvent = structs.NewTaskEvent(structs.TaskSiblingFailed).
SetFailedSibling(name)
}
} else if tr.IsLeader() {
killEvent = structs.NewTaskEvent(structs.TaskLeaderDead)
leaderFailed = true
killTask = name
}
}
// If there's a kill event set and live runners, kill them
if killEvent != nil && len(liveRunners) > 0 {
// Log kill reason
if leaderFailed {
ar.logger.Debug("leader task dead, destroying all tasks", "leader_task", killTask)
} else {
ar.logger.Debug("task failure, destroying all tasks", "failed_task", killTask)
}
states = ar.killTasks()
}
// Get the client allocation
calloc := ar.clientAlloc(states)
// Update the server
ar.stateUpdater.AllocStateUpdated(calloc)
// Broadcast client alloc to listeners
ar.allocBroadcaster.Send(calloc)
}
}
// killTasks kills all task runners, leader (if there is one) first. Errors are
// logged except taskrunner.ErrTaskNotRunning which is ignored. Task states
// after Kill has been called are returned.
func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
var mu sync.Mutex
states := make(map[string]*structs.TaskState, len(ar.tasks))
// Kill leader first, synchronously
for name, tr := range ar.tasks {
if !tr.IsLeader() {
continue
}
err := tr.Kill(context.TODO(), structs.NewTaskEvent(structs.TaskKilled))
if err != nil && err != taskrunner.ErrTaskNotRunning {
ar.logger.Warn("error stopping leader task", "error", err, "task_name", name)
}
state := tr.TaskState()
states[name] = state
break
}
// Kill the rest concurrently
wg := sync.WaitGroup{}
for name, tr := range ar.tasks {
if tr.IsLeader() {
continue
}
wg.Add(1)
go func(name string, tr *taskrunner.TaskRunner) {
defer wg.Done()
err := tr.Kill(context.TODO(), structs.NewTaskEvent(structs.TaskKilled))
if err != nil && err != taskrunner.ErrTaskNotRunning {
ar.logger.Warn("error stopping task", "error", err, "task_name", name)
}
state := tr.TaskState()
mu.Lock()
states[name] = state
mu.Unlock()
}(name, tr)
}
wg.Wait()
return states
}
// clientAlloc takes in the task states and returns an Allocation populated
// with Client specific fields
func (ar *allocRunner) clientAlloc(taskStates map[string]*structs.TaskState) *structs.Allocation {
ar.stateLock.Lock()
defer ar.stateLock.Unlock()
// store task states for AllocState to expose
ar.state.TaskStates = taskStates
a := &structs.Allocation{
ID: ar.id,
TaskStates: taskStates,
}
if d := ar.state.DeploymentStatus; d != nil {
a.DeploymentStatus = d.Copy()
}
// Compute the ClientStatus
if ar.state.ClientStatus != "" {
// The client status is being forced
a.ClientStatus, a.ClientDescription = ar.state.ClientStatus, ar.state.ClientDescription
} else {
a.ClientStatus, a.ClientDescription = getClientStatus(taskStates)
}
// If the allocation is terminal, make sure all required fields are properly
// set.
if a.ClientTerminalStatus() {
alloc := ar.Alloc()
// If we are part of a deployment and the task has failed, mark the
// alloc as unhealthy. This guards against the watcher not be started.
if a.ClientStatus == structs.AllocClientStatusFailed &&
alloc.DeploymentID != "" && !a.DeploymentStatus.IsUnhealthy() {
a.DeploymentStatus = &structs.AllocDeploymentStatus{
Healthy: helper.BoolToPtr(false),
}
}
// Make sure we have marked the finished at for every task. This is used
// to calculate the reschedule time for failed allocations.
now := time.Now()
for _, task := range alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks {
ts, ok := a.TaskStates[task.Name]
if !ok {
ts = &structs.TaskState{}
a.TaskStates[task.Name] = ts
}
if ts.FinishedAt.IsZero() {
ts.FinishedAt = now
}
}
}
return a
}
// getClientStatus takes in the task states for a given allocation and computes
// the client status and description
func getClientStatus(taskStates map[string]*structs.TaskState) (status, description string) {
var pending, running, dead, failed bool
for _, state := range taskStates {
switch state.State {
case structs.TaskStateRunning:
running = true
case structs.TaskStatePending:
pending = true
case structs.TaskStateDead:
if state.Failed {
failed = true
} else {
dead = true
}
}
}
// Determine the alloc status
if failed {
return structs.AllocClientStatusFailed, "Failed tasks"
} else if running {
return structs.AllocClientStatusRunning, "Tasks are running"
} else if pending {
return structs.AllocClientStatusPending, "No tasks have started"
} else if dead {
return structs.AllocClientStatusComplete, "All tasks have completed"
}
return "", ""
}
// AllocState returns a copy of allocation state including a snapshot of task
// states.
func (ar *allocRunner) AllocState() *state.State {
ar.stateLock.RLock()
state := ar.state.Copy()
ar.stateLock.RUnlock()
// If TaskStateUpdated has not been called yet, ar.state.TaskStates
// won't be set as it is not the canonical source of TaskStates.
if len(state.TaskStates) == 0 {
ar.state.TaskStates = make(map[string]*structs.TaskState, len(ar.tasks))
for k, tr := range ar.tasks {
state.TaskStates[k] = tr.TaskState()
}
}
// Generate alloc to get other state fields
alloc := ar.clientAlloc(state.TaskStates)
state.ClientStatus = alloc.ClientStatus
state.ClientDescription = alloc.ClientDescription
state.DeploymentStatus = alloc.DeploymentStatus
return state
}
// Update the running allocation with a new version received from the server.
//
// This method sends the updated alloc to Run for serially processing updates.
// If there is already a pending update it will be discarded and replaced by
// the latest update.
func (ar *allocRunner) Update(update *structs.Allocation) {
// Detect Stop updates
stopping := !ar.Alloc().TerminalStatus() && update.TerminalStatus()
// Update ar.alloc
ar.setAlloc(update)
// Run update hooks if not stopping or dead
if !update.TerminalStatus() {
if err := ar.update(update); err != nil {
ar.logger.Error("error running update hooks", "error", err)
}
}
// Update task runners
for _, tr := range ar.tasks {
tr.Update(update)
}
// If alloc is being terminated, kill all tasks, leader first
if stopping {
ar.killTasks()
}
}
func (ar *allocRunner) Listener() *cstructs.AllocListener {
return ar.allocBroadcaster.Listen()
}
// Destroy the alloc runner by synchronously stopping it if it is still running
// and cleaning up all of its resources.
//
// This method is safe for calling concurrently with Run() and will cause it to
// exit (thus closing WaitCh).
func (ar *allocRunner) Destroy() {
ar.destroyedLock.Lock()
if ar.destroyed {
// Only destroy once
ar.destroyedLock.Unlock()
return
}
defer ar.destroyedLock.Unlock()
// Stop any running tasks and persist states in case the client is
// shutdown before Destroy finishes.
states := ar.killTasks()
calloc := ar.clientAlloc(states)
ar.stateUpdater.AllocStateUpdated(calloc)
// Wait for tasks to exit and postrun hooks to finish (if they ran at all)
if ar.runLaunched {
<-ar.waitCh
}
// Run destroy hooks
if err := ar.destroy(); err != nil {
ar.logger.Warn("error running destroy hooks", "error", err)
}
// Wait for task state update handler to exit before removing local
// state if Run() ran at all.
if ar.runLaunched {
<-ar.taskStateUpdateHandlerCh
}
// Cleanup state db
if err := ar.stateDB.DeleteAllocationBucket(ar.id); err != nil {
ar.logger.Warn("failed to delete allocation state", "error", err)
}
// Mark alloc as destroyed
ar.destroyed = true
}
// IsDestroyed returns true if the alloc runner has been destroyed (stopped and
// garbage collected).
//
// This method is safe for calling concurrently with Run(). Callers must
// receive on WaitCh() to block until alloc runner has stopped and been
// destroyed.
func (ar *allocRunner) IsDestroyed() bool {
ar.destroyedLock.Lock()
defer ar.destroyedLock.Unlock()
return ar.destroyed
}
// IsWaiting returns true if the alloc runner is waiting for its previous
// allocation to terminate.
//
// This method is safe for calling concurrently with Run().
func (ar *allocRunner) IsWaiting() bool {
return ar.prevAllocWatcher.IsWaiting()
}
// IsMigrating returns true if the alloc runner is migrating data from its
// previous allocation.
//
// This method is safe for calling concurrently with Run().
func (ar *allocRunner) IsMigrating() bool {
return ar.prevAllocWatcher.IsMigrating()
}
func (ar *allocRunner) StatsReporter() interfaces.AllocStatsReporter {
return ar
}
// LatestAllocStats returns the latest stats for an allocation. If taskFilter
// is set, only stats for that task -- if it exists -- are returned.
func (ar *allocRunner) LatestAllocStats(taskFilter string) (*cstructs.AllocResourceUsage, error) {
astat := &cstructs.AllocResourceUsage{
Tasks: make(map[string]*cstructs.TaskResourceUsage, len(ar.tasks)),
ResourceUsage: &cstructs.ResourceUsage{
MemoryStats: &cstructs.MemoryStats{},
CpuStats: &cstructs.CpuStats{},
DeviceStats: []*device.DeviceGroupStats{},
},
}
for name, tr := range ar.tasks {
if taskFilter != "" && taskFilter != name {
// Getting stats for a particular task and its not this one!
continue
}
if usage := tr.LatestResourceUsage(); usage != nil {
astat.Tasks[name] = usage
astat.ResourceUsage.Add(usage.ResourceUsage)
if usage.Timestamp > astat.Timestamp {
astat.Timestamp = usage.Timestamp
}
}
}
return astat, nil
}