package client import ( "context" "fmt" "log" "os" "path/filepath" "sync" "time" metrics "github.com/armon/go-metrics" "github.com/boltdb/bolt" "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/client/vaultclient" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" cstructs "github.com/hashicorp/nomad/client/structs" ) var ( // The following are the key paths written to the state database allocRunnerStateAllocKey = []byte("alloc") allocRunnerStateImmutableKey = []byte("immutable") allocRunnerStateMutableKey = []byte("mutable") allocRunnerStateAllocDirKey = []byte("alloc-dir") ) // AllocStateUpdater is used to update the status of an allocation type AllocStateUpdater func(alloc *structs.Allocation) type AllocStatsReporter interface { LatestAllocStats(taskFilter string) (*cstructs.AllocResourceUsage, error) } // AllocRunner is used to wrap an allocation and provide the execution context. type AllocRunner struct { config *config.Config updater AllocStateUpdater logger *log.Logger // allocID is the ID of this runner's allocation. Since it does not // change for the lifetime of the AllocRunner it is safe to read // without acquiring a lock (unlike alloc). allocID string alloc *structs.Allocation allocClientStatus string // Explicit status of allocation. Set when there are failures allocClientDescription string allocHealth *bool // Whether the allocation is healthy allocHealthTime time.Time // Time at which allocation health has been set allocBroadcast *cstructs.AllocBroadcaster allocLock sync.Mutex dirtyCh chan struct{} allocDir *allocdir.AllocDir allocDirLock sync.Mutex tasks map[string]*TaskRunner taskStates map[string]*structs.TaskState restored map[string]struct{} taskLock sync.RWMutex taskStatusLock sync.RWMutex updateCh chan *structs.Allocation vaultClient vaultclient.VaultClient consulClient ConsulServiceAPI // prevAlloc allows for Waiting until a previous allocation exits and // the migrates it data. If sticky volumes aren't used and there's no // previous allocation a noop implementation is used so it always safe // to call. prevAlloc prevAllocWatcher // ctx is cancelled with exitFn to cause the alloc to be destroyed // (stopped and GC'd). ctx context.Context exitFn context.CancelFunc // waitCh is closed when the Run method exits. At that point the alloc // has stopped and been GC'd. waitCh chan struct{} // State related fields // stateDB is used to store the alloc runners state stateDB *bolt.DB allocStateLock sync.Mutex // persistedEval is the last persisted evaluation ID. Since evaluation // IDs change on every allocation update we only need to persist the // allocation when its eval ID != the last persisted eval ID. persistedEvalLock sync.Mutex persistedEval string // immutablePersisted and allocDirPersisted are used to track whether the // immutable data and the alloc dir have been persisted. Once persisted we // can lower write volume by not re-writing these values immutablePersisted bool allocDirPersisted bool // baseLabels are used when emitting tagged metrics. All alloc runner metrics // will have these tags, and optionally more. baseLabels []metrics.Label } // COMPAT: Remove in 0.7.0 // allocRunnerState is used to snapshot the state of the alloc runner type allocRunnerState struct { Version string Alloc *structs.Allocation AllocDir *allocdir.AllocDir AllocClientStatus string AllocClientDescription string // COMPAT: Remove in 0.7.0: removing will break upgrading directly from // 0.5.2, so don't remove in the 0.6 series. // Context is deprecated and only used to migrate from older releases. // It will be removed in the future. Context *struct { AllocID string // unused; included for completeness AllocDir struct { AllocDir string SharedDir string // unused; included for completeness TaskDirs map[string]string } } `json:"Context,omitempty"` } // allocRunnerAllocState is state that only has to be written when the alloc // changes. type allocRunnerAllocState struct { Alloc *structs.Allocation } // allocRunnerImmutableState is state that only has to be written once. type allocRunnerImmutableState struct { Version string } // allocRunnerMutableState is state that has to be written on each save as it // changes over the life-cycle of the alloc_runner. type allocRunnerMutableState struct { AllocClientStatus string AllocClientDescription string TaskStates map[string]*structs.TaskState DeploymentStatus *structs.AllocDeploymentStatus } // NewAllocRunner is used to create a new allocation context func NewAllocRunner(logger *log.Logger, config *config.Config, stateDB *bolt.DB, updater AllocStateUpdater, alloc *structs.Allocation, vaultClient vaultclient.VaultClient, consulClient ConsulServiceAPI, prevAlloc prevAllocWatcher) *AllocRunner { ar := &AllocRunner{ config: config, stateDB: stateDB, updater: updater, logger: logger, alloc: alloc, allocID: alloc.ID, allocBroadcast: cstructs.NewAllocBroadcaster(8), prevAlloc: prevAlloc, dirtyCh: make(chan struct{}, 1), allocDir: allocdir.NewAllocDir(logger, filepath.Join(config.AllocDir, alloc.ID)), tasks: make(map[string]*TaskRunner), taskStates: copyTaskStates(alloc.TaskStates), restored: make(map[string]struct{}), updateCh: make(chan *structs.Allocation, 64), waitCh: make(chan struct{}), vaultClient: vaultClient, consulClient: consulClient, } // TODO Should be passed a context ar.ctx, ar.exitFn = context.WithCancel(context.TODO()) return ar } // setBaseLabels creates the set of base labels. This should be called after // Restore has been called so the allocation is guaranteed to be loaded func (r *AllocRunner) setBaseLabels() { r.baseLabels = make([]metrics.Label, 0, 3) if r.alloc.Job != nil { r.baseLabels = append(r.baseLabels, metrics.Label{ Name: "job", Value: r.alloc.Job.Name, }) } if r.alloc.TaskGroup != "" { r.baseLabels = append(r.baseLabels, metrics.Label{ Name: "task_group", Value: r.alloc.TaskGroup, }) } if r.config != nil && r.config.Node != nil { r.baseLabels = append(r.baseLabels, metrics.Label{ Name: "node_id", Value: r.config.Node.ID, }) } } // pre060StateFilePath returns the path to our state file that would have been // written pre v0.6.0 // COMPAT: Remove in 0.7.0 func (r *AllocRunner) pre060StateFilePath() string { r.allocLock.Lock() defer r.allocLock.Unlock() path := filepath.Join(r.config.StateDir, "alloc", r.allocID, "state.json") return path } // RestoreState is used to restore the state of the alloc runner func (r *AllocRunner) RestoreState() error { // COMPAT: Remove in 0.7.0 // Check if the old snapshot is there oldPath := r.pre060StateFilePath() var snap allocRunnerState var upgrading bool if err := pre060RestoreState(oldPath, &snap); err == nil { // Restore fields r.logger.Printf("[INFO] client: restoring pre v0.6.0 alloc runner state for alloc %q", r.allocID) r.alloc = snap.Alloc r.allocDir = snap.AllocDir r.allocClientStatus = snap.AllocClientStatus r.allocClientDescription = snap.AllocClientDescription if r.alloc != nil { r.taskStates = snap.Alloc.TaskStates } // COMPAT: Remove in 0.7.0 // #2132 Upgrade path: if snap.AllocDir is nil, try to convert old // Context struct to new AllocDir struct if snap.AllocDir == nil && snap.Context != nil { r.logger.Printf("[DEBUG] client: migrating state snapshot for alloc %q", r.allocID) r.allocDir = allocdir.NewAllocDir(r.logger, snap.Context.AllocDir.AllocDir) for taskName := range snap.Context.AllocDir.TaskDirs { r.allocDir.NewTaskDir(taskName) } } // Delete the old state os.RemoveAll(oldPath) upgrading = true } else if !os.IsNotExist(err) { // Something corrupt in the old state file return err } else { // We are doing a normal restore err := r.stateDB.View(func(tx *bolt.Tx) error { bkt, err := getAllocationBucket(tx, r.allocID) if err != nil { return fmt.Errorf("failed to get allocation bucket: %v", err) } // Get the state objects var mutable allocRunnerMutableState var immutable allocRunnerImmutableState var allocState allocRunnerAllocState var allocDir allocdir.AllocDir if err := getObject(bkt, allocRunnerStateAllocKey, &allocState); err != nil { return fmt.Errorf("failed to read alloc runner alloc state: %v", err) } if err := getObject(bkt, allocRunnerStateImmutableKey, &immutable); err != nil { return fmt.Errorf("failed to read alloc runner immutable state: %v", err) } if err := getObject(bkt, allocRunnerStateMutableKey, &mutable); err != nil { return fmt.Errorf("failed to read alloc runner mutable state: %v", err) } if err := getObject(bkt, allocRunnerStateAllocDirKey, &allocDir); err != nil { return fmt.Errorf("failed to read alloc runner alloc_dir state: %v", err) } // Populate the fields r.alloc = allocState.Alloc r.allocDir = &allocDir r.allocClientStatus = mutable.AllocClientStatus r.allocClientDescription = mutable.AllocClientDescription r.taskStates = mutable.TaskStates r.alloc.ClientStatus = getClientStatus(r.taskStates) r.alloc.DeploymentStatus = mutable.DeploymentStatus return nil }) if err != nil { return fmt.Errorf("failed to read allocation state: %v", err) } } var snapshotErrors multierror.Error if r.alloc == nil { snapshotErrors.Errors = append(snapshotErrors.Errors, fmt.Errorf("alloc_runner snapshot includes a nil allocation")) } if r.allocDir == nil { snapshotErrors.Errors = append(snapshotErrors.Errors, fmt.Errorf("alloc_runner snapshot includes a nil alloc dir")) } if e := snapshotErrors.ErrorOrNil(); e != nil { return e } tg := r.alloc.Job.LookupTaskGroup(r.alloc.TaskGroup) if tg == nil { return fmt.Errorf("restored allocation doesn't contain task group %q", r.alloc.TaskGroup) } // Restore the task runners taskDestroyEvent := structs.NewTaskEvent(structs.TaskKilled) var mErr multierror.Error for _, task := range tg.Tasks { name := task.Name state := r.taskStates[name] // Nomad exited before task could start, nothing to restore. // AllocRunner.Run will start a new TaskRunner for this task if state == nil { continue } // Mark the task as restored. r.restored[name] = struct{}{} td, ok := r.allocDir.TaskDirs[name] if !ok { // Create the task dir metadata if it doesn't exist. // Since task dirs are created during r.Run() the // client may save state and exit before all task dirs // are created td = r.allocDir.NewTaskDir(name) } // Skip tasks in terminal states. if state.State == structs.TaskStateDead { continue } tr := NewTaskRunner(r.logger, r.config, r.stateDB, r.setTaskState, td, r.Alloc(), task, r.vaultClient, r.consulClient) r.tasks[name] = tr if restartReason, err := tr.RestoreState(); err != nil { r.logger.Printf("[ERR] client: failed to restore state for alloc %s task %q: %v", r.allocID, name, err) mErr.Errors = append(mErr.Errors, err) } else if !r.alloc.TerminalStatus() { // Only start if the alloc isn't in a terminal status. go tr.Run() if upgrading { if err := tr.SaveState(); err != nil { r.logger.Printf("[WARN] client: initial save state for alloc %s task %s failed: %v", r.allocID, name, err) } } // Restart task runner if RestoreState gave a reason if restartReason != "" { r.logger.Printf("[INFO] client: restarting alloc %s task %s: %v", r.allocID, name, restartReason) const failure = false tr.Restart("upgrade", restartReason, failure) } } else { tr.Destroy(taskDestroyEvent) } } return mErr.ErrorOrNil() } // SaveState is used to snapshot the state of the alloc runner // if the fullSync is marked as false only the state of the Alloc Runner // is snapshotted. If fullSync is marked as true, we snapshot // all the Task Runners associated with the Alloc func (r *AllocRunner) SaveState() error { if err := r.saveAllocRunnerState(); err != nil { return err } // Save state for each task runners := r.getTaskRunners() var mErr multierror.Error for _, tr := range runners { if err := tr.SaveState(); err != nil { mErr.Errors = append(mErr.Errors, fmt.Errorf("failed to save state for alloc %s task %q: %v", r.allocID, tr.task.Name, err)) } } return mErr.ErrorOrNil() } func (r *AllocRunner) saveAllocRunnerState() error { r.allocStateLock.Lock() defer r.allocStateLock.Unlock() if r.ctx.Err() == context.Canceled { return nil } // Grab all the relevant data alloc := r.Alloc() r.allocLock.Lock() allocClientStatus := r.allocClientStatus allocClientDescription := r.allocClientDescription r.allocLock.Unlock() r.allocDirLock.Lock() allocDir := r.allocDir.Copy() r.allocDirLock.Unlock() // Start the transaction. return r.stateDB.Batch(func(tx *bolt.Tx) error { // Grab the allocation bucket allocBkt, err := getAllocationBucket(tx, r.allocID) if err != nil { return fmt.Errorf("failed to retrieve allocation bucket: %v", err) } // Write the allocation if the eval has changed r.persistedEvalLock.Lock() lastPersisted := r.persistedEval r.persistedEvalLock.Unlock() if alloc.EvalID != lastPersisted { allocState := &allocRunnerAllocState{ Alloc: alloc, } if err := putObject(allocBkt, allocRunnerStateAllocKey, &allocState); err != nil { return fmt.Errorf("failed to write alloc_runner alloc state: %v", err) } tx.OnCommit(func() { r.persistedEvalLock.Lock() r.persistedEval = alloc.EvalID r.persistedEvalLock.Unlock() }) } // Write immutable data iff it hasn't been written yet if !r.immutablePersisted { immutable := &allocRunnerImmutableState{ Version: r.config.Version.VersionNumber(), } if err := putObject(allocBkt, allocRunnerStateImmutableKey, &immutable); err != nil { return fmt.Errorf("failed to write alloc_runner immutable state: %v", err) } tx.OnCommit(func() { r.immutablePersisted = true }) } // Write the alloc dir data if it hasn't been written before and it exists. if !r.allocDirPersisted && allocDir != nil { if err := putObject(allocBkt, allocRunnerStateAllocDirKey, allocDir); err != nil { return fmt.Errorf("failed to write alloc_runner allocDir state: %v", err) } tx.OnCommit(func() { r.allocDirPersisted = true }) } // Write the mutable state every time mutable := &allocRunnerMutableState{ AllocClientStatus: allocClientStatus, AllocClientDescription: allocClientDescription, TaskStates: alloc.TaskStates, DeploymentStatus: alloc.DeploymentStatus, } if err := putObject(allocBkt, allocRunnerStateMutableKey, &mutable); err != nil { return fmt.Errorf("failed to write alloc_runner mutable state: %v", err) } return nil }) } // DestroyState is used to cleanup after ourselves func (r *AllocRunner) DestroyState() error { r.allocStateLock.Lock() defer r.allocStateLock.Unlock() return r.stateDB.Update(func(tx *bolt.Tx) error { if err := deleteAllocationBucket(tx, r.allocID); err != nil { return fmt.Errorf("failed to delete allocation bucket: %v", err) } return nil }) } // DestroyContext is used to destroy the context func (r *AllocRunner) DestroyContext() error { return r.allocDir.Destroy() } // GetAllocDir returns the alloc dir for the alloc runner func (r *AllocRunner) GetAllocDir() *allocdir.AllocDir { return r.allocDir } // GetListener returns a listener for updates broadcast by this alloc runner. // Callers are responsible for calling Close on their Listener. func (r *AllocRunner) GetListener() *cstructs.AllocListener { return r.allocBroadcast.Listen() } // copyTaskStates returns a copy of the passed task states. func copyTaskStates(states map[string]*structs.TaskState) map[string]*structs.TaskState { copy := make(map[string]*structs.TaskState, len(states)) for task, state := range states { copy[task] = state.Copy() } return copy } // finalizeTerminalAlloc sets any missing required fields like // finishedAt in the alloc runner's task States. finishedAt is used // to calculate reschedule time for failed allocs, so we make sure that // it is set func (r *AllocRunner) finalizeTerminalAlloc(alloc *structs.Allocation) { if !alloc.ClientTerminalStatus() { return } r.taskStatusLock.Lock() defer r.taskStatusLock.Unlock() group := alloc.Job.LookupTaskGroup(alloc.TaskGroup) if r.taskStates == nil { r.taskStates = make(map[string]*structs.TaskState) } now := time.Now() for _, task := range group.Tasks { ts, ok := r.taskStates[task.Name] if !ok { ts = &structs.TaskState{} r.taskStates[task.Name] = ts } if ts.FinishedAt.IsZero() { ts.FinishedAt = now } } alloc.TaskStates = copyTaskStates(r.taskStates) } // Alloc returns the associated allocation func (r *AllocRunner) Alloc() *structs.Allocation { r.allocLock.Lock() // Don't do a deep copy of the job alloc := r.alloc.CopySkipJob() // The status has explicitly been set. if r.allocClientStatus != "" || r.allocClientDescription != "" { alloc.ClientStatus = r.allocClientStatus alloc.ClientDescription = r.allocClientDescription // Copy over the task states so we don't lose them r.taskStatusLock.RLock() alloc.TaskStates = copyTaskStates(r.taskStates) r.taskStatusLock.RUnlock() r.allocLock.Unlock() r.finalizeTerminalAlloc(alloc) return alloc } // The health has been set if r.allocHealth != nil { if alloc.DeploymentStatus == nil { alloc.DeploymentStatus = &structs.AllocDeploymentStatus{} } alloc.DeploymentStatus.Healthy = helper.BoolToPtr(*r.allocHealth) alloc.DeploymentStatus.Timestamp = r.allocHealthTime } r.allocLock.Unlock() // Scan the task states to determine the status of the alloc r.taskStatusLock.RLock() alloc.TaskStates = copyTaskStates(r.taskStates) alloc.ClientStatus = getClientStatus(r.taskStates) r.taskStatusLock.RUnlock() // If the client status is failed and we are part of a deployment, mark the // alloc as unhealthy. This guards against the watcher not be started. r.allocLock.Lock() if alloc.ClientStatus == structs.AllocClientStatusFailed && alloc.DeploymentID != "" && !alloc.DeploymentStatus.IsUnhealthy() { alloc.DeploymentStatus = &structs.AllocDeploymentStatus{ Healthy: helper.BoolToPtr(false), } } r.allocLock.Unlock() r.finalizeTerminalAlloc(alloc) return alloc } // getClientStatus takes in the task states for a given allocation and computes // the client status func getClientStatus(taskStates map[string]*structs.TaskState) string { var pending, running, dead, failed bool for _, state := range taskStates { switch state.State { case structs.TaskStateRunning: running = true case structs.TaskStatePending: pending = true case structs.TaskStateDead: if state.Failed { failed = true } else { dead = true } } } // Determine the alloc status if failed { return structs.AllocClientStatusFailed } else if running { return structs.AllocClientStatusRunning } else if pending { return structs.AllocClientStatusPending } else if dead { return structs.AllocClientStatusComplete } return "" } // dirtySyncState is used to watch for state being marked dirty to sync func (r *AllocRunner) dirtySyncState() { for { select { case <-r.dirtyCh: if err := r.syncStatus(); err != nil { // Only WARN instead of ERR because we continue on r.logger.Printf("[WARN] client: error persisting alloc %q state: %v", r.allocID, err) } case <-r.ctx.Done(): return } } } // syncStatus is used to run and sync the status when it changes func (r *AllocRunner) syncStatus() error { // Get a copy of our alloc, update status server side and sync to disk alloc := r.Alloc() r.updater(alloc) r.sendBroadcast(alloc) return r.saveAllocRunnerState() } // sendBroadcast broadcasts an alloc update. func (r *AllocRunner) sendBroadcast(alloc *structs.Allocation) { // Try to send the alloc up to three times with a delay to allow recovery. sent := false for i := 0; i < 3; i++ { if sent = r.allocBroadcast.Send(alloc); sent { break } time.Sleep(500 * time.Millisecond) } if !sent { r.logger.Printf("[WARN] client: failed to broadcast update to allocation %q", r.allocID) } } // setStatus is used to update the allocation status func (r *AllocRunner) setStatus(status, desc string) { r.allocLock.Lock() r.allocClientStatus = status r.allocClientDescription = desc r.allocLock.Unlock() select { case r.dirtyCh <- struct{}{}: default: } } // setTaskState is used to set the status of a task. If lazySync is set then the // event is appended but not synced with the server. If state is omitted, the // last known state is used. func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEvent, lazySync bool) { r.taskStatusLock.Lock() defer r.taskStatusLock.Unlock() taskState, ok := r.taskStates[taskName] if !ok { taskState = &structs.TaskState{} r.taskStates[taskName] = taskState } // Set the tasks state. if event != nil { if event.FailsTask { taskState.Failed = true } if event.Type == structs.TaskRestarting { if !r.config.DisableTaggedMetrics { metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"}, 1, r.baseLabels) } if r.config.BackwardsCompatibleMetrics { metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1) } taskState.Restarts++ taskState.LastRestart = time.Unix(0, event.Time) } r.appendTaskEvent(taskState, event) } if lazySync { return } // If the state hasn't been set use the existing state. if state == "" { state = taskState.State if taskState.State == "" { state = structs.TaskStatePending } } switch state { case structs.TaskStateRunning: // Capture the start time if it is just starting if taskState.State != structs.TaskStateRunning { taskState.StartedAt = time.Now().UTC() if !r.config.DisableTaggedMetrics { metrics.IncrCounterWithLabels([]string{"client", "allocs", "running"}, 1, r.baseLabels) } if r.config.BackwardsCompatibleMetrics { metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1) } } case structs.TaskStateDead: // Capture the finished time if not already set if taskState.FinishedAt.IsZero() { taskState.FinishedAt = time.Now().UTC() } // Find all tasks that are not the one that is dead and check if the one // that is dead is a leader var otherTaskRunners []*TaskRunner var otherTaskNames []string leader := false for task, tr := range r.tasks { if task != taskName { otherTaskRunners = append(otherTaskRunners, tr) otherTaskNames = append(otherTaskNames, task) } else if tr.task.Leader { leader = true } } // Emitting metrics to indicate task complete and failures if taskState.Failed { if !r.config.DisableTaggedMetrics { metrics.IncrCounterWithLabels([]string{"client", "allocs", "failed"}, 1, r.baseLabels) } if r.config.BackwardsCompatibleMetrics { metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1) } } else { if !r.config.DisableTaggedMetrics { metrics.IncrCounterWithLabels([]string{"client", "allocs", "complete"}, 1, r.baseLabels) } if r.config.BackwardsCompatibleMetrics { metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1) } } // If the task failed, we should kill all the other tasks in the task group. if taskState.Failed { for _, tr := range otherTaskRunners { tr.Destroy(structs.NewTaskEvent(structs.TaskSiblingFailed).SetFailedSibling(taskName)) } if len(otherTaskRunners) > 0 { r.logger.Printf("[DEBUG] client: task %q failed, destroying other tasks in task group: %v", taskName, otherTaskNames) } } else if leader { // If the task was a leader task we should kill all the other tasks. for _, tr := range otherTaskRunners { tr.Destroy(structs.NewTaskEvent(structs.TaskLeaderDead)) } if len(otherTaskRunners) > 0 { r.logger.Printf("[DEBUG] client: leader task %q is dead, destroying other tasks in task group: %v", taskName, otherTaskNames) } } } // Store the new state taskState.State = state select { case r.dirtyCh <- struct{}{}: default: } } // appendTaskEvent updates the task status by appending the new event. func (r *AllocRunner) appendTaskEvent(state *structs.TaskState, event *structs.TaskEvent) { capacity := 10 if state.Events == nil { state.Events = make([]*structs.TaskEvent, 0, capacity) } // If we hit capacity, then shift it. if len(state.Events) == capacity { old := state.Events state.Events = make([]*structs.TaskEvent, 0, capacity) state.Events = append(state.Events, old[1:]...) } state.Events = append(state.Events, event) } // Run is a long running goroutine used to manage an allocation func (r *AllocRunner) Run() { defer close(r.waitCh) r.setBaseLabels() go r.dirtySyncState() // Find the task group to run in the allocation alloc := r.Alloc() tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup) if tg == nil { r.logger.Printf("[ERR] client: alloc %q for missing task group %q", r.allocID, alloc.TaskGroup) r.setStatus(structs.AllocClientStatusFailed, fmt.Sprintf("missing task group '%s'", alloc.TaskGroup)) return } // Build allocation directory (idempotent) r.allocDirLock.Lock() err := r.allocDir.Build() r.allocDirLock.Unlock() if err != nil { r.logger.Printf("[ERR] client: alloc %q failed to build task directories: %v", r.allocID, err) r.setStatus(structs.AllocClientStatusFailed, fmt.Sprintf("failed to build task dirs for '%s'", alloc.TaskGroup)) return } // Wait for a previous alloc - if any - to terminate if err := r.prevAlloc.Wait(r.ctx); err != nil { if err == context.Canceled { return } r.setStatus(structs.AllocClientStatusFailed, fmt.Sprintf("error while waiting for previous alloc to terminate: %v", err)) return } // Wait for data to be migrated from a previous alloc if applicable if err := r.prevAlloc.Migrate(r.ctx, r.allocDir); err != nil { if err == context.Canceled { return } // Soft-fail on migration errors r.logger.Printf("[WARN] client: alloc %q error while migrating data from previous alloc: %v", r.allocID, err) // Recreate alloc dir to ensure a clean slate r.allocDir.Destroy() if err := r.allocDir.Build(); err != nil { r.logger.Printf("[ERR] client: alloc %q failed to clean task directories after failed migration: %v", r.allocID, err) r.setStatus(structs.AllocClientStatusFailed, fmt.Sprintf("failed to rebuild task dirs for '%s'", alloc.TaskGroup)) return } } // Check if the allocation is in a terminal status. In this case, we don't // start any of the task runners and directly wait for the destroy signal to // clean up the allocation. if alloc.TerminalStatus() { r.logger.Printf("[DEBUG] client: alloc %q in terminal status, waiting for destroy", r.allocID) // mark this allocation as completed if it is not already in a // terminal state if !alloc.Terminated() { r.setStatus(structs.AllocClientStatusComplete, "canceled running tasks for allocation in terminal state") } r.handleDestroy() r.logger.Printf("[DEBUG] client: terminating runner for alloc '%s'", r.allocID) return } // Increment alloc runner start counter. Incr'd even when restoring existing tasks so 1 start != 1 task execution if !r.config.DisableTaggedMetrics { metrics.IncrCounterWithLabels([]string{"client", "allocs", "start"}, 1, r.baseLabels) } if r.config.BackwardsCompatibleMetrics { metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1) } // Start the watcher wCtx, watcherCancel := context.WithCancel(r.ctx) go r.watchHealth(wCtx) // Start the task runners r.logger.Printf("[DEBUG] client: starting task runners for alloc '%s'", r.allocID) r.taskLock.Lock() for _, task := range tg.Tasks { if _, ok := r.restored[task.Name]; ok { continue } r.allocDirLock.Lock() taskdir := r.allocDir.NewTaskDir(task.Name) r.allocDirLock.Unlock() tr := NewTaskRunner(r.logger, r.config, r.stateDB, r.setTaskState, taskdir, r.Alloc(), task.Copy(), r.vaultClient, r.consulClient) r.tasks[task.Name] = tr tr.MarkReceived() go tr.Run() } r.taskLock.Unlock() // taskDestroyEvent contains an event that caused the destruction of a task // in the allocation. var taskDestroyEvent *structs.TaskEvent OUTER: // Wait for updates for { select { case update := <-r.updateCh: // Store the updated allocation. r.allocLock.Lock() // If the deployment ids have changed clear the health if r.alloc.DeploymentID != update.DeploymentID { r.allocHealth = nil r.allocHealthTime = time.Time{} } r.alloc = update r.allocLock.Unlock() // Create a new watcher watcherCancel() wCtx, watcherCancel = context.WithCancel(r.ctx) go r.watchHealth(wCtx) // Check if we're in a terminal status if update.TerminalStatus() { taskDestroyEvent = structs.NewTaskEvent(structs.TaskKilled) break OUTER } // Update the task groups runners := r.getTaskRunners() for _, tr := range runners { tr.Update(update) } if err := r.syncStatus(); err != nil { r.logger.Printf("[WARN] client: failed to sync alloc %q status upon receiving alloc update: %v", r.allocID, err) } case <-r.ctx.Done(): taskDestroyEvent = structs.NewTaskEvent(structs.TaskKilled) break OUTER } } // Kill the task runners r.destroyTaskRunners(taskDestroyEvent) // Block until we should destroy the state of the alloc r.handleDestroy() // Free up the context. It has likely exited already watcherCancel() r.logger.Printf("[DEBUG] client: terminating runner for alloc '%s'", r.allocID) } // destroyTaskRunners destroys the task runners, waits for them to terminate and // then saves state. func (r *AllocRunner) destroyTaskRunners(destroyEvent *structs.TaskEvent) { // First destroy the leader if one exists tg := r.alloc.Job.LookupTaskGroup(r.alloc.TaskGroup) leader := "" for _, task := range tg.Tasks { if task.Leader { leader = task.Name break } } if leader != "" { r.taskLock.RLock() tr := r.tasks[leader] r.taskLock.RUnlock() // Dead tasks don't have a task runner created so guard against // the leader being dead when this AR was saved. if tr == nil { r.logger.Printf("[DEBUG] client: alloc %q leader task %q of task group %q already stopped", r.allocID, leader, r.alloc.TaskGroup) } else { r.logger.Printf("[DEBUG] client: alloc %q destroying leader task %q of task group %q first", r.allocID, leader, r.alloc.TaskGroup) tr.Destroy(destroyEvent) <-tr.WaitCh() } } // Then destroy non-leader tasks concurrently r.taskLock.RLock() for name, tr := range r.tasks { if name != leader { tr.Destroy(destroyEvent) } } r.taskLock.RUnlock() // Wait for termination of the task runners for _, tr := range r.getTaskRunners() { <-tr.WaitCh() } } // handleDestroy blocks till the AllocRunner should be destroyed and does the // necessary cleanup. func (r *AllocRunner) handleDestroy() { // Final state sync. We do this to ensure that the server has the correct // state as we wait for a destroy. alloc := r.Alloc() // Increment the destroy count for this alloc runner since this allocation is being removed from this client. if !r.config.DisableTaggedMetrics { metrics.IncrCounterWithLabels([]string{"client", "allocs", "destroy"}, 1, r.baseLabels) } if r.config.BackwardsCompatibleMetrics { metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "destroy"}, 1) } // Broadcast and persist state synchronously r.sendBroadcast(alloc) if err := r.saveAllocRunnerState(); err != nil { r.logger.Printf("[WARN] client: alloc %q unable to persist state but should be GC'd soon anyway:%v", r.allocID, err) } // Unmount any mounted directories as no tasks are running and makes // cleaning up Nomad's data directory simpler. if err := r.allocDir.UnmountAll(); err != nil { r.logger.Printf("[ERR] client: alloc %q unable unmount task directories: %v", r.allocID, err) } // Update the server with the alloc's status -- also marks the alloc as // being eligible for GC, so from this point on the alloc can be gc'd // at any time. r.updater(alloc) for { select { case <-r.ctx.Done(): if err := r.DestroyContext(); err != nil { r.logger.Printf("[ERR] client: failed to destroy context for alloc '%s': %v", r.allocID, err) } if err := r.DestroyState(); err != nil { r.logger.Printf("[ERR] client: failed to destroy state for alloc '%s': %v", r.allocID, err) } return case <-r.updateCh: r.logger.Printf("[DEBUG] client: dropping update to terminal alloc '%s'", r.allocID) } } } // IsWaiting returns true if this alloc is waiting on a previous allocation to // terminate. func (r *AllocRunner) IsWaiting() bool { return r.prevAlloc.IsWaiting() } // IsMigrating returns true if this alloc is migrating data from a previous // allocation. func (r *AllocRunner) IsMigrating() bool { return r.prevAlloc.IsMigrating() } // Update is used to update the allocation of the context func (r *AllocRunner) Update(update *structs.Allocation) { select { case r.updateCh <- update: default: r.logger.Printf("[ERR] client: dropping update to alloc '%s'", update.ID) } } // StatsReporter returns an interface to query resource usage statistics of an // allocation func (r *AllocRunner) StatsReporter() AllocStatsReporter { return r } // getTaskRunners is a helper that returns a copy of the task runners list using // the taskLock. func (r *AllocRunner) getTaskRunners() []*TaskRunner { // Get the task runners r.taskLock.RLock() defer r.taskLock.RUnlock() runners := make([]*TaskRunner, 0, len(r.tasks)) for _, tr := range r.tasks { runners = append(runners, tr) } return runners } // LatestAllocStats returns the latest allocation stats. If the optional taskFilter is set // the allocation stats will only include the given task. func (r *AllocRunner) LatestAllocStats(taskFilter string) (*cstructs.AllocResourceUsage, error) { astat := &cstructs.AllocResourceUsage{ Tasks: make(map[string]*cstructs.TaskResourceUsage), } var flat []*cstructs.TaskResourceUsage if taskFilter != "" { r.taskLock.RLock() tr, ok := r.tasks[taskFilter] r.taskLock.RUnlock() if !ok { return nil, fmt.Errorf("allocation %q has no task %q", r.allocID, taskFilter) } l := tr.LatestResourceUsage() if l != nil { astat.Tasks[taskFilter] = l flat = []*cstructs.TaskResourceUsage{l} astat.Timestamp = l.Timestamp } } else { // Get the task runners runners := r.getTaskRunners() for _, tr := range runners { l := tr.LatestResourceUsage() if l != nil { astat.Tasks[tr.task.Name] = l flat = append(flat, l) if l.Timestamp > astat.Timestamp { astat.Timestamp = l.Timestamp } } } } astat.ResourceUsage = sumTaskResourceUsage(flat) return astat, nil } // sumTaskResourceUsage takes a set of task resources and sums their resources func sumTaskResourceUsage(usages []*cstructs.TaskResourceUsage) *cstructs.ResourceUsage { summed := &cstructs.ResourceUsage{ MemoryStats: &cstructs.MemoryStats{}, CpuStats: &cstructs.CpuStats{}, } for _, usage := range usages { summed.Add(usage.ResourceUsage) } return summed } // shouldUpdate takes the AllocModifyIndex of an allocation sent from the server and // checks if the current running allocation is behind and should be updated. func (r *AllocRunner) shouldUpdate(serverIndex uint64) bool { r.allocLock.Lock() defer r.allocLock.Unlock() return r.alloc.AllocModifyIndex < serverIndex } // Destroy is used to indicate that the allocation context should be destroyed func (r *AllocRunner) Destroy() { // Lock when closing the context as that gives the save state code // serialization. r.allocStateLock.Lock() defer r.allocStateLock.Unlock() r.exitFn() r.allocBroadcast.Close() } // IsDestroyed returns true if the AllocRunner is not running and has been // destroyed (GC'd). func (r *AllocRunner) IsDestroyed() bool { select { case <-r.waitCh: return true default: return false } } // WaitCh returns a channel to wait for termination func (r *AllocRunner) WaitCh() <-chan struct{} { return r.waitCh }