open-nomad/client/alloc_runner.go

424 lines
11 KiB
Go
Raw Normal View History

2015-08-23 22:06:47 +00:00
package client
2015-08-23 22:15:48 +00:00
import (
2015-08-29 22:46:10 +00:00
"encoding/json"
"fmt"
2015-08-23 22:15:48 +00:00
"log"
"os"
"path/filepath"
2015-08-23 22:15:48 +00:00
"sync"
2015-08-29 22:46:10 +00:00
"time"
2015-08-23 22:15:48 +00:00
"github.com/hashicorp/go-multierror"
"github.com/hashicorp/nomad/client/allocdir"
"github.com/hashicorp/nomad/client/config"
2015-08-23 23:49:48 +00:00
"github.com/hashicorp/nomad/client/driver"
2015-08-23 22:15:48 +00:00
"github.com/hashicorp/nomad/nomad/structs"
)
2015-08-23 22:06:47 +00:00
2015-08-29 22:46:10 +00:00
// allocSyncRetryIntv is the base interval between attempts to push the
// allocation status again after a failed sync.
const allocSyncRetryIntv = 15 * time.Second
// taskStatus records the most recently reported state of a single task.
type taskStatus struct {
	// Status is the reported state of the task.
	Status string

	// Description is the free-form text reported together with Status.
	Description string
}
// AllocStateUpdater is the callback used to publish the current state of an
// allocation. It returns a non-nil error when the update could not be applied.
type AllocStateUpdater func(alloc *structs.Allocation) error
// AllocRunner is used to wrap an allocation and provide the execution context.
type AllocRunner struct {
config *config.Config
updater AllocStateUpdater
logger *log.Logger
2015-08-23 22:15:48 +00:00
alloc *structs.Allocation
2015-08-29 22:46:10 +00:00
dirtyCh chan struct{}
ctx *driver.ExecContext
tasks map[string]*TaskRunner
RestartPolicy *structs.RestartPolicy
taskLock sync.RWMutex
2015-08-23 23:49:48 +00:00
2015-08-29 22:46:10 +00:00
taskStatus map[string]taskStatus
taskStatusLock sync.RWMutex
2015-08-23 22:15:48 +00:00
updateCh chan *structs.Allocation
destroy bool
destroyCh chan struct{}
destroyLock sync.Mutex
2015-10-04 20:36:03 +00:00
waitCh chan struct{}
2015-08-23 22:06:47 +00:00
}
// allocRunnerState is the snapshot of an AllocRunner that is written to and
// read back from the runner's state file.
type allocRunnerState struct {
	// Alloc is the allocation managed by the runner.
	Alloc *structs.Allocation

	// RestartPolicy is the restart policy in effect for the runner.
	RestartPolicy *structs.RestartPolicy

	// TaskStatus holds the last reported status of every task, by name.
	TaskStatus map[string]taskStatus

	// Context is the execution context shared by the task runners.
	Context *driver.ExecContext
}
// NewAllocRunner is used to create a new allocation context
func NewAllocRunner(logger *log.Logger, config *config.Config, updater AllocStateUpdater, alloc *structs.Allocation) *AllocRunner {
2015-08-30 02:14:47 +00:00
ar := &AllocRunner{
config: config,
updater: updater,
logger: logger,
2015-08-29 22:46:10 +00:00
alloc: alloc,
dirtyCh: make(chan struct{}, 1),
tasks: make(map[string]*TaskRunner),
taskStatus: make(map[string]taskStatus),
updateCh: make(chan *structs.Allocation, 8),
destroyCh: make(chan struct{}),
2015-10-04 20:36:03 +00:00
waitCh: make(chan struct{}),
2015-08-23 22:06:47 +00:00
}
2015-08-30 02:14:47 +00:00
return ar
2015-08-23 22:06:47 +00:00
}
// stateFilePath returns the location of this runner's snapshot:
// <StateDir>/alloc/<alloc ID>/state.json.
func (r *AllocRunner) stateFilePath() string {
	allocStateDir := filepath.Join(r.config.StateDir, "alloc", r.alloc.ID)
	return filepath.Join(allocStateDir, "state.json")
}
// RestoreState is used to restore the state of the alloc runner
func (r *AllocRunner) RestoreState() error {
// Load the snapshot
var snap allocRunnerState
if err := restoreState(r.stateFilePath(), &snap); err != nil {
return err
}
// Restore fields
r.alloc = snap.Alloc
r.RestartPolicy = snap.RestartPolicy
r.taskStatus = snap.TaskStatus
2015-08-30 02:14:47 +00:00
r.ctx = snap.Context
// Restore the task runners
var mErr multierror.Error
2015-11-09 23:55:31 +00:00
for name, status := range r.taskStatus {
task := &structs.Task{Name: name}
2015-11-06 01:30:41 +00:00
restartTracker := newRestartTracker(r.alloc.Job.Type, r.RestartPolicy)
tr := NewTaskRunner(r.logger, r.config, r.setTaskStatus, r.ctx, r.alloc.ID, task, restartTracker)
r.tasks[name] = tr
2015-11-09 23:55:31 +00:00
// Skip tasks in terminal states.
if status.Status == structs.AllocClientStatusDead ||
status.Status == structs.AllocClientStatusFailed {
continue
}
if err := tr.RestoreState(); err != nil {
r.logger.Printf("[ERR] client: failed to restore state for alloc %s task '%s': %v", r.alloc.ID, name, err)
mErr.Errors = append(mErr.Errors, err)
} else {
go tr.Run()
}
}
return mErr.ErrorOrNil()
}
2015-11-10 00:15:11 +00:00
// SaveState is used to snapshot the state of the alloc runner
// if the fullSync is marked as false only the state of the Alloc Runner
// is snapshotted. If fullSync is marked as true, we snapshot
// all the Task Runners associated with the Alloc
func (r *AllocRunner) SaveState() error {
if err := r.saveAllocRunnerState(); err != nil {
return err
}
2015-11-10 00:15:11 +00:00
// Save state for each task
r.taskLock.RLock()
defer r.taskLock.RUnlock()
var mErr multierror.Error
for _, tr := range r.tasks {
2015-11-10 00:15:11 +00:00
if err := r.saveTaskRunnerState(tr); err != nil {
mErr.Errors = append(mErr.Errors, err)
}
}
return mErr.ErrorOrNil()
}
// saveAllocRunnerState writes the runner-level snapshot (allocation, restart
// policy, task statuses and execution context) to the state file. The task
// status read lock is held until the write has finished.
func (r *AllocRunner) saveAllocRunnerState() error {
	r.taskStatusLock.RLock()
	defer r.taskStatusLock.RUnlock()

	var snap allocRunnerState
	snap.Alloc = r.alloc
	snap.RestartPolicy = r.RestartPolicy
	snap.TaskStatus = r.taskStatus
	snap.Context = r.ctx
	return persistState(r.stateFilePath(), &snap)
}
2015-11-10 00:15:11 +00:00
func (r *AllocRunner) saveTaskRunnerState(tr *TaskRunner) error {
var err error
if err = tr.SaveState(); err != nil {
r.logger.Printf("[ERR] client: failed to save state for alloc %s task '%s': %v",
r.alloc.ID, tr.task.Name, err)
}
2015-11-10 00:15:11 +00:00
return err
}
// DestroyState removes the directory that holds this runner's state file,
// including everything inside it.
func (r *AllocRunner) DestroyState() error {
	stateDir := filepath.Dir(r.stateFilePath())
	return os.RemoveAll(stateDir)
}
2015-08-31 00:35:58 +00:00
// DestroyContext is used to destroy the context
func (r *AllocRunner) DestroyContext() error {
return r.ctx.AllocDir.Destroy()
2015-08-31 00:35:58 +00:00
}
2015-08-23 22:06:47 +00:00
// Alloc returns the associated allocation
2015-08-23 22:36:06 +00:00
func (r *AllocRunner) Alloc() *structs.Allocation {
return r.alloc
2015-08-23 22:06:47 +00:00
}
2015-08-29 22:46:10 +00:00
// setAlloc replaces the runner's allocation with alloc. The client status
// and client description of the previous allocation, if there is one, are
// copied onto alloc first so that they survive the swap.
func (r *AllocRunner) setAlloc(alloc *structs.Allocation) {
	if prev := r.alloc; prev != nil {
		alloc.ClientStatus, alloc.ClientDescription = prev.ClientStatus, prev.ClientDescription
	}
	r.alloc = alloc
}
2015-08-31 00:10:17 +00:00
// dirtySyncState is used to watch for state being marked dirty to sync
func (r *AllocRunner) dirtySyncState() {
2015-08-29 22:46:10 +00:00
for {
select {
case <-r.dirtyCh:
2015-08-31 00:10:17 +00:00
r.retrySyncState(r.destroyCh)
2015-08-29 22:46:10 +00:00
case <-r.destroyCh:
return
}
2015-08-31 00:10:17 +00:00
}
}
2015-08-29 22:46:10 +00:00
2015-08-31 00:10:17 +00:00
// retrySyncState is used to retry the state sync until success
func (r *AllocRunner) retrySyncState(stopCh chan struct{}) {
for {
2015-11-10 00:45:42 +00:00
if err := r.syncStatus(); err == nil {
2015-11-10 00:15:11 +00:00
// The Alloc State might have been re-computed so we are
// snapshoting only the alloc runner
r.saveAllocRunnerState()
2015-08-31 00:10:17 +00:00
return
2015-08-29 22:46:10 +00:00
}
2015-08-31 00:10:17 +00:00
select {
case <-time.After(allocSyncRetryIntv + randomStagger(allocSyncRetryIntv)):
case <-stopCh:
return
2015-08-29 22:46:10 +00:00
}
2015-08-31 00:10:17 +00:00
}
}
2015-08-29 22:46:10 +00:00
2015-08-31 00:10:17 +00:00
// syncStatus is used to run and sync the status when it changes
func (r *AllocRunner) syncStatus() error {
// Scan the task status to termine the status of the alloc
var pending, running, dead, failed bool
r.taskStatusLock.RLock()
pending = len(r.taskStatus) < len(r.tasks)
for _, status := range r.taskStatus {
switch status.Status {
case structs.AllocClientStatusRunning:
running = true
case structs.AllocClientStatusDead:
dead = true
case structs.AllocClientStatusFailed:
failed = true
2015-08-29 22:46:10 +00:00
}
}
2015-08-31 00:10:17 +00:00
if len(r.taskStatus) > 0 {
taskDesc, _ := json.Marshal(r.taskStatus)
r.alloc.ClientDescription = string(taskDesc)
}
r.taskStatusLock.RUnlock()
// Determine the alloc status
if failed {
r.alloc.ClientStatus = structs.AllocClientStatusFailed
} else if running {
r.alloc.ClientStatus = structs.AllocClientStatusRunning
} else if dead && !pending {
r.alloc.ClientStatus = structs.AllocClientStatusDead
}
// Attempt to update the status
if err := r.updater(r.alloc); err != nil {
r.logger.Printf("[ERR] client: failed to update alloc '%s' status to %s: %s",
r.alloc.ID, r.alloc.ClientStatus, err)
return err
}
return nil
2015-08-29 22:46:10 +00:00
}
// setStatus overwrites the client status and description of the allocation
// and marks the state dirty so that it gets synced.
func (r *AllocRunner) setStatus(status, desc string) {
	r.alloc.ClientStatus, r.alloc.ClientDescription = status, desc

	// Non-blocking send: if a dirty signal is already queued there is
	// nothing more to do.
	select {
	case r.dirtyCh <- struct{}{}:
	default:
	}
}
// setTaskStatus is used to set the status of a task
func (r *AllocRunner) setTaskStatus(taskName, status, desc string) {
r.taskStatusLock.Lock()
r.taskStatus[taskName] = taskStatus{
Status: status,
Description: desc,
}
2015-08-31 00:10:17 +00:00
r.taskStatusLock.Unlock()
2015-11-10 00:15:11 +00:00
if tr, ok := r.tasks[taskName]; ok {
r.saveTaskRunnerState(tr)
}
2015-08-29 22:46:10 +00:00
select {
case r.dirtyCh <- struct{}{}:
default:
}
}
2015-08-23 22:06:47 +00:00
// Run is a long running goroutine used to manage an allocation
2015-08-23 22:36:06 +00:00
func (r *AllocRunner) Run() {
2015-10-04 20:36:03 +00:00
defer close(r.waitCh)
2015-08-31 00:10:17 +00:00
go r.dirtySyncState()
2015-08-23 22:15:48 +00:00
2015-08-29 22:46:10 +00:00
// Check if the allocation is in a terminal status
2015-08-23 23:49:48 +00:00
alloc := r.alloc
2015-08-29 22:46:10 +00:00
if alloc.TerminalStatus() {
r.logger.Printf("[DEBUG] client: aborting runner for alloc '%s', terminal status", r.alloc.ID)
return
}
r.logger.Printf("[DEBUG] client: starting runner for alloc '%s'", r.alloc.ID)
// Find the task group to run in the allocation
2015-08-30 02:14:47 +00:00
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
2015-08-23 23:49:48 +00:00
if tg == nil {
r.logger.Printf("[ERR] client: alloc '%s' for missing task group '%s'", alloc.ID, alloc.TaskGroup)
2015-08-29 22:46:10 +00:00
r.setStatus(structs.AllocClientStatusFailed, fmt.Sprintf("missing task group '%s'", alloc.TaskGroup))
2015-08-23 23:49:48 +00:00
return
}
// Extract the RestartPolicy from the TG and set it on the alloc
r.RestartPolicy = tg.RestartPolicy
2015-08-23 23:49:48 +00:00
// Create the execution context
2015-08-30 02:14:47 +00:00
if r.ctx == nil {
allocDir := allocdir.NewAllocDir(filepath.Join(r.config.AllocDir, r.alloc.ID))
if err := allocDir.Build(tg.Tasks); err != nil {
r.logger.Printf("[WARN] client: failed to build task directories: %v", err)
r.setStatus(structs.AllocClientStatusFailed, fmt.Sprintf("failed to build task dirs for '%s'", alloc.TaskGroup))
return
}
r.ctx = driver.NewExecContext(allocDir, r.alloc.ID)
2015-08-30 02:14:47 +00:00
}
2015-08-23 23:49:48 +00:00
// Start the task runners
r.taskLock.Lock()
2015-08-23 23:49:48 +00:00
for _, task := range tg.Tasks {
2015-08-30 02:14:47 +00:00
// Skip tasks that were restored
2015-11-09 23:55:31 +00:00
if _, ok := r.taskStatus[task.Name]; ok {
2015-08-30 02:14:47 +00:00
continue
}
// Merge in the task resources
task.Resources = alloc.TaskResources[task.Name]
2015-11-06 01:30:41 +00:00
restartTracker := newRestartTracker(r.alloc.Job.Type, r.RestartPolicy)
tr := NewTaskRunner(r.logger, r.config, r.setTaskStatus, r.ctx, r.alloc.ID, task, restartTracker)
2015-08-23 23:49:48 +00:00
r.tasks[task.Name] = tr
go tr.Run()
}
r.taskLock.Unlock()
2015-08-23 23:49:48 +00:00
2015-08-29 22:46:10 +00:00
OUTER:
2015-08-23 23:49:48 +00:00
// Wait for updates
2015-08-23 22:06:47 +00:00
for {
2015-08-23 22:15:48 +00:00
select {
2015-08-23 22:36:06 +00:00
case update := <-r.updateCh:
2015-08-29 22:46:10 +00:00
// Check if we're in a terminal status
if update.TerminalStatus() {
r.setAlloc(update)
break OUTER
}
// Update the task groups
r.taskLock.RLock()
2015-08-29 22:46:10 +00:00
for _, task := range tg.Tasks {
tr := r.tasks[task.Name]
// Merge in the task resources
task.Resources = update.TaskResources[task.Name]
2015-08-29 22:46:10 +00:00
tr.Update(task)
}
r.taskLock.RUnlock()
2015-08-29 22:46:10 +00:00
2015-08-23 22:36:06 +00:00
case <-r.destroyCh:
2015-08-29 22:46:10 +00:00
break OUTER
2015-08-23 22:15:48 +00:00
}
2015-08-23 22:06:47 +00:00
}
2015-08-29 22:46:10 +00:00
// Destroy each sub-task
r.taskLock.RLock()
defer r.taskLock.RUnlock()
2015-08-29 22:46:10 +00:00
for _, tr := range r.tasks {
tr.Destroy()
}
// Wait for termination of the task runners
for _, tr := range r.tasks {
<-tr.WaitCh()
}
2015-08-31 00:10:17 +00:00
// Final state sync
r.retrySyncState(nil)
// Check if we should destroy our state
if r.destroy {
2015-08-31 00:35:58 +00:00
if err := r.DestroyContext(); err != nil {
r.logger.Printf("[ERR] client: failed to destroy context for alloc '%s': %v",
r.alloc.ID, err)
}
if err := r.DestroyState(); err != nil {
r.logger.Printf("[ERR] client: failed to destroy state for alloc '%s': %v",
r.alloc.ID, err)
}
}
2015-08-29 22:46:10 +00:00
r.logger.Printf("[DEBUG] client: terminating runner for alloc '%s'", r.alloc.ID)
2015-08-23 22:06:47 +00:00
}
// Update is used to update the allocation of the context
2015-08-23 22:36:06 +00:00
func (r *AllocRunner) Update(update *structs.Allocation) {
2015-08-23 22:15:48 +00:00
select {
2015-08-23 22:36:06 +00:00
case r.updateCh <- update:
2015-08-23 22:15:48 +00:00
default:
2015-08-23 22:36:06 +00:00
r.logger.Printf("[ERR] client: dropping update to alloc '%s'", update.ID)
2015-08-23 22:15:48 +00:00
}
2015-08-23 22:06:47 +00:00
}
// Destroy is used to indicate that the allocation context should be destroyed
2015-08-23 22:36:06 +00:00
func (r *AllocRunner) Destroy() {
r.destroyLock.Lock()
defer r.destroyLock.Unlock()
2015-08-23 22:15:48 +00:00
2015-08-23 22:36:06 +00:00
if r.destroy {
2015-08-23 22:15:48 +00:00
return
}
2015-08-23 22:36:06 +00:00
r.destroy = true
close(r.destroyCh)
2015-08-23 22:06:47 +00:00
}
2015-10-04 20:36:03 +00:00
// WaitCh returns a receive-only channel that is closed once Run has returned.
func (r *AllocRunner) WaitCh() <-chan struct{} {
	return r.waitCh
}