open-nomad/client/task_runner.go

package client

import (
	"crypto/md5"
	"encoding/hex"
	"fmt"
	"log"
	"os"
	"path/filepath"
	"sync"

	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/driver"
	"github.com/hashicorp/nomad/nomad/structs"
)

// TaskRunner is used to wrap a task within an allocation and provide the execution context.
// It holds the driver handle for one task together with the channels used to
// deliver task updates, request destruction, and signal that Run has returned.
type TaskRunner struct {
	// config is the client configuration; this file reads StateDir and Node from it.
	config  *config.Config
	// updater is the callback setStatus invokes to report status changes.
	updater TaskStateUpdater
	// logger receives all log output produced by the runner.
	logger  *log.Logger
	// ctx is the execution context handed to the driver's Start and Open calls.
	ctx     *driver.ExecContext
	// allocID identifies the owning allocation; used in the state path and log messages.
	allocID string

	// task is the task being managed; replaced by RestoreState and by updates
	// received in Run.
	task     *structs.Task
	// updateCh carries task updates from Update to Run (created with capacity 8).
	updateCh chan *structs.Task
	// handle is the driver handle for the task; nil until startTask or
	// RestoreState sets it.
	handle   driver.DriverHandle

	// destroy records that Destroy has already closed destroyCh.
	destroy     bool
	// destroyCh is closed by Destroy to ask Run to kill the task.
	destroyCh   chan struct{}
	// destroyLock guards destroy and the closing of destroyCh.
	destroyLock sync.Mutex
	// waitCh is closed when Run returns.
	waitCh      chan struct{}
}

// taskRunnerState is used to snapshot the state of the task runner.
// It is the value SaveState hands to persistState and RestoreState fills via
// restoreState. The state file is named "state.json", so the encoding is
// presumably JSON — confirm against persistState/restoreState.
type taskRunnerState struct {
	// Task is the task definition held by the runner at snapshot time.
	Task     *structs.Task
	// HandleID is the driver handle's ID, or empty when no handle existed.
	HandleID string
}

// TaskStateUpdater is used to update the status of a task.
// The runner calls it with the task's name, the new status (in this file
// always one of the structs.AllocClientStatus* values) and a human-readable
// description of why the status changed.
type TaskStateUpdater func(taskName, status, desc string)

// NewTaskRunner builds a TaskRunner for the given task of allocation allocID.
// The returned runner has no driver handle yet; its update channel is
// buffered (capacity 8) while the destroy and wait channels are unbuffered.
func NewTaskRunner(logger *log.Logger, config *config.Config,
	updater TaskStateUpdater, ctx *driver.ExecContext,
	allocID string, task *structs.Task) *TaskRunner {
	// Channels are created up front so the literal below stays purely declarative.
	updates := make(chan *structs.Task, 8)
	destroyed := make(chan struct{})
	done := make(chan struct{})

	return &TaskRunner{
		logger:    logger,
		config:    config,
		updater:   updater,
		ctx:       ctx,
		allocID:   allocID,
		task:      task,
		updateCh:  updates,
		destroyCh: destroyed,
		waitCh:    done,
	}
}

// WaitCh returns a channel to wait for termination.
// The channel is receive-only for callers and is closed when Run returns.
func (r *TaskRunner) WaitCh() <-chan struct{} {
	return r.waitCh
}

// stateFilePath computes where this runner's snapshot lives:
// <StateDir>/alloc/<allocID>/task-<md5 hex of task name>/state.json.
// The task name is hashed rather than used directly as the directory name.
func (r *TaskRunner) stateFilePath() string {
	digest := md5.Sum([]byte(r.task.Name))
	taskDir := fmt.Sprintf("task-%s", hex.EncodeToString(digest[:]))

	return filepath.Join(
		r.config.StateDir,
		"alloc",
		r.allocID,
		taskDir,
		"state.json",
	)
}

// RestoreState is used to restore our state. It loads the snapshot from the
// state file, replaces the runner's task with the persisted one and, when a
// driver handle ID was saved, re-opens that handle through a freshly created
// driver.
//
// It returns an error if the snapshot cannot be loaded, if the snapshot
// carries no task, if the driver cannot be created, or if the handle cannot
// be opened. On error the runner's handle is left unset.
func (r *TaskRunner) RestoreState() error {
	// Load the snapshot
	var snap taskRunnerState
	if err := restoreState(r.stateFilePath(), &snap); err != nil {
		return err
	}

	// Reject a snapshot without a task: installing a nil task would make
	// every later r.task.Name / r.task.Driver access panic.
	if snap.Task == nil {
		return fmt.Errorf("task runner snapshot for alloc '%s' includes nil Task", r.allocID)
	}
	r.task = snap.Task

	// No handle was persisted, so there is no driver state to restore.
	if snap.HandleID == "" {
		return nil
	}

	// Restore the driver (createDriver already logs its own failures).
	d, err := r.createDriver()
	if err != nil {
		return err
	}

	handle, err := d.Open(r.ctx, snap.HandleID)
	if err != nil {
		r.logger.Printf("[ERR] client: failed to open handle to task '%s' for alloc '%s': %v",
			r.task.Name, r.allocID, err)
		return err
	}
	r.handle = handle
	return nil
}

// SaveState writes a snapshot of the runner (its task and, if one exists,
// the driver handle's ID) to the state file and returns persistState's error.
func (r *TaskRunner) SaveState() error {
	// An empty handle ID marks a runner whose task has no driver handle.
	var handleID string
	if h := r.handle; h != nil {
		handleID = h.ID()
	}

	snapshot := taskRunnerState{
		Task:     r.task,
		HandleID: handleID,
	}
	return persistState(r.stateFilePath(), &snapshot)
}

// DestroyState removes whatever exists at the runner's state file path and
// returns the error from os.RemoveAll, if any.
func (r *TaskRunner) DestroyState() error {
	statePath := r.stateFilePath()
	return os.RemoveAll(statePath)
}

// setStatus is used to update the status of the task runner.
// It forwards the status and description to the configured updater callback,
// tagged with the current task's name.
func (r *TaskRunner) setStatus(status, desc string) {
	r.updater(r.task.Name, status, desc)
}

// createDriver instantiates the driver named by the task's Driver field,
// using a driver context built from the client config, its node and the
// runner's logger. On failure the error is wrapped with the driver name and
// allocation ID, logged, and returned alongside whatever NewDriver produced.
func (r *TaskRunner) createDriver() (driver.Driver, error) {
	d, err := driver.NewDriver(
		r.task.Driver,
		driver.NewDriverContext(r.config, r.config.Node, r.logger),
	)
	if err == nil {
		return d, nil
	}

	wrapped := fmt.Errorf("failed to create driver '%s' for alloc %s: %v",
		r.task.Driver, r.allocID, err)
	r.logger.Printf("[ERR] client: %s", wrapped)
	return d, wrapped
}

// startTask creates a driver and launches the task through it. On success
// the resulting handle is stored on the runner and the status is set to
// running; on any failure the status is set to failed and the error returned.
func (r *TaskRunner) startTask() error {
	// Driver creation failures are already logged by createDriver.
	d, err := r.createDriver()
	if err != nil {
		r.setStatus(structs.AllocClientStatusFailed, err.Error())
		return err
	}

	// Launch the task
	h, startErr := d.Start(r.ctx, r.task)
	if startErr != nil {
		r.logger.Printf("[ERR] client: failed to start task '%s' for alloc '%s': %v",
			r.task.Name, r.allocID, startErr)
		desc := fmt.Sprintf("failed to start: %v", startErr)
		r.setStatus(structs.AllocClientStatusFailed, desc)
		return startErr
	}

	r.handle = h
	r.setStatus(structs.AllocClientStatusRunning, "task started")
	return nil
}

// Run is a long running routine used to manage the task. It starts the task
// when no handle has been restored, then loops until the driver handle's wait
// channel fires, meanwhile applying task updates and honouring a destroy
// request by killing the task. When the task ends the status is set to dead
// and the runner's state file is removed. The wait channel returned by WaitCh
// is closed when Run returns, including when the task fails to start.
func (r *TaskRunner) Run() {
	defer close(r.waitCh)
	r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
		r.task.Name, r.allocID)

	// Start the task if not yet started
	if r.handle == nil {
		if err := r.startTask(); err != nil {
			return
		}
	}

	// destroyCh is closed (never sent on), so once closed it is always ready.
	// Work on a local copy that can be set to nil after a successful kill;
	// a nil channel is never selected, so the loop then simply blocks on the
	// wait channel instead of spinning and calling Kill over and over.
	destroyCh := r.destroyCh

OUTER:
	// Wait for updates
	for {
		select {
		case err := <-r.handle.WaitCh():
			if err != nil {
				r.logger.Printf("[ERR] client: failed to complete task '%s' for alloc '%s': %v",
					r.task.Name, r.allocID, err)
				r.setStatus(structs.AllocClientStatusDead,
					fmt.Sprintf("task failed with: %v", err))
			} else {
				r.logger.Printf("[INFO] client: completed task '%s' for alloc '%s'",
					r.task.Name, r.allocID)
				r.setStatus(structs.AllocClientStatusDead,
					"task completed")
			}
			break OUTER

		case update := <-r.updateCh:
			// Update
			r.task = update
			if err := r.handle.Update(update); err != nil {
				r.logger.Printf("[ERR] client: failed to update task '%s' for alloc '%s': %v",
					r.task.Name, r.allocID, err)
			}

		case <-destroyCh:
			// Send the kill signal, and use the WaitCh to block until complete
			if err := r.handle.Kill(); err != nil {
				r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc '%s': %v",
					r.task.Name, r.allocID, err)
				// Leave the case armed so the kill is attempted again, as
				// before. NOTE(review): this retry is immediate and unbounded;
				// consider adding a backoff.
				continue
			}
			// Kill was delivered; stop selecting on the closed channel.
			destroyCh = nil
		}
	}

	// Cleanup after ourselves; report rather than silently drop a failure.
	if err := r.DestroyState(); err != nil {
		r.logger.Printf("[ERR] client: failed to destroy state for task '%s' for alloc '%s': %v",
			r.task.Name, r.allocID, err)
	}
}

// Update hands a new task definition to the running loop. The send never
// blocks: if the update channel's buffer is full the update is dropped and
// an error is logged instead.
func (r *TaskRunner) Update(update *structs.Task) {
	select {
	case r.updateCh <- update:
		// Queued for Run to pick up.
		return
	default:
	}

	r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')",
		update.Name, r.allocID)
}

// Destroy signals that the task context should be torn down by closing the
// destroy channel. It may be called any number of times; only the first call
// closes the channel, since closing it twice would panic.
func (r *TaskRunner) Destroy() {
	r.destroyLock.Lock()
	defer r.destroyLock.Unlock()

	if !r.destroy {
		r.destroy = true
		close(r.destroyCh)
	}
}
client: alloc/task context 2015-08-23 22:30:16 +00:00			`package client`

			`import (`
client: first pass at save/restore of state 2015-08-30 01:16:49 +00:00			`"crypto/md5"`
			`"encoding/hex"`
client: working on runners 2015-08-29 22:46:10 +00:00			`"fmt"`
client: alloc/task context 2015-08-23 22:30:16 +00:00			`"log"`
client: first pass at save/restore of state 2015-08-30 01:16:49 +00:00			`"os"`
			`"path/filepath"`
client: alloc/task context 2015-08-23 22:30:16 +00:00			`"sync"`

client: first pass at save/restore of state 2015-08-30 01:16:49 +00:00			`"github.com/hashicorp/nomad/client/config"`
client: making progress 2015-08-23 23:49:48 +00:00			`"github.com/hashicorp/nomad/client/driver"`
client: alloc/task context 2015-08-23 22:30:16 +00:00			`"github.com/hashicorp/nomad/nomad/structs"`
			`)`

client: rename Alloc/TaskContext to Runner 2015-08-23 22:32:46 +00:00			`// TaskRunner is used to wrap a task within an allocation and provide the execution context.`
			`type TaskRunner struct {`
client: remove TaskRunner dependence on AllocRunner 2015-08-30 02:42:35 +00:00			`config *config.Config`
			`updater TaskStateUpdater`
			`logger *log.Logger`
			`ctx *driver.ExecContext`
client: working on runners 2015-08-29 22:46:10 +00:00			`allocID string`
client: alloc/task context 2015-08-23 22:30:16 +00:00
client: remove TaskRunner dependence on AllocRunner 2015-08-30 02:42:35 +00:00			`task *structs.Task`
client: alloc/task context 2015-08-23 22:30:16 +00:00			`updateCh chan *structs.Task`
client: remove TaskRunner dependence on AllocRunner 2015-08-30 02:42:35 +00:00			`handle driver.DriverHandle`
client: alloc/task context 2015-08-23 22:30:16 +00:00
			`destroy bool`
			`destroyCh chan struct{}`
			`destroyLock sync.Mutex`
client: remove TaskRunner dependence on AllocRunner 2015-08-30 02:42:35 +00:00			`waitCh chan struct{}`
client: alloc/task context 2015-08-23 22:30:16 +00:00			`}`

client: first pass at save/restore of state 2015-08-30 01:16:49 +00:00			`// taskRunnerState is used to snapshot the state of the task runner`
			`type taskRunnerState struct {`
client: working on state restore 2015-08-30 02:14:47 +00:00			`Task *structs.Task`
			`HandleID string`
client: first pass at save/restore of state 2015-08-30 01:16:49 +00:00			`}`

client: remove TaskRunner dependence on AllocRunner 2015-08-30 02:42:35 +00:00			`// TaskStateUpdater is used to update the status of a task`
			`type TaskStateUpdater func(taskName, status, desc string)`

client: rename Alloc/TaskContext to Runner 2015-08-23 22:32:46 +00:00			`// NewTaskRunner is used to create a new task context`
client: remove TaskRunner dependence on AllocRunner 2015-08-30 02:42:35 +00:00			`func NewTaskRunner(logger log.Logger, config config.Config,`
			`updater TaskStateUpdater, ctx *driver.ExecContext,`
			`allocID string, task structs.Task) TaskRunner {`
client: rename Alloc/TaskContext to Runner 2015-08-23 22:32:46 +00:00			`tc := &TaskRunner{`
client: remove TaskRunner dependence on AllocRunner 2015-08-30 02:42:35 +00:00			`config: config,`
			`updater: updater,`
			`logger: logger,`
			`ctx: ctx,`
			`allocID: allocID,`
			`task: task,`
			`updateCh: make(chan *structs.Task, 8),`
			`destroyCh: make(chan struct{}),`
			`waitCh: make(chan struct{}),`
client: alloc/task context 2015-08-23 22:30:16 +00:00			`}`
			`return tc`
			`}`

client: working on runners 2015-08-29 22:46:10 +00:00			`// WaitCh returns a channel to wait for termination`
			`func (r *TaskRunner) WaitCh() <-chan struct{} {`
			`return r.waitCh`
			`}`

client: first pass at save/restore of state 2015-08-30 01:16:49 +00:00			`// stateFilePath returns the path to our state file`
			`func (r *TaskRunner) stateFilePath() string {`
			`// Get the MD5 of the task name`
			`hashVal := md5.Sum([]byte(r.task.Name))`
			`hashHex := hex.EncodeToString(hashVal[:])`
			`dirName := fmt.Sprintf("task-%s", hashHex)`

			`// Generate the path`
			`path := filepath.Join(r.config.StateDir, "alloc", r.allocID,`
			`dirName, "state.json")`
			`return path`
			`}`

			`// RestoreState is used to restore our state`
			`func (r *TaskRunner) RestoreState() error {`
			`// Load the snapshot`
			`var snap taskRunnerState`
			`if err := restoreState(r.stateFilePath(), &snap); err != nil {`
			`return err`
			`}`

			`// Restore fields`
			`r.task = snap.Task`

client: working on state restore 2015-08-30 02:14:47 +00:00			`// Restore the driver`
			`if snap.HandleID != "" {`
			`driver, err := r.createDriver()`
			`if err != nil {`
			`return err`
			`}`

			`handle, err := driver.Open(r.ctx, snap.HandleID)`
			`if err != nil {`
			`r.logger.Printf("[ERR] client: failed to open handle to task '%s' for alloc '%s': %v",`
			`r.task.Name, r.allocID, err)`
			`return err`
			`}`
			`r.handle = handle`
			`}`
client: first pass at save/restore of state 2015-08-30 01:16:49 +00:00			`return nil`
			`}`

			`// SaveState is used to snapshot our state`
			`func (r *TaskRunner) SaveState() error {`
			`snap := taskRunnerState{`
			`Task: r.task,`
			`}`
client: working on state restore 2015-08-30 02:14:47 +00:00			`if r.handle != nil {`
			`snap.HandleID = r.handle.ID()`
			`}`
client: first pass at save/restore of state 2015-08-30 01:16:49 +00:00			`return persistState(r.stateFilePath(), &snap)`
			`}`

			`// DestroyState is used to cleanup after ourselves`
			`func (r *TaskRunner) DestroyState() error {`
			`return os.RemoveAll(r.stateFilePath())`
			`}`

client: working on runners 2015-08-29 22:46:10 +00:00			`// setStatus is used to update the status of the task runner`
			`func (r *TaskRunner) setStatus(status, desc string) {`
client: remove TaskRunner dependence on AllocRunner 2015-08-30 02:42:35 +00:00			`r.updater(r.task.Name, status, desc)`
client: making progress 2015-08-23 23:49:48 +00:00			`}`

client: working on state restore 2015-08-30 02:14:47 +00:00			`// createDriver makes a driver for the task`
			`func (r *TaskRunner) createDriver() (driver.Driver, error) {`
Replace logging and config with DriverContext, which allows us to expand the dependency injection without changing the interface 2015-09-10 01:06:23 +00:00			`driverCtx := driver.NewDriverContext(r.config, r.config.Node, r.logger)`
			`driver, err := driver.NewDriver(r.task.Driver, driverCtx)`
client: making progress 2015-08-23 23:49:48 +00:00			`if err != nil {`
client: working on state restore 2015-08-30 02:14:47 +00:00			`err = fmt.Errorf("failed to create driver '%s' for alloc %s: %v",`
			`r.task.Driver, r.allocID, err)`
			`r.logger.Printf("[ERR] client: %s", err)`
			`}`
			`return driver, err`
			`}`

			`// startTask is used to start the task if there is no handle`
			`func (r *TaskRunner) startTask() error {`
			`// Create a driver`
			`driver, err := r.createDriver()`
			`if err != nil {`
			`r.setStatus(structs.AllocClientStatusFailed, err.Error())`
			`return err`
client: making progress 2015-08-23 23:49:48 +00:00			`}`
client: alloc/task context 2015-08-23 22:30:16 +00:00
client: making progress 2015-08-23 23:49:48 +00:00			`// Start the job`
			`handle, err := driver.Start(r.ctx, r.task)`
			`if err != nil {`
client: working on runners 2015-08-29 22:46:10 +00:00			`r.logger.Printf("[ERR] client: failed to start task '%s' for alloc '%s': %v",`
			`r.task.Name, r.allocID, err)`
			`r.setStatus(structs.AllocClientStatusFailed,`
			`fmt.Sprintf("failed to start: %v", err))`
client: working on state restore 2015-08-30 02:14:47 +00:00			`return err`
client: making progress 2015-08-23 23:49:48 +00:00			`}`
client: working on state restore 2015-08-30 02:14:47 +00:00			`r.handle = handle`
client: working on runners 2015-08-29 22:46:10 +00:00			`r.setStatus(structs.AllocClientStatusRunning, "task started")`
client: working on state restore 2015-08-30 02:14:47 +00:00			`return nil`
			`}`

			`// Run is a long running routine used to manage the task`
			`func (r *TaskRunner) Run() {`
			`defer close(r.waitCh)`
			`r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",`
			`r.task.Name, r.allocID)`

			`// Start the task if not yet started`
			`if r.handle == nil {`
			`if err := r.startTask(); err != nil {`
			`return`
			`}`
			`}`
client: making progress 2015-08-23 23:49:48 +00:00
client: first pass at save/restore of state 2015-08-30 01:16:49 +00:00			`OUTER:`
client: making progress 2015-08-23 23:49:48 +00:00			`// Wait for updates`
client: alloc/task context 2015-08-23 22:30:16 +00:00			`for {`
			`select {`
client: working on state restore 2015-08-30 02:14:47 +00:00			`case err := <-r.handle.WaitCh():`
client: making progress 2015-08-23 23:49:48 +00:00			`if err != nil {`
			`r.logger.Printf("[ERR] client: failed to complete task '%s' for alloc '%s': %v",`
client: working on runners 2015-08-29 22:46:10 +00:00			`r.task.Name, r.allocID, err)`
			`r.setStatus(structs.AllocClientStatusDead,`
			`fmt.Sprintf("task failed with: %v", err))`
client: making progress 2015-08-23 23:49:48 +00:00			`} else {`
			`r.logger.Printf("[INFO] client: completed task '%s' for alloc '%s'",`
client: working on runners 2015-08-29 22:46:10 +00:00			`r.task.Name, r.allocID)`
			`r.setStatus(structs.AllocClientStatusDead,`
			`"task completed")`
client: making progress 2015-08-23 23:49:48 +00:00			`}`
client: first pass at save/restore of state 2015-08-30 01:16:49 +00:00			`break OUTER`
client: working on runners 2015-08-29 22:46:10 +00:00
client: standardize naming 2015-08-23 22:36:06 +00:00			`case update := <-r.updateCh:`
client: working on runners 2015-08-29 22:46:10 +00:00			`// Update`
client: standardize naming 2015-08-23 22:36:06 +00:00			`r.task = update`
client: working on state restore 2015-08-30 02:14:47 +00:00			`if err := r.handle.Update(update); err != nil {`
client: working on runners 2015-08-29 22:46:10 +00:00			`r.logger.Printf("[ERR] client: failed to update task '%s' for alloc '%s': %v",`
			`r.task.Name, r.allocID, err)`
			`}`

client: standardize naming 2015-08-23 22:36:06 +00:00			`case <-r.destroyCh:`
client: working on runners 2015-08-29 22:46:10 +00:00			`// Send the kill signal, and use the WaitCh to block until complete`
client: working on state restore 2015-08-30 02:14:47 +00:00			`if err := r.handle.Kill(); err != nil {`
client: working on runners 2015-08-29 22:46:10 +00:00			`r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc '%s': %v",`
			`r.task.Name, r.allocID, err)`
			`}`
client: alloc/task context 2015-08-23 22:30:16 +00:00			`}`
			`}`
client: first pass at save/restore of state 2015-08-30 01:16:49 +00:00
client: working on state restore 2015-08-30 02:14:47 +00:00			`// Cleanup after ourselves`
			`r.DestroyState()`
client: alloc/task context 2015-08-23 22:30:16 +00:00			`}`

			`// Update is used to update the task of the context`
client: standardize naming 2015-08-23 22:36:06 +00:00			`func (r TaskRunner) Update(update structs.Task) {`
client: alloc/task context 2015-08-23 22:30:16 +00:00			`select {`
client: standardize naming 2015-08-23 22:36:06 +00:00			`case r.updateCh <- update:`
client: alloc/task context 2015-08-23 22:30:16 +00:00			`default:`
client: working on runners 2015-08-29 22:46:10 +00:00			`r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')",`
			`update.Name, r.allocID)`
client: alloc/task context 2015-08-23 22:30:16 +00:00			`}`
			`}`

			`// Destroy is used to indicate that the task context should be destroyed`
client: standardize naming 2015-08-23 22:36:06 +00:00			`func (r *TaskRunner) Destroy() {`
			`r.destroyLock.Lock()`
			`defer r.destroyLock.Unlock()`
client: alloc/task context 2015-08-23 22:30:16 +00:00
client: standardize naming 2015-08-23 22:36:06 +00:00			`if r.destroy {`
client: alloc/task context 2015-08-23 22:30:16 +00:00			`return`
			`}`
client: standardize naming 2015-08-23 22:36:06 +00:00			`r.destroy = true`
			`close(r.destroyCh)`
client: alloc/task context 2015-08-23 22:30:16 +00:00			`}`