open-nomad/client/task_runner.go

273 lines
7.2 KiB
Go

package client
import (
"crypto/md5"
"encoding/hex"
"fmt"
"log"
"os"
"path/filepath"
"sync"
"time"
"github.com/hashicorp/nomad/client/config"
"github.com/hashicorp/nomad/client/driver"
"github.com/hashicorp/nomad/nomad/structs"
)
// TaskRunner is used to wrap a task within an allocation and provide the execution context.
type TaskRunner struct {
config *config.Config
updater TaskStateUpdater
logger *log.Logger
ctx *driver.ExecContext
allocID string
restartTracker restartTracker
task *structs.Task
updateCh chan *structs.Task
handle driver.DriverHandle
destroy bool
destroyCh chan struct{}
destroyLock sync.Mutex
waitCh chan struct{}
}
// taskRunnerState is used to snapshot the state of the task runner
type taskRunnerState struct {
Task *structs.Task
HandleID string
}
// TaskStateUpdater is used to update the status of a task
type TaskStateUpdater func(taskName, status, desc string)
// NewTaskRunner is used to create a new task context
func NewTaskRunner(logger *log.Logger, config *config.Config,
updater TaskStateUpdater, ctx *driver.ExecContext,
allocID string, task *structs.Task, restartTracker restartTracker) *TaskRunner {
tc := &TaskRunner{
config: config,
updater: updater,
logger: logger,
restartTracker: restartTracker,
ctx: ctx,
allocID: allocID,
task: task,
updateCh: make(chan *structs.Task, 8),
destroyCh: make(chan struct{}),
waitCh: make(chan struct{}),
}
return tc
}
// WaitCh returns a channel to wait for termination
func (r *TaskRunner) WaitCh() <-chan struct{} {
return r.waitCh
}
// stateFilePath returns the path to our state file
func (r *TaskRunner) stateFilePath() string {
// Get the MD5 of the task name
hashVal := md5.Sum([]byte(r.task.Name))
hashHex := hex.EncodeToString(hashVal[:])
dirName := fmt.Sprintf("task-%s", hashHex)
// Generate the path
path := filepath.Join(r.config.StateDir, "alloc", r.allocID,
dirName, "state.json")
return path
}
// RestoreState is used to restore our state
func (r *TaskRunner) RestoreState() error {
// Load the snapshot
var snap taskRunnerState
if err := restoreState(r.stateFilePath(), &snap); err != nil {
return err
}
// Restore fields
r.task = snap.Task
// Restore the driver
if snap.HandleID != "" {
driver, err := r.createDriver()
if err != nil {
return err
}
handle, err := driver.Open(r.ctx, snap.HandleID)
// In the case it fails, we relaunch the task in the Run() method.
if err != nil {
r.logger.Printf("[ERR] client: failed to open handle to task '%s' for alloc '%s': %v",
r.task.Name, r.allocID, err)
return nil
}
r.handle = handle
}
return nil
}
// SaveState is used to snapshot our state
func (r *TaskRunner) SaveState() error {
snap := taskRunnerState{
Task: r.task,
}
if r.handle != nil {
snap.HandleID = r.handle.ID()
}
return persistState(r.stateFilePath(), &snap)
}
// DestroyState is used to cleanup after ourselves
func (r *TaskRunner) DestroyState() error {
return os.RemoveAll(r.stateFilePath())
}
// setStatus is used to update the status of the task runner
func (r *TaskRunner) setStatus(status, desc string) {
r.updater(r.task.Name, status, desc)
}
// createDriver makes a driver for the task
func (r *TaskRunner) createDriver() (driver.Driver, error) {
driverCtx := driver.NewDriverContext(r.task.Name, r.config, r.config.Node, r.logger)
driver, err := driver.NewDriver(r.task.Driver, driverCtx)
if err != nil {
err = fmt.Errorf("failed to create driver '%s' for alloc %s: %v",
r.task.Driver, r.allocID, err)
r.logger.Printf("[ERR] client: %s", err)
}
return driver, err
}
// startTask is used to start the task if there is no handle
func (r *TaskRunner) startTask() error {
// Create a driver
driver, err := r.createDriver()
if err != nil {
r.setStatus(structs.AllocClientStatusFailed, err.Error())
return err
}
// Start the job
handle, err := driver.Start(r.ctx, r.task)
if err != nil {
r.logger.Printf("[ERR] client: failed to start task '%s' for alloc '%s': %v",
r.task.Name, r.allocID, err)
r.setStatus(structs.AllocClientStatusFailed,
fmt.Sprintf("failed to start: %v", err))
return err
}
r.handle = handle
r.setStatus(structs.AllocClientStatusRunning, "task started")
return nil
}
// Run is a long running routine used to manage the task
func (r *TaskRunner) Run() {
var err error
defer close(r.waitCh)
r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
r.task.Name, r.allocID)
// Start the task if not yet started
if r.handle == nil {
if err := r.startTask(); err != nil {
return
}
}
// Monitoring the Driver
defer r.DestroyState()
err = r.monitorDriver(r.handle.WaitCh(), r.updateCh, r.destroyCh)
for err != nil {
r.logger.Printf("[ERR] client: failed to complete task '%s' for alloc '%s': %v",
r.task.Name, r.allocID, err)
shouldRestart, when := r.restartTracker.nextRestart()
if !shouldRestart {
r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.allocID)
r.setStatus(structs.AllocClientStatusDead, fmt.Sprintf("task failed with: %v", err))
return
}
r.logger.Printf("[INFO] client: Restarting Task: %v", r.task.Name)
r.setStatus(structs.AllocClientStatusPending, "Task Restarting")
r.logger.Printf("[DEBUG] client: Sleeping for %v before restarting Task %v", when, r.task.Name)
select {
case <-time.After(when):
case <-r.destroyCh:
}
r.destroyLock.Lock()
if r.destroy {
r.logger.Printf("[DEBUG] client: Not restarting task: %v because it's destroyed by user", r.task.Name)
break
}
if err = r.startTask(); err != nil {
r.destroyLock.Unlock()
continue
}
r.destroyLock.Unlock()
err = r.monitorDriver(r.handle.WaitCh(), r.updateCh, r.destroyCh)
}
// Cleanup after ourselves
r.logger.Printf("[INFO] client: completed task '%s' for alloc '%s'", r.task.Name, r.allocID)
r.setStatus(structs.AllocClientStatusDead, "task completed")
}
// This functions listens to messages from the driver and blocks until the
// driver exits
func (r *TaskRunner) monitorDriver(waitCh chan error, updateCh chan *structs.Task, destroyCh chan struct{}) error {
var err error
OUTER:
// Wait for updates
for {
select {
case err = <-waitCh:
break OUTER
case update := <-updateCh:
// Update
r.task = update
if err := r.handle.Update(update); err != nil {
r.logger.Printf("[ERR] client: failed to update task '%s' for alloc '%s': %v",
r.task.Name, r.allocID, err)
}
case <-destroyCh:
// Send the kill signal, and use the WaitCh to block until complete
if err := r.handle.Kill(); err != nil {
r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc '%s': %v",
r.task.Name, r.allocID, err)
}
}
}
return err
}
// Update is used to update the task of the context
func (r *TaskRunner) Update(update *structs.Task) {
select {
case r.updateCh <- update:
default:
r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')",
update.Name, r.allocID)
}
}
// Destroy is used to indicate that the task context should be destroyed
func (r *TaskRunner) Destroy() {
r.destroyLock.Lock()
defer r.destroyLock.Unlock()
if r.destroy {
return
}
r.destroy = true
close(r.destroyCh)
}