open-nomad/client/alloc_runner.go

829 lines
24 KiB
Go
Raw Normal View History

2015-08-23 22:06:47 +00:00
package client
2015-08-23 22:15:48 +00:00
import (
2015-08-29 22:46:10 +00:00
"fmt"
2016-09-14 20:30:01 +00:00
"io/ioutil"
2015-08-23 22:15:48 +00:00
"log"
"os"
"path/filepath"
2015-08-23 22:15:48 +00:00
"sync"
2015-08-29 22:46:10 +00:00
"time"
2015-08-23 22:15:48 +00:00
2016-02-20 00:31:04 +00:00
"github.com/hashicorp/go-multierror"
"github.com/hashicorp/nomad/client/allocdir"
"github.com/hashicorp/nomad/client/config"
2015-08-23 23:49:48 +00:00
"github.com/hashicorp/nomad/client/driver"
2016-09-14 20:30:01 +00:00
"github.com/hashicorp/nomad/client/vaultclient"
2015-08-23 22:15:48 +00:00
"github.com/hashicorp/nomad/nomad/structs"
cstructs "github.com/hashicorp/nomad/client/structs"
2015-08-23 22:15:48 +00:00
)
2015-08-23 22:06:47 +00:00
2015-08-29 22:46:10 +00:00
const (
	// taskReceivedSyncLimit is how long the client will wait before telling
	// the server that a task was received. The notification is not sent
	// immediately because another transition, to running or failed, is
	// likely to follow right away, and a single update transfers all past
	// state information. If no other transition has occurred by this limit,
	// the client sends the update to the server.
	taskReceivedSyncLimit = 30 * time.Second

	// watchdogInterval is the interval at which resource constraints for the
	// allocation are checked and enforced.
	watchdogInterval = 5 * time.Second

	// vaultTokenFile is the name of the file holding the Vault token inside
	// the task's secret directory.
	vaultTokenFile = "vault_token"
)
// AllocStateUpdater is used to update the status of an allocation. The
// AllocRunner calls it from syncStatus with a copy of its allocation.
type AllocStateUpdater func(alloc *structs.Allocation)
// AllocStatsReporter exposes the latest resource usage statistics of an
// allocation.
type AllocStatsReporter interface {
	// LatestAllocStats returns the latest allocation stats. If taskFilter is
	// non-empty the stats only include the named task.
	LatestAllocStats(taskFilter string) (*cstructs.AllocResourceUsage, error)
}
// AllocRunner is used to wrap an allocation and provide the execution context.
type AllocRunner struct {
config *config.Config
updater AllocStateUpdater
logger *log.Logger
2015-08-23 22:15:48 +00:00
2016-02-10 21:44:53 +00:00
alloc *structs.Allocation
allocClientStatus string // Explicit status of allocation. Set when there are failures
allocClientDescription string
2016-02-10 21:44:53 +00:00
allocLock sync.Mutex
2015-08-29 22:46:10 +00:00
dirtyCh chan struct{}
ctx *driver.ExecContext
ctxLock sync.Mutex
tasks map[string]*TaskRunner
taskStates map[string]*structs.TaskState
restored map[string]struct{}
taskLock sync.RWMutex
2015-08-23 23:49:48 +00:00
2016-03-22 02:59:58 +00:00
taskStatusLock sync.RWMutex
2015-08-29 22:46:10 +00:00
2015-08-23 22:15:48 +00:00
updateCh chan *structs.Allocation
2016-09-14 20:30:01 +00:00
vaultClient vaultclient.VaultClient
2016-09-14 22:04:25 +00:00
vaultTokens map[string]vaultToken
2016-09-14 20:30:01 +00:00
2015-08-23 22:15:48 +00:00
destroy bool
destroyCh chan struct{}
destroyLock sync.Mutex
2015-10-04 20:36:03 +00:00
waitCh chan struct{}
// serialize saveAllocRunnerState calls
persistLock sync.Mutex
2015-08-23 22:06:47 +00:00
}
// allocRunnerState is used to snapshot the state of the alloc runner. It is
// the value written by saveAllocRunnerState and read back by RestoreState.
type allocRunnerState struct {
	// Version is taken from the client config (config.Version) when the
	// snapshot is created.
	Version string
	// Alloc is the allocation as returned by AllocRunner.Alloc.
	Alloc *structs.Allocation
	// AllocClientStatus and AllocClientDescription are the explicitly set
	// allocation status and description, if any.
	AllocClientStatus      string
	AllocClientDescription string
	// Context is the runner's execution context.
	Context *driver.ExecContext
}
// NewAllocRunner is used to create a new allocation context
func NewAllocRunner(logger *log.Logger, config *config.Config, updater AllocStateUpdater,
2016-09-14 20:30:01 +00:00
alloc *structs.Allocation, vaultClient vaultclient.VaultClient) *AllocRunner {
2015-08-30 02:14:47 +00:00
ar := &AllocRunner{
2016-09-14 20:30:01 +00:00
config: config,
updater: updater,
logger: logger,
alloc: alloc,
dirtyCh: make(chan struct{}, 1),
tasks: make(map[string]*TaskRunner),
taskStates: copyTaskStates(alloc.TaskStates),
restored: make(map[string]struct{}),
updateCh: make(chan *structs.Allocation, 64),
destroyCh: make(chan struct{}),
waitCh: make(chan struct{}),
vaultClient: vaultClient,
2015-08-23 22:06:47 +00:00
}
2015-08-30 02:14:47 +00:00
return ar
2015-08-23 22:06:47 +00:00
}
// stateFilePath returns the path to our state file:
// <StateDir>/alloc/<alloc ID>/state.json. The allocation ID is read while
// holding allocLock.
func (r *AllocRunner) stateFilePath() string {
	r.allocLock.Lock()
	defer r.allocLock.Unlock()

	return filepath.Join(r.config.StateDir, "alloc", r.alloc.ID, "state.json")
}
// RestoreState is used to restore the state of the alloc runner
func (r *AllocRunner) RestoreState() error {
// Load the snapshot
var snap allocRunnerState
if err := restoreState(r.stateFilePath(), &snap); err != nil {
return err
}
// Restore fields
r.alloc = snap.Alloc
2015-08-30 02:14:47 +00:00
r.ctx = snap.Context
r.allocClientStatus = snap.AllocClientStatus
r.allocClientDescription = snap.AllocClientDescription
r.taskStates = snap.Alloc.TaskStates
2016-06-17 21:58:53 +00:00
var snapshotErrors multierror.Error
if r.alloc == nil {
snapshotErrors.Errors = append(snapshotErrors.Errors, fmt.Errorf("alloc_runner snapshot includes a nil allocation"))
}
if r.ctx == nil {
snapshotErrors.Errors = append(snapshotErrors.Errors, fmt.Errorf("alloc_runner snapshot includes a nil context"))
}
2016-06-17 21:58:53 +00:00
if e := snapshotErrors.ErrorOrNil(); e != nil {
return e
}
2016-09-14 20:30:01 +00:00
// Recover the Vault tokens
2016-09-14 22:04:25 +00:00
vaultErr := r.recoverVaultTokens()
2016-09-14 20:30:01 +00:00
// Restore the task runners
var mErr multierror.Error
for name, state := range r.taskStates {
// Mark the task as restored.
r.restored[name] = struct{}{}
task := &structs.Task{Name: name}
2016-02-10 21:44:53 +00:00
tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx, r.Alloc(),
task)
r.tasks[name] = tr
2015-11-09 23:55:31 +00:00
2016-09-14 22:04:25 +00:00
if vt, ok := r.vaultTokens[name]; ok {
2016-09-14 20:30:01 +00:00
tr.SetVaultToken(vt.token, vt.renewalCh)
}
2015-11-09 23:55:31 +00:00
// Skip tasks in terminal states.
if state.State == structs.TaskStateDead {
2015-11-09 23:55:31 +00:00
continue
}
if err := tr.RestoreState(); err != nil {
r.logger.Printf("[ERR] client: failed to restore state for alloc %s task '%s': %v", r.alloc.ID, name, err)
mErr.Errors = append(mErr.Errors, err)
2016-02-19 23:49:32 +00:00
} else if !r.alloc.TerminalStatus() {
// Only start if the alloc isn't in a terminal status.
go tr.Run()
}
}
2016-09-14 20:30:01 +00:00
// Since this is somewhat of an expected case we do not return an error but
// handle it gracefully.
if vaultErr != nil {
msg := fmt.Sprintf("failed to recover Vault tokens for allocation %q: %v", r.alloc.ID, vaultErr)
r.logger.Printf("[ERR] client: %s", msg)
r.setStatus(structs.AllocClientStatusFailed, msg)
2016-09-15 18:37:20 +00:00
// Destroy the task runners and set the error
r.destroyTaskRunners(structs.NewTaskEvent(structs.TaskVaultRenewalFailed).SetVaultRenewalError(vaultErr))
// Handle cleanup
go r.handleDestroy()
2016-09-14 20:30:01 +00:00
}
return mErr.ErrorOrNil()
}
2015-11-10 00:15:11 +00:00
// SaveState is used to snapshot the state of the alloc runner
// if the fullSync is marked as false only the state of the Alloc Runner
// is snapshotted. If fullSync is marked as true, we snapshot
// all the Task Runners associated with the Alloc
func (r *AllocRunner) SaveState() error {
if err := r.saveAllocRunnerState(); err != nil {
return err
}
2015-11-10 00:15:11 +00:00
// Save state for each task
2016-07-21 20:41:01 +00:00
runners := r.getTaskRunners()
var mErr multierror.Error
2016-07-21 20:41:01 +00:00
for _, tr := range runners {
2015-11-10 00:15:11 +00:00
if err := r.saveTaskRunnerState(tr); err != nil {
mErr.Errors = append(mErr.Errors, err)
}
}
return mErr.ErrorOrNil()
}
func (r *AllocRunner) saveAllocRunnerState() error {
r.persistLock.Lock()
defer r.persistLock.Unlock()
2016-02-10 21:44:53 +00:00
// Create the snapshot.
alloc := r.Alloc()
r.allocLock.Lock()
2016-02-10 21:44:53 +00:00
allocClientStatus := r.allocClientStatus
allocClientDescription := r.allocClientDescription
r.allocLock.Unlock()
r.ctxLock.Lock()
ctx := r.ctx
r.ctxLock.Unlock()
snap := allocRunnerState{
Version: r.config.Version,
2016-02-10 21:44:53 +00:00
Alloc: alloc,
Context: ctx,
AllocClientStatus: allocClientStatus,
AllocClientDescription: allocClientDescription,
}
return persistState(r.stateFilePath(), &snap)
}
2015-11-10 00:15:11 +00:00
func (r *AllocRunner) saveTaskRunnerState(tr *TaskRunner) error {
2016-08-08 23:57:21 +00:00
if err := tr.SaveState(); err != nil {
2016-08-10 22:17:32 +00:00
return fmt.Errorf("failed to save state for alloc %s task '%s': %v",
r.alloc.ID, tr.task.Name, err)
}
2016-08-08 23:57:21 +00:00
return nil
}
// DestroyState is used to cleanup after ourselves. It removes the directory
// that contains the runner's state file, along with everything in it.
func (r *AllocRunner) DestroyState() error {
	stateDir := filepath.Dir(r.stateFilePath())
	return os.RemoveAll(stateDir)
}
2015-08-31 00:35:58 +00:00
// DestroyContext is used to destroy the context
func (r *AllocRunner) DestroyContext() error {
return r.ctx.AllocDir.Destroy()
2015-08-31 00:35:58 +00:00
}
2016-02-10 21:44:53 +00:00
// copyTaskStates returns a copy of the passed task states. The result is
// always a non-nil map, and every entry is the Copy of the original state.
func copyTaskStates(states map[string]*structs.TaskState) map[string]*structs.TaskState {
	out := make(map[string]*structs.TaskState, len(states))
	for name, ts := range states {
		out[name] = ts.Copy()
	}
	return out
}
2015-08-23 22:06:47 +00:00
// Alloc returns the associated allocation
2015-08-23 22:36:06 +00:00
func (r *AllocRunner) Alloc() *structs.Allocation {
r.allocLock.Lock()
alloc := r.alloc.Copy()
2016-02-10 21:44:53 +00:00
2016-05-15 16:41:34 +00:00
// The status has explicitly been set.
2016-02-10 21:44:53 +00:00
if r.allocClientStatus != "" || r.allocClientDescription != "" {
alloc.ClientStatus = r.allocClientStatus
alloc.ClientDescription = r.allocClientDescription
// Copy over the task states so we don't lose them
r.taskStatusLock.RLock()
alloc.TaskStates = copyTaskStates(r.taskStates)
r.taskStatusLock.RUnlock()
2016-02-10 21:44:53 +00:00
r.allocLock.Unlock()
return alloc
}
r.allocLock.Unlock()
// Scan the task states to determine the status of the alloc
var pending, running, dead, failed bool
r.taskStatusLock.RLock()
2016-02-10 21:44:53 +00:00
alloc.TaskStates = copyTaskStates(r.taskStates)
for _, state := range r.taskStates {
switch state.State {
case structs.TaskStateRunning:
running = true
case structs.TaskStatePending:
pending = true
case structs.TaskStateDead:
if state.Failed() {
failed = true
} else {
dead = true
}
}
}
r.taskStatusLock.RUnlock()
// Determine the alloc status
if failed {
alloc.ClientStatus = structs.AllocClientStatusFailed
} else if running {
alloc.ClientStatus = structs.AllocClientStatusRunning
} else if pending {
alloc.ClientStatus = structs.AllocClientStatusPending
} else if dead {
alloc.ClientStatus = structs.AllocClientStatusComplete
}
return alloc
2015-08-23 22:06:47 +00:00
}
2015-08-31 00:10:17 +00:00
// dirtySyncState is used to watch for state being marked dirty to sync
func (r *AllocRunner) dirtySyncState() {
2015-08-29 22:46:10 +00:00
for {
select {
case <-r.dirtyCh:
r.syncStatus()
2015-08-29 22:46:10 +00:00
case <-r.destroyCh:
return
}
2015-08-31 00:10:17 +00:00
}
}
2015-08-29 22:46:10 +00:00
2015-08-31 00:10:17 +00:00
// syncStatus is used to run and sync the status when it changes
func (r *AllocRunner) syncStatus() error {
// Get a copy of our alloc, update status server side and sync to disk
alloc := r.Alloc()
r.updater(alloc)
return r.saveAllocRunnerState()
2015-08-29 22:46:10 +00:00
}
// setStatus is used to update the allocation status
func (r *AllocRunner) setStatus(status, desc string) {
2016-02-10 21:44:53 +00:00
r.allocLock.Lock()
r.allocClientStatus = status
r.allocClientDescription = desc
r.allocLock.Unlock()
2015-08-29 22:46:10 +00:00
select {
case r.dirtyCh <- struct{}{}:
default:
}
}
// setTaskState is used to set the status of a task
func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEvent) {
r.taskStatusLock.Lock()
defer r.taskStatusLock.Unlock()
taskState, ok := r.taskStates[taskName]
if !ok {
2016-03-01 22:09:25 +00:00
taskState = &structs.TaskState{}
r.taskStates[taskName] = taskState
}
// Set the tasks state.
taskState.State = state
r.appendTaskEvent(taskState, event)
2016-09-14 22:04:25 +00:00
if state == structs.TaskStateDead {
// If the task has a Vault token, stop renewing it
if vt, ok := r.vaultTokens[taskName]; ok {
if err := r.vaultClient.StopRenewToken(vt.token); err != nil {
r.logger.Printf("[ERR] client: stopping token renewal for task %q failed: %v", taskName, err)
2016-03-22 02:59:58 +00:00
}
2016-02-19 22:49:43 +00:00
}
2016-09-14 22:04:25 +00:00
// If the task failed, we should kill all the other tasks in the task group.
if taskState.Failed() {
var destroyingTasks []string
for task, tr := range r.tasks {
if task != taskName {
destroyingTasks = append(destroyingTasks, task)
tr.Destroy(structs.NewTaskEvent(structs.TaskSiblingFailed).SetFailedSibling(taskName))
}
}
if len(destroyingTasks) > 0 {
r.logger.Printf("[DEBUG] client: task %q failed, destroying other tasks in task group: %v", taskName, destroyingTasks)
}
}
2016-02-19 22:49:43 +00:00
}
2015-08-29 22:46:10 +00:00
select {
case r.dirtyCh <- struct{}{}:
default:
}
}
// appendTaskEvent updates the task status by appending the new event. Once
// the task holds exactly 10 events, the oldest one is dropped to make room
// for the new one.
func (r *AllocRunner) appendTaskEvent(state *structs.TaskState, event *structs.TaskEvent) {
	const capacity = 10

	switch {
	case state.Events == nil:
		state.Events = make([]*structs.TaskEvent, 0, capacity)
	case len(state.Events) == capacity:
		// We hit capacity: move everything but the oldest event into a
		// fresh slice so there is room for the new one.
		kept := make([]*structs.TaskEvent, capacity-1, capacity)
		copy(kept, state.Events[1:])
		state.Events = kept
	}

	state.Events = append(state.Events, event)
}
2015-08-23 22:06:47 +00:00
// Run is a long running goroutine used to manage an allocation
2015-08-23 22:36:06 +00:00
func (r *AllocRunner) Run() {
2015-10-04 20:36:03 +00:00
defer close(r.waitCh)
2015-08-31 00:10:17 +00:00
go r.dirtySyncState()
2015-08-23 22:15:48 +00:00
2015-08-29 22:46:10 +00:00
// Find the task group to run in the allocation
alloc := r.alloc
2015-08-30 02:14:47 +00:00
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
2015-08-23 23:49:48 +00:00
if tg == nil {
r.logger.Printf("[ERR] client: alloc '%s' for missing task group '%s'", alloc.ID, alloc.TaskGroup)
2015-08-29 22:46:10 +00:00
r.setStatus(structs.AllocClientStatusFailed, fmt.Sprintf("missing task group '%s'", alloc.TaskGroup))
2015-08-23 23:49:48 +00:00
return
}
// Create the execution context
2016-02-10 21:44:53 +00:00
r.ctxLock.Lock()
2015-08-30 02:14:47 +00:00
if r.ctx == nil {
2016-09-02 00:23:15 +00:00
allocDir := allocdir.NewAllocDir(filepath.Join(r.config.AllocDir, r.alloc.ID), r.Alloc().Resources.DiskMB)
if err := allocDir.Build(tg.Tasks); err != nil {
2016-09-02 00:23:15 +00:00
r.logger.Printf("[WARN] client: failed to build task directories: %v", err)
r.setStatus(structs.AllocClientStatusFailed, fmt.Sprintf("failed to build task dirs for '%s'", alloc.TaskGroup))
2016-02-11 16:38:16 +00:00
r.ctxLock.Unlock()
return
}
r.ctx = driver.NewExecContext(allocDir, r.alloc.ID)
2015-08-30 02:14:47 +00:00
}
2016-02-10 21:44:53 +00:00
r.ctxLock.Unlock()
2015-08-23 23:49:48 +00:00
// Check if the allocation is in a terminal status. In this case, we don't
// start any of the task runners and directly wait for the destroy signal to
// clean up the allocation.
if alloc.TerminalStatus() {
r.logger.Printf("[DEBUG] client: alloc %q in terminal status, waiting for destroy", r.alloc.ID)
r.handleDestroy()
r.logger.Printf("[DEBUG] client: terminating runner for alloc '%s'", r.alloc.ID)
2016-02-04 22:19:27 +00:00
return
}
2016-09-14 20:30:01 +00:00
// Request Vault tokens for the tasks that require them
2016-09-14 22:04:25 +00:00
err := r.deriveVaultTokens()
2016-09-14 20:30:01 +00:00
if err != nil {
msg := fmt.Sprintf("failed to derive Vault token for allocation %q: %v", r.alloc.ID, err)
r.logger.Printf("[ERR] client: %s", msg)
r.setStatus(structs.AllocClientStatusFailed, msg)
return
}
2015-08-23 23:49:48 +00:00
// Start the task runners
2016-02-04 22:19:27 +00:00
r.logger.Printf("[DEBUG] client: starting task runners for alloc '%s'", r.alloc.ID)
r.taskLock.Lock()
2015-08-23 23:49:48 +00:00
for _, task := range tg.Tasks {
if _, ok := r.restored[task.Name]; ok {
2015-08-30 02:14:47 +00:00
continue
}
2016-09-14 20:30:01 +00:00
tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx, r.Alloc(), task.Copy())
2015-08-23 23:49:48 +00:00
r.tasks[task.Name] = tr
tr.MarkReceived()
2016-09-14 20:30:01 +00:00
// If the task has a vault token set it before running
2016-09-14 22:04:25 +00:00
if vt, ok := r.vaultTokens[task.Name]; ok {
2016-09-14 20:30:01 +00:00
tr.SetVaultToken(vt.token, vt.renewalCh)
}
2015-08-23 23:49:48 +00:00
go tr.Run()
}
r.taskLock.Unlock()
2015-08-23 23:49:48 +00:00
// Start watching the shared allocation directory for disk usage
go r.ctx.AllocDir.StartDiskWatcher()
watchdog := time.NewTicker(watchdogInterval)
defer watchdog.Stop()
// taskDestroyEvent contains an event that caused the destroyment of a task
// in the allocation.
var taskDestroyEvent *structs.TaskEvent
2015-08-29 22:46:10 +00:00
OUTER:
2015-08-23 23:49:48 +00:00
// Wait for updates
2015-08-23 22:06:47 +00:00
for {
2015-08-23 22:15:48 +00:00
select {
2015-08-23 22:36:06 +00:00
case update := <-r.updateCh:
// Store the updated allocation.
r.allocLock.Lock()
r.alloc = update
r.allocLock.Unlock()
2015-08-29 22:46:10 +00:00
// Check if we're in a terminal status
if update.TerminalStatus() {
taskDestroyEvent = structs.NewTaskEvent(structs.TaskKilled)
2015-08-29 22:46:10 +00:00
break OUTER
}
// Update the task groups
2016-07-21 20:41:01 +00:00
runners := r.getTaskRunners()
for _, tr := range runners {
tr.Update(update)
2015-08-29 22:46:10 +00:00
}
case <-watchdog.C:
if event, desc := r.checkResources(); event != nil {
r.setStatus(structs.AllocClientStatusFailed, desc)
taskDestroyEvent = event
break OUTER
}
2015-08-23 22:36:06 +00:00
case <-r.destroyCh:
taskDestroyEvent = structs.NewTaskEvent(structs.TaskKilled)
2015-08-29 22:46:10 +00:00
break OUTER
2015-08-23 22:15:48 +00:00
}
2015-08-23 22:06:47 +00:00
}
2015-08-29 22:46:10 +00:00
2016-09-15 18:37:20 +00:00
// Kill the task runners
r.destroyTaskRunners(taskDestroyEvent)
// Stop watching the shared allocation directory
r.ctx.AllocDir.StopDiskWatcher()
// Block until we should destroy the state of the alloc
r.handleDestroy()
r.logger.Printf("[DEBUG] client: terminating runner for alloc '%s'", r.alloc.ID)
}
// destroyTaskRunners destroys the task runners, waits for them to terminate and
// then saves state.
func (r *AllocRunner) destroyTaskRunners(destroyEvent *structs.TaskEvent) {
2015-08-29 22:46:10 +00:00
// Destroy each sub-task
2016-07-21 20:41:01 +00:00
runners := r.getTaskRunners()
for _, tr := range runners {
2016-09-15 18:37:20 +00:00
tr.Destroy(destroyEvent)
2015-08-29 22:46:10 +00:00
}
// Wait for termination of the task runners
2016-07-21 20:41:01 +00:00
for _, tr := range runners {
2015-08-29 22:46:10 +00:00
<-tr.WaitCh()
}
2015-08-31 00:10:17 +00:00
// Final state sync
r.syncStatus()
}
2015-08-31 00:10:17 +00:00
2016-09-14 20:30:01 +00:00
// vaultToken acts as a tuple of the token and renewal channel
type vaultToken struct {
	// token is the Vault token of a task.
	token string
	// renewalCh is the channel returned by the Vault client's RenewToken
	// call for this token.
	renewalCh <-chan error
}
2016-09-14 20:30:01 +00:00
// deriveVaultTokens derives the required vault tokens and returns a map of the
// tasks to their respective vault token and renewal channel. This must be
// called after the allocation directory is created as the vault tokens are
// written to disk.
2016-09-14 22:04:25 +00:00
func (r *AllocRunner) deriveVaultTokens() error {
2016-09-14 20:30:01 +00:00
required, err := r.tasksRequiringVaultTokens()
if err != nil {
2016-09-14 22:04:25 +00:00
return err
2016-09-14 20:30:01 +00:00
}
if len(required) == 0 {
2016-09-14 22:04:25 +00:00
return nil
2016-09-14 20:30:01 +00:00
}
2016-09-15 18:20:51 +00:00
if r.vaultTokens == nil {
r.vaultTokens = make(map[string]vaultToken, len(required))
}
2016-09-14 20:30:01 +00:00
// Get the tokens
tokens, err := r.vaultClient.DeriveToken(r.Alloc(), required)
if err != nil {
2016-09-14 22:04:25 +00:00
return fmt.Errorf("failed to derive Vault tokens: %v", err)
2016-09-14 20:30:01 +00:00
}
// Persist the tokens to the appropriate secret directories
adir := r.ctx.AllocDir
for task, token := range tokens {
2016-09-15 18:20:51 +00:00
// Has been recovered
if _, ok := r.vaultTokens[task]; ok {
continue
}
2016-09-14 20:30:01 +00:00
secretDir, err := adir.GetSecretDir(task)
if err != nil {
2016-09-14 22:04:25 +00:00
return fmt.Errorf("failed to determine task %s secret dir in alloc %q: %v", task, r.alloc.ID, err)
2016-09-14 20:30:01 +00:00
}
// Write the token to the file system
tokenPath := filepath.Join(secretDir, vaultTokenFile)
if err := ioutil.WriteFile(tokenPath, []byte(token), 0777); err != nil {
2016-09-14 22:04:25 +00:00
return fmt.Errorf("failed to save Vault tokens to secret dir for task %q in alloc %q: %v", task, r.alloc.ID, err)
2016-09-14 20:30:01 +00:00
}
// Start renewing the token
2016-09-14 22:04:25 +00:00
renewCh, err := r.vaultClient.RenewToken(token, 10)
2016-09-14 20:30:01 +00:00
if err != nil {
var mErr multierror.Error
errMsg := fmt.Errorf("failed to renew Vault token for task %q in alloc %q: %v", task, r.alloc.ID, err)
multierror.Append(&mErr, errMsg)
// Clean up any token that we have started renewing
2016-09-15 18:20:51 +00:00
for _, token := range r.vaultTokens {
2016-09-14 20:30:01 +00:00
if err := r.vaultClient.StopRenewToken(token.token); err != nil {
multierror.Append(&mErr, err)
}
}
2016-09-14 22:04:25 +00:00
return mErr.ErrorOrNil()
2016-09-14 20:30:01 +00:00
}
2016-09-15 18:20:51 +00:00
r.vaultTokens[task] = vaultToken{token: token, renewalCh: renewCh}
2016-09-14 20:30:01 +00:00
}
2016-09-14 22:04:25 +00:00
return nil
2016-09-14 20:30:01 +00:00
}
2016-09-15 18:20:51 +00:00
// tasksRequiringVaultTokens returns the set of tasks that require a Vault token
2016-09-14 20:30:01 +00:00
func (r *AllocRunner) tasksRequiringVaultTokens() ([]string, error) {
// Get the tasks
tg := r.alloc.Job.LookupTaskGroup(r.alloc.TaskGroup)
if tg == nil {
return nil, fmt.Errorf("Failed to lookup task group in alloc")
}
// Retrieve any required Vault tokens
var required []string
for _, task := range tg.Tasks {
if task.Vault != nil && len(task.Vault.Policies) != 0 {
required = append(required, task.Name)
}
}
return required, nil
}
// recoverVaultTokens reads the Vault tokens for the tasks that have Vault
// tokens off disk. If there is an error, it is returned, otherwise token
// renewal is started.
2016-09-14 22:04:25 +00:00
func (r *AllocRunner) recoverVaultTokens() error {
2016-09-14 20:30:01 +00:00
required, err := r.tasksRequiringVaultTokens()
if err != nil {
2016-09-14 22:04:25 +00:00
return err
2016-09-14 20:30:01 +00:00
}
if len(required) == 0 {
2016-09-14 22:04:25 +00:00
return nil
2016-09-14 20:30:01 +00:00
}
// Read the tokens and start renewing them
adir := r.ctx.AllocDir
renewingTokens := make(map[string]vaultToken, len(required))
for _, task := range required {
secretDir, err := adir.GetSecretDir(task)
if err != nil {
2016-09-14 22:04:25 +00:00
return fmt.Errorf("failed to determine task %s secret dir in alloc %q: %v", task, r.alloc.ID, err)
2016-09-14 20:30:01 +00:00
}
2016-09-17 18:31:17 +00:00
// Read the token from the secret directory
2016-09-14 20:30:01 +00:00
tokenPath := filepath.Join(secretDir, vaultTokenFile)
data, err := ioutil.ReadFile(tokenPath)
if err != nil {
2016-09-14 22:04:25 +00:00
return fmt.Errorf("failed to read token for task %q in alloc %q: %v", task, r.alloc.ID, err)
2016-09-14 20:30:01 +00:00
}
token := string(data)
2016-09-14 22:04:25 +00:00
renewCh, err := r.vaultClient.RenewToken(token, 10)
2016-09-14 20:30:01 +00:00
if err != nil {
var mErr multierror.Error
errMsg := fmt.Errorf("failed to renew Vault token for task %q in alloc %q: %v", task, r.alloc.ID, err)
multierror.Append(&mErr, errMsg)
// Clean up any token that we have started renewing
for _, token := range renewingTokens {
if err := r.vaultClient.StopRenewToken(token.token); err != nil {
multierror.Append(&mErr, err)
}
}
2016-09-14 22:04:25 +00:00
return mErr.ErrorOrNil()
2016-09-14 20:30:01 +00:00
}
renewingTokens[task] = vaultToken{token: token, renewalCh: renewCh}
}
2016-09-14 22:04:25 +00:00
r.vaultTokens = renewingTokens
return nil
}
// checkResources monitors and enforces alloc resource usage. It returns an
// appropriate task event describing why the allocation had to be killed,
// together with a description, or (nil, "") when the size of the allocation
// directory does not exceed the allocation's disk limit.
func (r *AllocRunner) checkResources() (*structs.TaskEvent, string) {
	used := r.ctx.AllocDir.GetSize()
	limit := r.Alloc().Resources.DiskInBytes()

	if used > limit {
		event := structs.NewTaskEvent(structs.TaskDiskExceeded).SetDiskLimit(limit).SetDiskSize(used)
		return event, "shared allocation directory exceeded the allowed disk space"
	}
	return nil, ""
}
// handleDestroy blocks till the AllocRunner should be destroyed and does the
// necessary cleanup.
func (r *AllocRunner) handleDestroy() {
select {
case <-r.destroyCh:
2015-08-31 00:35:58 +00:00
if err := r.DestroyContext(); err != nil {
r.logger.Printf("[ERR] client: failed to destroy context for alloc '%s': %v",
r.alloc.ID, err)
}
if err := r.DestroyState(); err != nil {
r.logger.Printf("[ERR] client: failed to destroy state for alloc '%s': %v",
r.alloc.ID, err)
}
}
2015-08-23 22:06:47 +00:00
}
// Update is used to update the allocation of the context
2015-08-23 22:36:06 +00:00
func (r *AllocRunner) Update(update *structs.Allocation) {
2015-08-23 22:15:48 +00:00
select {
2015-08-23 22:36:06 +00:00
case r.updateCh <- update:
2015-08-23 22:15:48 +00:00
default:
2015-08-23 22:36:06 +00:00
r.logger.Printf("[ERR] client: dropping update to alloc '%s'", update.ID)
2015-08-23 22:15:48 +00:00
}
2015-08-23 22:06:47 +00:00
}
// StatsReporter returns an interface to query resource usage statistics of an
// allocation. The runner itself implements that interface.
func (r *AllocRunner) StatsReporter() AllocStatsReporter {
	var reporter AllocStatsReporter = r
	return reporter
}
2016-07-21 20:41:01 +00:00
// getTaskRunners is a helper that returns a copy of the task runners list using
// the taskLock. The order of the returned runners is unspecified.
func (r *AllocRunner) getTaskRunners() []*TaskRunner {
	r.taskLock.RLock()
	defer r.taskLock.RUnlock()

	// Get the task runners
	runners := make([]*TaskRunner, len(r.tasks))
	i := 0
	for _, tr := range r.tasks {
		runners[i] = tr
		i++
	}
	return runners
}
// LatestAllocStats returns the latest allocation stats. If the optional taskFilter is set
// the allocation stats will only include the given task.
func (r *AllocRunner) LatestAllocStats(taskFilter string) (*cstructs.AllocResourceUsage, error) {
astat := &cstructs.AllocResourceUsage{
Tasks: make(map[string]*cstructs.TaskResourceUsage),
}
var flat []*cstructs.TaskResourceUsage
if taskFilter != "" {
2016-06-20 17:19:06 +00:00
r.taskLock.RLock()
tr, ok := r.tasks[taskFilter]
2016-06-20 17:19:06 +00:00
r.taskLock.RUnlock()
if !ok {
return nil, fmt.Errorf("allocation %q has no task %q", r.alloc.ID, taskFilter)
}
l := tr.LatestResourceUsage()
if l != nil {
astat.Tasks[taskFilter] = l
flat = []*cstructs.TaskResourceUsage{l}
astat.Timestamp = l.Timestamp
}
} else {
2016-06-20 17:19:06 +00:00
// Get the task runners
2016-07-21 20:41:01 +00:00
runners := r.getTaskRunners()
2016-06-20 17:19:06 +00:00
for _, tr := range runners {
l := tr.LatestResourceUsage()
if l != nil {
2016-06-20 17:19:06 +00:00
astat.Tasks[tr.task.Name] = l
flat = append(flat, l)
if l.Timestamp > astat.Timestamp {
astat.Timestamp = l.Timestamp
}
}
}
}
astat.ResourceUsage = sumTaskResourceUsage(flat)
return astat, nil
}
// sumTaskResourceUsage takes a set of task resources and sums their resources.
// The result always has non-nil memory and CPU stats, even for an empty set.
func sumTaskResourceUsage(usages []*cstructs.TaskResourceUsage) *cstructs.ResourceUsage {
	total := &cstructs.ResourceUsage{
		MemoryStats: &cstructs.MemoryStats{},
		CpuStats:    &cstructs.CpuStats{},
	}
	for i := range usages {
		total.Add(usages[i].ResourceUsage)
	}
	return total
}
// shouldUpdate takes the AllocModifyIndex of an allocation sent from the server and
// checks if the current running allocation is behind and should be updated.
// It reports true only when the local index is strictly lower.
func (r *AllocRunner) shouldUpdate(serverIndex uint64) bool {
	r.allocLock.Lock()
	defer r.allocLock.Unlock()

	localIndex := r.alloc.AllocModifyIndex
	return localIndex < serverIndex
}
2015-08-23 22:06:47 +00:00
// Destroy is used to indicate that the allocation context should be destroyed
2015-08-23 22:36:06 +00:00
func (r *AllocRunner) Destroy() {
r.destroyLock.Lock()
defer r.destroyLock.Unlock()
2015-08-23 22:15:48 +00:00
2015-08-23 22:36:06 +00:00
if r.destroy {
2015-08-23 22:15:48 +00:00
return
}
2015-08-23 22:36:06 +00:00
r.destroy = true
close(r.destroyCh)
2015-08-23 22:06:47 +00:00
}
2015-10-04 20:36:03 +00:00
// WaitCh returns a channel to wait for termination. The channel is closed
// when Run returns.
func (r *AllocRunner) WaitCh() <-chan struct{} {
	var done <-chan struct{} = r.waitCh
	return done
}