1936 lines
60 KiB
Go
1936 lines
60 KiB
Go
package client
|
|
|
|
import (
|
|
"bytes"
|
|
"crypto/md5"
|
|
"encoding/hex"
|
|
"fmt"
|
|
"io"
|
|
"io/ioutil"
|
|
"log"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
metrics "github.com/armon/go-metrics"
|
|
"github.com/boltdb/bolt"
|
|
"github.com/golang/snappy"
|
|
"github.com/hashicorp/consul-template/signals"
|
|
"github.com/hashicorp/go-multierror"
|
|
version "github.com/hashicorp/go-version"
|
|
"github.com/hashicorp/nomad/client/allocdir"
|
|
"github.com/hashicorp/nomad/client/config"
|
|
"github.com/hashicorp/nomad/client/driver"
|
|
"github.com/hashicorp/nomad/client/getter"
|
|
"github.com/hashicorp/nomad/client/vaultclient"
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
"github.com/ugorji/go/codec"
|
|
|
|
"github.com/hashicorp/nomad/client/driver/env"
|
|
dstructs "github.com/hashicorp/nomad/client/driver/structs"
|
|
cstructs "github.com/hashicorp/nomad/client/structs"
|
|
)
|
|
|
|
// Tunables for killing a task.
const (
	// killBackoffBaseline is the starting delay of the exponential backoff
	// used while killing a task.
	killBackoffBaseline = 5 * time.Second

	// killBackoffLimit is the ceiling of the exponential backoff used while
	// killing a task.
	killBackoffLimit = 2 * time.Minute

	// killFailureLimit is the number of kill attempts made before giving up
	// and potentially leaking resources.
	killFailureLimit = 5
)

// Tunables for retrieving and storing a task's Vault token.
const (
	// vaultBackoffBaseline is the starting delay of the exponential backoff
	// used while attempting to retrieve a Vault token.
	vaultBackoffBaseline = 5 * time.Second

	// vaultBackoffLimit is the ceiling of the exponential backoff used while
	// attempting to retrieve a Vault token.
	vaultBackoffLimit = 3 * time.Minute

	// vaultTokenFile is the name of the file, inside the task's secret
	// directory, that holds the Vault token.
	vaultTokenFile = "vault_token"
)
|
|
|
|
// taskRunnerStateAllKey is the key under which the whole task runner state is
// stored. For now there is no need to split the state across several keys.
var taskRunnerStateAllKey = []byte("simple-all")
|
|
|
|
// taskRestartEvent wraps a TaskEvent with additional metadata to control
|
|
// restart behavior.
|
|
type taskRestartEvent struct {
|
|
// taskEvent to report
|
|
taskEvent *structs.TaskEvent
|
|
|
|
// if false, don't count against restart count
|
|
failure bool
|
|
}
|
|
|
|
func newTaskRestartEvent(reason string, failure bool) *taskRestartEvent {
|
|
return &taskRestartEvent{
|
|
taskEvent: structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason),
|
|
failure: failure,
|
|
}
|
|
}
|
|
|
|
// TaskRunner is used to wrap a task within an allocation and provide the execution context.
|
|
type TaskRunner struct {
|
|
stateDB *bolt.DB
|
|
config *config.Config
|
|
updater TaskStateUpdater
|
|
logger *log.Logger
|
|
restartTracker *RestartTracker
|
|
consul ConsulServiceAPI
|
|
|
|
// running marks whether the task is running
|
|
running bool
|
|
runningLock sync.Mutex
|
|
|
|
resourceUsage *cstructs.TaskResourceUsage
|
|
resourceUsageLock sync.RWMutex
|
|
|
|
alloc *structs.Allocation
|
|
task *structs.Task
|
|
taskDir *allocdir.TaskDir
|
|
|
|
// envBuilder is used to build the task's environment
|
|
envBuilder *env.Builder
|
|
|
|
// driverNet is the network information returned by the driver
|
|
driverNet *cstructs.DriverNetwork
|
|
driverNetLock sync.Mutex
|
|
|
|
// updateCh is used to receive updated versions of the allocation
|
|
updateCh chan *structs.Allocation
|
|
|
|
handle driver.DriverHandle
|
|
handleLock sync.Mutex
|
|
|
|
// artifactsDownloaded tracks whether the tasks artifacts have been
|
|
// downloaded
|
|
//
|
|
// Must acquire persistLock when accessing
|
|
artifactsDownloaded bool
|
|
|
|
// taskDirBuilt tracks whether the task has built its directory.
|
|
//
|
|
// Must acquire persistLock when accessing
|
|
taskDirBuilt bool
|
|
|
|
// createdResources are all the resources created by the task driver
|
|
// across all attempts to start the task.
|
|
// Simple gets and sets should use {get,set}CreatedResources
|
|
createdResources *driver.CreatedResources
|
|
createdResourcesLock sync.Mutex
|
|
|
|
// payloadRendered tracks whether the payload has been rendered to disk
|
|
payloadRendered bool
|
|
|
|
// vaultFuture is the means to wait for and get a Vault token
|
|
vaultFuture *tokenFuture
|
|
|
|
// recoveredVaultToken is the token that was recovered through a restore
|
|
recoveredVaultToken string
|
|
|
|
// vaultClient is used to retrieve and renew any needed Vault token
|
|
vaultClient vaultclient.VaultClient
|
|
|
|
// templateManager is used to manage any consul-templates this task may have
|
|
templateManager *TaskTemplateManager
|
|
|
|
// startCh is used to trigger the start of the task
|
|
startCh chan struct{}
|
|
|
|
// unblockCh is used to unblock the starting of the task
|
|
unblockCh chan struct{}
|
|
unblocked bool
|
|
unblockLock sync.Mutex
|
|
|
|
// restartCh is used to restart a task
|
|
restartCh chan *taskRestartEvent
|
|
|
|
// signalCh is used to send a signal to a task
|
|
signalCh chan SignalEvent
|
|
|
|
destroy bool
|
|
destroyCh chan struct{}
|
|
destroyLock sync.Mutex
|
|
destroyEvent *structs.TaskEvent
|
|
|
|
// waitCh closing marks the run loop as having exited
|
|
waitCh chan struct{}
|
|
|
|
// persistLock must be acquired when accessing fields stored by
|
|
// SaveState. SaveState is called asynchronously to TaskRunner.Run by
|
|
// AllocRunner, so all state fields must be synchronized using this
|
|
// lock.
|
|
persistLock sync.Mutex
|
|
|
|
// persistedHash is the hash of the last persisted snapshot. It is used to
|
|
// detect if a new snapshot has to be written to disk.
|
|
persistedHash []byte
|
|
|
|
// baseLabels are used when emitting tagged metrics. All task runner metrics
|
|
// will have these tags, and optionally more.
|
|
baseLabels []metrics.Label
|
|
}
|
|
|
|
// taskRunnerState is used to snapshot the state of the task runner
|
|
type taskRunnerState struct {
|
|
Version string
|
|
HandleID string
|
|
ArtifactDownloaded bool
|
|
TaskDirBuilt bool
|
|
PayloadRendered bool
|
|
CreatedResources *driver.CreatedResources
|
|
DriverNetwork *cstructs.DriverNetwork
|
|
}
|
|
|
|
func (s *taskRunnerState) Hash() []byte {
|
|
h := md5.New()
|
|
|
|
io.WriteString(h, s.Version)
|
|
io.WriteString(h, s.HandleID)
|
|
io.WriteString(h, fmt.Sprintf("%v", s.ArtifactDownloaded))
|
|
io.WriteString(h, fmt.Sprintf("%v", s.TaskDirBuilt))
|
|
io.WriteString(h, fmt.Sprintf("%v", s.PayloadRendered))
|
|
h.Write(s.CreatedResources.Hash())
|
|
h.Write(s.DriverNetwork.Hash())
|
|
|
|
return h.Sum(nil)
|
|
}
|
|
|
|
// TaskStateUpdater is used to signal that tasks state has changed. If lazySync
|
|
// is set the event won't be immediately pushed to the server.
|
|
type TaskStateUpdater func(taskName, state string, event *structs.TaskEvent, lazySync bool)
|
|
|
|
// SignalEvent is a tuple of the signal and the event generating it
|
|
type SignalEvent struct {
|
|
// s is the signal to be sent
|
|
s os.Signal
|
|
|
|
// e is the task event generating the signal
|
|
e *structs.TaskEvent
|
|
|
|
// result should be used to send back the result of the signal
|
|
result chan<- error
|
|
}
|
|
|
|
// NewTaskRunner is used to create a new task context
|
|
func NewTaskRunner(logger *log.Logger, config *config.Config,
|
|
stateDB *bolt.DB, updater TaskStateUpdater, taskDir *allocdir.TaskDir,
|
|
alloc *structs.Allocation, task *structs.Task,
|
|
vaultClient vaultclient.VaultClient, consulClient ConsulServiceAPI) *TaskRunner {
|
|
|
|
// Merge in the task resources
|
|
task.Resources = alloc.TaskResources[task.Name]
|
|
|
|
// Build the restart tracker.
|
|
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
|
|
if tg == nil {
|
|
logger.Printf("[ERR] client: alloc %q for missing task group %q", alloc.ID, alloc.TaskGroup)
|
|
return nil
|
|
}
|
|
restartTracker := newRestartTracker(tg.RestartPolicy, alloc.Job.Type)
|
|
|
|
// Initialize the environment builder
|
|
envBuilder := env.NewBuilder(config.Node, alloc, task, config.Region)
|
|
|
|
tc := &TaskRunner{
|
|
config: config,
|
|
stateDB: stateDB,
|
|
updater: updater,
|
|
logger: logger,
|
|
restartTracker: restartTracker,
|
|
alloc: alloc,
|
|
task: task,
|
|
taskDir: taskDir,
|
|
envBuilder: envBuilder,
|
|
createdResources: driver.NewCreatedResources(),
|
|
consul: consulClient,
|
|
vaultClient: vaultClient,
|
|
vaultFuture: NewTokenFuture().Set(""),
|
|
updateCh: make(chan *structs.Allocation, 64),
|
|
destroyCh: make(chan struct{}),
|
|
waitCh: make(chan struct{}),
|
|
startCh: make(chan struct{}, 1),
|
|
unblockCh: make(chan struct{}),
|
|
restartCh: make(chan *taskRestartEvent),
|
|
signalCh: make(chan SignalEvent),
|
|
}
|
|
|
|
tc.baseLabels = []metrics.Label{
|
|
{
|
|
Name: "job",
|
|
Value: tc.alloc.Job.Name,
|
|
},
|
|
{
|
|
Name: "task_group",
|
|
Value: tc.alloc.TaskGroup,
|
|
},
|
|
{
|
|
Name: "alloc_id",
|
|
Value: tc.alloc.ID,
|
|
},
|
|
{
|
|
Name: "task",
|
|
Value: tc.task.Name,
|
|
},
|
|
}
|
|
|
|
return tc
|
|
}
|
|
|
|
// MarkReceived marks the task as received.
|
|
func (r *TaskRunner) MarkReceived() {
|
|
// We lazy sync this since there will be a follow up message almost
|
|
// immediately.
|
|
r.updater(r.task.Name, structs.TaskStatePending, structs.NewTaskEvent(structs.TaskReceived), true)
|
|
}
|
|
|
|
// WaitCh returns a channel to wait for termination
|
|
func (r *TaskRunner) WaitCh() <-chan struct{} {
|
|
return r.waitCh
|
|
}
|
|
|
|
// getHandle returns the task's handle or nil
|
|
func (r *TaskRunner) getHandle() driver.DriverHandle {
|
|
r.handleLock.Lock()
|
|
h := r.handle
|
|
r.handleLock.Unlock()
|
|
return h
|
|
}
|
|
|
|
// pre060StateFilePath returns the path to our state file that would have been
|
|
// written pre v0.6.0
|
|
// COMPAT: Remove in 0.7.0
|
|
func (r *TaskRunner) pre060StateFilePath() string {
|
|
// Get the MD5 of the task name
|
|
hashVal := md5.Sum([]byte(r.task.Name))
|
|
hashHex := hex.EncodeToString(hashVal[:])
|
|
dirName := fmt.Sprintf("task-%s", hashHex)
|
|
|
|
// Generate the path
|
|
return filepath.Join(r.config.StateDir, "alloc", r.alloc.ID, dirName, "state.json")
|
|
}
|
|
|
|
// RestoreState is used to restore our state. If a non-empty string is returned
|
|
// the task is restarted with the string as the reason. This is useful for
|
|
// backwards incompatible upgrades that need to restart tasks with a new
|
|
// executor.
|
|
func (r *TaskRunner) RestoreState() (string, error) {
|
|
// COMPAT: Remove in 0.7.0
|
|
// 0.6.0 transitioned from individual state files to a single bolt-db.
|
|
// The upgrade path is to:
|
|
// Check if old state exists
|
|
// If so, restore from that and delete old state
|
|
// Restore using state database
|
|
|
|
var snap taskRunnerState
|
|
|
|
// Check if the old snapshot is there
|
|
oldPath := r.pre060StateFilePath()
|
|
if err := pre060RestoreState(oldPath, &snap); err == nil {
|
|
// Delete the old state
|
|
os.RemoveAll(oldPath)
|
|
} else if !os.IsNotExist(err) {
|
|
// Something corrupt in the old state file
|
|
return "", err
|
|
} else {
|
|
// We are doing a normal restore
|
|
err := r.stateDB.View(func(tx *bolt.Tx) error {
|
|
bkt, err := getTaskBucket(tx, r.alloc.ID, r.task.Name)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get task bucket: %v", err)
|
|
}
|
|
|
|
if err := getObject(bkt, taskRunnerStateAllKey, &snap); err != nil {
|
|
return fmt.Errorf("failed to read task runner state: %v", err)
|
|
}
|
|
return nil
|
|
})
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
}
|
|
|
|
// Restore fields from the snapshot
|
|
r.artifactsDownloaded = snap.ArtifactDownloaded
|
|
r.taskDirBuilt = snap.TaskDirBuilt
|
|
r.payloadRendered = snap.PayloadRendered
|
|
r.setCreatedResources(snap.CreatedResources)
|
|
r.driverNet = snap.DriverNetwork
|
|
|
|
if r.task.Vault != nil {
|
|
// Read the token from the secret directory
|
|
tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
|
|
data, err := ioutil.ReadFile(tokenPath)
|
|
if err != nil {
|
|
if !os.IsNotExist(err) {
|
|
return "", fmt.Errorf("failed to read token for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
|
|
}
|
|
|
|
// Token file doesn't exist
|
|
} else {
|
|
// Store the recovered token
|
|
r.recoveredVaultToken = string(data)
|
|
}
|
|
}
|
|
|
|
// Restore the driver
|
|
restartReason := ""
|
|
if snap.HandleID != "" {
|
|
d, err := r.createDriver()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
// Add the restored network driver to the environment
|
|
r.envBuilder.SetDriverNetwork(r.driverNet)
|
|
|
|
// Open a connection to the driver handle
|
|
ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
|
|
handle, err := d.Open(ctx, snap.HandleID)
|
|
|
|
// In the case it fails, we relaunch the task in the Run() method.
|
|
if err != nil {
|
|
r.logger.Printf("[ERR] client: failed to open handle to task %q for alloc %q: %v",
|
|
r.task.Name, r.alloc.ID, err)
|
|
return "", nil
|
|
}
|
|
|
|
if pre06ScriptCheck(snap.Version, r.task.Driver, r.task.Services) {
|
|
restartReason = pre06ScriptCheckReason
|
|
}
|
|
|
|
if err := r.registerServices(d, handle, r.driverNet); err != nil {
|
|
// Don't hard fail here as there's a chance this task
|
|
// registered with Consul properly when it initial
|
|
// started.
|
|
r.logger.Printf("[WARN] client: failed to register services and checks with consul for task %q in alloc %q: %v",
|
|
r.task.Name, r.alloc.ID, err)
|
|
}
|
|
|
|
r.handleLock.Lock()
|
|
r.handle = handle
|
|
r.handleLock.Unlock()
|
|
|
|
r.runningLock.Lock()
|
|
r.running = true
|
|
r.runningLock.Unlock()
|
|
}
|
|
return restartReason, nil
|
|
}
|
|
|
|
// ver06 is used for checking for pre-0.6 script checks
|
|
var ver06 = version.Must(version.NewVersion("0.6.0dev"))
|
|
|
|
// pre06ScriptCheckReason is the restart reason given when a pre-0.6 script
|
|
// check is found on an exec/java task.
|
|
const pre06ScriptCheckReason = "upgrading pre-0.6 script checks"
|
|
|
|
// pre06ScriptCheck returns true if version is prior to 0.6.0dev, has a script
|
|
// check, and uses exec or java drivers.
|
|
func pre06ScriptCheck(ver, driver string, services []*structs.Service) bool {
|
|
if driver != "exec" && driver != "java" && driver != "mock_driver" {
|
|
// Only exec and java are affected
|
|
return false
|
|
}
|
|
v, err := version.NewVersion(ver)
|
|
if err != nil {
|
|
// Treat it as old
|
|
return true
|
|
}
|
|
if !v.LessThan(ver06) {
|
|
// >= 0.6.0dev
|
|
return false
|
|
}
|
|
for _, service := range services {
|
|
for _, check := range service.Checks {
|
|
if check.Type == "script" {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// SaveState is used to snapshot our state
|
|
func (r *TaskRunner) SaveState() error {
|
|
r.destroyLock.Lock()
|
|
defer r.destroyLock.Unlock()
|
|
if r.destroy {
|
|
// Don't save state if already destroyed
|
|
return nil
|
|
}
|
|
|
|
r.persistLock.Lock()
|
|
defer r.persistLock.Unlock()
|
|
snap := taskRunnerState{
|
|
Version: r.config.Version.VersionNumber(),
|
|
ArtifactDownloaded: r.artifactsDownloaded,
|
|
TaskDirBuilt: r.taskDirBuilt,
|
|
PayloadRendered: r.payloadRendered,
|
|
CreatedResources: r.getCreatedResources(),
|
|
}
|
|
|
|
r.handleLock.Lock()
|
|
if r.handle != nil {
|
|
snap.HandleID = r.handle.ID()
|
|
}
|
|
r.handleLock.Unlock()
|
|
|
|
r.driverNetLock.Lock()
|
|
snap.DriverNetwork = r.driverNet.Copy()
|
|
r.driverNetLock.Unlock()
|
|
|
|
// If nothing has changed avoid the write
|
|
h := snap.Hash()
|
|
if bytes.Equal(h, r.persistedHash) {
|
|
return nil
|
|
}
|
|
|
|
// Serialize the object
|
|
var buf bytes.Buffer
|
|
if err := codec.NewEncoder(&buf, structs.MsgpackHandle).Encode(&snap); err != nil {
|
|
return fmt.Errorf("failed to serialize snapshot: %v", err)
|
|
}
|
|
|
|
// Start the transaction.
|
|
return r.stateDB.Batch(func(tx *bolt.Tx) error {
|
|
// Grab the task bucket
|
|
taskBkt, err := getTaskBucket(tx, r.alloc.ID, r.task.Name)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to retrieve allocation bucket: %v", err)
|
|
}
|
|
|
|
if err := putData(taskBkt, taskRunnerStateAllKey, buf.Bytes()); err != nil {
|
|
return fmt.Errorf("failed to write task_runner state: %v", err)
|
|
}
|
|
|
|
// Store the hash that was persisted
|
|
tx.OnCommit(func() {
|
|
r.persistedHash = h
|
|
})
|
|
|
|
return nil
|
|
})
|
|
}
|
|
|
|
// DestroyState is used to cleanup after ourselves
|
|
func (r *TaskRunner) DestroyState() error {
|
|
r.persistLock.Lock()
|
|
defer r.persistLock.Unlock()
|
|
|
|
return r.stateDB.Update(func(tx *bolt.Tx) error {
|
|
if err := deleteTaskBucket(tx, r.alloc.ID, r.task.Name); err != nil {
|
|
return fmt.Errorf("failed to delete task bucket: %v", err)
|
|
}
|
|
return nil
|
|
})
|
|
}
|
|
|
|
// setState is used to update the state of the task runner
|
|
func (r *TaskRunner) setState(state string, event *structs.TaskEvent, lazySync bool) {
|
|
event.PopulateEventDisplayMessage()
|
|
|
|
// Persist our state to disk.
|
|
if err := r.SaveState(); err != nil {
|
|
r.logger.Printf("[ERR] client: failed to save state of Task Runner for task %q: %v", r.task.Name, err)
|
|
}
|
|
|
|
// Indicate the task has been updated.
|
|
r.updater(r.task.Name, state, event, lazySync)
|
|
}
|
|
|
|
// createDriver makes a driver for the task
|
|
func (r *TaskRunner) createDriver() (driver.Driver, error) {
|
|
// Create a task-specific event emitter callback to expose minimal
|
|
// state to drivers
|
|
eventEmitter := func(m string, args ...interface{}) {
|
|
msg := fmt.Sprintf(m, args...)
|
|
r.logger.Printf("[DEBUG] client: driver event for alloc %q: %s", r.alloc.ID, msg)
|
|
r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDriverMessage).SetDriverMessage(msg), false)
|
|
}
|
|
|
|
driverCtx := driver.NewDriverContext(r.task.Name, r.alloc.ID, r.config, r.config.Node, r.logger, eventEmitter)
|
|
d, err := driver.NewDriver(r.task.Driver, driverCtx)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create driver '%s' for alloc %s: %v",
|
|
r.task.Driver, r.alloc.ID, err)
|
|
}
|
|
|
|
return d, err
|
|
}
|
|
|
|
// Run is a long running routine used to manage the task
|
|
func (r *TaskRunner) Run() {
|
|
defer close(r.waitCh)
|
|
r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
|
|
r.task.Name, r.alloc.ID)
|
|
|
|
if err := r.validateTask(); err != nil {
|
|
r.setState(
|
|
structs.TaskStateDead,
|
|
structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(err).SetFailsTask(),
|
|
false)
|
|
return
|
|
}
|
|
|
|
// Create a temporary driver so that we can determine the FSIsolation
|
|
// required. run->startTask will create a new driver after environment
|
|
// has been setup (env vars, templates, artifacts, secrets, etc).
|
|
tmpDrv, err := r.createDriver()
|
|
if err != nil {
|
|
e := fmt.Errorf("failed to create driver of task %q for alloc %q: %v", r.task.Name, r.alloc.ID, err)
|
|
r.setState(
|
|
structs.TaskStateDead,
|
|
structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask(),
|
|
false)
|
|
return
|
|
}
|
|
|
|
// Build base task directory structure regardless of FS isolation abilities.
|
|
// This needs to happen before we start the Vault manager and call prestart
|
|
// as both those can write to the task directories
|
|
if err := r.buildTaskDir(tmpDrv.FSIsolation()); err != nil {
|
|
e := fmt.Errorf("failed to build task directory for %q: %v", r.task.Name, err)
|
|
r.setState(
|
|
structs.TaskStateDead,
|
|
structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask(),
|
|
false)
|
|
return
|
|
}
|
|
|
|
// If there is no Vault policy leave the static future created in
|
|
// NewTaskRunner
|
|
if r.task.Vault != nil {
|
|
// Start the go-routine to get a Vault token
|
|
r.vaultFuture.Clear()
|
|
go r.vaultManager(r.recoveredVaultToken)
|
|
}
|
|
|
|
// Start the run loop
|
|
r.run()
|
|
|
|
// Do any cleanup necessary
|
|
r.postrun()
|
|
|
|
return
|
|
}
|
|
|
|
// validateTask validates the fields of the task and returns an error if the
|
|
// task is invalid.
|
|
func (r *TaskRunner) validateTask() error {
|
|
var mErr multierror.Error
|
|
|
|
// Validate the user.
|
|
unallowedUsers := r.config.ReadStringListToMapDefault("user.blacklist", config.DefaultUserBlacklist)
|
|
checkDrivers := r.config.ReadStringListToMapDefault("user.checked_drivers", config.DefaultUserCheckedDrivers)
|
|
if _, driverMatch := checkDrivers[r.task.Driver]; driverMatch {
|
|
if _, unallowed := unallowedUsers[r.task.User]; unallowed {
|
|
mErr.Errors = append(mErr.Errors, fmt.Errorf("running as user %q is disallowed", r.task.User))
|
|
}
|
|
}
|
|
|
|
// Validate the artifacts
|
|
for i, artifact := range r.task.Artifacts {
|
|
// Verify the artifact doesn't escape the task directory.
|
|
if err := artifact.Validate(); err != nil {
|
|
// If this error occurs there is potentially a server bug or
|
|
// malicious, server spoofing.
|
|
r.logger.Printf("[ERR] client: allocation %q, task %v, artifact %#v (%v) fails validation: %v",
|
|
r.alloc.ID, r.task.Name, artifact, i, err)
|
|
mErr.Errors = append(mErr.Errors, fmt.Errorf("artifact (%d) failed validation: %v", i, err))
|
|
}
|
|
}
|
|
|
|
// Validate the Service names
|
|
taskEnv := r.envBuilder.Build()
|
|
for i, service := range r.task.Services {
|
|
name := taskEnv.ReplaceEnv(service.Name)
|
|
if err := service.ValidateName(name); err != nil {
|
|
mErr.Errors = append(mErr.Errors, fmt.Errorf("service (%d) failed validation: %v", i, err))
|
|
}
|
|
}
|
|
|
|
if len(mErr.Errors) == 1 {
|
|
return mErr.Errors[0]
|
|
}
|
|
return mErr.ErrorOrNil()
|
|
}
|
|
|
|
// tokenFuture holds the Vault token and lets consumers block until a token
// has been set.
type tokenFuture struct {
	waiting []chan struct{}
	token   string
	set     bool
	m       sync.Mutex
}

// NewTokenFuture returns a token future on which no token has been set.
func NewTokenFuture() *tokenFuture {
	return new(tokenFuture)
}

// Wait returns a channel to wait on. The channel is closed once a token has
// been set; if one is already set, the returned channel is closed right away.
// After it unblocks, the token is available through Get.
func (f *tokenFuture) Wait() <-chan struct{} {
	f.m.Lock()
	defer f.m.Unlock()

	ready := make(chan struct{})
	if !f.set {
		// Not set yet: park the channel until Set closes it.
		f.waiting = append(f.waiting, ready)
		return ready
	}

	close(ready)
	return ready
}

// Set stores the token, marks the future as set and unblocks every pending
// caller of Wait. It returns the future to allow chaining.
func (f *tokenFuture) Set(token string) *tokenFuture {
	f.m.Lock()
	defer f.m.Unlock()

	f.token = token
	f.set = true

	pending := f.waiting
	f.waiting = nil
	for i := range pending {
		close(pending[i])
	}
	return f
}

// Clear drops the stored Vault token and marks the future as not set. It
// returns the future to allow chaining.
func (f *tokenFuture) Clear() *tokenFuture {
	f.m.Lock()
	defer f.m.Unlock()

	f.set = false
	f.token = ""
	return f
}

// Get returns the currently stored Vault token.
func (f *tokenFuture) Get() string {
	f.m.Lock()
	current := f.token
	f.m.Unlock()
	return current
}
|
|
|
|
// vaultManager should be called in a go-routine and manages the derivation,
|
|
// renewal and handling of errors with the Vault token. The optional parameter
|
|
// allows setting the initial Vault token. This is useful when the Vault token
|
|
// is recovered off disk.
|
|
func (r *TaskRunner) vaultManager(token string) {
|
|
// Helper for stopping token renewal
|
|
stopRenewal := func() {
|
|
if err := r.vaultClient.StopRenewToken(r.vaultFuture.Get()); err != nil {
|
|
r.logger.Printf("[WARN] client: failed to stop token renewal for task %v in alloc %q: %v", r.task.Name, r.alloc.ID, err)
|
|
}
|
|
}
|
|
|
|
// updatedToken lets us store state between loops. If true, a new token
|
|
// has been retrieved and we need to apply the Vault change mode
|
|
var updatedToken bool
|
|
|
|
OUTER:
|
|
for {
|
|
// Check if we should exit
|
|
select {
|
|
case <-r.waitCh:
|
|
stopRenewal()
|
|
return
|
|
default:
|
|
}
|
|
|
|
// Clear the token
|
|
r.vaultFuture.Clear()
|
|
|
|
// Check if there already is a token which can be the case for
|
|
// restoring the TaskRunner
|
|
if token == "" {
|
|
// Get a token
|
|
var exit bool
|
|
token, exit = r.deriveVaultToken()
|
|
if exit {
|
|
// Exit the manager
|
|
return
|
|
}
|
|
|
|
// Write the token to disk
|
|
if err := r.writeToken(token); err != nil {
|
|
e := fmt.Errorf("failed to write Vault token to disk")
|
|
r.logger.Printf("[ERR] client: %v for task %v on alloc %q: %v", e, r.task.Name, r.alloc.ID, err)
|
|
r.Kill("vault", e.Error(), true)
|
|
return
|
|
}
|
|
}
|
|
|
|
// Start the renewal process
|
|
renewCh, err := r.vaultClient.RenewToken(token, 30)
|
|
|
|
// An error returned means the token is not being renewed
|
|
if err != nil {
|
|
r.logger.Printf("[ERR] client: failed to start renewal of Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
|
|
token = ""
|
|
goto OUTER
|
|
}
|
|
|
|
// The Vault token is valid now, so set it
|
|
r.vaultFuture.Set(token)
|
|
|
|
if updatedToken {
|
|
switch r.task.Vault.ChangeMode {
|
|
case structs.VaultChangeModeSignal:
|
|
s, err := signals.Parse(r.task.Vault.ChangeSignal)
|
|
if err != nil {
|
|
e := fmt.Errorf("failed to parse signal: %v", err)
|
|
r.logger.Printf("[ERR] client: %v", err)
|
|
r.Kill("vault", e.Error(), true)
|
|
return
|
|
}
|
|
|
|
if err := r.Signal("vault", "new Vault token acquired", s); err != nil {
|
|
r.logger.Printf("[ERR] client: failed to send signal to task %v for alloc %q: %v", r.task.Name, r.alloc.ID, err)
|
|
r.Kill("vault", fmt.Sprintf("failed to send signal to task: %v", err), true)
|
|
return
|
|
}
|
|
case structs.VaultChangeModeRestart:
|
|
const noFailure = false
|
|
r.Restart("vault", "new Vault token acquired", noFailure)
|
|
case structs.VaultChangeModeNoop:
|
|
fallthrough
|
|
default:
|
|
r.logger.Printf("[ERR] client: Invalid Vault change mode: %q", r.task.Vault.ChangeMode)
|
|
}
|
|
|
|
// We have handled it
|
|
updatedToken = false
|
|
|
|
// Call the handler
|
|
r.updatedTokenHandler()
|
|
}
|
|
|
|
// Start watching for renewal errors
|
|
select {
|
|
case err := <-renewCh:
|
|
// Clear the token
|
|
token = ""
|
|
r.logger.Printf("[ERR] client: failed to renew Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
|
|
stopRenewal()
|
|
|
|
// Check if we have to do anything
|
|
if r.task.Vault.ChangeMode != structs.VaultChangeModeNoop {
|
|
updatedToken = true
|
|
}
|
|
case <-r.waitCh:
|
|
stopRenewal()
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// deriveVaultToken derives the Vault token using exponential backoffs. It
|
|
// returns the Vault token and whether the manager should exit.
|
|
func (r *TaskRunner) deriveVaultToken() (token string, exit bool) {
|
|
attempts := 0
|
|
for {
|
|
tokens, err := r.vaultClient.DeriveToken(r.alloc, []string{r.task.Name})
|
|
if err == nil {
|
|
return tokens[r.task.Name], false
|
|
}
|
|
|
|
// Check if this is a server side error
|
|
if structs.IsServerSide(err) {
|
|
r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v",
|
|
r.task.Name, r.alloc.ID, err)
|
|
r.Kill("vault", fmt.Sprintf("server error deriving vault token: %v", err), true)
|
|
return "", true
|
|
}
|
|
// Check if we can't recover from the error
|
|
if !structs.IsRecoverable(err) {
|
|
r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v",
|
|
r.task.Name, r.alloc.ID, err)
|
|
r.Kill("vault", fmt.Sprintf("failed to derive token: %v", err), true)
|
|
return "", true
|
|
}
|
|
|
|
// Handle the retry case
|
|
backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline
|
|
if backoff > vaultBackoffLimit {
|
|
backoff = vaultBackoffLimit
|
|
}
|
|
r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v; retrying in %v",
|
|
r.task.Name, r.alloc.ID, err, backoff)
|
|
|
|
attempts++
|
|
|
|
// Wait till retrying
|
|
select {
|
|
case <-r.waitCh:
|
|
return "", true
|
|
case <-time.After(backoff):
|
|
}
|
|
}
|
|
}
|
|
|
|
// writeToken writes the given token to disk
|
|
func (r *TaskRunner) writeToken(token string) error {
|
|
tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
|
|
if err := ioutil.WriteFile(tokenPath, []byte(token), 0777); err != nil {
|
|
return fmt.Errorf("failed to save Vault tokens to secret dir for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// updatedTokenHandler is called when a new Vault token is retrieved. Things
|
|
// that rely on the token should be updated here.
|
|
func (r *TaskRunner) updatedTokenHandler() {
|
|
|
|
// Update the tasks environment
|
|
r.envBuilder.SetVaultToken(r.vaultFuture.Get(), r.task.Vault.Env)
|
|
|
|
if r.templateManager != nil {
|
|
r.templateManager.Stop()
|
|
|
|
// Create a new templateManager
|
|
var err error
|
|
r.templateManager, err = NewTaskTemplateManager(&TaskTemplateManagerConfig{
|
|
Hooks: r,
|
|
Templates: r.task.Templates,
|
|
ClientConfig: r.config,
|
|
VaultToken: r.vaultFuture.Get(),
|
|
TaskDir: r.taskDir.Dir,
|
|
EnvBuilder: r.envBuilder,
|
|
MaxTemplateEventRate: DefaultMaxTemplateEventRate,
|
|
})
|
|
|
|
if err != nil {
|
|
err := fmt.Errorf("failed to build task's template manager: %v", err)
|
|
r.setState(structs.TaskStateDead,
|
|
structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
|
|
false)
|
|
r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
|
|
r.Kill("vault", err.Error(), true)
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// prestart handles life-cycle tasks that occur before the task has started.
|
|
// Since it's run asynchronously with the main Run() loop the alloc & task are
|
|
// passed in to avoid racing with updates.
|
|
func (r *TaskRunner) prestart(alloc *structs.Allocation, task *structs.Task, resultCh chan bool) {
|
|
if task.Vault != nil {
|
|
// Wait for the token
|
|
r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", task.Name, alloc.ID)
|
|
tokenCh := r.vaultFuture.Wait()
|
|
select {
|
|
case <-tokenCh:
|
|
case <-r.waitCh:
|
|
resultCh <- false
|
|
return
|
|
}
|
|
r.logger.Printf("[DEBUG] client: retrieved Vault token for task %v in alloc %q", task.Name, alloc.ID)
|
|
r.envBuilder.SetVaultToken(r.vaultFuture.Get(), task.Vault.Env)
|
|
}
|
|
|
|
// If the job is a dispatch job and there is a payload write it to disk
|
|
requirePayload := len(alloc.Job.Payload) != 0 &&
|
|
(r.task.DispatchPayload != nil && r.task.DispatchPayload.File != "")
|
|
if !r.payloadRendered && requirePayload {
|
|
renderTo := filepath.Join(r.taskDir.LocalDir, task.DispatchPayload.File)
|
|
decoded, err := snappy.Decode(nil, alloc.Job.Payload)
|
|
if err != nil {
|
|
r.setState(
|
|
structs.TaskStateDead,
|
|
structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
|
|
false)
|
|
resultCh <- false
|
|
return
|
|
}
|
|
|
|
if err := os.MkdirAll(filepath.Dir(renderTo), 07777); err != nil {
|
|
r.setState(
|
|
structs.TaskStateDead,
|
|
structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
|
|
false)
|
|
resultCh <- false
|
|
return
|
|
}
|
|
|
|
if err := ioutil.WriteFile(renderTo, decoded, 0777); err != nil {
|
|
r.setState(
|
|
structs.TaskStateDead,
|
|
structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
|
|
false)
|
|
resultCh <- false
|
|
return
|
|
}
|
|
|
|
r.payloadRendered = true
|
|
}
|
|
|
|
for {
|
|
r.persistLock.Lock()
|
|
downloaded := r.artifactsDownloaded
|
|
r.persistLock.Unlock()
|
|
|
|
// Download the task's artifacts
|
|
if !downloaded && len(task.Artifacts) > 0 {
|
|
r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts), false)
|
|
taskEnv := r.envBuilder.Build()
|
|
for _, artifact := range task.Artifacts {
|
|
if err := getter.GetArtifact(taskEnv, artifact, r.taskDir.Dir); err != nil {
|
|
wrapped := fmt.Errorf("failed to download artifact %q: %v", artifact.GetterSource, err)
|
|
r.logger.Printf("[DEBUG] client: %v", wrapped)
|
|
r.setState(structs.TaskStatePending,
|
|
structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(wrapped), false)
|
|
r.restartTracker.SetStartError(structs.WrapRecoverable(wrapped.Error(), err))
|
|
goto RESTART
|
|
}
|
|
}
|
|
|
|
r.persistLock.Lock()
|
|
r.artifactsDownloaded = true
|
|
r.persistLock.Unlock()
|
|
}
|
|
|
|
// We don't have to wait for any template
|
|
if len(task.Templates) == 0 {
|
|
// Send the start signal
|
|
select {
|
|
case r.startCh <- struct{}{}:
|
|
default:
|
|
}
|
|
|
|
resultCh <- true
|
|
return
|
|
}
|
|
|
|
// Build the template manager
|
|
if r.templateManager == nil {
|
|
var err error
|
|
r.templateManager, err = NewTaskTemplateManager(&TaskTemplateManagerConfig{
|
|
Hooks: r,
|
|
Templates: r.task.Templates,
|
|
ClientConfig: r.config,
|
|
VaultToken: r.vaultFuture.Get(),
|
|
TaskDir: r.taskDir.Dir,
|
|
EnvBuilder: r.envBuilder,
|
|
MaxTemplateEventRate: DefaultMaxTemplateEventRate,
|
|
})
|
|
if err != nil {
|
|
err := fmt.Errorf("failed to build task's template manager: %v", err)
|
|
r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(), false)
|
|
r.logger.Printf("[ERR] client: alloc %q, task %q %v", alloc.ID, task.Name, err)
|
|
resultCh <- false
|
|
return
|
|
}
|
|
}
|
|
|
|
// Block for consul-template
|
|
// TODO Hooks should register themselves as blocking and then we can
|
|
// periodically enumerate what we are still blocked on
|
|
select {
|
|
case <-r.unblockCh:
|
|
// Send the start signal
|
|
select {
|
|
case r.startCh <- struct{}{}:
|
|
default:
|
|
}
|
|
|
|
resultCh <- true
|
|
return
|
|
case <-r.waitCh:
|
|
// The run loop has exited so exit too
|
|
resultCh <- false
|
|
return
|
|
}
|
|
|
|
RESTART:
|
|
restart := r.shouldRestart()
|
|
if !restart {
|
|
resultCh <- false
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// postrun is used to do any cleanup that is necessary after exiting the runloop
|
|
func (r *TaskRunner) postrun() {
|
|
// Stop the template manager
|
|
if r.templateManager != nil {
|
|
r.templateManager.Stop()
|
|
}
|
|
}
|
|
|
|
// run is the main run loop that handles starting the application, destroying
|
|
// it, restarts and signals.
|
|
func (r *TaskRunner) run() {
|
|
// Predeclare things so we can jump to the RESTART
|
|
var stopCollection chan struct{}
|
|
var handleWaitCh chan *dstructs.WaitResult
|
|
|
|
// If we already have a handle, populate the stopCollection and handleWaitCh
|
|
// to fix the invariant that it exists.
|
|
handleEmpty := r.getHandle() == nil
|
|
|
|
if !handleEmpty {
|
|
stopCollection = make(chan struct{})
|
|
go r.collectResourceUsageStats(stopCollection)
|
|
handleWaitCh = r.handle.WaitCh()
|
|
}
|
|
|
|
for {
|
|
// Do the prestart activities
|
|
prestartResultCh := make(chan bool, 1)
|
|
go r.prestart(r.alloc, r.task, prestartResultCh)
|
|
|
|
WAIT:
|
|
for {
|
|
select {
|
|
case success := <-prestartResultCh:
|
|
if !success {
|
|
r.cleanup()
|
|
r.setState(structs.TaskStateDead, nil, false)
|
|
return
|
|
}
|
|
case <-r.startCh:
|
|
// Start the task if not yet started or it is being forced. This logic
|
|
// is necessary because in the case of a restore the handle already
|
|
// exists.
|
|
handleEmpty := r.getHandle() == nil
|
|
if handleEmpty {
|
|
startErr := r.startTask()
|
|
r.restartTracker.SetStartError(startErr)
|
|
if startErr != nil {
|
|
r.setState("", structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr), true)
|
|
goto RESTART
|
|
}
|
|
|
|
// Mark the task as started
|
|
r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted), false)
|
|
r.runningLock.Lock()
|
|
r.running = true
|
|
r.runningLock.Unlock()
|
|
|
|
if stopCollection == nil {
|
|
stopCollection = make(chan struct{})
|
|
go r.collectResourceUsageStats(stopCollection)
|
|
}
|
|
|
|
handleWaitCh = r.handle.WaitCh()
|
|
}
|
|
|
|
case waitRes := <-handleWaitCh:
|
|
if waitRes == nil {
|
|
panic("nil wait")
|
|
}
|
|
|
|
r.runningLock.Lock()
|
|
r.running = false
|
|
r.runningLock.Unlock()
|
|
|
|
// Stop collection of the task's resource usage
|
|
close(stopCollection)
|
|
|
|
// Log whether the task was successful or not.
|
|
r.restartTracker.SetWaitResult(waitRes)
|
|
r.setState("", r.waitErrorToEvent(waitRes), true)
|
|
if !waitRes.Successful() {
|
|
r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes)
|
|
} else {
|
|
r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID)
|
|
}
|
|
|
|
break WAIT
|
|
case update := <-r.updateCh:
|
|
if err := r.handleUpdate(update); err != nil {
|
|
r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err)
|
|
}
|
|
|
|
case se := <-r.signalCh:
|
|
r.runningLock.Lock()
|
|
running := r.running
|
|
r.runningLock.Unlock()
|
|
common := fmt.Sprintf("signal %v to task %v for alloc %q", se.s, r.task.Name, r.alloc.ID)
|
|
if !running {
|
|
// Send no error
|
|
r.logger.Printf("[DEBUG] client: skipping %s", common)
|
|
se.result <- nil
|
|
continue
|
|
}
|
|
|
|
r.logger.Printf("[DEBUG] client: sending %s", common)
|
|
r.setState(structs.TaskStateRunning, se.e, false)
|
|
|
|
res := r.handle.Signal(se.s)
|
|
se.result <- res
|
|
|
|
case restartEvent := <-r.restartCh:
|
|
r.runningLock.Lock()
|
|
running := r.running
|
|
r.runningLock.Unlock()
|
|
common := fmt.Sprintf("task %v for alloc %q", r.task.Name, r.alloc.ID)
|
|
if !running {
|
|
r.logger.Printf("[DEBUG] client: skipping restart of %v: task isn't running", common)
|
|
continue
|
|
}
|
|
|
|
r.logger.Printf("[DEBUG] client: restarting %s: %v", common, restartEvent.taskEvent.RestartReason)
|
|
r.setState(structs.TaskStateRunning, restartEvent.taskEvent, false)
|
|
r.killTask(nil)
|
|
|
|
close(stopCollection)
|
|
|
|
if handleWaitCh != nil {
|
|
<-handleWaitCh
|
|
}
|
|
|
|
r.restartTracker.SetRestartTriggered(restartEvent.failure)
|
|
break WAIT
|
|
|
|
case <-r.destroyCh:
|
|
r.runningLock.Lock()
|
|
running := r.running
|
|
r.runningLock.Unlock()
|
|
if !running {
|
|
r.cleanup()
|
|
r.setState(structs.TaskStateDead, r.destroyEvent, false)
|
|
return
|
|
}
|
|
|
|
// Remove from consul before killing the task so that traffic
|
|
// can be rerouted
|
|
interpTask := interpolateServices(r.envBuilder.Build(), r.task)
|
|
r.consul.RemoveTask(r.alloc.ID, interpTask)
|
|
|
|
// Delay actually killing the task if configured. See #244
|
|
if r.task.ShutdownDelay > 0 {
|
|
r.logger.Printf("[DEBUG] client: delaying shutdown of alloc %q task %q for %q",
|
|
r.alloc.ID, r.task.Name, r.task.ShutdownDelay)
|
|
<-time.After(r.task.ShutdownDelay)
|
|
}
|
|
|
|
// Store the task event that provides context on the task
|
|
// destroy. The Killed event is set from the alloc_runner and
|
|
// doesn't add detail
|
|
var killEvent *structs.TaskEvent
|
|
if r.destroyEvent.Type != structs.TaskKilled {
|
|
if r.destroyEvent.Type == structs.TaskKilling {
|
|
killEvent = r.destroyEvent
|
|
} else {
|
|
r.setState(structs.TaskStateRunning, r.destroyEvent, false)
|
|
}
|
|
}
|
|
|
|
r.killTask(killEvent)
|
|
close(stopCollection)
|
|
|
|
// Wait for handler to exit before calling cleanup
|
|
<-handleWaitCh
|
|
r.cleanup()
|
|
|
|
r.setState(structs.TaskStateDead, nil, false)
|
|
return
|
|
}
|
|
}
|
|
|
|
RESTART:
|
|
// shouldRestart will block if the task should restart after a delay.
|
|
restart := r.shouldRestart()
|
|
if !restart {
|
|
r.cleanup()
|
|
r.setState(structs.TaskStateDead, nil, false)
|
|
return
|
|
}
|
|
|
|
// Clear the handle so a new driver will be created.
|
|
r.handleLock.Lock()
|
|
r.handle = nil
|
|
handleWaitCh = nil
|
|
stopCollection = nil
|
|
r.handleLock.Unlock()
|
|
}
|
|
}
|
|
|
|
// cleanup removes Consul entries and calls Driver.Cleanup when a task is
|
|
// stopping. Errors are logged.
|
|
func (r *TaskRunner) cleanup() {
|
|
// Remove from Consul
|
|
interpTask := interpolateServices(r.envBuilder.Build(), r.task)
|
|
r.consul.RemoveTask(r.alloc.ID, interpTask)
|
|
|
|
drv, err := r.createDriver()
|
|
if err != nil {
|
|
r.logger.Printf("[ERR] client: error creating driver to cleanup resources: %v", err)
|
|
return
|
|
}
|
|
|
|
res := r.getCreatedResources()
|
|
|
|
ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
|
|
attempts := 1
|
|
var cleanupErr error
|
|
for retry := true; retry; attempts++ {
|
|
cleanupErr = drv.Cleanup(ctx, res)
|
|
retry = structs.IsRecoverable(cleanupErr)
|
|
|
|
// Copy current createdResources state in case SaveState is
|
|
// called between retries
|
|
r.setCreatedResources(res)
|
|
|
|
// Retry 3 times with sleeps between
|
|
if !retry || attempts > 3 {
|
|
break
|
|
}
|
|
time.Sleep(time.Duration(attempts) * time.Second)
|
|
}
|
|
|
|
if cleanupErr != nil {
|
|
r.logger.Printf("[ERR] client: error cleaning up resources for task %q after %d attempts: %v", r.task.Name, attempts, cleanupErr)
|
|
}
|
|
return
|
|
}
|
|
|
|
// shouldRestart returns if the task should restart. If the return value is
|
|
// true, the task's restart policy has already been considered and any wait time
|
|
// between restarts has been applied.
|
|
func (r *TaskRunner) shouldRestart() bool {
|
|
state, when := r.restartTracker.GetState()
|
|
reason := r.restartTracker.GetReason()
|
|
switch state {
|
|
case structs.TaskNotRestarting, structs.TaskTerminated:
|
|
r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID)
|
|
if state == structs.TaskNotRestarting {
|
|
r.setState(structs.TaskStateDead,
|
|
structs.NewTaskEvent(structs.TaskNotRestarting).
|
|
SetRestartReason(reason).SetFailsTask(),
|
|
false)
|
|
}
|
|
return false
|
|
case structs.TaskRestarting:
|
|
r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when)
|
|
r.setState(structs.TaskStatePending,
|
|
structs.NewTaskEvent(structs.TaskRestarting).
|
|
SetRestartDelay(when).
|
|
SetRestartReason(reason),
|
|
false)
|
|
default:
|
|
r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state)
|
|
return false
|
|
}
|
|
|
|
// Unregister from Consul while waiting to restart.
|
|
interpTask := interpolateServices(r.envBuilder.Build(), r.task)
|
|
r.consul.RemoveTask(r.alloc.ID, interpTask)
|
|
|
|
// Sleep but watch for destroy events.
|
|
select {
|
|
case <-time.After(when):
|
|
case <-r.destroyCh:
|
|
}
|
|
|
|
// Destroyed while we were waiting to restart, so abort.
|
|
r.destroyLock.Lock()
|
|
destroyed := r.destroy
|
|
r.destroyLock.Unlock()
|
|
if destroyed {
|
|
r.logger.Printf("[DEBUG] client: Not restarting task: %v because it has been destroyed", r.task.Name)
|
|
r.setState(structs.TaskStateDead, r.destroyEvent, false)
|
|
return false
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// killTask kills the running task. A killing event can optionally be passed and
|
|
// this event is used to mark the task as being killed. It provides a means to
|
|
// store extra information.
|
|
func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) {
|
|
r.runningLock.Lock()
|
|
running := r.running
|
|
r.runningLock.Unlock()
|
|
if !running {
|
|
return
|
|
}
|
|
|
|
// Get the kill timeout
|
|
timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout)
|
|
|
|
// Build the event
|
|
var event *structs.TaskEvent
|
|
if killingEvent != nil {
|
|
event = killingEvent
|
|
event.Type = structs.TaskKilling
|
|
} else {
|
|
event = structs.NewTaskEvent(structs.TaskKilling)
|
|
}
|
|
event.SetKillTimeout(timeout)
|
|
|
|
// Mark that we received the kill event
|
|
r.setState(structs.TaskStateRunning, event, false)
|
|
|
|
handle := r.getHandle()
|
|
|
|
// Kill the task using an exponential backoff in-case of failures.
|
|
destroySuccess, err := r.handleDestroy(handle)
|
|
if !destroySuccess {
|
|
// We couldn't successfully destroy the resource created.
|
|
r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err)
|
|
}
|
|
|
|
r.runningLock.Lock()
|
|
r.running = false
|
|
r.runningLock.Unlock()
|
|
|
|
// Store that the task has been destroyed and any associated error.
|
|
r.setState("", structs.NewTaskEvent(structs.TaskKilled).SetKillError(err), true)
|
|
}
|
|
|
|
// startTask creates the driver, task dir, and starts the task.
|
|
func (r *TaskRunner) startTask() error {
|
|
// Create a driver
|
|
drv, err := r.createDriver()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create driver of task %q for alloc %q: %v",
|
|
r.task.Name, r.alloc.ID, err)
|
|
}
|
|
|
|
// Run prestart
|
|
ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
|
|
presp, err := drv.Prestart(ctx, r.task)
|
|
|
|
// Merge newly created resources into previously created resources
|
|
if presp != nil {
|
|
r.createdResourcesLock.Lock()
|
|
r.createdResources.Merge(presp.CreatedResources)
|
|
r.createdResourcesLock.Unlock()
|
|
|
|
// Set any network configuration returned by the driver
|
|
r.envBuilder.SetDriverNetwork(presp.Network)
|
|
}
|
|
|
|
if err != nil {
|
|
wrapped := fmt.Sprintf("failed to initialize task %q for alloc %q: %v",
|
|
r.task.Name, r.alloc.ID, err)
|
|
r.logger.Printf("[WARN] client: error from prestart: %s", wrapped)
|
|
return structs.WrapRecoverable(wrapped, err)
|
|
}
|
|
|
|
// Create a new context for Start since the environment may have been updated.
|
|
ctx = driver.NewExecContext(r.taskDir, r.envBuilder.Build())
|
|
|
|
// Start the job
|
|
sresp, err := drv.Start(ctx, r.task)
|
|
if err != nil {
|
|
wrapped := fmt.Sprintf("failed to start task %q for alloc %q: %v",
|
|
r.task.Name, r.alloc.ID, err)
|
|
r.logger.Printf("[WARN] client: %s", wrapped)
|
|
return structs.WrapRecoverable(wrapped, err)
|
|
|
|
}
|
|
|
|
// Log driver network information
|
|
if sresp.Network != nil && sresp.Network.IP != "" {
|
|
if sresp.Network.AutoAdvertise {
|
|
r.logger.Printf("[INFO] client: alloc %s task %s auto-advertising detected IP %s",
|
|
r.alloc.ID, r.task.Name, sresp.Network.IP)
|
|
} else {
|
|
r.logger.Printf("[TRACE] client: alloc %s task %s detected IP %s but not auto-advertising",
|
|
r.alloc.ID, r.task.Name, sresp.Network.IP)
|
|
}
|
|
}
|
|
|
|
if sresp.Network == nil || sresp.Network.IP == "" {
|
|
r.logger.Printf("[TRACE] client: alloc %s task %s could not detect a driver IP", r.alloc.ID, r.task.Name)
|
|
}
|
|
|
|
// Update environment with the network defined by the driver's Start method.
|
|
r.envBuilder.SetDriverNetwork(sresp.Network)
|
|
|
|
if err := r.registerServices(drv, sresp.Handle, sresp.Network); err != nil {
|
|
// All IO is done asynchronously, so errors from registering
|
|
// services are hard failures.
|
|
r.logger.Printf("[ERR] client: failed to register services and checks for task %q alloc %q: %v", r.task.Name, r.alloc.ID, err)
|
|
|
|
// Kill the started task
|
|
if destroyed, err := r.handleDestroy(sresp.Handle); !destroyed {
|
|
r.logger.Printf("[ERR] client: failed to kill task %q alloc %q. Resources may be leaked: %v",
|
|
r.task.Name, r.alloc.ID, err)
|
|
}
|
|
return structs.NewRecoverableError(err, false)
|
|
}
|
|
|
|
r.handleLock.Lock()
|
|
r.handle = sresp.Handle
|
|
r.handleLock.Unlock()
|
|
|
|
// Need to persist the driver network between restarts
|
|
r.driverNetLock.Lock()
|
|
r.driverNet = sresp.Network
|
|
r.driverNetLock.Unlock()
|
|
|
|
return nil
|
|
}
|
|
|
|
// registerServices and checks with Consul.
|
|
func (r *TaskRunner) registerServices(d driver.Driver, h driver.DriverHandle, n *cstructs.DriverNetwork) error {
|
|
var exec driver.ScriptExecutor
|
|
if d.Abilities().Exec {
|
|
// Allow set the script executor if the driver supports it
|
|
exec = h
|
|
}
|
|
interpolatedTask := interpolateServices(r.envBuilder.Build(), r.task)
|
|
return r.consul.RegisterTask(r.alloc.ID, interpolatedTask, r, exec, n)
|
|
}
|
|
|
|
// interpolateServices interpolates tags in a service and checks with values from the
|
|
// task's environment.
|
|
func interpolateServices(taskEnv *env.TaskEnv, task *structs.Task) *structs.Task {
|
|
taskCopy := task.Copy()
|
|
for _, service := range taskCopy.Services {
|
|
for _, check := range service.Checks {
|
|
check.Name = taskEnv.ReplaceEnv(check.Name)
|
|
check.Type = taskEnv.ReplaceEnv(check.Type)
|
|
check.Command = taskEnv.ReplaceEnv(check.Command)
|
|
check.Args = taskEnv.ParseAndReplace(check.Args)
|
|
check.Path = taskEnv.ReplaceEnv(check.Path)
|
|
check.Protocol = taskEnv.ReplaceEnv(check.Protocol)
|
|
check.PortLabel = taskEnv.ReplaceEnv(check.PortLabel)
|
|
check.InitialStatus = taskEnv.ReplaceEnv(check.InitialStatus)
|
|
check.Method = taskEnv.ReplaceEnv(check.Method)
|
|
if len(check.Header) > 0 {
|
|
header := make(map[string][]string, len(check.Header))
|
|
for k, vs := range check.Header {
|
|
newVals := make([]string, len(vs))
|
|
for i, v := range vs {
|
|
newVals[i] = taskEnv.ReplaceEnv(v)
|
|
}
|
|
header[taskEnv.ReplaceEnv(k)] = newVals
|
|
}
|
|
check.Header = header
|
|
}
|
|
}
|
|
service.Name = taskEnv.ReplaceEnv(service.Name)
|
|
service.PortLabel = taskEnv.ReplaceEnv(service.PortLabel)
|
|
service.Tags = taskEnv.ParseAndReplace(service.Tags)
|
|
}
|
|
return taskCopy
|
|
}
|
|
|
|
// buildTaskDir creates the task directory before driver.Prestart. It is safe
|
|
// to call multiple times as its state is persisted.
|
|
func (r *TaskRunner) buildTaskDir(fsi cstructs.FSIsolation) error {
|
|
r.persistLock.Lock()
|
|
built := r.taskDirBuilt
|
|
r.persistLock.Unlock()
|
|
|
|
// We do not set the state again since this only occurs during restoration
|
|
// and the task dir is already built. The reason we call Build again is to
|
|
// ensure that the task dir invariants are still held.
|
|
if !built {
|
|
r.setState(structs.TaskStatePending,
|
|
structs.NewTaskEvent(structs.TaskSetup).SetMessage(structs.TaskBuildingTaskDir),
|
|
false)
|
|
}
|
|
|
|
chroot := config.DefaultChrootEnv
|
|
if len(r.config.ChrootEnv) > 0 {
|
|
chroot = r.config.ChrootEnv
|
|
}
|
|
if err := r.taskDir.Build(built, chroot, fsi); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Mark task dir as successfully built
|
|
r.persistLock.Lock()
|
|
r.taskDirBuilt = true
|
|
r.persistLock.Unlock()
|
|
|
|
// Set path and host related env vars
|
|
driver.SetEnvvars(r.envBuilder, fsi, r.taskDir, r.config)
|
|
return nil
|
|
}
|
|
|
|
// collectResourceUsageStats starts collecting resource usage stats of a Task.
|
|
// Collection ends when the passed channel is closed
|
|
func (r *TaskRunner) collectResourceUsageStats(stopCollection <-chan struct{}) {
|
|
// start collecting the stats right away and then start collecting every
|
|
// collection interval
|
|
next := time.NewTimer(0)
|
|
defer next.Stop()
|
|
for {
|
|
select {
|
|
case <-next.C:
|
|
next.Reset(r.config.StatsCollectionInterval)
|
|
handle := r.getHandle()
|
|
if handle == nil {
|
|
continue
|
|
}
|
|
ru, err := handle.Stats()
|
|
|
|
if err != nil {
|
|
// Check if the driver doesn't implement stats
|
|
if err.Error() == driver.DriverStatsNotImplemented.Error() {
|
|
r.logger.Printf("[DEBUG] client: driver for task %q in allocation %q doesn't support stats", r.task.Name, r.alloc.ID)
|
|
return
|
|
}
|
|
|
|
// We do not log when the plugin is shutdown as this is simply a
|
|
// race between the stopCollection channel being closed and calling
|
|
// Stats on the handle.
|
|
if !strings.Contains(err.Error(), "connection is shut down") {
|
|
r.logger.Printf("[DEBUG] client: error fetching stats of task %v: %v", r.task.Name, err)
|
|
}
|
|
continue
|
|
}
|
|
|
|
r.resourceUsageLock.Lock()
|
|
r.resourceUsage = ru
|
|
r.resourceUsageLock.Unlock()
|
|
if ru != nil {
|
|
r.emitStats(ru)
|
|
}
|
|
case <-stopCollection:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// LatestResourceUsage returns the last resource utilization datapoint collected
|
|
func (r *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
|
|
r.resourceUsageLock.RLock()
|
|
defer r.resourceUsageLock.RUnlock()
|
|
r.runningLock.Lock()
|
|
defer r.runningLock.Unlock()
|
|
|
|
// If the task is not running there can be no latest resource
|
|
if !r.running {
|
|
return nil
|
|
}
|
|
|
|
return r.resourceUsage
|
|
}
|
|
|
|
// handleUpdate takes an updated allocation and updates internal state to
|
|
// reflect the new config for the task.
|
|
func (r *TaskRunner) handleUpdate(update *structs.Allocation) error {
|
|
// Extract the task group from the alloc.
|
|
tg := update.Job.LookupTaskGroup(update.TaskGroup)
|
|
if tg == nil {
|
|
return fmt.Errorf("alloc '%s' missing task group '%s'", update.ID, update.TaskGroup)
|
|
}
|
|
|
|
// Extract the task.
|
|
var updatedTask *structs.Task
|
|
for _, t := range tg.Tasks {
|
|
if t.Name == r.task.Name {
|
|
updatedTask = t.Copy()
|
|
break
|
|
}
|
|
}
|
|
if updatedTask == nil {
|
|
return fmt.Errorf("task group %q doesn't contain task %q", tg.Name, r.task.Name)
|
|
}
|
|
|
|
// Merge in the task resources
|
|
updatedTask.Resources = update.TaskResources[updatedTask.Name]
|
|
|
|
// Interpolate the old task with the old env before updating the env as
|
|
// updating services in Consul need both the old and new interpolations
|
|
// to find differences.
|
|
oldInterpolatedTask := interpolateServices(r.envBuilder.Build(), r.task)
|
|
|
|
// Now it's safe to update the environment
|
|
r.envBuilder.UpdateTask(update, updatedTask)
|
|
|
|
var mErr multierror.Error
|
|
r.handleLock.Lock()
|
|
if r.handle != nil {
|
|
drv, err := r.createDriver()
|
|
if err != nil {
|
|
// Something has really gone wrong; don't continue
|
|
r.handleLock.Unlock()
|
|
return fmt.Errorf("error accessing driver when updating task %q: %v", r.task.Name, err)
|
|
}
|
|
|
|
// Update will update resources and store the new kill timeout.
|
|
if err := r.handle.Update(updatedTask); err != nil {
|
|
mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err))
|
|
}
|
|
|
|
// Update services in Consul
|
|
newInterpolatedTask := interpolateServices(r.envBuilder.Build(), updatedTask)
|
|
if err := r.updateServices(drv, r.handle, oldInterpolatedTask, newInterpolatedTask); err != nil {
|
|
mErr.Errors = append(mErr.Errors, fmt.Errorf("error updating services and checks in Consul: %v", err))
|
|
}
|
|
}
|
|
r.handleLock.Unlock()
|
|
|
|
// Update the restart policy.
|
|
if r.restartTracker != nil {
|
|
r.restartTracker.SetPolicy(tg.RestartPolicy)
|
|
}
|
|
|
|
// Store the updated alloc.
|
|
r.alloc = update
|
|
r.task = updatedTask
|
|
return mErr.ErrorOrNil()
|
|
}
|
|
|
|
// updateServices and checks with Consul. Tasks must be interpolated!
|
|
func (r *TaskRunner) updateServices(d driver.Driver, h driver.ScriptExecutor, oldTask, newTask *structs.Task) error {
|
|
var exec driver.ScriptExecutor
|
|
if d.Abilities().Exec {
|
|
// Allow set the script executor if the driver supports it
|
|
exec = h
|
|
}
|
|
r.driverNetLock.Lock()
|
|
net := r.driverNet.Copy()
|
|
r.driverNetLock.Unlock()
|
|
return r.consul.UpdateTask(r.alloc.ID, oldTask, newTask, r, exec, net)
|
|
}
|
|
|
|
// handleDestroy kills the task handle. In the case that killing fails,
|
|
// handleDestroy will retry with an exponential backoff and will give up at a
|
|
// given limit. It returns whether the task was destroyed and the error
|
|
// associated with the last kill attempt.
|
|
func (r *TaskRunner) handleDestroy(handle driver.DriverHandle) (destroyed bool, err error) {
|
|
// Cap the number of times we attempt to kill the task.
|
|
for i := 0; i < killFailureLimit; i++ {
|
|
if err = handle.Kill(); err != nil {
|
|
// Calculate the new backoff
|
|
backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
|
|
if backoff > killBackoffLimit {
|
|
backoff = killBackoffLimit
|
|
}
|
|
|
|
r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc %q. Retrying in %v: %v",
|
|
r.task.Name, r.alloc.ID, backoff, err)
|
|
time.Sleep(backoff)
|
|
} else {
|
|
// Kill was successful
|
|
return true, nil
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
// Restart will restart the task.
|
|
func (r *TaskRunner) Restart(source, reason string, failure bool) {
|
|
reasonStr := fmt.Sprintf("%s: %s", source, reason)
|
|
event := newTaskRestartEvent(reasonStr, failure)
|
|
|
|
select {
|
|
case r.restartCh <- event:
|
|
case <-r.waitCh:
|
|
}
|
|
}
|
|
|
|
// Signal will send a signal to the task
|
|
func (r *TaskRunner) Signal(source, reason string, s os.Signal) error {
|
|
|
|
reasonStr := fmt.Sprintf("%s: %s", source, reason)
|
|
event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetTaskSignalReason(reasonStr)
|
|
|
|
resCh := make(chan error)
|
|
se := SignalEvent{
|
|
s: s,
|
|
e: event,
|
|
result: resCh,
|
|
}
|
|
|
|
select {
|
|
case r.signalCh <- se:
|
|
case <-r.waitCh:
|
|
}
|
|
|
|
return <-resCh
|
|
}
|
|
|
|
// Kill will kill a task and store the error, no longer restarting the task. If
|
|
// fail is set, the task is marked as having failed.
|
|
func (r *TaskRunner) Kill(source, reason string, fail bool) {
|
|
reasonStr := fmt.Sprintf("%s: %s", source, reason)
|
|
event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr)
|
|
if fail {
|
|
event.SetFailsTask()
|
|
}
|
|
|
|
r.logger.Printf("[DEBUG] client: killing task %v for alloc %q: %v", r.task.Name, r.alloc.ID, reasonStr)
|
|
r.Destroy(event)
|
|
}
|
|
|
|
func (r *TaskRunner) EmitEvent(source, message string) {
|
|
event := structs.NewTaskEvent(source).
|
|
SetMessage(message)
|
|
r.setState("", event, false)
|
|
r.logger.Printf("[DEBUG] client: event from %q for task %q in alloc %q: %v",
|
|
source, r.task.Name, r.alloc.ID, message)
|
|
}
|
|
|
|
// UnblockStart unblocks the starting of the task. It currently assumes only
|
|
// consul-template will unblock
|
|
func (r *TaskRunner) UnblockStart(source string) {
|
|
r.unblockLock.Lock()
|
|
defer r.unblockLock.Unlock()
|
|
if r.unblocked {
|
|
return
|
|
}
|
|
|
|
r.logger.Printf("[DEBUG] client: unblocking task %v for alloc %q: %v", r.task.Name, r.alloc.ID, source)
|
|
r.unblocked = true
|
|
close(r.unblockCh)
|
|
}
|
|
|
|
// Helper function for converting a WaitResult into a TaskTerminated event.
|
|
func (r *TaskRunner) waitErrorToEvent(res *dstructs.WaitResult) *structs.TaskEvent {
|
|
return structs.NewTaskEvent(structs.TaskTerminated).
|
|
SetExitCode(res.ExitCode).
|
|
SetSignal(res.Signal).
|
|
SetExitMessage(res.Err)
|
|
}
|
|
|
|
// Update is used to update the task of the context
|
|
func (r *TaskRunner) Update(update *structs.Allocation) {
|
|
select {
|
|
case r.updateCh <- update:
|
|
default:
|
|
r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')",
|
|
r.task.Name, r.alloc.ID)
|
|
}
|
|
}
|
|
|
|
// Destroy is used to indicate that the task context should be destroyed. The
|
|
// event parameter provides a context for the destroy.
|
|
func (r *TaskRunner) Destroy(event *structs.TaskEvent) {
|
|
r.destroyLock.Lock()
|
|
defer r.destroyLock.Unlock()
|
|
|
|
if r.destroy {
|
|
return
|
|
}
|
|
r.destroy = true
|
|
r.destroyEvent = event
|
|
close(r.destroyCh)
|
|
}
|
|
|
|
// getCreatedResources returns the resources created by drivers. It will never
|
|
// return nil.
|
|
func (r *TaskRunner) getCreatedResources() *driver.CreatedResources {
|
|
r.createdResourcesLock.Lock()
|
|
if r.createdResources == nil {
|
|
r.createdResources = driver.NewCreatedResources()
|
|
}
|
|
cr := r.createdResources.Copy()
|
|
r.createdResourcesLock.Unlock()
|
|
|
|
return cr
|
|
}
|
|
|
|
// setCreatedResources updates the resources created by drivers. If passed nil
|
|
// it will set createdResources to an initialized struct.
|
|
func (r *TaskRunner) setCreatedResources(cr *driver.CreatedResources) {
|
|
if cr == nil {
|
|
cr = driver.NewCreatedResources()
|
|
}
|
|
r.createdResourcesLock.Lock()
|
|
r.createdResources = cr.Copy()
|
|
r.createdResourcesLock.Unlock()
|
|
}
|
|
|
|
func (r *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) {
|
|
if !r.config.DisableTaggedMetrics {
|
|
metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
|
|
float32(ru.ResourceUsage.MemoryStats.RSS), r.baseLabels)
|
|
metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
|
|
float32(ru.ResourceUsage.MemoryStats.RSS), r.baseLabels)
|
|
metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "cache"},
|
|
float32(ru.ResourceUsage.MemoryStats.Cache), r.baseLabels)
|
|
metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "swap"},
|
|
float32(ru.ResourceUsage.MemoryStats.Swap), r.baseLabels)
|
|
metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "max_usage"},
|
|
float32(ru.ResourceUsage.MemoryStats.MaxUsage), r.baseLabels)
|
|
metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_usage"},
|
|
float32(ru.ResourceUsage.MemoryStats.KernelUsage), r.baseLabels)
|
|
metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"},
|
|
float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), r.baseLabels)
|
|
}
|
|
|
|
if r.config.BackwardsCompatibleMetrics {
|
|
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
|
|
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
|
|
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
|
|
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
|
|
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
|
|
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
|
|
}
|
|
}
|
|
|
|
func (r *TaskRunner) setGaugeForCPU(ru *cstructs.TaskResourceUsage) {
|
|
if !r.config.DisableTaggedMetrics {
|
|
metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_percent"},
|
|
float32(ru.ResourceUsage.CpuStats.Percent), r.baseLabels)
|
|
metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "system"},
|
|
float32(ru.ResourceUsage.CpuStats.SystemMode), r.baseLabels)
|
|
metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "user"},
|
|
float32(ru.ResourceUsage.CpuStats.UserMode), r.baseLabels)
|
|
metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_time"},
|
|
float32(ru.ResourceUsage.CpuStats.ThrottledTime), r.baseLabels)
|
|
metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_periods"},
|
|
float32(ru.ResourceUsage.CpuStats.ThrottledPeriods), r.baseLabels)
|
|
metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_ticks"},
|
|
float32(ru.ResourceUsage.CpuStats.TotalTicks), r.baseLabels)
|
|
}
|
|
|
|
if r.config.BackwardsCompatibleMetrics {
|
|
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
|
|
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
|
|
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
|
|
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
|
|
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
|
|
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
|
|
}
|
|
}
|
|
|
|
// emitStats emits resource usage stats of tasks to remote metrics collector
|
|
// sinks
|
|
func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
|
|
if !r.config.PublishAllocationMetrics {
|
|
return
|
|
}
|
|
|
|
// If the task is not running don't emit anything
|
|
r.runningLock.Lock()
|
|
running := r.running
|
|
r.runningLock.Unlock()
|
|
if !running {
|
|
return
|
|
}
|
|
|
|
if ru.ResourceUsage.MemoryStats != nil {
|
|
r.setGaugeForMemory(ru)
|
|
}
|
|
|
|
if ru.ResourceUsage.CpuStats != nil {
|
|
r.setGaugeForCPU(ru)
|
|
}
|
|
}
|