0e6c564406
Fixes #2289. Unfortunately this took more RecoverableError hijinx than I would have liked. There might be a better way.
223 lines
6.2 KiB
Go
package client

import (
	"fmt"
	"math/rand"
	"sync"
	"time"

	dstructs "github.com/hashicorp/nomad/client/driver/structs"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// jitter is the fraction of jitter added to restart delays.
	jitter = 0.25

	ReasonNoRestartsAllowed   = "Policy allows no restarts"
	ReasonUnrecoverableErrror = "Error was unrecoverable"
	ReasonWithinPolicy        = "Restart within policy"
	ReasonDelay               = "Exceeded allowed attempts, applying a delay"
)

// newRestartTracker returns a tracker for the given restart policy. Batch jobs
// do not restart on successful exit.
func newRestartTracker(policy *structs.RestartPolicy, jobType string) *RestartTracker {
	onSuccess := true
	if jobType == structs.JobTypeBatch {
		onSuccess = false
	}
	return &RestartTracker{
		startTime: time.Now(),
		onSuccess: onSuccess,
		policy:    policy,
		rand:      rand.New(rand.NewSource(time.Now().Unix())),
	}
}
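
// Illustrative usage (editor's sketch, not part of the original file): the
// tracker is typically constructed from a task group's restart policy and the
// job type, e.g.
//
//	tracker := newRestartTracker(taskGroup.RestartPolicy, job.Type)
//
// where taskGroup and job are assumed to describe the allocation being run.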

// RestartTracker computes a task's next state from its most recent start
// error or wait result, counting restart attempts within the policy interval.
type RestartTracker struct {
	waitRes          *dstructs.WaitResult
	startErr         error
	restartTriggered bool      // Whether the task has been signalled to be restarted
	count            int       // Current number of attempts.
	onSuccess        bool      // Whether to restart on successful exit code.
	startTime        time.Time // When the interval began
	reason           string    // The reason for the last state
	policy           *structs.RestartPolicy
	rand             *rand.Rand
	lock             sync.Mutex
}

// SetPolicy updates the policy used to determine restarts.
func (r *RestartTracker) SetPolicy(policy *structs.RestartPolicy) {
	r.lock.Lock()
	defer r.lock.Unlock()
	r.policy = policy
}

// SetStartError is used to mark the most recent start error. If starting was
// successful the error should be nil.
func (r *RestartTracker) SetStartError(err error) *RestartTracker {
	r.lock.Lock()
	defer r.lock.Unlock()
	r.startErr = err
	return r
}

// SetWaitResult is used to mark the most recent wait result.
func (r *RestartTracker) SetWaitResult(res *dstructs.WaitResult) *RestartTracker {
	r.lock.Lock()
	defer r.lock.Unlock()
	r.waitRes = res
	return r
}

// SetRestartTriggered is used to mark that the task has been signalled to be
// restarted.
func (r *RestartTracker) SetRestartTriggered() *RestartTracker {
	r.lock.Lock()
	defer r.lock.Unlock()
	r.restartTriggered = true
	return r
}

// GetReason returns a human-readable description for the last state returned
// by GetState.
func (r *RestartTracker) GetReason() string {
	r.lock.Lock()
	defer r.lock.Unlock()
	return r.reason
}

// GetState returns the task's next state given the set exit code and start
// error. One of the following states is returned:
// * TaskRestarting - Task should be restarted
// * TaskNotRestarting - Task should not be restarted and has exceeded its
//   restart policy.
// * TaskTerminated - Task has terminated successfully and does not need a
//   restart.
//
// If TaskRestarting is returned, the duration is how long to wait until
// starting the task again.
func (r *RestartTracker) GetState() (string, time.Duration) {
	r.lock.Lock()
	defer r.lock.Unlock()

	// Clear out the existing state
	defer func() {
		r.startErr = nil
		r.waitRes = nil
		r.restartTriggered = false
	}()

	// Hot path if a restart was triggered
	if r.restartTriggered {
		r.reason = ""
		return structs.TaskRestarting, 0
	}

	// Hot path if no attempts are expected
	if r.policy.Attempts == 0 {
		r.reason = ReasonNoRestartsAllowed
		if r.waitRes != nil && r.waitRes.Successful() {
			return structs.TaskTerminated, 0
		}

		return structs.TaskNotRestarting, 0
	}

	r.count++

	// Check if we have entered a new interval.
	end := r.startTime.Add(r.policy.Interval)
	now := time.Now()
	if now.After(end) {
		r.count = 0
		r.startTime = now
	}

	if r.startErr != nil {
		return r.handleStartError()
	} else if r.waitRes != nil {
		return r.handleWaitResult()
	}

	return "", 0
}
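
// Illustrative usage (editor's sketch, not part of the original file): a
// caller such as a task runner is assumed to record each run's outcome and
// then ask for the next state, e.g.
//
//	state, wait := tracker.SetWaitResult(res).GetState()
//	switch state {
//	case structs.TaskRestarting:
//		time.Sleep(wait) // wait out the computed delay, then start again
//	case structs.TaskNotRestarting, structs.TaskTerminated:
//		return // give up or finish cleanly
//	}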

// handleStartError returns the new state and potential wait duration for
// restarting the task after it was not successfully started. On start errors,
// the restart policy is always treated as fail mode to ensure we don't
// infinitely try to start a task.
func (r *RestartTracker) handleStartError() (string, time.Duration) {
	// If the error is not recoverable, do not restart.
	if !structs.IsRecoverable(r.startErr) {
		r.reason = ReasonUnrecoverableErrror
		return structs.TaskNotRestarting, 0
	}

	if r.count > r.policy.Attempts {
		if r.policy.Mode == structs.RestartPolicyModeFail {
			r.reason = fmt.Sprintf(
				`Exceeded allowed attempts %d in interval %v and mode is "fail"`,
				r.policy.Attempts, r.policy.Interval)
			return structs.TaskNotRestarting, 0
		} else {
			r.reason = ReasonDelay
			return structs.TaskRestarting, r.getDelay()
		}
	}

	r.reason = ReasonWithinPolicy
	return structs.TaskRestarting, r.jitter()
}
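
// Illustrative sketch (editor's addition, not part of the original file): a
// driver can steer start failures into this restart path by marking them
// recoverable. Assuming a structs.NewRecoverableError helper, a transient
// failure could be wrapped so it is retried within policy instead of hitting
// ReasonUnrecoverableErrror:
//
//	err := structs.NewRecoverableError(fmt.Errorf("image pull timed out"), true)
//	state, wait := tracker.SetStartError(err).GetState()
//	// state is structs.TaskRestarting with a jittered wait while attempts remain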

// handleWaitResult returns the new state and potential wait duration for
// restarting the task after it has exited.
func (r *RestartTracker) handleWaitResult() (string, time.Duration) {
	// If the task exited successfully and restart on success isn't specified,
	// don't restart but don't mark as failed.
	if r.waitRes.Successful() && !r.onSuccess {
		r.reason = "Restart unnecessary as task terminated successfully"
		return structs.TaskTerminated, 0
	}

	if r.count > r.policy.Attempts {
		if r.policy.Mode == structs.RestartPolicyModeFail {
			r.reason = fmt.Sprintf(
				`Exceeded allowed attempts %d in interval %v and mode is "fail"`,
				r.policy.Attempts, r.policy.Interval)
			return structs.TaskNotRestarting, 0
		} else {
			r.reason = ReasonDelay
			return structs.TaskRestarting, r.getDelay()
		}
	}

	r.reason = ReasonWithinPolicy
	return structs.TaskRestarting, r.jitter()
}

// getDelay returns the delay time to enter the next interval.
func (r *RestartTracker) getDelay() time.Duration {
	end := r.startTime.Add(r.policy.Interval)
	now := time.Now()
	return end.Sub(now)
}

// jitter returns the delay time plus a jitter.
func (r *RestartTracker) jitter() time.Duration {
	// Get the delay and ensure it is valid.
	d := r.policy.Delay.Nanoseconds()
	if d == 0 {
		d = 1
	}

	j := float64(r.rand.Int63n(d)) * jitter
	return time.Duration(d + int64(j))
}
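
// Worked example (editor's addition): with a policy Delay of 15s and
// jitter = 0.25, d is 15s in nanoseconds and j is 0.25 times a random value
// drawn from [0, 15s), so the returned duration falls in [15s, 18.75s).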

// noRestartsTracker returns a tracker that never restarts.
func noRestartsTracker() *RestartTracker {
	policy := &structs.RestartPolicy{Attempts: 0, Mode: structs.RestartPolicyModeFail}
	return newRestartTracker(policy, structs.JobTypeBatch)
}