open-nomad/client/restarts.go

package client

import (
	"fmt"
	"math/rand"
	"sync"
	"time"

	cstructs "github.com/hashicorp/nomad/client/driver/structs"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// jitter is the percent of jitter added to restart delays.
	jitter = 0.25

	ReasonNoRestartsAllowed  = "Policy allows no restarts"
	ReasonUnrecoverableError = "Error was unrecoverable"
	ReasonWithinPolicy       = "Restart within policy"
	ReasonDelay              = "Exceeded allowed attempts, applying a delay"
)

// newRestartTracker returns a tracker for the given restart policy. Batch
// jobs are not restarted after a successful exit; all other job types are.
func newRestartTracker(policy *structs.RestartPolicy, jobType string) *RestartTracker {
	onSuccess := true
	if jobType == structs.JobTypeBatch {
		onSuccess = false
	}
	return &RestartTracker{
		startTime: time.Now(),
		onSuccess: onSuccess,
		policy:    policy,
		rand:      rand.New(rand.NewSource(time.Now().Unix())),
	}
}
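
// Illustrative sketch, not part of the original file: constructing a tracker
// for a long-running task. The policy values are hypothetical examples, and
// structs.JobTypeService is assumed to exist alongside structs.JobTypeBatch.
func exampleNewTracker() *RestartTracker {
	policy := &structs.RestartPolicy{
		Attempts: 2,
		Interval: 30 * time.Minute,
		Delay:    15 * time.Second,
		Mode:     structs.RestartPolicyModeFail,
	}
	return newRestartTracker(policy, structs.JobTypeService)
}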

// RestartTracker tracks a task's restart attempts within the policy interval
// and decides whether, and after what delay, the task should be restarted.
type RestartTracker struct {
	waitRes   *cstructs.WaitResult
	startErr  error
	count     int       // Current number of attempts.
	onSuccess bool      // Whether to restart on successful exit code.
	startTime time.Time // When the interval began
	reason    string    // The reason for the last state
	policy    *structs.RestartPolicy
	rand      *rand.Rand
	lock      sync.Mutex
}

// SetPolicy updates the policy used to determine restarts.
func (r *RestartTracker) SetPolicy(policy *structs.RestartPolicy) {
	r.lock.Lock()
	defer r.lock.Unlock()
	r.policy = policy
}

// SetStartError is used to mark the most recent start error. If starting was
// successful the error should be nil.
func (r *RestartTracker) SetStartError(err error) *RestartTracker {
	r.lock.Lock()
	defer r.lock.Unlock()
	r.startErr = err
	return r
}

// SetWaitResult is used to mark the most recent wait result.
func (r *RestartTracker) SetWaitResult(res *cstructs.WaitResult) *RestartTracker {
	r.lock.Lock()
	defer r.lock.Unlock()
	r.waitRes = res
	return r
}

// GetReason returns a human-readable description for the last state returned
// by GetState.
func (r *RestartTracker) GetReason() string {
	r.lock.Lock()
	defer r.lock.Unlock()
	return r.reason
}

// GetState returns the task's next state given the set exit code and start
// error. One of the following states is returned:
// * TaskRestarting - Task should be restarted
// * TaskNotRestarting - Task should not be restarted and has exceeded its
//   restart policy.
// * TaskTerminated - Task has terminated successfully and does not need a
//   restart.
//
// If TaskRestarting is returned, the duration is how long to wait until
// starting the task again.
func (r *RestartTracker) GetState() (string, time.Duration) {
	r.lock.Lock()
	defer r.lock.Unlock()

	// Hot path if no attempts are expected
	if r.policy.Attempts == 0 {
		r.reason = ReasonNoRestartsAllowed
		if r.waitRes != nil && r.waitRes.Successful() {
			return structs.TaskTerminated, 0
		}
		return structs.TaskNotRestarting, 0
	}

	r.count++

	// Check if we have entered a new interval.
	end := r.startTime.Add(r.policy.Interval)
	now := time.Now()
	if now.After(end) {
		r.count = 0
		r.startTime = now
	}

	if r.startErr != nil {
		return r.handleStartError()
	} else if r.waitRes != nil {
		return r.handleWaitResult()
	}
	return "", 0
}
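
// Illustrative sketch, not part of the original file: how a caller such as a
// task runner might feed an exit result into the tracker and act on the
// returned state. The handleExitExample name and the restart/fail hooks are
// hypothetical.
func handleExitExample(r *RestartTracker, res *cstructs.WaitResult, restart func(), fail func(reason string)) {
	state, wait := r.SetWaitResult(res).GetState()
	switch state {
	case structs.TaskRestarting:
		// Sleep out the returned delay, then start the task again.
		time.Sleep(wait)
		restart()
	case structs.TaskNotRestarting:
		// The policy does not allow another attempt; surface the reason.
		fail(r.GetReason())
	case structs.TaskTerminated:
		// Successful exit with no restart required; nothing to do.
	}
}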

// handleStartError returns the new state and potential wait duration for
// restarting the task after it was not successfully started. Unrecoverable
// start errors are never retried, so we don't endlessly try to start a task
// that cannot start; recoverable errors are retried within the restart policy.
func (r *RestartTracker) handleStartError() (string, time.Duration) {
	// If the error is not recoverable, do not restart.
	if rerr, ok := r.startErr.(*cstructs.RecoverableError); !(ok && rerr.Recoverable) {
		r.reason = ReasonUnrecoverableError
		return structs.TaskNotRestarting, 0
	}

	if r.count > r.policy.Attempts {
		if r.policy.Mode == structs.RestartPolicyModeFail {
			r.reason = fmt.Sprintf(
				`Exceeded allowed attempts %d in interval %v and mode is "fail"`,
				r.policy.Attempts, r.policy.Interval)
			return structs.TaskNotRestarting, 0
		}
		r.reason = ReasonDelay
		return structs.TaskRestarting, r.getDelay()
	}

	r.reason = ReasonWithinPolicy
	return structs.TaskRestarting, r.jitter()
}
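
// Illustrative sketch, not part of the original file: classifying a start
// error. Only a *cstructs.RecoverableError with Recoverable set is retried
// within the policy; any other error short-circuits to TaskNotRestarting.
func exampleStartError(r *RestartTracker) (string, time.Duration) {
	// Only the Recoverable field is populated here for brevity; a real driver
	// error would also carry the underlying failure.
	return r.SetStartError(&cstructs.RecoverableError{Recoverable: true}).GetState()
}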

// handleWaitResult returns the new state and potential wait duration for
// restarting the task after it has exited.
func (r *RestartTracker) handleWaitResult() (string, time.Duration) {
	// If the task started successfully and restart on success isn't specified,
	// don't restart but don't mark as failed.
	if r.waitRes.Successful() && !r.onSuccess {
		r.reason = "Restart unnecessary as task terminated successfully"
		return structs.TaskTerminated, 0
	}

	if r.count > r.policy.Attempts {
		if r.policy.Mode == structs.RestartPolicyModeFail {
			r.reason = fmt.Sprintf(
				`Exceeded allowed attempts %d in interval %v and mode is "fail"`,
				r.policy.Attempts, r.policy.Interval)
			return structs.TaskNotRestarting, 0
		}
		r.reason = ReasonDelay
		return structs.TaskRestarting, r.getDelay()
	}

	r.reason = ReasonWithinPolicy
	return structs.TaskRestarting, r.jitter()
}

// getDelay returns the delay time to enter the next interval.
func (r *RestartTracker) getDelay() time.Duration {
	end := r.startTime.Add(r.policy.Interval)
	now := time.Now()
	return end.Sub(now)
}

// jitter returns the delay time plus a jitter.
func (r *RestartTracker) jitter() time.Duration {
	// Get the delay and ensure it is valid.
	d := r.policy.Delay.Nanoseconds()
	if d == 0 {
		d = 1
	}

	j := float64(r.rand.Int63n(d)) * jitter
	return time.Duration(d + int64(j))
}
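
// Worked example (illustrative, not from the original source): with a policy
// Delay of 15s, rand.Int63n(d) is uniform over [0, 15s), so j lies in
// [0, 3.75s) and jitter returns a duration in [15s, 18.75s), i.e. the base
// delay plus up to 25% extra.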

// noRestartsTracker returns a tracker that never restarts.
func noRestartsTracker() *RestartTracker {
	policy := &structs.RestartPolicy{Attempts: 0, Mode: structs.RestartPolicyModeFail}
	return newRestartTracker(policy, structs.JobTypeBatch)
}