Merge pull request #744 from hashicorp/f-on-restart
Restart on-success shouldn't be user specifiable
This commit is contained in:
commit
7a32ebcae1
|
@ -7,11 +7,10 @@ import (
|
|||
// RestartPolicy defines how the Nomad client restarts
|
||||
// tasks in a taskgroup when they fail
|
||||
type RestartPolicy struct {
|
||||
Interval time.Duration
|
||||
Attempts int
|
||||
Delay time.Duration
|
||||
RestartOnSuccess bool
|
||||
Mode string
|
||||
Interval time.Duration
|
||||
Attempts int
|
||||
Delay time.Duration
|
||||
Mode string
|
||||
}
|
||||
|
||||
// The ServiceCheck data model represents the consul health check that
|
||||
|
|
|
@ -115,7 +115,7 @@ func (r *AllocRunner) RestoreState() error {
|
|||
r.restored[name] = struct{}{}
|
||||
|
||||
task := &structs.Task{Name: name}
|
||||
restartTracker := newRestartTracker(r.RestartPolicy)
|
||||
restartTracker := newRestartTracker(r.RestartPolicy, r.alloc.Job.Type)
|
||||
tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx,
|
||||
r.alloc, task, restartTracker, r.consulService)
|
||||
r.tasks[name] = tr
|
||||
|
@ -370,7 +370,7 @@ func (r *AllocRunner) Run() {
|
|||
|
||||
// Merge in the task resources
|
||||
task.Resources = alloc.TaskResources[task.Name]
|
||||
restartTracker := newRestartTracker(r.RestartPolicy)
|
||||
restartTracker := newRestartTracker(r.RestartPolicy, r.alloc.Job.Type)
|
||||
tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx,
|
||||
r.alloc, task, restartTracker, r.consulService)
|
||||
r.tasks[task.Name] = tr
|
||||
|
|
|
@ -33,7 +33,8 @@ func testAllocRunner(restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
|
|||
alloc := mock.Alloc()
|
||||
consulClient, _ := NewConsulService(&consulServiceConfig{logger, "127.0.0.1:8500", "", "", false, false, &structs.Node{}})
|
||||
if !restarts {
|
||||
*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0, RestartOnSuccess: false}
|
||||
*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0}
|
||||
alloc.Job.Type = structs.JobTypeBatch
|
||||
}
|
||||
|
||||
ar := NewAllocRunner(logger, conf, upd.Update, alloc, consulClient)
|
||||
|
|
|
@ -10,9 +10,14 @@ import (
|
|||
// jitter is the percent of jitter added to restart delays.
|
||||
const jitter = 0.25
|
||||
|
||||
func newRestartTracker(policy *structs.RestartPolicy) *RestartTracker {
|
||||
func newRestartTracker(policy *structs.RestartPolicy, jobType string) *RestartTracker {
|
||||
onSuccess := true
|
||||
if jobType == structs.JobTypeBatch {
|
||||
onSuccess = false
|
||||
}
|
||||
return &RestartTracker{
|
||||
startTime: time.Now(),
|
||||
onSuccess: onSuccess,
|
||||
policy: policy,
|
||||
rand: rand.New(rand.NewSource(time.Now().Unix())),
|
||||
}
|
||||
|
@ -20,6 +25,7 @@ func newRestartTracker(policy *structs.RestartPolicy) *RestartTracker {
|
|||
|
||||
type RestartTracker struct {
|
||||
count int // Current number of attempts.
|
||||
onSuccess bool // Whether to restart on successful exit code.
|
||||
startTime time.Time // When the interval began
|
||||
policy *structs.RestartPolicy
|
||||
rand *rand.Rand
|
||||
|
@ -57,9 +63,9 @@ func (r *RestartTracker) NextRestart(exitCode int) (bool, time.Duration) {
|
|||
}
|
||||
|
||||
// shouldRestart returns whether a restart should occur based on the exit code
|
||||
// and the RestartOnSuccess configuration.
|
||||
// and job type.
|
||||
func (r *RestartTracker) shouldRestart(exitCode int) bool {
|
||||
return exitCode != 0 || r.policy.RestartOnSuccess
|
||||
return exitCode != 0 || r.onSuccess
|
||||
}
|
||||
|
||||
// jitter returns the delay time plus a jitter.
|
||||
|
@ -77,5 +83,5 @@ func (r *RestartTracker) jitter() time.Duration {
|
|||
// Returns a tracker that never restarts.
|
||||
func noRestartsTracker() *RestartTracker {
|
||||
policy := &structs.RestartPolicy{Attempts: 0, Mode: structs.RestartPolicyModeFail}
|
||||
return newRestartTracker(policy)
|
||||
return newRestartTracker(policy, structs.JobTypeBatch)
|
||||
}
|
||||
|
|
|
@ -9,11 +9,10 @@ import (
|
|||
|
||||
func testPolicy(success bool, mode string) *structs.RestartPolicy {
|
||||
return &structs.RestartPolicy{
|
||||
Interval: 2 * time.Minute,
|
||||
Delay: 1 * time.Second,
|
||||
Attempts: 3,
|
||||
Mode: mode,
|
||||
RestartOnSuccess: success,
|
||||
Interval: 2 * time.Minute,
|
||||
Delay: 1 * time.Second,
|
||||
Attempts: 3,
|
||||
Mode: mode,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -27,7 +26,7 @@ func withinJitter(expected, actual time.Duration) bool {
|
|||
func TestClient_RestartTracker_ModeDelay(t *testing.T) {
|
||||
t.Parallel()
|
||||
p := testPolicy(true, structs.RestartPolicyModeDelay)
|
||||
rt := newRestartTracker(p)
|
||||
rt := newRestartTracker(p, structs.JobTypeService)
|
||||
for i := 0; i < p.Attempts; i++ {
|
||||
actual, when := rt.NextRestart(127)
|
||||
if !actual {
|
||||
|
@ -53,7 +52,7 @@ func TestClient_RestartTracker_ModeDelay(t *testing.T) {
|
|||
func TestClient_RestartTracker_ModeFail(t *testing.T) {
|
||||
t.Parallel()
|
||||
p := testPolicy(true, structs.RestartPolicyModeFail)
|
||||
rt := newRestartTracker(p)
|
||||
rt := newRestartTracker(p, structs.JobTypeSystem)
|
||||
for i := 0; i < p.Attempts; i++ {
|
||||
actual, when := rt.NextRestart(127)
|
||||
if !actual {
|
||||
|
@ -73,7 +72,7 @@ func TestClient_RestartTracker_ModeFail(t *testing.T) {
|
|||
func TestClient_RestartTracker_NoRestartOnSuccess(t *testing.T) {
|
||||
t.Parallel()
|
||||
p := testPolicy(false, structs.RestartPolicyModeDelay)
|
||||
rt := newRestartTracker(p)
|
||||
rt := newRestartTracker(p, structs.JobTypeBatch)
|
||||
if shouldRestart, _ := rt.NextRestart(0); shouldRestart {
|
||||
t.Fatalf("NextRestart() returned %v, expected: %v", shouldRestart, false)
|
||||
}
|
||||
|
@ -83,7 +82,7 @@ func TestClient_RestartTracker_ZeroAttempts(t *testing.T) {
|
|||
t.Parallel()
|
||||
p := testPolicy(true, structs.RestartPolicyModeFail)
|
||||
p.Attempts = 0
|
||||
rt := newRestartTracker(p)
|
||||
rt := newRestartTracker(p, structs.JobTypeService)
|
||||
if actual, when := rt.NextRestart(1); actual {
|
||||
t.Fatalf("expect no restart, got restart/delay: %v", when)
|
||||
}
|
||||
|
|
|
@ -49,7 +49,7 @@ func testTaskRunner(restarts bool) (*MockTaskStateUpdater, *TaskRunner) {
|
|||
|
||||
ctx := driver.NewExecContext(allocDir, alloc.ID)
|
||||
rp := structs.NewRestartPolicy(structs.JobTypeService)
|
||||
restartTracker := newRestartTracker(rp)
|
||||
restartTracker := newRestartTracker(rp, alloc.Job.Type)
|
||||
if !restarts {
|
||||
restartTracker = noRestartsTracker()
|
||||
}
|
||||
|
|
|
@ -114,9 +114,6 @@ job "example" {
|
|||
# A delay between a task failing and a restart occuring.
|
||||
delay = "25s"
|
||||
|
||||
# Whether the tasks should be restarted if the exit successfully.
|
||||
on_success = true
|
||||
|
||||
# Mode controls what happens when a task has restarted "attempts"
|
||||
# times within the interval. "delay" mode delays the next restart
|
||||
# till the next interval. "fail" mode does not restart the task if
|
||||
|
|
|
@ -78,11 +78,10 @@ func TestParse(t *testing.T) {
|
|||
"elb_checks": "3",
|
||||
},
|
||||
RestartPolicy: &structs.RestartPolicy{
|
||||
Interval: 10 * time.Minute,
|
||||
Attempts: 5,
|
||||
Delay: 15 * time.Second,
|
||||
RestartOnSuccess: true,
|
||||
Mode: "delay",
|
||||
Interval: 10 * time.Minute,
|
||||
Attempts: 5,
|
||||
Delay: 15 * time.Second,
|
||||
Mode: "delay",
|
||||
},
|
||||
Tasks: []*structs.Task{
|
||||
&structs.Task{
|
||||
|
@ -114,7 +113,7 @@ func TestParse(t *testing.T) {
|
|||
CPU: 500,
|
||||
MemoryMB: 128,
|
||||
DiskMB: 10,
|
||||
IOPS: 1,
|
||||
IOPS: 0,
|
||||
Networks: []*structs.NetworkResource{
|
||||
&structs.NetworkResource{
|
||||
MBits: 100,
|
||||
|
|
|
@ -35,7 +35,6 @@ job "binstore-storagelocker" {
|
|||
attempts = 5
|
||||
interval = "10m"
|
||||
delay = "15s"
|
||||
on_success = true
|
||||
mode = "delay"
|
||||
}
|
||||
task "binstore" {
|
||||
|
|
|
@ -77,11 +77,10 @@ func Job() *structs.Job {
|
|||
Name: "web",
|
||||
Count: 10,
|
||||
RestartPolicy: &structs.RestartPolicy{
|
||||
Attempts: 3,
|
||||
Interval: 10 * time.Minute,
|
||||
Delay: 1 * time.Minute,
|
||||
RestartOnSuccess: true,
|
||||
Mode: structs.RestartPolicyModeDelay,
|
||||
Attempts: 3,
|
||||
Interval: 10 * time.Minute,
|
||||
Delay: 1 * time.Minute,
|
||||
Mode: structs.RestartPolicyModeDelay,
|
||||
},
|
||||
Tasks: []*structs.Task{
|
||||
&structs.Task{
|
||||
|
@ -156,11 +155,10 @@ func SystemJob() *structs.Job {
|
|||
Name: "web",
|
||||
Count: 1,
|
||||
RestartPolicy: &structs.RestartPolicy{
|
||||
Attempts: 3,
|
||||
Interval: 10 * time.Minute,
|
||||
Delay: 1 * time.Minute,
|
||||
RestartOnSuccess: true,
|
||||
Mode: structs.RestartPolicyModeDelay,
|
||||
Attempts: 3,
|
||||
Interval: 10 * time.Minute,
|
||||
Delay: 1 * time.Minute,
|
||||
Mode: structs.RestartPolicyModeDelay,
|
||||
},
|
||||
Tasks: []*structs.Task{
|
||||
&structs.Task{
|
||||
|
|
|
@ -1146,18 +1146,16 @@ type PeriodicLaunch struct {
|
|||
|
||||
var (
|
||||
defaultServiceJobRestartPolicy = RestartPolicy{
|
||||
Delay: 15 * time.Second,
|
||||
Attempts: 2,
|
||||
Interval: 1 * time.Minute,
|
||||
RestartOnSuccess: true,
|
||||
Mode: RestartPolicyModeDelay,
|
||||
Delay: 15 * time.Second,
|
||||
Attempts: 2,
|
||||
Interval: 1 * time.Minute,
|
||||
Mode: RestartPolicyModeDelay,
|
||||
}
|
||||
defaultBatchJobRestartPolicy = RestartPolicy{
|
||||
Delay: 15 * time.Second,
|
||||
Attempts: 15,
|
||||
Interval: 7 * 24 * time.Hour,
|
||||
RestartOnSuccess: false,
|
||||
Mode: RestartPolicyModeDelay,
|
||||
Delay: 15 * time.Second,
|
||||
Attempts: 15,
|
||||
Interval: 7 * 24 * time.Hour,
|
||||
Mode: RestartPolicyModeDelay,
|
||||
}
|
||||
)
|
||||
|
||||
|
@ -1183,10 +1181,6 @@ type RestartPolicy struct {
|
|||
// Delay is the time between a failure and a restart.
|
||||
Delay time.Duration
|
||||
|
||||
// RestartOnSuccess determines whether a task should be restarted if it
|
||||
// exited successfully.
|
||||
RestartOnSuccess bool `mapstructure:"on_success"`
|
||||
|
||||
// Mode controls what happens when the task restarts more than attempt times
|
||||
// in an interval.
|
||||
Mode string
|
||||
|
|
|
@ -189,11 +189,10 @@ func TestJob_IsPeriodic(t *testing.T) {
|
|||
func TestTaskGroup_Validate(t *testing.T) {
|
||||
tg := &TaskGroup{
|
||||
RestartPolicy: &RestartPolicy{
|
||||
Interval: 5 * time.Minute,
|
||||
Delay: 10 * time.Second,
|
||||
Attempts: 10,
|
||||
RestartOnSuccess: true,
|
||||
Mode: RestartPolicyModeDelay,
|
||||
Interval: 5 * time.Minute,
|
||||
Delay: 10 * time.Second,
|
||||
Attempts: 10,
|
||||
Mode: RestartPolicyModeDelay,
|
||||
},
|
||||
}
|
||||
err := tg.Validate()
|
||||
|
@ -217,11 +216,10 @@ func TestTaskGroup_Validate(t *testing.T) {
|
|||
&Task{},
|
||||
},
|
||||
RestartPolicy: &RestartPolicy{
|
||||
Interval: 5 * time.Minute,
|
||||
Delay: 10 * time.Second,
|
||||
Attempts: 10,
|
||||
RestartOnSuccess: true,
|
||||
Mode: RestartPolicyModeDelay,
|
||||
Interval: 5 * time.Minute,
|
||||
Delay: 10 * time.Second,
|
||||
Attempts: 10,
|
||||
Mode: RestartPolicyModeDelay,
|
||||
},
|
||||
}
|
||||
err = tg.Validate()
|
||||
|
|
|
@ -309,9 +309,6 @@ The `restart` object supports the following keys:
|
|||
time duration using the `s`, `m`, and `h` suffixes, such as `30s`. A random
|
||||
jitter of up to 25% is added to the the delay.
|
||||
|
||||
* `on_success` - `on_success` controls whether a task is restarted when the
|
||||
task exits successfully.
|
||||
|
||||
* `mode` - Controls the behavior when the task fails more than `attempts`
|
||||
times in an interval. Possible values are listed below:
|
||||
|
||||
|
@ -327,7 +324,6 @@ restart {
|
|||
attempts = 15
|
||||
delay = "15s"
|
||||
interval = "168h" # 7 days
|
||||
on_success = false
|
||||
mode = "delay"
|
||||
}
|
||||
```
|
||||
|
@ -339,7 +335,6 @@ restart {
|
|||
interval = "1m"
|
||||
attempts = 2
|
||||
delay = "15s"
|
||||
on_success = true
|
||||
mode = "delay"
|
||||
}
|
||||
```
|
||||
|
|
Loading…
Reference in a new issue