Merge pull request #744 from hashicorp/f-on-restart

Restart on-success shouldn't be user specifiable
This commit is contained in:
Alex Dadgar 2016-02-02 17:40:05 -08:00
commit 7a32ebcae1
13 changed files with 56 additions and 71 deletions

View file

@ -7,11 +7,10 @@ import (
// RestartPolicy defines how the Nomad client restarts
// tasks in a taskgroup when they fail
type RestartPolicy struct {
Interval time.Duration
Attempts int
Delay time.Duration
RestartOnSuccess bool
Mode string
Interval time.Duration
Attempts int
Delay time.Duration
Mode string
}
// The ServiceCheck data model represents the consul health check that

View file

@ -115,7 +115,7 @@ func (r *AllocRunner) RestoreState() error {
r.restored[name] = struct{}{}
task := &structs.Task{Name: name}
restartTracker := newRestartTracker(r.RestartPolicy)
restartTracker := newRestartTracker(r.RestartPolicy, r.alloc.Job.Type)
tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx,
r.alloc, task, restartTracker, r.consulService)
r.tasks[name] = tr
@ -370,7 +370,7 @@ func (r *AllocRunner) Run() {
// Merge in the task resources
task.Resources = alloc.TaskResources[task.Name]
restartTracker := newRestartTracker(r.RestartPolicy)
restartTracker := newRestartTracker(r.RestartPolicy, r.alloc.Job.Type)
tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx,
r.alloc, task, restartTracker, r.consulService)
r.tasks[task.Name] = tr

View file

@ -33,7 +33,8 @@ func testAllocRunner(restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
alloc := mock.Alloc()
consulClient, _ := NewConsulService(&consulServiceConfig{logger, "127.0.0.1:8500", "", "", false, false, &structs.Node{}})
if !restarts {
*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0, RestartOnSuccess: false}
*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0}
alloc.Job.Type = structs.JobTypeBatch
}
ar := NewAllocRunner(logger, conf, upd.Update, alloc, consulClient)

View file

@ -10,9 +10,14 @@ import (
// jitter is the percent of jitter added to restart delays.
const jitter = 0.25
func newRestartTracker(policy *structs.RestartPolicy) *RestartTracker {
func newRestartTracker(policy *structs.RestartPolicy, jobType string) *RestartTracker {
onSuccess := true
if jobType == structs.JobTypeBatch {
onSuccess = false
}
return &RestartTracker{
startTime: time.Now(),
onSuccess: onSuccess,
policy: policy,
rand: rand.New(rand.NewSource(time.Now().Unix())),
}
@ -20,6 +25,7 @@ func newRestartTracker(policy *structs.RestartPolicy) *RestartTracker {
type RestartTracker struct {
count int // Current number of attempts.
onSuccess bool // Whether to restart on successful exit code.
startTime time.Time // When the interval began
policy *structs.RestartPolicy
rand *rand.Rand
@ -57,9 +63,9 @@ func (r *RestartTracker) NextRestart(exitCode int) (bool, time.Duration) {
}
// shouldRestart returns whether a restart should occur based on the exit code
// and the RestartOnSuccess configuration.
// and job type.
func (r *RestartTracker) shouldRestart(exitCode int) bool {
return exitCode != 0 || r.policy.RestartOnSuccess
return exitCode != 0 || r.onSuccess
}
// jitter returns the delay time plus a jitter.
@ -77,5 +83,5 @@ func (r *RestartTracker) jitter() time.Duration {
// Returns a tracker that never restarts.
func noRestartsTracker() *RestartTracker {
policy := &structs.RestartPolicy{Attempts: 0, Mode: structs.RestartPolicyModeFail}
return newRestartTracker(policy)
return newRestartTracker(policy, structs.JobTypeBatch)
}

View file

@ -9,11 +9,10 @@ import (
func testPolicy(success bool, mode string) *structs.RestartPolicy {
return &structs.RestartPolicy{
Interval: 2 * time.Minute,
Delay: 1 * time.Second,
Attempts: 3,
Mode: mode,
RestartOnSuccess: success,
Interval: 2 * time.Minute,
Delay: 1 * time.Second,
Attempts: 3,
Mode: mode,
}
}
@ -27,7 +26,7 @@ func withinJitter(expected, actual time.Duration) bool {
func TestClient_RestartTracker_ModeDelay(t *testing.T) {
t.Parallel()
p := testPolicy(true, structs.RestartPolicyModeDelay)
rt := newRestartTracker(p)
rt := newRestartTracker(p, structs.JobTypeService)
for i := 0; i < p.Attempts; i++ {
actual, when := rt.NextRestart(127)
if !actual {
@ -53,7 +52,7 @@ func TestClient_RestartTracker_ModeDelay(t *testing.T) {
func TestClient_RestartTracker_ModeFail(t *testing.T) {
t.Parallel()
p := testPolicy(true, structs.RestartPolicyModeFail)
rt := newRestartTracker(p)
rt := newRestartTracker(p, structs.JobTypeSystem)
for i := 0; i < p.Attempts; i++ {
actual, when := rt.NextRestart(127)
if !actual {
@ -73,7 +72,7 @@ func TestClient_RestartTracker_ModeFail(t *testing.T) {
func TestClient_RestartTracker_NoRestartOnSuccess(t *testing.T) {
t.Parallel()
p := testPolicy(false, structs.RestartPolicyModeDelay)
rt := newRestartTracker(p)
rt := newRestartTracker(p, structs.JobTypeBatch)
if shouldRestart, _ := rt.NextRestart(0); shouldRestart {
t.Fatalf("NextRestart() returned %v, expected: %v", shouldRestart, false)
}
@ -83,7 +82,7 @@ func TestClient_RestartTracker_ZeroAttempts(t *testing.T) {
t.Parallel()
p := testPolicy(true, structs.RestartPolicyModeFail)
p.Attempts = 0
rt := newRestartTracker(p)
rt := newRestartTracker(p, structs.JobTypeService)
if actual, when := rt.NextRestart(1); actual {
t.Fatalf("expect no restart, got restart/delay: %v", when)
}

View file

@ -49,7 +49,7 @@ func testTaskRunner(restarts bool) (*MockTaskStateUpdater, *TaskRunner) {
ctx := driver.NewExecContext(allocDir, alloc.ID)
rp := structs.NewRestartPolicy(structs.JobTypeService)
restartTracker := newRestartTracker(rp)
restartTracker := newRestartTracker(rp, alloc.Job.Type)
if !restarts {
restartTracker = noRestartsTracker()
}

View file

@ -114,9 +114,6 @@ job "example" {
# A delay between a task failing and a restart occuring.
delay = "25s"
# Whether the tasks should be restarted if the exit successfully.
on_success = true
# Mode controls what happens when a task has restarted "attempts"
# times within the interval. "delay" mode delays the next restart
# till the next interval. "fail" mode does not restart the task if

View file

@ -78,11 +78,10 @@ func TestParse(t *testing.T) {
"elb_checks": "3",
},
RestartPolicy: &structs.RestartPolicy{
Interval: 10 * time.Minute,
Attempts: 5,
Delay: 15 * time.Second,
RestartOnSuccess: true,
Mode: "delay",
Interval: 10 * time.Minute,
Attempts: 5,
Delay: 15 * time.Second,
Mode: "delay",
},
Tasks: []*structs.Task{
&structs.Task{
@ -114,7 +113,7 @@ func TestParse(t *testing.T) {
CPU: 500,
MemoryMB: 128,
DiskMB: 10,
IOPS: 1,
IOPS: 0,
Networks: []*structs.NetworkResource{
&structs.NetworkResource{
MBits: 100,

View file

@ -35,7 +35,6 @@ job "binstore-storagelocker" {
attempts = 5
interval = "10m"
delay = "15s"
on_success = true
mode = "delay"
}
task "binstore" {

View file

@ -77,11 +77,10 @@ func Job() *structs.Job {
Name: "web",
Count: 10,
RestartPolicy: &structs.RestartPolicy{
Attempts: 3,
Interval: 10 * time.Minute,
Delay: 1 * time.Minute,
RestartOnSuccess: true,
Mode: structs.RestartPolicyModeDelay,
Attempts: 3,
Interval: 10 * time.Minute,
Delay: 1 * time.Minute,
Mode: structs.RestartPolicyModeDelay,
},
Tasks: []*structs.Task{
&structs.Task{
@ -156,11 +155,10 @@ func SystemJob() *structs.Job {
Name: "web",
Count: 1,
RestartPolicy: &structs.RestartPolicy{
Attempts: 3,
Interval: 10 * time.Minute,
Delay: 1 * time.Minute,
RestartOnSuccess: true,
Mode: structs.RestartPolicyModeDelay,
Attempts: 3,
Interval: 10 * time.Minute,
Delay: 1 * time.Minute,
Mode: structs.RestartPolicyModeDelay,
},
Tasks: []*structs.Task{
&structs.Task{

View file

@ -1146,18 +1146,16 @@ type PeriodicLaunch struct {
var (
defaultServiceJobRestartPolicy = RestartPolicy{
Delay: 15 * time.Second,
Attempts: 2,
Interval: 1 * time.Minute,
RestartOnSuccess: true,
Mode: RestartPolicyModeDelay,
Delay: 15 * time.Second,
Attempts: 2,
Interval: 1 * time.Minute,
Mode: RestartPolicyModeDelay,
}
defaultBatchJobRestartPolicy = RestartPolicy{
Delay: 15 * time.Second,
Attempts: 15,
Interval: 7 * 24 * time.Hour,
RestartOnSuccess: false,
Mode: RestartPolicyModeDelay,
Delay: 15 * time.Second,
Attempts: 15,
Interval: 7 * 24 * time.Hour,
Mode: RestartPolicyModeDelay,
}
)
@ -1183,10 +1181,6 @@ type RestartPolicy struct {
// Delay is the time between a failure and a restart.
Delay time.Duration
// RestartOnSuccess determines whether a task should be restarted if it
// exited successfully.
RestartOnSuccess bool `mapstructure:"on_success"`
// Mode controls what happens when the task restarts more than attempt times
// in an interval.
Mode string

View file

@ -189,11 +189,10 @@ func TestJob_IsPeriodic(t *testing.T) {
func TestTaskGroup_Validate(t *testing.T) {
tg := &TaskGroup{
RestartPolicy: &RestartPolicy{
Interval: 5 * time.Minute,
Delay: 10 * time.Second,
Attempts: 10,
RestartOnSuccess: true,
Mode: RestartPolicyModeDelay,
Interval: 5 * time.Minute,
Delay: 10 * time.Second,
Attempts: 10,
Mode: RestartPolicyModeDelay,
},
}
err := tg.Validate()
@ -217,11 +216,10 @@ func TestTaskGroup_Validate(t *testing.T) {
&Task{},
},
RestartPolicy: &RestartPolicy{
Interval: 5 * time.Minute,
Delay: 10 * time.Second,
Attempts: 10,
RestartOnSuccess: true,
Mode: RestartPolicyModeDelay,
Interval: 5 * time.Minute,
Delay: 10 * time.Second,
Attempts: 10,
Mode: RestartPolicyModeDelay,
},
}
err = tg.Validate()

View file

@ -309,9 +309,6 @@ The `restart` object supports the following keys:
time duration using the `s`, `m`, and `h` suffixes, such as `30s`. A random
jitter of up to 25% is added to the the delay.
* `on_success` - `on_success` controls whether a task is restarted when the
task exits successfully.
* `mode` - Controls the behavior when the task fails more than `attempts`
times in an interval. Possible values are listed below:
@ -327,7 +324,6 @@ restart {
attempts = 15
delay = "15s"
interval = "168h" # 7 days
on_success = false
mode = "delay"
}
```
@ -339,7 +335,6 @@ restart {
interval = "1m"
attempts = 2
delay = "15s"
on_success = true
mode = "delay"
}
```