From 31c3e1295789886c9743e1a794f2b3b737ec68b2 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Fri, 18 Dec 2015 12:17:13 -0800 Subject: [PATCH 01/10] merge --- api/tasks.go | 22 ++--- client/alloc_runner.go | 4 +- client/restarts.go | 109 ++++++++-------------- client/restarts_test.go | 104 ++++++++++----------- client/task_runner.go | 6 +- client/task_runner_test.go | 2 +- command/init.go | 21 +++-- jobspec/parse.go | 19 ++-- jobspec/parse_test.go | 18 +--- jobspec/test-fixtures/basic.hcl | 2 + nomad/structs/structs.go | 102 +++++++++++++------- nomad/structs/structs_test.go | 2 +- website/source/docs/jobspec/index.html.md | 31 ++++-- 13 files changed, 223 insertions(+), 219 deletions(-) diff --git a/api/tasks.go b/api/tasks.go index c378c222d..3691cc78b 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -7,17 +7,11 @@ import ( // RestartPolicy defines how the Nomad client restarts // tasks in a taskgroup when they fail type RestartPolicy struct { - Interval time.Duration - Attempts int - Delay time.Duration -} - -func NewRestartPolicy() *RestartPolicy { - return &RestartPolicy{ - Attempts: 10, - Interval: 3 * time.Minute, - Delay: 5 * time.Second, - } + Interval time.Duration + Attempts int + Delay time.Duration + RestartOnSuccess bool + Mode string } // The ServiceCheck data model represents the consul health check that @@ -54,11 +48,9 @@ type TaskGroup struct { // NewTaskGroup creates a new TaskGroup. func NewTaskGroup(name string, count int) *TaskGroup { - restartPolicy := NewRestartPolicy() return &TaskGroup{ - Name: name, - Count: count, - RestartPolicy: restartPolicy, + Name: name, + Count: count, } } diff --git a/client/alloc_runner.go b/client/alloc_runner.go index 14b93dd0d..1b4a5ea0d 100644 --- a/client/alloc_runner.go +++ b/client/alloc_runner.go @@ -110,7 +110,7 @@ func (r *AllocRunner) RestoreState() error { r.restored[name] = struct{}{} task := &structs.Task{Name: name} - restartTracker := newRestartTracker(r.alloc.Job.Type, r.RestartPolicy) + restartTracker := NewRestartTracker(r.RestartPolicy) tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx, r.alloc, task, r.alloc.TaskStates[task.Name], restartTracker, r.consulService) @@ -322,7 +322,7 @@ func (r *AllocRunner) Run() { // Merge in the task resources task.Resources = alloc.TaskResources[task.Name] - restartTracker := newRestartTracker(r.alloc.Job.Type, r.RestartPolicy) + restartTracker := NewRestartTracker(r.RestartPolicy) tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx, r.alloc, task, r.alloc.TaskStates[task.Name], restartTracker, r.consulService) diff --git a/client/restarts.go b/client/restarts.go index 7f42ad432..82dd10da9 100644 --- a/client/restarts.go +++ b/client/restarts.go @@ -6,84 +6,53 @@ import ( "github.com/hashicorp/nomad/nomad/structs" ) -// The errorCounter keeps track of the number of times a process has exited -// It returns the duration after which a task is restarted -// For Batch jobs, the interval is set to zero value since the takss -// will be restarted only upto maxAttempts times -type restartTracker interface { - nextRestart(exitCode int) (bool, time.Duration) -} - -func newRestartTracker(jobType string, restartPolicy *structs.RestartPolicy) restartTracker { - switch jobType { - case structs.JobTypeService: - return &serviceRestartTracker{ - maxAttempts: restartPolicy.Attempts, - startTime: time.Now(), - interval: restartPolicy.Interval, - delay: restartPolicy.Delay, - } - default: - return &batchRestartTracker{ - maxAttempts: restartPolicy.Attempts, - delay: restartPolicy.Delay, - } +func NewRestartTracker(policy *structs.RestartPolicy) *RestartTracker { + return &RestartTracker{ + startTime: time.Now(), + policy: policy, } } -// noRestartsTracker returns a RestartTracker that never restarts. -func noRestartsTracker() restartTracker { - return &batchRestartTracker{maxAttempts: 0} +type RestartTracker struct { + count int // Current number of attempts. + startTime time.Time // When the interval began + policy *structs.RestartPolicy } -type batchRestartTracker struct { - maxAttempts int - delay time.Duration - - count int -} - -func (b *batchRestartTracker) increment() { - b.count += 1 -} - -func (b *batchRestartTracker) nextRestart(exitCode int) (bool, time.Duration) { - if b.count < b.maxAttempts && exitCode > 0 { - b.increment() - return true, b.delay - } - return false, 0 -} - -type serviceRestartTracker struct { - maxAttempts int - delay time.Duration - interval time.Duration - - count int - startTime time.Time -} - -func (s *serviceRestartTracker) increment() { - s.count += 1 -} - -func (s *serviceRestartTracker) nextRestart(exitCode int) (bool, time.Duration) { - defer s.increment() - windowEndTime := s.startTime.Add(s.interval) +func (r *RestartTracker) NextRestart(exitCode int) (bool, time.Duration) { + // Check if we have entered a new interval. + end := r.startTime.Add(r.policy.Interval) now := time.Now() - // If the window of restart is over we wait until the delay duration - if now.After(windowEndTime) { - s.count = 0 - s.startTime = time.Now() - return true, s.delay + if now.After(end) { + r.count = 0 + r.startTime = now + return true, r.policy.Delay } - // If we are within the delay duration and didn't exhaust all retries - if s.count < s.maxAttempts { - return true, s.delay + r.count++ + + // If we are under the attempts, restart with delay. + if r.count <= r.policy.Attempts { + return r.shouldRestart(exitCode), r.policy.Delay } - // If we exhausted all the retries and are withing the time window - return true, windowEndTime.Sub(now) + // Don't restart since mode is "fail" + if r.policy.Mode == structs.RestartPolicyModeFail { + return false, 0 + } + + // Apply an artifical wait to enter the next interval + return r.shouldRestart(exitCode), end.Sub(now) +} + +// shouldRestart returns whether a restart should occur based on the exit code +// and the RestartOnSuccess configuration. +func (r *RestartTracker) shouldRestart(exitCode int) bool { + return exitCode != 0 || r.policy.RestartOnSuccess && exitCode == 0 +} + +// Returns a tracker that never restarts. +func noRestartsTracker() *RestartTracker { + policy := &structs.RestartPolicy{Attempts: 0, Mode: structs.RestartPolicyModeFail} + return NewRestartTracker(policy) } diff --git a/client/restarts_test.go b/client/restarts_test.go index 2faca00ff..b30e418f3 100644 --- a/client/restarts_test.go +++ b/client/restarts_test.go @@ -1,78 +1,74 @@ package client import ( - "github.com/hashicorp/nomad/nomad/structs" "testing" "time" + + "github.com/hashicorp/nomad/nomad/structs" ) -func TestTaskRunner_ServiceRestartCounter(t *testing.T) { - interval := 2 * time.Minute - delay := 1 * time.Second - attempts := 3 - rt := newRestartTracker(structs.JobTypeService, &structs.RestartPolicy{Attempts: attempts, Interval: interval, Delay: delay}) +func testPolicy(success bool, mode string) *structs.RestartPolicy { + return &structs.RestartPolicy{ + Interval: 2 * time.Minute, + Delay: 1 * time.Second, + Attempts: 3, + Mode: mode, + RestartOnSuccess: success, + } +} - for i := 0; i < attempts; i++ { - actual, when := rt.nextRestart(127) +func TestClient_RestartTracker_ModeDelay(t *testing.T) { + t.Parallel() + p := testPolicy(true, structs.RestartPolicyModeDelay) + rt := NewRestartTracker(p) + for i := 0; i < p.Attempts; i++ { + actual, when := rt.NextRestart(127) if !actual { - t.Fatalf("should restart returned %v, actual %v", actual, true) + t.Fatalf("NextRestart() returned %v, want %v", actual, true) } - if when != delay { - t.Fatalf("nextRestart() returned %v; want %v", when, delay) + if when != p.Delay { + t.Fatalf("NextRestart() returned %v; want %v", when, p.Delay) } } - time.Sleep(1 * time.Second) + // Follow up restarts should cause delay. for i := 0; i < 3; i++ { - actual, when := rt.nextRestart(127) + actual, when := rt.NextRestart(127) if !actual { t.Fail() } - if !(when > delay && when < interval) { - t.Fatalf("nextRestart() returned %v; want less than %v and more than %v", when, interval, delay) + if !(when > p.Delay && when < p.Interval) { + t.Fatalf("NextRestart() returned %v; want less than %v and more than %v", when, p.Interval, p.Delay) } } - -} - -func TestTaskRunner_BatchRestartCounter(t *testing.T) { - attempts := 2 - interval := 1 * time.Second - delay := 1 * time.Second - rt := newRestartTracker(structs.JobTypeBatch, - &structs.RestartPolicy{Attempts: attempts, - Interval: interval, - Delay: delay, - }, - ) - for i := 0; i < attempts; i++ { - shouldRestart, when := rt.nextRestart(127) - if !shouldRestart { - t.Fatalf("should restart returned %v, actual %v", shouldRestart, true) - } - if when != delay { - t.Fatalf("Delay should be %v, actual: %v", delay, when) - } - } - actual, _ := rt.nextRestart(1) - if actual { - t.Fatalf("Expect %v, Actual: %v", false, actual) - } } -func TestTaskRunner_BatchRestartOnSuccess(t *testing.T) { - attempts := 2 - interval := 1 * time.Second - delay := 1 * time.Second - rt := newRestartTracker(structs.JobTypeBatch, - &structs.RestartPolicy{Attempts: attempts, - Interval: interval, - Delay: delay, - }, - ) - shouldRestart, _ := rt.nextRestart(0) - if shouldRestart { - t.Fatalf("should restart returned %v, expected: %v", shouldRestart, false) +func TestClient_RestartTracker_ModeFail(t *testing.T) { + t.Parallel() + p := testPolicy(true, structs.RestartPolicyModeFail) + rt := NewRestartTracker(p) + for i := 0; i < p.Attempts; i++ { + actual, when := rt.NextRestart(127) + if !actual { + t.Fatalf("NextRestart() returned %v, want %v", actual, true) + } + if when != p.Delay { + t.Fatalf("NextRestart() returned %v; want %v", when, p.Delay) + } + } + + // Next restart should cause fail + if actual, _ := rt.NextRestart(127); actual { + t.Fail() + } +} + +func TestClient_RestartTracker_NoRestartOnSuccess(t *testing.T) { + t.Parallel() + p := testPolicy(false, structs.RestartPolicyModeDelay) + rt := NewRestartTracker(p) + if shouldRestart, _ := rt.NextRestart(0); shouldRestart { + t.Fatalf("NextRestart() returned %v, expected: %v", shouldRestart, false) } } diff --git a/client/task_runner.go b/client/task_runner.go index 4f10900a3..55c920bd8 100644 --- a/client/task_runner.go +++ b/client/task_runner.go @@ -24,7 +24,7 @@ type TaskRunner struct { logger *log.Logger ctx *driver.ExecContext alloc *structs.Allocation - restartTracker restartTracker + restartTracker *RestartTracker consulService *ConsulService task *structs.Task @@ -53,7 +53,7 @@ type TaskStateUpdater func(taskName string) func NewTaskRunner(logger *log.Logger, config *config.Config, updater TaskStateUpdater, ctx *driver.ExecContext, alloc *structs.Allocation, task *structs.Task, state *structs.TaskState, - restartTracker restartTracker, consulService *ConsulService) *TaskRunner { + restartTracker *RestartTracker, consulService *ConsulService) *TaskRunner { tc := &TaskRunner{ config: config, @@ -280,7 +280,7 @@ func (r *TaskRunner) run() { } // Check if we should restart. If not mark task as dead and exit. - shouldRestart, when := r.restartTracker.nextRestart(waitRes.ExitCode) + shouldRestart, when := r.restartTracker.NextRestart(waitRes.ExitCode) waitEvent := r.waitErrorToEvent(waitRes) if !shouldRestart { r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID) diff --git a/client/task_runner_test.go b/client/task_runner_test.go index f6b581eb7..4f1437052 100644 --- a/client/task_runner_test.go +++ b/client/task_runner_test.go @@ -42,7 +42,7 @@ func testTaskRunner(restarts bool) (*MockTaskStateUpdater, *TaskRunner) { ctx := driver.NewExecContext(allocDir, alloc.ID) rp := structs.NewRestartPolicy(structs.JobTypeService) - restartTracker := newRestartTracker(structs.JobTypeService, rp) + restartTracker := NewRestartTracker(rp) if !restarts { restartTracker = noRestartsTracker() } diff --git a/command/init.go b/command/init.go index 39cfbc4cd..cf2d0aa3b 100644 --- a/command/init.go +++ b/command/init.go @@ -104,15 +104,24 @@ job "example" { # Defaults to 1 # count = 1 - # Restart Policy - This block defines the restart policy for TaskGroups, - # the attempts value defines the number of restarts Nomad will do if Tasks - # in this TaskGroup fails in a rolling window of interval duration - # The delay value makes Nomad wait for that duration to restart after a Task - # fails or crashes. + # Configure the restart policy for the task group. If not provided, a + # default is used based on the job type. restart { - interval = "5m" + # The number of attempts to run the job within the specified interval. attempts = 10 + interval = "5m" + + # A delay between a task failing and a restart occuring. delay = "25s" + + # Whether the tasks should be restarted if the exit successfully. + on_success = true + + # Mode controls what happens when a task has restarted "attempts" + # times within the interval. "delay" mode delays the next restart + # till the next interval. "fail" mode does not restart the task if + # "attempts" has been hit within the interval. + mode = "delay" } # Define a task to run diff --git a/jobspec/parse.go b/jobspec/parse.go index 3559744fd..52df6e35b 100644 --- a/jobspec/parse.go +++ b/jobspec/parse.go @@ -159,10 +159,9 @@ func parseJob(result *structs.Job, list *ast.ObjectList) error { result.TaskGroups = make([]*structs.TaskGroup, len(tasks), len(tasks)*2) for i, t := range tasks { result.TaskGroups[i] = &structs.TaskGroup{ - Name: t.Name, - Count: 1, - Tasks: []*structs.Task{t}, - RestartPolicy: structs.NewRestartPolicy(result.Type), + Name: t.Name, + Count: 1, + Tasks: []*structs.Task{t}, } } } @@ -230,11 +229,10 @@ func parseGroups(result *structs.Job, list *ast.ObjectList) error { return err } } - g.RestartPolicy = structs.NewRestartPolicy(result.Type) // Parse restart policy if o := listVal.Filter("restart"); len(o.Items) > 0 { - if err := parseRestartPolicy(g.RestartPolicy, o); err != nil { + if err := parseRestartPolicy(&g.RestartPolicy, o); err != nil { return err } } @@ -267,12 +265,9 @@ func parseGroups(result *structs.Job, list *ast.ObjectList) error { return nil } -func parseRestartPolicy(final *structs.RestartPolicy, list *ast.ObjectList) error { +func parseRestartPolicy(final **structs.RestartPolicy, list *ast.ObjectList) error { list = list.Elem() - if len(list.Items) == 0 { - return nil - } - if len(list.Items) != 1 { + if len(list.Items) > 1 { return fmt.Errorf("only one 'restart' block allowed") } @@ -297,7 +292,7 @@ func parseRestartPolicy(final *structs.RestartPolicy, list *ast.ObjectList) erro return err } - *final = result + *final = &result return nil } diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index 4497348eb..df04e284c 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -48,11 +48,6 @@ func TestParse(t *testing.T) { &structs.TaskGroup{ Name: "outside", Count: 1, - RestartPolicy: &structs.RestartPolicy{ - Attempts: 2, - Interval: 1 * time.Minute, - Delay: 15 * time.Second, - }, Tasks: []*structs.Task{ &structs.Task{ Name: "outside", @@ -83,9 +78,11 @@ func TestParse(t *testing.T) { "elb_checks": "3", }, RestartPolicy: &structs.RestartPolicy{ - Interval: 10 * time.Minute, - Attempts: 5, - Delay: 15 * time.Second, + Interval: 10 * time.Minute, + Attempts: 5, + Delay: 15 * time.Second, + RestartOnSuccess: true, + Mode: "delay", }, Tasks: []*structs.Task{ &structs.Task{ @@ -271,11 +268,6 @@ func TestParse(t *testing.T) { &structs.TaskGroup{ Name: "bar", Count: 1, - RestartPolicy: &structs.RestartPolicy{ - Attempts: 2, - Interval: 1 * time.Minute, - Delay: 15 * time.Second, - }, Tasks: []*structs.Task{ &structs.Task{ Name: "bar", diff --git a/jobspec/test-fixtures/basic.hcl b/jobspec/test-fixtures/basic.hcl index 9696fdef8..549e380ef 100644 --- a/jobspec/test-fixtures/basic.hcl +++ b/jobspec/test-fixtures/basic.hcl @@ -35,6 +35,8 @@ job "binstore-storagelocker" { attempts = 5 interval = "10m" delay = "15s" + on_success = true + mode = "delay" } task "binstore" { driver = "docker" diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index ec4f08986..bed272309 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -19,17 +19,8 @@ import ( ) var ( - ErrNoLeader = fmt.Errorf("No cluster leader") - ErrNoRegionPath = fmt.Errorf("No path to region") - defaultServiceJobRestartPolicy = RestartPolicy{ - Delay: 15 * time.Second, - Attempts: 2, - Interval: 1 * time.Minute, - } - defaultBatchJobRestartPolicy = RestartPolicy{ - Delay: 15 * time.Second, - Attempts: 15, - } + ErrNoLeader = fmt.Errorf("No cluster leader") + ErrNoRegionPath = fmt.Errorf("No path to region") ) type MessageType uint8 @@ -786,8 +777,9 @@ type Job struct { // InitFields is used to initialize fields in the Job. This should be called // when registering a Job. func (j *Job) InitFields() { - // Initialize the service block. - j.InitAllServiceFields() + for _, tg := range j.TaskGroups { + tg.InitFields(j) + } // If the job is batch then make it GC. if j.Type == JobTypeBatch { @@ -795,16 +787,6 @@ func (j *Job) InitFields() { } } -// InitAllServiceFields traverses all Task Groups and makes them -// interpolate Job, Task group and Task names in all Service names. -// It also generates the check names if they are not set. This method also -// generates Check and Service IDs -func (j *Job) InitAllServiceFields() { - for _, tg := range j.TaskGroups { - tg.InitAllServiceFields(j.Name) - } -} - // Validate is used to sanity check a job input func (j *Job) Validate() error { var mErr multierror.Error @@ -984,15 +966,61 @@ func (p *PeriodicConfig) Next(fromTime time.Time) time.Time { return time.Time{} } -// RestartPolicy influences how Nomad restarts Tasks when they -// crash or fail. +var ( + defaultServiceJobRestartPolicy = RestartPolicy{ + Delay: 15 * time.Second, + Attempts: 2, + Interval: 1 * time.Minute, + RestartOnSuccess: true, + Mode: RestartPolicyModeDelay, + } + defaultBatchJobRestartPolicy = RestartPolicy{ + Delay: 15 * time.Second, + Attempts: 15, + Interval: 7 * 24 * time.Hour, + RestartOnSuccess: false, + Mode: RestartPolicyModeDelay, + } +) + +const ( + // RestartPolicyModeDelay causes an artificial delay till the next interval is + // reached when the specified attempts have been reached in the interval. + RestartPolicyModeDelay = "delay" + + // RestartPolicyModeFail causes a job to fail if the specified number of + // attempts are reached within an interval. + RestartPolicyModeFail = "fail" +) + +// RestartPolicy configures how Tasks are restarted when they crash or fail. type RestartPolicy struct { + // Attempts is the number of restart that will occur in an interval. Attempts int + + // Interval is a duration in which we can limit the number of restarts + // within. Interval time.Duration - Delay time.Duration + + // Delay is the time between a failure and a restart. + Delay time.Duration + + // RestartOnSuccess determines whether a task should be restarted if it + // exited successfully. + RestartOnSuccess bool `mapstructure:"on_success"` + + // Mode controls what happens when the task restarts more than attempt times + // in an interval. + Mode string } func (r *RestartPolicy) Validate() error { + switch r.Mode { + case RestartPolicyModeDelay, RestartPolicyModeFail: + default: + return fmt.Errorf("Unsupported restart mode: %q", r.Mode) + } + if r.Interval == 0 { return nil } @@ -1040,12 +1068,15 @@ type TaskGroup struct { Meta map[string]string } -// InitAllServiceFields traverses over all Tasks and makes them to interpolate -// values of Job, Task Group and Task names in all Service Names. -// It also generates service ids, check ids and check names -func (tg *TaskGroup) InitAllServiceFields(job string) { +// InitFields is used to initialize fields in the TaskGroup. +func (tg *TaskGroup) InitFields(job *Job) { + // Set the default restart policy. + if tg.RestartPolicy == nil { + tg.RestartPolicy = NewRestartPolicy(job.Type) + } + for _, task := range tg.Tasks { - task.InitAllServiceFields(job, tg.Name) + task.InitFields(job, tg) } } @@ -1240,10 +1271,15 @@ type Task struct { Meta map[string]string } -// InitAllServiceFields interpolates values of Job, Task Group +// InitFields initializes fields in the task. +func (t *Task) InitFields(job *Job, tg *TaskGroup) { + t.InitServiceFields(job.Name, tg.Name) +} + +// InitServiceFields interpolates values of Job, Task Group // and Tasks in all the service Names of a Task. This also generates the service // id, check id and check names. -func (t *Task) InitAllServiceFields(job string, taskGroup string) { +func (t *Task) InitServiceFields(job string, taskGroup string) { for _, service := range t.Services { service.InitFields(job, taskGroup, t.Name) } diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index 5111ea4a5..981697d8b 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -505,7 +505,7 @@ func TestJob_ExpandServiceNames(t *testing.T) { }, } - j.InitAllServiceFields() + j.InitFields() service1Name := j.TaskGroups[0].Tasks[0].Services[0].Name if service1Name != "my-job-web-frontend-default" { diff --git a/website/source/docs/jobspec/index.html.md b/website/source/docs/jobspec/index.html.md index 99aefe2b4..19a97c180 100644 --- a/website/source/docs/jobspec/index.html.md +++ b/website/source/docs/jobspec/index.html.md @@ -236,26 +236,37 @@ The `network` object supports the following keys: The `restart` object supports the following keys: -* `attempts` - For `batch` jobs, `attempts` is the maximum number of restarts - allowed before the task is failed. For non-batch jobs, the `attempts` is the - number of restarts allowed in an `interval` before a restart delay is added. +* `attempts` - `attempts` is the number of restarts allowed in an `interval`. -* `interval` - `interval` is only valid on non-batch jobs and is a time duration - that can be specified using the `s`, `m`, and `h` suffixes, such as `30s`. - The `interval` begins when the first task starts and ensures that only - `attempts` number of restarts happens within it. If more than `attempts` - number of failures happen, the restart is delayed till after the `interval`, - which is then reset. +* `interval` - `interval` is a time duration that can be specified using the + `s`, `m`, and `h` suffixes, such as `30s`. The `interval` begins when the + first task starts and ensures that only `attempts` number of restarts happens + within it. If more than `attempts` number of failures happen, behavior is + controlled by `mode`. * `delay` - A duration to wait before restarting a task. It is specified as a time duration using the `s`, `m`, and `h` suffixes, such as `30s`. +* `on_success` - `on_success` controls whether a task is restarted when the + task exits successfully. + +* `mode` - Controls the behavior when the task fails more than `attempts` + times in an interval. Possible values are listed below: + + * `delay` - `delay` will delay the next restart until the next `interval` is + reached. + + * `fail` - `fail` will not restart the task again. + The default `batch` restart policy is: ``` restart { attempts = 15 delay = "15s" + interval = "168h" # 7 days + on_success = false + mode = "delay" } ``` @@ -266,6 +277,8 @@ restart { interval = "1m" attempts = 2 delay = "15s" + on_success = true + mode = "delay" } ``` From f1c41ed4cf8c111463f7c8b49f824cdee14ea811 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 15 Dec 2015 17:13:04 -0800 Subject: [PATCH 02/10] Fix mock --- nomad/mock/mock.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index 2e0057c8e..abdfc859f 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -126,7 +126,7 @@ func Job() *structs.Job { CreateIndex: 42, ModifyIndex: 99, } - job.InitAllServiceFields() + job.InitFields() return job } From ea119b72975ef30499569a3b3a833d60155784b3 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 15 Dec 2015 17:35:55 -0800 Subject: [PATCH 03/10] Update restart policy in mock --- nomad/mock/mock.go | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index abdfc859f..b3a468d64 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -76,9 +76,11 @@ func Job() *structs.Job { Name: "web", Count: 10, RestartPolicy: &structs.RestartPolicy{ - Attempts: 3, - Interval: 10 * time.Minute, - Delay: 1 * time.Minute, + Attempts: 3, + Interval: 10 * time.Minute, + Delay: 1 * time.Minute, + RestartOnSuccess: true, + Mode: structs.RestartPolicyModeDelay, }, Tasks: []*structs.Task{ &structs.Task{ @@ -151,9 +153,11 @@ func SystemJob() *structs.Job { Name: "web", Count: 1, RestartPolicy: &structs.RestartPolicy{ - Attempts: 3, - Interval: 10 * time.Minute, - Delay: 1 * time.Minute, + Attempts: 3, + Interval: 10 * time.Minute, + Delay: 1 * time.Minute, + RestartOnSuccess: true, + Mode: structs.RestartPolicyModeDelay, }, Tasks: []*structs.Task{ &structs.Task{ From 55cc8c61b3fb361c6770d69fa2549139d8e7f4eb Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Fri, 18 Dec 2015 12:17:50 -0800 Subject: [PATCH 04/10] fix --- command/run.go | 3 +++ command/run_test.go | 1 + command/validate.go | 3 +++ command/validate_test.go | 1 + nomad/structs/structs_test.go | 16 ++++++++++------ 5 files changed, 18 insertions(+), 6 deletions(-) diff --git a/command/run.go b/command/run.go index 5b674b81b..d25692135 100644 --- a/command/run.go +++ b/command/run.go @@ -80,6 +80,9 @@ func (c *RunCommand) Run(args []string) int { return 1 } + // Initialize any fields that need to be. + job.InitFields() + // Check that the job is valid if err := job.Validate(); err != nil { c.Ui.Error(fmt.Sprintf("Error validating job: %s", err)) diff --git a/command/run_test.go b/command/run_test.go index c797ef406..a61f598f2 100644 --- a/command/run_test.go +++ b/command/run_test.go @@ -77,6 +77,7 @@ func TestRunCommand_Fails(t *testing.T) { defer os.Remove(fh3.Name()) _, err = fh3.WriteString(` job "job1" { + type = "service" datacenters = [ "dc1" ] group "group1" { count = 1 diff --git a/command/validate.go b/command/validate.go index 10d03eaca..56bb166d4 100644 --- a/command/validate.go +++ b/command/validate.go @@ -48,6 +48,9 @@ func (c *ValidateCommand) Run(args []string) int { return 1 } + // Initialize any fields that need to be. + job.InitFields() + // Check that the job is valid if err := job.Validate(); err != nil { c.Ui.Error(fmt.Sprintf("Error validating job: %s", err)) diff --git a/command/validate_test.go b/command/validate_test.go index 7e61c19b6..606f75a4a 100644 --- a/command/validate_test.go +++ b/command/validate_test.go @@ -24,6 +24,7 @@ func TestValidateCommand(t *testing.T) { defer os.Remove(fh.Name()) _, err = fh.WriteString(` job "job1" { + type = "service" datacenters = [ "dc1" ] group "group1" { count = 1 diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index 981697d8b..2b74c3f1a 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -115,9 +115,11 @@ func TestJob_IsPeriodic(t *testing.T) { func TestTaskGroup_Validate(t *testing.T) { tg := &TaskGroup{ RestartPolicy: &RestartPolicy{ - Interval: 5 * time.Minute, - Delay: 10 * time.Second, - Attempts: 10, + Interval: 5 * time.Minute, + Delay: 10 * time.Second, + Attempts: 10, + RestartOnSuccess: true, + Mode: RestartPolicyModeDelay, }, } err := tg.Validate() @@ -141,9 +143,11 @@ func TestTaskGroup_Validate(t *testing.T) { &Task{}, }, RestartPolicy: &RestartPolicy{ - Interval: 5 * time.Minute, - Delay: 10 * time.Second, - Attempts: 10, + Interval: 5 * time.Minute, + Delay: 10 * time.Second, + Attempts: 10, + RestartOnSuccess: true, + Mode: RestartPolicyModeDelay, }, } err = tg.Validate() From 156d5845d18bf73ad6f9fe19d514b09d702829b5 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 15 Dec 2015 18:08:53 -0800 Subject: [PATCH 05/10] Fix api test --- api/compose_test.go | 1 - api/tasks_test.go | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/api/compose_test.go b/api/compose_test.go index bc76895a9..f4b21fcbd 100644 --- a/api/compose_test.go +++ b/api/compose_test.go @@ -69,7 +69,6 @@ func TestCompose(t *testing.T) { Operand: "=", }, }, - RestartPolicy: NewRestartPolicy(), Tasks: []*Task{ &Task{ Name: "task1", diff --git a/api/tasks_test.go b/api/tasks_test.go index 75f29996d..bbdf12550 100644 --- a/api/tasks_test.go +++ b/api/tasks_test.go @@ -8,9 +8,8 @@ import ( func TestTaskGroup_NewTaskGroup(t *testing.T) { grp := NewTaskGroup("grp1", 2) expect := &TaskGroup{ - Name: "grp1", - Count: 2, - RestartPolicy: NewRestartPolicy(), + Name: "grp1", + Count: 2, } if !reflect.DeepEqual(grp, expect) { t.Fatalf("expect: %#v, got: %#v", expect, grp) From a5e9e2068c9dcd60e7563ea3e374d37ea49c7ee8 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Wed, 16 Dec 2015 14:39:37 -0800 Subject: [PATCH 06/10] Make NewRestartTracker private --- client/alloc_runner.go | 4 ++-- client/restarts.go | 4 ++-- client/restarts_test.go | 6 +++--- client/task_runner_test.go | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/client/alloc_runner.go b/client/alloc_runner.go index 1b4a5ea0d..71de749c7 100644 --- a/client/alloc_runner.go +++ b/client/alloc_runner.go @@ -110,7 +110,7 @@ func (r *AllocRunner) RestoreState() error { r.restored[name] = struct{}{} task := &structs.Task{Name: name} - restartTracker := NewRestartTracker(r.RestartPolicy) + restartTracker := newRestartTracker(r.RestartPolicy) tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx, r.alloc, task, r.alloc.TaskStates[task.Name], restartTracker, r.consulService) @@ -322,7 +322,7 @@ func (r *AllocRunner) Run() { // Merge in the task resources task.Resources = alloc.TaskResources[task.Name] - restartTracker := NewRestartTracker(r.RestartPolicy) + restartTracker := newRestartTracker(r.RestartPolicy) tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx, r.alloc, task, r.alloc.TaskStates[task.Name], restartTracker, r.consulService) diff --git a/client/restarts.go b/client/restarts.go index 82dd10da9..937a8c851 100644 --- a/client/restarts.go +++ b/client/restarts.go @@ -6,7 +6,7 @@ import ( "github.com/hashicorp/nomad/nomad/structs" ) -func NewRestartTracker(policy *structs.RestartPolicy) *RestartTracker { +func newRestartTracker(policy *structs.RestartPolicy) *RestartTracker { return &RestartTracker{ startTime: time.Now(), policy: policy, @@ -54,5 +54,5 @@ func (r *RestartTracker) shouldRestart(exitCode int) bool { // Returns a tracker that never restarts. func noRestartsTracker() *RestartTracker { policy := &structs.RestartPolicy{Attempts: 0, Mode: structs.RestartPolicyModeFail} - return NewRestartTracker(policy) + return newRestartTracker(policy) } diff --git a/client/restarts_test.go b/client/restarts_test.go index b30e418f3..04f2d362d 100644 --- a/client/restarts_test.go +++ b/client/restarts_test.go @@ -20,7 +20,7 @@ func testPolicy(success bool, mode string) *structs.RestartPolicy { func TestClient_RestartTracker_ModeDelay(t *testing.T) { t.Parallel() p := testPolicy(true, structs.RestartPolicyModeDelay) - rt := NewRestartTracker(p) + rt := newRestartTracker(p) for i := 0; i < p.Attempts; i++ { actual, when := rt.NextRestart(127) if !actual { @@ -46,7 +46,7 @@ func TestClient_RestartTracker_ModeDelay(t *testing.T) { func TestClient_RestartTracker_ModeFail(t *testing.T) { t.Parallel() p := testPolicy(true, structs.RestartPolicyModeFail) - rt := NewRestartTracker(p) + rt := newRestartTracker(p) for i := 0; i < p.Attempts; i++ { actual, when := rt.NextRestart(127) if !actual { @@ -66,7 +66,7 @@ func TestClient_RestartTracker_ModeFail(t *testing.T) { func TestClient_RestartTracker_NoRestartOnSuccess(t *testing.T) { t.Parallel() p := testPolicy(false, structs.RestartPolicyModeDelay) - rt := NewRestartTracker(p) + rt := newRestartTracker(p) if shouldRestart, _ := rt.NextRestart(0); shouldRestart { t.Fatalf("NextRestart() returned %v, expected: %v", shouldRestart, false) } diff --git a/client/task_runner_test.go b/client/task_runner_test.go index 4f1437052..eae7eb8e6 100644 --- a/client/task_runner_test.go +++ b/client/task_runner_test.go @@ -42,7 +42,7 @@ func testTaskRunner(restarts bool) (*MockTaskStateUpdater, *TaskRunner) { ctx := driver.NewExecContext(allocDir, alloc.ID) rp := structs.NewRestartPolicy(structs.JobTypeService) - restartTracker := NewRestartTracker(rp) + restartTracker := newRestartTracker(rp) if !restarts { restartTracker = noRestartsTracker() } From 307fbef7194fd7a5cf78137841c635194bcb1531 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Thu, 17 Dec 2015 10:37:53 -0800 Subject: [PATCH 07/10] Add jitter --- client/restarts.go | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/client/restarts.go b/client/restarts.go index 937a8c851..bf3921dba 100644 --- a/client/restarts.go +++ b/client/restarts.go @@ -1,6 +1,7 @@ package client import ( + "math/rand" "time" "github.com/hashicorp/nomad/nomad/structs" @@ -10,6 +11,7 @@ func newRestartTracker(policy *structs.RestartPolicy) *RestartTracker { return &RestartTracker{ startTime: time.Now(), policy: policy, + rand: rand.New(rand.NewSource(time.Now().Unix())), } } @@ -17,6 +19,7 @@ type RestartTracker struct { count int // Current number of attempts. startTime time.Time // When the interval began policy *structs.RestartPolicy + rand *rand.Rand } func (r *RestartTracker) NextRestart(exitCode int) (bool, time.Duration) { @@ -26,14 +29,14 @@ func (r *RestartTracker) NextRestart(exitCode int) (bool, time.Duration) { if now.After(end) { r.count = 0 r.startTime = now - return true, r.policy.Delay + return true, r.jitter() } r.count++ // If we are under the attempts, restart with delay. if r.count <= r.policy.Attempts { - return r.shouldRestart(exitCode), r.policy.Delay + return r.shouldRestart(exitCode), r.jitter() } // Don't restart since mode is "fail" @@ -48,7 +51,14 @@ func (r *RestartTracker) NextRestart(exitCode int) (bool, time.Duration) { // shouldRestart returns whether a restart should occur based on the exit code // and the RestartOnSuccess configuration. func (r *RestartTracker) shouldRestart(exitCode int) bool { - return exitCode != 0 || r.policy.RestartOnSuccess && exitCode == 0 + return exitCode != 0 || r.policy.RestartOnSuccess +} + +// jitter returns the delay time plus a jitter. +func (r *RestartTracker) jitter() time.Duration { + d := r.policy.Delay.Nanoseconds() + j := r.rand.Int63n(d) / 4 // Up to 25% jitter + return time.Duration(d + j) } // Returns a tracker that never restarts. From 0a17dc6eca0ea458e89d24c2dc59fcb9f839c121 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Thu, 17 Dec 2015 16:39:47 -0800 Subject: [PATCH 08/10] Document jitter --- website/source/docs/jobspec/index.html.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/source/docs/jobspec/index.html.md b/website/source/docs/jobspec/index.html.md index 19a97c180..f55251c2b 100644 --- a/website/source/docs/jobspec/index.html.md +++ b/website/source/docs/jobspec/index.html.md @@ -245,7 +245,8 @@ The `restart` object supports the following keys: controlled by `mode`. * `delay` - A duration to wait before restarting a task. It is specified as a - time duration using the `s`, `m`, and `h` suffixes, such as `30s`. + time duration using the `s`, `m`, and `h` suffixes, such as `30s`. A random + jitter of up to 25% is added to the the delay. * `on_success` - `on_success` controls whether a task is restarted when the task exits successfully. From b0f321c2c76ce33e2f6d379f4d2fa350b8d42d5b Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Fri, 18 Dec 2015 12:11:12 -0800 Subject: [PATCH 09/10] Fix test because of jitter --- client/restarts.go | 7 +++++-- client/restarts_test.go | 15 +++++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/client/restarts.go b/client/restarts.go index bf3921dba..6da456e83 100644 --- a/client/restarts.go +++ b/client/restarts.go @@ -7,6 +7,9 @@ import ( "github.com/hashicorp/nomad/nomad/structs" ) +// jitter is the percent of jitter added to restart delays. +const jitter = 0.25 + func newRestartTracker(policy *structs.RestartPolicy) *RestartTracker { return &RestartTracker{ startTime: time.Now(), @@ -57,8 +60,8 @@ func (r *RestartTracker) shouldRestart(exitCode int) bool { // jitter returns the delay time plus a jitter. func (r *RestartTracker) jitter() time.Duration { d := r.policy.Delay.Nanoseconds() - j := r.rand.Int63n(d) / 4 // Up to 25% jitter - return time.Duration(d + j) + j := float64(r.rand.Int63n(d)) * jitter + return time.Duration(d + int64(j)) } // Returns a tracker that never restarts. diff --git a/client/restarts_test.go b/client/restarts_test.go index 04f2d362d..79e4d2a56 100644 --- a/client/restarts_test.go +++ b/client/restarts_test.go @@ -17,6 +17,13 @@ func testPolicy(success bool, mode string) *structs.RestartPolicy { } } +// withinJitter is a helper that returns whether the returned delay is within +// the jitter. +func withinJitter(expected, actual time.Duration) bool { + return float64((actual.Nanoseconds()-expected.Nanoseconds())/ + expected.Nanoseconds()) <= jitter +} + func TestClient_RestartTracker_ModeDelay(t *testing.T) { t.Parallel() p := testPolicy(true, structs.RestartPolicyModeDelay) @@ -26,8 +33,8 @@ func TestClient_RestartTracker_ModeDelay(t *testing.T) { if !actual { t.Fatalf("NextRestart() returned %v, want %v", actual, true) } - if when != p.Delay { - t.Fatalf("NextRestart() returned %v; want %v", when, p.Delay) + if !withinJitter(p.Delay, when) { + t.Fatalf("NextRestart() returned %v; want %v+jitter", when, p.Delay) } } @@ -52,8 +59,8 @@ func TestClient_RestartTracker_ModeFail(t *testing.T) { if !actual { t.Fatalf("NextRestart() returned %v, want %v", actual, true) } - if when != p.Delay { - t.Fatalf("NextRestart() returned %v; want %v", when, p.Delay) + if !withinJitter(p.Delay, when) { + t.Fatalf("NextRestart() returned %v; want %v+jitter", when, p.Delay) } } From 423d16908ee0174825518c2216f299588c353add Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Fri, 18 Dec 2015 12:24:53 -0800 Subject: [PATCH 10/10] Go vet --- nomad/core_sched_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index 54c876051..4d71de5bc 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -192,7 +192,7 @@ func TestCoreScheduler_JobGC(t *testing.T) { // Should still exist out, err := state.JobByID(job.ID) if err != nil { - t.Fatalf("test(%s) err: %v", err) + t.Fatalf("test(%s) err: %v", test.test, err) } if (test.shouldExist && out == nil) || (!test.shouldExist && out != nil) { t.Fatalf("test(%s) bad: %v", test.test, out) @@ -200,7 +200,7 @@ func TestCoreScheduler_JobGC(t *testing.T) { outE, err := state.EvalByID(eval.ID) if err != nil { - t.Fatalf("test(%s) err: %v", err) + t.Fatalf("test(%s) err: %v", test.test, err) } if (test.shouldExist && outE == nil) || (!test.shouldExist && outE != nil) { t.Fatalf("test(%s) bad: %v", test.test, out) @@ -208,7 +208,7 @@ func TestCoreScheduler_JobGC(t *testing.T) { outA, err := state.AllocByID(alloc.ID) if err != nil { - t.Fatalf("test(%s) err: %v", err) + t.Fatalf("test(%s) err: %v", test.test, err) } if (test.shouldExist && outA == nil) || (!test.shouldExist && outA != nil) { t.Fatalf("test(%s) bad: %v", test.test, outA)