// open-nomad/client/restarts_test.go

package client

import (
	"fmt"
	"testing"
	"time"

	cstructs "github.com/hashicorp/nomad/client/driver/structs"
	"github.com/hashicorp/nomad/nomad/structs"
)

// testPolicy builds the restart policy shared by the tests in this file: three
// attempts per two-minute interval with a one-second delay, using the given
// mode. The success argument is accepted but not consulted.
func testPolicy(success bool, mode string) *structs.RestartPolicy {
	policy := new(structs.RestartPolicy)
	policy.Mode = mode
	policy.Attempts = 3
	policy.Delay = 1 * time.Second
	policy.Interval = 2 * time.Minute
	return policy
}

// withinJitter reports whether actual exceeds expected by no more than the
// jitter fraction of expected. Only the upper bound is checked: an actual
// delay at or below expected is always accepted.
func withinJitter(expected, actual time.Duration) bool {
	if expected == 0 {
		// A zero expected delay has no relative overshoot to measure (and
		// would otherwise divide by zero); accept only a non-positive actual.
		return actual <= 0
	}
	// Convert to float64 before dividing. Integer division would truncate any
	// overshoot smaller than expected to zero and make the check pass
	// regardless of the jitter bound.
	overshoot := float64(actual-expected) / float64(expected)
	return overshoot <= jitter
}

// testWaitResult wraps cstructs.NewWaitResult for the tests in this file,
// passing exit as the first argument and fixing the remaining two arguments
// to 0 and nil.
func testWaitResult(exit int) *cstructs.WaitResult {
	result := cstructs.NewWaitResult(exit, 0, nil)
	return result
}

// TestClient_RestartTracker_ModeDelay asserts that, with a delay-mode policy,
// each of the policy's allowed attempts reports TaskRestarting after roughly
// the policy delay, and that further failures still report TaskRestarting but
// with a wait longer than the delay and no longer than the policy interval.
func TestClient_RestartTracker_ModeDelay(t *testing.T) {
	t.Parallel()
	p := testPolicy(true, structs.RestartPolicyModeDelay)
	rt := newRestartTracker(p, structs.JobTypeService)
	for i := 0; i < p.Attempts; i++ {
		state, when := rt.SetWaitResult(testWaitResult(127)).GetState()
		if state != structs.TaskRestarting {
			t.Fatalf("NextRestart() returned %v, want %v", state, structs.TaskRestarting)
		}
		if !withinJitter(p.Delay, when) {
			t.Fatalf("NextRestart() returned %v; want %v+jitter", when, p.Delay)
		}
	}

	// Follow up restarts should cause delay.
	for i := 0; i < 3; i++ {
		state, when := rt.SetWaitResult(testWaitResult(127)).GetState()
		if state != structs.TaskRestarting {
			// Stop with a diagnostic rather than a bare Fail: the delay check
			// below is meaningless once the state is already wrong.
			t.Fatalf("NextRestart() returned %v, want %v", state, structs.TaskRestarting)
		}
		if !(when > p.Delay && when <= p.Interval) {
			t.Fatalf("NextRestart() returned %v; want > %v and <= %v", when, p.Delay, p.Interval)
		}
	}
}

// TestClient_RestartTracker_ModeFail asserts that, with a fail-mode policy,
// each allowed attempt reports TaskRestarting after roughly the policy delay
// and that the failure following the last attempt reports TaskNotRestarting.
func TestClient_RestartTracker_ModeFail(t *testing.T) {
	t.Parallel()
	policy := testPolicy(true, structs.RestartPolicyModeFail)
	tracker := newRestartTracker(policy, structs.JobTypeSystem)

	// Consume every attempt the policy allows.
	for remaining := policy.Attempts; remaining > 0; remaining-- {
		gotState, gotDelay := tracker.SetWaitResult(testWaitResult(127)).GetState()
		if gotState != structs.TaskRestarting {
			t.Fatalf("NextRestart() returned %v, want %v", gotState, structs.TaskRestarting)
		}
		if withinJitter(policy.Delay, gotDelay) {
			continue
		}
		t.Fatalf("NextRestart() returned %v; want %v+jitter", gotDelay, policy.Delay)
	}

	// One more failure must not be restarted.
	finalState, _ := tracker.SetWaitResult(testWaitResult(127)).GetState()
	if finalState == structs.TaskNotRestarting {
		return
	}
	t.Fatalf("NextRestart() returned %v; want %v", finalState, structs.TaskNotRestarting)
}

// TestClient_RestartTracker_NoRestartOnSuccess asserts that a batch job whose
// task exits with code 0 reports TaskTerminated.
func TestClient_RestartTracker_NoRestartOnSuccess(t *testing.T) {
	t.Parallel()
	policy := testPolicy(false, structs.RestartPolicyModeDelay)
	tracker := newRestartTracker(policy, structs.JobTypeBatch)

	gotState, _ := tracker.SetWaitResult(testWaitResult(0)).GetState()
	if gotState == structs.TaskTerminated {
		return
	}
	t.Fatalf("NextRestart() returned %v, expected: %v", gotState, structs.TaskTerminated)
}

// TestClient_RestartTracker_ZeroAttempts covers a fail-mode policy with zero
// attempts across job types and exit codes. Each case uses a fresh tracker.
func TestClient_RestartTracker_ZeroAttempts(t *testing.T) {
	t.Parallel()
	policy := testPolicy(true, structs.RestartPolicyModeFail)
	policy.Attempts = 0

	// Service job, failing exit code: must not restart.
	tracker := newRestartTracker(policy, structs.JobTypeService)
	gotState, gotDelay := tracker.SetWaitResult(testWaitResult(1)).GetState()
	if gotState != structs.TaskNotRestarting {
		t.Fatalf("expect no restart, got restart/delay: %v/%v", gotState, gotDelay)
	}

	// Service job, successful exit code: non-batch jobs are still expected to
	// end up as TaskNotRestarting.
	tracker = newRestartTracker(policy, structs.JobTypeService)
	gotState, gotDelay = tracker.SetWaitResult(testWaitResult(0)).GetState()
	if gotState != structs.TaskNotRestarting {
		t.Fatalf("expect no restart, got restart/delay: %v/%v", gotState, gotDelay)
	}

	// Batch job, successful exit code: expected to finish cleanly as
	// TaskTerminated.
	tracker = newRestartTracker(policy, structs.JobTypeBatch)
	gotState, gotDelay = tracker.SetWaitResult(testWaitResult(0)).GetState()
	if gotState != structs.TaskTerminated {
		t.Fatalf("expect terminated, got restart/delay: %v/%v", gotState, gotDelay)
	}

	// Batch job, failing exit code: must not restart.
	tracker = newRestartTracker(policy, structs.JobTypeBatch)
	gotState, gotDelay = tracker.SetWaitResult(testWaitResult(1)).GetState()
	if gotState != structs.TaskNotRestarting {
		t.Fatalf("expect no restart, got restart/delay: %v/%v", gotState, gotDelay)
	}
}

// TestClient_RestartTracker_RestartTriggered asserts that a triggered restart
// (SetRestartTriggered(false)) reports TaskRestarting with a zero delay even
// when the policy allows zero attempts.
func TestClient_RestartTracker_RestartTriggered(t *testing.T) {
	t.Parallel()
	p := testPolicy(true, structs.RestartPolicyModeFail)
	p.Attempts = 0
	rt := newRestartTracker(p, structs.JobTypeService)
	// Either a wrong state or a non-zero delay is a failure; joining the two
	// checks with && would only fail when both are wrong at the same time.
	if state, when := rt.SetRestartTriggered(false).GetState(); state != structs.TaskRestarting || when != 0 {
		t.Fatalf("expect restart immediately, got %v %v", state, when)
	}
}

// TestClient_RestartTracker_RestartTriggered_Failure asserts that, with one
// allowed attempt, the first SetRestartTriggered(true) reports TaskRestarting
// with a non-zero delay and the second reports TaskNotRestarting with a zero
// delay.
func TestClient_RestartTracker_RestartTriggered_Failure(t *testing.T) {
	t.Parallel()
	policy := testPolicy(true, structs.RestartPolicyModeFail)
	policy.Attempts = 1
	tracker := newRestartTracker(policy, structs.JobTypeService)

	firstState, firstDelay := tracker.SetRestartTriggered(true).GetState()
	if !(firstState == structs.TaskRestarting && firstDelay != 0) {
		t.Fatalf("expect restart got %v %v", firstState, firstDelay)
	}

	secondState, secondDelay := tracker.SetRestartTriggered(true).GetState()
	if !(secondState == structs.TaskNotRestarting && secondDelay == 0) {
		t.Fatalf("expect failed got %v %v", secondState, secondDelay)
	}
}

// TestClient_RestartTracker_StartError_Recoverable_Fail asserts that, with a
// fail-mode policy, a recoverable start error reports TaskRestarting after
// roughly the policy delay for each allowed attempt and TaskNotRestarting on
// the following one.
func TestClient_RestartTracker_StartError_Recoverable_Fail(t *testing.T) {
	t.Parallel()
	policy := testPolicy(true, structs.RestartPolicyModeFail)
	tracker := newRestartTracker(policy, structs.JobTypeSystem)
	startErr := structs.NewRecoverableError(fmt.Errorf("foo"), true)

	// Consume every attempt the policy allows.
	for remaining := policy.Attempts; remaining > 0; remaining-- {
		gotState, gotDelay := tracker.SetStartError(startErr).GetState()
		if gotState != structs.TaskRestarting {
			t.Fatalf("NextRestart() returned %v, want %v", gotState, structs.TaskRestarting)
		}
		if withinJitter(policy.Delay, gotDelay) {
			continue
		}
		t.Fatalf("NextRestart() returned %v; want %v+jitter", gotDelay, policy.Delay)
	}

	// One more start error must not be restarted.
	finalState, _ := tracker.SetStartError(startErr).GetState()
	if finalState == structs.TaskNotRestarting {
		return
	}
	t.Fatalf("NextRestart() returned %v; want %v", finalState, structs.TaskNotRestarting)
}

// TestClient_RestartTracker_StartError_Recoverable_Delay asserts that, with a
// delay-mode policy, a recoverable start error reports TaskRestarting after
// roughly the policy delay for each allowed attempt, and that the following
// one still reports TaskRestarting with a wait longer than the delay and no
// longer than the policy interval.
func TestClient_RestartTracker_StartError_Recoverable_Delay(t *testing.T) {
	t.Parallel()
	policy := testPolicy(true, structs.RestartPolicyModeDelay)
	tracker := newRestartTracker(policy, structs.JobTypeSystem)
	startErr := structs.NewRecoverableError(fmt.Errorf("foo"), true)

	// Consume every attempt the policy allows.
	for remaining := policy.Attempts; remaining > 0; remaining-- {
		gotState, gotDelay := tracker.SetStartError(startErr).GetState()
		if gotState != structs.TaskRestarting {
			t.Fatalf("NextRestart() returned %v, want %v", gotState, structs.TaskRestarting)
		}
		if withinJitter(policy.Delay, gotDelay) {
			continue
		}
		t.Fatalf("NextRestart() returned %v; want %v+jitter", gotDelay, policy.Delay)
	}

	// One more start error must still restart, but only after a longer wait.
	finalState, finalDelay := tracker.SetStartError(startErr).GetState()
	if finalState != structs.TaskRestarting {
		t.Fatalf("NextRestart() returned %v; want %v", finalState, structs.TaskRestarting)
	}
	if finalDelay <= policy.Delay || finalDelay > policy.Interval {
		t.Fatalf("NextRestart() returned %v; want > %v and <= %v", finalDelay, policy.Delay, policy.Interval)
	}
}
Cleaned up the logic to calculate restart duration 2015-11-05 19:12:31 +00:00			`package client`

			`import (`
Refactor task runner to include driver starting into restart policy and add recoverable errors 2016-02-29 00:56:05 +00:00			`"fmt"`
Cleaned up the logic to calculate restart duration 2015-11-05 19:12:31 +00:00			`"testing"`
			`"time"`
merge 2015-12-18 20:17:13 +00:00
Refactor task runner to include driver starting into restart policy and add recoverable errors 2016-02-29 00:56:05 +00:00			`cstructs "github.com/hashicorp/nomad/client/driver/structs"`
merge 2015-12-18 20:17:13 +00:00			`"github.com/hashicorp/nomad/nomad/structs"`
Cleaned up the logic to calculate restart duration 2015-11-05 19:12:31 +00:00			`)`

merge 2015-12-18 20:17:13 +00:00			`func testPolicy(success bool, mode string) *structs.RestartPolicy {`
			`return &structs.RestartPolicy{`
Restart on-success shouldn't be user specifiable 2016-02-02 23:08:07 +00:00			`Interval: 2 * time.Minute,`
			`Delay: 1 * time.Second,`
			`Attempts: 3,`
			`Mode: mode,`
merge 2015-12-18 20:17:13 +00:00			`}`
			`}`
Incremeneting the counter once we calculate next restart 2015-11-06 01:13:25 +00:00
Fix test because of jitter 2015-12-18 20:11:12 +00:00			`// withinJitter is a helper that returns whether the returned delay is within`
			`// the jitter.`
			`func withinJitter(expected, actual time.Duration) bool {`
			`return float64((actual.Nanoseconds()-expected.Nanoseconds())/`
			`expected.Nanoseconds()) <= jitter`
			`}`

Refactor task runner to include driver starting into restart policy and add recoverable errors 2016-02-29 00:56:05 +00:00			`func testWaitResult(exit int) *cstructs.WaitResult {`
			`return cstructs.NewWaitResult(exit, 0, nil)`
			`}`

merge 2015-12-18 20:17:13 +00:00			`func TestClient_RestartTracker_ModeDelay(t *testing.T) {`
			`t.Parallel()`
			`p := testPolicy(true, structs.RestartPolicyModeDelay)`
Restart on-success shouldn't be user specifiable 2016-02-02 23:08:07 +00:00			`rt := newRestartTracker(p, structs.JobTypeService)`
merge 2015-12-18 20:17:13 +00:00			`for i := 0; i < p.Attempts; i++ {`
Refactor task runner to include driver starting into restart policy and add recoverable errors 2016-02-29 00:56:05 +00:00			`state, when := rt.SetWaitResult(testWaitResult(127)).GetState()`
			`if state != structs.TaskRestarting {`
			`t.Fatalf("NextRestart() returned %v, want %v", state, structs.TaskRestarting)`
Incremeneting the counter once we calculate next restart 2015-11-06 01:13:25 +00:00			`}`
Fix test because of jitter 2015-12-18 20:11:12 +00:00			`if !withinJitter(p.Delay, when) {`
			`t.Fatalf("NextRestart() returned %v; want %v+jitter", when, p.Delay)`
Incremeneting the counter once we calculate next restart 2015-11-06 01:13:25 +00:00			`}`
Cleaned up the logic to calculate restart duration 2015-11-05 19:12:31 +00:00			`}`
Incremeneting the counter once we calculate next restart 2015-11-06 01:13:25 +00:00
merge 2015-12-18 20:17:13 +00:00			`// Follow up restarts should cause delay.`
Incremeneting the counter once we calculate next restart 2015-11-06 01:13:25 +00:00			`for i := 0; i < 3; i++ {`
Refactor task runner to include driver starting into restart policy and add recoverable errors 2016-02-29 00:56:05 +00:00			`state, when := rt.SetWaitResult(testWaitResult(127)).GetState()`
			`if state != structs.TaskRestarting {`
Incremeneting the counter once we calculate next restart 2015-11-06 01:13:25 +00:00			`t.Fail()`
			`}`
Allow for low precision time.Now on Windows. 2015-12-21 15:43:45 +00:00			`if !(when > p.Delay && when <= p.Interval) {`
			`t.Fatalf("NextRestart() returned %v; want > %v and <= %v", when, p.Delay, p.Interval)`
Incremeneting the counter once we calculate next restart 2015-11-06 01:13:25 +00:00			`}`
			`}`
Cleaned up the logic to calculate restart duration 2015-11-05 19:12:31 +00:00			`}`

merge 2015-12-18 20:17:13 +00:00			`func TestClient_RestartTracker_ModeFail(t *testing.T) {`
			`t.Parallel()`
			`p := testPolicy(true, structs.RestartPolicyModeFail)`
Restart on-success shouldn't be user specifiable 2016-02-02 23:08:07 +00:00			`rt := newRestartTracker(p, structs.JobTypeSystem)`
merge 2015-12-18 20:17:13 +00:00			`for i := 0; i < p.Attempts; i++ {`
Refactor task runner to include driver starting into restart policy and add recoverable errors 2016-02-29 00:56:05 +00:00			`state, when := rt.SetWaitResult(testWaitResult(127)).GetState()`
			`if state != structs.TaskRestarting {`
			`t.Fatalf("NextRestart() returned %v, want %v", state, structs.TaskRestarting)`
Fixed some tests and refactored logic 2015-11-06 01:30:41 +00:00			`}`
Fix test because of jitter 2015-12-18 20:11:12 +00:00			`if !withinJitter(p.Delay, when) {`
			`t.Fatalf("NextRestart() returned %v; want %v+jitter", when, p.Delay)`
Fixed some tests and refactored logic 2015-11-06 01:30:41 +00:00			`}`
Cleaned up the logic to calculate restart duration 2015-11-05 19:12:31 +00:00			`}`
merge 2015-12-18 20:17:13 +00:00
			`// Next restart should cause fail`
Refactor task runner to include driver starting into restart policy and add recoverable errors 2016-02-29 00:56:05 +00:00			`if state, _ := rt.SetWaitResult(testWaitResult(127)).GetState(); state != structs.TaskNotRestarting {`
			`t.Fatalf("NextRestart() returned %v; want %v", state, structs.TaskNotRestarting)`
Cleaned up the logic to calculate restart duration 2015-11-05 19:12:31 +00:00			`}`
			`}`
Making the restart tracker aware of the exit codes 2015-11-23 18:56:38 +00:00
merge 2015-12-18 20:17:13 +00:00			`func TestClient_RestartTracker_NoRestartOnSuccess(t *testing.T) {`
			`t.Parallel()`
			`p := testPolicy(false, structs.RestartPolicyModeDelay)`
Restart on-success shouldn't be user specifiable 2016-02-02 23:08:07 +00:00			`rt := newRestartTracker(p, structs.JobTypeBatch)`
Refactor task runner to include driver starting into restart policy and add recoverable errors 2016-02-29 00:56:05 +00:00			`if state, _ := rt.SetWaitResult(testWaitResult(0)).GetState(); state != structs.TaskTerminated {`
			`t.Fatalf("NextRestart() returned %v, expected: %v", state, structs.TaskTerminated)`
Making the restart tracker aware of the exit codes 2015-11-23 18:56:38 +00:00			`}`
client: obey restart policy when attempts == 0 2016-02-02 22:17:39 +00:00			`}`
Making the restart tracker aware of the exit codes 2015-11-23 18:56:38 +00:00
client: obey restart policy when attempts == 0 2016-02-02 22:17:39 +00:00			`func TestClient_RestartTracker_ZeroAttempts(t *testing.T) {`
			`t.Parallel()`
			`p := testPolicy(true, structs.RestartPolicyModeFail)`
			`p.Attempts = 0`
client: always mark exited sys/svc allocs as failed When restarts.attempts=0 was set in a jobspec a system or service alloc that exited with 0 status would be marked as `completed` instead of `failed`. Since system and service jobs are intended to run until stopped or updated, they should always be marked as failed when they exit even in cases where the exit code is 0. 2018-03-23 18:16:58 +00:00
			`// Test with a non-zero exit code`
Restart on-success shouldn't be user specifiable 2016-02-02 23:08:07 +00:00			`rt := newRestartTracker(p, structs.JobTypeService)`
Refactor task runner to include driver starting into restart policy and add recoverable errors 2016-02-29 00:56:05 +00:00			`if state, when := rt.SetWaitResult(testWaitResult(1)).GetState(); state != structs.TaskNotRestarting {`
client: always mark exited sys/svc allocs as failed When restarts.attempts=0 was set in a jobspec a system or service alloc that exited with 0 status would be marked as `completed` instead of `failed`. Since system and service jobs are intended to run until stopped or updated, they should always be marked as failed when they exit even in cases where the exit code is 0. 2018-03-23 18:16:58 +00:00			`t.Fatalf("expect no restart, got restart/delay: %v/%v", state, when)`
			`}`

			`// Even with a zero (successful) exit code non-batch jobs should exit`
			`// with TaskNotRestarting`
			`rt = newRestartTracker(p, structs.JobTypeService)`
			`if state, when := rt.SetWaitResult(testWaitResult(0)).GetState(); state != structs.TaskNotRestarting {`
			`t.Fatalf("expect no restart, got restart/delay: %v/%v", state, when)`
			`}`

			`// Batch jobs with a zero exit code and 0 attempts do exit cleanly`
			`// with Terminated`
			`rt = newRestartTracker(p, structs.JobTypeBatch)`
			`if state, when := rt.SetWaitResult(testWaitResult(0)).GetState(); state != structs.TaskTerminated {`
			`t.Fatalf("expect terminated, got restart/delay: %v/%v", state, when)`
			`}`

			`// Batch jobs with a non-zero exit code and 0 attempts exit with`
			`// TaskNotRestarting`
			`rt = newRestartTracker(p, structs.JobTypeBatch)`
			`if state, when := rt.SetWaitResult(testWaitResult(1)).GetState(); state != structs.TaskNotRestarting {`
			`t.Fatalf("expect no restart, got restart/delay: %v/%v", state, when)`
client: obey restart policy when attempts == 0 2016-02-02 22:17:39 +00:00			`}`
Making the restart tracker aware of the exit codes 2015-11-23 18:56:38 +00:00			`}`
Refactor task runner to include driver starting into restart policy and add recoverable errors 2016-02-29 00:56:05 +00:00
Fix handling of restart in TaskEvents 2016-10-05 22:11:09 +00:00			`func TestClient_RestartTracker_RestartTriggered(t *testing.T) {`
			`t.Parallel()`
			`p := testPolicy(true, structs.RestartPolicyModeFail)`
			`p.Attempts = 0`
			`rt := newRestartTracker(p, structs.JobTypeService)`
Fold SetFailure into SetRestartTriggered 2017-09-14 22:27:39 +00:00			`if state, when := rt.SetRestartTriggered(false).GetState(); state != structs.TaskRestarting && when != 0 {`
Fix handling of restart in TaskEvents 2016-10-05 22:11:09 +00:00			`t.Fatalf("expect restart immediately, got %v %v", state, when)`
			`}`
			`}`

Cleanup and test restart failure code 2017-09-15 21:54:37 +00:00			`func TestClient_RestartTracker_RestartTriggered_Failure(t *testing.T) {`
			`t.Parallel()`
			`p := testPolicy(true, structs.RestartPolicyModeFail)`
			`p.Attempts = 1`
			`rt := newRestartTracker(p, structs.JobTypeService)`
			`if state, when := rt.SetRestartTriggered(true).GetState(); state != structs.TaskRestarting \|\| when == 0 {`
			`t.Fatalf("expect restart got %v %v", state, when)`
			`}`
			`if state, when := rt.SetRestartTriggered(true).GetState(); state != structs.TaskNotRestarting \|\| when != 0 {`
			`t.Fatalf("expect failed got %v %v", state, when)`
			`}`
			`}`

test policy delay for startup error 2016-07-12 01:54:36 +00:00			`func TestClient_RestartTracker_StartError_Recoverable_Fail(t *testing.T) {`
Refactor task runner to include driver starting into restart policy and add recoverable errors 2016-02-29 00:56:05 +00:00			`t.Parallel()`
test policy delay for startup error 2016-07-12 01:54:36 +00:00			`p := testPolicy(true, structs.RestartPolicyModeFail)`
Refactor task runner to include driver starting into restart policy and add recoverable errors 2016-02-29 00:56:05 +00:00			`rt := newRestartTracker(p, structs.JobTypeSystem)`
Thread through whether DeriveToken error is recoverable or not 2016-10-23 01:08:30 +00:00			`recErr := structs.NewRecoverableError(fmt.Errorf("foo"), true)`
Refactor task runner to include driver starting into restart policy and add recoverable errors 2016-02-29 00:56:05 +00:00			`for i := 0; i < p.Attempts; i++ {`
			`state, when := rt.SetStartError(recErr).GetState()`
			`if state != structs.TaskRestarting {`
			`t.Fatalf("NextRestart() returned %v, want %v", state, structs.TaskRestarting)`
			`}`
			`if !withinJitter(p.Delay, when) {`
			`t.Fatalf("NextRestart() returned %v; want %v+jitter", when, p.Delay)`
			`}`
			`}`

			`// Next restart should cause fail`
			`if state, _ := rt.SetStartError(recErr).GetState(); state != structs.TaskNotRestarting {`
			`t.Fatalf("NextRestart() returned %v; want %v", state, structs.TaskNotRestarting)`
			`}`
			`}`
test policy delay for startup error 2016-07-12 01:54:36 +00:00
			`func TestClient_RestartTracker_StartError_Recoverable_Delay(t *testing.T) {`
Fix gofmt in restarts_test.go 2016-07-30 12:11:06 +00:00			`t.Parallel()`
			`p := testPolicy(true, structs.RestartPolicyModeDelay)`
			`rt := newRestartTracker(p, structs.JobTypeSystem)`
Thread through whether DeriveToken error is recoverable or not 2016-10-23 01:08:30 +00:00			`recErr := structs.NewRecoverableError(fmt.Errorf("foo"), true)`
Fix gofmt in restarts_test.go 2016-07-30 12:11:06 +00:00			`for i := 0; i < p.Attempts; i++ {`
			`state, when := rt.SetStartError(recErr).GetState()`
			`if state != structs.TaskRestarting {`
			`t.Fatalf("NextRestart() returned %v, want %v", state, structs.TaskRestarting)`
			`}`
			`if !withinJitter(p.Delay, when) {`
			`t.Fatalf("NextRestart() returned %v; want %v+jitter", when, p.Delay)`
			`}`
			`}`
test policy delay for startup error 2016-07-12 01:54:36 +00:00
Fix gofmt in restarts_test.go 2016-07-30 12:11:06 +00:00			`// Next restart should cause delay`
			`state, when := rt.SetStartError(recErr).GetState()`
			`if state != structs.TaskRestarting {`
			`t.Fatalf("NextRestart() returned %v; want %v", state, structs.TaskRestarting)`
			`}`
			`if !(when > p.Delay && when <= p.Interval) {`
			`t.Fatalf("NextRestart() returned %v; want > %v and <= %v", when, p.Delay, p.Interval)`
			`}`
test policy delay for startup error 2016-07-12 01:54:36 +00:00			`}`