open-nomad/client/restarts_test.go
Michael Schurter fcaee471a0 client: always mark exited sys/svc allocs as failed
When restarts.attempts=0 was set in a jobspec a system or service alloc
that exited with 0 status would be marked as `completed` instead of
`failed`. Since system and service jobs are intended to run until
stopped or updated, they should always be marked as failed when they
exit even in cases where the exit code is 0.
2018-03-27 14:30:19 -07:00

188 lines
6.5 KiB
Go

package client
import (
"fmt"
"testing"
"time"
cstructs "github.com/hashicorp/nomad/client/driver/structs"
"github.com/hashicorp/nomad/nomad/structs"
)
// testPolicy builds the restart policy shared by the tests in this file: 3
// attempts per 2 minute interval with a 1 second delay, using the given
// restart mode. The success argument is accepted but never consulted.
func testPolicy(success bool, mode string) *structs.RestartPolicy {
	policy := new(structs.RestartPolicy)
	policy.Mode = mode
	policy.Attempts = 3
	policy.Delay = time.Second
	policy.Interval = 2 * time.Minute
	return policy
}
// withinJitter is a helper that returns whether the returned delay is within
// the jitter: it reports true when actual exceeds expected by no more than
// the fraction jitter of expected.
//
// The comparison is done in floating point. Dividing the nanosecond counts as
// integers first would truncate any overshoot smaller than expected to zero
// and so accept delays of almost twice the expected value. Multiplying rather
// than dividing also keeps a zero expected delay from triggering a division
// by zero. Delays shorter than expected are accepted.
func withinJitter(expected, actual time.Duration) bool {
	overshoot := float64(actual - expected)
	return overshoot <= jitter*float64(expected)
}
// testWaitResult wraps the given exit code in a WaitResult, passing 0 and nil
// for the remaining NewWaitResult arguments.
func testWaitResult(exit int) *cstructs.WaitResult {
	res := cstructs.NewWaitResult(exit, 0, nil)
	return res
}
// TestClient_RestartTracker_ModeDelay checks a service job's tracker in delay
// mode: the first p.Attempts failed exits restart after the policy delay plus
// jitter, and further failed exits still restart, but only after a wait that
// is longer than the delay and at most the policy interval.
func TestClient_RestartTracker_ModeDelay(t *testing.T) {
	t.Parallel()
	p := testPolicy(true, structs.RestartPolicyModeDelay)
	rt := newRestartTracker(p, structs.JobTypeService)

	// Restarts within the attempt budget use the base delay plus jitter.
	for i := 0; i < p.Attempts; i++ {
		state, when := rt.SetWaitResult(testWaitResult(127)).GetState()
		if state != structs.TaskRestarting {
			t.Fatalf("NextRestart() returned %v, want %v", state, structs.TaskRestarting)
		}
		if !withinJitter(p.Delay, when) {
			t.Fatalf("NextRestart() returned %v; want %v+jitter", when, p.Delay)
		}
	}

	// Follow up restarts should cause delay.
	for i := 0; i < 3; i++ {
		state, when := rt.SetWaitResult(testWaitResult(127)).GetState()
		// Fail immediately and with a diagnostic, like every other check in
		// this file, instead of silently marking the test as failed.
		if state != structs.TaskRestarting {
			t.Fatalf("NextRestart() returned %v, want %v", state, structs.TaskRestarting)
		}
		if !(when > p.Delay && when <= p.Interval) {
			t.Fatalf("NextRestart() returned %v; want > %v and <= %v", when, p.Delay, p.Interval)
		}
	}
}
// TestClient_RestartTracker_ModeFail checks a system job's tracker in fail
// mode: failed exits restart with the policy delay plus jitter until the
// attempts are used up, after which the tracker reports TaskNotRestarting.
func TestClient_RestartTracker_ModeFail(t *testing.T) {
	t.Parallel()
	policy := testPolicy(true, structs.RestartPolicyModeFail)
	tracker := newRestartTracker(policy, structs.JobTypeSystem)

	for attempt := 0; attempt < policy.Attempts; attempt++ {
		result := testWaitResult(127)
		state, when := tracker.SetWaitResult(result).GetState()
		switch {
		case state != structs.TaskRestarting:
			t.Fatalf("NextRestart() returned %v, want %v", state, structs.TaskRestarting)
		case !withinJitter(policy.Delay, when):
			t.Fatalf("NextRestart() returned %v; want %v+jitter", when, policy.Delay)
		}
	}

	// Next restart should cause fail
	state, _ := tracker.SetWaitResult(testWaitResult(127)).GetState()
	if state == structs.TaskNotRestarting {
		return
	}
	t.Fatalf("NextRestart() returned %v; want %v", state, structs.TaskNotRestarting)
}
// TestClient_RestartTracker_NoRestartOnSuccess checks that a batch job which
// exits with code 0 is reported as TaskTerminated.
func TestClient_RestartTracker_NoRestartOnSuccess(t *testing.T) {
	t.Parallel()
	policy := testPolicy(false, structs.RestartPolicyModeDelay)
	tracker := newRestartTracker(policy, structs.JobTypeBatch)

	state, _ := tracker.SetWaitResult(testWaitResult(0)).GetState()
	if state == structs.TaskTerminated {
		return
	}
	t.Fatalf("NextRestart() returned %v, expected: %v", state, structs.TaskTerminated)
}
// TestClient_RestartTracker_ZeroAttempts checks the states reported when the
// policy allows zero restart attempts, for service and batch jobs and for
// zero and non-zero exit codes. Each case uses a fresh tracker.
func TestClient_RestartTracker_ZeroAttempts(t *testing.T) {
	t.Parallel()
	policy := testPolicy(true, structs.RestartPolicyModeFail)
	policy.Attempts = 0

	// Test with a non-zero exit code
	tracker := newRestartTracker(policy, structs.JobTypeService)
	state, when := tracker.SetWaitResult(testWaitResult(1)).GetState()
	if state != structs.TaskNotRestarting {
		t.Fatalf("expect no restart, got restart/delay: %v/%v", state, when)
	}

	// Even with a zero (successful) exit code non-batch jobs should exit
	// with TaskNotRestarting
	tracker = newRestartTracker(policy, structs.JobTypeService)
	state, when = tracker.SetWaitResult(testWaitResult(0)).GetState()
	if state != structs.TaskNotRestarting {
		t.Fatalf("expect no restart, got restart/delay: %v/%v", state, when)
	}

	// Batch jobs with a zero exit code and 0 attempts *do* exit cleanly
	// with Terminated
	tracker = newRestartTracker(policy, structs.JobTypeBatch)
	state, when = tracker.SetWaitResult(testWaitResult(0)).GetState()
	if state != structs.TaskTerminated {
		t.Fatalf("expect terminated, got restart/delay: %v/%v", state, when)
	}

	// Batch jobs with a non-zero exit code and 0 attempts exit with
	// TaskNotRestarting
	tracker = newRestartTracker(policy, structs.JobTypeBatch)
	state, when = tracker.SetWaitResult(testWaitResult(1)).GetState()
	if state != structs.TaskNotRestarting {
		t.Fatalf("expect no restart, got restart/delay: %v/%v", state, when)
	}
}
// TestClient_RestartTracker_RestartTriggered checks that a restart triggered
// with SetRestartTriggered(false) is reported as TaskRestarting with a zero
// delay, even though the policy allows zero restart attempts.
func TestClient_RestartTracker_RestartTriggered(t *testing.T) {
	t.Parallel()
	p := testPolicy(true, structs.RestartPolicyModeFail)
	p.Attempts = 0
	rt := newRestartTracker(p, structs.JobTypeService)
	// Either a wrong state or a non-zero delay is a failure. The two checks
	// must be joined with ||; with && the test only failed when both were
	// wrong at once.
	if state, when := rt.SetRestartTriggered(false).GetState(); state != structs.TaskRestarting || when != 0 {
		t.Fatalf("expect restart immediately, got %v %v", state, when)
	}
}
// TestClient_RestartTracker_RestartTriggered_Failure checks restarts triggered
// with SetRestartTriggered(true) under a one-attempt policy: the first is
// reported as TaskRestarting with a non-zero delay, the second as
// TaskNotRestarting with a zero delay.
func TestClient_RestartTracker_RestartTriggered_Failure(t *testing.T) {
	t.Parallel()
	policy := testPolicy(true, structs.RestartPolicyModeFail)
	policy.Attempts = 1
	tracker := newRestartTracker(policy, structs.JobTypeService)

	// First triggered failure uses up the single attempt.
	state, when := tracker.SetRestartTriggered(true).GetState()
	if restarted := state == structs.TaskRestarting && when != 0; !restarted {
		t.Fatalf("expect restart got %v %v", state, when)
	}

	// Second triggered failure has no attempts left.
	state, when = tracker.SetRestartTriggered(true).GetState()
	if failed := state == structs.TaskNotRestarting && when == 0; !failed {
		t.Fatalf("expect failed got %v %v", state, when)
	}
}
// TestClient_RestartTracker_StartError_Recoverable_Fail checks a system job's
// tracker in fail mode when given a recoverable start error: it restarts with
// the policy delay plus jitter until the attempts are used up, then reports
// TaskNotRestarting.
func TestClient_RestartTracker_StartError_Recoverable_Fail(t *testing.T) {
	t.Parallel()
	policy := testPolicy(true, structs.RestartPolicyModeFail)
	tracker := newRestartTracker(policy, structs.JobTypeSystem)
	startErr := structs.NewRecoverableError(fmt.Errorf("foo"), true)

	for attempt := 0; attempt < policy.Attempts; attempt++ {
		state, when := tracker.SetStartError(startErr).GetState()
		switch {
		case state != structs.TaskRestarting:
			t.Fatalf("NextRestart() returned %v, want %v", state, structs.TaskRestarting)
		case !withinJitter(policy.Delay, when):
			t.Fatalf("NextRestart() returned %v; want %v+jitter", when, policy.Delay)
		}
	}

	// Next restart should cause fail
	state, _ := tracker.SetStartError(startErr).GetState()
	if state == structs.TaskNotRestarting {
		return
	}
	t.Fatalf("NextRestart() returned %v; want %v", state, structs.TaskNotRestarting)
}
// TestClient_RestartTracker_StartError_Recoverable_Delay checks a system job's
// tracker in delay mode when given a recoverable start error: it restarts with
// the policy delay plus jitter until the attempts are used up, then still
// restarts but after a wait longer than the delay and at most the interval.
func TestClient_RestartTracker_StartError_Recoverable_Delay(t *testing.T) {
	t.Parallel()
	policy := testPolicy(true, structs.RestartPolicyModeDelay)
	tracker := newRestartTracker(policy, structs.JobTypeSystem)
	startErr := structs.NewRecoverableError(fmt.Errorf("foo"), true)

	for attempt := 0; attempt < policy.Attempts; attempt++ {
		state, when := tracker.SetStartError(startErr).GetState()
		switch {
		case state != structs.TaskRestarting:
			t.Fatalf("NextRestart() returned %v, want %v", state, structs.TaskRestarting)
		case !withinJitter(policy.Delay, when):
			t.Fatalf("NextRestart() returned %v; want %v+jitter", when, policy.Delay)
		}
	}

	// Next restart should cause delay
	state, when := tracker.SetStartError(startErr).GetState()
	if state != structs.TaskRestarting {
		t.Fatalf("NextRestart() returned %v; want %v", state, structs.TaskRestarting)
	}
	if when <= policy.Delay || when > policy.Interval {
		t.Fatalf("NextRestart() returned %v; want > %v and <= %v", when, policy.Delay, policy.Interval)
	}
}