open-nomad/e2e/rescheduling/rescheduling.go

408 lines
12 KiB
Go
Raw Normal View History

package rescheduling
import (
"fmt"
"os"
"reflect"
"sort"
"time"
e2e "github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/e2e/framework"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/jobspec"
)
// ns is the value every test in this file passes, alongside the job ID, to
// the e2e alloc-status and deployment-status helpers. It is the empty
// string; presumably that selects the default namespace — confirm against
// the e2eutil helpers.
const ns = ""
// RescheduleE2ETest is the test case registered with the e2e framework for
// the "Rescheduling" suite. It embeds framework.TC and tracks the jobs its
// test methods register.
type RescheduleE2ETest struct {
framework.TC
// jobIds collects the IDs of jobs registered by the test methods so that
// AfterEach can stop and purge them.
jobIds []string
}
// init registers the "Rescheduling" suite, with RescheduleE2ETest as its
// only test case, with the e2e framework.
func init() {
	suite := &framework.TestSuite{
		Component:   "Rescheduling",
		CanRunLocal: true,
		Consul:      true,
		Cases:       []framework.TestCase{&RescheduleE2ETest{}},
	}
	framework.AddSuites(suite)
}
// BeforeAll runs the e2e helpers that wait for a cluster leader and for one
// ready node, using the framework's Nomad client.
func (tc *RescheduleE2ETest) BeforeAll(f *framework.F) {
	t, client := f.T(), tc.Nomad()
	e2e.WaitForLeader(t, client)
	e2e.WaitForNodesReady(t, client, 1)
}
// AfterEach stops and purges every job recorded in tc.jobIds, resets the
// list, and then runs "nomad system gc". Cleanup is skipped entirely when
// the NOMAD_TEST_SKIPCLEANUP environment variable is set to "1".
//
// Errors are reported with the non-fatal assertions from f.Assert() so that
// one failed "job stop" does not abort the rest of the cleanup: the
// remaining jobs are still purged, the job list is still reset, and the
// garbage collection still runs.
func (tc *RescheduleE2ETest) AfterEach(f *framework.F) {
	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
		return
	}

	for _, id := range tc.jobIds {
		_, err := e2e.Command("nomad", "job", "stop", "-purge", id)
		f.Assert().NoError(err)
	}
	tc.jobIds = []string{}

	_, err := e2e.Command("nomad", "system", "gc")
	f.Assert().NoError(err)
}
// TestNoReschedule registers the norescheduling job, which is expected to
// fail without ever being rescheduled, and waits for its alloc statuses to
// be exactly three "failed" entries.
func (tc *RescheduleE2ETest) TestNoReschedule(f *framework.F) {
	id := "test-no-reschedule-" + uuid.Generate()[:8]
	err := e2e.Register(id, "rescheduling/input/norescheduling.nomad")
	f.NoError(err)
	tc.jobIds = append(tc.jobIds, id)

	want := []string{"failed", "failed", "failed"}
	err = e2e.WaitForAllocStatusExpected(id, ns, want)
	f.NoError(err, "should have exactly 3 failed allocs")
}
// TestNoRescheduleSystem registers a system job that is expected to fail
// and never be rescheduled, then waits until the job has at least one alloc
// and every alloc reports "failed".
func (tc *RescheduleE2ETest) TestNoRescheduleSystem(f *framework.F) {
	jobID := "test-reschedule-system-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_system.nomad"))
	tc.jobIds = append(tc.jobIds, jobID)

	statuses := func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }
	allFailed := func(got []string) bool {
		// An empty list would satisfy the loop below vacuously, letting the
		// wait succeed before the scheduler has placed any alloc at all.
		if len(got) == 0 {
			return false
		}
		for _, status := range got {
			if status != "failed" {
				return false
			}
		}
		return true
	}

	f.NoError(
		e2e.WaitForAllocStatusComparison(statuses, allFailed, nil),
		"should have only failed allocs",
	)
}
// TestDefaultReschedule registers the rescheduling_default job and checks
// that it is rescheduled after a delay: it first waits for exactly three
// "failed" allocs, sleeps 35 seconds, and then waits for exactly six.
func (tc *RescheduleE2ETest) TestDefaultReschedule(f *framework.F) {
	id := "test-default-reschedule-" + uuid.Generate()[:8]
	f.NoError(e2e.Register(id, "rescheduling/input/rescheduling_default.nomad"))
	tc.jobIds = append(tc.jobIds, id)

	initial := []string{"failed", "failed", "failed"}
	err := e2e.WaitForAllocStatusExpected(id, ns, initial)
	f.NoError(err, "should have exactly 3 failed allocs")

	// TODO(tgross): return early if "slow" isn't set
	// Give the first exponential delay time to elapse so that a reschedule
	// is attempted before checking again.
	time.Sleep(35 * time.Second)

	afterDelay := []string{"failed", "failed", "failed", "failed", "failed", "failed"}
	err = e2e.WaitForAllocStatusExpected(id, ns, afterDelay)
	f.NoError(err, "should have exactly 6 failed allocs after 35s")
}
// TestRescheduleMaxAttempts registers the rescheduling_fail job, which is
// meant to exercise a maximum number of reschedule attempts, and waits for
// exactly three "failed" allocs. It then re-registers the job with the task
// args replaced by a long sleep and waits for at least one "running" alloc.
func (tc *RescheduleE2ETest) TestRescheduleMaxAttempts(f *framework.F) {
	const spec = "rescheduling/input/rescheduling_fail.nomad"

	id := "test-reschedule-fail-" + uuid.Generate()[:8]
	f.NoError(e2e.Register(id, spec))
	tc.jobIds = append(tc.jobIds, id)

	err := e2e.WaitForAllocStatusExpected(id, ns, []string{"failed", "failed", "failed"})
	f.NoError(err, "should have exactly 3 failed allocs")

	// Re-register the same job under the same ID with a command that keeps
	// the task alive.
	job, err := jobspec.ParseFile(spec)
	f.NoError(err)
	job.ID = &id
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"}
	_, _, err = tc.Nomad().Jobs().Register(job, nil)
	f.NoError(err, "could not register updated job")

	statuses := func() ([]string, error) { return e2e.AllocStatuses(id, ns) }
	anyRunning := func(got []string) bool {
		for i := range got {
			if got[i] == "running" {
				return true
			}
		}
		return false
	}
	err = e2e.WaitForAllocStatusComparison(statuses, anyRunning, nil)
	f.NoError(err, "should have at least 1 running alloc")
}
// TestRescheduleSuccess registers the rescheduling_success job, which is
// expected to end up running after being rescheduled, and waits until at
// least one of its allocs reports "running".
func (tc *RescheduleE2ETest) TestRescheduleSuccess(f *framework.F) {
	id := "test-reschedule-success-" + uuid.Generate()[:8]
	f.NoError(e2e.Register(id, "rescheduling/input/rescheduling_success.nomad"))
	tc.jobIds = append(tc.jobIds, id)

	statuses := func() ([]string, error) { return e2e.AllocStatuses(id, ns) }
	anyRunning := func(got []string) bool {
		for i := range got {
			if got[i] == "running" {
				return true
			}
		}
		return false
	}

	err := e2e.WaitForAllocStatusComparison(statuses, anyRunning, nil)
	f.NoError(err, "should have at least 1 running alloc")
}
// TestRescheduleWithUpdate registers the rescheduling_update job, waits for
// exactly three "running" allocs, then re-registers it with task args that
// are meant to make it fail and waits until at least one rescheduled alloc
// is reported.
func (tc *RescheduleE2ETest) TestRescheduleWithUpdate(f *framework.F) {
	const spec = "rescheduling/input/rescheduling_update.nomad"

	id := "test-reschedule-update-" + uuid.Generate()[:8]
	f.NoError(e2e.Register(id, spec))
	tc.jobIds = append(tc.jobIds, id)

	err := e2e.WaitForAllocStatusExpected(id, ns, []string{"running", "running", "running"})
	f.NoError(err, "should have exactly 3 running allocs")

	// Update the job so that its task is expected to fail.
	job, err := jobspec.ParseFile(spec)
	f.NoError(err)
	job.ID = &id
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
	_, _, err = tc.Nomad().Jobs().Register(job, nil)
	f.NoError(err, "could not register updated job")

	rescheduled := func() ([]string, error) { return e2e.AllocStatusesRescheduled(id, ns) }
	nonEmpty := func(got []string) bool { return len(got) > 0 }
	err = e2e.WaitForAllocStatusComparison(rescheduled, nonEmpty, nil)
	f.NoError(err, "should have rescheduled allocs until progress deadline")
}
// TestRescheduleWithCanary registers the rescheduling_canary job and waits
// for three "running" allocs and a "successful" deployment. It then
// re-registers the job with task args that are meant to make it fail, waits
// until at least one rescheduled alloc is reported, and finally waits for
// the last deployment to be "running".
func (tc *RescheduleE2ETest) TestRescheduleWithCanary(f *framework.F) {
	const spec = "rescheduling/input/rescheduling_canary.nomad"

	id := "test-reschedule-canary-" + uuid.Generate()[:8]
	f.NoError(e2e.Register(id, spec))
	tc.jobIds = append(tc.jobIds, id)

	err := e2e.WaitForAllocStatusExpected(id, ns, []string{"running", "running", "running"})
	f.NoError(err, "should have exactly 3 running allocs")

	err = e2e.WaitForLastDeploymentStatus(id, ns, "successful", nil)
	f.NoError(err, "deployment should be successful")

	// Update the job so that its task is expected to fail.
	job, err := jobspec.ParseFile(spec)
	f.NoError(err)
	job.ID = &id
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
	_, _, err = tc.Nomad().Jobs().Register(job, nil)
	f.NoError(err, "could not register updated job")

	rescheduled := func() ([]string, error) { return e2e.AllocStatusesRescheduled(id, ns) }
	nonEmpty := func(got []string) bool { return len(got) > 0 }
	err = e2e.WaitForAllocStatusComparison(rescheduled, nonEmpty, nil)
	f.NoError(err, "should have rescheduled allocs until progress deadline")

	err = e2e.WaitForLastDeploymentStatus(id, ns, "running", nil)
	f.NoError(err, "deployment should be running")
}
// TestRescheduleWithCanaryAutoRevert registers the
// rescheduling_canary_autorevert job and waits for three "running" allocs
// and a "successful" deployment. It then re-registers the job with task
// args that are meant to make it fail and verifies that the job gets
// reverted: rescheduled allocs appear, the alloc statuses settle at three
// "failed" plus three "running", and the last deployment is "successful".
func (tc *RescheduleE2ETest) TestRescheduleWithCanaryAutoRevert(f *framework.F) {
	const spec = "rescheduling/input/rescheduling_canary_autorevert.nomad"

	id := "test-reschedule-canary-revert-" + uuid.Generate()[:8]
	f.NoError(e2e.Register(id, spec))
	tc.jobIds = append(tc.jobIds, id)

	err := e2e.WaitForAllocStatusExpected(id, ns, []string{"running", "running", "running"})
	f.NoError(err, "should have exactly 3 running allocs")

	err = e2e.WaitForLastDeploymentStatus(id, ns, "successful", nil)
	f.NoError(err, "deployment should be successful")

	// Update the job so that its task is expected to fail.
	job, err := jobspec.ParseFile(spec)
	f.NoError(err)
	job.ID = &id
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
	_, _, err = tc.Nomad().Jobs().Register(job, nil)
	f.NoError(err, "could not register updated job")

	rescheduled := func() ([]string, error) { return e2e.AllocStatusesRescheduled(id, ns) }
	nonEmpty := func(got []string) bool { return len(got) > 0 }
	err = e2e.WaitForAllocStatusComparison(rescheduled, nonEmpty, nil)
	f.NoError(err, "should have new allocs after update")

	// The update is expected to fail and be reverted.
	reverted := []string{"failed", "failed", "failed", "running", "running", "running"}
	err = e2e.WaitForAllocStatusExpected(id, ns, reverted)
	f.NoError(err, "should have exactly 3 running reverted allocs")

	err = e2e.WaitForLastDeploymentStatus(id, ns, "successful", nil)
	f.NoError(err, "deployment should be successful")
}
// TestRescheduleMaxParallel registers the rescheduling_maxp job (a job with
// a max_parallel config) and waits for three "running" allocs and a
// "successful" deployment. It then re-registers the job with task args that
// are meant to make it fail, waits for the sorted alloc statuses to equal
// one "complete", two "failed" and two "running", and finally waits for the
// last deployment to be "running".
func (tc *RescheduleE2ETest) TestRescheduleMaxParallel(f *framework.F) {
	const spec = "rescheduling/input/rescheduling_maxp.nomad"

	id := "test-reschedule-maxp-" + uuid.Generate()[:8]
	f.NoError(e2e.Register(id, spec))
	tc.jobIds = append(tc.jobIds, id)

	err := e2e.WaitForAllocStatusExpected(id, ns, []string{"running", "running", "running"})
	f.NoError(err, "should have exactly 3 running allocs")

	err = e2e.WaitForLastDeploymentStatus(id, ns, "successful", nil)
	f.NoError(err, "deployment should be successful")

	// Update the job so that its task is expected to fail.
	job, err := jobspec.ParseFile(spec)
	f.NoError(err)
	job.ID = &id
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
	_, _, err = tc.Nomad().Jobs().Register(job, nil)
	f.NoError(err, "could not register updated job")

	// want is already in sorted order, matching the sort applied to the
	// observed statuses before comparing.
	want := []string{"complete", "failed", "failed", "running", "running"}
	statuses := func() ([]string, error) { return e2e.AllocStatuses(id, ns) }
	matchesWant := func(got []string) bool {
		sort.Strings(got)
		return reflect.DeepEqual(got, want)
	}
	err = e2e.WaitForAllocStatusComparison(statuses, matchesWant, nil)
	f.NoError(err, "should have failed allocs including rescheduled failed allocs")

	err = e2e.WaitForLastDeploymentStatus(id, ns, "running", nil)
	f.NoError(err, "deployment should be running")
}
// TestRescheduleMaxParallelAutoRevert registers the
// rescheduling_maxp_autorevert job (a max_parallel config that is expected
// to autorevert on failure) and waits for three "running" allocs and a
// "successful" deployment. It then re-registers the job with task args that
// are meant to make it fail and verifies the revert:
//
//   - at least one rescheduled alloc is reported,
//   - the sorted alloc statuses equal one "complete", one "failed" and
//     three "running",
//   - the rows of "nomad deployment status" for this job have the statuses
//     "running", "failed", "successful", in that order.
func (tc *RescheduleE2ETest) TestRescheduleMaxParallelAutoRevert(f *framework.F) {
	jobID := "test-reschedule-maxp-revert-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp_autorevert.nomad"))
	tc.jobIds = append(tc.jobIds, jobID)

	expected := []string{"running", "running", "running"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
		"should have exactly 3 running allocs",
	)

	f.NoError(
		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		"deployment should be successful")

	// reschedule to make fail
	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp_autorevert.nomad")
	f.NoError(err)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
	_, _, err = tc.Nomad().Jobs().Register(job, nil)
	// Message kept consistent with the other tests in this file.
	f.NoError(err, "could not register updated job")

	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
			func(got []string) bool { return len(got) > 0 }, nil,
		),
		"should have new allocs after update",
	)

	// wait for the revert; expected is already sorted, matching the sort
	// applied to the observed statuses before comparing
	expected = []string{"complete", "failed", "running", "running", "running"}
	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
			func(got []string) bool {
				sort.Strings(got)
				return reflect.DeepEqual(got, expected)
			}, nil,
		),
		"should have one successful, one failed, and 3 reverted allocs",
	)

	// The command output covers more than this job, so keep only the rows
	// whose "Job ID" column matches, preserving the output order.
	out, err := e2e.Command("nomad", "deployment", "status")
	f.NoError(err, "could not get deployment status")

	results, err := e2e.ParseColumns(out)
	f.NoError(err, "could not parse deployment status")
	statuses := []string{}
	for _, row := range results {
		if row["Job ID"] == jobID {
			statuses = append(statuses, row["Status"])
		}
	}

	f.True(reflect.DeepEqual([]string{"running", "failed", "successful"}, statuses),
		fmt.Sprintf("deployment status was: %#v", statuses),
	)
}
// TestRescheduleProgressDeadline registers the
// rescheduling_progressdeadline job, sleeps 30 seconds, and then waits for
// the job's last deployment to be "successful", verifying that the
// deployment succeeds by the progress deadline.
func (tc *RescheduleE2ETest) TestRescheduleProgressDeadline(f *framework.F) {
	id := "test-reschedule-deadline-" + uuid.Generate()[:8]
	f.NoError(e2e.Register(id, "rescheduling/input/rescheduling_progressdeadline.nomad"))
	tc.jobIds = append(tc.jobIds, id)

	// TODO(tgross): return early if "slow" isn't set
	// Give the first exponential delay time to elapse so that a reschedule
	// is attempted before checking the deployment.
	time.Sleep(30 * time.Second)

	err := e2e.WaitForLastDeploymentStatus(id, ns, "successful", nil)
	f.NoError(err, "deployment should be successful")
}