open-nomad/e2e/rescheduling/rescheduling.go
Tim Gross d7a013b6f5
e2e: refactor CLI utils out of rescheduling test (#8905)
The CLI helpers in the rescheduling test were intended for shared use, but
until some other tests were written we didn't want to waste time making them
generic. This changeset refactors them and adds some new helpers associated
with the node drain tests (under separate PR).
2020-09-16 16:10:06 -04:00

406 lines
12 KiB
Go

package rescheduling
import (
"fmt"
"os"
"reflect"
"sort"
"time"
e2e "github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/e2e/framework"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/jobspec"
)
type RescheduleE2ETest struct {
framework.TC
jobIds []string
}
func init() {
framework.AddSuites(&framework.TestSuite{
Component: "Rescheduling",
CanRunLocal: true,
Consul: true,
Cases: []framework.TestCase{
new(RescheduleE2ETest),
},
})
}
func (tc *RescheduleE2ETest) BeforeAll(f *framework.F) {
e2e.WaitForLeader(f.T(), tc.Nomad())
e2e.WaitForNodesReady(f.T(), tc.Nomad(), 1)
}
func (tc *RescheduleE2ETest) AfterEach(f *framework.F) {
if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
return
}
for _, id := range tc.jobIds {
_, err := e2e.Command("nomad", "job", "stop", "-purge", id)
f.NoError(err)
}
tc.jobIds = []string{}
_, err := e2e.Command("nomad", "system", "gc")
f.NoError(err)
}
// TestNoReschedule runs a job that should fail and never reschedule
func (tc *RescheduleE2ETest) TestNoReschedule(f *framework.F) {
jobID := "test-no-reschedule-" + uuid.Generate()[0:8]
f.NoError(e2e.Register(jobID, "rescheduling/input/norescheduling.nomad"))
tc.jobIds = append(tc.jobIds, jobID)
expected := []string{"failed", "failed", "failed"}
f.NoError(
e2e.WaitForAllocStatusExpected(jobID, expected),
"should have exactly 3 failed allocs",
)
}
// TestNoRescheduleSystem runs a system job that should fail and never reschedule
func (tc *RescheduleE2ETest) TestNoRescheduleSystem(f *framework.F) {
jobID := "test-reschedule-system-" + uuid.Generate()[0:8]
f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_system.nomad"))
tc.jobIds = append(tc.jobIds, jobID)
f.NoError(
e2e.WaitForAllocStatusComparison(
func() ([]string, error) { return e2e.AllocStatuses(jobID) },
func(got []string) bool {
for _, status := range got {
if status != "failed" {
return false
}
}
return true
}, nil,
),
"should have only failed allocs",
)
}
// TestDefaultReschedule runs a job that should reschedule after delay
func (tc *RescheduleE2ETest) TestDefaultReschedule(f *framework.F) {
jobID := "test-default-reschedule-" + uuid.Generate()[0:8]
f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_default.nomad"))
tc.jobIds = append(tc.jobIds, jobID)
expected := []string{"failed", "failed", "failed"}
f.NoError(
e2e.WaitForAllocStatusExpected(jobID, expected),
"should have exactly 3 failed allocs",
)
// TODO(tgross): return early if "slow" isn't set
// wait until first exponential delay kicks in and rescheduling is attempted
time.Sleep(time.Second * 35)
expected = []string{"failed", "failed", "failed", "failed", "failed", "failed"}
f.NoError(
e2e.WaitForAllocStatusExpected(jobID, expected),
"should have exactly 6 failed allocs after 35s",
)
}
// TestRescheduleMaxAttempts runs a job with a maximum reschedule attempts
func (tc *RescheduleE2ETest) TestRescheduleMaxAttempts(f *framework.F) {
jobID := "test-reschedule-fail-" + uuid.Generate()[0:8]
f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_fail.nomad"))
tc.jobIds = append(tc.jobIds, jobID)
expected := []string{"failed", "failed", "failed"}
f.NoError(
e2e.WaitForAllocStatusExpected(jobID, expected),
"should have exactly 3 failed allocs",
)
job, err := jobspec.ParseFile("rescheduling/input/rescheduling_fail.nomad")
f.NoError(err)
job.ID = &jobID
job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"}
_, _, err = tc.Nomad().Jobs().Register(job, nil)
f.NoError(err, "could not register updated job")
f.NoError(
e2e.WaitForAllocStatusComparison(
func() ([]string, error) { return e2e.AllocStatuses(jobID) },
func(got []string) bool {
for _, status := range got {
if status == "running" {
return true
}
}
return false
}, nil,
),
"should have at least 1 running alloc",
)
}
// TestRescheduleSuccess runs a job that should be running after rescheduling
func (tc *RescheduleE2ETest) TestRescheduleSuccess(f *framework.F) {
jobID := "test-reschedule-success-" + uuid.Generate()[0:8]
f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_success.nomad"))
tc.jobIds = append(tc.jobIds, jobID)
f.NoError(
e2e.WaitForAllocStatusComparison(
func() ([]string, error) { return e2e.AllocStatuses(jobID) },
func(got []string) bool {
for _, status := range got {
if status == "running" {
return true
}
}
return false
}, nil,
),
"should have at least 1 running alloc",
)
}
// TestRescheduleWithUpdate updates a running job to fail, and verifies that
// it gets rescheduled
func (tc *RescheduleE2ETest) TestRescheduleWithUpdate(f *framework.F) {
jobID := "test-reschedule-update-" + uuid.Generate()[0:8]
f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_update.nomad"))
tc.jobIds = append(tc.jobIds, jobID)
expected := []string{"running", "running", "running"}
f.NoError(
e2e.WaitForAllocStatusExpected(jobID, expected),
"should have exactly 3 running allocs",
)
// reschedule to make fail
job, err := jobspec.ParseFile("rescheduling/input/rescheduling_update.nomad")
f.NoError(err)
job.ID = &jobID
job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
_, _, err = tc.Nomad().Jobs().Register(job, nil)
f.NoError(err, "could not register updated job")
f.NoError(
e2e.WaitForAllocStatusComparison(
func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID) },
func(got []string) bool { return len(got) > 0 }, nil,
),
"should have rescheduled allocs until progress deadline",
)
}
// TestRescheduleWithCanary updates a running job to fail, and verify that the
// canary gets rescheduled
func (tc *RescheduleE2ETest) TestRescheduleWithCanary(f *framework.F) {
jobID := "test-reschedule-canary-" + uuid.Generate()[0:8]
f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary.nomad"))
tc.jobIds = append(tc.jobIds, jobID)
expected := []string{"running", "running", "running"}
f.NoError(
e2e.WaitForAllocStatusExpected(jobID, expected),
"should have exactly 3 running allocs",
)
f.NoError(
e2e.WaitForLastDeploymentStatus(jobID, "successful", nil),
"deployment should be successful")
// reschedule to make fail
job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary.nomad")
f.NoError(err)
job.ID = &jobID
job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
_, _, err = tc.Nomad().Jobs().Register(job, nil)
f.NoError(err, "could not register updated job")
f.NoError(
e2e.WaitForAllocStatusComparison(
func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID) },
func(got []string) bool { return len(got) > 0 }, nil,
),
"should have rescheduled allocs until progress deadline",
)
f.NoError(
e2e.WaitForLastDeploymentStatus(jobID, "running", nil),
"deployment should be running")
}
// TestRescheduleWithCanary updates a running job to fail, and verifies that
// the job gets reverted
func (tc *RescheduleE2ETest) TestRescheduleWithCanaryAutoRevert(f *framework.F) {
jobID := "test-reschedule-canary-revert-" + uuid.Generate()[0:8]
f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary_autorevert.nomad"))
tc.jobIds = append(tc.jobIds, jobID)
expected := []string{"running", "running", "running"}
f.NoError(
e2e.WaitForAllocStatusExpected(jobID, expected),
"should have exactly 3 running allocs",
)
f.NoError(
e2e.WaitForLastDeploymentStatus(jobID, "successful", nil),
"deployment should be successful")
// reschedule to make fail
job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary_autorevert.nomad")
f.NoError(err)
job.ID = &jobID
job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
_, _, err = tc.Nomad().Jobs().Register(job, nil)
f.NoError(err, "could not register updated job")
f.NoError(
e2e.WaitForAllocStatusComparison(
func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID) },
func(got []string) bool { return len(got) == 0 }, nil,
),
"should have new allocs after update",
)
// then we'll fail and revert
expected = []string{"failed", "failed", "failed", "running", "running", "running"}
f.NoError(
e2e.WaitForAllocStatusExpected(jobID, expected),
"should have exactly 3 running reverted allocs",
)
f.NoError(
e2e.WaitForLastDeploymentStatus(jobID, "successful", nil),
"deployment should be successful")
}
// TestRescheduleMaxParallel updates a job with a max_parallel config
func (tc *RescheduleE2ETest) TestRescheduleMaxParallel(f *framework.F) {
jobID := "test-reschedule-maxp-" + uuid.Generate()[0:8]
f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp.nomad"))
tc.jobIds = append(tc.jobIds, jobID)
expected := []string{"running", "running", "running"}
f.NoError(
e2e.WaitForAllocStatusExpected(jobID, expected),
"should have exactly 3 running allocs",
)
f.NoError(
e2e.WaitForLastDeploymentStatus(jobID, "successful", nil),
"deployment should be successful")
// reschedule to make fail
job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp.nomad")
f.NoError(err)
job.ID = &jobID
job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
_, _, err = tc.Nomad().Jobs().Register(job, nil)
f.NoError(err, "could not register updated job")
expected = []string{"complete", "failed", "failed", "running", "running"}
f.NoError(
e2e.WaitForAllocStatusComparison(
func() ([]string, error) { return e2e.AllocStatuses(jobID) },
func(got []string) bool {
sort.Strings(got)
return reflect.DeepEqual(got, expected)
}, nil,
),
"should have failed allocs including rescheduled failed allocs",
)
f.NoError(
e2e.WaitForLastDeploymentStatus(jobID, "running", nil),
"deployment should be running")
}
// TestRescheduleMaxParallelAutoRevert updates a job with a max_parallel
// config that will autorevert on failure
func (tc *RescheduleE2ETest) TestRescheduleMaxParallelAutoRevert(f *framework.F) {
jobID := "test-reschedule-maxp-revert-" + uuid.Generate()[0:8]
f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp_autorevert.nomad"))
tc.jobIds = append(tc.jobIds, jobID)
expected := []string{"running", "running", "running"}
f.NoError(
e2e.WaitForAllocStatusExpected(jobID, expected),
"should have exactly 3 running allocs",
)
f.NoError(
e2e.WaitForLastDeploymentStatus(jobID, "successful", nil),
"deployment should be successful")
// reschedule to make fail
job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp_autorevert.nomad")
f.NoError(err)
job.ID = &jobID
job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
_, _, err = tc.Nomad().Jobs().Register(job, nil)
f.NoError(err, "could not e2e.Register updated job")
f.NoError(
e2e.WaitForAllocStatusComparison(
func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID) },
func(got []string) bool { return len(got) == 0 }, nil,
),
"should have new allocs after update",
)
// wait for the revert
expected = []string{"complete", "failed", "running", "running", "running"}
f.NoError(
e2e.WaitForAllocStatusComparison(
func() ([]string, error) { return e2e.AllocStatuses(jobID) },
func(got []string) bool {
sort.Strings(got)
return reflect.DeepEqual(got, expected)
}, nil,
),
"should have one successful, one failed, and 3 reverted allocs",
)
out, err := e2e.Command("nomad", "deployment", "status")
f.NoError(err, "could not get deployment status")
results, err := e2e.ParseColumns(out)
f.NoError(err, "could not parse deployment status")
statuses := []string{}
for _, row := range results {
if row["Job ID"] == jobID {
statuses = append(statuses, row["Status"])
}
}
f.True(reflect.DeepEqual([]string{"running", "failed", "successful"}, statuses),
fmt.Sprintf("deployment status was: %#v", statuses),
)
}
// TestRescheduleProgressDeadline verifies a deployment succeeds by the
// progress deadline
func (tc *RescheduleE2ETest) TestRescheduleProgressDeadline(f *framework.F) {
jobID := "test-reschedule-deadline-" + uuid.Generate()[0:8]
f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline.nomad"))
tc.jobIds = append(tc.jobIds, jobID)
// TODO(tgross): return early if "slow" isn't set
// wait until first exponential delay kicks in and rescheduling is attempted
time.Sleep(time.Second * 30)
f.NoError(
e2e.WaitForLastDeploymentStatus(jobID, "successful", nil),
"deployment should be successful")
}