open-nomad/e2e/rescheduling/rescheduling.go
Tim Gross fe88003f29
e2e: eliminate race condition causing rescheduling test flake (#9085)
The autorevert test checks for reverted allocations to be placed and running
before checking the deployment status, but the deployment can be completed and
marked "successful" before we check it for "running" status. Instead, just
wait for it to be marked "successful" and assert we have the expected count of
deployment statuses.
2020-10-14 11:35:30 -04:00

490 lines
15 KiB
Go

package rescheduling
import (
"fmt"
"os"
"reflect"
"sort"
"time"
e2e "github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/e2e/framework"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/jobspec"
)
// ns is handed to the e2e helpers in this file as their namespace argument.
// NOTE(review): the empty string presumably selects the default namespace —
// confirm against the e2eutil helpers.
const ns = ""
// RescheduleE2ETest is the test case that init registers with the e2e
// framework under the "Rescheduling" component.
type RescheduleE2ETest struct {
framework.TC
// jobIds accumulates the IDs of the jobs registered by the individual
// tests; AfterEach stops and purges each of them and then resets the list.
jobIds []string
}
// init registers the rescheduling test case with the e2e framework as the
// "Rescheduling" suite.
func init() {
	suite := &framework.TestSuite{
		Component:   "Rescheduling",
		CanRunLocal: true,
		Consul:      true,
		Cases:       []framework.TestCase{&RescheduleE2ETest{}},
	}
	framework.AddSuites(suite)
}
// BeforeAll blocks on the cluster preconditions for the suite: it calls
// e2e.WaitForLeader and then e2e.WaitForNodesReady asking for one node.
func (tc *RescheduleE2ETest) BeforeAll(f *framework.F) {
	// Node count handed to WaitForNodesReady.
	const readyNodes = 1

	e2e.WaitForLeader(f.T(), tc.Nomad())
	e2e.WaitForNodesReady(f.T(), tc.Nomad(), readyNodes)
}
// AfterEach stops and purges every job recorded in tc.jobIds, clears the
// list, and then runs `nomad system gc`. Failures are reported through
// f.Assert() so the remaining cleanup steps still run. Nothing is cleaned up
// when the NOMAD_TEST_SKIPCLEANUP environment variable is set to "1".
func (tc *RescheduleE2ETest) AfterEach(f *framework.F) {
	if skip := os.Getenv("NOMAD_TEST_SKIPCLEANUP"); skip == "1" {
		return
	}

	registered := tc.jobIds
	for i := range registered {
		_, stopErr := e2e.Command("nomad", "job", "stop", "-purge", registered[i])
		f.Assert().NoError(stopErr)
	}
	tc.jobIds = []string{}

	_, gcErr := e2e.Command("nomad", "system", "gc")
	f.Assert().NoError(gcErr)
}
// TestNoReschedule registers a job that is expected to fail without ever
// being rescheduled, and waits for exactly three failed allocations.
func (tc *RescheduleE2ETest) TestNoReschedule(f *framework.F) {
	const jobFile = "rescheduling/input/norescheduling.nomad"

	jobID := "test-no-reschedule-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, jobFile))
	// Remember the job so AfterEach can purge it.
	tc.jobIds = append(tc.jobIds, jobID)

	waitErr := e2e.WaitForAllocStatusExpected(jobID, ns,
		[]string{"failed", "failed", "failed"})
	f.NoError(waitErr, "should have exactly 3 failed allocs")
}
// TestNoRescheduleSystem runs a system job that should fail and never
// reschedule. It waits until the job has at least one allocation and every
// allocation reports the "failed" status.
func (tc *RescheduleE2ETest) TestNoRescheduleSystem(f *framework.F) {
	jobID := "test-reschedule-system-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_system.nomad"))
	// Remember the job so AfterEach can purge it.
	tc.jobIds = append(tc.jobIds, jobID)

	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
			func(got []string) bool {
				// An empty list would satisfy "all failed" vacuously and let
				// the test pass before any allocation has been placed, so
				// require at least one allocation first.
				if len(got) == 0 {
					return false
				}
				for _, status := range got {
					if status != "failed" {
						return false
					}
				}
				return true
			}, nil,
		),
		"should have only failed allocs",
	)
}
// TestDefaultReschedule runs a job that should reschedule after a delay: it
// first expects three failed allocations, sleeps 35 seconds, and then expects
// six failed allocations.
func (tc *RescheduleE2ETest) TestDefaultReschedule(f *framework.F) {
	jobID := "test-default-reschedule-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_default.nomad"))
	tc.jobIds = append(tc.jobIds, jobID)

	// expectStatuses asserts that the job's allocations reach the wanted
	// statuses.
	expectStatuses := func(want []string, msg string) {
		f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, want), msg)
	}

	expectStatuses(
		[]string{"failed", "failed", "failed"},
		"should have exactly 3 failed allocs",
	)

	// TODO(tgross): return early if "slow" isn't set
	// Give the first exponential delay time to elapse so that rescheduling
	// is attempted.
	time.Sleep(35 * time.Second)

	expectStatuses(
		[]string{"failed", "failed", "failed", "failed", "failed", "failed"},
		"should have exactly 6 failed allocs after 35s",
	)
}
// TestRescheduleMaxAttempts runs a job with a maximum number of reschedule
// attempts. After three failed allocations are observed, the job is
// re-registered with different task arguments and the test waits for at least
// one running allocation.
func (tc *RescheduleE2ETest) TestRescheduleMaxAttempts(f *framework.F) {
	const jobFile = "rescheduling/input/rescheduling_fail.nomad"

	jobID := "test-reschedule-fail-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, jobFile))
	tc.jobIds = append(tc.jobIds, jobID)

	allFailed := []string{"failed", "failed", "failed"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, allFailed),
		"should have exactly 3 failed allocs",
	)

	// Re-register the same job under the same ID with new task arguments.
	job, parseErr := jobspec.ParseFile(jobFile)
	f.NoError(parseErr)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"}
	_, _, registerErr := tc.Nomad().Jobs().Register(job, nil)
	f.NoError(registerErr, "could not register updated job")

	listStatuses := func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }
	anyRunning := func(statuses []string) bool {
		for i := 0; i < len(statuses); i++ {
			if statuses[i] == "running" {
				return true
			}
		}
		return false
	}
	f.NoError(
		e2e.WaitForAllocStatusComparison(listStatuses, anyRunning, nil),
		"should have at least 1 running alloc",
	)
}
// TestRescheduleSuccess runs a job that should end up running after being
// rescheduled; it passes once at least one allocation reports "running".
func (tc *RescheduleE2ETest) TestRescheduleSuccess(f *framework.F) {
	jobID := "test-reschedule-success-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_success.nomad"))
	tc.jobIds = append(tc.jobIds, jobID)

	listStatuses := func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }
	anyRunning := func(statuses []string) bool {
		for i := 0; i < len(statuses); i++ {
			if statuses[i] == "running" {
				return true
			}
		}
		return false
	}

	waitErr := e2e.WaitForAllocStatusComparison(listStatuses, anyRunning, nil)
	f.NoError(waitErr, "should have at least 1 running alloc")
}
// TestRescheduleWithUpdate starts a job with three running allocations,
// updates it so that it fails, and verifies that rescheduled allocations show
// up afterwards.
func (tc *RescheduleE2ETest) TestRescheduleWithUpdate(f *framework.F) {
	const jobFile = "rescheduling/input/rescheduling_update.nomad"

	jobID := "test-reschedule-update-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, jobFile))
	tc.jobIds = append(tc.jobIds, jobID)

	healthy := []string{"running", "running", "running"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, healthy),
		"should have exactly 3 running allocs",
	)

	// Update the job under the same ID so that it fails.
	job, parseErr := jobspec.ParseFile(jobFile)
	f.NoError(parseErr)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
	_, _, registerErr := tc.Nomad().Jobs().Register(job, nil)
	f.NoError(registerErr, "could not register updated job")

	rescheduled := func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }
	nonEmpty := func(statuses []string) bool { return len(statuses) > 0 }
	f.NoError(
		e2e.WaitForAllocStatusComparison(rescheduled, nonEmpty, nil),
		"should have rescheduled allocs until progress deadline",
	)
}
// TestRescheduleWithCanary waits for a healthy, successfully deployed job,
// updates it so that it fails, and verifies that the canary gets rescheduled
// while the latest deployment is still reported as "running".
func (tc *RescheduleE2ETest) TestRescheduleWithCanary(f *framework.F) {
	const jobFile = "rescheduling/input/rescheduling_canary.nomad"

	jobID := "test-reschedule-canary-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, jobFile))
	tc.jobIds = append(tc.jobIds, jobID)

	healthy := []string{"running", "running", "running"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, healthy),
		"should have exactly 3 running allocs",
	)
	f.NoError(
		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		"deployment should be successful",
	)

	// Update the job under the same ID so that it fails.
	job, parseErr := jobspec.ParseFile(jobFile)
	f.NoError(parseErr)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
	_, _, registerErr := tc.Nomad().Jobs().Register(job, nil)
	f.NoError(registerErr, "could not register updated job")

	rescheduled := func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }
	nonEmpty := func(statuses []string) bool { return len(statuses) > 0 }
	f.NoError(
		e2e.WaitForAllocStatusComparison(rescheduled, nonEmpty, nil),
		"should have rescheduled allocs until progress deadline",
	)

	f.NoError(
		e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil),
		"deployment should be running",
	)
}
// TestRescheduleWithCanaryAutoRevert waits for a healthy, successfully
// deployed job, updates it so that it fails, and verifies that the job gets
// reverted: three failed and three running allocations, with the latest
// deployment marked "successful".
func (tc *RescheduleE2ETest) TestRescheduleWithCanaryAutoRevert(f *framework.F) {
	const jobFile = "rescheduling/input/rescheduling_canary_autorevert.nomad"

	jobID := "test-reschedule-canary-revert-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, jobFile))
	tc.jobIds = append(tc.jobIds, jobID)

	healthy := []string{"running", "running", "running"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, healthy),
		"should have exactly 3 running allocs",
	)
	f.NoError(
		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		"deployment should be successful",
	)

	// Update the job under the same ID so that it fails.
	job, parseErr := jobspec.ParseFile(jobFile)
	f.NoError(parseErr)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
	_, _, registerErr := tc.Nomad().Jobs().Register(job, nil)
	f.NoError(registerErr, "could not register updated job")

	rescheduled := func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }
	nonEmpty := func(statuses []string) bool { return len(statuses) > 0 }
	f.NoError(
		e2e.WaitForAllocStatusComparison(rescheduled, nonEmpty, nil),
		"should have new allocs after update",
	)

	// The update then fails and the job is reverted.
	reverted := []string{"failed", "failed", "failed", "running", "running", "running"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, reverted),
		"should have exactly 3 running reverted allocs",
	)

	f.NoError(
		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		"deployment should be successful",
	)
}
// TestRescheduleMaxParallel updates a job that has a max_parallel config so
// that it fails, then waits for the sorted allocation statuses to be one
// complete, two failed and two running while the latest deployment is still
// "running".
func (tc *RescheduleE2ETest) TestRescheduleMaxParallel(f *framework.F) {
	const jobFile = "rescheduling/input/rescheduling_maxp.nomad"

	jobID := "test-reschedule-maxp-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, jobFile))
	tc.jobIds = append(tc.jobIds, jobID)

	healthy := []string{"running", "running", "running"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, healthy),
		"should have exactly 3 running allocs",
	)
	f.NoError(
		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		"deployment should be successful",
	)

	// Update the job under the same ID so that it fails.
	job, parseErr := jobspec.ParseFile(jobFile)
	f.NoError(parseErr)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
	_, _, registerErr := tc.Nomad().Jobs().Register(job, nil)
	f.NoError(registerErr, "could not register updated job")

	// The wanted list is written in sorted order because the observed
	// statuses are sorted before the comparison.
	wantSorted := []string{"complete", "failed", "failed", "running", "running"}
	listStatuses := func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }
	matchesWanted := func(statuses []string) bool {
		sort.Strings(statuses)
		return reflect.DeepEqual(statuses, wantSorted)
	}
	f.NoError(
		e2e.WaitForAllocStatusComparison(listStatuses, matchesWanted, nil),
		"should have failed allocs including rescheduled failed allocs",
	)

	f.NoError(
		e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil),
		"deployment should be running",
	)
}
// TestRescheduleMaxParallelAutoRevert updates a job with a max_parallel
// config so that it fails and is automatically reverted. It checks the
// allocation statuses after the revert, waits for the latest deployment to be
// "successful", and finally asserts that the job has exactly one failed and
// two successful deployments.
func (tc *RescheduleE2ETest) TestRescheduleMaxParallelAutoRevert(f *framework.F) {
	jobID := "test-reschedule-maxp-revert-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp_autorevert.nomad"))
	// Remember the job so AfterEach can purge it.
	tc.jobIds = append(tc.jobIds, jobID)

	expected := []string{"running", "running", "running"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
		"should have exactly 3 running allocs",
	)

	f.NoError(
		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		"deployment should be successful")

	// reschedule to make fail
	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp_autorevert.nomad")
	f.NoError(err)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
	_, _, err = tc.Nomad().Jobs().Register(job, nil)
	// The job is registered through the API client here, so the message
	// matches the wording used by the other tests in this file.
	f.NoError(err, "could not register updated job")

	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
			func(got []string) bool { return len(got) > 0 }, nil,
		),
		"should have new allocs after update",
	)

	// wait for the revert; the expected list is in sorted order because the
	// observed statuses are sorted before the comparison
	expected = []string{"complete", "failed", "running", "running", "running"}
	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
			func(got []string) bool {
				sort.Strings(got)
				return reflect.DeepEqual(got, expected)
			}, nil,
		),
		"should have one successful, one failed, and 3 reverted allocs",
	)

	// at this point the allocs have been checked but we need to wait for the
	// deployment to be marked complete before we can assert that it's successful
	// and verify the count of deployments
	f.NoError(
		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		"most recent deployment should be successful")

	// `nomad deployment status` without an ID is parsed as a table; only the
	// rows belonging to this job are counted.
	out, err := e2e.Command("nomad", "deployment", "status")
	f.NoError(err, "could not get deployment status")

	results, err := e2e.ParseColumns(out)
	f.NoError(err, "could not parse deployment status")
	statuses := map[string]int{}
	for _, row := range results {
		if row["Job ID"] == jobID {
			statuses[row["Status"]]++
		}
	}

	f.Equal(1, statuses["failed"],
		fmt.Sprintf("expected only 1 failed deployment, got:\n%s", out))
	f.Equal(2, statuses["successful"],
		fmt.Sprintf("expected 2 successful deployments, got:\n%s", out))
}
// TestRescheduleProgressDeadline verifies that the progress deadline is reset
// with each healthy allocation: it reads the deadline of the job's latest
// deployment, sleeps 20 seconds, reads it again and expects a different
// value, then waits for the deployment to become "successful".
func (tc *RescheduleE2ETest) TestRescheduleProgressDeadline(f *framework.F) {
	jobID := "test-reschedule-deadline-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline.nomad"))
	tc.jobIds = append(tc.jobIds, jobID)

	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, []string{"running"}),
		"should have a running allocation",
	)

	deploymentID, err := e2e.LastDeploymentID(jobID, ns)
	f.NoError(err, "couldn't look up deployment")

	before, err := getProgressDeadline(deploymentID)
	f.NoError(err, "could not get progress deadline")

	time.Sleep(20 * time.Second)

	after, err := getProgressDeadline(deploymentID)
	f.NoError(err, "could not get new progress deadline")
	f.NotEqual(before, after, "progress deadline should have been updated")

	f.NoError(
		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		"deployment should be successful",
	)
}
// TestRescheduleProgressDeadlineFail verifies that the progress deadline is
// NOT updated when a deployment's allocations fail: it records the deadline
// of the job's latest deployment, waits for that deployment to be "failed"
// with only failed allocations, and asserts the deadline is unchanged.
func (tc *RescheduleE2ETest) TestRescheduleProgressDeadlineFail(f *framework.F) {
	jobID := "test-reschedule-deadline-fail" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline_fail.nomad"))
	// Remember the job so AfterEach can purge it.
	tc.jobIds = append(tc.jobIds, jobID)

	// Nothing above waits for the job to be scheduled, so the deployment may
	// not be visible yet. Poll for it with a bounded number of attempts
	// instead of failing on the very first lookup.
	const (
		lookupAttempts = 30
		lookupInterval = time.Second
	)
	var (
		deploymentID string
		err          error
	)
	for attempt := 1; ; attempt++ {
		deploymentID, err = e2e.LastDeploymentID(jobID, ns)
		if (err == nil && deploymentID != "") || attempt >= lookupAttempts {
			break
		}
		time.Sleep(lookupInterval)
	}
	f.NoError(err, "couldn't look up deployment")

	oldDeadline, err := getProgressDeadline(deploymentID)
	f.NoError(err, "could not get progress deadline")
	time.Sleep(time.Second * 20)

	f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "failed", nil),
		"deployment should be failed")

	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
			func(got []string) bool {
				for _, status := range got {
					if status != "failed" {
						return false
					}
				}
				return true
			}, nil,
		),
		"should have only failed allocs",
	)

	newDeadline, err := getProgressDeadline(deploymentID)
	f.NoError(err, "could not get new progress deadline")
	f.Equal(oldDeadline, newDeadline, "progress deadline should not have been updated")
}
// getProgressDeadline runs `nomad deployment status <deploymentID>` and
// returns the "Progress Deadline" value from the first row of the output's
// "Deployed" section, parsed as an RFC 3339 timestamp.
//
// An error is returned when the command fails, when the "Deployed" section is
// missing or cannot be parsed, when it has no rows or no "Progress Deadline"
// column, or when the value is not a valid timestamp. The returned time is
// the zero value whenever the error is non-nil.
func getProgressDeadline(deploymentID string) (time.Time, error) {
	out, err := e2e.Command("nomad", "deployment", "status", deploymentID)
	if err != nil {
		return time.Time{}, fmt.Errorf("could not get deployment status: %w\n%v", err, out)
	}

	section, err := e2e.GetSection(out, "Deployed")
	if err != nil {
		return time.Time{}, fmt.Errorf("could not find Deployed section: %w", err)
	}

	rows, err := e2e.ParseColumns(section)
	if err != nil {
		return time.Time{}, fmt.Errorf("could not parse Deployed section: %w", err)
	}
	// Indexing rows[0] on an empty result would panic and take the whole
	// test binary down, so report it as an ordinary error instead.
	if len(rows) == 0 {
		return time.Time{}, fmt.Errorf("no rows found in Deployed section:\n%v", section)
	}

	raw, ok := rows[0]["Progress Deadline"]
	if !ok {
		return time.Time{}, fmt.Errorf("no Progress Deadline column in Deployed section:\n%v", section)
	}

	// time.RFC3339 is the same layout string as the original literal
	// "2006-01-02T15:04:05Z07:00", which was taken from command/helpers.go.
	deadline, err := time.Parse(time.RFC3339, raw)
	if err != nil {
		return time.Time{}, fmt.Errorf("could not parse progress deadline %q: %w", raw, err)
	}
	return deadline, nil
}