package rescheduling

import (
	"fmt"
	"os"
	"reflect"
	"sort"
	"time"

	e2e "github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/e2e/framework"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/jobspec"
)
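
// ns is the namespace passed to the e2e helpers; the empty string
// targets Nomad's default namespace.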
const ns = ""

type RescheduleE2ETest struct {
	framework.TC
	jobIds []string
}

func init() {
	framework.AddSuites(&framework.TestSuite{
		Component:   "Rescheduling",
		CanRunLocal: true,
		Consul:      true,
		Cases: []framework.TestCase{
			new(RescheduleE2ETest),
		},
	})
}
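
// BeforeAll waits for a cluster leader and at least 1 ready client node
// before any of the test cases run.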
func (tc *RescheduleE2ETest) BeforeAll(f *framework.F) {
	e2e.WaitForLeader(f.T(), tc.Nomad())
	e2e.WaitForNodesReady(f.T(), tc.Nomad(), 1)
}
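
// AfterEach stops and purges the jobs registered by the previous test case
// and garbage-collects the cluster, unless NOMAD_TEST_SKIPCLEANUP is set.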
func (tc *RescheduleE2ETest) AfterEach(f *framework.F) {
	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
		return
	}

	for _, id := range tc.jobIds {
		_, err := e2e.Command("nomad", "job", "stop", "-purge", id)
		f.Assert().NoError(err)
	}
	tc.jobIds = []string{}
	_, err := e2e.Command("nomad", "system", "gc")
	f.Assert().NoError(err)
}

// TestNoReschedule runs a job that should fail and never reschedule
func (tc *RescheduleE2ETest) TestNoReschedule(f *framework.F) {
	jobID := "test-no-reschedule-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/norescheduling.nomad"))
	tc.jobIds = append(tc.jobIds, jobID)

	expected := []string{"failed", "failed", "failed"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
		"should have exactly 3 failed allocs",
	)
}

// TestNoRescheduleSystem runs a system job that should fail and never reschedule
func (tc *RescheduleE2ETest) TestNoRescheduleSystem(f *framework.F) {
	jobID := "test-reschedule-system-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_system.nomad"))
	tc.jobIds = append(tc.jobIds, jobID)

	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
			func(got []string) bool {
				for _, status := range got {
					if status != "failed" {
						return false
					}
				}
				return true
			}, nil,
		),
		"should have only failed allocs",
	)
}

// TestDefaultReschedule runs a job that should reschedule after delay
func (tc *RescheduleE2ETest) TestDefaultReschedule(f *framework.F) {
	jobID := "test-default-reschedule-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_default.nomad"))
	tc.jobIds = append(tc.jobIds, jobID)

	expected := []string{"failed", "failed", "failed"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
		"should have exactly 3 failed allocs",
	)

	// TODO(tgross): return early if "slow" isn't set
	// wait until first exponential delay kicks in and rescheduling is attempted
	time.Sleep(time.Second * 35)
	expected = []string{"failed", "failed", "failed", "failed", "failed", "failed"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
		"should have exactly 6 failed allocs after 35s",
	)
}

// TestRescheduleMaxAttempts runs a job with a limited number of reschedule attempts
func (tc *RescheduleE2ETest) TestRescheduleMaxAttempts(f *framework.F) {
	jobID := "test-reschedule-fail-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_fail.nomad"))
	tc.jobIds = append(tc.jobIds, jobID)

	expected := []string{"failed", "failed", "failed"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
		"should have exactly 3 failed allocs",
	)
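
	// re-register the same job with a task that sleeps instead of failing,
	// and verify that a new allocation can still be placed and run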
	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_fail.nomad")
	f.NoError(err)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"}
	_, _, err = tc.Nomad().Jobs().Register(job, nil)
	f.NoError(err, "could not register updated job")

	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
			func(got []string) bool {
				for _, status := range got {
					if status == "running" {
						return true
					}
				}
				return false
			}, nil,
		),
		"should have at least 1 running alloc",
	)
}

// TestRescheduleSuccess runs a job that should be running after rescheduling
func (tc *RescheduleE2ETest) TestRescheduleSuccess(f *framework.F) {
	jobID := "test-reschedule-success-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_success.nomad"))
	tc.jobIds = append(tc.jobIds, jobID)

	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
			func(got []string) bool {
				for _, status := range got {
					if status == "running" {
						return true
					}
				}
				return false
			}, nil,
		),
		"should have at least 1 running alloc",
	)
}

// TestRescheduleWithUpdate updates a running job to fail, and verifies that
// it gets rescheduled
func (tc *RescheduleE2ETest) TestRescheduleWithUpdate(f *framework.F) {
	jobID := "test-reschedule-update-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_update.nomad"))
	tc.jobIds = append(tc.jobIds, jobID)

	expected := []string{"running", "running", "running"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
		"should have exactly 3 running allocs",
	)

	// update the job so that the task fails
	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_update.nomad")
	f.NoError(err)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
	_, _, err = tc.Nomad().Jobs().Register(job, nil)
	f.NoError(err, "could not register updated job")

	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
			func(got []string) bool { return len(got) > 0 }, nil,
		),
		"should have rescheduled allocs until progress deadline",
	)
}

// TestRescheduleWithCanary updates a running job to fail, and verifies that
// the canary gets rescheduled
func (tc *RescheduleE2ETest) TestRescheduleWithCanary(f *framework.F) {
	jobID := "test-reschedule-canary-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary.nomad"))
	tc.jobIds = append(tc.jobIds, jobID)

	expected := []string{"running", "running", "running"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
		"should have exactly 3 running allocs",
	)

	f.NoError(
		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		"deployment should be successful")

	// update the job so that the task fails
	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary.nomad")
	f.NoError(err)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
	_, _, err = tc.Nomad().Jobs().Register(job, nil)
	f.NoError(err, "could not register updated job")

	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
			func(got []string) bool { return len(got) > 0 }, nil,
		),
		"should have rescheduled allocs until progress deadline",
	)
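
	// the failing canaries are rescheduled but never become healthy, so
	// the deployment should still be in progress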
	f.NoError(
		e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil),
		"deployment should be running")
}

// TestRescheduleWithCanaryAutoRevert updates a running job to fail, and
// verifies that the job gets reverted
func (tc *RescheduleE2ETest) TestRescheduleWithCanaryAutoRevert(f *framework.F) {
	jobID := "test-reschedule-canary-revert-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary_autorevert.nomad"))
	tc.jobIds = append(tc.jobIds, jobID)

	expected := []string{"running", "running", "running"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
		"should have exactly 3 running allocs",
	)

	f.NoError(
		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		"deployment should be successful")

	// update the job so that the task fails
	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary_autorevert.nomad")
	f.NoError(err)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
	_, _, err = tc.Nomad().Jobs().Register(job, nil)
	f.NoError(err, "could not register updated job")

	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
			func(got []string) bool { return len(got) > 0 }, nil,
		),
		"should have new allocs after update",
	)

	// then we'll fail and revert
	expected = []string{"failed", "failed", "failed", "running", "running", "running"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
		"should have exactly 3 running reverted allocs",
	)
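
	// the deployment should auto-revert to the previously stable job
	// version and then complete successfully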
	f.NoError(
		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		"deployment should be successful")
}

// TestRescheduleMaxParallel updates a job with a max_parallel config
func (tc *RescheduleE2ETest) TestRescheduleMaxParallel(f *framework.F) {
	jobID := "test-reschedule-maxp-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp.nomad"))
	tc.jobIds = append(tc.jobIds, jobID)

	expected := []string{"running", "running", "running"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
		"should have exactly 3 running allocs",
	)

	f.NoError(
		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		"deployment should be successful")

	// update the job so that the task fails
	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp.nomad")
	f.NoError(err)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
	_, _, err = tc.Nomad().Jobs().Register(job, nil)
	f.NoError(err, "could not register updated job")

	expected = []string{"complete", "failed", "failed", "running", "running"}

	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
			func(got []string) bool {
				sort.Strings(got)
				return reflect.DeepEqual(got, expected)
			}, nil,
		),
		"should have failed allocs including rescheduled failed allocs",
	)
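
	// the failing update cannot make progress, so the most recent
	// deployment should still be running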
	f.NoError(
		e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil),
		"deployment should be running")
}

// TestRescheduleMaxParallelAutoRevert updates a job with a max_parallel
// config that will autorevert on failure
func (tc *RescheduleE2ETest) TestRescheduleMaxParallelAutoRevert(f *framework.F) {
	jobID := "test-reschedule-maxp-revert-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp_autorevert.nomad"))
	tc.jobIds = append(tc.jobIds, jobID)

	expected := []string{"running", "running", "running"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
		"should have exactly 3 running allocs",
	)

	f.NoError(
		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		"deployment should be successful")

	// update the job so that the task fails
	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp_autorevert.nomad")
	f.NoError(err)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
	_, _, err = tc.Nomad().Jobs().Register(job, nil)
	f.NoError(err, "could not register updated job")

	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
			func(got []string) bool { return len(got) > 0 }, nil,
		),
		"should have new allocs after update",
	)

	// wait for the revert
	expected = []string{"complete", "failed", "running", "running", "running"}
	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
			func(got []string) bool {
				sort.Strings(got)
				return reflect.DeepEqual(got, expected)
			}, nil,
		),
		"should have one complete, one failed, and 3 running reverted allocs",
	)

	// at this point the allocs have been checked but we need to wait for the
	// deployment to be marked complete before we can assert that it's successful
	// and verify the count of deployments
	f.NoError(
		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		"most recent deployment should be successful")

	out, err := e2e.Command("nomad", "deployment", "status")
	f.NoError(err, "could not get deployment status")

	results, err := e2e.ParseColumns(out)
	f.NoError(err, "could not parse deployment status")
	statuses := map[string]int{}
	for _, row := range results {
		if row["Job ID"] == jobID {
			statuses[row["Status"]]++
		}
	}
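
	// expect one failed deployment (the bad update) and two successful
	// deployments (the initial run and the auto-revert)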
	f.Equal(1, statuses["failed"],
		fmt.Sprintf("expected only 1 failed deployment, got:\n%s", out))
	f.Equal(2, statuses["successful"],
		fmt.Sprintf("expected 2 successful deployments, got:\n%s", out))
}

// TestRescheduleProgressDeadline verifies that the progress deadline is
// reset with each healthy allocation
func (tc *RescheduleE2ETest) TestRescheduleProgressDeadline(f *framework.F) {
	jobID := "test-reschedule-deadline-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline.nomad"))
	tc.jobIds = append(tc.jobIds, jobID)

	expected := []string{"running"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
		"should have a running allocation",
	)

	deploymentID, err := e2e.LastDeploymentID(jobID, ns)
	f.NoError(err, "couldn't look up deployment")

	oldDeadline, err := getProgressDeadline(deploymentID)
	f.NoError(err, "could not get progress deadline")
	time.Sleep(time.Second * 20)

	newDeadline, err := getProgressDeadline(deploymentID)
	f.NoError(err, "could not get new progress deadline")
	f.NotEqual(oldDeadline, newDeadline, "progress deadline should have been updated")

	f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		"deployment should be successful")
}

// TestRescheduleProgressDeadlineFail verifies that a failing allocation does
// not reset the progress deadline
func (tc *RescheduleE2ETest) TestRescheduleProgressDeadlineFail(f *framework.F) {
	jobID := "test-reschedule-deadline-fail-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline_fail.nomad"))
	tc.jobIds = append(tc.jobIds, jobID)

	deploymentID, err := e2e.LastDeploymentID(jobID, ns)
	f.NoError(err, "couldn't look up deployment")

	oldDeadline, err := getProgressDeadline(deploymentID)
	f.NoError(err, "could not get progress deadline")
	time.Sleep(time.Second * 20)

	f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "failed", nil),
		"deployment should be failed")

	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
			func(got []string) bool {
				for _, status := range got {
					if status != "failed" {
						return false
					}
				}
				return true
			}, nil,
		),
		"should have only failed allocs",
	)

	newDeadline, err := getProgressDeadline(deploymentID)
	f.NoError(err, "could not get new progress deadline")
	f.Equal(oldDeadline, newDeadline, "progress deadline should not have been updated")
}
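
// getProgressDeadline returns the progress deadline for a deployment, parsed
// from the "Deployed" section of the `nomad deployment status` output.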
func getProgressDeadline(deploymentID string) (time.Time, error) {
	out, err := e2e.Command("nomad", "deployment", "status", deploymentID)
	if err != nil {
		return time.Time{}, fmt.Errorf("could not get deployment status: %v\n%v", err, out)
	}

	section, err := e2e.GetSection(out, "Deployed")
	if err != nil {
		return time.Time{}, fmt.Errorf("could not find Deployed section: %w", err)
	}

	rows, err := e2e.ParseColumns(section)
	if err != nil {
		return time.Time{}, fmt.Errorf("could not parse Deployed section: %w", err)
	}
	// guard against an empty table so we don't panic on rows[0] below
	if len(rows) == 0 {
		return time.Time{}, fmt.Errorf("no rows found in Deployed section:\n%v", section)
	}

	layout := "2006-01-02T15:04:05Z07:00" // taken from command/helpers.go
	raw := rows[0]["Progress Deadline"]
	return time.Parse(layout, raw)
}