From 7057c0c8868d67aa0ebd8d2cbb73456575878027 Mon Sep 17 00:00:00 2001 From: hc-github-team-nomad-core <82989552+hc-github-team-nomad-core@users.noreply.github.com> Date: Thu, 16 Nov 2023 14:52:05 -0600 Subject: [PATCH] e2e: fix and modernize rescheduling test (#19105) (#19107) The E2E test suite for rescheduling had a few bugs: * Using the command line to stop a job with a failing deployment returns a non-zero exit code, which would cause an otherwise passing test to fail. * Two of the input jobs were actually invalid but were only correctly detected as such because of #17342 This changeset also updates the whole test suite to move it off the v1 "framework". A few test assertions are also de-flaked. Fixes: #19076 Co-authored-by: Tim Gross --- e2e/e2e_test.go | 2 +- e2e/rescheduling/doc.go | 7 + ...ing.nomad => norescheduling_service.nomad} | 0 ...stem.nomad => norescheduling_system.nomad} | 0 .../rescheduling_canary_autorevert.nomad | 2 +- .../input/rescheduling_maxp_autorevert.nomad | 2 +- .../rescheduling_progressdeadline_fail.nomad | 4 +- .../input/rescheduling_success.nomad | 7 +- e2e/rescheduling/rescheduling.go | 500 ----------------- e2e/rescheduling/rescheduling_test.go | 509 ++++++++++++++++++ 10 files changed, 525 insertions(+), 508 deletions(-) create mode 100644 e2e/rescheduling/doc.go rename e2e/rescheduling/input/{norescheduling.nomad => norescheduling_service.nomad} (100%) rename e2e/rescheduling/input/{rescheduling_system.nomad => norescheduling_system.nomad} (100%) delete mode 100644 e2e/rescheduling/rescheduling.go create mode 100644 e2e/rescheduling/rescheduling_test.go diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go index 674920ced..2c215b7a7 100644 --- a/e2e/e2e_test.go +++ b/e2e/e2e_test.go @@ -31,7 +31,6 @@ import ( _ "github.com/hashicorp/nomad/e2e/podman" _ "github.com/hashicorp/nomad/e2e/quotas" _ "github.com/hashicorp/nomad/e2e/remotetasks" - _ "github.com/hashicorp/nomad/e2e/rescheduling" _ "github.com/hashicorp/nomad/e2e/scaling" _ "github.com/hashicorp/nomad/e2e/scalingpolicies" _ "github.com/hashicorp/nomad/e2e/scheduler_sysbatch" @@ -45,6 +44,7 @@ import ( _ "github.com/hashicorp/nomad/e2e/disconnectedclients" _ "github.com/hashicorp/nomad/e2e/namespaces" _ "github.com/hashicorp/nomad/e2e/nodedrain" + _ "github.com/hashicorp/nomad/e2e/rescheduling" _ "github.com/hashicorp/nomad/e2e/volumes" ) diff --git a/e2e/rescheduling/doc.go b/e2e/rescheduling/doc.go new file mode 100644 index 000000000..b12ca33e5 --- /dev/null +++ b/e2e/rescheduling/doc.go @@ -0,0 +1,7 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package rescheduling + +// This package contains only tests, so this is a placeholder file to +// make sure builds don't fail with "no non-test Go files in" errors diff --git a/e2e/rescheduling/input/norescheduling.nomad b/e2e/rescheduling/input/norescheduling_service.nomad similarity index 100% rename from e2e/rescheduling/input/norescheduling.nomad rename to e2e/rescheduling/input/norescheduling_service.nomad diff --git a/e2e/rescheduling/input/rescheduling_system.nomad b/e2e/rescheduling/input/norescheduling_system.nomad similarity index 100% rename from e2e/rescheduling/input/rescheduling_system.nomad rename to e2e/rescheduling/input/norescheduling_system.nomad diff --git a/e2e/rescheduling/input/rescheduling_canary_autorevert.nomad b/e2e/rescheduling/input/rescheduling_canary_autorevert.nomad index 6b9b42e74..bcbe122bf 100644 --- a/e2e/rescheduling/input/rescheduling_canary_autorevert.nomad +++ b/e2e/rescheduling/input/rescheduling_canary_autorevert.nomad @@ -30,7 +30,7 @@ job "test" { min_healthy_time = "1s" auto_revert = true healthy_deadline = "2s" - progress_deadline = "3s" + progress_deadline = "5s" } restart { diff --git a/e2e/rescheduling/input/rescheduling_maxp_autorevert.nomad b/e2e/rescheduling/input/rescheduling_maxp_autorevert.nomad index 7e3c3d5f1..b42dd6301 100644 --- a/e2e/rescheduling/input/rescheduling_maxp_autorevert.nomad +++ b/e2e/rescheduling/input/rescheduling_maxp_autorevert.nomad @@ -29,7 +29,7 @@ job "demo3" { min_healthy_time = "1s" auto_revert = true healthy_deadline = "2s" - progress_deadline = "3s" + progress_deadline = "5s" } restart { diff --git a/e2e/rescheduling/input/rescheduling_progressdeadline_fail.nomad b/e2e/rescheduling/input/rescheduling_progressdeadline_fail.nomad index 65ff943c9..edf13a52d 100644 --- a/e2e/rescheduling/input/rescheduling_progressdeadline_fail.nomad +++ b/e2e/rescheduling/input/rescheduling_progressdeadline_fail.nomad @@ -27,7 +27,7 @@ job "demo2" { update { # we want the first allocation to take a while before we give up on it, # so that we can check the deployment's progress deadline before and - # after it becomes healthy + # after we determine it will never become healthy min_healthy_time = "10s" healthy_deadline = "15s" progress_deadline = "20s" @@ -42,7 +42,7 @@ job "demo2" { } reschedule { - unlimited = "true" + unlimited = true delay_function = "constant" delay = "5s" } diff --git a/e2e/rescheduling/input/rescheduling_success.nomad b/e2e/rescheduling/input/rescheduling_success.nomad index dc8cde2ac..81c45ab3e 100644 --- a/e2e/rescheduling/input/rescheduling_success.nomad +++ b/e2e/rescheduling/input/rescheduling_success.nomad @@ -31,9 +31,10 @@ job "test3" { } reschedule { - attempts = 2 - interval = "5m" - unlimited = false + delay = "5s" + delay_function = "constant" + unlimited = true } + } } diff --git a/e2e/rescheduling/rescheduling.go b/e2e/rescheduling/rescheduling.go deleted file mode 100644 index b315505c9..000000000 --- a/e2e/rescheduling/rescheduling.go +++ /dev/null @@ -1,500 +0,0 @@ -// Copyright (c) HashiCorp, Inc. -// SPDX-License-Identifier: MPL-2.0 - -package rescheduling - -import ( - "fmt" - "os" - "reflect" - "sort" - "time" - - e2e "github.com/hashicorp/nomad/e2e/e2eutil" - "github.com/hashicorp/nomad/e2e/framework" - "github.com/hashicorp/nomad/helper/uuid" - "github.com/hashicorp/nomad/jobspec" - "github.com/hashicorp/nomad/testutil" -) - -const ns = "" - -type RescheduleE2ETest struct { - framework.TC - jobIds []string -} - -func init() { - framework.AddSuites(&framework.TestSuite{ - Component: "Rescheduling", - CanRunLocal: true, - Consul: true, - Cases: []framework.TestCase{ - new(RescheduleE2ETest), - }, - }) - -} - -func (tc *RescheduleE2ETest) BeforeAll(f *framework.F) { - e2e.WaitForLeader(f.T(), tc.Nomad()) - e2e.WaitForNodesReady(f.T(), tc.Nomad(), 1) -} - -func (tc *RescheduleE2ETest) AfterEach(f *framework.F) { - if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" { - return - } - - for _, id := range tc.jobIds { - err := e2e.StopJob(id, "-purge") - f.Assert().NoError(err) - } - tc.jobIds = []string{} - _, err := e2e.Command("nomad", "system", "gc") - f.Assert().NoError(err) -} - -// TestNoReschedule runs a job that should fail and never reschedule -func (tc *RescheduleE2ETest) TestNoReschedule(f *framework.F) { - jobID := "test-no-reschedule-" + uuid.Generate()[0:8] - f.NoError(e2e.Register(jobID, "rescheduling/input/norescheduling.nomad")) - tc.jobIds = append(tc.jobIds, jobID) - - expected := []string{"failed", "failed", "failed"} - f.NoError( - e2e.WaitForAllocStatusExpected(jobID, ns, expected), - "should have exactly 3 failed allocs", - ) -} - -// TestNoRescheduleSystem runs a system job that should fail and never reschedule -func (tc *RescheduleE2ETest) TestNoRescheduleSystem(f *framework.F) { - jobID := "test-reschedule-system-" + uuid.Generate()[0:8] - f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_system.nomad")) - tc.jobIds = append(tc.jobIds, jobID) - - f.NoError( - e2e.WaitForAllocStatusComparison( - func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, - func(got []string) bool { - for _, status := range got { - if status != "failed" { - return false - } - } - return true - }, nil, - ), - "should have only failed allocs", - ) -} - -// TestDefaultReschedule runs a job that should reschedule after delay -func (tc *RescheduleE2ETest) TestDefaultReschedule(f *framework.F) { - - jobID := "test-default-reschedule-" + uuid.Generate()[0:8] - f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_default.nomad")) - tc.jobIds = append(tc.jobIds, jobID) - - expected := []string{"failed", "failed", "failed"} - f.NoError( - e2e.WaitForAllocStatusExpected(jobID, ns, expected), - "should have exactly 3 failed allocs", - ) - - // TODO(tgross): return early if "slow" isn't set - // wait until first exponential delay kicks in and rescheduling is attempted - time.Sleep(time.Second * 35) - expected = []string{"failed", "failed", "failed", "failed", "failed", "failed"} - f.NoError( - e2e.WaitForAllocStatusExpected(jobID, ns, expected), - "should have exactly 6 failed allocs after 35s", - ) -} - -// TestRescheduleMaxAttempts runs a job with a maximum reschedule attempts -func (tc *RescheduleE2ETest) TestRescheduleMaxAttempts(f *framework.F) { - - jobID := "test-reschedule-fail-" + uuid.Generate()[0:8] - f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_fail.nomad")) - tc.jobIds = append(tc.jobIds, jobID) - - expected := []string{"failed", "failed", "failed"} - f.NoError( - e2e.WaitForAllocStatusExpected(jobID, ns, expected), - "should have exactly 3 failed allocs", - ) - - job, err := jobspec.ParseFile("rescheduling/input/rescheduling_fail.nomad") - f.NoError(err) - job.ID = &jobID - job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"} - _, _, err = tc.Nomad().Jobs().Register(job, nil) - f.NoError(err, "could not register updated job") - - f.NoError( - e2e.WaitForAllocStatusComparison( - func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, - func(got []string) bool { - for _, status := range got { - if status == "running" { - return true - } - } - return false - }, nil, - ), - "should have at least 1 running alloc", - ) -} - -// TestRescheduleSuccess runs a job that should be running after rescheduling -func (tc *RescheduleE2ETest) TestRescheduleSuccess(f *framework.F) { - - jobID := "test-reschedule-success-" + uuid.Generate()[0:8] - f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_success.nomad")) - tc.jobIds = append(tc.jobIds, jobID) - - f.NoError( - e2e.WaitForAllocStatusComparison( - func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, - func(got []string) bool { - for _, status := range got { - if status == "running" { - return true - } - } - return false - }, nil, - ), - "should have at least 1 running alloc", - ) -} - -// TestRescheduleWithUpdate updates a running job to fail, and verifies that -// it gets rescheduled -func (tc *RescheduleE2ETest) TestRescheduleWithUpdate(f *framework.F) { - - jobID := "test-reschedule-update-" + uuid.Generate()[0:8] - f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_update.nomad")) - tc.jobIds = append(tc.jobIds, jobID) - - expected := []string{"running", "running", "running"} - f.NoError( - e2e.WaitForAllocStatusExpected(jobID, ns, expected), - "should have exactly 3 running allocs", - ) - - // reschedule to make fail - job, err := jobspec.ParseFile("rescheduling/input/rescheduling_update.nomad") - f.NoError(err) - job.ID = &jobID - job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} - _, _, err = tc.Nomad().Jobs().Register(job, nil) - f.NoError(err, "could not register updated job") - - f.NoError( - e2e.WaitForAllocStatusComparison( - func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, - func(got []string) bool { return len(got) > 0 }, nil, - ), - "should have rescheduled allocs until progress deadline", - ) -} - -// TestRescheduleWithCanary updates a running job to fail, and verify that the -// canary gets rescheduled -func (tc *RescheduleE2ETest) TestRescheduleWithCanary(f *framework.F) { - - jobID := "test-reschedule-canary-" + uuid.Generate()[0:8] - f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary.nomad")) - tc.jobIds = append(tc.jobIds, jobID) - - expected := []string{"running", "running", "running"} - f.NoError( - e2e.WaitForAllocStatusExpected(jobID, ns, expected), - "should have exactly 3 running allocs", - ) - - f.NoError( - e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), - "deployment should be successful") - - // reschedule to make fail - job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary.nomad") - f.NoError(err) - job.ID = &jobID - job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} - _, _, err = tc.Nomad().Jobs().Register(job, nil) - f.NoError(err, "could not register updated job") - - f.NoError( - e2e.WaitForAllocStatusComparison( - func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, - func(got []string) bool { return len(got) > 0 }, nil, - ), - "should have rescheduled allocs until progress deadline", - ) - - f.NoError( - e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil), - "deployment should be running") -} - -// TestRescheduleWithCanaryAutoRevert updates a running job to fail, and -// verifies that the job gets reverted. -func (tc *RescheduleE2ETest) TestRescheduleWithCanaryAutoRevert(f *framework.F) { - - jobID := "test-reschedule-canary-revert-" + uuid.Generate()[0:8] - f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary_autorevert.nomad")) - tc.jobIds = append(tc.jobIds, jobID) - - expected := []string{"running", "running", "running"} - f.NoError( - e2e.WaitForAllocStatusExpected(jobID, ns, expected), - "should have exactly 3 running allocs", - ) - - f.NoError( - e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), - "deployment should be successful") - - // reschedule to make fail - job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary_autorevert.nomad") - f.NoError(err) - job.ID = &jobID - job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} - _, _, err = tc.Nomad().Jobs().Register(job, nil) - f.NoError(err, "could not register updated job") - - f.NoError( - e2e.WaitForAllocStatusComparison( - func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, - func(got []string) bool { return len(got) > 0 }, nil, - ), - "should have new allocs after update", - ) - - // then we'll fail and revert - expected = []string{"failed", "failed", "failed", "running", "running", "running"} - f.NoError( - e2e.WaitForAllocStatusExpected(jobID, ns, expected), - "should have exactly 3 running reverted allocs", - ) - - f.NoError( - e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), - "deployment should be successful") -} - -// TestRescheduleMaxParallel updates a job with a max_parallel config -func (tc *RescheduleE2ETest) TestRescheduleMaxParallel(f *framework.F) { - - jobID := "test-reschedule-maxp-" + uuid.Generate()[0:8] - f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp.nomad")) - tc.jobIds = append(tc.jobIds, jobID) - - expected := []string{"running", "running", "running"} - f.NoError( - e2e.WaitForAllocStatusExpected(jobID, ns, expected), - "should have exactly 3 running allocs", - ) - - f.NoError( - e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), - "deployment should be successful") - - // reschedule to make fail - job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp.nomad") - f.NoError(err) - job.ID = &jobID - job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} - _, _, err = tc.Nomad().Jobs().Register(job, nil) - f.NoError(err, "could not register updated job") - - expected = []string{"complete", "failed", "failed", "running", "running"} - - f.NoError( - e2e.WaitForAllocStatusComparison( - func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, - func(got []string) bool { - sort.Strings(got) - return reflect.DeepEqual(got, expected) - }, nil, - ), - "should have failed allocs including rescheduled failed allocs", - ) - - f.NoError( - e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil), - "deployment should be running") -} - -// TestRescheduleMaxParallelAutoRevert updates a job with a max_parallel -// config that will autorevert on failure -func (tc *RescheduleE2ETest) TestRescheduleMaxParallelAutoRevert(f *framework.F) { - - jobID := "test-reschedule-maxp-revert-" + uuid.Generate()[0:8] - f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp_autorevert.nomad")) - tc.jobIds = append(tc.jobIds, jobID) - - expected := []string{"running", "running", "running"} - f.NoError( - e2e.WaitForAllocStatusExpected(jobID, ns, expected), - "should have exactly 3 running allocs", - ) - - f.NoError( - e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), - "deployment should be successful") - - // reschedule to make fail - job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp_autorevert.nomad") - f.NoError(err) - job.ID = &jobID - job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} - _, _, err = tc.Nomad().Jobs().Register(job, nil) - f.NoError(err, "could not e2e.Register updated job") - - f.NoError( - e2e.WaitForAllocStatusComparison( - func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, - func(got []string) bool { return len(got) > 0 }, nil, - ), - "should have new allocs after update", - ) - - // wait for the revert - expected = []string{"complete", "failed", "running", "running", "running"} - f.NoError( - e2e.WaitForAllocStatusComparison( - func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, - func(got []string) bool { - sort.Strings(got) - return reflect.DeepEqual(got, expected) - }, nil, - ), - "should have one successful, one failed, and 3 reverted allocs", - ) - - // at this point the allocs have been checked but we need to wait for the - // deployment to be marked complete before we can assert that it's successful - // and verify the count of deployments - f.NoError( - e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), - "most recent deployment should be successful") - - out, err := e2e.Command("nomad", "deployment", "status") - f.NoError(err, "could not get deployment status") - - results, err := e2e.ParseColumns(out) - f.NoError(err, "could not parse deployment status") - statuses := map[string]int{} - for _, row := range results { - if row["Job ID"] == jobID { - statuses[row["Status"]]++ - } - } - - f.Equal(1, statuses["failed"], - fmt.Sprintf("expected only 1 failed deployment, got:\n%s", out)) - f.Equal(2, statuses["successful"], - fmt.Sprintf("expected 2 successful deployments, got:\n%s", out)) -} - -// TestRescheduleProgressDeadline verifies the progress deadline is reset with -// each healthy allocation, and that a rescheduled allocation does not. -func (tc *RescheduleE2ETest) TestRescheduleProgressDeadline(f *framework.F) { - - jobID := "test-reschedule-deadline-" + uuid.Generate()[0:8] - f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline.nomad")) - tc.jobIds = append(tc.jobIds, jobID) - - expected := []string{"running"} - f.NoError( - e2e.WaitForAllocStatusExpected(jobID, ns, expected), - "should have a running allocation", - ) - - deploymentID, err := e2e.LastDeploymentID(jobID, ns) - f.NoError(err, "couldn't look up deployment") - - oldDeadline, err := getProgressDeadline(deploymentID) - f.NoError(err, "could not get progress deadline") - time.Sleep(time.Second * 20) - - newDeadline, err := getProgressDeadline(deploymentID) - f.NoError(err, "could not get new progress deadline") - f.NotEqual(oldDeadline, newDeadline, "progress deadline should have been updated") - - f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), - "deployment should be successful") -} - -// TestRescheduleProgressDeadlineFail verifies the progress deadline is reset with -// each healthy allocation, and that a rescheduled allocation does not. -func (tc *RescheduleE2ETest) TestRescheduleProgressDeadlineFail(f *framework.F) { - - jobID := "test-reschedule-deadline-fail" + uuid.Generate()[0:8] - f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline_fail.nomad")) - tc.jobIds = append(tc.jobIds, jobID) - - testutil.WaitForResult(func() (bool, error) { - _, err := e2e.LastDeploymentID(jobID, ns) - return err == nil, err - }, func(err error) { - f.NoError(err, "deployment wasn't created yet") - }) - - deploymentID, err := e2e.LastDeploymentID(jobID, ns) - f.NoError(err, "couldn't look up deployment") - - oldDeadline, err := getProgressDeadline(deploymentID) - f.NoError(err, "could not get progress deadline") - time.Sleep(time.Second * 20) - - f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "failed", nil), - "deployment should be failed") - - f.NoError( - e2e.WaitForAllocStatusComparison( - func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, - func(got []string) bool { - for _, status := range got { - if status != "failed" { - return false - } - } - return true - }, nil, - ), - "should have only failed allocs", - ) - - newDeadline, err := getProgressDeadline(deploymentID) - f.NoError(err, "could not get new progress deadline") - f.Equal(oldDeadline, newDeadline, "progress deadline should not have been updated") -} - -func getProgressDeadline(deploymentID string) (time.Time, error) { - - out, err := e2e.Command("nomad", "deployment", "status", deploymentID) - if err != nil { - return time.Time{}, fmt.Errorf("could not get deployment status: %v\n%v", err, out) - } - - section, err := e2e.GetSection(out, "Deployed") - if err != nil { - return time.Time{}, fmt.Errorf("could not find Deployed section: %w", err) - } - - rows, err := e2e.ParseColumns(section) - if err != nil { - return time.Time{}, fmt.Errorf("could not parse Deployed section: %w", err) - } - - layout := "2006-01-02T15:04:05Z07:00" // taken from command/helpers.go - raw := rows[0]["Progress Deadline"] - return time.Parse(layout, raw) -} diff --git a/e2e/rescheduling/rescheduling_test.go b/e2e/rescheduling/rescheduling_test.go new file mode 100644 index 000000000..d1a13b483 --- /dev/null +++ b/e2e/rescheduling/rescheduling_test.go @@ -0,0 +1,509 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package rescheduling + +import ( + "os" + "reflect" + "sort" + "testing" + "time" + + "github.com/hashicorp/nomad/e2e/e2eutil" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/jobspec" + "github.com/shoenig/test" + "github.com/shoenig/test/must" + "github.com/shoenig/test/wait" +) + +const ns = "default" + +func cleanupJob(t *testing.T, jobID string) { + if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" { + return + } + + t.Helper() + t.Cleanup(func() { + e2eutil.StopJob(jobID, "-purge", "-detach") + _, err := e2eutil.Command("nomad", "system", "gc") + test.NoError(t, err) + }) +} + +// Note: most of the StopJob calls in this test suite will return an +// error because the job has previously failed and we're not waiting for +// the deployment to end + +// TestRescheduling_Service_NoReschedule runs a service job that should fail and never +// reschedule +func TestRescheduling_Service_NoReschedule(t *testing.T) { + jobID := "test-no-reschedule-" + uuid.Generate()[0:8] + must.NoError(t, e2eutil.Register(jobID, "./input/norescheduling_service.nomad")) + + cleanupJob(t, jobID) + + expected := []string{"failed", "failed", "failed"} + must.NoError(t, + e2eutil.WaitForAllocStatusExpected(jobID, ns, expected), + must.Sprint("should have exactly 3 failed allocs"), + ) +} + +// TestRescheduling_System_NoReschedule runs a system job that should fail and never +// reschedule +func TestRescheduling_System_NoReschedule(t *testing.T) { + jobID := "test-no-reschedule-" + uuid.Generate()[0:8] + must.NoError(t, e2eutil.Register(jobID, "./input/norescheduling_system.nomad")) + + cleanupJob(t, jobID) + + must.NoError(t, + e2eutil.WaitForAllocStatusComparison( + func() ([]string, error) { return e2eutil.AllocStatuses(jobID, ns) }, + func(got []string) bool { + for _, status := range got { + if status != "failed" { + return false + } + } + return true + }, nil, + ), + must.Sprint("should have only failed allocs"), + ) +} + +// TestRescheduling_Default runs a job that should reschedule after delay +func TestRescheduling_Default(t *testing.T) { + jobID := "test-default-reschedule-" + uuid.Generate()[0:8] + must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_default.nomad")) + + cleanupJob(t, jobID) + + expected := []string{"failed", "failed", "failed"} + must.NoError(t, + e2eutil.WaitForAllocStatusExpected(jobID, ns, expected), + must.Sprint("should have exactly 3 failed allocs"), + ) + + // wait until first exponential delay kicks in and rescheduling is attempted + time.Sleep(time.Second * 35) + expected = []string{"failed", "failed", "failed", "failed", "failed", "failed"} + must.NoError(t, + e2eutil.WaitForAllocStatusExpected(jobID, ns, expected), + must.Sprint("should have exactly 6 failed allocs after 35s"), + ) +} + +// TestRescheduling_MaxAttempts runs a job with a maximum reschedule attempts +func TestRescheduling_MaxAttempts(t *testing.T) { + + jobID := "test-reschedule-fail-" + uuid.Generate()[0:8] + must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_fail.nomad")) + + cleanupJob(t, jobID) + + expected := []string{"failed", "failed", "failed"} + must.NoError(t, + e2eutil.WaitForAllocStatusExpected(jobID, ns, expected), + must.Sprint("should have exactly 3 failed allocs"), + ) + + job, err := jobspec.ParseFile("./input/rescheduling_fail.nomad") + must.NoError(t, err) + job.ID = &jobID + job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"} + + nc := e2eutil.NomadClient(t) + _, _, err = nc.Jobs().Register(job, nil) + must.NoError(t, err, must.Sprint("could not register updated job")) + + must.Wait(t, wait.InitialSuccess( + wait.BoolFunc(func() bool { + got, err := e2eutil.AllocStatuses(jobID, ns) + must.NoError(t, err) + for _, status := range got { + if status == "running" { + return true + } + } + return false + }), + wait.Timeout(10*time.Second), + wait.Gap(500*time.Millisecond), + ), must.Sprint("should have at least 1 running alloc")) +} + +// TestRescheduling_Success runs a job that should be running after rescheduling +func TestRescheduling_Success(t *testing.T) { + + jobID := "test-reschedule-success-" + uuid.Generate()[0:8] + must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_success.nomad")) + + cleanupJob(t, jobID) + + must.Wait(t, wait.InitialSuccess( + wait.BoolFunc(func() bool { + got, err := e2eutil.AllocStatuses(jobID, ns) + must.NoError(t, err) + running := 0 + for _, status := range got { + if status == "running" { + running++ + } + } + return running == 3 + }), + wait.Timeout(60*time.Second), // this can take a while! + wait.Gap(500*time.Millisecond), + ), must.Sprint("all 3 allocs should eventually be running")) +} + +// TestRescheduling_WithUpdate updates a running job to fail, and verifies that +// it gets rescheduled +func TestRescheduling_WithUpdate(t *testing.T) { + + jobID := "test-reschedule-update-" + uuid.Generate()[0:8] + must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_update.nomad")) + + cleanupJob(t, jobID) + + expected := []string{"running", "running", "running"} + must.NoError(t, + e2eutil.WaitForAllocStatusExpected(jobID, ns, expected), + must.Sprint("should have exactly 3 running allocs"), + ) + + // reschedule to make fail + job, err := jobspec.ParseFile("./input/rescheduling_update.nomad") + must.NoError(t, err) + job.ID = &jobID + job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} + + nc := e2eutil.NomadClient(t) + _, _, err = nc.Jobs().Register(job, nil) + must.NoError(t, err, must.Sprint("could not register updated job")) + + must.NoError(t, + e2eutil.WaitForAllocStatusComparison( + func() ([]string, error) { return e2eutil.AllocStatusesRescheduled(jobID, ns) }, + func(got []string) bool { return len(got) > 0 }, nil, + ), + must.Sprint("should have rescheduled allocs until progress deadline"), + ) +} + +// TestRescheduling_WithCanary updates a running job to fail, and verify that the +// canary gets rescheduled +func TestRescheduling_WithCanary(t *testing.T) { + + jobID := "test-reschedule-canary-" + uuid.Generate()[0:8] + must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_canary.nomad")) + + cleanupJob(t, jobID) + + expected := []string{"running", "running", "running"} + must.NoError(t, + e2eutil.WaitForAllocStatusExpected(jobID, ns, expected), + must.Sprint("should have exactly 3 running allocs"), + ) + + must.NoError(t, + e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), + must.Sprint("deployment should be successful")) + + // reschedule to make fail + job, err := jobspec.ParseFile("./input/rescheduling_canary.nomad") + must.NoError(t, err) + job.ID = &jobID + job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} + + nc := e2eutil.NomadClient(t) + _, _, err = nc.Jobs().Register(job, nil) + must.NoError(t, err, must.Sprint("could not register updated job")) + + must.NoError(t, + e2eutil.WaitForAllocStatusComparison( + func() ([]string, error) { return e2eutil.AllocStatusesRescheduled(jobID, ns) }, + func(got []string) bool { return len(got) > 0 }, nil, + ), + must.Sprint("should have rescheduled allocs until progress deadline"), + ) + + must.NoError(t, + e2eutil.WaitForLastDeploymentStatus(jobID, ns, "running", nil), + must.Sprint("deployment should be running")) +} + +// TestRescheduling_WithCanaryAutoRevert updates a running job to fail, and +// verifies that the job gets reverted. +func TestRescheduling_WithCanaryAutoRevert(t *testing.T) { + + jobID := "test-reschedule-canary-revert-" + uuid.Generate()[0:8] + must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_canary_autorevert.nomad")) + + cleanupJob(t, jobID) + + expected := []string{"running", "running", "running"} + must.NoError(t, + e2eutil.WaitForAllocStatusExpected(jobID, ns, expected), + must.Sprint("should have exactly 3 running allocs"), + ) + + must.NoError(t, + e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), + must.Sprint("deployment should be successful")) + + // reschedule to make fail + job, err := jobspec.ParseFile("./input/rescheduling_canary_autorevert.nomad") + must.NoError(t, err) + job.ID = &jobID + job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} + + nc := e2eutil.NomadClient(t) + _, _, err = nc.Jobs().Register(job, nil) + must.NoError(t, err, must.Sprint("could not register updated job")) + + must.NoError(t, + e2eutil.WaitForAllocStatusComparison( + func() ([]string, error) { return e2eutil.AllocStatusesRescheduled(jobID, ns) }, + func(got []string) bool { return len(got) > 0 }, nil, + ), + must.Sprint("should have new allocs after update"), + ) + + // then we'll fail and revert + expected = []string{"failed", "failed", "failed", "running", "running", "running"} + must.NoError(t, + e2eutil.WaitForAllocStatusExpected(jobID, ns, expected), + must.Sprint("should have exactly 3 running reverted allocs"), + ) + + must.NoError(t, + e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), + must.Sprint("deployment should be successful")) +} + +// TestRescheduling_MaxParallel updates a job with a max_parallel config +func TestRescheduling_MaxParallel(t *testing.T) { + + jobID := "test-reschedule-maxp-" + uuid.Generate()[0:8] + must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_maxp.nomad")) + + cleanupJob(t, jobID) + + expected := []string{"running", "running", "running"} + must.NoError(t, + e2eutil.WaitForAllocStatusExpected(jobID, ns, expected), + must.Sprint("should have exactly 3 running allocs"), + ) + + must.NoError(t, + e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), + must.Sprint("deployment should be successful")) + + // reschedule to make fail + job, err := jobspec.ParseFile("./input/rescheduling_maxp.nomad") + must.NoError(t, err) + job.ID = &jobID + job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} + + nc := e2eutil.NomadClient(t) + _, _, err = nc.Jobs().Register(job, nil) + must.NoError(t, err, must.Sprint("could not register updated job")) + + expected = []string{"complete", "failed", "failed", "running", "running"} + + must.NoError(t, + e2eutil.WaitForAllocStatusComparison( + func() ([]string, error) { return e2eutil.AllocStatuses(jobID, ns) }, + func(got []string) bool { + sort.Strings(got) + return reflect.DeepEqual(got, expected) + }, nil, + ), + must.Sprint("should have failed allocs including rescheduled failed allocs"), + ) + + must.NoError(t, + e2eutil.WaitForLastDeploymentStatus(jobID, ns, "running", nil), + must.Sprint("deployment should be running")) +} + +// TestRescheduling_MaxParallelAutoRevert updates a job with a max_parallel +// config that will autorevert on failure +func TestRescheduling_MaxParallelAutoRevert(t *testing.T) { + + jobID := "test-reschedule-maxp-revert-" + uuid.Generate()[0:8] + must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_maxp_autorevert.nomad")) + + cleanupJob(t, jobID) + + expected := []string{"running", "running", "running"} + must.NoError(t, + e2eutil.WaitForAllocStatusExpected(jobID, ns, expected), + must.Sprint("should have exactly 3 running allocs"), + ) + + must.NoError(t, + e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), + must.Sprint("deployment should be successful")) + + // reschedule to make fail + job, err := jobspec.ParseFile("./input/rescheduling_maxp_autorevert.nomad") + must.NoError(t, err) + job.ID = &jobID + job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} + + nc := e2eutil.NomadClient(t) + _, _, err = nc.Jobs().Register(job, nil) + must.NoError(t, err, must.Sprint("could not e2eutil.Register updated job")) + + must.NoError(t, + e2eutil.WaitForAllocStatusComparison( + func() ([]string, error) { return e2eutil.AllocStatusesRescheduled(jobID, ns) }, + func(got []string) bool { return len(got) > 0 }, nil, + ), + must.Sprint("should have new allocs after update"), + ) + + // wait for the revert + expected = []string{"complete", "failed", "running", "running", "running"} + must.NoError(t, + e2eutil.WaitForAllocStatusComparison( + func() ([]string, error) { return e2eutil.AllocStatuses(jobID, ns) }, + func(got []string) bool { + sort.Strings(got) + return reflect.DeepEqual(got, expected) + }, nil, + ), + must.Sprint("should have one successful, one failed, and 3 reverted allocs"), + ) + + // at this point the allocs have been checked but we need to wait for the + // deployment to be marked complete before we can assert that it's successful + // and verify the count of deployments + must.NoError(t, + e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), + must.Sprint("most recent deployment should be successful")) + + out, err := e2eutil.Command("nomad", "deployment", "status") + must.NoError(t, err, must.Sprint("could not get deployment status")) + + results, err := e2eutil.ParseColumns(out) + must.NoError(t, err, must.Sprint("could not parse deployment status")) + statuses := map[string]int{} + for _, row := range results { + if row["Job ID"] == jobID { + statuses[row["Status"]]++ + } + } + + must.Eq(t, 1, statuses["failed"], + must.Sprintf("expected only 1 failed deployment, got:\n%s", out)) + must.Eq(t, 2, statuses["successful"], + must.Sprintf("expected 2 successful deployments, got:\n%s", out)) +} + +// TestRescheduling_ProgressDeadline verifies the progress deadline is only +// reset with each healthy allocation, not failed one (which we'll then +// reschedule) +func TestRescheduling_ProgressDeadline(t *testing.T) { + + jobID := "test-reschedule-deadline-" + uuid.Generate()[0:8] + must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_progressdeadline.nomad")) + + cleanupJob(t, jobID) + + expected := []string{"running"} + must.NoError(t, + e2eutil.WaitForAllocStatusExpected(jobID, ns, expected), + must.Sprint("should have a running allocation"), + ) + + var deploymentID string + + deploymentID, err := e2eutil.LastDeploymentID(jobID, ns) + must.NoError(t, err, must.Sprint("couldn't look up deployment")) + + _, oldDeadline := getDeploymentState(t, deploymentID) + + var newStatus string + var newDeadline time.Time + + must.Wait(t, wait.InitialSuccess( + wait.BoolFunc(func() bool { + newStatus, newDeadline = getDeploymentState(t, deploymentID) + return newStatus == "successful" + }), + wait.Timeout(30*time.Second), + wait.Gap(500*time.Millisecond), + ), must.Sprint("deployment should be successful")) + + must.NotEq(t, oldDeadline, newDeadline, + must.Sprint("progress deadline should have been updated")) +} + +// TestRescheduling_ProgressDeadlineFail verifies the progress deadline is only +// reset with each healthy allocation, and this fails the deployment if not +func TestRescheduling_ProgressDeadlineFail(t *testing.T) { + + jobID := "test-reschedule-deadline-fail" + uuid.Generate()[0:8] + must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_progressdeadline_fail.nomad")) + + cleanupJob(t, jobID) + + var deploymentID string + + must.Wait(t, wait.InitialSuccess( + wait.BoolFunc(func() bool { + deploymentID, _ = e2eutil.LastDeploymentID(jobID, ns) + return deploymentID != "" + }), + wait.Timeout(5*time.Second), + wait.Gap(500*time.Millisecond), + ), must.Sprint("deployment not created")) + + _, oldDeadline := getDeploymentState(t, deploymentID) + + var newStatus string + var newDeadline time.Time + + must.Wait(t, wait.InitialSuccess( + wait.BoolFunc(func() bool { + newStatus, newDeadline = getDeploymentState(t, deploymentID) + return newStatus == "failed" + }), + wait.Timeout(30*time.Second), + wait.Gap(500*time.Millisecond), + ), must.Sprint("deployment should be failed")) + + must.Eq(t, oldDeadline, newDeadline, + must.Sprint("progress deadline should not have been updated")) +} + +// getDeploymentState returns the status and progress deadline for the given +// deployment +func getDeploymentState(t *testing.T, deploymentID string) (string, time.Time) { + + out, err := e2eutil.Command("nomad", "deployment", "status", deploymentID) + must.NoError(t, err, must.Sprintf("could not get deployment status from output: %v", out)) + + status, err := e2eutil.GetField(out, "Status") + must.NoError(t, err, must.Sprintf("could not find Status field in output: %v", out)) + + section, err := e2eutil.GetSection(out, "Deployed") + must.NoError(t, err, must.Sprintf("could not find Deployed section in output: %v", out)) + + rows, err := e2eutil.ParseColumns(section) + must.NoError(t, err, must.Sprintf("could not parse Deployed section from output: %v", out)) + + layout := "2006-01-02T15:04:05Z07:00" // taken from command/helpers.go + raw := rows[0]["Progress Deadline"] + deadline, err := time.Parse(layout, raw) + must.NoError(t, err, must.Sprint("could not parse Progress Deadline timestamp")) + return status, deadline +}