e2e: fix and modernize rescheduling test (#19105) (#19107)

The E2E test suite for rescheduling had a few bugs:

* Using the command line to stop a job with a failing deployment returns a non-zero exit code, which would cause an otherwise passing test to fail.
* Two of the input jobs were actually invalid, but were only correctly detected as such because of #17342.

This changeset also updates the whole test suite to move it off the v1 "framework". A few test assertions are also de-flaked.

Fixes: #19076

Co-authored-by: Tim Gross <tgross@hashicorp.com>
2023-11-16 14:52:05 -06:00 · 2023-11-16 14:52:05 -06:00 · 7057c0c886
parent 5f5ed4161e
commit 7057c0c886
10 changed files with 525 additions and 508 deletions
--- a/e2e/e2e_test.go
+++ b/e2e/e2e_test.go
@ -31,7 +31,6 @@ import (
 	_ "github.com/hashicorp/nomad/e2e/podman"
 	_ "github.com/hashicorp/nomad/e2e/quotas"
 	_ "github.com/hashicorp/nomad/e2e/remotetasks"
-	_ "github.com/hashicorp/nomad/e2e/rescheduling"
 	_ "github.com/hashicorp/nomad/e2e/scaling"
 	_ "github.com/hashicorp/nomad/e2e/scalingpolicies"
 	_ "github.com/hashicorp/nomad/e2e/scheduler_sysbatch"
@ -45,6 +44,7 @@ import (
 	_ "github.com/hashicorp/nomad/e2e/disconnectedclients"
 	_ "github.com/hashicorp/nomad/e2e/namespaces"
 	_ "github.com/hashicorp/nomad/e2e/nodedrain"
+	_ "github.com/hashicorp/nomad/e2e/rescheduling"
 	_ "github.com/hashicorp/nomad/e2e/volumes"
 )

--- a/e2e/rescheduling/doc.go
+++ b/e2e/rescheduling/doc.go
@ -0,0 +1,7 @@
+// Copyright (c) HashiCorp, Inc.
+// SPDX-License-Identifier: BUSL-1.1
+
+package rescheduling
+
+// This package contains only tests, so this is a placeholder file to
+// make sure builds don't fail with "no non-test Go files in" errors
--- a/e2e/rescheduling/input/norescheduling_service.nomad
+++ b/e2e/rescheduling/input/norescheduling_service.nomad
--- a/e2e/rescheduling/input/norescheduling_system.nomad
+++ b/e2e/rescheduling/input/norescheduling_system.nomad
--- a/e2e/rescheduling/input/rescheduling_canary_autorevert.nomad
+++ b/e2e/rescheduling/input/rescheduling_canary_autorevert.nomad
@ -30,7 +30,7 @@ job "test" {
      min_healthy_time  = "1s"
      auto_revert       = true
      healthy_deadline  = "2s"
-      progress_deadline = "3s"
+      progress_deadline = "5s"
    }

    restart {
--- a/e2e/rescheduling/input/rescheduling_maxp_autorevert.nomad
+++ b/e2e/rescheduling/input/rescheduling_maxp_autorevert.nomad
@ -29,7 +29,7 @@ job "demo3" {
      min_healthy_time  = "1s"
      auto_revert       = true
      healthy_deadline  = "2s"
-      progress_deadline = "3s"
+      progress_deadline = "5s"
    }

    restart {
--- a/e2e/rescheduling/input/rescheduling_progressdeadline_fail.nomad
+++ b/e2e/rescheduling/input/rescheduling_progressdeadline_fail.nomad
@ -27,7 +27,7 @@ job "demo2" {
    update {
      # we want the first allocation to take a while before we give up on it,
      # so that we can check the deployment's progress deadline before and
-      # after it becomes healthy
+      # after we determine it will never become healthy
      min_healthy_time  = "10s"
      healthy_deadline  = "15s"
      progress_deadline = "20s"
@ -42,7 +42,7 @@ job "demo2" {
    }

    reschedule {
-      unlimited      = "true"
+      unlimited      = true
      delay_function = "constant"
      delay          = "5s"
    }
--- a/e2e/rescheduling/input/rescheduling_success.nomad
+++ b/e2e/rescheduling/input/rescheduling_success.nomad
@ -31,9 +31,10 @@ job "test3" {
    }

    reschedule {
-      attempts  = 2
-      interval  = "5m"
-      unlimited = false
+      delay          = "5s"
+      delay_function = "constant"
+      unlimited      = true
    }
+
  }
 }
--- a/e2e/rescheduling/rescheduling.go
+++ b/e2e/rescheduling/rescheduling.go
@ -1,500 +0,0 @@
-// Copyright (c) HashiCorp, Inc.
-// SPDX-License-Identifier: MPL-2.0
-
-package rescheduling
-
-import (
-	"fmt"
-	"os"
-	"reflect"
-	"sort"
-	"time"
-
-	e2e "github.com/hashicorp/nomad/e2e/e2eutil"
-	"github.com/hashicorp/nomad/e2e/framework"
-	"github.com/hashicorp/nomad/helper/uuid"
-	"github.com/hashicorp/nomad/jobspec"
-	"github.com/hashicorp/nomad/testutil"
-)
-
-const ns = ""
-
-type RescheduleE2ETest struct {
-	framework.TC
-	jobIds []string
-}
-
-func init() {
-	framework.AddSuites(&framework.TestSuite{
-		Component:   "Rescheduling",
-		CanRunLocal: true,
-		Consul:      true,
-		Cases: []framework.TestCase{
-			new(RescheduleE2ETest),
-		},
-	})
-
-}
-
-func (tc *RescheduleE2ETest) BeforeAll(f *framework.F) {
-	e2e.WaitForLeader(f.T(), tc.Nomad())
-	e2e.WaitForNodesReady(f.T(), tc.Nomad(), 1)
-}
-
-func (tc *RescheduleE2ETest) AfterEach(f *framework.F) {
-	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
-		return
-	}
-
-	for _, id := range tc.jobIds {
-		err := e2e.StopJob(id, "-purge")
-		f.Assert().NoError(err)
-	}
-	tc.jobIds = []string{}
-	_, err := e2e.Command("nomad", "system", "gc")
-	f.Assert().NoError(err)
-}
-
-// TestNoReschedule runs a job that should fail and never reschedule
-func (tc *RescheduleE2ETest) TestNoReschedule(f *framework.F) {
-	jobID := "test-no-reschedule-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "rescheduling/input/norescheduling.nomad"))
-	tc.jobIds = append(tc.jobIds, jobID)
-
-	expected := []string{"failed", "failed", "failed"}
-	f.NoError(
-		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
-		"should have exactly 3 failed allocs",
-	)
-}
-
-// TestNoRescheduleSystem runs a system job that should fail and never reschedule
-func (tc *RescheduleE2ETest) TestNoRescheduleSystem(f *framework.F) {
-	jobID := "test-reschedule-system-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_system.nomad"))
-	tc.jobIds = append(tc.jobIds, jobID)
-
-	f.NoError(
-		e2e.WaitForAllocStatusComparison(
-			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
-			func(got []string) bool {
-				for _, status := range got {
-					if status != "failed" {
-						return false
-					}
-				}
-				return true
-			}, nil,
-		),
-		"should have only failed allocs",
-	)
-}
-
-// TestDefaultReschedule runs a job that should reschedule after delay
-func (tc *RescheduleE2ETest) TestDefaultReschedule(f *framework.F) {
-
-	jobID := "test-default-reschedule-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_default.nomad"))
-	tc.jobIds = append(tc.jobIds, jobID)
-
-	expected := []string{"failed", "failed", "failed"}
-	f.NoError(
-		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
-		"should have exactly 3 failed allocs",
-	)
-
-	// TODO(tgross): return early if "slow" isn't set
-	// wait until first exponential delay kicks in and rescheduling is attempted
-	time.Sleep(time.Second * 35)
-	expected = []string{"failed", "failed", "failed", "failed", "failed", "failed"}
-	f.NoError(
-		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
-		"should have exactly 6 failed allocs after 35s",
-	)
-}
-
-// TestRescheduleMaxAttempts runs a job with a maximum reschedule attempts
-func (tc *RescheduleE2ETest) TestRescheduleMaxAttempts(f *framework.F) {
-
-	jobID := "test-reschedule-fail-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_fail.nomad"))
-	tc.jobIds = append(tc.jobIds, jobID)
-
-	expected := []string{"failed", "failed", "failed"}
-	f.NoError(
-		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
-		"should have exactly 3 failed allocs",
-	)
-
-	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_fail.nomad")
-	f.NoError(err)
-	job.ID = &jobID
-	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"}
-	_, _, err = tc.Nomad().Jobs().Register(job, nil)
-	f.NoError(err, "could not register updated job")
-
-	f.NoError(
-		e2e.WaitForAllocStatusComparison(
-			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
-			func(got []string) bool {
-				for _, status := range got {
-					if status == "running" {
-						return true
-					}
-				}
-				return false
-			}, nil,
-		),
-		"should have at least 1 running alloc",
-	)
-}
-
-// TestRescheduleSuccess runs a job that should be running after rescheduling
-func (tc *RescheduleE2ETest) TestRescheduleSuccess(f *framework.F) {
-
-	jobID := "test-reschedule-success-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_success.nomad"))
-	tc.jobIds = append(tc.jobIds, jobID)
-
-	f.NoError(
-		e2e.WaitForAllocStatusComparison(
-			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
-			func(got []string) bool {
-				for _, status := range got {
-					if status == "running" {
-						return true
-					}
-				}
-				return false
-			}, nil,
-		),
-		"should have at least 1 running alloc",
-	)
-}
-
-// TestRescheduleWithUpdate updates a running job to fail, and verifies that
-// it gets rescheduled
-func (tc *RescheduleE2ETest) TestRescheduleWithUpdate(f *framework.F) {
-
-	jobID := "test-reschedule-update-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_update.nomad"))
-	tc.jobIds = append(tc.jobIds, jobID)
-
-	expected := []string{"running", "running", "running"}
-	f.NoError(
-		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
-		"should have exactly 3 running allocs",
-	)
-
-	// reschedule to make fail
-	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_update.nomad")
-	f.NoError(err)
-	job.ID = &jobID
-	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
-	_, _, err = tc.Nomad().Jobs().Register(job, nil)
-	f.NoError(err, "could not register updated job")
-
-	f.NoError(
-		e2e.WaitForAllocStatusComparison(
-			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
-			func(got []string) bool { return len(got) > 0 }, nil,
-		),
-		"should have rescheduled allocs until progress deadline",
-	)
-}
-
-// TestRescheduleWithCanary updates a running job to fail, and verify that the
-// canary gets rescheduled
-func (tc *RescheduleE2ETest) TestRescheduleWithCanary(f *framework.F) {
-
-	jobID := "test-reschedule-canary-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary.nomad"))
-	tc.jobIds = append(tc.jobIds, jobID)
-
-	expected := []string{"running", "running", "running"}
-	f.NoError(
-		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
-		"should have exactly 3 running allocs",
-	)
-
-	f.NoError(
-		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
-		"deployment should be successful")
-
-	// reschedule to make fail
-	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary.nomad")
-	f.NoError(err)
-	job.ID = &jobID
-	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
-	_, _, err = tc.Nomad().Jobs().Register(job, nil)
-	f.NoError(err, "could not register updated job")
-
-	f.NoError(
-		e2e.WaitForAllocStatusComparison(
-			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
-			func(got []string) bool { return len(got) > 0 }, nil,
-		),
-		"should have rescheduled allocs until progress deadline",
-	)
-
-	f.NoError(
-		e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil),
-		"deployment should be running")
-}
-
-// TestRescheduleWithCanaryAutoRevert updates a running job to fail, and
-// verifies that the job gets reverted.
-func (tc *RescheduleE2ETest) TestRescheduleWithCanaryAutoRevert(f *framework.F) {
-
-	jobID := "test-reschedule-canary-revert-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary_autorevert.nomad"))
-	tc.jobIds = append(tc.jobIds, jobID)
-
-	expected := []string{"running", "running", "running"}
-	f.NoError(
-		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
-		"should have exactly 3 running allocs",
-	)
-
-	f.NoError(
-		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
-		"deployment should be successful")
-
-	// reschedule to make fail
-	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary_autorevert.nomad")
-	f.NoError(err)
-	job.ID = &jobID
-	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
-	_, _, err = tc.Nomad().Jobs().Register(job, nil)
-	f.NoError(err, "could not register updated job")
-
-	f.NoError(
-		e2e.WaitForAllocStatusComparison(
-			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
-			func(got []string) bool { return len(got) > 0 }, nil,
-		),
-		"should have new allocs after update",
-	)
-
-	// then we'll fail and revert
-	expected = []string{"failed", "failed", "failed", "running", "running", "running"}
-	f.NoError(
-		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
-		"should have exactly 3 running reverted allocs",
-	)
-
-	f.NoError(
-		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
-		"deployment should be successful")
-}
-
-// TestRescheduleMaxParallel updates a job with a max_parallel config
-func (tc *RescheduleE2ETest) TestRescheduleMaxParallel(f *framework.F) {
-
-	jobID := "test-reschedule-maxp-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp.nomad"))
-	tc.jobIds = append(tc.jobIds, jobID)
-
-	expected := []string{"running", "running", "running"}
-	f.NoError(
-		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
-		"should have exactly 3 running allocs",
-	)
-
-	f.NoError(
-		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
-		"deployment should be successful")
-
-	// reschedule to make fail
-	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp.nomad")
-	f.NoError(err)
-	job.ID = &jobID
-	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
-	_, _, err = tc.Nomad().Jobs().Register(job, nil)
-	f.NoError(err, "could not register updated job")
-
-	expected = []string{"complete", "failed", "failed", "running", "running"}
-
-	f.NoError(
-		e2e.WaitForAllocStatusComparison(
-			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
-			func(got []string) bool {
-				sort.Strings(got)
-				return reflect.DeepEqual(got, expected)
-			}, nil,
-		),
-		"should have failed allocs including rescheduled failed allocs",
-	)
-
-	f.NoError(
-		e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil),
-		"deployment should be running")
-}
-
-// TestRescheduleMaxParallelAutoRevert updates a job with a max_parallel
-// config that will autorevert on failure
-func (tc *RescheduleE2ETest) TestRescheduleMaxParallelAutoRevert(f *framework.F) {
-
-	jobID := "test-reschedule-maxp-revert-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp_autorevert.nomad"))
-	tc.jobIds = append(tc.jobIds, jobID)
-
-	expected := []string{"running", "running", "running"}
-	f.NoError(
-		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
-		"should have exactly 3 running allocs",
-	)
-
-	f.NoError(
-		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
-		"deployment should be successful")
-
-	// reschedule to make fail
-	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp_autorevert.nomad")
-	f.NoError(err)
-	job.ID = &jobID
-	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
-	_, _, err = tc.Nomad().Jobs().Register(job, nil)
-	f.NoError(err, "could not e2e.Register updated job")
-
-	f.NoError(
-		e2e.WaitForAllocStatusComparison(
-			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
-			func(got []string) bool { return len(got) > 0 }, nil,
-		),
-		"should have new allocs after update",
-	)
-
-	// wait for the revert
-	expected = []string{"complete", "failed", "running", "running", "running"}
-	f.NoError(
-		e2e.WaitForAllocStatusComparison(
-			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
-			func(got []string) bool {
-				sort.Strings(got)
-				return reflect.DeepEqual(got, expected)
-			}, nil,
-		),
-		"should have one successful, one failed, and 3 reverted allocs",
-	)
-
-	// at this point the allocs have been checked but we need to wait for the
-	// deployment to be marked complete before we can assert that it's successful
-	// and verify the count of deployments
-	f.NoError(
-		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
-		"most recent deployment should be successful")
-
-	out, err := e2e.Command("nomad", "deployment", "status")
-	f.NoError(err, "could not get deployment status")
-
-	results, err := e2e.ParseColumns(out)
-	f.NoError(err, "could not parse deployment status")
-	statuses := map[string]int{}
-	for _, row := range results {
-		if row["Job ID"] == jobID {
-			statuses[row["Status"]]++
-		}
-	}
-
-	f.Equal(1, statuses["failed"],
-		fmt.Sprintf("expected only 1 failed deployment, got:\n%s", out))
-	f.Equal(2, statuses["successful"],
-		fmt.Sprintf("expected 2 successful deployments, got:\n%s", out))
-}
-
-// TestRescheduleProgressDeadline verifies the progress deadline is reset with
-// each healthy allocation, and that a rescheduled allocation does not.
-func (tc *RescheduleE2ETest) TestRescheduleProgressDeadline(f *framework.F) {
-
-	jobID := "test-reschedule-deadline-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline.nomad"))
-	tc.jobIds = append(tc.jobIds, jobID)
-
-	expected := []string{"running"}
-	f.NoError(
-		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
-		"should have a running allocation",
-	)
-
-	deploymentID, err := e2e.LastDeploymentID(jobID, ns)
-	f.NoError(err, "couldn't look up deployment")
-
-	oldDeadline, err := getProgressDeadline(deploymentID)
-	f.NoError(err, "could not get progress deadline")
-	time.Sleep(time.Second * 20)
-
-	newDeadline, err := getProgressDeadline(deploymentID)
-	f.NoError(err, "could not get new progress deadline")
-	f.NotEqual(oldDeadline, newDeadline, "progress deadline should have been updated")
-
-	f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
-		"deployment should be successful")
-}
-
-// TestRescheduleProgressDeadlineFail verifies the progress deadline is reset with
-// each healthy allocation, and that a rescheduled allocation does not.
-func (tc *RescheduleE2ETest) TestRescheduleProgressDeadlineFail(f *framework.F) {
-
-	jobID := "test-reschedule-deadline-fail" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline_fail.nomad"))
-	tc.jobIds = append(tc.jobIds, jobID)
-
-	testutil.WaitForResult(func() (bool, error) {
-		_, err := e2e.LastDeploymentID(jobID, ns)
-		return err == nil, err
-	}, func(err error) {
-		f.NoError(err, "deployment wasn't created yet")
-	})
-
-	deploymentID, err := e2e.LastDeploymentID(jobID, ns)
-	f.NoError(err, "couldn't look up deployment")
-
-	oldDeadline, err := getProgressDeadline(deploymentID)
-	f.NoError(err, "could not get progress deadline")
-	time.Sleep(time.Second * 20)
-
-	f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "failed", nil),
-		"deployment should be failed")
-
-	f.NoError(
-		e2e.WaitForAllocStatusComparison(
-			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
-			func(got []string) bool {
-				for _, status := range got {
-					if status != "failed" {
-						return false
-					}
-				}
-				return true
-			}, nil,
-		),
-		"should have only failed allocs",
-	)
-
-	newDeadline, err := getProgressDeadline(deploymentID)
-	f.NoError(err, "could not get new progress deadline")
-	f.Equal(oldDeadline, newDeadline, "progress deadline should not have been updated")
-}
-
-func getProgressDeadline(deploymentID string) (time.Time, error) {
-
-	out, err := e2e.Command("nomad", "deployment", "status", deploymentID)
-	if err != nil {
-		return time.Time{}, fmt.Errorf("could not get deployment status: %v\n%v", err, out)
-	}
-
-	section, err := e2e.GetSection(out, "Deployed")
-	if err != nil {
-		return time.Time{}, fmt.Errorf("could not find Deployed section: %w", err)
-	}
-
-	rows, err := e2e.ParseColumns(section)
-	if err != nil {
-		return time.Time{}, fmt.Errorf("could not parse Deployed section: %w", err)
-	}
-
-	layout := "2006-01-02T15:04:05Z07:00" // taken from command/helpers.go
-	raw := rows[0]["Progress Deadline"]
-	return time.Parse(layout, raw)
-}
--- a/e2e/rescheduling/rescheduling_test.go
+++ b/e2e/rescheduling/rescheduling_test.go
@ -0,0 +1,509 @@
+// Copyright (c) HashiCorp, Inc.
+// SPDX-License-Identifier: BUSL-1.1
+
+package rescheduling
+
+import (
+	"os"
+	"reflect"
+	"sort"
+	"testing"
+	"time"
+
+	"github.com/hashicorp/nomad/e2e/e2eutil"
+	"github.com/hashicorp/nomad/helper/uuid"
+	"github.com/hashicorp/nomad/jobspec"
+	"github.com/shoenig/test"
+	"github.com/shoenig/test/must"
+	"github.com/shoenig/test/wait"
+)
+
+const ns = "default"
+
+func cleanupJob(t *testing.T, jobID string) {
+	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
+		return
+	}
+
+	t.Helper()
+	t.Cleanup(func() {
+		e2eutil.StopJob(jobID, "-purge", "-detach")
+		_, err := e2eutil.Command("nomad", "system", "gc")
+		test.NoError(t, err)
+	})
+}
+
+// Note: most of the StopJob calls in this test suite will return an
+// error because the job has previously failed and we're not waiting for
+// the deployment to end
+
+// TestRescheduling_Service_NoReschedule runs a service job that should fail and never
+// reschedule
+func TestRescheduling_Service_NoReschedule(t *testing.T) {
+	jobID := "test-no-reschedule-" + uuid.Generate()[0:8]
+	must.NoError(t, e2eutil.Register(jobID, "./input/norescheduling_service.nomad"))
+
+	cleanupJob(t, jobID)
+
+	expected := []string{"failed", "failed", "failed"}
+	must.NoError(t,
+		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
+		must.Sprint("should have exactly 3 failed allocs"),
+	)
+}
+
+// TestRescheduling_System_NoReschedule runs a system job that should fail and never
+// reschedule
+func TestRescheduling_System_NoReschedule(t *testing.T) {
+	jobID := "test-no-reschedule-" + uuid.Generate()[0:8]
+	must.NoError(t, e2eutil.Register(jobID, "./input/norescheduling_system.nomad"))
+
+	cleanupJob(t, jobID)
+
+	must.NoError(t,
+		e2eutil.WaitForAllocStatusComparison(
+			func() ([]string, error) { return e2eutil.AllocStatuses(jobID, ns) },
+			func(got []string) bool {
+				for _, status := range got {
+					if status != "failed" {
+						return false
+					}
+				}
+				return true
+			}, nil,
+		),
+		must.Sprint("should have only failed allocs"),
+	)
+}
+
+// TestRescheduling_Default runs a job that should reschedule after delay
+func TestRescheduling_Default(t *testing.T) {
+	jobID := "test-default-reschedule-" + uuid.Generate()[0:8]
+	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_default.nomad"))
+
+	cleanupJob(t, jobID)
+
+	expected := []string{"failed", "failed", "failed"}
+	must.NoError(t,
+		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
+		must.Sprint("should have exactly 3 failed allocs"),
+	)
+
+	// wait until first exponential delay kicks in and rescheduling is attempted
+	time.Sleep(time.Second * 35)
+	expected = []string{"failed", "failed", "failed", "failed", "failed", "failed"}
+	must.NoError(t,
+		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
+		must.Sprint("should have exactly 6 failed allocs after 35s"),
+	)
+}
+
+// TestRescheduling_MaxAttempts runs a job with a maximum reschedule attempts
+func TestRescheduling_MaxAttempts(t *testing.T) {
+
+	jobID := "test-reschedule-fail-" + uuid.Generate()[0:8]
+	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_fail.nomad"))
+
+	cleanupJob(t, jobID)
+
+	expected := []string{"failed", "failed", "failed"}
+	must.NoError(t,
+		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
+		must.Sprint("should have exactly 3 failed allocs"),
+	)
+
+	job, err := jobspec.ParseFile("./input/rescheduling_fail.nomad")
+	must.NoError(t, err)
+	job.ID = &jobID
+	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"}
+
+	nc := e2eutil.NomadClient(t)
+	_, _, err = nc.Jobs().Register(job, nil)
+	must.NoError(t, err, must.Sprint("could not register updated job"))
+
+	must.Wait(t, wait.InitialSuccess(
+		wait.BoolFunc(func() bool {
+			got, err := e2eutil.AllocStatuses(jobID, ns)
+			must.NoError(t, err)
+			for _, status := range got {
+				if status == "running" {
+					return true
+				}
+			}
+			return false
+		}),
+		wait.Timeout(10*time.Second),
+		wait.Gap(500*time.Millisecond),
+	), must.Sprint("should have at least 1 running alloc"))
+}
+
+// TestRescheduling_Success runs a job that should be running after rescheduling
+func TestRescheduling_Success(t *testing.T) {
+
+	jobID := "test-reschedule-success-" + uuid.Generate()[0:8]
+	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_success.nomad"))
+
+	cleanupJob(t, jobID)
+
+	must.Wait(t, wait.InitialSuccess(
+		wait.BoolFunc(func() bool {
+			got, err := e2eutil.AllocStatuses(jobID, ns)
+			must.NoError(t, err)
+			running := 0
+			for _, status := range got {
+				if status == "running" {
+					running++
+				}
+			}
+			return running == 3
+		}),
+		wait.Timeout(60*time.Second), // this can take a while!
+		wait.Gap(500*time.Millisecond),
+	), must.Sprint("all 3 allocs should eventually be running"))
+}
+
+// TestRescheduling_WithUpdate updates a running job to fail, and verifies that
+// it gets rescheduled
+func TestRescheduling_WithUpdate(t *testing.T) {
+
+	jobID := "test-reschedule-update-" + uuid.Generate()[0:8]
+	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_update.nomad"))
+
+	cleanupJob(t, jobID)
+
+	expected := []string{"running", "running", "running"}
+	must.NoError(t,
+		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
+		must.Sprint("should have exactly 3 running allocs"),
+	)
+
+	// reschedule to make fail
+	job, err := jobspec.ParseFile("./input/rescheduling_update.nomad")
+	must.NoError(t, err)
+	job.ID = &jobID
+	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
+
+	nc := e2eutil.NomadClient(t)
+	_, _, err = nc.Jobs().Register(job, nil)
+	must.NoError(t, err, must.Sprint("could not register updated job"))
+
+	must.NoError(t,
+		e2eutil.WaitForAllocStatusComparison(
+			func() ([]string, error) { return e2eutil.AllocStatusesRescheduled(jobID, ns) },
+			func(got []string) bool { return len(got) > 0 }, nil,
+		),
+		must.Sprint("should have rescheduled allocs until progress deadline"),
+	)
+}
+
+// TestRescheduling_WithCanary updates a running job to fail, and verify that the
+// canary gets rescheduled
+func TestRescheduling_WithCanary(t *testing.T) {
+
+	jobID := "test-reschedule-canary-" + uuid.Generate()[0:8]
+	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_canary.nomad"))
+
+	cleanupJob(t, jobID)
+
+	expected := []string{"running", "running", "running"}
+	must.NoError(t,
+		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
+		must.Sprint("should have exactly 3 running allocs"),
+	)
+
+	must.NoError(t,
+		e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
+		must.Sprint("deployment should be successful"))
+
+	// reschedule to make fail
+	job, err := jobspec.ParseFile("./input/rescheduling_canary.nomad")
+	must.NoError(t, err)
+	job.ID = &jobID
+	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
+
+	nc := e2eutil.NomadClient(t)
+	_, _, err = nc.Jobs().Register(job, nil)
+	must.NoError(t, err, must.Sprint("could not register updated job"))
+
+	must.NoError(t,
+		e2eutil.WaitForAllocStatusComparison(
+			func() ([]string, error) { return e2eutil.AllocStatusesRescheduled(jobID, ns) },
+			func(got []string) bool { return len(got) > 0 }, nil,
+		),
+		must.Sprint("should have rescheduled allocs until progress deadline"),
+	)
+
+	must.NoError(t,
+		e2eutil.WaitForLastDeploymentStatus(jobID, ns, "running", nil),
+		must.Sprint("deployment should be running"))
+}
+
+// TestRescheduling_WithCanaryAutoRevert updates a running job to fail, and
+// verifies that the job gets reverted.
+func TestRescheduling_WithCanaryAutoRevert(t *testing.T) {
+
+	jobID := "test-reschedule-canary-revert-" + uuid.Generate()[0:8]
+	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_canary_autorevert.nomad"))
+
+	cleanupJob(t, jobID)
+
+	expected := []string{"running", "running", "running"}
+	must.NoError(t,
+		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
+		must.Sprint("should have exactly 3 running allocs"),
+	)
+
+	must.NoError(t,
+		e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
+		must.Sprint("deployment should be successful"))
+
+	// reschedule to make fail
+	job, err := jobspec.ParseFile("./input/rescheduling_canary_autorevert.nomad")
+	must.NoError(t, err)
+	job.ID = &jobID
+	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
+
+	nc := e2eutil.NomadClient(t)
+	_, _, err = nc.Jobs().Register(job, nil)
+	must.NoError(t, err, must.Sprint("could not register updated job"))
+
+	must.NoError(t,
+		e2eutil.WaitForAllocStatusComparison(
+			func() ([]string, error) { return e2eutil.AllocStatusesRescheduled(jobID, ns) },
+			func(got []string) bool { return len(got) > 0 }, nil,
+		),
+		must.Sprint("should have new allocs after update"),
+	)
+
+	// then we'll fail and revert
+	expected = []string{"failed", "failed", "failed", "running", "running", "running"}
+	must.NoError(t,
+		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
+		must.Sprint("should have exactly 3 running reverted allocs"),
+	)
+
+	must.NoError(t,
+		e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
+		must.Sprint("deployment should be successful"))
+}
+
+// TestRescheduling_MaxParallel updates a job with a max_parallel config
+func TestRescheduling_MaxParallel(t *testing.T) {
+
+	jobID := "test-reschedule-maxp-" + uuid.Generate()[0:8]
+	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_maxp.nomad"))
+
+	cleanupJob(t, jobID)
+
+	expected := []string{"running", "running", "running"}
+	must.NoError(t,
+		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
+		must.Sprint("should have exactly 3 running allocs"),
+	)
+
+	must.NoError(t,
+		e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
+		must.Sprint("deployment should be successful"))
+
+	// reschedule to make fail
+	job, err := jobspec.ParseFile("./input/rescheduling_maxp.nomad")
+	must.NoError(t, err)
+	job.ID = &jobID
+	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
+
+	nc := e2eutil.NomadClient(t)
+	_, _, err = nc.Jobs().Register(job, nil)
+	must.NoError(t, err, must.Sprint("could not register updated job"))
+
+	expected = []string{"complete", "failed", "failed", "running", "running"}
+
+	must.NoError(t,
+		e2eutil.WaitForAllocStatusComparison(
+			func() ([]string, error) { return e2eutil.AllocStatuses(jobID, ns) },
+			func(got []string) bool {
+				sort.Strings(got)
+				return reflect.DeepEqual(got, expected)
+			}, nil,
+		),
+		must.Sprint("should have failed allocs including rescheduled failed allocs"),
+	)
+
+	must.NoError(t,
+		e2eutil.WaitForLastDeploymentStatus(jobID, ns, "running", nil),
+		must.Sprint("deployment should be running"))
+}
+
+// TestRescheduling_MaxParallelAutoRevert updates a job with a max_parallel
+// config that will autorevert on failure.
+//
+// The test registers the job and waits for 3 running allocs and a successful
+// deployment, then re-registers the same job with different task args. It
+// waits for rescheduled allocs to appear, for the alloc statuses to settle,
+// and for the latest deployment to be successful, and finally asserts that
+// the job has exactly 1 failed and 2 successful deployments.
+func TestRescheduling_MaxParallelAutoRevert(t *testing.T) {
+
+	jobID := "test-reschedule-maxp-revert-" + uuid.Generate()[0:8]
+	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_maxp_autorevert.nomad"))
+
+	cleanupJob(t, jobID)
+
+	expected := []string{"running", "running", "running"}
+	must.NoError(t,
+		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
+		must.Sprint("should have exactly 3 running allocs"),
+	)
+
+	must.NoError(t,
+		e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
+		must.Sprint("deployment should be successful"))
+
+	// update the job under the same ID with new task args so that the new
+	// deployment fails
+	// NOTE(review): presumably "-c lol" makes the task's command exit with an
+	// error -- confirm against the jobspec
+	job, err := jobspec.ParseFile("./input/rescheduling_maxp_autorevert.nomad")
+	must.NoError(t, err)
+	job.ID = &jobID
+	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
+
+	nc := e2eutil.NomadClient(t)
+	_, _, err = nc.Jobs().Register(job, nil)
+	must.NoError(t, err, must.Sprint("could not register updated job"))
+
+	// wait until at least one status is reported for rescheduled allocs
+	must.NoError(t,
+		e2eutil.WaitForAllocStatusComparison(
+			func() ([]string, error) { return e2eutil.AllocStatusesRescheduled(jobID, ns) },
+			func(got []string) bool { return len(got) > 0 }, nil,
+		),
+		must.Sprint("should have new allocs after update"),
+	)
+
+	// wait for the revert; expected is listed in sorted order because the
+	// observed statuses are sorted before being compared
+	expected = []string{"complete", "failed", "running", "running", "running"}
+	must.NoError(t,
+		e2eutil.WaitForAllocStatusComparison(
+			func() ([]string, error) { return e2eutil.AllocStatuses(jobID, ns) },
+			func(got []string) bool {
+				sort.Strings(got)
+				return reflect.DeepEqual(got, expected)
+			}, nil,
+		),
+		must.Sprint("should have one successful, one failed, and 3 reverted allocs"),
+	)
+
+	// at this point the allocs have been checked but we need to wait for the
+	// deployment to be marked complete before we can assert that it's successful
+	// and verify the count of deployments
+	must.NoError(t,
+		e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
+		must.Sprint("most recent deployment should be successful"))
+
+	// run the status command without a deployment ID and tally the statuses
+	// of the rows that belong to this job
+	out, err := e2eutil.Command("nomad", "deployment", "status")
+	must.NoError(t, err, must.Sprint("could not get deployment status"))
+
+	results, err := e2eutil.ParseColumns(out)
+	must.NoError(t, err, must.Sprint("could not parse deployment status"))
+	statuses := map[string]int{}
+	for _, row := range results {
+		if row["Job ID"] == jobID {
+			statuses[row["Status"]]++
+		}
+	}
+
+	must.Eq(t, 1, statuses["failed"],
+		must.Sprintf("expected only 1 failed deployment, got:\n%s", out))
+	must.Eq(t, 2, statuses["successful"],
+		must.Sprintf("expected 2 successful deployments, got:\n%s", out))
+}
+
+// TestRescheduling_ProgressDeadline checks that a deployment's progress
+// deadline moves once an allocation becomes healthy: the deadline read right
+// after the alloc is running must differ from the deadline read when the
+// deployment is reported successful.
+func TestRescheduling_ProgressDeadline(t *testing.T) {
+	id := "test-reschedule-deadline-" + uuid.Generate()[0:8]
+	regErr := e2eutil.Register(id, "./input/rescheduling_progressdeadline.nomad")
+	must.NoError(t, regErr)
+
+	cleanupJob(t, id)
+
+	waitErr := e2eutil.WaitForAllocStatusExpected(id, ns, []string{"running"})
+	must.NoError(t, waitErr, must.Sprint("should have a running allocation"))
+
+	deployID, err := e2eutil.LastDeploymentID(id, ns)
+	must.NoError(t, err, must.Sprint("couldn't look up deployment"))
+
+	// snapshot the deadline before the deployment has been seen as successful
+	_, before := getDeploymentState(t, deployID)
+
+	// each poll refreshes the deadline, so after the wait it holds the value
+	// from the poll that observed the "successful" status
+	var after time.Time
+	isSuccessful := func() bool {
+		var status string
+		status, after = getDeploymentState(t, deployID)
+		return status == "successful"
+	}
+
+	must.Wait(t, wait.InitialSuccess(
+		wait.BoolFunc(isSuccessful),
+		wait.Timeout(30*time.Second),
+		wait.Gap(500*time.Millisecond),
+	), must.Sprint("deployment should be successful"))
+
+	must.NotEq(t, before, after,
+		must.Sprint("progress deadline should have been updated"))
+}
+
+// TestRescheduling_ProgressDeadlineFail verifies the progress deadline is only
+// reset with each healthy allocation, and this fails the deployment if not.
+//
+// It registers the job, waits for its deployment to exist, records the
+// deployment's progress deadline, and then waits for the deployment to be
+// marked failed, asserting that the deadline is unchanged at that point.
+func TestRescheduling_ProgressDeadlineFail(t *testing.T) {
+
+	// the trailing "-" keeps the readable prefix separate from the random
+	// suffix, matching the job IDs used by the other tests in this file
+	jobID := "test-reschedule-deadline-fail-" + uuid.Generate()[0:8]
+	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_progressdeadline_fail.nomad"))
+
+	cleanupJob(t, jobID)
+
+	var deploymentID string
+
+	must.Wait(t, wait.InitialSuccess(
+		wait.BoolFunc(func() bool {
+			// the lookup error is deliberately ignored: a failed lookup just
+			// leaves the ID empty and the poll retries until the timeout
+			deploymentID, _ = e2eutil.LastDeploymentID(jobID, ns)
+			return deploymentID != ""
+		}),
+		wait.Timeout(5*time.Second),
+		wait.Gap(500*time.Millisecond),
+	), must.Sprint("deployment not created"))
+
+	// deadline as first observed, before the deployment is seen as failed
+	_, oldDeadline := getDeploymentState(t, deploymentID)
+
+	var newStatus string
+	var newDeadline time.Time
+
+	// newDeadline is overwritten on every poll, so after the wait it holds the
+	// deadline reported alongside the "failed" status
+	must.Wait(t, wait.InitialSuccess(
+		wait.BoolFunc(func() bool {
+			newStatus, newDeadline = getDeploymentState(t, deploymentID)
+			return newStatus == "failed"
+		}),
+		wait.Timeout(30*time.Second),
+		wait.Gap(500*time.Millisecond),
+	), must.Sprint("deployment should be failed"))
+
+	must.Eq(t, oldDeadline, newDeadline,
+		must.Sprint("progress deadline should not have been updated"))
+}
+
+// getDeploymentState returns the status and progress deadline for the given
+// deployment.
+//
+// It runs "nomad deployment status <deploymentID>", reads the "Status" field,
+// and parses the "Progress Deadline" column from the first row of the
+// "Deployed" section. Any command or parsing failure fails the test via must.
+func getDeploymentState(t *testing.T, deploymentID string) (string, time.Time) {
+	t.Helper()
+
+	out, err := e2eutil.Command("nomad", "deployment", "status", deploymentID)
+	must.NoError(t, err, must.Sprintf("could not get deployment status from output: %v", out))
+
+	status, err := e2eutil.GetField(out, "Status")
+	must.NoError(t, err, must.Sprintf("could not find Status field in output: %v", out))
+
+	section, err := e2eutil.GetSection(out, "Deployed")
+	must.NoError(t, err, must.Sprintf("could not find Deployed section in output: %v", out))
+
+	rows, err := e2eutil.ParseColumns(section)
+	must.NoError(t, err, must.Sprintf("could not parse Deployed section from output: %v", out))
+
+	// guard the rows[0] access below so an empty table fails the test with the
+	// command output instead of panicking on an out-of-range index
+	must.SliceNotEmpty(t, rows, must.Sprintf("no rows in Deployed section of output: %v", out))
+
+	// time.RFC3339 is "2006-01-02T15:04:05Z07:00", the layout taken from
+	// command/helpers.go
+	raw := rows[0]["Progress Deadline"]
+	deadline, err := time.Parse(time.RFC3339, raw)
+	must.NoError(t, err, must.Sprint("could not parse Progress Deadline timestamp"))
+	return status, deadline
+}