package rescheduling import ( "fmt" "os" "reflect" "sort" "time" e2e "github.com/hashicorp/nomad/e2e/e2eutil" "github.com/hashicorp/nomad/e2e/framework" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/jobspec" ) const ns = "" type RescheduleE2ETest struct { framework.TC jobIds []string } func init() { framework.AddSuites(&framework.TestSuite{ Component: "Rescheduling", CanRunLocal: true, Consul: true, Cases: []framework.TestCase{ new(RescheduleE2ETest), }, }) } func (tc *RescheduleE2ETest) BeforeAll(f *framework.F) { e2e.WaitForLeader(f.T(), tc.Nomad()) e2e.WaitForNodesReady(f.T(), tc.Nomad(), 1) } func (tc *RescheduleE2ETest) AfterEach(f *framework.F) { if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" { return } for _, id := range tc.jobIds { _, err := e2e.Command("nomad", "job", "stop", "-purge", id) f.Assert().NoError(err) } tc.jobIds = []string{} _, err := e2e.Command("nomad", "system", "gc") f.Assert().NoError(err) } // TestNoReschedule runs a job that should fail and never reschedule func (tc *RescheduleE2ETest) TestNoReschedule(f *framework.F) { jobID := "test-no-reschedule-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/norescheduling.nomad")) tc.jobIds = append(tc.jobIds, jobID) expected := []string{"failed", "failed", "failed"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, ns, expected), "should have exactly 3 failed allocs", ) } // TestNoRescheduleSystem runs a system job that should fail and never reschedule func (tc *RescheduleE2ETest) TestNoRescheduleSystem(f *framework.F) { jobID := "test-reschedule-system-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_system.nomad")) tc.jobIds = append(tc.jobIds, jobID) f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, func(got []string) bool { for _, status := range got { if status != "failed" { return false } } return true }, nil, ), "should have only failed allocs", ) } // TestDefaultReschedule runs a job that should reschedule after delay func (tc *RescheduleE2ETest) TestDefaultReschedule(f *framework.F) { jobID := "test-default-reschedule-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_default.nomad")) tc.jobIds = append(tc.jobIds, jobID) expected := []string{"failed", "failed", "failed"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, ns, expected), "should have exactly 3 failed allocs", ) // TODO(tgross): return early if "slow" isn't set // wait until first exponential delay kicks in and rescheduling is attempted time.Sleep(time.Second * 35) expected = []string{"failed", "failed", "failed", "failed", "failed", "failed"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, ns, expected), "should have exactly 6 failed allocs after 35s", ) } // TestRescheduleMaxAttempts runs a job with a maximum reschedule attempts func (tc *RescheduleE2ETest) TestRescheduleMaxAttempts(f *framework.F) { jobID := "test-reschedule-fail-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_fail.nomad")) tc.jobIds = append(tc.jobIds, jobID) expected := []string{"failed", "failed", "failed"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, ns, expected), "should have exactly 3 failed allocs", ) job, err := jobspec.ParseFile("rescheduling/input/rescheduling_fail.nomad") f.NoError(err) job.ID = &jobID job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"} _, _, err = tc.Nomad().Jobs().Register(job, nil) f.NoError(err, "could not register updated job") f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, func(got []string) bool { for _, status := range got { if status == "running" { return true } } return false }, nil, ), "should have at least 1 running alloc", ) } // TestRescheduleSuccess runs a job that should be running after rescheduling func (tc *RescheduleE2ETest) TestRescheduleSuccess(f *framework.F) { jobID := "test-reschedule-success-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_success.nomad")) tc.jobIds = append(tc.jobIds, jobID) f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, func(got []string) bool { for _, status := range got { if status == "running" { return true } } return false }, nil, ), "should have at least 1 running alloc", ) } // TestRescheduleWithUpdate updates a running job to fail, and verifies that // it gets rescheduled func (tc *RescheduleE2ETest) TestRescheduleWithUpdate(f *framework.F) { jobID := "test-reschedule-update-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_update.nomad")) tc.jobIds = append(tc.jobIds, jobID) expected := []string{"running", "running", "running"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, ns, expected), "should have exactly 3 running allocs", ) // reschedule to make fail job, err := jobspec.ParseFile("rescheduling/input/rescheduling_update.nomad") f.NoError(err) job.ID = &jobID job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} _, _, err = tc.Nomad().Jobs().Register(job, nil) f.NoError(err, "could not register updated job") f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, func(got []string) bool { return len(got) > 0 }, nil, ), "should have rescheduled allocs until progress deadline", ) } // TestRescheduleWithCanary updates a running job to fail, and verify that the // canary gets rescheduled func (tc *RescheduleE2ETest) TestRescheduleWithCanary(f *framework.F) { jobID := "test-reschedule-canary-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary.nomad")) tc.jobIds = append(tc.jobIds, jobID) expected := []string{"running", "running", "running"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, ns, expected), "should have exactly 3 running allocs", ) f.NoError( e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), "deployment should be successful") // reschedule to make fail job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary.nomad") f.NoError(err) job.ID = &jobID job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} _, _, err = tc.Nomad().Jobs().Register(job, nil) f.NoError(err, "could not register updated job") f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, func(got []string) bool { return len(got) > 0 }, nil, ), "should have rescheduled allocs until progress deadline", ) f.NoError( e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil), "deployment should be running") } // TestRescheduleWithCanary updates a running job to fail, and verifies that // the job gets reverted func (tc *RescheduleE2ETest) TestRescheduleWithCanaryAutoRevert(f *framework.F) { jobID := "test-reschedule-canary-revert-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary_autorevert.nomad")) tc.jobIds = append(tc.jobIds, jobID) expected := []string{"running", "running", "running"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, ns, expected), "should have exactly 3 running allocs", ) f.NoError( e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), "deployment should be successful") // reschedule to make fail job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary_autorevert.nomad") f.NoError(err) job.ID = &jobID job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} _, _, err = tc.Nomad().Jobs().Register(job, nil) f.NoError(err, "could not register updated job") f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, func(got []string) bool { return len(got) > 0 }, nil, ), "should have new allocs after update", ) // then we'll fail and revert expected = []string{"failed", "failed", "failed", "running", "running", "running"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, ns, expected), "should have exactly 3 running reverted allocs", ) f.NoError( e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), "deployment should be successful") } // TestRescheduleMaxParallel updates a job with a max_parallel config func (tc *RescheduleE2ETest) TestRescheduleMaxParallel(f *framework.F) { jobID := "test-reschedule-maxp-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp.nomad")) tc.jobIds = append(tc.jobIds, jobID) expected := []string{"running", "running", "running"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, ns, expected), "should have exactly 3 running allocs", ) f.NoError( e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), "deployment should be successful") // reschedule to make fail job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp.nomad") f.NoError(err) job.ID = &jobID job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} _, _, err = tc.Nomad().Jobs().Register(job, nil) f.NoError(err, "could not register updated job") expected = []string{"complete", "failed", "failed", "running", "running"} f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, func(got []string) bool { sort.Strings(got) return reflect.DeepEqual(got, expected) }, nil, ), "should have failed allocs including rescheduled failed allocs", ) f.NoError( e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil), "deployment should be running") } // TestRescheduleMaxParallelAutoRevert updates a job with a max_parallel // config that will autorevert on failure func (tc *RescheduleE2ETest) TestRescheduleMaxParallelAutoRevert(f *framework.F) { jobID := "test-reschedule-maxp-revert-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp_autorevert.nomad")) tc.jobIds = append(tc.jobIds, jobID) expected := []string{"running", "running", "running"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, ns, expected), "should have exactly 3 running allocs", ) f.NoError( e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), "deployment should be successful") // reschedule to make fail job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp_autorevert.nomad") f.NoError(err) job.ID = &jobID job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} _, _, err = tc.Nomad().Jobs().Register(job, nil) f.NoError(err, "could not e2e.Register updated job") f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, func(got []string) bool { return len(got) > 0 }, nil, ), "should have new allocs after update", ) // wait for the revert expected = []string{"complete", "failed", "running", "running", "running"} f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, func(got []string) bool { sort.Strings(got) return reflect.DeepEqual(got, expected) }, nil, ), "should have one successful, one failed, and 3 reverted allocs", ) // at this point the allocs have been checked but we need to wait for the // deployment to be marked complete before we can assert that it's successful // and verify the count of deployments f.NoError( e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), "most recent deployment should be successful") out, err := e2e.Command("nomad", "deployment", "status") f.NoError(err, "could not get deployment status") results, err := e2e.ParseColumns(out) f.NoError(err, "could not parse deployment status") statuses := map[string]int{} for _, row := range results { if row["Job ID"] == jobID { statuses[row["Status"]]++ } } f.Equal(1, statuses["failed"], fmt.Sprintf("expected only 1 failed deployment, got:\n%s", out)) f.Equal(2, statuses["successful"], fmt.Sprintf("expected 2 successful deployments, got:\n%s", out)) } // TestRescheduleProgressDeadline verifies the progress deadline is reset with // each healthy allocation, and that a rescheduled allocation does not. func (tc *RescheduleE2ETest) TestRescheduleProgressDeadline(f *framework.F) { jobID := "test-reschedule-deadline-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline.nomad")) tc.jobIds = append(tc.jobIds, jobID) expected := []string{"running"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, ns, expected), "should have a running allocation", ) deploymentID, err := e2e.LastDeploymentID(jobID, ns) f.NoError(err, "couldn't look up deployment") oldDeadline, err := getProgressDeadline(deploymentID) f.NoError(err, "could not get progress deadline") time.Sleep(time.Second * 20) newDeadline, err := getProgressDeadline(deploymentID) f.NoError(err, "could not get new progress deadline") f.NotEqual(oldDeadline, newDeadline, "progress deadline should have been updated") f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), "deployment should be successful") } // TestRescheduleProgressDeadlineFail verifies the progress deadline is reset with // each healthy allocation, and that a rescheduled allocation does not. func (tc *RescheduleE2ETest) TestRescheduleProgressDeadlineFail(f *framework.F) { jobID := "test-reschedule-deadline-fail" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline_fail.nomad")) tc.jobIds = append(tc.jobIds, jobID) deploymentID, err := e2e.LastDeploymentID(jobID, ns) f.NoError(err, "couldn't look up deployment") oldDeadline, err := getProgressDeadline(deploymentID) f.NoError(err, "could not get progress deadline") time.Sleep(time.Second * 20) f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "failed", nil), "deployment should be failed") f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, func(got []string) bool { for _, status := range got { if status != "failed" { return false } } return true }, nil, ), "should have only failed allocs", ) newDeadline, err := getProgressDeadline(deploymentID) f.NoError(err, "could not get new progress deadline") f.Equal(oldDeadline, newDeadline, "progress deadline should not have been updated") } func getProgressDeadline(deploymentID string) (time.Time, error) { out, err := e2e.Command("nomad", "deployment", "status", deploymentID) if err != nil { return time.Time{}, fmt.Errorf("could not get deployment status: %v\n%v", err, out) } section, err := e2e.GetSection(out, "Deployed") if err != nil { return time.Time{}, fmt.Errorf("could not find Deployed section: %w", err) } rows, err := e2e.ParseColumns(section) if err != nil { return time.Time{}, fmt.Errorf("could not parse Deployed section: %w", err) } layout := "2006-01-02T15:04:05Z07:00" // taken from command/helpers.go raw := rows[0]["Progress Deadline"] return time.Parse(layout, raw) }