e2e: rescheduling tests
Ports the rescheduling tests (which aren't running in CI) into the current test framework so that they run nightly, and exercises the new CLI helpers.
Parent: 28e9bbbbf4
Commit: 294c7149a2
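The "CLI helpers" mentioned in the commit message are the e2eutil functions used throughout the new helpers.go below: shell out to the nomad binary, pull one section out of its human-readable output, and parse that section into rows. As a minimal sketch of that pattern (illustrative only, not part of this diff — it condenses the allocStatuses helper added below; the function name clientStatuses is hypothetical):

package rescheduling

import (
    "github.com/hashicorp/nomad/e2e/e2eutil"
    "github.com/hashicorp/nomad/e2e/framework"
)

// clientStatuses shows the CLI-driven pattern: run a nomad command,
// extract the "Allocations" section of its output, and parse the
// section's columns into maps keyed by the column headers.
func clientStatuses(f *framework.F, jobID string) []string {
    out, err := e2eutil.Command("nomad", "job", "status", "-verbose", jobID)
    f.NoError(err, "nomad job status failed")

    section, err := e2eutil.GetSection(out, "Allocations")
    f.NoError(err, "could not find Allocations section")

    rows, err := e2eutil.ParseColumns(section)
    f.NoError(err, "could not parse Allocations section")

    statuses := []string{}
    for _, row := range rows {
        statuses = append(statuses, row["Status"])
    }
    return statuses
}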
@@ -22,6 +22,7 @@ import (
     _ "github.com/hashicorp/nomad/e2e/nomad09upgrade"
     _ "github.com/hashicorp/nomad/e2e/nomadexec"
     _ "github.com/hashicorp/nomad/e2e/podman"
+    _ "github.com/hashicorp/nomad/e2e/rescheduling"
     _ "github.com/hashicorp/nomad/e2e/spread"
     _ "github.com/hashicorp/nomad/e2e/systemsched"
     _ "github.com/hashicorp/nomad/e2e/taskevents"
e2e/rescheduling/helpers.go (new file, 134 lines)
@@ -0,0 +1,134 @@
package rescheduling

import (
    "encoding/json"
    "fmt"
    "io"
    "io/ioutil"
    "os/exec"
    "regexp"
    "strings"
    "time"

    "github.com/hashicorp/nomad/api"
    "github.com/hashicorp/nomad/e2e/e2eutil"
    "github.com/hashicorp/nomad/e2e/framework"
    "github.com/hashicorp/nomad/testutil"
)

// allocStatuses returns a slice of client statuses
func allocStatuses(f *framework.F, jobID string) []string {

    out, err := e2eutil.Command("nomad", "job", "status", "-verbose", "-all-allocs", jobID)
    f.NoError(err, "nomad job status failed", err)
    section, err := e2eutil.GetSection(out, "Allocations")
    f.NoError(err, "could not find Allocations section", err)

    allocs, err := e2eutil.ParseColumns(section)
    f.NoError(err, "could not parse Allocations section", err)

    statuses := []string{}
    for _, alloc := range allocs {
        statuses = append(statuses, alloc["Status"])
    }

    return statuses
}

// allocStatusesRescheduled is a helper function that pulls
// out client statuses only from rescheduled allocs.
func allocStatusesRescheduled(f *framework.F, jobID string) []string {

    out, err := e2eutil.Command("nomad", "job", "status", "-verbose", jobID)
    f.NoError(err, "nomad job status failed", err)
    section, err := e2eutil.GetSection(out, "Allocations")
    f.NoError(err, "could not find Allocations section", err)

    allocs, err := e2eutil.ParseColumns(section)
    f.NoError(err, "could not parse Allocations section", err)

    statuses := []string{}
    for _, alloc := range allocs {

        allocID := alloc["ID"]

        // reschedule tracker isn't exposed in the normal CLI output
        out, err := e2eutil.Command("nomad", "alloc", "status", "-json", allocID)
        f.NoError(err, "nomad alloc status failed", err)

        dec := json.NewDecoder(strings.NewReader(out))
        alloc := &api.Allocation{}
        err = dec.Decode(alloc)
        f.NoError(err, "could not decode alloc status JSON: %w", err)

        if (alloc.RescheduleTracker != nil &&
            len(alloc.RescheduleTracker.Events) > 0) || alloc.FollowupEvalID != "" {
            statuses = append(statuses, alloc.ClientStatus)
        }
    }
    return statuses
}

// register is a helper that registers a jobspec with a unique ID
// and records that ID in the testcase for later cleanup
func register(f *framework.F, jobFile, jobID string) {

    cmd := exec.Command("nomad", "job", "run", "-")
    stdin, err := cmd.StdinPipe()
    f.NoError(err, fmt.Sprintf("could not open stdin?: %v", err))

    content, err := ioutil.ReadFile(jobFile)
    f.NoError(err, fmt.Sprintf("could not open job file: %v", err))

    // hack off the first line to replace with our unique ID
    var re = regexp.MustCompile(`^job "\w+" \{`)
    jobspec := re.ReplaceAllString(string(content),
        fmt.Sprintf("job \"%s\" {", jobID))

    go func() {
        defer stdin.Close()
        io.WriteString(stdin, jobspec)
    }()

    out, err := cmd.CombinedOutput()
    f.NoError(err, "could not register job: %v\n%v", err, string(out))
}

func waitForAllocStatusComparison(query func() ([]string, error), comparison func([]string) bool) error {
    var got []string
    var err error
    testutil.WaitForResultRetries(30, func() (bool, error) {
        time.Sleep(time.Millisecond * 100)
        got, err = query()
        if err != nil {
            return false, err
        }
        return comparison(got), nil
    }, func(e error) {
        err = fmt.Errorf("alloc status check failed: got %#v", got)
    })
    return err
}

func waitForLastDeploymentStatus(f *framework.F, jobID, status string) error {
    var got string
    var err error
    testutil.WaitForResultRetries(30, func() (bool, error) {
        time.Sleep(time.Millisecond * 100)

        out, err := e2eutil.Command("nomad", "job", "status", jobID)
        f.NoError(err, "could not get job status: %v\n%v", err, out)

        section, err := e2eutil.GetSection(out, "Latest Deployment")
        f.NoError(err, "could not find Latest Deployment section", err)

        fields, err := e2eutil.ParseFields(section)
        f.NoError(err, "could not parse Latest Deployment section", err)

        got = fields["Status"]
        return got == status, nil
    }, func(e error) {
        err = fmt.Errorf("deployment status check failed: got %#v", got)
    })
    return err
}
e2e/rescheduling/rescheduling.go (new file, 388 lines)
@@ -0,0 +1,388 @@
package rescheduling

import (
    "fmt"
    "os"
    "reflect"
    "sort"
    "time"

    "github.com/hashicorp/nomad/e2e/e2eutil"
    "github.com/hashicorp/nomad/e2e/framework"
    "github.com/hashicorp/nomad/helper/uuid"
    "github.com/hashicorp/nomad/jobspec"
)

type RescheduleE2ETest struct {
    framework.TC
    jobIds []string
}

func init() {
    framework.AddSuites(&framework.TestSuite{
        Component:   "Rescheduling",
        CanRunLocal: true,
        Consul:      true,
        Cases: []framework.TestCase{
            new(RescheduleE2ETest),
        },
    })

}

func (tc *RescheduleE2ETest) BeforeAll(f *framework.F) {
    e2eutil.WaitForLeader(f.T(), tc.Nomad())
    e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 1)
}

func (tc *RescheduleE2ETest) AfterEach(f *framework.F) {
    if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
        return
    }

    for _, id := range tc.jobIds {
        _, err := e2eutil.Command("nomad", "job", "stop", "-purge", id)
        f.NoError(err)
    }
    tc.jobIds = []string{}
    _, err := e2eutil.Command("nomad", "system", "gc")
    f.NoError(err)
}

// TestNoReschedule runs a job that should fail and never reschedule
func (tc *RescheduleE2ETest) TestNoReschedule(f *framework.F) {
    jobID := "test-no-reschedule" + uuid.Generate()[0:8]
    register(f, "rescheduling/input/norescheduling.nomad", jobID)
    tc.jobIds = append(tc.jobIds, jobID)

    expected := []string{"failed", "failed", "failed"}
    err := waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatuses(f, jobID), nil },
        func(got []string) bool { return reflect.DeepEqual(got, expected) },
    )
    f.NoError(err, "should have exactly 3 failed allocs")
}

// TestNoRescheduleSystem runs a system job that should fail and never reschedule
func (tc *RescheduleE2ETest) TestNoRescheduleSystem(f *framework.F) {
    jobID := "test-reschedule-system" + uuid.Generate()[0:8]
    register(f, "rescheduling/input/rescheduling_system.nomad", jobID)
    tc.jobIds = append(tc.jobIds, jobID)

    err := waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatuses(f, jobID), nil },
        func(got []string) bool {
            for _, status := range got {
                if status != "failed" {
                    return false
                }
            }
            return true
        },
    )
    f.NoError(err, "should have only failed allocs")
}

// TestDefaultReschedule runs a job that should reschedule after delay
func (tc *RescheduleE2ETest) TestDefaultReschedule(f *framework.F) {

    jobID := "test-default-reschedule" + uuid.Generate()[0:8]
    register(f, "rescheduling/input/rescheduling_default.nomad", jobID)
    tc.jobIds = append(tc.jobIds, jobID)

    expected := []string{"failed", "failed", "failed"}
    err := waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatuses(f, jobID), nil },
        func(got []string) bool { return reflect.DeepEqual(got, expected) },
    )
    f.NoError(err, "should have exactly 3 failed allocs")

    // TODO(tgross): return early if "slow" isn't set
    // wait until first exponential delay kicks in and rescheduling is attempted
    time.Sleep(time.Second * 35)
    expected = []string{"failed", "failed", "failed", "failed", "failed", "failed"}
    err = waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatuses(f, jobID), nil },
        func(got []string) bool { return reflect.DeepEqual(got, expected) },
    )
    f.NoError(err, "should have exactly 6 failed allocs after 35s")
}

// TestRescheduleMaxAttempts runs a job with a maximum number of reschedule attempts
func (tc *RescheduleE2ETest) TestRescheduleMaxAttempts(f *framework.F) {

    jobID := "test-reschedule-fail" + uuid.Generate()[0:8]
    register(f, "rescheduling/input/rescheduling_fail.nomad", jobID)
    tc.jobIds = append(tc.jobIds, jobID)

    expected := []string{"failed", "failed", "failed"}
    err := waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatuses(f, jobID), nil },
        func(got []string) bool { return reflect.DeepEqual(got, expected) },
    )
    f.NoError(err, "should have exactly 3 failed allocs")

    job, err := jobspec.ParseFile("rescheduling/input/rescheduling_fail.nomad")
    f.NoError(err)
    job.ID = &jobID
    job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"}
    _, _, err = tc.Nomad().Jobs().Register(job, nil)
    f.NoError(err, "could not register updated job")

    err = waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatuses(f, jobID), nil },
        func(got []string) bool {
            for _, status := range got {
                if status == "running" {
                    return true
                }
            }
            return false
        },
    )
    f.NoError(err, "should have at least 1 running alloc")
}

// TestRescheduleSuccess runs a job that should be running after rescheduling
func (tc *RescheduleE2ETest) TestRescheduleSuccess(f *framework.F) {

    jobID := "test-reschedule-success" + uuid.Generate()[0:8]
    register(f, "rescheduling/input/rescheduling_success.nomad", jobID)
    tc.jobIds = append(tc.jobIds, jobID)

    err := waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatuses(f, jobID), nil },
        func(got []string) bool {
            for _, status := range got {
                if status == "running" {
                    return true
                }
            }
            return false
        },
    )
    f.NoError(err, "should have at least 1 running alloc")
}

// TestRescheduleWithUpdate updates a running job to fail, and verifies that
// it gets rescheduled
func (tc *RescheduleE2ETest) TestRescheduleWithUpdate(f *framework.F) {

    jobID := "test-reschedule-update" + uuid.Generate()[0:8]
    register(f, "rescheduling/input/rescheduling_update.nomad", jobID)
    tc.jobIds = append(tc.jobIds, jobID)

    expected := []string{"running", "running", "running"}
    err := waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatuses(f, jobID), nil },
        func(got []string) bool { return reflect.DeepEqual(got, expected) },
    )
    f.NoError(err, "should have exactly 3 running allocs")

    // update the job to make the allocs fail
    job, err := jobspec.ParseFile("rescheduling/input/rescheduling_update.nomad")
    f.NoError(err)
    job.ID = &jobID
    job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
    _, _, err = tc.Nomad().Jobs().Register(job, nil)
    f.NoError(err, "could not register updated job")

    err = waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatusesRescheduled(f, jobID), nil },
        func(got []string) bool { return len(got) > 0 },
    )
    f.NoError(err, "should have rescheduled allocs until progress deadline")
}

// TestRescheduleWithCanary updates a running job to fail, and verifies that the
// canary gets rescheduled
func (tc *RescheduleE2ETest) TestRescheduleWithCanary(f *framework.F) {

    jobID := "test-reschedule-canary" + uuid.Generate()[0:8]
    register(f, "rescheduling/input/rescheduling_canary.nomad", jobID)
    tc.jobIds = append(tc.jobIds, jobID)

    expected := []string{"running", "running", "running"}
    err := waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatuses(f, jobID), nil },
        func(got []string) bool { return reflect.DeepEqual(got, expected) },
    )
    f.NoError(err, "should have exactly 3 running allocs")

    err = waitForLastDeploymentStatus(f, jobID, "successful")
    f.NoError(err, "deployment should be successful")

    // update the job to make the allocs fail
    job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary.nomad")
    f.NoError(err)
    job.ID = &jobID
    job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
    _, _, err = tc.Nomad().Jobs().Register(job, nil)
    f.NoError(err, "could not register updated job")

    err = waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatusesRescheduled(f, jobID), nil },
        func(got []string) bool { return len(got) > 0 },
    )
    f.NoError(err, "should have rescheduled allocs until progress deadline")

    err = waitForLastDeploymentStatus(f, jobID, "running")
    f.NoError(err, "deployment should be running")
}

// TestRescheduleWithCanaryAutoRevert updates a running job to fail, and verifies that
// the job gets reverted
func (tc *RescheduleE2ETest) TestRescheduleWithCanaryAutoRevert(f *framework.F) {

    jobID := "test-reschedule-canary-revert" + uuid.Generate()[0:8]
    register(f, "rescheduling/input/rescheduling_canary_autorevert.nomad", jobID)
    tc.jobIds = append(tc.jobIds, jobID)

    expected := []string{"running", "running", "running"}
    err := waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatuses(f, jobID), nil },
        func(got []string) bool { return reflect.DeepEqual(got, expected) },
    )
    f.NoError(err, "should have exactly 3 running allocs")

    err = waitForLastDeploymentStatus(f, jobID, "successful")
    f.NoError(err, "deployment should be successful")

    // update the job to make the allocs fail
    job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary_autorevert.nomad")
    f.NoError(err)
    job.ID = &jobID
    job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
    _, _, err = tc.Nomad().Jobs().Register(job, nil)
    f.NoError(err, "could not register updated job")

    err = waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatusesRescheduled(f, jobID), nil },
        func(got []string) bool { return len(got) == 0 },
    )
    f.NoError(err, "should have new allocs after update")

    // then we'll fail and revert
    expected = []string{"failed", "failed", "failed", "running", "running", "running"}
    err = waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatuses(f, jobID), nil },
        func(got []string) bool { return reflect.DeepEqual(got, expected) },
    )
    f.NoError(err, "should have exactly 3 running reverted allocs")

    err = waitForLastDeploymentStatus(f, jobID, "successful")
    f.NoError(err, "deployment should be successful")
}

// TestRescheduleMaxParallel updates a job with a max_parallel config
func (tc *RescheduleE2ETest) TestRescheduleMaxParallel(f *framework.F) {

    jobID := "test-reschedule-maxp" + uuid.Generate()[0:8]
    register(f, "rescheduling/input/rescheduling_maxp.nomad", jobID)
    tc.jobIds = append(tc.jobIds, jobID)

    expected := []string{"running", "running", "running"}
    err := waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatuses(f, jobID), nil },
        func(got []string) bool { return reflect.DeepEqual(got, expected) },
    )
    f.NoError(err, "should have exactly 3 running allocs")

    err = waitForLastDeploymentStatus(f, jobID, "successful")
    f.NoError(err, "deployment should be successful")

    // update the job to make the allocs fail
    job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp.nomad")
    f.NoError(err)
    job.ID = &jobID
    job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
    _, _, err = tc.Nomad().Jobs().Register(job, nil)
    f.NoError(err, "could not register updated job")

    expected = []string{"complete", "failed", "failed", "running", "running"}
    err = waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatuses(f, jobID), nil },
        func(got []string) bool {
            sort.Strings(got)
            return reflect.DeepEqual(got, expected)
        },
    )
    f.NoError(err, "should have failed allocs including rescheduled failed allocs")

    err = waitForLastDeploymentStatus(f, jobID, "running")
    f.NoError(err, "deployment should be running")
}

// TestRescheduleMaxParallelAutoRevert updates a job with a max_parallel
// config that will autorevert on failure
func (tc *RescheduleE2ETest) TestRescheduleMaxParallelAutoRevert(f *framework.F) {

    jobID := "test-reschedule-maxp-revert" + uuid.Generate()[0:8]
    register(f, "rescheduling/input/rescheduling_maxp_autorevert.nomad", jobID)
    tc.jobIds = append(tc.jobIds, jobID)

    expected := []string{"running", "running", "running"}
    err := waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatuses(f, jobID), nil },
        func(got []string) bool { return reflect.DeepEqual(got, expected) },
    )
    f.NoError(err, "should have exactly 3 running allocs")

    err = waitForLastDeploymentStatus(f, jobID, "successful")
    f.NoError(err, "deployment should be successful")

    // update the job to make the allocs fail
    job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp_autorevert.nomad")
    f.NoError(err)
    job.ID = &jobID
    job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
    _, _, err = tc.Nomad().Jobs().Register(job, nil)
    f.NoError(err, "could not register updated job")

    err = waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatusesRescheduled(f, jobID), nil },
        func(got []string) bool { return len(got) == 0 },
    )
    f.NoError(err, "should have new allocs after update")

    // wait for the revert
    expected = []string{"complete", "failed", "running", "running", "running"}
    err = waitForAllocStatusComparison(
        func() ([]string, error) { return allocStatuses(f, jobID), nil },
        func(got []string) bool {
            sort.Strings(got)
            return reflect.DeepEqual(got, expected)
        },
    )
    f.NoError(err, "should have one successful, one failed, and 3 reverted allocs")

    out, err := e2eutil.Command("nomad", "deployment", "status")
    f.NoError(err, "could not get deployment status")

    results, err := e2eutil.ParseColumns(out)
    f.NoError(err, "could not parse deployment status")
    statuses := []string{}
    for _, row := range results {
        if row["Job ID"] == jobID {
            statuses = append(statuses, row["Status"])
        }
    }
    f.True(reflect.DeepEqual([]string{"running", "failed", "successful"}, statuses),
        fmt.Sprintf("deployment status was: %#v", statuses),
    )
}

// TestRescheduleProgressDeadline verifies a deployment succeeds by the
// progress deadline
func (tc *RescheduleE2ETest) TestRescheduleProgressDeadline(f *framework.F) {

    jobID := "test-reschedule-deadline" + uuid.Generate()[0:8]
    register(f, "rescheduling/input/rescheduling_progressdeadline.nomad", jobID)
    tc.jobIds = append(tc.jobIds, jobID)

    // TODO(tgross): return early if "slow" isn't set
    // wait until first exponential delay kicks in and rescheduling is attempted
    time.Sleep(time.Second * 30)
    err := waitForLastDeploymentStatus(f, jobID, "successful")
    f.NoError(err, "deployment should be successful")
}
(deleted file)
@@ -1,20 +0,0 @@
package rescheduling

import (
    "flag"
    "testing"

    . "github.com/onsi/ginkgo"
    . "github.com/onsi/gomega"
)

var integration = flag.Bool("integration", false, "run integration tests")
var slow = flag.Bool("slow", false, "runs slower integration tests")

func TestServerSideRestarts(t *testing.T) {
    if !*integration {
        t.Skip("skipping test in non-integration mode.")
    }
    RegisterFailHandler(Fail)
    RunSpecs(t, "Server Side Restart Tests")
}
(deleted file)
@@ -1,323 +0,0 @@
package rescheduling

import (
    "sort"
    "time"

    "github.com/hashicorp/nomad/api"
    "github.com/hashicorp/nomad/jobspec"
    . "github.com/onsi/ginkgo"
    . "github.com/onsi/gomega"

    "github.com/hashicorp/nomad/helper"
    "github.com/hashicorp/nomad/helper/uuid"
    "github.com/hashicorp/nomad/nomad/structs"
)

var _ = Describe("Server Side Restart Tests", func() {

    var (
        jobs     *api.Jobs
        system   *api.System
        job      *api.Job
        err      error
        specFile string

        // allocStatuses is a helper function that pulls
        // out client statuses from a slice of allocs
        allocStatuses = func() []string {
            allocs, _, err := jobs.Allocations(*job.ID, false, nil)
            Expect(err).ShouldNot(HaveOccurred())
            var ret []string
            for _, a := range allocs {
                ret = append(ret, a.ClientStatus)
            }
            sort.Strings(ret)
            return ret
        }

        // allocStatusesRescheduled is a helper function that pulls
        // out client statuses only from rescheduled allocs
        allocStatusesRescheduled = func() []string {
            allocs, _, err := jobs.Allocations(*job.ID, false, nil)
            Expect(err).ShouldNot(HaveOccurred())
            var ret []string
            for _, a := range allocs {
                if (a.RescheduleTracker != nil && len(a.RescheduleTracker.Events) > 0) || a.FollowupEvalID != "" {
                    ret = append(ret, a.ClientStatus)
                }
            }
            return ret
        }

        // deploymentStatus is a helper function that returns deployment status of all deployments
        // sorted by time
        deploymentStatus = func() []string {
            deploys, _, err := jobs.Deployments(*job.ID, false, nil)
            Expect(err).ShouldNot(HaveOccurred())
            var ret []string
            sort.Slice(deploys, func(i, j int) bool {
                return deploys[i].CreateIndex < deploys[j].CreateIndex
            })
            for _, d := range deploys {
                ret = append(ret, d.Status)
            }
            return ret
        }
    )

    BeforeSuite(func() {
        conf := api.DefaultConfig()

        // Create client
        client, err := api.NewClient(conf)
        Expect(err).ShouldNot(HaveOccurred())
        jobs = client.Jobs()
        system = client.System()
    })

    JustBeforeEach(func() {
        job, err = jobspec.ParseFile(specFile)
        Expect(err).ShouldNot(HaveOccurred())
        job.ID = helper.StringToPtr(uuid.Generate())
        resp, _, err := jobs.Register(job, nil)
        Expect(err).ShouldNot(HaveOccurred())
        Expect(resp.EvalID).ShouldNot(BeEmpty())

    })

    AfterEach(func() {
        // Deregister job
        jobs.Deregister(*job.ID, true, nil)
        system.GarbageCollect()
    })

    Describe("Reschedule Stanza Tests", func() {

        Context("No reschedule attempts", func() {
            BeforeEach(func() {
                specFile = "input/norescheduling.hcl"
            })

            It("Should have exactly three allocs and all failed", func() {
                Eventually(allocStatuses, 5*time.Second, time.Second).Should(ConsistOf([]string{"failed", "failed", "failed"}))
            })
        })

        Context("System jobs should never be rescheduled", func() {
            BeforeEach(func() {
                specFile = "input/rescheduling_system.hcl"
            })

            It("Should have exactly one failed alloc", func() {
                Eventually(allocStatuses, 10*time.Second, time.Second).Should(ConsistOf([]string{"failed"}))
            })
        })

        Context("Default Rescheduling", func() {
            BeforeEach(func() {
                specFile = "input/rescheduling_default.hcl"
            })
            It("Should have exactly three allocs and all failed after 5 secs", func() {
                Eventually(allocStatuses, 5*time.Second, time.Second).Should(ConsistOf([]string{"failed", "failed", "failed"}))
            })
            // wait until first exponential delay kicks in and rescheduling is attempted
            It("Should have exactly six allocs and all failed after 35 secs", func() {
                if !*slow {
                    Skip("Skipping slow test")
                }
                Eventually(allocStatuses, 35*time.Second, time.Second).Should(ConsistOf([]string{"failed", "failed", "failed", "failed", "failed", "failed"}))
            })
        })

        Context("Reschedule attempts maxed out", func() {
            BeforeEach(func() {
                specFile = "input/rescheduling_fail.hcl"
            })
            It("Should have all failed", func() {
                Eventually(allocStatuses, 6*time.Second, time.Second).ShouldNot(
                    SatisfyAll(ContainElement("pending"),
                        ContainElement("running")))
            })
            Context("Updating job to change its version", func() {
                It("Should have running allocs now", func() {
                    job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"}
                    _, _, err := jobs.Register(job, nil)
                    Expect(err).ShouldNot(HaveOccurred())
                    Eventually(allocStatuses, 5*time.Second, time.Second).Should(ContainElement("running"))
                })
            })
        })

        Context("Reschedule attempts succeeded", func() {
            BeforeEach(func() {
                specFile = "input/reschedule_success.hcl"
            })
            It("Should have some running allocs", func() {
                Eventually(allocStatuses, 6*time.Second, time.Second).Should(
                    ContainElement("running"))
            })
        })

        Context("Reschedule with update stanza", func() {
            BeforeEach(func() {
                specFile = "input/rescheduling_update.hcl"
            })
            It("Should have all running allocs", func() {
                Eventually(allocStatuses, 3*time.Second, time.Second).Should(
                    ConsistOf([]string{"running", "running", "running"}))
            })
            Context("Updating job to make allocs fail", func() {
                It("Should have rescheduled allocs until progress deadline", func() {
                    job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
                    _, _, err := jobs.Register(job, nil)
                    Expect(err).ShouldNot(HaveOccurred())
                    Eventually(allocStatusesRescheduled, 5*time.Second, time.Second).ShouldNot(BeEmpty())
                })
            })

        })

        Context("Reschedule with canary", func() {
            BeforeEach(func() {
                specFile = "input/rescheduling_canary.hcl"
            })
            It("Should have running allocs and successful deployment", func() {
                Eventually(allocStatuses, 3*time.Second, time.Second).Should(
                    ConsistOf([]string{"running", "running", "running"}))

                time.Sleep(2 * time.Second) // TODO(preetha) figure out why this wasn't working with ginkgo constructs
                Eventually(deploymentStatus(), 2*time.Second, time.Second).Should(
                    ContainElement(structs.DeploymentStatusSuccessful))
            })

            Context("Updating job to make allocs fail", func() {
                It("Should have rescheduled allocs until progress deadline", func() {
                    job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
                    _, _, err := jobs.Register(job, nil)
                    Expect(err).ShouldNot(HaveOccurred())
                    Eventually(allocStatusesRescheduled, 5*time.Second, time.Second).ShouldNot(BeEmpty())

                    // Verify new deployment and its status
                    // Deployment status should be running (because of progress deadline)
                    time.Sleep(3 * time.Second) // TODO(preetha) figure out why this wasn't working with ginkgo constructs
                    Eventually(deploymentStatus(), 2*time.Second, time.Second).Should(
                        ContainElement(structs.DeploymentStatusRunning))
                })
            })

        })

        Context("Reschedule with canary, auto revert with short progress deadline", func() {
            BeforeEach(func() {
                specFile = "input/rescheduling_canary_autorevert.hcl"
            })
            It("Should have running allocs and successful deployment", func() {
                Eventually(allocStatuses, 3*time.Second, time.Second).Should(
                    ConsistOf([]string{"running", "running", "running"}))

                time.Sleep(2 * time.Second)
                Eventually(deploymentStatus(), 2*time.Second, time.Second).Should(
                    ContainElement(structs.DeploymentStatusSuccessful))

                // Make an update that causes the job to fail
                job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
                _, _, err := jobs.Register(job, nil)
                Expect(err).ShouldNot(HaveOccurred())
                Eventually(allocStatusesRescheduled, 2*time.Second, time.Second).Should(BeEmpty())

                // Wait for the revert
                Eventually(allocStatuses, 3*time.Second, time.Second).Should(
                    ConsistOf([]string{"failed", "failed", "failed", "running", "running", "running"}))
                // Verify new deployment and its status
                // There should be one successful, one failed, and one more successful (after revert)
                time.Sleep(5 * time.Second) // TODO(preetha) figure out why this wasn't working with ginkgo constructs
                Eventually(deploymentStatus(), 5*time.Second, time.Second).Should(
                    ConsistOf(structs.DeploymentStatusSuccessful, structs.DeploymentStatusFailed, structs.DeploymentStatusSuccessful))
            })

        })

        Context("Reschedule with max parallel/auto_revert false", func() {
            BeforeEach(func() {
                specFile = "input/rescheduling_maxp.hcl"
            })
            It("Should have running allocs and successful deployment", func() {
                Eventually(allocStatuses, 3*time.Second, time.Second).Should(
                    ConsistOf([]string{"running", "running", "running"}))

                time.Sleep(2 * time.Second)
                Eventually(deploymentStatus(), 2*time.Second, time.Second).Should(
                    ContainElement(structs.DeploymentStatusSuccessful))
            })

            Context("Updating job to make allocs fail", func() {
                It("Should have rescheduled allocs till progress deadline", func() {
                    job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
                    _, _, err := jobs.Register(job, nil)
                    Expect(err).ShouldNot(HaveOccurred())
                    Eventually(allocStatusesRescheduled, 6*time.Second, time.Second).ShouldNot(BeEmpty())

                    // Should have failed allocs including rescheduled failed allocs
                    Eventually(allocStatuses, 6*time.Second, time.Second).Should(
                        ConsistOf([]string{"complete", "failed", "failed", "running", "running"}))

                    // Verify new deployment and its status
                    Eventually(deploymentStatus(), 2*time.Second, time.Second).Should(
                        ContainElement(structs.DeploymentStatusRunning))
                })
            })

        })

        Context("Reschedule with max parallel, auto revert true and short progress deadline", func() {
            BeforeEach(func() {
                specFile = "input/rescheduling_maxp_autorevert.hcl"
            })
            It("Should have running allocs and successful deployment", func() {
                Eventually(allocStatuses, 3*time.Second, time.Second).Should(
                    ConsistOf([]string{"running", "running", "running"}))

                time.Sleep(4 * time.Second)
                Eventually(deploymentStatus(), 2*time.Second, time.Second).Should(
                    ContainElement(structs.DeploymentStatusSuccessful))

                // Make an update that causes the job to fail
                job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
                _, _, err := jobs.Register(job, nil)
                Expect(err).ShouldNot(HaveOccurred())
                Eventually(allocStatusesRescheduled, 2*time.Second, time.Second).Should(BeEmpty())

                // Wait for the revert
                Eventually(allocStatuses, 5*time.Second, time.Second).Should(
                    ConsistOf([]string{"complete", "failed", "running", "running", "running"}))

                // Verify new deployment and its status
                // There should be one successful, one failed, and one more successful (after revert)
                time.Sleep(5 * time.Second)
                Eventually(deploymentStatus(), 2*time.Second, time.Second).Should(
                    ConsistOf(structs.DeploymentStatusSuccessful, structs.DeploymentStatusFailed, structs.DeploymentStatusSuccessful))
            })

        })

        Context("Reschedule with progress deadline", func() {
            BeforeEach(func() {
                specFile = "input/rescheduling_progressdeadline.hcl"
            })
            It("Should have running allocs and successful deployment", func() {
                if !*slow {
                    Skip("Skipping slow test")
                }
                // Deployment should succeed eventually
                time.Sleep(20 * time.Second)
                Eventually(deploymentStatus(), 5*time.Second, time.Second).Should(
                    ContainElement(structs.DeploymentStatusSuccessful))

            })

        })

    })

})