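// Package rescheduling contains end-to-end tests that exercise Nomad's
// server-side rescheduling and deployment behavior.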
package rescheduling

import (
	"sort"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/jobspec"
	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

var _ = Describe("Server Side Restart Tests", func() {

	var (
		jobs     *api.Jobs
		system   *api.System
		job      *api.Job
		err      error
		specFile string

		// allocStatuses is a helper function that pulls
		// out client statuses from the job's allocations
		allocStatuses = func() []string {
			allocs, _, err := jobs.Allocations(*job.ID, false, nil)
			Expect(err).ShouldNot(HaveOccurred())
			var ret []string
			for _, a := range allocs {
				ret = append(ret, a.ClientStatus)
			}
			sort.Strings(ret)
			return ret
		}

		// allocStatusesRescheduled is a helper function that pulls
		// out client statuses only from rescheduled allocs
		allocStatusesRescheduled = func() []string {
			allocs, _, err := jobs.Allocations(*job.ID, false, nil)
			Expect(err).ShouldNot(HaveOccurred())
			var ret []string
			for _, a := range allocs {
				if (a.RescheduleTracker != nil && len(a.RescheduleTracker.Events) > 0) || a.FollowupEvalID != "" {
					ret = append(ret, a.ClientStatus)
				}
			}
			return ret
		}

		// deploymentStatus is a helper function that returns the status of
		// every deployment for the job, sorted by creation order
		deploymentStatus = func() []string {
			deploys, _, err := jobs.Deployments(*job.ID, nil)
			Expect(err).ShouldNot(HaveOccurred())
			var ret []string
			sort.Slice(deploys, func(i, j int) bool {
				return deploys[i].CreateIndex < deploys[j].CreateIndex
			})
			for _, d := range deploys {
				ret = append(ret, d.Status)
			}
			return ret
		}
	)

	BeforeSuite(func() {
		conf := api.DefaultConfig()

		// Create the API client shared by all specs
		client, err := api.NewClient(conf)
		Expect(err).ShouldNot(HaveOccurred())
		jobs = client.Jobs()
		system = client.System()
	})

	JustBeforeEach(func() {
		// Parse the spec file chosen by the current context and register it
		// under a fresh UUID so specs do not collide with each other
		job, err = jobspec.ParseFile(specFile)
		Expect(err).ShouldNot(HaveOccurred())
		job.ID = helper.StringToPtr(uuid.Generate())
		resp, _, err := jobs.Register(job, nil)
		Expect(err).ShouldNot(HaveOccurred())
		Expect(resp.EvalID).ShouldNot(BeEmpty())
	})

	AfterEach(func() {
		// Deregister (purge) the job and garbage collect
		jobs.Deregister(*job.ID, true, nil)
		system.GarbageCollect()
	})

	Describe("Reschedule Stanza Tests", func() {

		Context("No reschedule attempts", func() {
			BeforeEach(func() {
				specFile = "input/norescheduling.hcl"
			})

			It("Should have exactly three allocs and all failed", func() {
				Eventually(allocStatuses, 5*time.Second, time.Second).Should(ConsistOf([]string{"failed", "failed", "failed"}))
			})
		})

		Context("System jobs should never be rescheduled", func() {
			BeforeEach(func() {
				specFile = "input/rescheduling_system.hcl"
			})

			It("Should have exactly one failed alloc", func() {
				Eventually(allocStatuses, 10*time.Second, time.Second).Should(ConsistOf([]string{"failed"}))
			})
		})

		Context("Default Rescheduling", func() {
			BeforeEach(func() {
				specFile = "input/rescheduling_default.hcl"
			})

			It("Should have exactly three allocs and all failed after 5 secs", func() {
				Eventually(allocStatuses, 5*time.Second, time.Second).Should(ConsistOf([]string{"failed", "failed", "failed"}))
			})

			// wait until first exponential delay kicks in and rescheduling is attempted
			It("Should have exactly six allocs and all failed after 35 secs", func() {
				if !*slow {
					Skip("Skipping slow test")
				}
				Eventually(allocStatuses, 35*time.Second, time.Second).Should(ConsistOf([]string{"failed", "failed", "failed", "failed", "failed", "failed"}))
			})
		})

		Context("Reschedule attempts maxed out", func() {
			BeforeEach(func() {
				specFile = "input/rescheduling_fail.hcl"
			})

			It("Should have all failed", func() {
				Eventually(allocStatuses, 6*time.Second, time.Second).ShouldNot(
					SatisfyAll(ContainElement("pending"),
						ContainElement("running")))
			})

			Context("Updating job to change its version", func() {
				It("Should have running allocs now", func() {
					job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"}
					_, _, err := jobs.Register(job, nil)
					Expect(err).ShouldNot(HaveOccurred())
					Eventually(allocStatuses, 5*time.Second, time.Second).Should(ContainElement("running"))
				})
			})
		})

		Context("Reschedule attempts succeeded", func() {
			BeforeEach(func() {
				specFile = "input/reschedule_success.hcl"
			})

			It("Should have some running allocs", func() {
				Eventually(allocStatuses, 6*time.Second, time.Second).Should(
					ContainElement("running"))
			})
		})

		Context("Reschedule with update stanza", func() {
			BeforeEach(func() {
				specFile = "input/rescheduling_update.hcl"
			})

			It("Should have all running allocs", func() {
				Eventually(allocStatuses, 3*time.Second, time.Second).Should(
					ConsistOf([]string{"running", "running", "running"}))
			})

			Context("Updating job to make allocs fail", func() {
				It("Should have rescheduled allocs until progress deadline", func() {
					job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
					_, _, err := jobs.Register(job, nil)
					Expect(err).ShouldNot(HaveOccurred())
					Eventually(allocStatusesRescheduled, 5*time.Second, time.Second).ShouldNot(BeEmpty())
				})
			})
		})

		Context("Reschedule with canary", func() {
			BeforeEach(func() {
				specFile = "input/rescheduling_canary.hcl"
			})

			It("Should have running allocs and successful deployment", func() {
				Eventually(allocStatuses, 3*time.Second, time.Second).Should(
					ConsistOf([]string{"running", "running", "running"}))

				time.Sleep(2 * time.Second) //TODO(preetha) figure out why this wasn't working with ginkgo constructs
				Eventually(deploymentStatus(), 2*time.Second, time.Second).Should(
					ContainElement(structs.DeploymentStatusSuccessful))
			})

			Context("Updating job to make allocs fail", func() {
				It("Should have rescheduled allocs until progress deadline", func() {
					job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
					_, _, err := jobs.Register(job, nil)
					Expect(err).ShouldNot(HaveOccurred())
					Eventually(allocStatusesRescheduled, 5*time.Second, time.Second).ShouldNot(BeEmpty())

					// Verify new deployment and its status
					// Deployment status should be running (because of progress deadline)
					time.Sleep(3 * time.Second) //TODO(preetha) figure out why this wasn't working with ginkgo constructs
					Eventually(deploymentStatus(), 2*time.Second, time.Second).Should(
						ContainElement(structs.DeploymentStatusRunning))
				})
			})
		})

		Context("Reschedule with canary, auto revert with short progress deadline", func() {
			BeforeEach(func() {
				specFile = "input/rescheduling_canary_autorevert.hcl"
			})

			It("Should have running allocs and successful deployment", func() {
				Eventually(allocStatuses, 3*time.Second, time.Second).Should(
					ConsistOf([]string{"running", "running", "running"}))

				time.Sleep(2 * time.Second)
				Eventually(deploymentStatus(), 2*time.Second, time.Second).Should(
					ContainElement(structs.DeploymentStatusSuccessful))

				// Make an update that causes the job to fail
				job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
				_, _, err := jobs.Register(job, nil)
				Expect(err).ShouldNot(HaveOccurred())
				Eventually(allocStatusesRescheduled, 2*time.Second, time.Second).Should(BeEmpty())

				// Wait for the revert
				Eventually(allocStatuses, 3*time.Second, time.Second).Should(
					ConsistOf([]string{"failed", "failed", "failed", "running", "running", "running"}))

				// Verify new deployment and its status
				// There should be one successful, one failed, and one more successful (after revert)
				time.Sleep(5 * time.Second) //TODO(preetha) figure out why this wasn't working with ginkgo constructs
				Eventually(deploymentStatus(), 5*time.Second, time.Second).Should(
					ConsistOf(structs.DeploymentStatusSuccessful, structs.DeploymentStatusFailed, structs.DeploymentStatusSuccessful))
			})
		})

		Context("Reschedule with max parallel/auto_revert false", func() {
			BeforeEach(func() {
				specFile = "input/rescheduling_maxp.hcl"
			})

			It("Should have running allocs and successful deployment", func() {
				Eventually(allocStatuses, 3*time.Second, time.Second).Should(
					ConsistOf([]string{"running", "running", "running"}))

				time.Sleep(2 * time.Second)
				Eventually(deploymentStatus(), 2*time.Second, time.Second).Should(
					ContainElement(structs.DeploymentStatusSuccessful))
			})

			Context("Updating job to make allocs fail", func() {
				It("Should have rescheduled allocs until progress deadline", func() {
					job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
					_, _, err := jobs.Register(job, nil)
					Expect(err).ShouldNot(HaveOccurred())
					Eventually(allocStatusesRescheduled, 6*time.Second, time.Second).ShouldNot(BeEmpty())

					// Should have failed allocs including rescheduled failed allocs
					Eventually(allocStatuses, 6*time.Second, time.Second).Should(
						ConsistOf([]string{"complete", "failed", "failed", "running", "running"}))

					// Verify new deployment and its status
					Eventually(deploymentStatus(), 2*time.Second, time.Second).Should(
						ContainElement(structs.DeploymentStatusRunning))
				})
			})
		})

		Context("Reschedule with max parallel, auto revert true and short progress deadline", func() {
			BeforeEach(func() {
				specFile = "input/rescheduling_maxp_autorevert.hcl"
			})

			It("Should have running allocs and successful deployment", func() {
				Eventually(allocStatuses, 3*time.Second, time.Second).Should(
					ConsistOf([]string{"running", "running", "running"}))

				time.Sleep(4 * time.Second)
				Eventually(deploymentStatus(), 2*time.Second, time.Second).Should(
					ContainElement(structs.DeploymentStatusSuccessful))

				// Make an update that causes the job to fail
				job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
				_, _, err := jobs.Register(job, nil)
				Expect(err).ShouldNot(HaveOccurred())
				Eventually(allocStatusesRescheduled, 2*time.Second, time.Second).Should(BeEmpty())

				// Wait for the revert
				Eventually(allocStatuses, 5*time.Second, time.Second).Should(
					ConsistOf([]string{"complete", "failed", "running", "running", "running"}))

				// Verify new deployment and its status
				// There should be one successful, one failed, and one more successful (after revert)
				time.Sleep(5 * time.Second)
				Eventually(deploymentStatus(), 2*time.Second, time.Second).Should(
					ConsistOf(structs.DeploymentStatusSuccessful, structs.DeploymentStatusFailed, structs.DeploymentStatusSuccessful))
			})
		})

		Context("Reschedule with progress deadline", func() {
			BeforeEach(func() {
				specFile = "input/rescheduling_progressdeadline.hcl"
			})

			It("Should have running allocs and successful deployment", func() {
				if !*slow {
					Skip("Skipping slow test")
				}
				// Deployment should succeed eventually
				time.Sleep(20 * time.Second)
				Eventually(deploymentStatus(), 5*time.Second, time.Second).Should(
					ContainElement(structs.DeploymentStatusSuccessful))
			})
		})
	})
})