Reconciler should consider failed allocs when marking deployment as failed.

This commit is contained in:
Preetha Appan 2018-02-02 17:22:37 -06:00
parent 12da54ae2b
commit d48c411692
No known key found for this signature in database
GPG Key ID: 9F7C19990A50EAFC
3 changed files with 119 additions and 1 deletions

View File

@ -2946,6 +2946,69 @@ func TestServiceSched_Reschedule_Multiple(t *testing.T) {
assert.Equal(5, len(out)) // 2 original, plus 3 reschedule attempts
}
// Tests that deployments with failed allocs don't result in placements
func TestDeployment_FailedAllocs_NoReschedule(t *testing.T) {
h := NewHarness(t)
require := require.New(t)
// Create some nodes
var nodes []*structs.Node
for i := 0; i < 10; i++ {
node := mock.Node()
nodes = append(nodes, node)
noErr(t, h.State.UpsertNode(h.NextIndex(), node))
}
// Generate a fake job with allocations and a reschedule policy.
job := mock.Job()
job.TaskGroups[0].Count = 2
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
Attempts: 1,
Interval: 15 * time.Minute,
}
jobIndex := h.NextIndex()
require.Nil(h.State.UpsertJob(jobIndex, job))
deployment := mock.Deployment()
deployment.JobID = job.ID
deployment.JobCreateIndex = jobIndex
deployment.JobVersion = job.Version
require.Nil(h.State.UpsertDeployment(h.NextIndex(), deployment))
var allocs []*structs.Allocation
for i := 0; i < 2; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = nodes[i].ID
alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
alloc.DeploymentID = deployment.ID
allocs = append(allocs, alloc)
}
// Mark one of the allocations as failed
allocs[1].ClientStatus = structs.AllocClientStatusFailed
require.Nil(h.State.UpsertAllocs(h.NextIndex(), allocs))
// Create a mock evaluation
eval := &structs.Evaluation{
Namespace: structs.DefaultNamespace,
ID: uuid.Generate(),
Priority: 50,
TriggeredBy: structs.EvalTriggerNodeUpdate,
JobID: job.ID,
Status: structs.EvalStatusPending,
}
require.Nil(h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
// Process the evaluation
require.Nil(h.Process(NewServiceScheduler, eval))
// Verify no plan created
require.Equal(0, len(h.Plans))
}
func TestBatchSched_Run_CompleteAlloc(t *testing.T) {
h := NewHarness(t)

View File

@ -159,8 +159,17 @@ func (a *allocReconciler) Compute() *reconcileResults {
// Detect if the deployment is paused
if a.deployment != nil {
// Detect if any allocs associated with this deploy have failed
failedAllocsInDeploy := false
for _, as := range m {
for _, alloc := range as {
if alloc.DeploymentID == a.deployment.ID && alloc.ClientStatus == structs.AllocClientStatusFailed {
failedAllocsInDeploy = true
}
}
}
a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed || failedAllocsInDeploy
}
// Reconcile each group

View File

@ -74,6 +74,7 @@ Update stanza Tests:
Change job change while scaling up
Update the job when all allocations from the previous job haven't been placed yet.
Paused or failed deployment doesn't do any rescheduling of failed allocs
Running deployment with failed allocs doesn't do any rescheduling of failed allocs
*/
var (
@ -3350,3 +3351,48 @@ func TestReconciler_FailedDeployment_DontReschedule(t *testing.T) {
},
})
}
// Test that a running deployment with failed allocs will not result in rescheduling failed allocations
func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
job := mock.Job()
job.TaskGroups[0].Update = noCanaryUpdate
// Mock deployment with failed allocs, but deployment watcher hasn't marked it as failed yet
d := structs.NewDeployment(job)
d.Status = structs.DeploymentStatusRunning
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
Promoted: false,
DesiredTotal: 5,
PlacedAllocs: 4,
}
// Create 4 allocations and mark two as failed
var allocs []*structs.Allocation
for i := 0; i < 4; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = uuid.Generate()
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
alloc.TaskGroup = job.TaskGroups[0].Name
alloc.DeploymentID = d.ID
allocs = append(allocs, alloc)
}
allocs[2].ClientStatus = structs.AllocClientStatusFailed
allocs[3].ClientStatus = structs.AllocClientStatusFailed
reconciler := NewAllocReconciler(testLogger(), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil)
r := reconciler.Compute()
// Assert that no rescheduled placements were created
assertResults(t, r, &resultExpectation{
place: 0,
createDeployment: nil,
deploymentUpdates: nil,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Ignore: 2,
},
},
})
}