From d48c411692119547bb222c4b03fadf5fa6e1e278 Mon Sep 17 00:00:00 2001
From: Preetha Appan
Date: Fri, 2 Feb 2018 17:22:37 -0600
Subject: [PATCH] Reconciler should consider failed allocs when marking deployment as failed.

---
 scheduler/generic_sched_test.go | 63 +++++++++++++++++++++++++++++++++
 scheduler/reconcile.go          | 11 +++++-
 scheduler/reconcile_test.go     | 46 ++++++++++++++++++++++++
 3 files changed, 119 insertions(+), 1 deletion(-)

diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go
index 6eafd8e2d..1443a2314 100644
--- a/scheduler/generic_sched_test.go
+++ b/scheduler/generic_sched_test.go
@@ -2946,6 +2946,69 @@ func TestServiceSched_Reschedule_Multiple(t *testing.T) {
 	assert.Equal(5, len(out)) // 2 original, plus 3 reschedule attempts
 }
 
+// Tests that deployments with failed allocs don't result in placements
+func TestDeployment_FailedAllocs_NoReschedule(t *testing.T) {
+	h := NewHarness(t)
+	require := require.New(t)
+	// Create some nodes
+	var nodes []*structs.Node
+	for i := 0; i < 10; i++ {
+		node := mock.Node()
+		nodes = append(nodes, node)
+		noErr(t, h.State.UpsertNode(h.NextIndex(), node))
+	}
+
+	// Generate a fake job with allocations and a reschedule policy.
+	job := mock.Job()
+	job.TaskGroups[0].Count = 2
+	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
+		Attempts: 1,
+		Interval: 15 * time.Minute,
+	}
+	jobIndex := h.NextIndex()
+	require.Nil(h.State.UpsertJob(jobIndex, job))
+
+	deployment := mock.Deployment()
+	deployment.JobID = job.ID
+	deployment.JobCreateIndex = jobIndex
+	deployment.JobVersion = job.Version
+
+	require.Nil(h.State.UpsertDeployment(h.NextIndex(), deployment))
+
+	var allocs []*structs.Allocation
+	for i := 0; i < 2; i++ {
+		alloc := mock.Alloc()
+		alloc.Job = job
+		alloc.JobID = job.ID
+		alloc.NodeID = nodes[i].ID
+		alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
+		alloc.DeploymentID = deployment.ID
+		allocs = append(allocs, alloc)
+	}
+	// Mark one of the allocations as failed
+	allocs[1].ClientStatus = structs.AllocClientStatusFailed
+
+	require.Nil(h.State.UpsertAllocs(h.NextIndex(), allocs))
+
+	// Create a mock evaluation
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.Nil(h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	require.Nil(h.Process(NewServiceScheduler, eval))
+
+	// Verify no plan created
+	require.Equal(0, len(h.Plans))
+
+}
+
 func TestBatchSched_Run_CompleteAlloc(t *testing.T) {
 	h := NewHarness(t)
 
diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go
index ae996535c..5e7b95f67 100644
--- a/scheduler/reconcile.go
+++ b/scheduler/reconcile.go
@@ -159,8 +159,17 @@ func (a *allocReconciler) Compute() *reconcileResults {
 
 	// Detect if the deployment is paused
 	if a.deployment != nil {
+		// Detect if any allocs associated with this deploy have failed
+		failedAllocsInDeploy := false
+		for _, as := range m {
+			for _, alloc := range as {
+				if alloc.DeploymentID == a.deployment.ID && alloc.ClientStatus == structs.AllocClientStatusFailed {
+					failedAllocsInDeploy = true
+				}
+			}
+		}
 		a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
-		a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
+		a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed || failedAllocsInDeploy
 	}
 
 	// Reconcile each group
diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go
index 551009fd0..55301fc37 100644
--- a/scheduler/reconcile_test.go
+++ b/scheduler/reconcile_test.go
@@ -74,6 +74,7 @@ Update stanza Tests:
 √ Change job change while scaling up
 √ Update the job when all allocations from the previous job haven't been placed yet.
 √ Paused or failed deployment doesn't do any rescheduling of failed allocs
+√ Running deployment with failed allocs doesn't do any rescheduling of failed allocs
 */
 
 var (
@@ -3350,3 +3351,48 @@ func TestReconciler_FailedDeployment_DontReschedule(t *testing.T) {
 		},
 	})
 }
+
+// Test that a running deployment with failed allocs will not result in rescheduling failed allocations
+func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
+	job := mock.Job()
+	job.TaskGroups[0].Update = noCanaryUpdate
+
+	// Mock deployment with failed allocs, but deployment watcher hasn't marked it as failed yet
+	d := structs.NewDeployment(job)
+	d.Status = structs.DeploymentStatusRunning
+	d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
+		Promoted:     false,
+		DesiredTotal: 5,
+		PlacedAllocs: 4,
+	}
+
+	// Create 4 allocations and mark two as failed
+	var allocs []*structs.Allocation
+	for i := 0; i < 4; i++ {
+		alloc := mock.Alloc()
+		alloc.Job = job
+		alloc.JobID = job.ID
+		alloc.NodeID = uuid.Generate()
+		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		alloc.TaskGroup = job.TaskGroups[0].Name
+		alloc.DeploymentID = d.ID
+		allocs = append(allocs, alloc)
+	}
+	allocs[2].ClientStatus = structs.AllocClientStatusFailed
+	allocs[3].ClientStatus = structs.AllocClientStatusFailed
+
+	reconciler := NewAllocReconciler(testLogger(), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil)
+	r := reconciler.Compute()
+
+	// Assert that no rescheduled placements were created
+	assertResults(t, r, &resultExpectation{
+		place:             0,
+		createDeployment:  nil,
+		deploymentUpdates: nil,
+		desiredTGUpdates: map[string]*structs.DesiredUpdates{
+			job.TaskGroups[0].Name: {
+				Ignore: 2,
+			},
+		},
+	})
+}
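
Review note: the reconcile.go hunk above makes the reconciler treat a deployment as failed as soon as any of its allocs has a failed client status, covering the window before the deployment watcher updates the deployment's own status. Below is a minimal standalone sketch of that check, assuming simplified stand-in types (the real code uses structs.Allocation and structs.Deployment; hasFailedAllocs is a hypothetical helper, not part of this patch):

package main

import "fmt"

// Simplified stand-ins for the Nomad types the patch touches.
type alloc struct {
	DeploymentID string
	ClientStatus string // e.g. "running", "failed"
}

type deployment struct {
	ID     string
	Status string // e.g. "running", "paused", "failed"
}

// hasFailedAllocs mirrors the loop added to Compute(): any alloc that
// belongs to this deployment and has a failed client status marks the
// whole deployment as failed for reconciliation purposes.
func hasFailedAllocs(d *deployment, allocsByGroup map[string][]*alloc) bool {
	for _, as := range allocsByGroup {
		for _, a := range as {
			if a.DeploymentID == d.ID && a.ClientStatus == "failed" {
				return true
			}
		}
	}
	return false
}

func main() {
	d := &deployment{ID: "dep1", Status: "running"}
	m := map[string][]*alloc{
		"web": {
			{DeploymentID: "dep1", ClientStatus: "running"},
			{DeploymentID: "dep1", ClientStatus: "failed"},
		},
	}
	// Mirrors: a.deploymentFailed = status == failed || failedAllocsInDeploy
	deploymentFailed := d.Status == "failed" || hasFailedAllocs(d, m)
	fmt.Println(deploymentFailed) // true: the reconciler will skip reschedules
}

One design note: the sketch returns early on the first failed alloc, whereas the loop in the patch keeps iterating after setting failedAllocsInDeploy; the two are behaviorally equivalent, the early return just avoids scanning the remaining allocs.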