Only use DesiredTransition.Reschedule in reconciler when its an active deployment

2018-04-17 14:41:36 -05:00 · 2018-04-17 14:41:36 -05:00 · 5329900f6d
parent 6f92e0711c
commit 5329900f6d
3 changed files with 62 additions and 7 deletions
--- a/scheduler/reconcile.go
+++ b/scheduler/reconcile.go
@ -340,7 +340,7 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
 	untainted, migrate, lost := all.filterByTainted(a.taintedNodes)

 	// Determine what set of terminal allocations need to be rescheduled
-	untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, a.now, a.evalID)
+	untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, a.now, a.evalID, a.deployment)

 	// Create batched follow up evaluations for allocations that are
 	// reschedulable later and mark the allocations for in place updating
--- a/scheduler/reconcile_test.go
+++ b/scheduler/reconcile_test.go
@ -4063,3 +4063,55 @@ func TestReconciler_FailedDeployment_AutoRevert_CancelCanaries(t *testing.T) {
 		},
 	})
 }
+
+// Test that a successful deployment with failed allocs will result in
+// rescheduling failed allocations
+func TestReconciler_SuccessfulDeploymentWithFailedAllocs_Reschedule(t *testing.T) {
+	job := mock.Job()
+	job.TaskGroups[0].Update = noCanaryUpdate
+	tgName := job.TaskGroups[0].Name
+	now := time.Now()
+
+	// Mock deployment with failed allocs, but deployment watcher hasn't marked it as failed yet
+	d := structs.NewDeployment(job)
+	d.Status = structs.DeploymentStatusSuccessful
+	d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
+		Promoted:     false,
+		DesiredTotal: 10,
+		PlacedAllocs: 10,
+	}
+
+	// Create 10 allocations
+	var allocs []*structs.Allocation
+	for i := 0; i < 10; i++ {
+		alloc := mock.Alloc()
+		alloc.Job = job
+		alloc.JobID = job.ID
+		alloc.NodeID = uuid.Generate()
+		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		alloc.TaskGroup = job.TaskGroups[0].Name
+		alloc.DeploymentID = d.ID
+		alloc.ClientStatus = structs.AllocClientStatusFailed
+		alloc.TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
+			StartedAt:  now.Add(-1 * time.Hour),
+			FinishedAt: now.Add(-10 * time.Second)}}
+		allocs = append(allocs, alloc)
+	}
+
+	reconciler := NewAllocReconciler(testLogger(), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "")
+	r := reconciler.Compute()
+
+	// Assert that rescheduled placements were created
+	assertResults(t, r, &resultExpectation{
+		place:             10,
+		createDeployment:  nil,
+		deploymentUpdates: nil,
+		desiredTGUpdates: map[string]*structs.DesiredUpdates{
+			job.TaskGroups[0].Name: {
+				Place:  10,
+				Ignore: 0,
+			},
+		},
+	})
+	assertPlaceResultsHavePreviousAllocs(t, 10, r.place)
+}
--- a/scheduler/reconcile_util.go
+++ b/scheduler/reconcile_util.go
@ -234,7 +234,7 @@ func (a allocSet) filterByTainted(nodes map[string]*structs.Node) (untainted, mi
 // untainted or a set of allocations that must be rescheduled now. Allocations that can be rescheduled
 // at a future time are also returned so that we can create follow up evaluations for them. Allocs are
 // skipped or considered untainted according to logic defined in shouldFilter method.
-func (a allocSet) filterByRescheduleable(isBatch bool, now time.Time, evalID string) (untainted, rescheduleNow allocSet, rescheduleLater []*delayedRescheduleInfo) {
+func (a allocSet) filterByRescheduleable(isBatch bool, now time.Time, evalID string, deployment *structs.Deployment) (untainted, rescheduleNow allocSet, rescheduleLater []*delayedRescheduleInfo) {
 	untainted = make(map[string]*structs.Allocation)
 	rescheduleNow = make(map[string]*structs.Allocation)

@ -257,7 +257,7 @@ func (a allocSet) filterByRescheduleable(isBatch bool, now time.Time, evalID str

 		// Only failed allocs with desired state run get to this point
 		// If the failed alloc is not eligible for rescheduling now we add it to the untainted set
-		eligibleNow, eligibleLater, rescheduleTime = updateByReschedulable(alloc, now, evalID)
+		eligibleNow, eligibleLater, rescheduleTime = updateByReschedulable(alloc, now, evalID, deployment)
 		if !eligibleNow {
 			untainted[alloc.ID] = alloc
 			if eligibleLater {
@ -320,11 +320,14 @@ func shouldFilter(alloc *structs.Allocation, isBatch bool) (untainted, ignore bo

 // updateByReschedulable is a helper method that encapsulates logic for whether a failed allocation
 // should be rescheduled now, later or left in the untainted set
-func updateByReschedulable(alloc *structs.Allocation, now time.Time, evalID string) (rescheduleNow, rescheduleLater bool, rescheduleTime time.Time) {
+func updateByReschedulable(alloc *structs.Allocation, now time.Time, evalID string, d *structs.Deployment) (rescheduleNow, rescheduleLater bool, rescheduleTime time.Time) {
+	// If the allocation is part of an ongoing active deployment, we only allow it to reschedule
+	// if it has been marked eligible
+	if alloc.DeploymentID != "" && d != nil && alloc.DeploymentID == d.ID && d.Active() && !alloc.DesiredTransition.ShouldReschedule() {
+		return
+	}

-	// If the allocation is part of a deployment, only allow it to reschedule if
-	// it has been marked eligible for it explicitly.
-	if alloc.DeploymentID != "" && !alloc.DesiredTransition.ShouldReschedule() {
+	if d != nil && alloc.DeploymentID == d.ID && d.Active() && !alloc.DesiredTransition.ShouldReschedule() {
 		return
 	}