Merge pull request #1086 from hashicorp/b-many-allocs

Fix drained/batch allocations from continually migrating
2016-04-13 09:42:17 -07:00 · 2016-04-13 09:42:17 -07:00 · c5d8c862c3
parent 5d1995080e 1a31e5e137
commit c5d8c862c3
2 changed files with 71 additions and 7 deletions
--- a/scheduler/generic_sched.go
+++ b/scheduler/generic_sched.go
@ -231,12 +231,23 @@ func (s *GenericScheduler) process() (bool, error) {
 // re-placed.
 func (s *GenericScheduler) filterCompleteAllocs(allocs []*structs.Allocation) []*structs.Allocation {
 	filter := func(a *structs.Allocation) bool {
-		// Allocs from batch jobs should be filtered when their status is failed
-		// so that they will be replaced. If they are complete but not failed, they
-		// shouldn't be replaced.
 		if s.batch {
-			return a.TerminalStatus() &&
-				a.ClientStatus != structs.AllocClientStatusComplete
+			// Allocs from batch jobs should be filtered when the desired status
+			// is terminal or when the client status is failed so that they will
+			// be replaced. If they are complete but not failed, they shouldn't
+			// be replaced.
+			switch a.DesiredStatus {
+			case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict, structs.AllocDesiredStatusFailed:
+				return true
+			default:
+			}
+
+			switch a.ClientStatus {
+			case structs.AllocClientStatusFailed:
+				return true
+			default:
+				return false
+			}
 		}

 		// Filter terminal, non batch allocations
--- a/scheduler/generic_sched_test.go
+++ b/scheduler/generic_sched_test.go
@ -1079,7 +1079,7 @@ func TestServiceSched_RetryLimit(t *testing.T) {
 	h.AssertEvalStatus(t, structs.EvalStatusFailed)
 }

-func TestBatchSched_Run_DeadAlloc(t *testing.T) {
+func TestBatchSched_Run_CompleteAlloc(t *testing.T) {
 	h := NewHarness(t)

 	// Create a node
@ -1091,7 +1091,7 @@ func TestBatchSched_Run_DeadAlloc(t *testing.T) {
 	job.TaskGroups[0].Count = 1
 	noErr(t, h.State.UpsertJob(h.NextIndex(), job))

-	// Create a failed alloc
+	// Create a complete alloc
 	alloc := mock.Alloc()
 	alloc.Job = job
 	alloc.JobID = job.ID
@ -1131,6 +1131,59 @@ func TestBatchSched_Run_DeadAlloc(t *testing.T) {
 	h.AssertEvalStatus(t, structs.EvalStatusComplete)
 }

+func TestBatchSched_Run_DrainedAlloc(t *testing.T) {
+	h := NewHarness(t)
+
+	// Create a node
+	node := mock.Node()
+	noErr(t, h.State.UpsertNode(h.NextIndex(), node))
+
+	// Create a job
+	job := mock.Job()
+	job.TaskGroups[0].Count = 1
+	noErr(t, h.State.UpsertJob(h.NextIndex(), job))
+
+	// Create a complete alloc
+	alloc := mock.Alloc()
+	alloc.Job = job
+	alloc.JobID = job.ID
+	alloc.NodeID = node.ID
+	alloc.Name = "my-job.web[0]"
+	alloc.DesiredStatus = structs.AllocDesiredStatusStop
+	alloc.ClientStatus = structs.AllocClientStatusComplete
+	noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc}))
+
+	// Create a mock evaluation to register the job
+	eval := &structs.Evaluation{
+		ID:          structs.GenerateUUID(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job.ID,
+	}
+
+	// Process the evaluation
+	err := h.Process(NewBatchScheduler, eval)
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+
+	// Ensure a plan
+	if len(h.Plans) != 1 {
+		t.Fatalf("bad: %#v", h.Plans)
+	}
+
+	// Lookup the allocations by JobID
+	out, err := h.State.AllocsByJob(job.ID)
+	noErr(t, err)
+
+	// Ensure a replacement alloc was placed.
+	if len(out) != 2 {
+		t.Fatalf("bad: %#v", out)
+	}
+
+	h.AssertEvalStatus(t, structs.EvalStatusComplete)
+}
+
 func TestBatchSched_Run_FailedAlloc(t *testing.T) {
 	h := NewHarness(t)