Update alloc GC eligility logic to not rely on follow up evals

2018-04-11 13:58:02 -05:00 · 2018-04-11 13:58:02 -05:00 · 688fd9ee37
parent 59cce1d620
commit 688fd9ee37
2 changed files with 92 additions and 64 deletions
--- a/nomad/core_sched.go
+++ b/nomad/core_sched.go
@ -323,16 +323,7 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64,
 	gcEval := true
 	var gcAllocIDs []string
 	for _, alloc := range allocs {
-		var followupEval *structs.Evaluation
-		if alloc.FollowupEvalID != "" {
-			followupEval, err = c.snap.EvalByID(ws, alloc.FollowupEvalID)
-			if err != nil {
-				c.srv.logger.Printf("[ERR] sched.core: failed to lookup followup eval %s: %v",
-					eval.ID, err)
-				return false, nil, err
-			}
-		}
-		if !allocGCEligible(alloc, job, followupEval, thresholdIndex) {
+		if !allocGCEligible(alloc, job, time.Now(), thresholdIndex) {
 			// Can't GC the evaluation since not all of the allocations are
 			// terminal
 			gcEval = false
@ -608,16 +599,24 @@ func (c *CoreScheduler) partitionDeploymentReap(deployments []string) []*structs

 // allocGCEligible returns if the allocation is eligible to be garbage collected
 // according to its terminal status and its reschedule trackers
-func allocGCEligible(a *structs.Allocation, job *structs.Job, followupEval *structs.Evaluation, thresholdIndex uint64) bool {
+func allocGCEligible(a *structs.Allocation, job *structs.Job, gcTime time.Time, thresholdIndex uint64) bool {
 	// Not in a terminal status and old enough
 	if !a.TerminalStatus() || a.ModifyIndex > thresholdIndex {
 		return false
 	}

+	// If the job is deleted, stopped or dead all allocs can be removed
 	if job == nil || job.Stop || job.Status == structs.JobStatusDead {
 		return true
 	}

+	// If the alloc hasn't failed then we don't need to consider it for rescheduling
+	// Rescheduling needs to copy over information from the previous alloc so that it
+	// can enforce the reschedule policy
+	if a.ClientStatus != structs.AllocClientStatusFailed {
+		return true
+	}
+
 	var reschedulePolicy *structs.ReschedulePolicy
 	tg := job.LookupTaskGroup(a.TaskGroup)

@ -633,10 +632,21 @@ func allocGCEligible(a *structs.Allocation, job *structs.Job, followupEval *stru
 		return true
 	}

-	// If there's a follow up eval, use its status for determining GC eligibility
-	if followupEval != nil {
-		return followupEval.TerminalStatus()
+	// This task has unlimited rescheduling and the alloc has not been replaced, so we can't GC it yet
+	if reschedulePolicy.Unlimited {
+		return false
 	}

-	return true
+	// No restarts have been attempted yet
+	if a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0 {
+		return false
+	}
+
+	// Don't GC if most recent reschedule attempt is within time interval
+	interval := reschedulePolicy.Interval
+	lastIndex := len(a.RescheduleTracker.Events)
+	lastRescheduleEvent := a.RescheduleTracker.Events[lastIndex-1]
+	timeDiff := gcTime.UTC().UnixNano() - lastRescheduleEvent.RescheduleTime
+
+	return timeDiff > interval.Nanoseconds()
 }
--- a/nomad/core_sched_test.go
+++ b/nomad/core_sched_test.go
@ -139,12 +139,6 @@ func TestCoreScheduler_EvalGC_ReschedulingAllocs(t *testing.T) {
 	err = state.UpsertJob(1001, job)
 	require.Nil(err)

-	// Insert a follow up eval that's complete
-	followupEval := mock.Eval()
-	followupEval.JobID = job.ID
-	followupEval.Status = structs.EvalStatusComplete
-	err = state.UpsertEvals(1002, []*structs.Evaluation{followupEval})
-
 	// Insert failed alloc with an old reschedule attempt, can be GCed
 	alloc := mock.Alloc()
 	alloc.EvalID = eval.ID
@ -152,7 +146,7 @@ func TestCoreScheduler_EvalGC_ReschedulingAllocs(t *testing.T) {
 	alloc.ClientStatus = structs.AllocClientStatusFailed
 	alloc.JobID = eval.JobID
 	alloc.TaskGroup = job.TaskGroups[0].Name
-	alloc.FollowupEvalID = followupEval.ID
+	alloc.NextAllocation = uuid.Generate()
 	alloc.RescheduleTracker = &structs.RescheduleTracker{
 		Events: []*structs.RescheduleEvent{
 			{
@ -163,18 +157,12 @@ func TestCoreScheduler_EvalGC_ReschedulingAllocs(t *testing.T) {
 		},
 	}

-	// Insert another failed alloc with a follow up eval pending, can't be GCed
-	followupEval2 := mock.Eval()
-	followupEval2.JobID = job.ID
-	err = state.UpsertEvals(1002, []*structs.Evaluation{followupEval2})
-
 	alloc2 := mock.Alloc()
 	alloc2.EvalID = eval.ID
 	alloc2.DesiredStatus = structs.AllocDesiredStatusRun
-	alloc2.ClientStatus = structs.AllocClientStatusLost
+	alloc2.ClientStatus = structs.AllocClientStatusFailed
 	alloc2.JobID = eval.JobID
 	alloc2.TaskGroup = job.TaskGroups[0].Name
-	alloc2.FollowupEvalID = followupEval2.ID
 	alloc2.RescheduleTracker = &structs.RescheduleTracker{
 		Events: []*structs.RescheduleEvent{
 			{
@ -1811,10 +1799,10 @@ func TestCoreScheduler_PartitionJobReap(t *testing.T) {
 func TestAllocation_GCEligible(t *testing.T) {
 	type testCase struct {
 		Desc               string
+		GCTime             time.Time
 		ClientStatus       string
 		DesiredStatus      string
 		JobStatus          string
-		FollowupEvalStatus string
 		JobStop            bool
 		ModifyIndex        uint64
 		NextAllocID        string
@ -1831,6 +1819,7 @@ func TestAllocation_GCEligible(t *testing.T) {
 			Desc:           "GC when non terminal",
 			ClientStatus:   structs.AllocClientStatusPending,
 			DesiredStatus:  structs.AllocDesiredStatusRun,
+			GCTime:         fail,
 			ModifyIndex:    90,
 			ThresholdIndex: 90,
 			ShouldGC:       false,
@ -1840,6 +1829,7 @@ func TestAllocation_GCEligible(t *testing.T) {
 			ClientStatus:   structs.AllocClientStatusPending,
 			DesiredStatus:  structs.AllocDesiredStatusRun,
 			JobStop:        true,
+			GCTime:         fail,
 			ModifyIndex:    90,
 			ThresholdIndex: 90,
 			ShouldGC:       false,
@ -1849,14 +1839,26 @@ func TestAllocation_GCEligible(t *testing.T) {
 			ClientStatus:   structs.AllocClientStatusPending,
 			DesiredStatus:  structs.AllocDesiredStatusRun,
 			JobStatus:      structs.JobStatusDead,
+			GCTime:         fail,
 			ModifyIndex:    90,
 			ThresholdIndex: 90,
 			ShouldGC:       false,
 		},
+		{
+			Desc:             "GC when terminal but not failed ",
+			ClientStatus:     structs.AllocClientStatusComplete,
+			DesiredStatus:    structs.AllocDesiredStatusRun,
+			GCTime:           fail,
+			ModifyIndex:      100,
+			ThresholdIndex:   90,
+			ReschedulePolicy: nil,
+			ShouldGC:         false,
+		},
 		{
 			Desc:             "GC when threshold not met",
 			ClientStatus:     structs.AllocClientStatusComplete,
 			DesiredStatus:    structs.AllocDesiredStatusStop,
+			GCTime:           fail,
 			ModifyIndex:      100,
 			ThresholdIndex:   90,
 			ReschedulePolicy: nil,
@ -1866,38 +1868,40 @@ func TestAllocation_GCEligible(t *testing.T) {
 			Desc:             "GC when no reschedule policy",
 			ClientStatus:     structs.AllocClientStatusFailed,
 			DesiredStatus:    structs.AllocDesiredStatusRun,
+			GCTime:           fail,
 			ReschedulePolicy: nil,
 			ModifyIndex:      90,
 			ThresholdIndex:   90,
 			ShouldGC:         true,
 		},
 		{
-			Desc:             "GC with empty reschedule policy",
+			Desc:             "GC when empty policy",
 			ClientStatus:     structs.AllocClientStatusFailed,
 			DesiredStatus:    structs.AllocDesiredStatusRun,
-			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 0, Interval: 0 * time.Minute, Unlimited: false},
+			GCTime:           fail,
+			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 0, Interval: 0 * time.Minute},
 			ModifyIndex:      90,
 			ThresholdIndex:   90,
 			ShouldGC:         true,
 		},
 		{
-			Desc:               "GC with pending followup eval",
-			ClientStatus:       structs.AllocClientStatusFailed,
-			DesiredStatus:      structs.AllocDesiredStatusRun,
-			FollowupEvalStatus: structs.EvalStatusPending,
-			ModifyIndex:        90,
-			ThresholdIndex:     90,
-			ReschedulePolicy:   &structs.ReschedulePolicy{Attempts: 1, Interval: 1 * time.Minute},
-			ShouldGC:           false,
+			Desc:             "GC with no previous attempts",
+			ClientStatus:     structs.AllocClientStatusFailed,
+			DesiredStatus:    structs.AllocDesiredStatusRun,
+			GCTime:           fail,
+			ModifyIndex:      90,
+			ThresholdIndex:   90,
+			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 1, Interval: 1 * time.Minute},
+			ShouldGC:         false,
 		},
 		{
-			Desc:               "GC with blocked followup eval",
-			ClientStatus:       structs.AllocClientStatusFailed,
-			DesiredStatus:      structs.AllocDesiredStatusRun,
-			ReschedulePolicy:   &structs.ReschedulePolicy{Attempts: 2, Interval: 30 * time.Minute},
-			FollowupEvalStatus: structs.EvalStatusBlocked,
-			ModifyIndex:        90,
-			ThresholdIndex:     90,
+			Desc:             "GC with prev reschedule attempt within interval",
+			ClientStatus:     structs.AllocClientStatusFailed,
+			DesiredStatus:    structs.AllocDesiredStatusRun,
+			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 2, Interval: 30 * time.Minute},
+			GCTime:           fail,
+			ModifyIndex:      90,
+			ThresholdIndex:   90,
 			RescheduleTrackers: []*structs.RescheduleEvent{
 				{
 					RescheduleTime: fail.Add(-5 * time.Minute).UTC().UnixNano(),
@ -1906,11 +1910,11 @@ func TestAllocation_GCEligible(t *testing.T) {
 			ShouldGC: false,
 		},
 		{
-			Desc:               "GC with complete followup eval",
-			ClientStatus:       structs.AllocClientStatusFailed,
-			DesiredStatus:      structs.AllocDesiredStatusRun,
-			FollowupEvalStatus: structs.EvalStatusComplete,
-			ReschedulePolicy:   &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute},
+			Desc:             "GC with prev reschedule attempt outside interval",
+			ClientStatus:     structs.AllocClientStatusFailed,
+			DesiredStatus:    structs.AllocDesiredStatusRun,
+			GCTime:           fail,
+			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute},
 			RescheduleTrackers: []*structs.RescheduleEvent{
 				{
 					RescheduleTime: fail.Add(-45 * time.Minute).UTC().UnixNano(),
@ -1925,14 +1929,34 @@ func TestAllocation_GCEligible(t *testing.T) {
 			Desc:             "GC when next alloc id is set",
 			ClientStatus:     structs.AllocClientStatusFailed,
 			DesiredStatus:    structs.AllocDesiredStatusRun,
+			GCTime:           fail,
 			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute},
-			NextAllocID:      uuid.Generate(),
-			ShouldGC:         true,
+			RescheduleTrackers: []*structs.RescheduleEvent{
+				{
+					RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(),
+				},
+			},
+			NextAllocID: uuid.Generate(),
+			ShouldGC:    true,
+		},
+		{
+			Desc:             "GC when next alloc id is not set and unlimited restarts",
+			ClientStatus:     structs.AllocClientStatusFailed,
+			DesiredStatus:    structs.AllocDesiredStatusRun,
+			GCTime:           fail,
+			ReschedulePolicy: &structs.ReschedulePolicy{Unlimited: true, Delay: 5 * time.Second, DelayFunction: "constant"},
+			RescheduleTrackers: []*structs.RescheduleEvent{
+				{
+					RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(),
+				},
+			},
+			ShouldGC: false,
 		},
 		{
 			Desc:             "GC when job is stopped",
 			ClientStatus:     structs.AllocClientStatusFailed,
 			DesiredStatus:    structs.AllocDesiredStatusRun,
+			GCTime:           fail,
 			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute},
 			RescheduleTrackers: []*structs.RescheduleEvent{
 				{
@ -1946,6 +1970,7 @@ func TestAllocation_GCEligible(t *testing.T) {
 			Desc:             "GC when job status is dead",
 			ClientStatus:     structs.AllocClientStatusFailed,
 			DesiredStatus:    structs.AllocDesiredStatusRun,
+			GCTime:           fail,
 			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute},
 			RescheduleTrackers: []*structs.RescheduleEvent{
 				{
@ -1964,13 +1989,6 @@ func TestAllocation_GCEligible(t *testing.T) {
 		alloc.ClientStatus = tc.ClientStatus
 		alloc.RescheduleTracker = &structs.RescheduleTracker{Events: tc.RescheduleTrackers}
 		alloc.NextAllocation = tc.NextAllocID
-		var followupEval *structs.Evaluation
-
-		if tc.FollowupEvalStatus != "" {
-			followupEval = mock.Eval()
-			followupEval.Status = tc.FollowupEvalStatus
-		}
-
 		job := mock.Job()
 		alloc.TaskGroup = job.TaskGroups[0].Name
 		job.TaskGroups[0].ReschedulePolicy = tc.ReschedulePolicy
@ -1980,10 +1998,10 @@ func TestAllocation_GCEligible(t *testing.T) {
 		job.Stop = tc.JobStop

 		t.Run(tc.Desc, func(t *testing.T) {
-			if got := allocGCEligible(alloc, job, followupEval, tc.ThresholdIndex); got != tc.ShouldGC {
+			if got := allocGCEligible(alloc, job, tc.GCTime, tc.ThresholdIndex); got != tc.ShouldGC {
 				t.Fatalf("expected %v but got %v", tc.ShouldGC, got)
 			}
-		})
+		}

 	}

@ -1991,5 +2009,5 @@ func TestAllocation_GCEligible(t *testing.T) {
 	require := require.New(t)
 	alloc := mock.Alloc()
 	alloc.ClientStatus = structs.AllocClientStatusComplete
-	require.True(allocGCEligible(alloc, nil, nil, 1000))
+	require.True(allocGCEligible(alloc, nil, time.Now(), 1000))
 }