diff --git a/nomad/core_sched.go b/nomad/core_sched.go index 7e8566f33..0786af497 100644 --- a/nomad/core_sched.go +++ b/nomad/core_sched.go @@ -258,7 +258,16 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, // Job doesn't exist // Job is Stopped and dead // allowBatch and the job is dead - collect := shouldCollect(job, allowBatch) + collect := false + if job == nil { + collect = true + } else if job.Status != structs.JobStatusDead { + collect = false + } else if job.Stop { + collect = true + } else if allowBatch { + collect = true + } // We don't want to gc anything related to a job which is not dead // If the batch job doesn't exist we can GC it regardless of allowBatch @@ -279,18 +288,7 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, gcEval := true var gcAllocIDs []string for _, alloc := range allocs { - if job == nil || job.Stop || job.Status == structs.JobStatusDead { - // Eligible to be GC'd because the job is not around, stopped or dead - gcAllocIDs = append(gcAllocIDs, alloc.ID) - continue - } - var reschedulePolicy *structs.ReschedulePolicy - tg := job.LookupTaskGroup(alloc.TaskGroup) - - if tg != nil { - reschedulePolicy = tg.ReschedulePolicy - } - if !gcEligible(alloc, reschedulePolicy, time.Now(), thresholdIndex) { + if !allocGCEligible(alloc, job, time.Now(), thresholdIndex) { // Can't GC the evaluation since not all of the allocations are // terminal gcEval = false @@ -303,21 +301,6 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, return gcEval, gcAllocIDs, nil } -// shouldCollect is a helper function that determines whether the job is eligible for GC -func shouldCollect(job *structs.Job, allowBatch bool) bool { - collect := false - if job == nil { - collect = true - } else if job.Status != structs.JobStatusDead { - collect = false - } else if job.Stop { - collect = true - } else if allowBatch { - collect = true - } - return collect -} - // evalReap contacts the leader and issues a reap on the passed evals and // allocs. func (c *CoreScheduler) evalReap(evals, allocs []string) error { @@ -579,13 +562,24 @@ func (c *CoreScheduler) partitionDeploymentReap(deployments []string) []*structs return requests } -// gcEligible returns if the allocation is eligible to be garbage collected +// allocGCEligible returns if the allocation is eligible to be garbage collected // according to its terminal status and its reschedule trackers -func gcEligible(a *structs.Allocation, reschedulePolicy *structs.ReschedulePolicy, gcTime time.Time, thresholdIndex uint64) bool { +func allocGCEligible(a *structs.Allocation, job *structs.Job, gcTime time.Time, thresholdIndex uint64) bool { // Not in a terminal status and old enough if !a.TerminalStatus() || a.ModifyIndex > thresholdIndex { return false } + + if job == nil || job.Stop || job.Status == structs.JobStatusDead { + return true + } + + var reschedulePolicy *structs.ReschedulePolicy + tg := job.LookupTaskGroup(a.TaskGroup) + + if tg != nil { + reschedulePolicy = tg.ReschedulePolicy + } // No reschedule policy or restarts are disabled if reschedulePolicy == nil || reschedulePolicy.Attempts == 0 || reschedulePolicy.Interval == 0 { return true diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index eb2ffda31..36c61c530 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -1774,6 +1774,8 @@ func TestAllocation_GCEligible(t *testing.T) { GCTime time.Time ClientStatus string DesiredStatus string + JobStatus string + JobStop bool ModifyIndex uint64 NextAllocID string ReschedulePolicy *structs.ReschedulePolicy @@ -1794,6 +1796,26 @@ func TestAllocation_GCEligible(t *testing.T) { ThresholdIndex: 90, ShouldGC: false, }, + { + Desc: "GC when non terminal and job stopped", + ClientStatus: structs.AllocClientStatusPending, + DesiredStatus: structs.AllocDesiredStatusRun, + JobStop: true, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: false, + }, + { + Desc: "GC when non terminal and job dead", + ClientStatus: structs.AllocClientStatusPending, + DesiredStatus: structs.AllocDesiredStatusRun, + JobStatus: structs.JobStatusDead, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: false, + }, { Desc: "GC when threshold not met", ClientStatus: structs.AllocClientStatusComplete, @@ -1879,6 +1901,34 @@ func TestAllocation_GCEligible(t *testing.T) { NextAllocID: uuid.Generate(), ShouldGC: true, }, + { + Desc: "GC when job is stopped", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: &structs.ReschedulePolicy{5, 30 * time.Minute}, + RescheduleTrackers: []*structs.RescheduleEvent{ + { + RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), + }, + }, + JobStop: true, + ShouldGC: true, + }, + { + Desc: "GC when job status is dead", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: &structs.ReschedulePolicy{5, 30 * time.Minute}, + RescheduleTrackers: []*structs.RescheduleEvent{ + { + RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), + }, + }, + JobStatus: structs.JobStatusDead, + ShouldGC: true, + }, } for _, tc := range harness { @@ -1887,12 +1937,26 @@ func TestAllocation_GCEligible(t *testing.T) { alloc.DesiredStatus = tc.DesiredStatus alloc.ClientStatus = tc.ClientStatus alloc.RescheduleTracker = &structs.RescheduleTracker{tc.RescheduleTrackers} + alloc.NextAllocation = tc.NextAllocID + job := mock.Job() + alloc.TaskGroup = job.TaskGroups[0].Name + job.TaskGroups[0].ReschedulePolicy = tc.ReschedulePolicy + if tc.JobStatus != "" { + job.Status = tc.JobStatus + } + job.Stop = tc.JobStop t.Run(tc.Desc, func(t *testing.T) { - if got := gcEligible(alloc, tc.ReschedulePolicy, tc.GCTime, tc.ThresholdIndex); got != tc.ShouldGC { + if got := allocGCEligible(alloc, job, tc.GCTime, tc.ThresholdIndex); got != tc.ShouldGC { t.Fatalf("expected %v but got %v", tc.ShouldGC, got) } }) } + + // Verify nil job + require := require.New(t) + alloc := mock.Alloc() + alloc.ClientStatus = structs.AllocClientStatusComplete + require.True(allocGCEligible(alloc, nil, time.Now(), 1000)) }