// open-nomad/nomad/core_sched.go

package nomad

import (
	"fmt"
	"math"
	"time"

	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

var (
	// maxIdsPerReap is the maximum number of evals and allocations to reap in a
	// single Raft transaction. This is to ensure that the Raft message does not
	// become too large.
	maxIdsPerReap = (1024 * 256) / 36 // 0.25 MB of ids.
)
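
// Worked example of the sizing above: a canonical UUID string is 36 bytes, so
// (1024*256)/36 works out to 7,281 IDs per reap request, keeping each Raft
// message to roughly a quarter megabyte of ID data.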

// CoreScheduler is a special "scheduler" that is registered
// as "_core". It is used to run various administrative work
// across the cluster.
type CoreScheduler struct {
	srv  *Server
	snap *state.StateSnapshot
}

// NewCoreScheduler is used to return a new core scheduler instance
func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler {
	s := &CoreScheduler{
		srv:  srv,
		snap: snap,
	}
	return s
}

// Process is used to implement the scheduler.Scheduler interface
func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
	switch eval.JobID {
	case structs.CoreJobEvalGC:
		return c.evalGC(eval)
	case structs.CoreJobNodeGC:
		return c.nodeGC(eval)
	case structs.CoreJobJobGC:
		return c.jobGC(eval)
	case structs.CoreJobForceGC:
		return c.forceGC(eval)
	default:
		return fmt.Errorf("core scheduler cannot handle job '%s'", eval.JobID)
	}
}
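
// The core-job IDs handled above are produced elsewhere in the server: the
// periodic GC cases (eval, node, job) are assumed to be enqueued on a timer by
// the leader, while CoreJobForceGC is assumed to be triggered on demand when an
// operator forces a garbage collection (assumptions; not shown in this file).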

// forceGC is used to garbage collect all eligible objects.
func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
	if err := c.jobGC(eval); err != nil {
		return err
	}
	if err := c.evalGC(eval); err != nil {
		return err
	}

	// Node GC must occur after the others to ensure the allocations are
	// cleared.
	return c.nodeGC(eval)
}

// jobGC is used to garbage collect eligible jobs.
func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error {
	// Get all the jobs eligible for garbage collection.
	iter, err := c.snap.JobsByGC(true)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced job GC")
	} else {
		// Get the time table to calculate GC cutoffs.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: job GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.JobGCThreshold)
	}
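
	// Illustration of the cutoff math above (the threshold value is assumed,
	// not taken from this file): with a JobGCThreshold of 4h, cutoff is "now
	// minus four hours" and TimeTable.NearestIndex maps that wall-clock time to
	// the closest recorded Raft index, so only jobs created at or before that
	// index are considered below.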

	// Collect the allocations, evaluations and jobs to GC
	var gcAlloc, gcEval, gcJob []string

OUTER:
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)

		// Ignore new jobs.
		if job.CreateIndex > oldThreshold {
			continue
		}

		evals, err := c.snap.EvalsByJob(job.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get evals for job %s: %v", job.ID, err)
			continue
		}

		allEvalsGC := true
		var jobAlloc, jobEval []string
		for _, eval := range evals {
			gc, allocs, err := c.gcEval(eval, oldThreshold, true)
			if err != nil {
				continue OUTER
			}

			if gc {
				jobEval = append(jobEval, eval.ID)
				jobAlloc = append(jobAlloc, allocs...)
			} else {
				allEvalsGC = false
				break
			}
		}

		// Job is eligible for garbage collection
		if allEvalsGC {
			gcJob = append(gcJob, job.ID)
			gcAlloc = append(gcAlloc, jobAlloc...)
			gcEval = append(gcEval, jobEval...)
		}
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: job GC: %d jobs, %d evaluations, %d allocs eligible",
		len(gcJob), len(gcEval), len(gcAlloc))

	// Reap the evals and allocs
	if err := c.evalReap(gcEval, gcAlloc); err != nil {
		return err
	}

	// Call to the leader to deregister the jobs.
	for _, job := range gcJob {
		req := structs.JobDeregisterRequest{
			JobID: job,
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		var resp structs.JobDeregisterResponse
		if err := c.srv.RPC("Job.Deregister", &req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: job deregister failed: %v", err)
			return err
		}
	}

	return nil
}
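
// Put differently, jobGC only deregisters a job when every one of its
// evaluations (and all of their allocations) is individually eligible, so a
// single running allocation keeps the job and its history in state.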

// evalGC is used to garbage collect old evaluations
func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
	// Iterate over the evaluations
	iter, err := c.snap.Evals()
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced eval GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: eval GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.EvalGCThreshold)
	}

	// Collect the allocations and evaluations to GC
	var gcAlloc, gcEval []string
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		eval := raw.(*structs.Evaluation)

		// The evaluation GC should not handle batch jobs since those need to be
		// garbage collected in one shot.
		// XXX: we believe there is a bug where, if a batch job gets stopped,
		// there is no way for its evals/allocs to be GC'd.
		gc, allocs, err := c.gcEval(eval, oldThreshold, false)
		if err != nil {
			return err
		}

		if gc {
			gcEval = append(gcEval, eval.ID)
		}
		gcAlloc = append(gcAlloc, allocs...)
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: eval GC: %d evaluations, %d allocs eligible",
		len(gcEval), len(gcAlloc))

	return c.evalReap(gcEval, gcAlloc)
}
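
// Note that gcAlloc above is extended with the eligible allocation IDs even
// when the evaluation itself is kept, so old terminal allocations can be
// reaped out from under an evaluation that still has live ones.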

// gcEval returns whether the eval should be garbage collected given a Raft
// threshold index. The eval is ineligible for garbage collection if it or any
// of its allocs are newer than the threshold. If the eval should be garbage
// collected, the alloc IDs that should be removed along with it are also
// returned.
func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, allowBatch bool) (
	bool, []string, error) {
	// Ignore non-terminal and new evaluations
	if !eval.TerminalStatus() || eval.ModifyIndex > thresholdIndex {
		return false, nil, nil
	}

	// If the eval is from a running "batch" job we don't want to garbage
	// collect its allocations. If there is a long running batch job and its
	// terminal allocations get GC'd, the scheduler would re-run the
	// allocations.
	if eval.Type == structs.JobTypeBatch {
		if !allowBatch {
			return false, nil, nil
		}

		// Check if the job is running
		job, err := c.snap.JobByID(eval.JobID)
		if err != nil {
			return false, nil, err
		}

		// We don't want to GC anything related to a job which is not dead
		if job != nil && job.Status != structs.JobStatusDead {
			return false, nil, nil
		}
	}

	// Get the allocations by eval
	allocs, err := c.snap.AllocsByEval(eval.ID)
	if err != nil {
		c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for eval %s: %v",
			eval.ID, err)
		return false, nil, err
	}

	// Scan the allocations to ensure they are terminal and old
	gcEval := true
	var gcAllocIDs []string
	for _, alloc := range allocs {
		if !alloc.TerminalStatus() || alloc.ModifyIndex > thresholdIndex {
			// Can't GC the evaluation since not all of the allocations are
			// terminal
			gcEval = false
		} else {
			// The allocation is eligible to be GC'd
			gcAllocIDs = append(gcAllocIDs, alloc.ID)
		}
	}

	return gcEval, gcAllocIDs, nil
}
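
// For example, a terminal eval older than the threshold that owns three
// allocations, two terminal and old and one still running, yields
// (false, [the two old alloc IDs], nil): the eval is kept, but its finished
// allocations can still be reaped by the callers above.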

// evalReap contacts the leader and issues a reap on the passed evals and
// allocs.
func (c *CoreScheduler) evalReap(evals, allocs []string) error {
	// Call to the leader to issue the reap
	for _, req := range c.partitionReap(evals, allocs) {
		var resp structs.GenericResponse
		if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: eval reap failed: %v", err)
			return err
		}
	}

	return nil
}

// partitionReap returns a list of EvalDeleteRequest to make, ensuring a single
// request does not contain too many allocations and evaluations. This is
// necessary to ensure that the Raft transaction does not become too large.
func (c *CoreScheduler) partitionReap(evals, allocs []string) []*structs.EvalDeleteRequest {
	var requests []*structs.EvalDeleteRequest
	submittedEvals, submittedAllocs := 0, 0
	for submittedEvals != len(evals) || submittedAllocs != len(allocs) {
		req := &structs.EvalDeleteRequest{
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		requests = append(requests, req)
		available := maxIdsPerReap

		// Add the allocs first
		if remaining := len(allocs) - submittedAllocs; remaining > 0 {
			if remaining <= available {
				req.Allocs = allocs[submittedAllocs:]
				available -= remaining
				submittedAllocs += remaining
			} else {
				req.Allocs = allocs[submittedAllocs : submittedAllocs+available]
				submittedAllocs += available

				// Exhausted space so skip adding evals
				continue
			}
		}

		// Add the evals
		if remaining := len(evals) - submittedEvals; remaining > 0 {
			if remaining <= available {
				req.Evals = evals[submittedEvals:]
				submittedEvals += remaining
			} else {
				req.Evals = evals[submittedEvals : submittedEvals+available]
				submittedEvals += available
			}
		}
	}

	return requests
}
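
// Rough illustration of the partitioning: with maxIdsPerReap at 7,281, reaping
// 10,000 allocation IDs and 50 evaluation IDs yields two requests, the first
// carrying 7,281 allocs and the second the remaining 2,719 allocs plus all 50
// evals.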

// nodeGC is used to garbage collect old nodes
func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error {
	// Iterate over the nodes
	iter, err := c.snap.Nodes()
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced node GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: node GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.NodeGCThreshold)
	}

	// Collect the nodes to GC
	var gcNode []string
OUTER:
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		node := raw.(*structs.Node)

		// Ignore non-terminal and new nodes
		if !node.TerminalStatus() || node.ModifyIndex > oldThreshold {
			continue
		}

		// Get the allocations by node
		allocs, err := c.snap.AllocsByNode(node.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for node %s: %v",
				node.ID, err)
			continue
		}

		// If there are any non-terminal allocations, skip the node. If the node
		// is terminal and the allocations are not, the scheduler may not have
		// run yet to transition the allocs on the node to terminal. We delay
		// GC'ing until this happens.
		for _, alloc := range allocs {
			if !alloc.TerminalStatus() {
				continue OUTER
			}
		}

		// Node is eligible for garbage collection
		gcNode = append(gcNode, node.ID)
	}

	// Fast-path the nothing case
	if len(gcNode) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: node GC: %d nodes eligible", len(gcNode))

	// Call to the leader to issue the reap
	for _, nodeID := range gcNode {
		req := structs.NodeDeregisterRequest{
			NodeID: nodeID,
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		var resp structs.NodeUpdateResponse
		if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: node '%s' reap failed: %v", nodeID, err)
			return err
		}
	}

	return nil
}