2015-08-15 19:38:58 +00:00
|
|
|
package nomad
|
|
|
|
|
|
|
|
import (
|
2015-08-15 23:07:50 +00:00
|
|
|
"fmt"
|
2016-02-20 23:50:41 +00:00
|
|
|
"math"
|
2020-02-19 14:05:33 +00:00
|
|
|
"strings"
|
2015-08-16 00:42:51 +00:00
|
|
|
"time"
|
2015-08-15 23:07:50 +00:00
|
|
|
|
2018-09-15 23:23:13 +00:00
|
|
|
log "github.com/hashicorp/go-hclog"
|
2017-02-08 04:31:23 +00:00
|
|
|
memdb "github.com/hashicorp/go-memdb"
|
2020-02-19 14:05:33 +00:00
|
|
|
multierror "github.com/hashicorp/go-multierror"
|
2019-06-07 14:57:57 +00:00
|
|
|
version "github.com/hashicorp/go-version"
|
2020-04-02 20:04:56 +00:00
|
|
|
cstructs "github.com/hashicorp/nomad/client/structs"
|
2015-08-15 19:38:58 +00:00
|
|
|
"github.com/hashicorp/nomad/nomad/state"
|
|
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
|
|
"github.com/hashicorp/nomad/scheduler"
|
|
|
|
)
|
|
|
|
|
2016-03-30 22:17:13 +00:00
|
|
|
// maxIdsPerReap bounds how many evaluation and allocation IDs are placed in
// one Raft transaction while reaping, so that a single Raft message cannot
// grow too large. The budget is 0.25 MB worth of IDs; the divisor 36 is
// presumably the length of one ID string (TODO confirm).
var maxIdsPerReap = (1024 * 256) / 36
|
|
|
|
|
2015-08-15 19:38:58 +00:00
|
|
|
// CoreScheduler is a special "scheduler" that is registered
|
|
|
|
// as "_core". It is used to run various administrative work
|
|
|
|
// across the cluster.
|
|
|
|
type CoreScheduler struct {
|
2018-09-15 23:23:13 +00:00
|
|
|
srv *Server
|
|
|
|
snap *state.StateSnapshot
|
|
|
|
logger log.Logger
|
2015-08-15 19:38:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// NewCoreScheduler is used to return a new system scheduler instance
|
|
|
|
func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler {
|
|
|
|
s := &CoreScheduler{
|
2018-09-15 23:23:13 +00:00
|
|
|
srv: srv,
|
|
|
|
snap: snap,
|
|
|
|
logger: srv.logger.ResetNamed("core.sched"),
|
2015-08-15 19:38:58 +00:00
|
|
|
}
|
|
|
|
return s
|
|
|
|
}
|
|
|
|
|
|
|
|
// Process is used to implement the scheduler.Scheduler interface
|
2016-04-08 18:42:02 +00:00
|
|
|
func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
|
2020-02-19 14:05:33 +00:00
|
|
|
job := strings.Split(eval.JobID, ":") // extra data can be smuggled in w/ JobID
|
|
|
|
switch job[0] {
|
2015-08-15 23:07:50 +00:00
|
|
|
case structs.CoreJobEvalGC:
|
2016-04-08 18:42:02 +00:00
|
|
|
return c.evalGC(eval)
|
2015-09-07 18:01:29 +00:00
|
|
|
case structs.CoreJobNodeGC:
|
2016-04-08 18:42:02 +00:00
|
|
|
return c.nodeGC(eval)
|
2015-12-15 03:20:57 +00:00
|
|
|
case structs.CoreJobJobGC:
|
2016-04-08 18:42:02 +00:00
|
|
|
return c.jobGC(eval)
|
2017-06-29 19:32:37 +00:00
|
|
|
case structs.CoreJobDeploymentGC:
|
|
|
|
return c.deploymentGC(eval)
|
2020-02-19 14:05:33 +00:00
|
|
|
case structs.CoreJobCSIVolumeClaimGC:
|
|
|
|
return c.csiVolumeClaimGC(eval)
|
2016-04-08 18:42:02 +00:00
|
|
|
case structs.CoreJobForceGC:
|
|
|
|
return c.forceGC(eval)
|
2015-08-15 23:07:50 +00:00
|
|
|
default:
|
|
|
|
return fmt.Errorf("core scheduler cannot handle job '%s'", eval.JobID)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-04-08 18:42:02 +00:00
|
|
|
// forceGC is used to garbage collect all eligible objects.
|
|
|
|
func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
|
|
|
|
if err := c.jobGC(eval); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if err := c.evalGC(eval); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-06-29 19:32:37 +00:00
|
|
|
if err := c.deploymentGC(eval); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-04-08 18:42:02 +00:00
|
|
|
|
|
|
|
// Node GC must occur after the others to ensure the allocations are
|
|
|
|
// cleared.
|
|
|
|
return c.nodeGC(eval)
|
|
|
|
}
|
|
|
|
|
2015-12-15 03:20:57 +00:00
|
|
|
// jobGC is used to garbage collect eligible jobs.
|
|
|
|
func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error {
|
|
|
|
// Get all the jobs eligible for garbage collection.
|
2017-02-08 04:31:23 +00:00
|
|
|
ws := memdb.NewWatchSet()
|
|
|
|
iter, err := c.snap.JobsByGC(ws, true)
|
2015-12-15 03:20:57 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2016-02-20 23:50:41 +00:00
|
|
|
var oldThreshold uint64
|
2016-04-08 18:42:02 +00:00
|
|
|
if eval.JobID == structs.CoreJobForceGC {
|
2016-02-20 23:50:41 +00:00
|
|
|
// The GC was forced, so set the threshold to its maximum so everything
|
|
|
|
// will GC.
|
|
|
|
oldThreshold = math.MaxUint64
|
2018-09-15 23:23:13 +00:00
|
|
|
c.logger.Debug("forced job GC")
|
2016-02-20 23:50:41 +00:00
|
|
|
} else {
|
|
|
|
// Get the time table to calculate GC cutoffs.
|
|
|
|
tt := c.srv.fsm.TimeTable()
|
|
|
|
cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold)
|
|
|
|
oldThreshold = tt.NearestIndex(cutoff)
|
2018-09-15 23:23:13 +00:00
|
|
|
c.logger.Debug("job GC scanning before cutoff index",
|
|
|
|
"index", oldThreshold, "job_gc_threshold", c.srv.config.JobGCThreshold)
|
2016-02-20 23:50:41 +00:00
|
|
|
}
|
2015-12-15 03:20:57 +00:00
|
|
|
|
2015-12-16 22:27:40 +00:00
|
|
|
// Collect the allocations, evaluations and jobs to GC
|
2017-09-07 23:56:15 +00:00
|
|
|
var gcAlloc, gcEval []string
|
|
|
|
var gcJob []*structs.Job
|
2015-12-15 03:20:57 +00:00
|
|
|
|
|
|
|
OUTER:
|
2015-12-16 22:27:40 +00:00
|
|
|
for i := iter.Next(); i != nil; i = iter.Next() {
|
2015-12-15 03:20:57 +00:00
|
|
|
job := i.(*structs.Job)
|
|
|
|
|
|
|
|
// Ignore new jobs.
|
|
|
|
if job.CreateIndex > oldThreshold {
|
2015-12-16 22:27:40 +00:00
|
|
|
continue
|
2015-12-15 03:20:57 +00:00
|
|
|
}
|
|
|
|
|
2017-02-08 04:31:23 +00:00
|
|
|
ws := memdb.NewWatchSet()
|
2017-09-07 23:56:15 +00:00
|
|
|
evals, err := c.snap.EvalsByJob(ws, job.Namespace, job.ID)
|
2015-12-15 03:20:57 +00:00
|
|
|
if err != nil {
|
2018-09-15 23:23:13 +00:00
|
|
|
c.logger.Error("job GC failed to get evals for job", "job", job.ID, "error", err)
|
2015-12-15 03:20:57 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2016-06-11 01:32:37 +00:00
|
|
|
allEvalsGC := true
|
2016-06-27 22:47:49 +00:00
|
|
|
var jobAlloc, jobEval []string
|
2015-12-15 03:20:57 +00:00
|
|
|
for _, eval := range evals {
|
2016-06-27 22:47:49 +00:00
|
|
|
gc, allocs, err := c.gcEval(eval, oldThreshold, true)
|
2016-06-11 01:32:37 +00:00
|
|
|
if err != nil {
|
2015-12-15 03:20:57 +00:00
|
|
|
continue OUTER
|
|
|
|
}
|
|
|
|
|
2016-06-11 01:32:37 +00:00
|
|
|
if gc {
|
2016-06-27 22:47:49 +00:00
|
|
|
jobEval = append(jobEval, eval.ID)
|
|
|
|
jobAlloc = append(jobAlloc, allocs...)
|
|
|
|
} else {
|
|
|
|
allEvalsGC = false
|
|
|
|
break
|
2016-06-11 01:32:37 +00:00
|
|
|
}
|
2015-12-15 03:20:57 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Job is eligible for garbage collection
|
2016-06-11 01:32:37 +00:00
|
|
|
if allEvalsGC {
|
2017-09-07 23:56:15 +00:00
|
|
|
gcJob = append(gcJob, job)
|
2016-06-27 22:47:49 +00:00
|
|
|
gcAlloc = append(gcAlloc, jobAlloc...)
|
|
|
|
gcEval = append(gcEval, jobEval...)
|
2016-06-11 01:32:37 +00:00
|
|
|
}
|
2020-02-19 14:05:33 +00:00
|
|
|
|
2015-12-15 03:20:57 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Fast-path the nothing case
|
|
|
|
if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
2018-09-15 23:23:13 +00:00
|
|
|
c.logger.Debug("job GC found eligible objects",
|
|
|
|
"jobs", len(gcJob), "evals", len(gcEval), "allocs", len(gcAlloc))
|
2015-12-15 03:20:57 +00:00
|
|
|
|
2020-02-19 14:05:33 +00:00
|
|
|
// Clean up any outstanding volume claims
|
|
|
|
if err := c.volumeClaimReap(gcJob, eval.LeaderACL); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2015-12-15 03:20:57 +00:00
|
|
|
// Reap the evals and allocs
|
|
|
|
if err := c.evalReap(gcEval, gcAlloc); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2018-03-14 23:06:37 +00:00
|
|
|
// Reap the jobs
|
|
|
|
return c.jobReap(gcJob, eval.LeaderACL)
|
|
|
|
}
|
|
|
|
|
|
|
|
// jobReap contacts the leader and issues a reap on the passed jobs
|
|
|
|
func (c *CoreScheduler) jobReap(jobs []*structs.Job, leaderACL string) error {
|
|
|
|
// Call to the leader to issue the reap
|
|
|
|
for _, req := range c.partitionJobReap(jobs, leaderACL) {
|
|
|
|
var resp structs.JobBatchDeregisterResponse
|
|
|
|
if err := c.srv.RPC("Job.BatchDeregister", req, &resp); err != nil {
|
2018-09-15 23:23:13 +00:00
|
|
|
c.logger.Error("batch job reap failed", "error", err)
|
2018-03-14 23:06:37 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// partitionJobReap returns a list of JobBatchDeregisterRequests to make,
|
|
|
|
// ensuring a single request does not contain too many jobs. This is necessary
|
|
|
|
// to ensure that the Raft transaction does not become too large.
|
|
|
|
func (c *CoreScheduler) partitionJobReap(jobs []*structs.Job, leaderACL string) []*structs.JobBatchDeregisterRequest {
|
|
|
|
option := &structs.JobDeregisterOptions{Purge: true}
|
|
|
|
var requests []*structs.JobBatchDeregisterRequest
|
|
|
|
submittedJobs := 0
|
|
|
|
for submittedJobs != len(jobs) {
|
|
|
|
req := &structs.JobBatchDeregisterRequest{
|
|
|
|
Jobs: make(map[structs.NamespacedID]*structs.JobDeregisterOptions),
|
2015-12-15 03:20:57 +00:00
|
|
|
WriteRequest: structs.WriteRequest{
|
2017-09-07 23:56:15 +00:00
|
|
|
Region: c.srv.config.Region,
|
2018-03-14 23:06:37 +00:00
|
|
|
AuthToken: leaderACL,
|
2015-12-15 03:20:57 +00:00
|
|
|
},
|
|
|
|
}
|
2018-03-14 23:06:37 +00:00
|
|
|
requests = append(requests, req)
|
|
|
|
available := maxIdsPerReap
|
|
|
|
|
|
|
|
if remaining := len(jobs) - submittedJobs; remaining > 0 {
|
|
|
|
if remaining <= available {
|
|
|
|
for _, job := range jobs[submittedJobs:] {
|
|
|
|
jns := structs.NamespacedID{ID: job.ID, Namespace: job.Namespace}
|
|
|
|
req.Jobs[jns] = option
|
|
|
|
}
|
|
|
|
submittedJobs += remaining
|
|
|
|
} else {
|
|
|
|
for _, job := range jobs[submittedJobs : submittedJobs+available] {
|
|
|
|
jns := structs.NamespacedID{ID: job.ID, Namespace: job.Namespace}
|
|
|
|
req.Jobs[jns] = option
|
|
|
|
}
|
|
|
|
submittedJobs += available
|
|
|
|
}
|
2015-12-15 03:20:57 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-03-14 23:06:37 +00:00
|
|
|
return requests
|
2015-12-15 03:20:57 +00:00
|
|
|
}
|
|
|
|
|
2015-08-15 23:07:50 +00:00
|
|
|
// evalGC is used to garbage collect old evaluations
|
|
|
|
func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
|
|
|
|
// Iterate over the evaluations
|
2017-02-08 04:31:23 +00:00
|
|
|
ws := memdb.NewWatchSet()
|
|
|
|
iter, err := c.snap.Evals(ws)
|
2015-08-15 23:07:50 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2016-02-20 23:50:41 +00:00
|
|
|
var oldThreshold uint64
|
2016-04-08 18:42:02 +00:00
|
|
|
if eval.JobID == structs.CoreJobForceGC {
|
2016-02-20 23:50:41 +00:00
|
|
|
// The GC was forced, so set the threshold to its maximum so everything
|
|
|
|
// will GC.
|
|
|
|
oldThreshold = math.MaxUint64
|
2018-09-15 23:23:13 +00:00
|
|
|
c.logger.Debug("forced eval GC")
|
2016-02-20 23:50:41 +00:00
|
|
|
} else {
|
|
|
|
// Compute the old threshold limit for GC using the FSM
|
|
|
|
// time table. This is a rough mapping of a time to the
|
|
|
|
// Raft index it belongs to.
|
|
|
|
tt := c.srv.fsm.TimeTable()
|
|
|
|
cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold)
|
|
|
|
oldThreshold = tt.NearestIndex(cutoff)
|
2018-09-15 23:23:13 +00:00
|
|
|
c.logger.Debug("eval GC scanning before cutoff index",
|
|
|
|
"index", oldThreshold, "eval_gc_threshold", c.srv.config.EvalGCThreshold)
|
2016-02-20 23:50:41 +00:00
|
|
|
}
|
2015-08-15 23:07:50 +00:00
|
|
|
|
|
|
|
// Collect the allocations and evaluations to GC
|
|
|
|
var gcAlloc, gcEval []string
|
2015-12-15 03:20:57 +00:00
|
|
|
for raw := iter.Next(); raw != nil; raw = iter.Next() {
|
2015-08-15 23:07:50 +00:00
|
|
|
eval := raw.(*structs.Evaluation)
|
2016-03-25 23:46:48 +00:00
|
|
|
|
2016-06-27 22:47:49 +00:00
|
|
|
// The Evaluation GC should not handle batch jobs since those need to be
|
|
|
|
// garbage collected in one shot
|
|
|
|
gc, allocs, err := c.gcEval(eval, oldThreshold, false)
|
2015-08-15 23:07:50 +00:00
|
|
|
if err != nil {
|
2015-12-15 03:20:57 +00:00
|
|
|
return err
|
2015-08-15 23:07:50 +00:00
|
|
|
}
|
|
|
|
|
2015-12-15 03:20:57 +00:00
|
|
|
if gc {
|
|
|
|
gcEval = append(gcEval, eval.ID)
|
2015-08-15 23:07:50 +00:00
|
|
|
}
|
2016-06-11 01:32:37 +00:00
|
|
|
gcAlloc = append(gcAlloc, allocs...)
|
2015-08-15 23:07:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Fast-path the nothing case
|
|
|
|
if len(gcEval) == 0 && len(gcAlloc) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
2018-09-15 23:23:13 +00:00
|
|
|
c.logger.Debug("eval GC found eligibile objects",
|
|
|
|
"evals", len(gcEval), "allocs", len(gcAlloc))
|
2015-08-15 23:07:50 +00:00
|
|
|
|
2015-12-15 03:20:57 +00:00
|
|
|
return c.evalReap(gcEval, gcAlloc)
|
|
|
|
}
|
|
|
|
|
|
|
|
// gcEval returns whether the eval should be garbage collected given a raft
|
|
|
|
// threshold index. The eval disqualifies for garbage collection if it or its
|
|
|
|
// allocs are not older than the threshold. If the eval should be garbage
|
|
|
|
// collected, the associated alloc ids that should also be removed are also
|
|
|
|
// returned
|
2016-06-27 22:47:49 +00:00
|
|
|
func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, allowBatch bool) (
|
2015-12-15 03:20:57 +00:00
|
|
|
bool, []string, error) {
|
|
|
|
// Ignore non-terminal and new evaluations
|
|
|
|
if !eval.TerminalStatus() || eval.ModifyIndex > thresholdIndex {
|
|
|
|
return false, nil, nil
|
|
|
|
}
|
|
|
|
|
2017-02-08 04:31:23 +00:00
|
|
|
// Create a watchset
|
|
|
|
ws := memdb.NewWatchSet()
|
|
|
|
|
2018-01-22 22:31:38 +00:00
|
|
|
// Look up the job
|
|
|
|
job, err := c.snap.JobByID(ws, eval.Namespace, eval.JobID)
|
|
|
|
if err != nil {
|
|
|
|
return false, nil, err
|
|
|
|
}
|
|
|
|
|
2018-11-01 05:02:26 +00:00
|
|
|
// Get the allocations by eval
|
|
|
|
allocs, err := c.snap.AllocsByEval(ws, eval.ID)
|
|
|
|
if err != nil {
|
|
|
|
c.logger.Error("failed to get allocs for eval",
|
|
|
|
"eval_id", eval.ID, "error", err)
|
|
|
|
return false, nil, err
|
|
|
|
}
|
|
|
|
|
2016-06-11 01:32:37 +00:00
|
|
|
// If the eval is from a running "batch" job we don't want to garbage
|
|
|
|
// collect its allocations. If there is a long running batch job and its
|
|
|
|
// terminal allocations get GC'd the scheduler would re-run the
|
|
|
|
// allocations.
|
|
|
|
if eval.Type == structs.JobTypeBatch {
|
|
|
|
// Check if the job is running
|
|
|
|
|
2017-04-15 23:47:19 +00:00
|
|
|
// Can collect if:
|
|
|
|
// Job doesn't exist
|
|
|
|
// Job is Stopped and dead
|
|
|
|
// allowBatch and the job is dead
|
2018-01-30 22:14:53 +00:00
|
|
|
collect := false
|
|
|
|
if job == nil {
|
|
|
|
collect = true
|
|
|
|
} else if job.Status != structs.JobStatusDead {
|
|
|
|
collect = false
|
|
|
|
} else if job.Stop {
|
|
|
|
collect = true
|
|
|
|
} else if allowBatch {
|
|
|
|
collect = true
|
|
|
|
}
|
2017-04-15 23:47:19 +00:00
|
|
|
|
2016-06-22 18:40:27 +00:00
|
|
|
// We don't want to gc anything related to a job which is not dead
|
2017-03-11 23:48:57 +00:00
|
|
|
// If the batch job doesn't exist we can GC it regardless of allowBatch
|
2017-04-15 23:47:19 +00:00
|
|
|
if !collect {
|
2018-11-09 17:44:21 +00:00
|
|
|
// Find allocs associated with older (based on createindex) and GC them if terminal
|
2018-11-01 05:02:26 +00:00
|
|
|
oldAllocs := olderVersionTerminalAllocs(allocs, job)
|
|
|
|
return false, oldAllocs, nil
|
2016-06-11 01:32:37 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-12-15 03:20:57 +00:00
|
|
|
// Scan the allocations to ensure they are terminal and old
|
2016-06-11 01:32:37 +00:00
|
|
|
gcEval := true
|
|
|
|
var gcAllocIDs []string
|
2015-12-15 03:20:57 +00:00
|
|
|
for _, alloc := range allocs {
|
2018-04-11 18:58:02 +00:00
|
|
|
if !allocGCEligible(alloc, job, time.Now(), thresholdIndex) {
|
2016-06-11 01:32:37 +00:00
|
|
|
// Can't GC the evaluation since not all of the allocations are
|
|
|
|
// terminal
|
|
|
|
gcEval = false
|
|
|
|
} else {
|
|
|
|
// The allocation is eligible to be GC'd
|
|
|
|
gcAllocIDs = append(gcAllocIDs, alloc.ID)
|
2015-12-15 03:20:57 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-06-11 01:32:37 +00:00
|
|
|
return gcEval, gcAllocIDs, nil
|
2015-12-15 03:20:57 +00:00
|
|
|
}
|
|
|
|
|
2018-11-09 17:44:21 +00:00
|
|
|
// olderVersionTerminalAllocs returns terminal allocations whose job create index
|
|
|
|
// is older than the job's create index
|
2018-11-01 05:02:26 +00:00
|
|
|
func olderVersionTerminalAllocs(allocs []*structs.Allocation, job *structs.Job) []string {
|
|
|
|
var ret []string
|
|
|
|
for _, alloc := range allocs {
|
2018-11-09 17:44:21 +00:00
|
|
|
if alloc.Job != nil && alloc.Job.CreateIndex < job.CreateIndex && alloc.TerminalStatus() {
|
2018-11-01 05:02:26 +00:00
|
|
|
ret = append(ret, alloc.ID)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ret
|
|
|
|
}
|
|
|
|
|
2015-12-15 03:20:57 +00:00
|
|
|
// evalReap contacts the leader and issues a reap on the passed evals and
|
|
|
|
// allocs.
|
|
|
|
func (c *CoreScheduler) evalReap(evals, allocs []string) error {
|
2015-08-15 23:07:50 +00:00
|
|
|
// Call to the leader to issue the reap
|
2017-06-29 19:32:37 +00:00
|
|
|
for _, req := range c.partitionEvalReap(evals, allocs) {
|
2016-03-30 22:17:13 +00:00
|
|
|
var resp structs.GenericResponse
|
|
|
|
if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil {
|
2018-09-15 23:23:13 +00:00
|
|
|
c.logger.Error("eval reap failed", "error", err)
|
2016-03-30 22:17:13 +00:00
|
|
|
return err
|
|
|
|
}
|
2015-08-15 23:07:50 +00:00
|
|
|
}
|
2015-12-15 03:20:57 +00:00
|
|
|
|
2015-08-15 19:38:58 +00:00
|
|
|
return nil
|
|
|
|
}
|
2015-09-07 18:01:29 +00:00
|
|
|
|
2017-06-29 19:32:37 +00:00
|
|
|
// partitionEvalReap returns a list of EvalDeleteRequest to make, ensuring a single
|
2016-03-30 22:17:13 +00:00
|
|
|
// request does not contain too many allocations and evaluations. This is
|
|
|
|
// necessary to ensure that the Raft transaction does not become too large.
|
2017-06-29 19:32:37 +00:00
|
|
|
func (c *CoreScheduler) partitionEvalReap(evals, allocs []string) []*structs.EvalDeleteRequest {
|
2016-03-30 22:17:13 +00:00
|
|
|
var requests []*structs.EvalDeleteRequest
|
2016-04-14 18:41:04 +00:00
|
|
|
submittedEvals, submittedAllocs := 0, 0
|
2016-03-30 22:17:13 +00:00
|
|
|
for submittedEvals != len(evals) || submittedAllocs != len(allocs) {
|
|
|
|
req := &structs.EvalDeleteRequest{
|
|
|
|
WriteRequest: structs.WriteRequest{
|
|
|
|
Region: c.srv.config.Region,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
requests = append(requests, req)
|
|
|
|
available := maxIdsPerReap
|
|
|
|
|
2016-04-14 18:41:04 +00:00
|
|
|
// Add the allocs first
|
|
|
|
if remaining := len(allocs) - submittedAllocs; remaining > 0 {
|
2016-03-30 22:17:13 +00:00
|
|
|
if remaining <= available {
|
2016-04-14 18:41:04 +00:00
|
|
|
req.Allocs = allocs[submittedAllocs:]
|
2016-03-30 22:17:13 +00:00
|
|
|
available -= remaining
|
2016-04-14 18:41:04 +00:00
|
|
|
submittedAllocs += remaining
|
2016-03-30 22:17:13 +00:00
|
|
|
} else {
|
2016-04-14 18:41:04 +00:00
|
|
|
req.Allocs = allocs[submittedAllocs : submittedAllocs+available]
|
|
|
|
submittedAllocs += available
|
2016-03-30 22:17:13 +00:00
|
|
|
|
2016-04-14 18:41:04 +00:00
|
|
|
// Exhausted space so skip adding evals
|
2016-03-30 22:17:13 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-04-14 18:41:04 +00:00
|
|
|
// Add the evals
|
|
|
|
if remaining := len(evals) - submittedEvals; remaining > 0 {
|
2016-03-30 22:17:13 +00:00
|
|
|
if remaining <= available {
|
2016-04-14 18:41:04 +00:00
|
|
|
req.Evals = evals[submittedEvals:]
|
|
|
|
submittedEvals += remaining
|
2016-03-30 22:17:13 +00:00
|
|
|
} else {
|
2016-04-14 18:41:04 +00:00
|
|
|
req.Evals = evals[submittedEvals : submittedEvals+available]
|
|
|
|
submittedEvals += available
|
2016-03-30 22:17:13 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return requests
|
|
|
|
}
|
|
|
|
|
2015-09-07 18:01:29 +00:00
|
|
|
// nodeGC is used to garbage collect old nodes
|
|
|
|
func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error {
|
|
|
|
// Iterate over the evaluations
|
2017-02-08 04:31:23 +00:00
|
|
|
ws := memdb.NewWatchSet()
|
|
|
|
iter, err := c.snap.Nodes(ws)
|
2015-09-07 18:01:29 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2016-02-21 00:11:29 +00:00
|
|
|
var oldThreshold uint64
|
2016-04-08 18:42:02 +00:00
|
|
|
if eval.JobID == structs.CoreJobForceGC {
|
2016-02-21 00:11:29 +00:00
|
|
|
// The GC was forced, so set the threshold to its maximum so everything
|
|
|
|
// will GC.
|
|
|
|
oldThreshold = math.MaxUint64
|
2018-09-15 23:23:13 +00:00
|
|
|
c.logger.Debug("forced node GC")
|
2016-02-21 00:11:29 +00:00
|
|
|
} else {
|
|
|
|
// Compute the old threshold limit for GC using the FSM
|
|
|
|
// time table. This is a rough mapping of a time to the
|
|
|
|
// Raft index it belongs to.
|
|
|
|
tt := c.srv.fsm.TimeTable()
|
|
|
|
cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold)
|
|
|
|
oldThreshold = tt.NearestIndex(cutoff)
|
2018-09-15 23:23:13 +00:00
|
|
|
c.logger.Debug("node GC scanning before cutoff index",
|
|
|
|
"index", oldThreshold, "node_gc_threshold", c.srv.config.NodeGCThreshold)
|
2016-02-21 00:11:29 +00:00
|
|
|
}
|
2015-09-07 18:01:29 +00:00
|
|
|
|
|
|
|
// Collect the nodes to GC
|
|
|
|
var gcNode []string
|
2016-06-03 23:24:41 +00:00
|
|
|
OUTER:
|
2015-09-07 18:01:29 +00:00
|
|
|
for {
|
|
|
|
raw := iter.Next()
|
|
|
|
if raw == nil {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
node := raw.(*structs.Node)
|
|
|
|
|
|
|
|
// Ignore non-terminal and new nodes
|
|
|
|
if !node.TerminalStatus() || node.ModifyIndex > oldThreshold {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get the allocations by node
|
2017-02-08 04:31:23 +00:00
|
|
|
ws := memdb.NewWatchSet()
|
|
|
|
allocs, err := c.snap.AllocsByNode(ws, node.ID)
|
2015-09-07 18:01:29 +00:00
|
|
|
if err != nil {
|
2018-09-15 23:23:13 +00:00
|
|
|
c.logger.Error("failed to get allocs for node",
|
|
|
|
"node_id", node.ID, "error", err)
|
2015-09-07 18:01:29 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2016-06-03 23:24:41 +00:00
|
|
|
// If there are any non-terminal allocations, skip the node. If the node
|
|
|
|
// is terminal and the allocations are not, the scheduler may not have
|
2016-06-16 23:17:17 +00:00
|
|
|
// run yet to transition the allocs on the node to terminal. We delay
|
2016-06-03 23:24:41 +00:00
|
|
|
// GC'ing until this happens.
|
|
|
|
for _, alloc := range allocs {
|
|
|
|
if !alloc.TerminalStatus() {
|
|
|
|
continue OUTER
|
|
|
|
}
|
2015-09-07 18:01:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Node is eligible for garbage collection
|
|
|
|
gcNode = append(gcNode, node.ID)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Fast-path the nothing case
|
|
|
|
if len(gcNode) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
2018-09-15 23:23:13 +00:00
|
|
|
c.logger.Debug("node GC found eligible nodes", "nodes", len(gcNode))
|
2019-06-07 14:57:57 +00:00
|
|
|
return c.nodeReap(eval, gcNode)
|
2019-06-06 19:59:14 +00:00
|
|
|
}
|
|
|
|
|
2019-06-07 14:57:57 +00:00
|
|
|
func (c *CoreScheduler) nodeReap(eval *structs.Evaluation, nodeIDs []string) error {
|
2019-06-27 19:16:27 +00:00
|
|
|
// For old clusters, send single deregistration messages COMPAT(0.11)
|
2019-06-13 19:04:38 +00:00
|
|
|
minVersionBatchNodeDeregister := version.Must(version.NewVersion("0.9.4"))
|
|
|
|
if !ServersMeetMinimumVersion(c.srv.Members(), minVersionBatchNodeDeregister, true) {
|
2019-06-06 19:59:14 +00:00
|
|
|
for _, id := range nodeIDs {
|
|
|
|
req := structs.NodeDeregisterRequest{
|
|
|
|
NodeID: id,
|
|
|
|
WriteRequest: structs.WriteRequest{
|
|
|
|
Region: c.srv.config.Region,
|
|
|
|
AuthToken: eval.LeaderACL,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
var resp structs.NodeUpdateResponse
|
|
|
|
if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil {
|
|
|
|
c.logger.Error("node reap failed", "node_id", id, "error", err)
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
2015-09-07 18:01:29 +00:00
|
|
|
|
|
|
|
// Call to the leader to issue the reap
|
2019-06-07 14:57:57 +00:00
|
|
|
for _, ids := range partitionAll(maxIdsPerReap, nodeIDs) {
|
2019-06-26 14:57:58 +00:00
|
|
|
req := structs.NodeBatchDeregisterRequest{
|
2019-06-05 14:19:21 +00:00
|
|
|
NodeIDs: ids,
|
2015-09-07 18:01:29 +00:00
|
|
|
WriteRequest: structs.WriteRequest{
|
2017-10-23 22:04:00 +00:00
|
|
|
Region: c.srv.config.Region,
|
|
|
|
AuthToken: eval.LeaderACL,
|
2015-09-07 18:01:29 +00:00
|
|
|
},
|
|
|
|
}
|
|
|
|
var resp structs.NodeUpdateResponse
|
2019-06-26 14:57:58 +00:00
|
|
|
if err := c.srv.RPC("Node.BatchDeregister", &req, &resp); err != nil {
|
2019-06-05 14:19:21 +00:00
|
|
|
c.logger.Error("node reap failed", "node_ids", ids, "error", err)
|
2015-09-07 18:01:29 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
2017-06-29 19:32:37 +00:00
|
|
|
|
|
|
|
// deploymentGC is used to garbage collect old deployments
|
|
|
|
func (c *CoreScheduler) deploymentGC(eval *structs.Evaluation) error {
|
|
|
|
// Iterate over the deployments
|
|
|
|
ws := memdb.NewWatchSet()
|
|
|
|
iter, err := c.snap.Deployments(ws)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
var oldThreshold uint64
|
|
|
|
if eval.JobID == structs.CoreJobForceGC {
|
|
|
|
// The GC was forced, so set the threshold to its maximum so everything
|
|
|
|
// will GC.
|
|
|
|
oldThreshold = math.MaxUint64
|
2018-09-15 23:23:13 +00:00
|
|
|
c.logger.Debug("forced deployment GC")
|
2017-06-29 19:32:37 +00:00
|
|
|
} else {
|
|
|
|
// Compute the old threshold limit for GC using the FSM
|
|
|
|
// time table. This is a rough mapping of a time to the
|
|
|
|
// Raft index it belongs to.
|
|
|
|
tt := c.srv.fsm.TimeTable()
|
|
|
|
cutoff := time.Now().UTC().Add(-1 * c.srv.config.DeploymentGCThreshold)
|
|
|
|
oldThreshold = tt.NearestIndex(cutoff)
|
2018-09-15 23:23:13 +00:00
|
|
|
c.logger.Debug("deployment GC scanning before cutoff index",
|
|
|
|
"index", oldThreshold, "deployment_gc_threshold", c.srv.config.DeploymentGCThreshold)
|
2017-06-29 19:32:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Collect the deployments to GC
|
|
|
|
var gcDeployment []string
|
2017-07-14 20:02:39 +00:00
|
|
|
|
|
|
|
OUTER:
|
2017-06-29 19:32:37 +00:00
|
|
|
for {
|
|
|
|
raw := iter.Next()
|
|
|
|
if raw == nil {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
deploy := raw.(*structs.Deployment)
|
|
|
|
|
|
|
|
// Ignore non-terminal and new deployments
|
|
|
|
if deploy.Active() || deploy.ModifyIndex > oldThreshold {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2017-07-14 20:02:39 +00:00
|
|
|
// Ensure there are no allocs referencing this deployment.
|
|
|
|
allocs, err := c.snap.AllocsByDeployment(ws, deploy.ID)
|
|
|
|
if err != nil {
|
2018-09-15 23:23:13 +00:00
|
|
|
c.logger.Error("failed to get allocs for deployment",
|
|
|
|
"deployment_id", deploy.ID, "error", err)
|
2017-07-14 20:02:39 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// Ensure there is no allocation referencing the deployment.
|
|
|
|
for _, alloc := range allocs {
|
|
|
|
if !alloc.TerminalStatus() {
|
|
|
|
continue OUTER
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-06-29 19:32:37 +00:00
|
|
|
// Deployment is eligible for garbage collection
|
|
|
|
gcDeployment = append(gcDeployment, deploy.ID)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Fast-path the nothing case
|
|
|
|
if len(gcDeployment) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
2018-09-15 23:23:13 +00:00
|
|
|
c.logger.Debug("deployment GC found eligible deployments", "deployments", len(gcDeployment))
|
2017-06-29 19:32:37 +00:00
|
|
|
return c.deploymentReap(gcDeployment)
|
|
|
|
}
|
|
|
|
|
|
|
|
// deploymentReap contacts the leader and issues a reap on the passed
|
|
|
|
// deployments.
|
|
|
|
func (c *CoreScheduler) deploymentReap(deployments []string) error {
|
|
|
|
// Call to the leader to issue the reap
|
|
|
|
for _, req := range c.partitionDeploymentReap(deployments) {
|
|
|
|
var resp structs.GenericResponse
|
|
|
|
if err := c.srv.RPC("Deployment.Reap", req, &resp); err != nil {
|
2018-09-15 23:23:13 +00:00
|
|
|
c.logger.Error("deployment reap failed", "error", err)
|
2017-06-29 19:32:37 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// partitionDeploymentReap returns a list of DeploymentDeleteRequest to make,
|
|
|
|
// ensuring a single request does not contain too many deployments. This is
|
|
|
|
// necessary to ensure that the Raft transaction does not become too large.
|
|
|
|
func (c *CoreScheduler) partitionDeploymentReap(deployments []string) []*structs.DeploymentDeleteRequest {
|
|
|
|
var requests []*structs.DeploymentDeleteRequest
|
|
|
|
submittedDeployments := 0
|
|
|
|
for submittedDeployments != len(deployments) {
|
|
|
|
req := &structs.DeploymentDeleteRequest{
|
|
|
|
WriteRequest: structs.WriteRequest{
|
|
|
|
Region: c.srv.config.Region,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
requests = append(requests, req)
|
|
|
|
available := maxIdsPerReap
|
|
|
|
|
|
|
|
if remaining := len(deployments) - submittedDeployments; remaining > 0 {
|
|
|
|
if remaining <= available {
|
|
|
|
req.Deployments = deployments[submittedDeployments:]
|
|
|
|
submittedDeployments += remaining
|
|
|
|
} else {
|
|
|
|
req.Deployments = deployments[submittedDeployments : submittedDeployments+available]
|
|
|
|
submittedDeployments += available
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return requests
|
|
|
|
}
|
2018-01-30 15:12:14 +00:00
|
|
|
|
2018-01-30 22:14:53 +00:00
|
|
|
// allocGCEligible returns if the allocation is eligible to be garbage collected
|
2018-01-30 15:12:14 +00:00
|
|
|
// according to its terminal status and its reschedule trackers
|
2018-04-11 18:58:02 +00:00
|
|
|
func allocGCEligible(a *structs.Allocation, job *structs.Job, gcTime time.Time, thresholdIndex uint64) bool {
|
2018-01-30 15:12:14 +00:00
|
|
|
// Not in a terminal status and old enough
|
|
|
|
if !a.TerminalStatus() || a.ModifyIndex > thresholdIndex {
|
|
|
|
return false
|
|
|
|
}
|
2018-01-30 22:14:53 +00:00
|
|
|
|
2018-12-05 21:01:12 +00:00
|
|
|
// If the allocation is still running on the client we can not garbage
|
|
|
|
// collect it.
|
|
|
|
if a.ClientStatus == structs.AllocClientStatusRunning {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2018-04-11 18:58:02 +00:00
|
|
|
// If the job is deleted, stopped or dead all allocs can be removed
|
2018-01-30 22:14:53 +00:00
|
|
|
if job == nil || job.Stop || job.Status == structs.JobStatusDead {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2018-05-21 18:28:31 +00:00
|
|
|
// If the allocation's desired state is Stop, it can be GCed even if it
|
|
|
|
// has failed and hasn't been rescheduled. This can happen during job updates
|
|
|
|
if a.DesiredStatus == structs.AllocDesiredStatusStop {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2018-04-11 18:58:02 +00:00
|
|
|
// If the alloc hasn't failed then we don't need to consider it for rescheduling
|
|
|
|
// Rescheduling needs to copy over information from the previous alloc so that it
|
|
|
|
// can enforce the reschedule policy
|
|
|
|
if a.ClientStatus != structs.AllocClientStatusFailed {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2018-01-30 22:14:53 +00:00
|
|
|
var reschedulePolicy *structs.ReschedulePolicy
|
|
|
|
tg := job.LookupTaskGroup(a.TaskGroup)
|
|
|
|
|
|
|
|
if tg != nil {
|
|
|
|
reschedulePolicy = tg.ReschedulePolicy
|
|
|
|
}
|
2018-04-10 21:08:37 +00:00
|
|
|
// No reschedule policy or rescheduling is disabled
|
|
|
|
if reschedulePolicy == nil || (!reschedulePolicy.Unlimited && reschedulePolicy.Attempts == 0) {
|
2018-01-30 15:12:14 +00:00
|
|
|
return true
|
|
|
|
}
|
|
|
|
// Restart tracking information has been carried forward
|
|
|
|
if a.NextAllocation != "" {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2018-04-11 18:58:02 +00:00
|
|
|
// This task has unlimited rescheduling and the alloc has not been replaced, so we can't GC it yet
|
|
|
|
if reschedulePolicy.Unlimited {
|
|
|
|
return false
|
2018-04-10 21:08:37 +00:00
|
|
|
}
|
2018-01-30 15:12:14 +00:00
|
|
|
|
2018-04-11 18:58:02 +00:00
|
|
|
// No restarts have been attempted yet
|
|
|
|
if a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0 {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
// Don't GC if most recent reschedule attempt is within time interval
|
|
|
|
interval := reschedulePolicy.Interval
|
|
|
|
lastIndex := len(a.RescheduleTracker.Events)
|
|
|
|
lastRescheduleEvent := a.RescheduleTracker.Events[lastIndex-1]
|
|
|
|
timeDiff := gcTime.UTC().UnixNano() - lastRescheduleEvent.RescheduleTime
|
|
|
|
|
|
|
|
return timeDiff > interval.Nanoseconds()
|
2018-01-30 15:12:14 +00:00
|
|
|
}
|
2020-01-30 13:15:56 +00:00
|
|
|
|
2020-02-19 14:05:33 +00:00
|
|
|
// csiVolumeClaimGC is used to garbage collect CSI volume claims
|
|
|
|
func (c *CoreScheduler) csiVolumeClaimGC(eval *structs.Evaluation) error {
|
|
|
|
c.logger.Trace("garbage collecting unclaimed CSI volume claims")
|
|
|
|
|
|
|
|
// JobID smuggled in with the eval's own JobID
|
|
|
|
var jobID string
|
|
|
|
evalJobID := strings.Split(eval.JobID, ":")
|
|
|
|
if len(evalJobID) != 2 {
|
|
|
|
c.logger.Error("volume gc called without jobID")
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
jobID = evalJobID[1]
|
|
|
|
job, err := c.srv.State().JobByID(nil, eval.Namespace, jobID)
|
|
|
|
if err != nil || job == nil {
|
|
|
|
c.logger.Trace(
|
|
|
|
"cannot find job to perform volume claim GC. it may have been garbage collected",
|
|
|
|
"job", jobID)
|
|
|
|
return nil
|
|
|
|
}
|
2020-04-02 20:04:56 +00:00
|
|
|
return c.volumeClaimReap([]*structs.Job{job}, eval.LeaderACL)
|
2020-01-30 13:15:56 +00:00
|
|
|
}
|
2020-02-19 14:05:33 +00:00
|
|
|
|
2020-04-03 21:37:26 +00:00
|
|
|
// volumeClaimReap contacts the leader and releases volume claims from terminal allocs
|
2020-02-19 14:05:33 +00:00
|
|
|
func (c *CoreScheduler) volumeClaimReap(jobs []*structs.Job, leaderACL string) error {
|
2020-04-03 21:37:26 +00:00
|
|
|
return volumeClaimReap(c.srv, c.logger, jobs, leaderACL, false)
|
|
|
|
}
|
|
|
|
|
|
|
|
// volumeClaimReap contacts the leader and releases volume claims from terminal allocs
|
|
|
|
func volumeClaimReap(srv *Server, logger log.Logger, jobs []*structs.Job, leaderACL string, runningAllocs bool) error {
|
2020-02-19 14:05:33 +00:00
|
|
|
ws := memdb.NewWatchSet()
|
|
|
|
var result *multierror.Error
|
|
|
|
|
|
|
|
for _, job := range jobs {
|
2020-04-03 21:37:26 +00:00
|
|
|
logger.Trace("garbage collecting unclaimed CSI volume claims for job", "job", job.ID)
|
2020-02-19 14:05:33 +00:00
|
|
|
for _, taskGroup := range job.TaskGroups {
|
|
|
|
for _, tgVolume := range taskGroup.Volumes {
|
|
|
|
if tgVolume.Type != structs.VolumeTypeCSI {
|
|
|
|
continue // filter to just CSI volumes
|
|
|
|
}
|
|
|
|
volID := tgVolume.Source
|
2020-04-03 21:37:26 +00:00
|
|
|
vol, err := srv.State().CSIVolumeByID(ws, job.Namespace, volID)
|
2020-02-19 14:05:33 +00:00
|
|
|
if err != nil {
|
|
|
|
result = multierror.Append(result, err)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if vol == nil {
|
2020-04-03 21:37:26 +00:00
|
|
|
logger.Trace("cannot find volume to be GC'd. it may have been deregistered",
|
2020-02-19 14:05:33 +00:00
|
|
|
"volume", volID)
|
|
|
|
continue
|
|
|
|
}
|
2020-04-03 21:37:26 +00:00
|
|
|
vol, err = srv.State().CSIVolumeDenormalize(ws, vol)
|
2020-02-19 14:05:33 +00:00
|
|
|
if err != nil {
|
|
|
|
result = multierror.Append(result, err)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2020-04-03 21:37:26 +00:00
|
|
|
plug, err := srv.State().CSIPluginByID(ws, vol.PluginID)
|
2020-04-02 20:04:56 +00:00
|
|
|
if err != nil {
|
|
|
|
result = multierror.Append(result, err)
|
|
|
|
continue
|
2020-02-19 14:05:33 +00:00
|
|
|
}
|
|
|
|
|
2020-04-03 21:37:26 +00:00
|
|
|
gcClaims, nodeClaims := collectClaimsToGCImpl(vol, runningAllocs)
|
2020-04-02 20:04:56 +00:00
|
|
|
|
|
|
|
for _, claim := range gcClaims {
|
2020-04-03 21:37:26 +00:00
|
|
|
nodeClaims, err = volumeClaimReapImpl(srv,
|
2020-04-02 20:04:56 +00:00
|
|
|
&volumeClaimReapArgs{
|
|
|
|
vol: vol,
|
|
|
|
plug: plug,
|
|
|
|
allocID: claim.allocID,
|
|
|
|
nodeID: claim.nodeID,
|
|
|
|
mode: claim.mode,
|
|
|
|
region: job.Region,
|
|
|
|
namespace: job.Namespace,
|
|
|
|
leaderACL: leaderACL,
|
|
|
|
nodeClaims: nodeClaims,
|
|
|
|
},
|
|
|
|
)
|
2020-02-19 14:05:33 +00:00
|
|
|
if err != nil {
|
|
|
|
result = multierror.Append(result, err)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return result.ErrorOrNil()
|
|
|
|
}
|
2020-04-02 20:04:56 +00:00
|
|
|
|
|
|
|
type gcClaimRequest struct {
|
|
|
|
allocID string
|
|
|
|
nodeID string
|
|
|
|
mode structs.CSIVolumeClaimMode
|
|
|
|
}
|
|
|
|
|
2020-04-03 21:37:26 +00:00
|
|
|
func collectClaimsToGCImpl(vol *structs.CSIVolume, runningAllocs bool) ([]gcClaimRequest, map[string]int) {
|
2020-04-02 20:04:56 +00:00
|
|
|
gcAllocs := []gcClaimRequest{}
|
|
|
|
nodeClaims := map[string]int{} // node IDs -> count
|
|
|
|
|
|
|
|
collectFunc := func(allocs map[string]*structs.Allocation,
|
|
|
|
mode structs.CSIVolumeClaimMode) {
|
|
|
|
for _, alloc := range allocs {
|
|
|
|
// we call denormalize on the volume above to populate
|
|
|
|
// Allocation pointers. But the alloc might have been
|
|
|
|
// garbage collected concurrently, so if the alloc is
|
|
|
|
// still nil we can safely skip it.
|
|
|
|
if alloc == nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
nodeClaims[alloc.NodeID]++
|
2020-04-03 21:37:26 +00:00
|
|
|
if runningAllocs || alloc.Terminated() {
|
2020-04-02 20:04:56 +00:00
|
|
|
gcAllocs = append(gcAllocs, gcClaimRequest{
|
|
|
|
allocID: alloc.ID,
|
|
|
|
nodeID: alloc.NodeID,
|
|
|
|
mode: mode,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
collectFunc(vol.WriteAllocs, structs.CSIVolumeClaimWrite)
|
|
|
|
collectFunc(vol.ReadAllocs, structs.CSIVolumeClaimRead)
|
|
|
|
return gcAllocs, nodeClaims
|
|
|
|
}
|
|
|
|
|
|
|
|
type volumeClaimReapArgs struct {
|
|
|
|
vol *structs.CSIVolume
|
|
|
|
plug *structs.CSIPlugin
|
|
|
|
allocID string
|
|
|
|
nodeID string
|
|
|
|
mode structs.CSIVolumeClaimMode
|
|
|
|
region string
|
|
|
|
namespace string
|
|
|
|
leaderACL string
|
|
|
|
nodeClaims map[string]int // node IDs -> count
|
|
|
|
}
|
|
|
|
|
|
|
|
func volumeClaimReapImpl(srv RPCServer, args *volumeClaimReapArgs) (map[string]int, error) {
|
|
|
|
vol := args.vol
|
|
|
|
nodeID := args.nodeID
|
|
|
|
|
|
|
|
// (1) NodePublish / NodeUnstage must be completed before controller
|
|
|
|
// operations or releasing the claim.
|
|
|
|
nReq := &cstructs.ClientCSINodeDetachVolumeRequest{
|
|
|
|
PluginID: args.plug.ID,
|
2020-04-04 15:03:44 +00:00
|
|
|
VolumeID: vol.ID,
|
|
|
|
ExternalID: vol.RemoteID(),
|
2020-04-02 20:04:56 +00:00
|
|
|
AllocID: args.allocID,
|
|
|
|
NodeID: nodeID,
|
|
|
|
AttachmentMode: vol.AttachmentMode,
|
|
|
|
AccessMode: vol.AccessMode,
|
|
|
|
ReadOnly: args.mode == structs.CSIVolumeClaimRead,
|
|
|
|
}
|
|
|
|
err := srv.RPC("ClientCSI.NodeDetachVolume", nReq,
|
|
|
|
&cstructs.ClientCSINodeDetachVolumeResponse{})
|
|
|
|
if err != nil {
|
|
|
|
return args.nodeClaims, err
|
|
|
|
}
|
|
|
|
args.nodeClaims[nodeID]--
|
|
|
|
|
|
|
|
// (2) we only emit the controller unpublish if no other allocs
|
|
|
|
// on the node need it, but we also only want to make this
|
|
|
|
// call at most once per node
|
|
|
|
if vol.ControllerRequired && args.nodeClaims[nodeID] < 1 {
|
2020-04-04 15:03:44 +00:00
|
|
|
|
|
|
|
// we need to get the CSI Node ID, which is not the same as
|
|
|
|
// the Nomad Node ID
|
|
|
|
ws := memdb.NewWatchSet()
|
|
|
|
targetNode, err := srv.State().NodeByID(ws, nodeID)
|
|
|
|
if err != nil {
|
|
|
|
return args.nodeClaims, err
|
|
|
|
}
|
|
|
|
if targetNode == nil {
|
|
|
|
return args.nodeClaims, fmt.Errorf("%s: %s",
|
|
|
|
structs.ErrUnknownNodePrefix, nodeID)
|
|
|
|
}
|
|
|
|
targetCSIInfo, ok := targetNode.CSINodePlugins[args.plug.ID]
|
|
|
|
if !ok {
|
|
|
|
return args.nodeClaims, fmt.Errorf("Failed to find NodeInfo for node: %s", targetNode.ID)
|
|
|
|
}
|
|
|
|
|
2020-04-02 20:04:56 +00:00
|
|
|
controllerNodeID, err := nodeForControllerPlugin(srv.State(), args.plug)
|
2020-04-04 15:03:44 +00:00
|
|
|
if err != nil || controllerNodeID == "" {
|
2020-04-02 20:04:56 +00:00
|
|
|
return args.nodeClaims, err
|
|
|
|
}
|
|
|
|
cReq := &cstructs.ClientCSIControllerDetachVolumeRequest{
|
|
|
|
VolumeID: vol.RemoteID(),
|
2020-04-04 15:03:44 +00:00
|
|
|
ClientCSINodeID: targetCSIInfo.NodeInfo.ID,
|
2020-04-02 20:04:56 +00:00
|
|
|
}
|
|
|
|
cReq.PluginID = args.plug.ID
|
|
|
|
cReq.ControllerNodeID = controllerNodeID
|
|
|
|
err = srv.RPC("ClientCSI.ControllerDetachVolume", cReq,
|
|
|
|
&cstructs.ClientCSIControllerDetachVolumeResponse{})
|
|
|
|
if err != nil {
|
|
|
|
return args.nodeClaims, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// (3) release the claim from the state store, allowing it to be rescheduled
|
|
|
|
req := &structs.CSIVolumeClaimRequest{
|
|
|
|
VolumeID: vol.ID,
|
|
|
|
AllocationID: args.allocID,
|
|
|
|
Claim: structs.CSIVolumeClaimRelease,
|
|
|
|
WriteRequest: structs.WriteRequest{
|
|
|
|
Region: args.region,
|
|
|
|
Namespace: args.namespace,
|
|
|
|
AuthToken: args.leaderACL,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
err = srv.RPC("CSIVolume.Claim", req, &structs.CSIVolumeClaimResponse{})
|
|
|
|
if err != nil {
|
|
|
|
return args.nodeClaims, err
|
|
|
|
}
|
|
|
|
return args.nodeClaims, nil
|
|
|
|
}
|