946 lines
33 KiB
Go
946 lines
33 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: MPL-2.0
|
|
|
|
package scheduler
|
|
|
|
import (
|
|
"fmt"
|
|
"runtime/debug"
|
|
"sort"
|
|
"time"
|
|
|
|
log "github.com/hashicorp/go-hclog"
|
|
"github.com/hashicorp/go-memdb"
|
|
"github.com/hashicorp/go-multierror"
|
|
"github.com/hashicorp/go-version"
|
|
"github.com/hashicorp/nomad/helper"
|
|
"github.com/hashicorp/nomad/helper/uuid"
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
)
|
|
|
|
const (
|
|
// maxServiceScheduleAttempts is used to limit the number of times
|
|
// we will attempt to schedule if we continue to hit conflicts for services.
|
|
maxServiceScheduleAttempts = 5
|
|
|
|
// maxBatchScheduleAttempts is used to limit the number of times
|
|
// we will attempt to schedule if we continue to hit conflicts for batch.
|
|
maxBatchScheduleAttempts = 2
|
|
|
|
// allocNotNeeded is the status used when a job no longer requires an allocation
|
|
allocNotNeeded = "alloc not needed due to job update"
|
|
|
|
// allocReconnected is the status to use when a replacement allocation is stopped
|
|
// because a disconnected node reconnects.
|
|
allocReconnected = "alloc not needed due to disconnected client reconnect"
|
|
|
|
// allocMigrating is the status used when we must migrate an allocation
|
|
allocMigrating = "alloc is being migrated"
|
|
|
|
// allocUpdating is the status used when a job requires an update
|
|
allocUpdating = "alloc is being updated due to job update"
|
|
|
|
// allocLost is the status used when an allocation is lost
|
|
allocLost = "alloc is lost since its node is down"
|
|
|
|
// allocUnknown is the status used when an allocation is unknown
|
|
allocUnknown = "alloc is unknown since its node is disconnected"
|
|
|
|
// allocInPlace is the status used when speculating on an in-place update
|
|
allocInPlace = "alloc updating in-place"
|
|
|
|
// allocNodeTainted is the status used when stopping an alloc because its
|
|
// node is tainted.
|
|
allocNodeTainted = "alloc not needed as node is tainted"
|
|
|
|
// allocRescheduled is the status used when an allocation failed and was rescheduled
|
|
allocRescheduled = "alloc was rescheduled because it failed"
|
|
|
|
// blockedEvalMaxPlanDesc is the description used for blocked evals that are
|
|
// a result of hitting the max number of plan attempts
|
|
blockedEvalMaxPlanDesc = "created due to placement conflicts"
|
|
|
|
// blockedEvalFailedPlacements is the description used for blocked evals
|
|
// that are a result of failing to place all allocations.
|
|
blockedEvalFailedPlacements = "created to place remaining allocations"
|
|
|
|
// reschedulingFollowupEvalDesc is the description used when creating follow
|
|
// up evals for delayed rescheduling
|
|
reschedulingFollowupEvalDesc = "created for delayed rescheduling"
|
|
|
|
// disconnectTimeoutFollowupEvalDesc is the description used when creating follow
|
|
// up evals for allocations that be should be stopped after its disconnect
|
|
// timeout has passed.
|
|
disconnectTimeoutFollowupEvalDesc = "created for delayed disconnect timeout"
|
|
|
|
// maxPastRescheduleEvents is the maximum number of past reschedule event
|
|
// that we track when unlimited rescheduling is enabled
|
|
maxPastRescheduleEvents = 5
|
|
)
|
|
|
|
// minVersionMaxClientDisconnect is the minimum version that supports max_client_disconnect.
|
|
var minVersionMaxClientDisconnect = version.Must(version.NewVersion("1.3.0"))
|
|
|
|
// SetStatusError is used to set the status of the evaluation to the given error
|
|
type SetStatusError struct {
|
|
Err error
|
|
EvalStatus string
|
|
}
|
|
|
|
func (s *SetStatusError) Error() string {
|
|
return s.Err.Error()
|
|
}
|
|
|
|
// GenericScheduler is used for 'service' and 'batch' type jobs. This scheduler is
|
|
// designed for long-lived services, and as such spends more time attempting
|
|
// to make a high quality placement. This is the primary scheduler for
|
|
// most workloads. It also supports a 'batch' mode to optimize for fast decision
|
|
// making at the cost of quality.
|
|
type GenericScheduler struct {
|
|
logger log.Logger
|
|
eventsCh chan<- interface{}
|
|
state State
|
|
planner Planner
|
|
batch bool
|
|
|
|
eval *structs.Evaluation
|
|
job *structs.Job
|
|
plan *structs.Plan
|
|
planResult *structs.PlanResult
|
|
ctx *EvalContext
|
|
stack *GenericStack
|
|
|
|
// followUpEvals are evals with WaitUntil set, which are delayed until that time
|
|
// before being rescheduled
|
|
followUpEvals []*structs.Evaluation
|
|
|
|
deployment *structs.Deployment
|
|
|
|
blocked *structs.Evaluation
|
|
failedTGAllocs map[string]*structs.AllocMetric
|
|
queuedAllocs map[string]int
|
|
}
|
|
|
|
// NewServiceScheduler is a factory function to instantiate a new service scheduler
|
|
func NewServiceScheduler(logger log.Logger, eventsCh chan<- interface{}, state State, planner Planner) Scheduler {
|
|
s := &GenericScheduler{
|
|
logger: logger.Named("service_sched"),
|
|
eventsCh: eventsCh,
|
|
state: state,
|
|
planner: planner,
|
|
batch: false,
|
|
}
|
|
return s
|
|
}
|
|
|
|
// NewBatchScheduler is a factory function to instantiate a new batch scheduler
|
|
func NewBatchScheduler(logger log.Logger, eventsCh chan<- interface{}, state State, planner Planner) Scheduler {
|
|
s := &GenericScheduler{
|
|
logger: logger.Named("batch_sched"),
|
|
eventsCh: eventsCh,
|
|
state: state,
|
|
planner: planner,
|
|
batch: true,
|
|
}
|
|
return s
|
|
}
|
|
|
|
// Process is used to handle a single evaluation
|
|
func (s *GenericScheduler) Process(eval *structs.Evaluation) (err error) {
|
|
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
s.logger.Error("processing eval panicked scheduler - please report this as a bug!", "eval_id", eval.ID, "error", r, "stack_trace", string(debug.Stack()))
|
|
err = fmt.Errorf("failed to process eval: %v", r)
|
|
}
|
|
}()
|
|
|
|
// Store the evaluation
|
|
s.eval = eval
|
|
|
|
// Update our logger with the eval's information
|
|
s.logger = s.logger.With("eval_id", eval.ID, "job_id", eval.JobID, "namespace", eval.Namespace)
|
|
|
|
// Verify the evaluation trigger reason is understood
|
|
switch eval.TriggeredBy {
|
|
case structs.EvalTriggerJobRegister, structs.EvalTriggerJobDeregister,
|
|
structs.EvalTriggerNodeDrain, structs.EvalTriggerNodeUpdate,
|
|
structs.EvalTriggerAllocStop,
|
|
structs.EvalTriggerRollingUpdate, structs.EvalTriggerQueuedAllocs,
|
|
structs.EvalTriggerPeriodicJob, structs.EvalTriggerMaxPlans,
|
|
structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerRetryFailedAlloc,
|
|
structs.EvalTriggerFailedFollowUp, structs.EvalTriggerPreemption,
|
|
structs.EvalTriggerScaling, structs.EvalTriggerMaxDisconnectTimeout, structs.EvalTriggerReconnect:
|
|
default:
|
|
desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
|
|
eval.TriggeredBy)
|
|
return setStatus(s.logger, s.planner, s.eval, nil, s.blocked,
|
|
s.failedTGAllocs, structs.EvalStatusFailed, desc, s.queuedAllocs,
|
|
s.deployment.GetID())
|
|
}
|
|
|
|
// Retry up to the maxScheduleAttempts and reset if progress is made.
|
|
progress := func() bool { return progressMade(s.planResult) }
|
|
limit := maxServiceScheduleAttempts
|
|
if s.batch {
|
|
limit = maxBatchScheduleAttempts
|
|
}
|
|
if err := retryMax(limit, s.process, progress); err != nil {
|
|
if statusErr, ok := err.(*SetStatusError); ok {
|
|
// Scheduling was tried but made no forward progress so create a
|
|
// blocked eval to retry once resources become available.
|
|
var mErr multierror.Error
|
|
if err := s.createBlockedEval(true); err != nil {
|
|
mErr.Errors = append(mErr.Errors, err)
|
|
}
|
|
if err := setStatus(s.logger, s.planner, s.eval, nil, s.blocked,
|
|
s.failedTGAllocs, statusErr.EvalStatus, err.Error(),
|
|
s.queuedAllocs, s.deployment.GetID()); err != nil {
|
|
mErr.Errors = append(mErr.Errors, err)
|
|
}
|
|
return mErr.ErrorOrNil()
|
|
}
|
|
return err
|
|
}
|
|
|
|
// If the current evaluation is a blocked evaluation and we didn't place
|
|
// everything, do not update the status to complete.
|
|
if s.eval.Status == structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 {
|
|
e := s.ctx.Eligibility()
|
|
newEval := s.eval.Copy()
|
|
newEval.EscapedComputedClass = e.HasEscaped()
|
|
newEval.ClassEligibility = e.GetClasses()
|
|
newEval.QuotaLimitReached = e.QuotaLimitReached()
|
|
return s.planner.ReblockEval(newEval)
|
|
}
|
|
|
|
// Update the status to complete
|
|
return setStatus(s.logger, s.planner, s.eval, nil, s.blocked,
|
|
s.failedTGAllocs, structs.EvalStatusComplete, "", s.queuedAllocs,
|
|
s.deployment.GetID())
|
|
}
|
|
|
|
// createBlockedEval creates a blocked eval and submits it to the planner. If
|
|
// failure is set to true, the eval's trigger reason reflects that.
|
|
func (s *GenericScheduler) createBlockedEval(planFailure bool) error {
|
|
e := s.ctx.Eligibility()
|
|
escaped := e.HasEscaped()
|
|
|
|
// Only store the eligible classes if the eval hasn't escaped.
|
|
var classEligibility map[string]bool
|
|
if !escaped {
|
|
classEligibility = e.GetClasses()
|
|
}
|
|
|
|
s.blocked = s.eval.CreateBlockedEval(classEligibility, escaped, e.QuotaLimitReached(), s.failedTGAllocs)
|
|
if planFailure {
|
|
s.blocked.TriggeredBy = structs.EvalTriggerMaxPlans
|
|
s.blocked.StatusDescription = blockedEvalMaxPlanDesc
|
|
} else {
|
|
s.blocked.StatusDescription = blockedEvalFailedPlacements
|
|
}
|
|
|
|
return s.planner.CreateEval(s.blocked)
|
|
}
|
|
|
|
// process is wrapped in retryMax to iteratively run the handler until we have no
|
|
// further work or we've made the maximum number of attempts.
|
|
func (s *GenericScheduler) process() (bool, error) {
|
|
// Lookup the Job by ID
|
|
var err error
|
|
ws := memdb.NewWatchSet()
|
|
s.job, err = s.state.JobByID(ws, s.eval.Namespace, s.eval.JobID)
|
|
if err != nil {
|
|
return false, fmt.Errorf("failed to get job %q: %v", s.eval.JobID, err)
|
|
}
|
|
|
|
numTaskGroups := 0
|
|
stopped := s.job.Stopped()
|
|
if !stopped {
|
|
numTaskGroups = len(s.job.TaskGroups)
|
|
}
|
|
s.queuedAllocs = make(map[string]int, numTaskGroups)
|
|
s.followUpEvals = nil
|
|
|
|
// Create a plan
|
|
s.plan = s.eval.MakePlan(s.job)
|
|
|
|
if !s.batch {
|
|
// Get any existing deployment
|
|
s.deployment, err = s.state.LatestDeploymentByJobID(ws, s.eval.Namespace, s.eval.JobID)
|
|
if err != nil {
|
|
return false, fmt.Errorf("failed to get job deployment %q: %v", s.eval.JobID, err)
|
|
}
|
|
}
|
|
|
|
// Reset the failed allocations
|
|
s.failedTGAllocs = nil
|
|
|
|
// Create an evaluation context
|
|
s.ctx = NewEvalContext(s.eventsCh, s.state, s.plan, s.logger)
|
|
|
|
// Construct the placement stack
|
|
s.stack = NewGenericStack(s.batch, s.ctx)
|
|
if !s.job.Stopped() {
|
|
s.setJob(s.job)
|
|
}
|
|
|
|
// Compute the target job allocations
|
|
if err := s.computeJobAllocs(); err != nil {
|
|
s.logger.Error("failed to compute job allocations", "error", err)
|
|
return false, err
|
|
}
|
|
|
|
// If there are failed allocations, we need to create a blocked evaluation
|
|
// to place the failed allocations when resources become available. If the
|
|
// current evaluation is already a blocked eval, we reuse it. If not, submit
|
|
// a new eval to the planner in createBlockedEval. If rescheduling should
|
|
// be delayed, do that instead.
|
|
delayInstead := len(s.followUpEvals) > 0 && s.eval.WaitUntil.IsZero()
|
|
|
|
if s.eval.Status != structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 && s.blocked == nil &&
|
|
!delayInstead {
|
|
if err := s.createBlockedEval(false); err != nil {
|
|
s.logger.Error("failed to make blocked eval", "error", err)
|
|
return false, err
|
|
}
|
|
s.logger.Debug("failed to place all allocations, blocked eval created", "blocked_eval_id", s.blocked.ID)
|
|
}
|
|
|
|
// If the plan is a no-op, we can bail. If AnnotatePlan is set submit the plan
|
|
// anyways to get the annotations.
|
|
if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
|
|
return true, nil
|
|
}
|
|
|
|
// Create follow up evals for any delayed reschedule eligible allocations, except in
|
|
// the case that this evaluation was already delayed.
|
|
if delayInstead {
|
|
for _, eval := range s.followUpEvals {
|
|
eval.PreviousEval = s.eval.ID
|
|
// TODO(preetha) this should be batching evals before inserting them
|
|
if err := s.planner.CreateEval(eval); err != nil {
|
|
s.logger.Error("failed to make next eval for rescheduling", "error", err)
|
|
return false, err
|
|
}
|
|
s.logger.Debug("found reschedulable allocs, followup eval created", "followup_eval_id", eval.ID)
|
|
}
|
|
}
|
|
|
|
// Submit the plan and store the results.
|
|
result, newState, err := s.planner.SubmitPlan(s.plan)
|
|
s.planResult = result
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
|
|
// Decrement the number of allocations pending per task group based on the
|
|
// number of allocations successfully placed
|
|
adjustQueuedAllocations(s.logger, result, s.queuedAllocs)
|
|
|
|
// If we got a state refresh, try again since we have stale data
|
|
if newState != nil {
|
|
s.logger.Debug("refresh forced")
|
|
s.state = newState
|
|
return false, nil
|
|
}
|
|
|
|
// Try again if the plan was not fully committed, potential conflict
|
|
fullCommit, expected, actual := result.FullCommit(s.plan)
|
|
if !fullCommit {
|
|
s.logger.Debug("plan didn't fully commit", "attempted", expected, "placed", actual)
|
|
if newState == nil {
|
|
return false, fmt.Errorf("missing state refresh after partial commit")
|
|
}
|
|
return false, nil
|
|
}
|
|
|
|
// Success!
|
|
return true, nil
|
|
}
|
|
|
|
// computeJobAllocs is used to reconcile differences between the job,
|
|
// existing allocations and node status to update the allocations.
|
|
func (s *GenericScheduler) computeJobAllocs() error {
|
|
// Lookup the allocations by JobID
|
|
ws := memdb.NewWatchSet()
|
|
allocs, err := s.state.AllocsByJob(ws, s.eval.Namespace, s.eval.JobID, true)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get allocs for job '%s': %v",
|
|
s.eval.JobID, err)
|
|
}
|
|
|
|
// Determine the tainted nodes containing job allocs
|
|
tainted, err := taintedNodes(s.state, allocs)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
|
|
s.eval.JobID, err)
|
|
}
|
|
|
|
// Update the allocations which are in pending/running state on tainted
|
|
// nodes to lost, but only if the scheduler has already marked them
|
|
updateNonTerminalAllocsToLost(s.plan, tainted, allocs)
|
|
|
|
reconciler := NewAllocReconciler(s.logger,
|
|
genericAllocUpdateFn(s.ctx, s.stack, s.eval.ID),
|
|
s.batch, s.eval.JobID, s.job, s.deployment, allocs, tainted, s.eval.ID,
|
|
s.eval.Priority, s.planner.ServersMeetMinimumVersion(minVersionMaxClientDisconnect, true))
|
|
|
|
results := reconciler.Compute()
|
|
s.logger.Debug("reconciled current state with desired state", "results", log.Fmt("%#v", results))
|
|
|
|
if s.eval.AnnotatePlan {
|
|
s.plan.Annotations = &structs.PlanAnnotations{
|
|
DesiredTGUpdates: results.desiredTGUpdates,
|
|
}
|
|
}
|
|
|
|
// Add the deployment changes to the plan
|
|
s.plan.Deployment = results.deployment
|
|
s.plan.DeploymentUpdates = results.deploymentUpdates
|
|
|
|
// Store all the follow up evaluations from rescheduled allocations
|
|
if len(results.desiredFollowupEvals) > 0 {
|
|
for _, evals := range results.desiredFollowupEvals {
|
|
s.followUpEvals = append(s.followUpEvals, evals...)
|
|
}
|
|
}
|
|
|
|
// Update the stored deployment
|
|
if results.deployment != nil {
|
|
s.deployment = results.deployment
|
|
}
|
|
|
|
// Handle the stop
|
|
for _, stop := range results.stop {
|
|
s.plan.AppendStoppedAlloc(stop.alloc, stop.statusDescription, stop.clientStatus, stop.followupEvalID)
|
|
}
|
|
|
|
// Handle disconnect updates
|
|
for _, update := range results.disconnectUpdates {
|
|
s.plan.AppendUnknownAlloc(update)
|
|
}
|
|
|
|
// Handle reconnect updates.
|
|
// Reconnected allocs have a new AllocState entry.
|
|
for _, update := range results.reconnectUpdates {
|
|
s.ctx.Plan().AppendAlloc(update, nil)
|
|
}
|
|
|
|
// Handle the in-place updates
|
|
for _, update := range results.inplaceUpdate {
|
|
if update.DeploymentID != s.deployment.GetID() {
|
|
update.DeploymentID = s.deployment.GetID()
|
|
update.DeploymentStatus = nil
|
|
}
|
|
s.ctx.Plan().AppendAlloc(update, nil)
|
|
}
|
|
|
|
// Handle the annotation updates
|
|
for _, update := range results.attributeUpdates {
|
|
s.ctx.Plan().AppendAlloc(update, nil)
|
|
}
|
|
|
|
// Nothing remaining to do if placement is not required
|
|
if len(results.place)+len(results.destructiveUpdate) == 0 {
|
|
// If the job has been purged we don't have access to the job. Otherwise
|
|
// set the queued allocs to zero. This is true if the job is being
|
|
// stopped as well.
|
|
if s.job != nil {
|
|
for _, tg := range s.job.TaskGroups {
|
|
s.queuedAllocs[tg.Name] = 0
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Compute the placements
|
|
place := make([]placementResult, 0, len(results.place))
|
|
for _, p := range results.place {
|
|
s.queuedAllocs[p.taskGroup.Name] += 1
|
|
place = append(place, p)
|
|
}
|
|
|
|
destructive := make([]placementResult, 0, len(results.destructiveUpdate))
|
|
for _, p := range results.destructiveUpdate {
|
|
s.queuedAllocs[p.placeTaskGroup.Name] += 1
|
|
destructive = append(destructive, p)
|
|
}
|
|
return s.computePlacements(destructive, place, results.taskGroupAllocNameIndexes)
|
|
}
|
|
|
|
// downgradedJobForPlacement returns the job appropriate for non-canary placement replacement
|
|
func (s *GenericScheduler) downgradedJobForPlacement(p placementResult) (string, *structs.Job, error) {
|
|
ns, jobID := s.job.Namespace, s.job.ID
|
|
tgName := p.TaskGroup().Name
|
|
|
|
// find deployments and use the latest promoted or canaried version
|
|
deployments, err := s.state.DeploymentsByJobID(nil, ns, jobID, false)
|
|
if err != nil {
|
|
return "", nil, fmt.Errorf("failed to lookup job deployments: %v", err)
|
|
}
|
|
|
|
sort.Slice(deployments, func(i, j int) bool {
|
|
return deployments[i].JobVersion > deployments[j].JobVersion
|
|
})
|
|
|
|
for _, d := range deployments {
|
|
// It's unexpected to have a recent deployment that doesn't contain the TaskGroup; as all allocations
|
|
// should be destroyed. In such cases, attempt to find the deployment for that TaskGroup and hopefully
|
|
// we will kill it soon. This is a defensive measure, have not seen it in practice
|
|
//
|
|
// Zero dstate.DesiredCanaries indicates that the TaskGroup allocates were updated in-place without using canaries.
|
|
if dstate := d.TaskGroups[tgName]; dstate != nil && (dstate.Promoted || dstate.DesiredCanaries == 0) {
|
|
job, err := s.state.JobByIDAndVersion(nil, ns, jobID, d.JobVersion)
|
|
return d.ID, job, err
|
|
}
|
|
}
|
|
|
|
// check if the non-promoted version is a job without update block. This version should be the latest "stable" version,
|
|
// as all subsequent versions must be canaried deployments. Otherwise, we would have found a deployment above,
|
|
// or the alloc would have been replaced already by a newer non-deployment job.
|
|
if job, err := s.state.JobByIDAndVersion(nil, ns, jobID, p.MinJobVersion()); err == nil && job != nil && job.Update.IsEmpty() {
|
|
return "", job, err
|
|
}
|
|
|
|
return "", nil, nil
|
|
}
|
|
|
|
// computePlacements computes placements for allocations. It is given the set of
|
|
// destructive updates to place and the set of new placements to place.
|
|
func (s *GenericScheduler) computePlacements(destructive, place []placementResult, nameIndex map[string]*allocNameIndex) error {
|
|
|
|
// Get the base nodes
|
|
nodes, byDC, err := s.setNodes(s.job)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
var deploymentID string
|
|
if s.deployment != nil && s.deployment.Active() {
|
|
deploymentID = s.deployment.ID
|
|
}
|
|
|
|
// Capture current time to use as the start time for any rescheduled allocations
|
|
now := time.Now()
|
|
|
|
// Have to handle destructive changes first as we need to discount their
|
|
// resources. To understand this imagine the resources were reduced and the
|
|
// count was scaled up.
|
|
for _, results := range [][]placementResult{destructive, place} {
|
|
for _, missing := range results {
|
|
// Get the task group
|
|
tg := missing.TaskGroup()
|
|
|
|
// This is populated from the reconciler via the compute results,
|
|
// therefore we cannot have an allocation belonging to a task group
|
|
// that has not generated and been through allocation name index
|
|
// tracking.
|
|
taskGroupNameIndex := nameIndex[tg.Name]
|
|
|
|
var downgradedJob *structs.Job
|
|
|
|
if missing.DowngradeNonCanary() {
|
|
jobDeploymentID, job, err := s.downgradedJobForPlacement(missing)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Defensive check - if there is no appropriate deployment for this job, use the latest
|
|
if job != nil && job.Version >= missing.MinJobVersion() && job.LookupTaskGroup(tg.Name) != nil {
|
|
tg = job.LookupTaskGroup(tg.Name)
|
|
downgradedJob = job
|
|
deploymentID = jobDeploymentID
|
|
} else {
|
|
jobVersion := -1
|
|
if job != nil {
|
|
jobVersion = int(job.Version)
|
|
}
|
|
s.logger.Debug("failed to find appropriate job; using the latest", "expected_version", missing.MinJobVersion, "found_version", jobVersion)
|
|
}
|
|
}
|
|
|
|
// Check if this task group has already failed
|
|
if metric, ok := s.failedTGAllocs[tg.Name]; ok {
|
|
metric.CoalescedFailures += 1
|
|
metric.ExhaustResources(tg)
|
|
continue
|
|
}
|
|
|
|
// Use downgraded job in scheduling stack to honor old job
|
|
// resources, constraints, and node pool scheduler configuration.
|
|
if downgradedJob != nil {
|
|
s.setJob(downgradedJob)
|
|
|
|
if needsToSetNodes(downgradedJob, s.job) {
|
|
nodes, byDC, err = s.setNodes(downgradedJob)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
|
|
// Find the preferred node
|
|
preferredNode, err := s.findPreferredNode(missing)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Check if we should stop the previous allocation upon successful
|
|
// placement of its replacement. This allow atomic placements/stops. We
|
|
// stop the allocation before trying to find a replacement because this
|
|
// frees the resources currently used by the previous allocation.
|
|
stopPrevAlloc, stopPrevAllocDesc := missing.StopPreviousAlloc()
|
|
prevAllocation := missing.PreviousAllocation()
|
|
if stopPrevAlloc {
|
|
s.plan.AppendStoppedAlloc(prevAllocation, stopPrevAllocDesc, "", "")
|
|
}
|
|
|
|
// Compute penalty nodes for rescheduled allocs
|
|
selectOptions := getSelectOptions(prevAllocation, preferredNode)
|
|
selectOptions.AllocName = missing.Name()
|
|
option := s.selectNextOption(tg, selectOptions)
|
|
|
|
// Store the available nodes by datacenter
|
|
s.ctx.Metrics().NodesAvailable = byDC
|
|
s.ctx.Metrics().NodesInPool = len(nodes)
|
|
|
|
// Compute top K scoring node metadata
|
|
s.ctx.Metrics().PopulateScoreMetaData()
|
|
|
|
// Restore stack job and nodes now that placement is done, to use
|
|
// plan job version
|
|
if downgradedJob != nil {
|
|
s.setJob(s.job)
|
|
|
|
if needsToSetNodes(downgradedJob, s.job) {
|
|
nodes, byDC, err = s.setNodes(s.job)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
|
|
// Set fields based on if we found an allocation option
|
|
if option != nil {
|
|
resources := &structs.AllocatedResources{
|
|
Tasks: option.TaskResources,
|
|
TaskLifecycles: option.TaskLifecycles,
|
|
Shared: structs.AllocatedSharedResources{
|
|
DiskMB: int64(tg.EphemeralDisk.SizeMB),
|
|
},
|
|
}
|
|
if option.AllocResources != nil {
|
|
resources.Shared.Networks = option.AllocResources.Networks
|
|
resources.Shared.Ports = option.AllocResources.Ports
|
|
}
|
|
|
|
// Pull the allocation name as a new variables, so we can alter
|
|
// this as needed without making changes to the original
|
|
// object.
|
|
newAllocName := missing.Name()
|
|
|
|
// Identify the index from the name, so we can check this
|
|
// against the allocation name index tracking for duplicates.
|
|
allocIndex := structs.AllocIndexFromName(newAllocName, s.job.ID, tg.Name)
|
|
|
|
// If the allocation index is a duplicate, we cannot simply
|
|
// create a new allocation with the same name. We need to
|
|
// generate a new index and use this. The log message is useful
|
|
// for debugging and development, but could be removed in a
|
|
// future version of Nomad.
|
|
if taskGroupNameIndex.IsDuplicate(allocIndex) {
|
|
oldAllocName := newAllocName
|
|
newAllocName = taskGroupNameIndex.Next(1)[0]
|
|
taskGroupNameIndex.UnsetIndex(allocIndex)
|
|
s.logger.Debug("duplicate alloc index found and changed",
|
|
"old_alloc_name", oldAllocName, "new_alloc_name", newAllocName)
|
|
}
|
|
|
|
// Create an allocation for this
|
|
alloc := &structs.Allocation{
|
|
ID: uuid.Generate(),
|
|
Namespace: s.job.Namespace,
|
|
EvalID: s.eval.ID,
|
|
Name: newAllocName,
|
|
JobID: s.job.ID,
|
|
TaskGroup: tg.Name,
|
|
Metrics: s.ctx.Metrics(),
|
|
NodeID: option.Node.ID,
|
|
NodeName: option.Node.Name,
|
|
DeploymentID: deploymentID,
|
|
TaskResources: resources.OldTaskResources(),
|
|
AllocatedResources: resources,
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
ClientStatus: structs.AllocClientStatusPending,
|
|
// SharedResources is considered deprecated, will be removed in 0.11.
|
|
// It is only set for compat reasons.
|
|
SharedResources: &structs.Resources{
|
|
DiskMB: tg.EphemeralDisk.SizeMB,
|
|
Networks: resources.Shared.Networks,
|
|
},
|
|
}
|
|
|
|
// If the new allocation is replacing an older allocation then we
|
|
// set the record the older allocation id so that they are chained
|
|
if prevAllocation != nil {
|
|
alloc.PreviousAllocation = prevAllocation.ID
|
|
if missing.IsRescheduling() {
|
|
updateRescheduleTracker(alloc, prevAllocation, now)
|
|
}
|
|
|
|
// If the allocation has task handles,
|
|
// copy them to the new allocation
|
|
propagateTaskState(alloc, prevAllocation, missing.PreviousLost())
|
|
}
|
|
|
|
// If we are placing a canary and we found a match, add the canary
|
|
// to the deployment state object and mark it as a canary.
|
|
if missing.Canary() && s.deployment != nil {
|
|
alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
|
|
Canary: true,
|
|
}
|
|
}
|
|
|
|
s.handlePreemptions(option, alloc, missing)
|
|
|
|
// Track the placement
|
|
s.plan.AppendAlloc(alloc, downgradedJob)
|
|
|
|
} else {
|
|
// Lazy initialize the failed map
|
|
if s.failedTGAllocs == nil {
|
|
s.failedTGAllocs = make(map[string]*structs.AllocMetric)
|
|
}
|
|
|
|
// Update metrics with the resources requested by the task group.
|
|
s.ctx.Metrics().ExhaustResources(tg)
|
|
|
|
// Track the fact that we didn't find a placement
|
|
s.failedTGAllocs[tg.Name] = s.ctx.Metrics()
|
|
|
|
// If we weren't able to find a replacement for the allocation, back
|
|
// out the fact that we asked to stop the allocation.
|
|
if stopPrevAlloc {
|
|
s.plan.PopUpdate(prevAllocation)
|
|
}
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// setJob updates the stack with the given job and job's node pool scheduler
|
|
// configuration.
|
|
func (s *GenericScheduler) setJob(job *structs.Job) error {
|
|
// Fetch node pool and global scheduler configuration to determine how to
|
|
// configure the scheduler.
|
|
pool, err := s.state.NodePoolByName(nil, job.NodePool)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get job node pool %q: %v", job.NodePool, err)
|
|
}
|
|
|
|
_, schedConfig, err := s.state.SchedulerConfig()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get scheduler configuration: %v", err)
|
|
}
|
|
|
|
s.stack.SetJob(job)
|
|
s.stack.SetSchedulerConfiguration(schedConfig.WithNodePool(pool))
|
|
return nil
|
|
}
|
|
|
|
// setnodes updates the stack with the nodes that are ready for placement for
|
|
// the given job.
|
|
func (s *GenericScheduler) setNodes(job *structs.Job) ([]*structs.Node, map[string]int, error) {
|
|
nodes, _, byDC, err := readyNodesInDCsAndPool(s.state, job.Datacenters, job.NodePool)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
|
|
s.stack.SetNodes(nodes)
|
|
return nodes, byDC, nil
|
|
}
|
|
|
|
// needsToSetNodes returns true if jobs a and b changed in a way that requires
|
|
// the nodes to be reset.
|
|
func needsToSetNodes(a, b *structs.Job) bool {
|
|
return !helper.SliceSetEq(a.Datacenters, b.Datacenters) ||
|
|
a.NodePool != b.NodePool
|
|
}
|
|
|
|
// propagateTaskState copies task handles from previous allocations to
|
|
// replacement allocations when the previous allocation is being drained or was
|
|
// lost. Remote task drivers rely on this to reconnect to remote tasks when the
|
|
// allocation managing them changes due to a down or draining node.
|
|
//
|
|
// The previous allocation will be marked as lost after task state has been
|
|
// propagated (when the plan is applied), so its ClientStatus is not yet marked
|
|
// as lost. Instead, we use the `prevLost` flag to track whether the previous
|
|
// allocation will be marked lost.
|
|
func propagateTaskState(newAlloc, prev *structs.Allocation, prevLost bool) {
|
|
// Don't transfer state from client terminal allocs
|
|
if prev.ClientTerminalStatus() {
|
|
return
|
|
}
|
|
|
|
// If previous allocation is not lost and not draining, do not copy
|
|
// task handles.
|
|
if !prevLost && !prev.DesiredTransition.ShouldMigrate() {
|
|
return
|
|
}
|
|
|
|
newAlloc.TaskStates = make(map[string]*structs.TaskState, len(newAlloc.AllocatedResources.Tasks))
|
|
for taskName, prevState := range prev.TaskStates {
|
|
if prevState.TaskHandle == nil {
|
|
// No task handle, skip
|
|
continue
|
|
}
|
|
|
|
if _, ok := newAlloc.AllocatedResources.Tasks[taskName]; !ok {
|
|
// Task dropped in update, skip
|
|
continue
|
|
}
|
|
|
|
// Copy state
|
|
newState := structs.NewTaskState()
|
|
newState.TaskHandle = prevState.TaskHandle.Copy()
|
|
newAlloc.TaskStates[taskName] = newState
|
|
}
|
|
}
|
|
|
|
// getSelectOptions sets up preferred nodes and penalty nodes
|
|
func getSelectOptions(prevAllocation *structs.Allocation, preferredNode *structs.Node) *SelectOptions {
|
|
selectOptions := &SelectOptions{}
|
|
if prevAllocation != nil {
|
|
penaltyNodes := make(map[string]struct{})
|
|
|
|
// If alloc failed, penalize the node it failed on to encourage
|
|
// rescheduling on a new node.
|
|
if prevAllocation.ClientStatus == structs.AllocClientStatusFailed {
|
|
penaltyNodes[prevAllocation.NodeID] = struct{}{}
|
|
}
|
|
if prevAllocation.RescheduleTracker != nil {
|
|
for _, reschedEvent := range prevAllocation.RescheduleTracker.Events {
|
|
penaltyNodes[reschedEvent.PrevNodeID] = struct{}{}
|
|
}
|
|
}
|
|
selectOptions.PenaltyNodeIDs = penaltyNodes
|
|
}
|
|
if preferredNode != nil {
|
|
selectOptions.PreferredNodes = []*structs.Node{preferredNode}
|
|
}
|
|
return selectOptions
|
|
}
|
|
|
|
// updateRescheduleTracker carries over previous restart attempts and adds the most recent restart
|
|
func updateRescheduleTracker(alloc *structs.Allocation, prev *structs.Allocation, now time.Time) {
|
|
reschedPolicy := prev.ReschedulePolicy()
|
|
var rescheduleEvents []*structs.RescheduleEvent
|
|
if prev.RescheduleTracker != nil {
|
|
var interval time.Duration
|
|
if reschedPolicy != nil {
|
|
interval = reschedPolicy.Interval
|
|
}
|
|
// If attempts is set copy all events in the interval range
|
|
if reschedPolicy.Attempts > 0 {
|
|
for _, reschedEvent := range prev.RescheduleTracker.Events {
|
|
timeDiff := now.UnixNano() - reschedEvent.RescheduleTime
|
|
// Only copy over events that are within restart interval
|
|
// This keeps the list of events small in cases where there's a long chain of old restart events
|
|
if interval > 0 && timeDiff <= interval.Nanoseconds() {
|
|
rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy())
|
|
}
|
|
}
|
|
} else {
|
|
// Only copy the last n if unlimited is set
|
|
start := 0
|
|
if len(prev.RescheduleTracker.Events) > maxPastRescheduleEvents {
|
|
start = len(prev.RescheduleTracker.Events) - maxPastRescheduleEvents
|
|
}
|
|
for i := start; i < len(prev.RescheduleTracker.Events); i++ {
|
|
reschedEvent := prev.RescheduleTracker.Events[i]
|
|
rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy())
|
|
}
|
|
}
|
|
}
|
|
nextDelay := prev.NextDelay()
|
|
rescheduleEvent := structs.NewRescheduleEvent(now.UnixNano(), prev.ID, prev.NodeID, nextDelay)
|
|
rescheduleEvents = append(rescheduleEvents, rescheduleEvent)
|
|
alloc.RescheduleTracker = &structs.RescheduleTracker{Events: rescheduleEvents}
|
|
}
|
|
|
|
// findPreferredNode finds the preferred node for an allocation
|
|
func (s *GenericScheduler) findPreferredNode(place placementResult) (*structs.Node, error) {
|
|
prev := place.PreviousAllocation()
|
|
if prev == nil {
|
|
return nil, nil
|
|
}
|
|
if place.TaskGroup().EphemeralDisk.Sticky || place.TaskGroup().EphemeralDisk.Migrate {
|
|
var preferredNode *structs.Node
|
|
ws := memdb.NewWatchSet()
|
|
preferredNode, err := s.state.NodeByID(ws, prev.NodeID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if preferredNode != nil && preferredNode.Ready() {
|
|
return preferredNode, nil
|
|
}
|
|
}
|
|
return nil, nil
|
|
}
|
|
|
|
// selectNextOption calls the stack to get a node for placement
|
|
func (s *GenericScheduler) selectNextOption(tg *structs.TaskGroup, selectOptions *SelectOptions) *RankedNode {
|
|
option := s.stack.Select(tg, selectOptions)
|
|
_, schedConfig, _ := s.ctx.State().SchedulerConfig()
|
|
|
|
// Check if preemption is enabled, defaults to true
|
|
//
|
|
// The scheduler configuration is read directly from state but only
|
|
// values that can't be specified per node pool should be used. Other
|
|
// values must be merged by calling schedConfig.WithNodePool() and set in
|
|
// the stack by calling SetSchedulerConfiguration().
|
|
enablePreemption := true
|
|
if schedConfig != nil {
|
|
if s.job.Type == structs.JobTypeBatch {
|
|
enablePreemption = schedConfig.PreemptionConfig.BatchSchedulerEnabled
|
|
} else {
|
|
enablePreemption = schedConfig.PreemptionConfig.ServiceSchedulerEnabled
|
|
}
|
|
}
|
|
// Run stack again with preemption enabled
|
|
if option == nil && enablePreemption {
|
|
selectOptions.Preempt = true
|
|
option = s.stack.Select(tg, selectOptions)
|
|
}
|
|
return option
|
|
}
|
|
|
|
// handlePreemptions sets relevant preeemption related fields.
|
|
func (s *GenericScheduler) handlePreemptions(option *RankedNode, alloc *structs.Allocation, missing placementResult) {
|
|
if option.PreemptedAllocs == nil {
|
|
return
|
|
}
|
|
|
|
// If this placement involves preemption, set DesiredState to evict for those allocations
|
|
var preemptedAllocIDs []string
|
|
for _, stop := range option.PreemptedAllocs {
|
|
s.plan.AppendPreemptedAlloc(stop, alloc.ID)
|
|
preemptedAllocIDs = append(preemptedAllocIDs, stop.ID)
|
|
|
|
if s.eval.AnnotatePlan && s.plan.Annotations != nil {
|
|
s.plan.Annotations.PreemptedAllocs = append(s.plan.Annotations.PreemptedAllocs, stop.Stub(nil))
|
|
if s.plan.Annotations.DesiredTGUpdates != nil {
|
|
desired := s.plan.Annotations.DesiredTGUpdates[missing.TaskGroup().Name]
|
|
desired.Preemptions += 1
|
|
}
|
|
}
|
|
}
|
|
|
|
alloc.PreemptedAllocations = preemptedAllocIDs
|
|
}
|