open-nomad/scheduler/reconcile.go

984 lines
34 KiB
Go
Raw Normal View History

2017-05-22 17:58:34 +00:00
package scheduler
import (
2017-07-07 23:49:08 +00:00
"fmt"
"time"
"sort"
2018-09-15 23:23:13 +00:00
log "github.com/hashicorp/go-hclog"
2017-05-22 17:58:34 +00:00
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/helper/uuid"
2017-05-22 17:58:34 +00:00
"github.com/hashicorp/nomad/nomad/structs"
)
2018-03-08 00:44:54 +00:00
const (
	// batchedFailedAllocWindowSize is the window size used
	// to batch up failed allocations before creating an eval
	batchedFailedAllocWindowSize = 5 * time.Second

	// rescheduleWindowSize is the window size relative to
	// current time within which reschedulable allocations are placed.
	// This helps protect against small clock drifts between servers
	rescheduleWindowSize = 1 * time.Second
)
// allocUpdateType takes an existing allocation and a new job definition and
// returns whether the allocation can ignore the change, requires a destructive
// update, or can be inplace updated. If it can be inplace updated, an updated
// allocation that has the new resources and alloc metrics attached will be
// returned.
2017-06-01 22:16:24 +00:00
type allocUpdateType func(existing *structs.Allocation, newJob *structs.Job,
newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation)
2017-05-23 00:42:41 +00:00
// allocReconciler is used to determine the set of allocations that require
// placement, inplace updating or stopping given the job specification and
// existing cluster state. The reconciler should only be used for batch and
// service jobs.
2017-05-22 17:58:34 +00:00
type allocReconciler struct {
// logger is used to log debug information. Logging should be kept at a
// minimal here
2018-09-15 23:23:13 +00:00
logger log.Logger
2017-05-22 17:58:34 +00:00
// canInplace is used to check if the allocation can be inplace upgraded
allocUpdateFn allocUpdateType
2017-05-22 17:58:34 +00:00
// batch marks whether the job is a batch job
batch bool
// job is the job being operated on, it may be nil if the job is being
// stopped via a purge
job *structs.Job
// jobID is the ID of the job being operated on. The job may be nil if it is
// being stopped so we require this separately.
jobID string
2017-07-05 19:50:40 +00:00
// oldDeployment is the last deployment for the job
oldDeployment *structs.Deployment
2017-05-22 17:58:34 +00:00
// deployment is the current deployment for the job
deployment *structs.Deployment
// deploymentPaused marks whether the deployment is paused
deploymentPaused bool
2017-06-02 23:11:29 +00:00
// deploymentFailed marks whether the deployment is failed
deploymentFailed bool
2017-05-22 17:58:34 +00:00
// taintedNodes contains a map of nodes that are tainted
taintedNodes map[string]*structs.Node
// existingAllocs is non-terminal existing allocations
existingAllocs []*structs.Allocation
// evalID is the ID of the evaluation that triggered the reconciler
evalID string
// now is the time used when determining rescheduling eligibility
// defaults to time.Now, and overidden in unit tests
now time.Time
2017-05-22 17:58:34 +00:00
// result is the results of the reconcile. During computation it can be
// used to store intermediate state
result *reconcileResults
}
2017-05-23 00:42:41 +00:00
// reconcileResults contains the results of the reconciliation and should be
// applied by the scheduler.
2017-05-22 17:58:34 +00:00
type reconcileResults struct {
// deployment is the deployment that should be created or updated as a
// result of scheduling
deployment *structs.Deployment
2017-05-23 00:42:41 +00:00
// deploymentUpdates contains a set of deployment updates that should be
// applied as a result of scheduling
2017-05-22 17:58:34 +00:00
deploymentUpdates []*structs.DeploymentStatusUpdate
2017-05-23 00:42:41 +00:00
// place is the set of allocations to place by the scheduler
place []allocPlaceResult
2017-07-15 23:31:33 +00:00
// destructiveUpdate is the set of allocations to apply a destructive update to
destructiveUpdate []allocDestructiveResult
2017-05-23 00:42:41 +00:00
// inplaceUpdate is the set of allocations to apply an inplace update to
2017-05-22 17:58:34 +00:00
inplaceUpdate []*structs.Allocation
2017-05-23 00:42:41 +00:00
// stop is the set of allocations to stop
stop []allocStopResult
2017-05-22 17:58:34 +00:00
2018-03-26 18:06:21 +00:00
// attributeUpdates are updates to the allocation that are not from a
2018-03-23 23:55:21 +00:00
// jobspec change.
2018-03-26 18:06:21 +00:00
attributeUpdates map[string]*structs.Allocation
2018-03-23 23:55:21 +00:00
2017-05-23 20:02:47 +00:00
// desiredTGUpdates captures the desired set of changes to make for each
// task group.
desiredTGUpdates map[string]*structs.DesiredUpdates
// desiredFollowupEvals is the map of follow up evaluations to create per task group
// This is used to create a delayed evaluation for rescheduling failed allocations.
desiredFollowupEvals map[string][]*structs.Evaluation
}
// delayedRescheduleInfo contains the allocation id and a time when its eligible to be rescheduled.
// this is used to create follow up evaluations
type delayedRescheduleInfo struct {
// allocID is the ID of the allocation eligible to be rescheduled
allocID string
Stop allocs to be rescheduled Currently, when an alloc fails and is rescheduled, the alloc desired state remains as "run" and the nomad client may not free the resources. Here, we ensure that an alloc is marked as stopped when it's rescheduled. Notice the Desired Status and Description before and after this change: Before: ``` mars-2:nomad notnoop$ nomad alloc status 02aba49e ID = 02aba49e Eval ID = bb9ed1d2 Name = example-reschedule.nodes[0] Node ID = 5853d547 Node Name = mars-2.local Job ID = example-reschedule Job Version = 0 Client Status = failed Client Description = Failed tasks Desired Status = run Desired Description = <none> Created = 10s ago Modified = 5s ago Replacement Alloc ID = d6bf872b Task "payload" is "dead" Task Resources CPU Memory Disk Addresses 0/100 MHz 24 MiB/300 MiB 300 MiB Task Events: Started At = 2019-06-06T21:12:45Z Finished At = 2019-06-06T21:12:50Z Total Restarts = 0 Last Restart = N/A Recent Events: Time Type Description 2019-06-06T17:12:50-04:00 Not Restarting Policy allows no restarts 2019-06-06T17:12:50-04:00 Terminated Exit Code: 1 2019-06-06T17:12:45-04:00 Started Task started by client 2019-06-06T17:12:45-04:00 Task Setup Building Task Directory 2019-06-06T17:12:45-04:00 Received Task received by client ``` After: ``` ID = 5001ccd1 Eval ID = 53507a02 Name = example-reschedule.nodes[0] Node ID = a3b04364 Node Name = mars-2.local Job ID = example-reschedule Job Version = 0 Client Status = failed Client Description = Failed tasks Desired Status = stop Desired Description = alloc was rescheduled because it failed Created = 13s ago Modified = 3s ago Replacement Alloc ID = 7ba7ac20 Task "payload" is "dead" Task Resources CPU Memory Disk Addresses 21/100 MHz 24 MiB/300 MiB 300 MiB Task Events: Started At = 2019-06-06T21:22:50Z Finished At = 2019-06-06T21:22:55Z Total Restarts = 0 Last Restart = N/A Recent Events: Time Type Description 2019-06-06T17:22:55-04:00 Not Restarting Policy allows no restarts 2019-06-06T17:22:55-04:00 
Terminated Exit Code: 1 2019-06-06T17:22:50-04:00 Started Task started by client 2019-06-06T17:22:50-04:00 Task Setup Building Task Directory 2019-06-06T17:22:50-04:00 Received Task received by client ```
2019-06-06 19:04:32 +00:00
alloc *structs.Allocation
// rescheduleTime is the time to use in the delayed evaluation
rescheduleTime time.Time
2017-05-22 17:58:34 +00:00
}
2017-07-07 23:49:08 +00:00
func (r *reconcileResults) GoString() string {
2017-07-15 23:31:33 +00:00
base := fmt.Sprintf("Total changes: (place %d) (destructive %d) (inplace %d) (stop %d)",
len(r.place), len(r.destructiveUpdate), len(r.inplaceUpdate), len(r.stop))
2017-07-07 23:49:08 +00:00
if r.deployment != nil {
base += fmt.Sprintf("\nCreated Deployment: %q", r.deployment.ID)
}
for _, u := range r.deploymentUpdates {
base += fmt.Sprintf("\nDeployment Update for ID %q: Status %q; Description %q",
u.DeploymentID, u.Status, u.StatusDescription)
}
for tg, u := range r.desiredTGUpdates {
base += fmt.Sprintf("\nDesired Changes for %q: %#v", tg, u)
}
return base
}
2017-07-05 19:55:51 +00:00
// Changes returns the number of total changes: placements, destructive
// updates, inplace updates and stops. Destructive updates are tracked in their
// own slice (not in place or stop), so they are counted explicitly here to
// stay consistent with the "Total changes" summary produced by GoString.
func (r *reconcileResults) Changes() int {
	return len(r.place) + len(r.destructiveUpdate) + len(r.inplaceUpdate) + len(r.stop)
}
2017-05-23 00:42:41 +00:00
// NewAllocReconciler creates a new reconciler that should be used to determine
// the changes required to bring the cluster state inline with the declared jobspec
2018-09-15 23:23:13 +00:00
func NewAllocReconciler(logger log.Logger, allocUpdateFn allocUpdateType, batch bool,
jobID string, job *structs.Job, deployment *structs.Deployment,
existingAllocs []*structs.Allocation, taintedNodes map[string]*structs.Node, evalID string) *allocReconciler {
2017-07-06 02:46:57 +00:00
return &allocReconciler{
2018-09-15 23:23:13 +00:00
logger: logger.Named("reconciler"),
allocUpdateFn: allocUpdateFn,
2017-05-22 17:58:34 +00:00
batch: batch,
jobID: jobID,
2017-05-22 17:58:34 +00:00
job: job,
2017-07-05 19:55:51 +00:00
deployment: deployment.Copy(),
2017-05-22 17:58:34 +00:00
existingAllocs: existingAllocs,
taintedNodes: taintedNodes,
evalID: evalID,
now: time.Now(),
2017-05-23 20:02:47 +00:00
result: &reconcileResults{
desiredTGUpdates: make(map[string]*structs.DesiredUpdates),
desiredFollowupEvals: make(map[string][]*structs.Evaluation),
2017-05-23 20:02:47 +00:00
},
2017-05-22 17:58:34 +00:00
}
}
2017-05-23 00:42:41 +00:00
// Compute reconciles the existing cluster state and returns the set of changes
// required to converge the job spec and state
2017-05-22 17:58:34 +00:00
func (a *allocReconciler) Compute() *reconcileResults {
2017-06-01 22:16:24 +00:00
// Create the allocation matrix
m := newAllocMatrix(a.job, a.existingAllocs)
2017-06-06 21:08:46 +00:00
// Handle stopping unneeded deployments
a.cancelDeployments()
2017-06-01 22:16:24 +00:00
2017-05-22 17:58:34 +00:00
// If we are just stopping a job we do not need to do anything more than
// stopping all running allocs
2017-06-01 22:16:24 +00:00
if a.job.Stopped() {
a.handleStop(m)
return a.result
}
2017-07-06 02:46:57 +00:00
// Detect if the deployment is paused
if a.deployment != nil {
a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused ||
a.deployment.Status == structs.DeploymentStatusPending
2018-04-08 23:09:14 +00:00
a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
2017-07-06 02:46:57 +00:00
}
if a.deployment == nil {
// When we create the deployment later, it will be in a pending
// state. But we also need to tell Compute we're paused, otherwise we
// make placements on the paused deployment.
if a.job.IsMultiregion() && !(a.job.IsPeriodic() || a.job.IsParameterized()) {
a.deploymentPaused = true
}
}
2017-07-06 02:46:57 +00:00
2017-06-01 22:16:24 +00:00
// Reconcile each group
2017-07-05 19:55:51 +00:00
complete := true
2017-06-01 22:16:24 +00:00
for group, as := range m {
2017-07-05 19:55:51 +00:00
groupComplete := a.computeGroup(group, as)
complete = complete && groupComplete
}
// Mark the deployment as complete if possible
if a.deployment != nil && complete {
2020-06-17 15:02:26 +00:00
if a.job.IsMultiregion() {
// the unblocking/successful states come after blocked, so we
// need to make sure we don't revert those states
if a.deployment.Status != structs.DeploymentStatusUnblocking &&
a.deployment.Status != structs.DeploymentStatusSuccessful {
a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
DeploymentID: a.deployment.ID,
Status: structs.DeploymentStatusBlocked,
StatusDescription: structs.DeploymentStatusDescriptionBlocked,
})
}
2020-06-17 15:02:26 +00:00
} else {
a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
DeploymentID: a.deployment.ID,
Status: structs.DeploymentStatusSuccessful,
StatusDescription: structs.DeploymentStatusDescriptionSuccessful,
})
2020-06-17 15:02:26 +00:00
}
2017-06-01 22:16:24 +00:00
}
// Set the description of a created deployment
if d := a.result.deployment; d != nil {
if d.RequiresPromotion() {
if d.HasAutoPromote() {
d.StatusDescription = structs.DeploymentStatusDescriptionRunningAutoPromotion
} else {
d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
}
}
}
2017-06-01 22:16:24 +00:00
return a.result
}
2017-05-22 17:58:34 +00:00
2017-06-06 21:08:46 +00:00
// cancelDeployments cancels any deployment that is not needed
func (a *allocReconciler) cancelDeployments() {
// If the job is stopped and there is a non-terminal deployment, cancel it
2017-06-01 22:16:24 +00:00
if a.job.Stopped() {
2017-06-02 23:11:29 +00:00
if a.deployment != nil && a.deployment.Active() {
2017-05-22 17:58:34 +00:00
a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
DeploymentID: a.deployment.ID,
Status: structs.DeploymentStatusCancelled,
StatusDescription: structs.DeploymentStatusDescriptionStoppedJob,
})
}
2017-06-01 22:16:24 +00:00
// Nothing else to do
a.oldDeployment = a.deployment
a.deployment = nil
2017-06-01 22:16:24 +00:00
return
2017-05-22 17:58:34 +00:00
}
2017-07-05 19:50:40 +00:00
d := a.deployment
if d == nil {
return
}
// Check if the deployment is active and referencing an older job and cancel it
if d.JobCreateIndex != a.job.CreateIndex || d.JobVersion != a.job.Version {
2017-07-06 00:13:45 +00:00
if d.Active() {
a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
DeploymentID: a.deployment.ID,
Status: structs.DeploymentStatusCancelled,
StatusDescription: structs.DeploymentStatusDescriptionNewerJob,
})
}
2017-07-05 19:55:51 +00:00
2017-07-05 19:50:40 +00:00
a.oldDeployment = d
a.deployment = nil
}
// Clear it as the current deployment if it is successful
if d.Status == structs.DeploymentStatusSuccessful {
a.oldDeployment = d
a.deployment = nil
}
2017-05-22 17:58:34 +00:00
}
// handleStop marks all allocations to be stopped, handling the lost case
2017-06-01 22:16:24 +00:00
func (a *allocReconciler) handleStop(m allocMatrix) {
for group, as := range m {
as = filterByTerminal(as)
2017-06-01 22:16:24 +00:00
untainted, migrate, lost := as.filterByTainted(a.taintedNodes)
a.markStop(untainted, "", allocNotNeeded)
a.markStop(migrate, "", allocNotNeeded)
a.markStop(lost, structs.AllocClientStatusLost, allocLost)
desiredChanges := new(structs.DesiredUpdates)
desiredChanges.Stop = uint64(len(as))
a.result.desiredTGUpdates[group] = desiredChanges
}
2017-05-22 17:58:34 +00:00
}
2017-05-23 00:42:41 +00:00
// markStop is a helper for marking a set of allocation for stop with a
// particular client status and description.
2017-05-22 17:58:34 +00:00
func (a *allocReconciler) markStop(allocs allocSet, clientStatus, statusDescription string) {
for _, alloc := range allocs {
a.result.stop = append(a.result.stop, allocStopResult{
alloc: alloc,
clientStatus: clientStatus,
statusDescription: statusDescription,
})
}
}
// markDelayed queues every allocation in allocs for stopping, exactly like
// markStop, and additionally records the follow-up evaluation ID found in
// followupEvals under the allocation's ID. Allocations without an entry in
// followupEvals get an empty follow-up evaluation ID.
func (a *allocReconciler) markDelayed(allocs allocSet, clientStatus, statusDescription string, followupEvals map[string]string) {
	for _, alloc := range allocs {
		stopped := allocStopResult{
			alloc:             alloc,
			clientStatus:      clientStatus,
			statusDescription: statusDescription,
			followupEvalID:    followupEvals[alloc.ID],
		}
		a.result.stop = append(a.result.stop, stopped)
	}
}
2017-07-05 19:55:51 +00:00
// computeGroup reconciles state for a particular task group. It returns whether
// the deployment it is for is complete with regards to the task group.
func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
2017-05-23 20:02:47 +00:00
// Create the desired update object for the group
desiredChanges := new(structs.DesiredUpdates)
a.result.desiredTGUpdates[group] = desiredChanges
2017-05-22 17:58:34 +00:00
// Get the task group. The task group may be nil if the job was updates such
// that the task group no longer exists
tg := a.job.LookupTaskGroup(group)
// If the task group is nil, then the task group has been removed so all we
// need to do is stop everything
if tg == nil {
2017-06-02 23:11:29 +00:00
untainted, migrate, lost := all.filterByTainted(a.taintedNodes)
2017-05-22 17:58:34 +00:00
a.markStop(untainted, "", allocNotNeeded)
a.markStop(migrate, "", allocNotNeeded)
a.markStop(lost, structs.AllocClientStatusLost, allocLost)
2017-05-23 20:02:47 +00:00
desiredChanges.Stop = uint64(len(untainted) + len(migrate) + len(lost))
2017-07-05 19:55:51 +00:00
return true
2017-05-22 17:58:34 +00:00
}
2017-05-31 18:34:46 +00:00
// Get the deployment state for the group
var dstate *structs.DeploymentState
existingDeployment := false
2017-05-31 18:34:46 +00:00
if a.deployment != nil {
2017-06-06 21:08:46 +00:00
dstate, existingDeployment = a.deployment.TaskGroups[group]
}
if !existingDeployment {
2018-04-04 22:39:45 +00:00
dstate = &structs.DeploymentState{}
if !tg.Update.IsEmpty() {
2018-04-04 22:39:45 +00:00
dstate.AutoRevert = tg.Update.AutoRevert
dstate.AutoPromote = tg.Update.AutoPromote
2018-04-04 22:39:45 +00:00
dstate.ProgressDeadline = tg.Update.ProgressDeadline
2017-06-30 19:35:59 +00:00
}
2017-05-31 18:34:46 +00:00
}
2018-03-23 23:55:21 +00:00
// Filter allocations that do not need to be considered because they are
// from an older job version and are terminal.
all, ignore := a.filterOldTerminalAllocs(all)
desiredChanges.Ignore += uint64(len(ignore))
// canaries is the set of canaries for the current deployment and all is all
// allocs including the canaries
2017-07-05 19:50:40 +00:00
canaries, all := a.handleGroupCanaries(all, desiredChanges)
2017-05-22 17:58:34 +00:00
2017-07-05 19:50:40 +00:00
// Determine what set of allocations are on tainted nodes
2017-06-02 23:11:29 +00:00
untainted, migrate, lost := all.filterByTainted(a.taintedNodes)
// Determine what set of terminal allocations need to be rescheduled
untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, a.now, a.evalID, a.deployment)
// Find delays for any lost allocs that have stop_after_client_disconnect
lostLater := lost.delayByStopAfterClientDisconnect()
lostLaterEvals := a.handleDelayedLost(lostLater, all, tg.Name)
// Create batched follow up evaluations for allocations that are
// reschedulable later and mark the allocations for in place updating
a.handleDelayedReschedules(rescheduleLater, all, tg.Name)
// Create a structure for choosing names. Seed with the taken names
// which is the union of untainted, rescheduled, allocs on migrating
// nodes, and allocs on down nodes (includes canaries)
nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate, rescheduleNow, lost))
2017-05-31 18:34:46 +00:00
2017-05-22 17:58:34 +00:00
// Stop any unneeded allocations and update the untainted set to not
// include stopped allocations.
2017-06-02 23:11:29 +00:00
canaryState := dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
stop := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, canaryState, lostLaterEvals)
2017-05-23 20:02:47 +00:00
desiredChanges.Stop += uint64(len(stop))
2017-05-31 18:34:46 +00:00
untainted = untainted.difference(stop)
2017-05-22 17:58:34 +00:00
// Do inplace upgrades where possible and capture the set of upgrades that
// need to be done destructively.
ignore, inplace, destructive := a.computeUpdates(tg, untainted)
2017-05-23 20:02:47 +00:00
desiredChanges.Ignore += uint64(len(ignore))
desiredChanges.InPlaceUpdate += uint64(len(inplace))
2017-06-06 21:08:46 +00:00
if !existingDeployment {
2017-07-06 21:28:59 +00:00
dstate.DesiredTotal += len(destructive) + len(inplace)
2017-06-02 23:11:29 +00:00
}
2017-05-23 00:42:41 +00:00
// Remove the canaries now that we have handled rescheduling so that we do
// not consider them when making placement decisions.
if canaryState {
untainted = untainted.difference(canaries)
}
2017-05-22 17:58:34 +00:00
// The fact that we have destructive updates and have less canaries than is
// desired means we need to create canaries
2017-06-01 22:16:24 +00:00
strategy := tg.Update
2017-06-26 21:23:52 +00:00
canariesPromoted := dstate != nil && dstate.Promoted
requireCanary := len(destructive) != 0 && strategy != nil && len(canaries) < strategy.Canary && !canariesPromoted
if requireCanary {
dstate.DesiredCanaries = strategy.Canary
}
2017-06-02 23:11:29 +00:00
if requireCanary && !a.deploymentPaused && !a.deploymentFailed {
2017-05-23 20:02:47 +00:00
number := strategy.Canary - len(canaries)
desiredChanges.Canary += uint64(number)
2017-05-23 23:08:35 +00:00
2017-05-31 18:34:46 +00:00
for _, name := range nameIndex.NextCanaries(uint(number), canaries, destructive) {
2017-05-22 17:58:34 +00:00
a.result.place = append(a.result.place, allocPlaceResult{
2017-05-31 18:34:46 +00:00
name: name,
2017-05-22 17:58:34 +00:00
canary: true,
taskGroup: tg,
})
}
}
// Determine how many we can place
2017-06-02 23:11:29 +00:00
canaryState = dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
limit := a.computeLimit(tg, untainted, destructive, migrate, canaryState)
2017-05-22 17:58:34 +00:00
// Place if:
2017-06-02 23:11:29 +00:00
// * The deployment is not paused or failed
2017-05-22 17:58:34 +00:00
// * Not placing any canaries
// * If there are any canaries that they have been promoted
// * There is no delayed stop_after_client_disconnect alloc, which delays scheduling for the whole group
// * An alloc was lost
var place []allocPlaceResult
if len(lostLater) == 0 {
place = a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow, canaryState, lost)
if !existingDeployment {
dstate.DesiredTotal += len(place)
}
2017-05-23 23:08:35 +00:00
}
// deploymentPlaceReady tracks whether the deployment is in a state where
// placements can be made without any other consideration.
deploymentPlaceReady := !a.deploymentPaused && !a.deploymentFailed && !canaryState
if deploymentPlaceReady {
2017-06-02 23:11:29 +00:00
desiredChanges.Place += uint64(len(place))
2020-12-09 19:05:18 +00:00
a.result.place = append(a.result.place, place...)
Stop allocs to be rescheduled Currently, when an alloc fails and is rescheduled, the alloc desired state remains as "run" and the nomad client may not free the resources. Here, we ensure that an alloc is marked as stopped when it's rescheduled. Notice the Desired Status and Description before and after this change: Before: ``` mars-2:nomad notnoop$ nomad alloc status 02aba49e ID = 02aba49e Eval ID = bb9ed1d2 Name = example-reschedule.nodes[0] Node ID = 5853d547 Node Name = mars-2.local Job ID = example-reschedule Job Version = 0 Client Status = failed Client Description = Failed tasks Desired Status = run Desired Description = <none> Created = 10s ago Modified = 5s ago Replacement Alloc ID = d6bf872b Task "payload" is "dead" Task Resources CPU Memory Disk Addresses 0/100 MHz 24 MiB/300 MiB 300 MiB Task Events: Started At = 2019-06-06T21:12:45Z Finished At = 2019-06-06T21:12:50Z Total Restarts = 0 Last Restart = N/A Recent Events: Time Type Description 2019-06-06T17:12:50-04:00 Not Restarting Policy allows no restarts 2019-06-06T17:12:50-04:00 Terminated Exit Code: 1 2019-06-06T17:12:45-04:00 Started Task started by client 2019-06-06T17:12:45-04:00 Task Setup Building Task Directory 2019-06-06T17:12:45-04:00 Received Task received by client ``` After: ``` ID = 5001ccd1 Eval ID = 53507a02 Name = example-reschedule.nodes[0] Node ID = a3b04364 Node Name = mars-2.local Job ID = example-reschedule Job Version = 0 Client Status = failed Client Description = Failed tasks Desired Status = stop Desired Description = alloc was rescheduled because it failed Created = 13s ago Modified = 3s ago Replacement Alloc ID = 7ba7ac20 Task "payload" is "dead" Task Resources CPU Memory Disk Addresses 21/100 MHz 24 MiB/300 MiB 300 MiB Task Events: Started At = 2019-06-06T21:22:50Z Finished At = 2019-06-06T21:22:55Z Total Restarts = 0 Last Restart = N/A Recent Events: Time Type Description 2019-06-06T17:22:55-04:00 Not Restarting Policy allows no restarts 2019-06-06T17:22:55-04:00 
Terminated Exit Code: 1 2019-06-06T17:22:50-04:00 Started Task started by client 2019-06-06T17:22:50-04:00 Task Setup Building Task Directory 2019-06-06T17:22:50-04:00 Received Task received by client ```
2019-06-06 19:04:32 +00:00
a.markStop(rescheduleNow, "", allocRescheduled)
desiredChanges.Stop += uint64(len(rescheduleNow))
min := helper.IntMin(len(place), limit)
limit -= min
} else if !deploymentPlaceReady {
// We do not want to place additional allocations but in the case we
// have lost allocations or allocations that require rescheduling now,
// we do so regardless to avoid odd user experiences.
if len(lost) != 0 {
allowed := helper.IntMin(len(lost), len(place))
desiredChanges.Place += uint64(allowed)
2020-12-09 19:05:18 +00:00
a.result.place = append(a.result.place, place[:allowed]...)
}
// Handle rescheduling of failed allocations even if the deployment is
// failed. We do not reschedule if the allocation is part of the failed
// deployment.
if now := len(rescheduleNow); now != 0 {
for _, p := range place {
prev := p.PreviousAllocation()
if p.IsRescheduling() && !(a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID) {
a.result.place = append(a.result.place, p)
desiredChanges.Place++
Stop allocs to be rescheduled Currently, when an alloc fails and is rescheduled, the alloc desired state remains as "run" and the nomad client may not free the resources. Here, we ensure that an alloc is marked as stopped when it's rescheduled. Notice the Desired Status and Description before and after this change: Before: ``` mars-2:nomad notnoop$ nomad alloc status 02aba49e ID = 02aba49e Eval ID = bb9ed1d2 Name = example-reschedule.nodes[0] Node ID = 5853d547 Node Name = mars-2.local Job ID = example-reschedule Job Version = 0 Client Status = failed Client Description = Failed tasks Desired Status = run Desired Description = <none> Created = 10s ago Modified = 5s ago Replacement Alloc ID = d6bf872b Task "payload" is "dead" Task Resources CPU Memory Disk Addresses 0/100 MHz 24 MiB/300 MiB 300 MiB Task Events: Started At = 2019-06-06T21:12:45Z Finished At = 2019-06-06T21:12:50Z Total Restarts = 0 Last Restart = N/A Recent Events: Time Type Description 2019-06-06T17:12:50-04:00 Not Restarting Policy allows no restarts 2019-06-06T17:12:50-04:00 Terminated Exit Code: 1 2019-06-06T17:12:45-04:00 Started Task started by client 2019-06-06T17:12:45-04:00 Task Setup Building Task Directory 2019-06-06T17:12:45-04:00 Received Task received by client ``` After: ``` ID = 5001ccd1 Eval ID = 53507a02 Name = example-reschedule.nodes[0] Node ID = a3b04364 Node Name = mars-2.local Job ID = example-reschedule Job Version = 0 Client Status = failed Client Description = Failed tasks Desired Status = stop Desired Description = alloc was rescheduled because it failed Created = 13s ago Modified = 3s ago Replacement Alloc ID = 7ba7ac20 Task "payload" is "dead" Task Resources CPU Memory Disk Addresses 21/100 MHz 24 MiB/300 MiB 300 MiB Task Events: Started At = 2019-06-06T21:22:50Z Finished At = 2019-06-06T21:22:55Z Total Restarts = 0 Last Restart = N/A Recent Events: Time Type Description 2019-06-06T17:22:55-04:00 Not Restarting Policy allows no restarts 2019-06-06T17:22:55-04:00 
Terminated Exit Code: 1 2019-06-06T17:22:50-04:00 Started Task started by client 2019-06-06T17:22:50-04:00 Task Setup Building Task Directory 2019-06-06T17:22:50-04:00 Received Task received by client ```
2019-06-06 19:04:32 +00:00
a.result.stop = append(a.result.stop, allocStopResult{
alloc: prev,
statusDescription: allocRescheduled,
})
desiredChanges.Stop++
}
}
}
}
2017-05-22 17:58:34 +00:00
if deploymentPlaceReady {
2017-05-22 17:58:34 +00:00
// Do all destructive updates
min := helper.IntMin(len(destructive), limit)
2017-06-02 23:11:29 +00:00
desiredChanges.DestructiveUpdate += uint64(min)
desiredChanges.Ignore += uint64(len(destructive) - min)
for _, alloc := range destructive.nameOrder()[:min] {
2017-07-15 23:31:33 +00:00
a.result.destructiveUpdate = append(a.result.destructiveUpdate, allocDestructiveResult{
placeName: alloc.Name,
placeTaskGroup: tg,
stopAlloc: alloc,
stopStatusDescription: allocUpdating,
2017-05-22 17:58:34 +00:00
})
}
2017-06-02 23:11:29 +00:00
} else {
desiredChanges.Ignore += uint64(len(destructive))
2017-05-22 17:58:34 +00:00
}
// Migrate all the allocations
desiredChanges.Migrate += uint64(len(migrate))
for _, alloc := range migrate.nameOrder() {
2017-05-22 17:58:34 +00:00
a.result.stop = append(a.result.stop, allocStopResult{
alloc: alloc,
statusDescription: allocMigrating,
})
a.result.place = append(a.result.place, allocPlaceResult{
name: alloc.Name,
canary: alloc.DeploymentStatus.IsCanary(),
taskGroup: tg,
previousAlloc: alloc,
downgradeNonCanary: canaryState && !alloc.DeploymentStatus.IsCanary(),
minJobVersion: alloc.Job.Version,
})
}
2017-06-02 23:11:29 +00:00
2018-03-23 23:55:21 +00:00
// Create new deployment if:
// 1. Updating a job specification
// 2. No running allocations (first time running a job)
updatingSpec := len(destructive) != 0 || len(a.result.inplaceUpdate) != 0
hadRunning := false
for _, alloc := range all {
if alloc.Job.Version == a.job.Version && alloc.Job.CreateIndex == a.job.CreateIndex {
2018-03-23 23:55:21 +00:00
hadRunning = true
break
}
}
2017-06-06 21:08:46 +00:00
// Create a new deployment if necessary
if !existingDeployment && !strategy.IsEmpty() && dstate.DesiredTotal != 0 && (!hadRunning || updatingSpec) {
// A previous group may have made the deployment already
if a.deployment == nil {
a.deployment = structs.NewDeployment(a.job)
// in multiregion jobs, most deployments start in a pending state
if a.job.IsMultiregion() && !(a.job.IsPeriodic() && a.job.IsParameterized()) {
a.deployment.Status = structs.DeploymentStatusPending
a.deployment.StatusDescription = structs.DeploymentStatusDescriptionPendingForPeer
}
a.result.deployment = a.deployment
}
// Attach the groups deployment state to the deployment
2017-06-06 21:08:46 +00:00
a.deployment.TaskGroups[group] = dstate
}
2017-07-05 19:55:51 +00:00
2017-07-06 21:28:59 +00:00
// deploymentComplete is whether the deployment is complete which largely
// means that no placements were made or desired to be made
deploymentComplete := len(destructive)+len(inplace)+len(place)+len(migrate)+len(rescheduleNow)+len(rescheduleLater) == 0 && !requireCanary
2017-07-06 21:28:59 +00:00
2017-07-05 19:55:51 +00:00
// Final check to see if the deployment is complete is to ensure everything
// is healthy
if deploymentComplete && a.deployment != nil {
if dstate, ok := a.deployment.TaskGroups[group]; ok {
if dstate.HealthyAllocs < helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) || // Make sure we have enough healthy allocs
(dstate.DesiredCanaries > 0 && !dstate.Promoted) { // Make sure we are promoted if we have canaries
deploymentComplete = false
}
2017-07-05 19:55:51 +00:00
}
}
return deploymentComplete
2017-05-22 17:58:34 +00:00
}
2018-03-23 23:55:21 +00:00
// filterOldTerminalAllocs splits the given set into the allocations that
// still need to be considered and the ones that can be ignored because they
// are terminal and belong to an older version of the job. Only batch jobs are
// filtered; for any other job the input set is returned untouched together
// with a nil ignore set. The input set itself is never modified.
func (a *allocReconciler) filterOldTerminalAllocs(all allocSet) (filtered, ignore allocSet) {
	if !a.batch {
		return all, nil
	}

	// Work on a copy so the caller's set is left intact.
	filtered = filtered.union(all)
	ignored := make(map[string]*structs.Allocation)

	for id, alloc := range filtered {
		// An alloc is current when neither its job version nor its create
		// index is behind the job being reconciled.
		current := alloc.Job.Version >= a.job.Version && alloc.Job.CreateIndex >= a.job.CreateIndex
		if current || !alloc.TerminalStatus() {
			continue
		}

		// Terminal alloc from an older job: move it to the ignored set.
		delete(filtered, id)
		ignored[id] = alloc
	}

	return filtered, ignored
}
2017-07-05 19:50:40 +00:00
// handleGroupCanaries handles the canaries for the group by stopping the
// unneeded ones and returning the current set of canaries and the updated total
// set of allocs for the group
func (a *allocReconciler) handleGroupCanaries(all allocSet, desiredChanges *structs.DesiredUpdates) (canaries, newAll allocSet) {
// Stop any canary from an older deployment or from a failed one
var stop []string
// Cancel any non-promoted canaries from the older deployment
if a.oldDeployment != nil {
for _, dstate := range a.oldDeployment.TaskGroups {
if !dstate.Promoted {
stop = append(stop, dstate.PlacedCanaries...)
2017-07-05 19:50:40 +00:00
}
}
}
2017-07-06 21:28:59 +00:00
// Cancel any non-promoted canaries from a failed deployment
2017-07-05 19:50:40 +00:00
if a.deployment != nil && a.deployment.Status == structs.DeploymentStatusFailed {
for _, dstate := range a.deployment.TaskGroups {
if !dstate.Promoted {
stop = append(stop, dstate.PlacedCanaries...)
2017-07-05 19:50:40 +00:00
}
}
}
2017-07-06 21:28:59 +00:00
// stopSet is the allocSet that contains the canaries we desire to stop from
// above.
2017-07-05 19:50:40 +00:00
stopSet := all.fromKeys(stop)
a.markStop(stopSet, "", allocNotNeeded)
desiredChanges.Stop += uint64(len(stopSet))
all = all.difference(stopSet)
// Capture our current set of canaries and handle any migrations that are
// needed by just stopping them.
if a.deployment != nil {
var canaryIDs []string
for _, dstate := range a.deployment.TaskGroups {
canaryIDs = append(canaryIDs, dstate.PlacedCanaries...)
2017-07-05 19:50:40 +00:00
}
canaries = all.fromKeys(canaryIDs)
untainted, migrate, lost := canaries.filterByTainted(a.taintedNodes)
a.markStop(migrate, "", allocMigrating)
a.markStop(lost, structs.AllocClientStatusLost, allocLost)
canaries = untainted
all = all.difference(migrate, lost)
}
return canaries, all
}
2017-05-23 00:42:41 +00:00
// computeLimit returns the placement limit for a particular group. The inputs
// are the group definition, the untainted, destructive, and migrate allocation
// set and whether we are in a canary state.
func (a *allocReconciler) computeLimit(group *structs.TaskGroup, untainted, destructive, migrate allocSet, canaryState bool) int {
2018-03-11 18:58:19 +00:00
// If there is no update strategy or deployment for the group we can deploy
// as many as the group has
if group.Update.IsEmpty() || len(destructive)+len(migrate) == 0 {
return group.Count
2017-06-02 23:11:29 +00:00
} else if a.deploymentPaused || a.deploymentFailed {
// If the deployment is paused or failed, do not create anything else
return 0
}
// If we have canaries and they have not been promoted the limit is 0
2017-06-02 23:11:29 +00:00
if canaryState {
return 0
}
// If we have been promoted or there are no canaries, the limit is the
2017-06-06 21:08:46 +00:00
// configured MaxParallel minus any outstanding non-healthy alloc for the
// deployment
2017-05-23 00:42:41 +00:00
limit := group.Update.MaxParallel
2017-06-06 21:08:46 +00:00
if a.deployment != nil {
2017-06-21 20:20:54 +00:00
partOf, _ := untainted.filterByDeployment(a.deployment.ID)
for _, alloc := range partOf {
2017-06-26 21:23:52 +00:00
// An unhealthy allocation means nothing else should be happen.
if alloc.DeploymentStatus.IsUnhealthy() {
return 0
}
2017-06-21 20:20:54 +00:00
if !alloc.DeploymentStatus.IsHealthy() {
limit--
}
}
}
// The limit can be less than zero in the case that the job was changed such
// that it required destructive changes and the count was scaled up.
if limit < 0 {
return 0
}
return limit
}
2017-05-23 00:42:41 +00:00
// computePlacement returns the set of allocations to place given the group
// definition, the set of untainted, migrating and reschedule allocations for the group.
//
// Placements will meet or exceed group count.
2017-05-31 18:34:46 +00:00
func (a *allocReconciler) computePlacements(group *structs.TaskGroup,
nameIndex *allocNameIndex, untainted, migrate allocSet, reschedule allocSet,
canaryState bool, lost allocSet) []allocPlaceResult {
2017-05-31 18:34:46 +00:00
// Add rescheduled placement results
var place []allocPlaceResult
for _, alloc := range reschedule {
2017-05-22 17:58:34 +00:00
place = append(place, allocPlaceResult{
name: alloc.Name,
taskGroup: group,
previousAlloc: alloc,
reschedule: true,
canary: alloc.DeploymentStatus.IsCanary(),
downgradeNonCanary: canaryState && !alloc.DeploymentStatus.IsCanary(),
minJobVersion: alloc.Job.Version,
lost: false,
2017-05-22 17:58:34 +00:00
})
}
// Add replacements for lost allocs up to group.Count
existing := len(untainted) + len(migrate) + len(reschedule)
for _, alloc := range lost {
if existing >= group.Count {
// Reached desired count, do not replace remaining lost
// allocs
break
}
existing++
place = append(place, allocPlaceResult{
name: alloc.Name,
taskGroup: group,
previousAlloc: alloc,
reschedule: false,
canary: alloc.DeploymentStatus.IsCanary(),
downgradeNonCanary: canaryState && !alloc.DeploymentStatus.IsCanary(),
minJobVersion: alloc.Job.Version,
lost: true,
})
}
// Add remaining placement results
if existing < group.Count {
for _, name := range nameIndex.Next(uint(group.Count - existing)) {
place = append(place, allocPlaceResult{
name: name,
taskGroup: group,
downgradeNonCanary: canaryState,
})
}
2017-05-22 17:58:34 +00:00
}
return place
}
2017-06-06 21:08:46 +00:00
// computeStop returns the set of allocations that are marked for stopping given
// the group definition, the set of allocations in various states and whether we
2017-06-06 21:08:46 +00:00
// are canarying.
2017-05-31 18:34:46 +00:00
func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *allocNameIndex,
untainted, migrate, lost, canaries allocSet, canaryState bool, followupEvals map[string]string) allocSet {
2017-06-01 22:16:24 +00:00
// Mark all lost allocations for stop.
2017-06-01 22:16:24 +00:00
var stop allocSet
stop = stop.union(lost)
a.markDelayed(lost, structs.AllocClientStatusLost, allocLost, followupEvals)
2017-06-01 22:16:24 +00:00
2017-06-02 23:11:29 +00:00
// If we are still deploying or creating canaries, don't stop them
if canaryState {
2017-06-01 22:16:24 +00:00
untainted = untainted.difference(canaries)
2017-05-22 17:58:34 +00:00
}
2017-06-01 22:16:24 +00:00
// Hot path the nothing to do case
remove := len(untainted) + len(migrate) - group.Count
2017-05-31 18:34:46 +00:00
if remove <= 0 {
2017-06-02 23:11:29 +00:00
return stop
}
// Filter out any terminal allocations from the untainted set
// This is so that we don't try to mark them as stopped redundantly
2018-01-19 21:20:00 +00:00
untainted = filterByTerminal(untainted)
2017-06-02 23:11:29 +00:00
// Prefer stopping any alloc that has the same name as the canaries if we
// are promoted
if !canaryState && len(canaries) != 0 {
canaryNames := canaries.nameSet()
for id, alloc := range untainted.difference(canaries) {
if _, match := canaryNames[alloc.Name]; match {
stop[id] = alloc
a.result.stop = append(a.result.stop, allocStopResult{
alloc: alloc,
statusDescription: allocNotNeeded,
})
delete(untainted, id)
remove--
if remove == 0 {
return stop
}
}
}
2017-05-22 17:58:34 +00:00
}
2017-06-01 22:16:24 +00:00
// Prefer selecting from the migrating set before stopping existing allocs
if len(migrate) != 0 {
mNames := newAllocNameIndex(a.jobID, group.Name, group.Count, migrate)
removeNames := mNames.Highest(uint(remove))
for id, alloc := range migrate {
if _, match := removeNames[alloc.Name]; !match {
continue
}
a.result.stop = append(a.result.stop, allocStopResult{
alloc: alloc,
statusDescription: allocNotNeeded,
})
delete(migrate, id)
stop[id] = alloc
nameIndex.UnsetIndex(alloc.Index())
remove--
if remove == 0 {
return stop
}
}
}
2017-06-06 21:08:46 +00:00
// Select the allocs with the highest count to remove
2017-05-31 18:34:46 +00:00
removeNames := nameIndex.Highest(uint(remove))
2017-06-01 22:16:24 +00:00
for id, alloc := range untainted {
if _, ok := removeNames[alloc.Name]; ok {
2017-06-01 22:16:24 +00:00
stop[id] = alloc
a.result.stop = append(a.result.stop, allocStopResult{
alloc: alloc,
statusDescription: allocNotNeeded,
})
delete(untainted, id)
remove--
if remove == 0 {
return stop
}
}
}
// It is possible that we didn't stop as many as we should have if there
// were allocations with duplicate names.
for id, alloc := range untainted {
stop[id] = alloc
a.result.stop = append(a.result.stop, allocStopResult{
alloc: alloc,
statusDescription: allocNotNeeded,
})
delete(untainted, id)
remove--
if remove == 0 {
return stop
2017-05-22 17:58:34 +00:00
}
}
2017-05-31 18:34:46 +00:00
return stop
2017-05-22 17:58:34 +00:00
}
2017-05-23 00:42:41 +00:00
// computeUpdates determines which allocations for the passed group require
// updates. Three groups are returned:
// 1. Those that require no upgrades
// 2. Those that can be upgraded in-place. These are added to the results
// automatically since the function contains the correct state to do so,
// 3. Those that require destructive updates
func (a *allocReconciler) computeUpdates(group *structs.TaskGroup, untainted allocSet) (ignore, inplace, destructive allocSet) {
2017-05-22 17:58:34 +00:00
// Determine the set of allocations that need to be updated
ignore = make(map[string]*structs.Allocation)
inplace = make(map[string]*structs.Allocation)
destructive = make(map[string]*structs.Allocation)
for _, alloc := range untainted {
ignoreChange, destructiveChange, inplaceAlloc := a.allocUpdateFn(alloc, a.job, group)
if ignoreChange {
2017-05-22 17:58:34 +00:00
ignore[alloc.ID] = alloc
} else if destructiveChange {
2017-05-22 17:58:34 +00:00
destructive[alloc.ID] = alloc
} else {
inplace[alloc.ID] = alloc
a.result.inplaceUpdate = append(a.result.inplaceUpdate, inplaceAlloc)
2017-05-22 17:58:34 +00:00
}
}
return
}
2018-03-08 00:44:54 +00:00
// handleDelayedReschedules creates batched followup evaluations with the
// WaitUntil field set for allocations that are eligible to be rescheduled
// later, and records a copy of each such allocation, with FollowupEvalID set,
// in a.result.attributeUpdates.
func (a *allocReconciler) handleDelayedReschedules(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string) {
	// The followup evals are created in the same way as for delayed lost
	// allocs.
	followups := a.handleDelayedLost(rescheduleLater, all, tgName)
	if len(followups) == 0 {
		return
	}

	// Lazily create the attribute update map.
	if a.result.attributeUpdates == nil {
		a.result.attributeUpdates = make(map[string]*structs.Allocation)
	}

	// Each affected alloc is copied and the copy is tagged with the ID of the
	// followup eval that will reschedule it.
	for allocID, evalID := range followups {
		tagged := all[allocID].Copy()
		tagged.FollowupEvalID = evalID
		a.result.attributeUpdates[tagged.ID] = tagged
	}
}
// handleDelayedLost creates batched followup evaluations with the WaitUntil field set for
// lost allocations. followupEvals are appended to a.result as a side effect, we return a
// map of alloc IDs to their followupEval IDs
func (a *allocReconciler) handleDelayedLost(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string) map[string]string {
if len(rescheduleLater) == 0 {
return map[string]string{}
}
// Sort by time
sort.Slice(rescheduleLater, func(i, j int) bool {
return rescheduleLater[i].rescheduleTime.Before(rescheduleLater[j].rescheduleTime)
})
var evals []*structs.Evaluation
nextReschedTime := rescheduleLater[0].rescheduleTime
allocIDToFollowupEvalID := make(map[string]string, len(rescheduleLater))
2018-03-08 14:33:44 +00:00
// Create a new eval for the first batch
eval := &structs.Evaluation{
ID: uuid.Generate(),
Namespace: a.job.Namespace,
Priority: a.job.Priority,
Type: a.job.Type,
TriggeredBy: structs.EvalTriggerRetryFailedAlloc,
JobID: a.job.ID,
JobModifyIndex: a.job.ModifyIndex,
Status: structs.EvalStatusPending,
StatusDescription: reschedulingFollowupEvalDesc,
WaitUntil: nextReschedTime,
2018-03-08 14:33:44 +00:00
}
evals = append(evals, eval)
for _, allocReschedInfo := range rescheduleLater {
2018-03-08 14:33:44 +00:00
if allocReschedInfo.rescheduleTime.Sub(nextReschedTime) < batchedFailedAllocWindowSize {
allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
} else {
2018-03-08 14:33:44 +00:00
// Start a new batch
nextReschedTime = allocReschedInfo.rescheduleTime
// Create a new eval for the new batch
eval = &structs.Evaluation{
ID: uuid.Generate(),
Namespace: a.job.Namespace,
Priority: a.job.Priority,
Type: a.job.Type,
TriggeredBy: structs.EvalTriggerRetryFailedAlloc,
JobID: a.job.ID,
JobModifyIndex: a.job.ModifyIndex,
Status: structs.EvalStatusPending,
WaitUntil: nextReschedTime,
}
evals = append(evals, eval)
2018-03-08 14:33:44 +00:00
// Set the evalID for the first alloc in this new batch
allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
}
}
a.result.desiredFollowupEvals[tgName] = evals
return allocIDToFollowupEvalID
}