open-nomad/scheduler/reconcile.go

898 lines
30 KiB
Go
Raw Normal View History

2017-05-22 17:58:34 +00:00
package scheduler
import (
2017-07-07 23:49:08 +00:00
"fmt"
"log"
"time"
"sort"
2017-05-22 17:58:34 +00:00
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/helper/uuid"
2017-05-22 17:58:34 +00:00
"github.com/hashicorp/nomad/nomad/structs"
)
2018-03-08 00:44:54 +00:00
// batchedFailedAllocWindowSize is the width of the time window within which
// failed allocations that become eligible for rescheduling are grouped
// together under a single follow up evaluation.
const batchedFailedAllocWindowSize = 5 * time.Second
// allocUpdateType takes an existing allocation and a new job definition and
// returns whether the allocation can ignore the change, requires a destructive
// update, or can be inplace updated. If it can be inplace updated, an updated
// allocation that has the new resources and alloc metrics attached will be
// returned.
2017-06-01 22:16:24 +00:00
type allocUpdateType func(existing *structs.Allocation, newJob *structs.Job,
newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation)
2017-05-23 00:42:41 +00:00
// allocReconciler is used to determine the set of allocations that require
// placement, inplace updating or stopping given the job specification and
// existing cluster state. The reconciler should only be used for batch and
// service jobs.
2017-05-22 17:58:34 +00:00
type allocReconciler struct {
// logger is used to log debug information. Logging should be kept at a
// minimal here
logger *log.Logger
2017-05-22 17:58:34 +00:00
// canInplace is used to check if the allocation can be inplace upgraded
allocUpdateFn allocUpdateType
2017-05-22 17:58:34 +00:00
// batch marks whether the job is a batch job
batch bool
// job is the job being operated on, it may be nil if the job is being
// stopped via a purge
job *structs.Job
// jobID is the ID of the job being operated on. The job may be nil if it is
// being stopped so we require this separately.
jobID string
2017-07-05 19:50:40 +00:00
// oldDeployment is the last deployment for the job
oldDeployment *structs.Deployment
2017-05-22 17:58:34 +00:00
// deployment is the current deployment for the job
deployment *structs.Deployment
// deploymentPaused marks whether the deployment is paused
deploymentPaused bool
2017-06-02 23:11:29 +00:00
// deploymentFailed marks whether the deployment is failed
deploymentFailed bool
2017-05-22 17:58:34 +00:00
// taintedNodes contains a map of nodes that are tainted
taintedNodes map[string]*structs.Node
// existingAllocs is non-terminal existing allocations
existingAllocs []*structs.Allocation
// result is the results of the reconcile. During computation it can be
// used to store intermediate state
result *reconcileResults
}
2017-05-23 00:42:41 +00:00
// reconcileResults contains the results of the reconciliation and should be
// applied by the scheduler.
2017-05-22 17:58:34 +00:00
type reconcileResults struct {
// deployment is the deployment that should be created or updated as a
// result of scheduling
deployment *structs.Deployment
2017-05-23 00:42:41 +00:00
// deploymentUpdates contains a set of deployment updates that should be
// applied as a result of scheduling
2017-05-22 17:58:34 +00:00
deploymentUpdates []*structs.DeploymentStatusUpdate
2017-05-23 00:42:41 +00:00
// place is the set of allocations to place by the scheduler
place []allocPlaceResult
2017-07-15 23:31:33 +00:00
// destructiveUpdate is the set of allocations to apply a destructive update to
destructiveUpdate []allocDestructiveResult
2017-05-23 00:42:41 +00:00
// inplaceUpdate is the set of allocations to apply an inplace update to
2017-05-22 17:58:34 +00:00
inplaceUpdate []*structs.Allocation
2017-05-23 00:42:41 +00:00
// stop is the set of allocations to stop
stop []allocStopResult
2017-05-22 17:58:34 +00:00
2017-05-23 20:02:47 +00:00
// desiredTGUpdates captures the desired set of changes to make for each
// task group.
desiredTGUpdates map[string]*structs.DesiredUpdates
// followupEvalWait is set if there should be a followup eval run after the
// given duration
// Deprecated, the delay strategy that sets this is not available after nomad 0.7.0
followupEvalWait time.Duration
// desiredFollowupEvals is the map of follow up evaluations to create per task group
// This is used to create a delayed evaluation for rescheduling failed allocations.
desiredFollowupEvals map[string][]*structs.Evaluation
}
// delayedRescheduleInfo pairs an allocation ID with the time at which that
// allocation becomes eligible for rescheduling. It is the input used to build
// follow up evaluations.
type delayedRescheduleInfo struct {
	// allocID identifies the allocation that is eligible to be rescheduled.
	allocID string

	// rescheduleTime is the time the delayed evaluation should wait until.
	rescheduleTime time.Time
}
2017-07-07 23:49:08 +00:00
func (r *reconcileResults) GoString() string {
2017-07-15 23:31:33 +00:00
base := fmt.Sprintf("Total changes: (place %d) (destructive %d) (inplace %d) (stop %d)",
len(r.place), len(r.destructiveUpdate), len(r.inplaceUpdate), len(r.stop))
2017-07-07 23:49:08 +00:00
if r.deployment != nil {
base += fmt.Sprintf("\nCreated Deployment: %q", r.deployment.ID)
}
for _, u := range r.deploymentUpdates {
base += fmt.Sprintf("\nDeployment Update for ID %q: Status %q; Description %q",
u.DeploymentID, u.Status, u.StatusDescription)
}
if r.followupEvalWait != 0 {
base += fmt.Sprintf("\nFollowup Eval in %v", r.followupEvalWait)
}
for tg, u := range r.desiredTGUpdates {
base += fmt.Sprintf("\nDesired Changes for %q: %#v", tg, u)
}
return base
}
2017-07-05 19:55:51 +00:00
// Changes returns the number of total changes, computed as the number of
// placements plus inplace updates plus stops.
//
// NOTE(review): destructiveUpdate is not part of the sum; confirm whether
// callers expect destructive updates to be counted.
func (r *reconcileResults) Changes() int {
	total := len(r.place)
	total += len(r.inplaceUpdate)
	total += len(r.stop)
	return total
}
2017-05-23 00:42:41 +00:00
// NewAllocReconciler creates a new reconciler that should be used to determine
// the changes required to bring the cluster state inline with the declared jobspec
func NewAllocReconciler(logger *log.Logger, allocUpdateFn allocUpdateType, batch bool,
jobID string, job *structs.Job, deployment *structs.Deployment,
2017-05-22 17:58:34 +00:00
existingAllocs []*structs.Allocation, taintedNodes map[string]*structs.Node) *allocReconciler {
2017-07-06 02:46:57 +00:00
return &allocReconciler{
logger: logger,
allocUpdateFn: allocUpdateFn,
2017-05-22 17:58:34 +00:00
batch: batch,
jobID: jobID,
2017-05-22 17:58:34 +00:00
job: job,
2017-07-05 19:55:51 +00:00
deployment: deployment.Copy(),
2017-05-22 17:58:34 +00:00
existingAllocs: existingAllocs,
taintedNodes: taintedNodes,
2017-05-23 20:02:47 +00:00
result: &reconcileResults{
desiredTGUpdates: make(map[string]*structs.DesiredUpdates),
desiredFollowupEvals: make(map[string][]*structs.Evaluation),
2017-05-23 20:02:47 +00:00
},
2017-05-22 17:58:34 +00:00
}
}
2017-05-23 00:42:41 +00:00
// Compute reconciles the existing cluster state and returns the set of changes
// required to converge the job spec and state
2017-05-22 17:58:34 +00:00
func (a *allocReconciler) Compute() *reconcileResults {
2017-06-01 22:16:24 +00:00
// Create the allocation matrix
m := newAllocMatrix(a.job, a.existingAllocs)
2017-06-06 21:08:46 +00:00
// Handle stopping unneeded deployments
a.cancelDeployments()
2017-06-01 22:16:24 +00:00
2017-05-22 17:58:34 +00:00
// If we are just stopping a job we do not need to do anything more than
// stopping all running allocs
2017-06-01 22:16:24 +00:00
if a.job.Stopped() {
a.handleStop(m)
return a.result
}
2017-07-06 02:46:57 +00:00
// Detect if the deployment is paused
if a.deployment != nil {
// Detect if any allocs associated with this deploy have failed
2018-02-05 22:37:07 +00:00
// Failed allocations could edge trigger an evaluation before the deployment watcher
// runs and marks the deploy as failed. This block makes sure that is still
// considered a failed deploy
failedAllocsInDeploy := false
for _, as := range m {
for _, alloc := range as {
if alloc.DeploymentID == a.deployment.ID && alloc.ClientStatus == structs.AllocClientStatusFailed {
failedAllocsInDeploy = true
}
}
}
2017-07-06 02:46:57 +00:00
a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed || failedAllocsInDeploy
2017-07-06 02:46:57 +00:00
}
2017-06-01 22:16:24 +00:00
// Reconcile each group
2017-07-05 19:55:51 +00:00
complete := true
2017-06-01 22:16:24 +00:00
for group, as := range m {
2017-07-05 19:55:51 +00:00
groupComplete := a.computeGroup(group, as)
complete = complete && groupComplete
}
// Mark the deployment as complete if possible
if a.deployment != nil && complete {
a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
DeploymentID: a.deployment.ID,
Status: structs.DeploymentStatusSuccessful,
StatusDescription: structs.DeploymentStatusDescriptionSuccessful,
})
2017-06-01 22:16:24 +00:00
}
// Set the description of a created deployment
if d := a.result.deployment; d != nil {
if d.RequiresPromotion() {
d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
}
}
2017-06-01 22:16:24 +00:00
return a.result
}
2017-05-22 17:58:34 +00:00
2017-06-06 21:08:46 +00:00
// cancelDeployments cancels any deployment that is not needed
func (a *allocReconciler) cancelDeployments() {
// If the job is stopped and there is a non-terminal deployment, cancel it
2017-06-01 22:16:24 +00:00
if a.job.Stopped() {
2017-06-02 23:11:29 +00:00
if a.deployment != nil && a.deployment.Active() {
2017-05-22 17:58:34 +00:00
a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
DeploymentID: a.deployment.ID,
Status: structs.DeploymentStatusCancelled,
StatusDescription: structs.DeploymentStatusDescriptionStoppedJob,
})
}
2017-06-01 22:16:24 +00:00
// Nothing else to do
a.oldDeployment = a.deployment
a.deployment = nil
2017-06-01 22:16:24 +00:00
return
2017-05-22 17:58:34 +00:00
}
2017-07-05 19:50:40 +00:00
d := a.deployment
if d == nil {
return
}
// Check if the deployment is active and referencing an older job and cancel it
if d.JobCreateIndex != a.job.CreateIndex || d.JobVersion != a.job.Version {
2017-07-06 00:13:45 +00:00
if d.Active() {
a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
DeploymentID: a.deployment.ID,
Status: structs.DeploymentStatusCancelled,
StatusDescription: structs.DeploymentStatusDescriptionNewerJob,
})
}
2017-07-05 19:55:51 +00:00
2017-07-05 19:50:40 +00:00
a.oldDeployment = d
a.deployment = nil
}
// Clear it as the current deployment if it is successful
if d.Status == structs.DeploymentStatusSuccessful {
a.oldDeployment = d
a.deployment = nil
}
2017-05-22 17:58:34 +00:00
}
// handleStop marks all allocations to be stopped, handling the lost case
2017-06-01 22:16:24 +00:00
func (a *allocReconciler) handleStop(m allocMatrix) {
for group, as := range m {
untainted, migrate, lost := as.filterByTainted(a.taintedNodes)
a.markStop(untainted, "", allocNotNeeded)
a.markStop(migrate, "", allocNotNeeded)
a.markStop(lost, structs.AllocClientStatusLost, allocLost)
desiredChanges := new(structs.DesiredUpdates)
desiredChanges.Stop = uint64(len(as))
a.result.desiredTGUpdates[group] = desiredChanges
}
2017-05-22 17:58:34 +00:00
}
2017-05-23 00:42:41 +00:00
// markStop is a helper for marking a set of allocation for stop with a
// particular client status and description.
2017-05-22 17:58:34 +00:00
func (a *allocReconciler) markStop(allocs allocSet, clientStatus, statusDescription string) {
for _, alloc := range allocs {
a.result.stop = append(a.result.stop, allocStopResult{
alloc: alloc,
clientStatus: clientStatus,
statusDescription: statusDescription,
})
}
}
2017-07-05 19:55:51 +00:00
// computeGroup reconciles state for a particular task group. It returns whether
// the deployment it is for is complete with regards to the task group.
func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
2017-05-23 20:02:47 +00:00
// Create the desired update object for the group
desiredChanges := new(structs.DesiredUpdates)
a.result.desiredTGUpdates[group] = desiredChanges
2017-05-22 17:58:34 +00:00
// Get the task group. The task group may be nil if the job was updates such
// that the task group no longer exists
tg := a.job.LookupTaskGroup(group)
// If the task group is nil, then the task group has been removed so all we
// need to do is stop everything
if tg == nil {
2017-06-02 23:11:29 +00:00
untainted, migrate, lost := all.filterByTainted(a.taintedNodes)
2017-05-22 17:58:34 +00:00
a.markStop(untainted, "", allocNotNeeded)
a.markStop(migrate, "", allocNotNeeded)
a.markStop(lost, structs.AllocClientStatusLost, allocLost)
2017-05-23 20:02:47 +00:00
desiredChanges.Stop = uint64(len(untainted) + len(migrate) + len(lost))
2017-07-05 19:55:51 +00:00
return true
2017-05-22 17:58:34 +00:00
}
2017-05-31 18:34:46 +00:00
// Get the deployment state for the group
var dstate *structs.DeploymentState
existingDeployment := false
2017-05-31 18:34:46 +00:00
if a.deployment != nil {
2017-06-06 21:08:46 +00:00
dstate, existingDeployment = a.deployment.TaskGroups[group]
}
if !existingDeployment {
2017-06-30 19:35:59 +00:00
autorevert := false
if tg.Update != nil && tg.Update.AutoRevert {
autorevert = true
}
dstate = &structs.DeploymentState{
AutoRevert: autorevert,
}
2017-05-31 18:34:46 +00:00
}
// Filter batch allocations that do not need to be considered.
all, ignore := a.batchFiltration(all)
desiredChanges.Ignore += uint64(len(ignore))
2017-07-05 19:50:40 +00:00
canaries, all := a.handleGroupCanaries(all, desiredChanges)
2017-05-22 17:58:34 +00:00
2017-07-05 19:50:40 +00:00
// Determine what set of allocations are on tainted nodes
2017-06-02 23:11:29 +00:00
untainted, migrate, lost := all.filterByTainted(a.taintedNodes)
// Determine what set of terminal allocations need to be rescheduled
untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch)
// Create batched follow up evaluations for allocations that are reschedulable later
2018-03-08 00:44:54 +00:00
var rescheduleLaterAllocs map[string]*structs.Allocation
if len(rescheduleLater) > 0 {
rescheduleLaterAllocs = a.handleDelayedReschedules(rescheduleLater, all, tg.Name)
}
2017-06-01 22:16:24 +00:00
// Create a structure for choosing names. Seed with the taken names which is
2017-06-02 23:11:29 +00:00
// the union of untainted and migrating nodes (includes canaries)
nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate, rescheduleNow))
2017-05-31 18:34:46 +00:00
2017-05-22 17:58:34 +00:00
// Stop any unneeded allocations and update the untainted set to not
2017-06-06 21:08:46 +00:00
// included stopped allocations.
2017-06-02 23:11:29 +00:00
canaryState := dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
stop := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, canaryState)
2017-05-23 20:02:47 +00:00
desiredChanges.Stop += uint64(len(stop))
2017-05-31 18:34:46 +00:00
untainted = untainted.difference(stop)
// Having stopped un-needed allocations, append the canaries to the existing
// set of untainted because they are promoted. This will cause them to be
// treated like non-canaries
2017-06-02 23:11:29 +00:00
if !canaryState {
2017-05-31 18:34:46 +00:00
untainted = untainted.union(canaries)
2017-06-01 22:16:24 +00:00
nameIndex.Set(canaries)
2017-05-31 18:34:46 +00:00
}
2017-05-22 17:58:34 +00:00
// Do inplace upgrades where possible and capture the set of upgrades that
// need to be done destructively.
ignore, inplace, destructive := a.computeUpdates(tg, untainted, rescheduleLaterAllocs)
2017-05-23 20:02:47 +00:00
desiredChanges.Ignore += uint64(len(ignore))
desiredChanges.InPlaceUpdate += uint64(len(inplace))
2017-06-06 21:08:46 +00:00
if !existingDeployment {
2017-07-06 21:28:59 +00:00
dstate.DesiredTotal += len(destructive) + len(inplace)
2017-06-02 23:11:29 +00:00
}
2017-05-23 00:42:41 +00:00
2017-05-22 17:58:34 +00:00
// The fact that we have destructive updates and have less canaries than is
// desired means we need to create canaries
2017-05-31 18:34:46 +00:00
numDestructive := len(destructive)
2017-06-01 22:16:24 +00:00
strategy := tg.Update
2017-06-26 21:23:52 +00:00
canariesPromoted := dstate != nil && dstate.Promoted
requireCanary := numDestructive != 0 && strategy != nil && len(canaries) < strategy.Canary && !canariesPromoted
2017-06-02 23:11:29 +00:00
if requireCanary && !a.deploymentPaused && !a.deploymentFailed {
2017-05-23 20:02:47 +00:00
number := strategy.Canary - len(canaries)
2017-05-31 18:34:46 +00:00
number = helper.IntMin(numDestructive, number)
2017-05-23 20:02:47 +00:00
desiredChanges.Canary += uint64(number)
2017-06-06 21:08:46 +00:00
if !existingDeployment {
2017-05-23 23:08:35 +00:00
dstate.DesiredCanaries = strategy.Canary
}
2017-05-31 18:34:46 +00:00
for _, name := range nameIndex.NextCanaries(uint(number), canaries, destructive) {
2017-05-22 17:58:34 +00:00
a.result.place = append(a.result.place, allocPlaceResult{
2017-05-31 18:34:46 +00:00
name: name,
2017-05-22 17:58:34 +00:00
canary: true,
taskGroup: tg,
})
}
}
// Determine how many we can place
2017-06-02 23:11:29 +00:00
canaryState = dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
limit := a.computeLimit(tg, untainted, destructive, migrate, canaryState)
2017-05-22 17:58:34 +00:00
// Place if:
2017-06-02 23:11:29 +00:00
// * The deployment is not paused or failed
2017-05-22 17:58:34 +00:00
// * Not placing any canaries
// * If there are any canaries that they have been promoted
place := a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow)
2017-06-06 21:08:46 +00:00
if !existingDeployment {
2017-05-23 23:08:35 +00:00
dstate.DesiredTotal += len(place)
}
// deploymentPlaceReady tracks whether the deployment is in a state where
// placements can be made without any other consideration.
deploymentPlaceReady := !a.deploymentPaused && !a.deploymentFailed && !canaryState
if deploymentPlaceReady {
2017-06-02 23:11:29 +00:00
desiredChanges.Place += uint64(len(place))
2017-05-22 17:58:34 +00:00
for _, p := range place {
a.result.place = append(a.result.place, p)
}
min := helper.IntMin(len(place), limit)
limit -= min
} else if !deploymentPlaceReady && len(lost) != 0 {
// We are in a situation where we shouldn't be placing more than we need
// to but we have lost allocations. It is a very weird user experience
// if you have a node go down and Nomad doesn't replace the allocations
// because the deployment is paused/failed so we only place to recover
// the lost allocations.
allowed := helper.IntMin(len(lost), len(place))
desiredChanges.Place += uint64(allowed)
for _, p := range place[:allowed] {
a.result.place = append(a.result.place, p)
}
}
2017-05-22 17:58:34 +00:00
if deploymentPlaceReady {
2017-05-22 17:58:34 +00:00
// Do all destructive updates
min := helper.IntMin(len(destructive), limit)
2017-06-02 23:11:29 +00:00
limit -= min
desiredChanges.DestructiveUpdate += uint64(min)
desiredChanges.Ignore += uint64(len(destructive) - min)
for _, alloc := range destructive.nameOrder()[:min] {
2017-07-15 23:31:33 +00:00
a.result.destructiveUpdate = append(a.result.destructiveUpdate, allocDestructiveResult{
placeName: alloc.Name,
placeTaskGroup: tg,
stopAlloc: alloc,
stopStatusDescription: allocUpdating,
2017-05-22 17:58:34 +00:00
})
}
2017-06-02 23:11:29 +00:00
} else {
desiredChanges.Ignore += uint64(len(destructive))
2017-05-22 17:58:34 +00:00
}
// Calculate the allowed number of changes and set the desired changes
// accordingly.
min := helper.IntMin(len(migrate), limit)
if !a.deploymentFailed && !a.deploymentPaused {
desiredChanges.Migrate += uint64(min)
desiredChanges.Ignore += uint64(len(migrate) - min)
2017-06-02 23:11:29 +00:00
} else {
desiredChanges.Stop += uint64(len(migrate))
}
followup := false
migrated := 0
for _, alloc := range migrate.nameOrder() {
// If the deployment is failed or paused, don't replace it, just mark as stop.
if a.deploymentFailed || a.deploymentPaused {
a.result.stop = append(a.result.stop, allocStopResult{
alloc: alloc,
statusDescription: allocNodeTainted,
})
continue
}
if migrated >= limit {
followup = true
break
}
migrated++
2017-05-22 17:58:34 +00:00
a.result.stop = append(a.result.stop, allocStopResult{
alloc: alloc,
statusDescription: allocMigrating,
})
a.result.place = append(a.result.place, allocPlaceResult{
name: alloc.Name,
canary: false,
taskGroup: tg,
previousAlloc: alloc,
})
}
2017-06-02 23:11:29 +00:00
// We need to create a followup evaluation.
if followup && strategy != nil && a.result.followupEvalWait < strategy.Stagger {
a.result.followupEvalWait = strategy.Stagger
2017-05-22 17:58:34 +00:00
}
2017-06-02 23:11:29 +00:00
2017-06-06 21:08:46 +00:00
// Create a new deployment if necessary
if !existingDeployment && strategy != nil && dstate.DesiredTotal != 0 {
// A previous group may have made the deployment already
if a.deployment == nil {
a.deployment = structs.NewDeployment(a.job)
a.result.deployment = a.deployment
}
// Attach the groups deployment state to the deployment
2017-06-06 21:08:46 +00:00
a.deployment.TaskGroups[group] = dstate
}
2017-07-05 19:55:51 +00:00
2017-07-06 21:28:59 +00:00
// deploymentComplete is whether the deployment is complete which largely
// means that no placements were made or desired to be made
deploymentComplete := len(destructive)+len(inplace)+len(place)+len(migrate) == 0 && !requireCanary
2017-07-05 19:55:51 +00:00
// Final check to see if the deployment is complete is to ensure everything
// is healthy
if deploymentComplete && a.deployment != nil {
partOf, _ := untainted.filterByDeployment(a.deployment.ID)
for _, alloc := range partOf {
if !alloc.DeploymentStatus.IsHealthy() {
deploymentComplete = false
break
}
}
}
return deploymentComplete
2017-05-22 17:58:34 +00:00
}
// batchFiltration splits out the batch allocations that should be ignored:
// terminal allocations whose job version or create index is older than the
// current job's. For non-batch jobs the input is returned untouched and
// nothing is ignored.
func (a *allocReconciler) batchFiltration(all allocSet) (filtered, ignore allocSet) {
	if !a.batch {
		return all, nil
	}

	// Work on the union of an empty set and the input; presumably this yields
	// a fresh set so the deletions below do not touch the caller's set.
	var empty allocSet
	filtered = empty.union(all)
	ignore = make(allocSet)

	for id, alloc := range filtered {
		isOlder := alloc.Job.CreateIndex < a.job.CreateIndex || alloc.Job.Version < a.job.Version
		if !isOlder || !alloc.TerminalStatus() {
			continue
		}

		// Terminal allocation from an older version of the job
		ignore[id] = alloc
		delete(filtered, id)
	}

	return filtered, ignore
}
2017-07-05 19:50:40 +00:00
// handleGroupCanaries handles the canaries for the group by stopping the
// unneeded ones and returning the current set of canaries and the updated total
// set of allocs for the group
func (a *allocReconciler) handleGroupCanaries(all allocSet, desiredChanges *structs.DesiredUpdates) (canaries, newAll allocSet) {
// Stop any canary from an older deployment or from a failed one
var stop []string
// Cancel any non-promoted canaries from the older deployment
if a.oldDeployment != nil {
for _, s := range a.oldDeployment.TaskGroups {
if !s.Promoted {
stop = append(stop, s.PlacedCanaries...)
}
}
}
2017-07-06 21:28:59 +00:00
// Cancel any non-promoted canaries from a failed deployment
2017-07-05 19:50:40 +00:00
if a.deployment != nil && a.deployment.Status == structs.DeploymentStatusFailed {
for _, s := range a.deployment.TaskGroups {
if !s.Promoted {
stop = append(stop, s.PlacedCanaries...)
}
}
}
2017-07-06 21:28:59 +00:00
// stopSet is the allocSet that contains the canaries we desire to stop from
// above.
2017-07-05 19:50:40 +00:00
stopSet := all.fromKeys(stop)
a.markStop(stopSet, "", allocNotNeeded)
desiredChanges.Stop += uint64(len(stopSet))
all = all.difference(stopSet)
// Capture our current set of canaries and handle any migrations that are
// needed by just stopping them.
if a.deployment != nil {
var canaryIDs []string
for _, s := range a.deployment.TaskGroups {
canaryIDs = append(canaryIDs, s.PlacedCanaries...)
}
canaries = all.fromKeys(canaryIDs)
untainted, migrate, lost := canaries.filterByTainted(a.taintedNodes)
a.markStop(migrate, "", allocMigrating)
a.markStop(lost, structs.AllocClientStatusLost, allocLost)
canaries = untainted
all = all.difference(migrate, lost)
}
return canaries, all
}
2017-05-23 00:42:41 +00:00
// computeLimit returns the placement limit for a particular group. The inputs
// are the group definition, the untainted, destructive, and migrate allocation
// set and whether we are in a canary state.
func (a *allocReconciler) computeLimit(group *structs.TaskGroup, untainted, destructive, migrate allocSet, canaryState bool) int {
2018-03-11 18:58:19 +00:00
// If there is no update strategy or deployment for the group we can deploy
// as many as the group has
if group.Update == nil || len(destructive)+len(migrate) == 0 {
return group.Count
2017-06-02 23:11:29 +00:00
} else if a.deploymentPaused || a.deploymentFailed {
// If the deployment is paused or failed, do not create anything else
return 0
}
// If we have canaries and they have not been promoted the limit is 0
2017-06-02 23:11:29 +00:00
if canaryState {
return 0
}
// If we have been promoted or there are no canaries, the limit is the
2017-06-06 21:08:46 +00:00
// configured MaxParallel minus any outstanding non-healthy alloc for the
// deployment
2017-05-23 00:42:41 +00:00
limit := group.Update.MaxParallel
2017-06-06 21:08:46 +00:00
if a.deployment != nil {
2017-06-21 20:20:54 +00:00
partOf, _ := untainted.filterByDeployment(a.deployment.ID)
for _, alloc := range partOf {
2017-06-26 21:23:52 +00:00
// An unhealthy allocation means nothing else should be happen.
if alloc.DeploymentStatus.IsUnhealthy() {
return 0
}
2017-06-21 20:20:54 +00:00
if !alloc.DeploymentStatus.IsHealthy() {
limit--
}
}
}
// The limit can be less than zero in the case that the job was changed such
// that it required destructive changes and the count was scaled up.
if limit < 0 {
return 0
}
return limit
}
2017-05-23 00:42:41 +00:00
// computePlacement returns the set of allocations to place given the group
// definition, the set of untainted, migrating and reschedule allocations for the group.
2017-05-31 18:34:46 +00:00
func (a *allocReconciler) computePlacements(group *structs.TaskGroup,
nameIndex *allocNameIndex, untainted, migrate allocSet, reschedule allocSet) []allocPlaceResult {
2017-05-31 18:34:46 +00:00
2017-05-22 17:58:34 +00:00
// Hot path the nothing to do case
2017-06-01 22:16:24 +00:00
existing := len(untainted) + len(migrate)
2017-05-31 18:34:46 +00:00
if existing >= group.Count {
2017-05-22 17:58:34 +00:00
return nil
}
var place []allocPlaceResult
// Add rescheduled placement results
// Any allocations being rescheduled will remain at DesiredStatusRun ClientStatusFailed
for _, alloc := range reschedule {
2017-05-22 17:58:34 +00:00
place = append(place, allocPlaceResult{
name: alloc.Name,
taskGroup: group,
previousAlloc: alloc,
reschedule: true,
2017-05-22 17:58:34 +00:00
})
existing += 1
if existing == group.Count {
break
}
}
// Add remaining placement results
if existing < group.Count {
for _, name := range nameIndex.Next(uint(group.Count - existing)) {
place = append(place, allocPlaceResult{
name: name,
taskGroup: group,
})
}
2017-05-22 17:58:34 +00:00
}
return place
}
2017-06-06 21:08:46 +00:00
// computeStop returns the set of allocations that are marked for stopping given
// the group definition, the set of allocations in various states and whether we
2017-06-06 21:08:46 +00:00
// are canarying.
2017-05-31 18:34:46 +00:00
func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *allocNameIndex,
2017-06-02 23:11:29 +00:00
untainted, migrate, lost, canaries allocSet, canaryState bool) allocSet {
2017-06-01 22:16:24 +00:00
// Mark all lost allocations for stop. Previous allocation doesn't matter
// here since it is on a lost node
var stop allocSet
stop = stop.union(lost)
a.markStop(lost, structs.AllocClientStatusLost, allocLost)
2017-06-02 23:11:29 +00:00
// If we are still deploying or creating canaries, don't stop them
if canaryState {
2017-06-01 22:16:24 +00:00
untainted = untainted.difference(canaries)
2017-05-22 17:58:34 +00:00
}
2017-06-01 22:16:24 +00:00
// Hot path the nothing to do case
remove := len(untainted) + len(migrate) - group.Count
2017-05-31 18:34:46 +00:00
if remove <= 0 {
2017-06-02 23:11:29 +00:00
return stop
}
// Filter out any terminal allocations from the untainted set
// This is so that we don't try to mark them as stopped redundantly
2018-01-19 21:20:00 +00:00
untainted = filterByTerminal(untainted)
2017-06-02 23:11:29 +00:00
// Prefer stopping any alloc that has the same name as the canaries if we
// are promoted
if !canaryState && len(canaries) != 0 {
canaryNames := canaries.nameSet()
for id, alloc := range untainted.difference(canaries) {
if _, match := canaryNames[alloc.Name]; match {
stop[id] = alloc
a.result.stop = append(a.result.stop, allocStopResult{
alloc: alloc,
statusDescription: allocNotNeeded,
})
delete(untainted, id)
remove--
if remove == 0 {
return stop
}
}
}
2017-05-22 17:58:34 +00:00
}
2017-06-01 22:16:24 +00:00
// Prefer selecting from the migrating set before stopping existing allocs
if len(migrate) != 0 {
mNames := newAllocNameIndex(a.jobID, group.Name, group.Count, migrate)
removeNames := mNames.Highest(uint(remove))
for id, alloc := range migrate {
if _, match := removeNames[alloc.Name]; !match {
continue
}
a.result.stop = append(a.result.stop, allocStopResult{
alloc: alloc,
statusDescription: allocNotNeeded,
})
delete(migrate, id)
stop[id] = alloc
nameIndex.UnsetIndex(alloc.Index())
remove--
if remove == 0 {
return stop
}
}
}
2017-06-06 21:08:46 +00:00
// Select the allocs with the highest count to remove
2017-05-31 18:34:46 +00:00
removeNames := nameIndex.Highest(uint(remove))
2017-06-01 22:16:24 +00:00
for id, alloc := range untainted {
if _, ok := removeNames[alloc.Name]; ok {
2017-06-01 22:16:24 +00:00
stop[id] = alloc
a.result.stop = append(a.result.stop, allocStopResult{
alloc: alloc,
statusDescription: allocNotNeeded,
})
delete(untainted, id)
remove--
if remove == 0 {
return stop
}
}
}
// It is possible that we didn't stop as many as we should have if there
// were allocations with duplicate names.
for id, alloc := range untainted {
stop[id] = alloc
a.result.stop = append(a.result.stop, allocStopResult{
alloc: alloc,
statusDescription: allocNotNeeded,
})
delete(untainted, id)
remove--
if remove == 0 {
return stop
2017-05-22 17:58:34 +00:00
}
}
2017-05-31 18:34:46 +00:00
return stop
2017-05-22 17:58:34 +00:00
}
2017-05-23 00:42:41 +00:00
// computeUpdates determines which allocations for the passed group require
// updates. Three groups are returned:
// 1. Those that require no upgrades
// 2. Those that can be upgraded in-place. These are added to the results
// automatically since the function contains the correct state to do so,
// 3. Those that require destructive updates
func (a *allocReconciler) computeUpdates(group *structs.TaskGroup, untainted, rescheduleLaterAllocs allocSet) (ignore, inplace, destructive allocSet) {
2017-05-22 17:58:34 +00:00
// Determine the set of allocations that need to be updated
ignore = make(map[string]*structs.Allocation)
inplace = make(map[string]*structs.Allocation)
destructive = make(map[string]*structs.Allocation)
for _, alloc := range untainted {
ignoreChange, destructiveChange, inplaceAlloc := a.allocUpdateFn(alloc, a.job, group)
// Also check if the alloc is marked for later rescheduling.
// If so it should be in the inplace list
reschedLaterAlloc, isRescheduleLater := rescheduleLaterAllocs[alloc.ID]
if isRescheduleLater {
inplace[alloc.ID] = alloc
a.result.inplaceUpdate = append(a.result.inplaceUpdate, reschedLaterAlloc)
} else if ignoreChange {
2017-05-22 17:58:34 +00:00
ignore[alloc.ID] = alloc
} else if destructiveChange {
2017-05-22 17:58:34 +00:00
destructive[alloc.ID] = alloc
} else {
2017-07-05 19:50:40 +00:00
// Attach the deployment ID and and clear the health if the
// deployment has changed
inplace[alloc.ID] = alloc
a.result.inplaceUpdate = append(a.result.inplaceUpdate, inplaceAlloc)
2017-05-22 17:58:34 +00:00
}
}
return
}
2018-03-08 00:44:54 +00:00
// handleDelayedReschedules creates batched followup evaluations with the WaitUntil field set
// for allocations that are eligible to be rescheduled later
func (a *allocReconciler) handleDelayedReschedules(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string) allocSet {
// Sort by time
sort.Slice(rescheduleLater, func(i, j int) bool {
return rescheduleLater[i].rescheduleTime.Before(rescheduleLater[j].rescheduleTime)
})
var evals []*structs.Evaluation
nextReschedTime := rescheduleLater[0].rescheduleTime
allocIDToFollowupEvalID := make(map[string]string, len(rescheduleLater))
2018-03-08 14:33:44 +00:00
// Create a new eval for the first batch
eval := &structs.Evaluation{
ID: uuid.Generate(),
Namespace: a.job.Namespace,
Priority: a.job.Priority,
Type: a.job.Type,
TriggeredBy: structs.EvalTriggerRetryFailedAlloc,
JobID: a.job.ID,
JobModifyIndex: a.job.ModifyIndex,
Status: structs.EvalStatusPending,
WaitUntil: nextReschedTime,
}
evals = append(evals, eval)
for _, allocReschedInfo := range rescheduleLater {
2018-03-08 14:33:44 +00:00
if allocReschedInfo.rescheduleTime.Sub(nextReschedTime) < batchedFailedAllocWindowSize {
allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
} else {
2018-03-08 14:33:44 +00:00
// Start a new batch
nextReschedTime = allocReschedInfo.rescheduleTime
// Create a new eval for the new batch
eval = &structs.Evaluation{
ID: uuid.Generate(),
Namespace: a.job.Namespace,
Priority: a.job.Priority,
Type: a.job.Type,
TriggeredBy: structs.EvalTriggerRetryFailedAlloc,
JobID: a.job.ID,
JobModifyIndex: a.job.ModifyIndex,
Status: structs.EvalStatusPending,
WaitUntil: nextReschedTime,
}
evals = append(evals, eval)
2018-03-08 14:33:44 +00:00
// Set the evalID for the first alloc in this new batch
allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
}
}
a.result.desiredFollowupEvals[tgName] = evals
2018-03-08 14:33:44 +00:00
// Create in-place updates for every alloc ID that needs to be updated with its follow up eval ID
rescheduleLaterAllocs := make(map[string]*structs.Allocation)
for allocID, evalID := range allocIDToFollowupEvalID {
existingAlloc := all[allocID]
updatedAlloc := existingAlloc.Copy()
updatedAlloc.FollowupEvalID = evalID
rescheduleLaterAllocs[allocID] = updatedAlloc
}
return rescheduleLaterAllocs
}