// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package scheduler

// The reconciler is the first stage in the scheduler for service and batch
// jobs. It compares the existing state to the desired state to determine the
// set of changes needed. System jobs and sysbatch jobs do not use the
// reconciler.

import (
	"fmt"
	"sort"
	"time"

	"github.com/armon/go-metrics"
	log "github.com/hashicorp/go-hclog"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// batchedFailedAllocWindowSize is the window size used
	// to batch up failed allocations before creating an eval
	batchedFailedAllocWindowSize = 5 * time.Second

	// rescheduleWindowSize is the window size relative to
	// current time within which reschedulable allocations are placed.
	// This helps protect against small clock drifts between servers
	rescheduleWindowSize = 1 * time.Second
)
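
// Illustrative note (behavior inferred from the constant's doc comment, not
// shown in this excerpt): follow-up evaluations for failed allocations are
// batched, so two allocations whose reschedule times land within the same 5s
// window share a single follow-up evaluation instead of producing two.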

// allocUpdateType takes an existing allocation and a new job definition and
// returns whether the allocation can ignore the change, requires a destructive
// update, or can be inplace updated. If it can be inplace updated, an updated
// allocation that has the new resources and alloc metrics attached will be
// returned.
type allocUpdateType func(existing *structs.Allocation, newJob *structs.Job,
	newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation)

// allocReconciler is used to determine the set of allocations that require
// placement, inplace updating or stopping given the job specification and
// existing cluster state. The reconciler should only be used for batch and
// service jobs.
type allocReconciler struct {
	// logger is used to log debug information. Logging should be kept to a
	// minimum here.
	logger log.Logger

	// allocUpdateFn is used to check whether an allocation can be updated
	// in place.
	allocUpdateFn allocUpdateType

	// batch marks whether the job is a batch job
	batch bool

	// job is the job being operated on, it may be nil if the job is being
	// stopped via a purge
	job *structs.Job

	// jobID is the ID of the job being operated on. The job may be nil if it is
	// being stopped so we require this separately.
	jobID string

	// oldDeployment is the last deployment for the job
	oldDeployment *structs.Deployment

	// deployment is the current deployment for the job
	deployment *structs.Deployment

	// deploymentPaused marks whether the deployment is paused
	deploymentPaused bool

	// deploymentFailed marks whether the deployment is failed
	deploymentFailed bool

	// taintedNodes contains a map of nodes that are tainted
	taintedNodes map[string]*structs.Node

	// existingAllocs is the set of non-terminal existing allocations
	existingAllocs []*structs.Allocation

	// evalID and evalPriority are the ID and priority of the evaluation that
	// triggered the reconciler.
	evalID       string
	evalPriority int

	// supportsDisconnectedClients indicates whether all servers meet the required
	// minimum version to allow application of max_client_disconnect configuration.
	supportsDisconnectedClients bool

	// now is the time used when determining rescheduling eligibility. It
	// defaults to time.Now and is overridden in unit tests.
	now time.Time

	// result is the results of the reconcile. During computation it can be
	// used to store intermediate state
	result *reconcileResults
}

// reconcileResults contains the results of the reconciliation and should be
// applied by the scheduler.
type reconcileResults struct {
	// deployment is the deployment that should be created or updated as a
	// result of scheduling
	deployment *structs.Deployment

	// deploymentUpdates contains a set of deployment updates that should be
	// applied as a result of scheduling
	deploymentUpdates []*structs.DeploymentStatusUpdate

	// place is the set of allocations to place by the scheduler
	place []allocPlaceResult

	// destructiveUpdate is the set of allocations to apply a destructive update to
	destructiveUpdate []allocDestructiveResult

	// inplaceUpdate is the set of allocations to apply an inplace update to
	inplaceUpdate []*structs.Allocation

	// stop is the set of allocations to stop
	stop []allocStopResult

	// attributeUpdates are updates to the allocation that are not from a
	// jobspec change.
	attributeUpdates map[string]*structs.Allocation

	// disconnectUpdates is the set of allocations that are on disconnected nodes
	// but have not yet had their ClientStatus set to AllocClientStatusUnknown.
	disconnectUpdates map[string]*structs.Allocation

	// reconnectUpdates is the set of allocations that have ClientStatus set to
	// AllocClientStatusUnknown, but the associated Node has reconnected.
	reconnectUpdates map[string]*structs.Allocation

	// desiredTGUpdates captures the desired set of changes to make for each
	// task group.
	desiredTGUpdates map[string]*structs.DesiredUpdates

	// desiredFollowupEvals is the map of follow-up evaluations to create per
	// task group. This is used to create a delayed evaluation for rescheduling
	// failed allocations.
	desiredFollowupEvals map[string][]*structs.Evaluation

	// taskGroupAllocNameIndexes is a tracking of the allocation name index,
	// keyed by the task group name. This is stored within the results, so the
	// generic scheduler can use this to perform duplicate alloc index checks
	// before submitting the plan. This is always non-nil and is handled within
	// a single routine, so does not require a mutex.
	taskGroupAllocNameIndexes map[string]*allocNameIndex
}

// delayedRescheduleInfo contains the allocation ID and a time when it is
// eligible to be rescheduled. This is used to create follow-up evaluations.
type delayedRescheduleInfo struct {

	// allocID is the ID of the allocation eligible to be rescheduled
	allocID string

	alloc *structs.Allocation

	// rescheduleTime is the time to use in the delayed evaluation
	rescheduleTime time.Time
}

func (r *reconcileResults) GoString() string {
	base := fmt.Sprintf("Total changes: (place %d) (destructive %d) (inplace %d) (stop %d) (disconnect %d) (reconnect %d)",
		len(r.place), len(r.destructiveUpdate), len(r.inplaceUpdate), len(r.stop), len(r.disconnectUpdates), len(r.reconnectUpdates))

	if r.deployment != nil {
		base += fmt.Sprintf("\nCreated Deployment: %q", r.deployment.ID)
	}
	for _, u := range r.deploymentUpdates {
		base += fmt.Sprintf("\nDeployment Update for ID %q: Status %q; Description %q",
			u.DeploymentID, u.Status, u.StatusDescription)
	}
	for tg, u := range r.desiredTGUpdates {
		base += fmt.Sprintf("\nDesired Changes for %q: %#v", tg, u)
	}
	return base
}

// NewAllocReconciler creates a new reconciler that should be used to determine
// the changes required to bring the cluster state in line with the declared jobspec
func NewAllocReconciler(logger log.Logger, allocUpdateFn allocUpdateType, batch bool,
	jobID string, job *structs.Job, deployment *structs.Deployment,
	existingAllocs []*structs.Allocation, taintedNodes map[string]*structs.Node, evalID string,
	evalPriority int, supportsDisconnectedClients bool) *allocReconciler {
	return &allocReconciler{
		logger:                      logger.Named("reconciler"),
		allocUpdateFn:               allocUpdateFn,
		batch:                       batch,
		jobID:                       jobID,
		job:                         job,
		deployment:                  deployment.Copy(),
		existingAllocs:              existingAllocs,
		taintedNodes:                taintedNodes,
		evalID:                      evalID,
		evalPriority:                evalPriority,
		supportsDisconnectedClients: supportsDisconnectedClients,
		now:                         time.Now(),
		result: &reconcileResults{
			attributeUpdates:          make(map[string]*structs.Allocation),
			disconnectUpdates:         make(map[string]*structs.Allocation),
			reconnectUpdates:          make(map[string]*structs.Allocation),
			desiredTGUpdates:          make(map[string]*structs.DesiredUpdates),
			desiredFollowupEvals:      make(map[string][]*structs.Evaluation),
			taskGroupAllocNameIndexes: make(map[string]*allocNameIndex),
		},
	}
}

// Compute reconciles the existing cluster state and returns the set of changes
// required to converge the job spec and state
func (a *allocReconciler) Compute() *reconcileResults {
	// Create the allocation matrix
	m := newAllocMatrix(a.job, a.existingAllocs)

	a.cancelUnneededDeployments()

	// If we are just stopping a job we do not need to do anything more than
	// stopping all running allocs
	if a.job.Stopped() {
		a.handleStop(m)
		return a.result
	}

	a.computeDeploymentPaused()
	deploymentComplete := a.computeDeploymentComplete(m)
	a.computeDeploymentUpdates(deploymentComplete)

	return a.result
}
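
// Illustrative usage sketch (an assumption about the call site, which lives in
// the generic scheduler and is not shown in this excerpt; updateFn, allocs,
// and tainted are placeholders for values the caller already holds):
//
//	r := NewAllocReconciler(logger, updateFn, batch, job.ID, job, deployment,
//		allocs, tainted, eval.ID, eval.Priority, supportsDisconnectedClients)
//	results := r.Compute()
//	// The caller then turns results.place, results.destructiveUpdate,
//	// results.inplaceUpdate, and results.stop into plan operations.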

func (a *allocReconciler) computeDeploymentComplete(m allocMatrix) bool {
	complete := true
	for group, as := range m {
		groupComplete := a.computeGroup(group, as)
		complete = complete && groupComplete
	}

	return complete
}

func (a *allocReconciler) computeDeploymentUpdates(deploymentComplete bool) {
	if a.deployment != nil {
		// Mark the deployment as complete if possible
		if deploymentComplete {
			if a.job.IsMultiregion() {
				// the unblocking/successful states come after blocked, so we
				// need to make sure we don't revert those states
				if a.deployment.Status != structs.DeploymentStatusUnblocking &&
					a.deployment.Status != structs.DeploymentStatusSuccessful {
					a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
						DeploymentID:      a.deployment.ID,
						Status:            structs.DeploymentStatusBlocked,
						StatusDescription: structs.DeploymentStatusDescriptionBlocked,
					})
				}
			} else {
				a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
					DeploymentID:      a.deployment.ID,
					Status:            structs.DeploymentStatusSuccessful,
					StatusDescription: structs.DeploymentStatusDescriptionSuccessful,
				})
			}
		}

		// Mark the deployment as pending since its state is now computed.
		if a.deployment.Status == structs.DeploymentStatusInitializing {
			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
				DeploymentID:      a.deployment.ID,
				Status:            structs.DeploymentStatusPending,
				StatusDescription: structs.DeploymentStatusDescriptionPendingForPeer,
			})
		}
	}

	// Set the description of a created deployment
	if d := a.result.deployment; d != nil {
		if d.RequiresPromotion() {
			if d.HasAutoPromote() {
				d.StatusDescription = structs.DeploymentStatusDescriptionRunningAutoPromotion
			} else {
				d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
			}
		}
	}
}

// computeDeploymentPaused is responsible for setting flags on the
// allocReconciler that indicate the state of the deployment if one
// is required. The flags that are managed are:
//  1. deploymentFailed: Did the current deployment fail just as named.
//  2. deploymentPaused: Set to true when the current deployment is paused,
//     which is usually a manual user operation, or if the deployment is
//     pending or initializing, which are the initial states for multi-region
//     job deployments. This flag tells Compute that we should not make
//     placements on the deployment.
func (a *allocReconciler) computeDeploymentPaused() {
	if a.deployment != nil {
		a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused ||
			a.deployment.Status == structs.DeploymentStatusPending ||
			a.deployment.Status == structs.DeploymentStatusInitializing
		a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
	}
}

// cancelUnneededDeployments cancels any deployment that is not needed. If the
// current deployment is not needed the deployment field is set to nil. A deployment
// update will be staged for jobs that should stop or have the wrong version.
// Unneeded deployments include:
//  1. Jobs that are marked for stop, but there is a non-terminal deployment.
//  2. Deployments that are active, but referencing a different job version.
//  3. Deployments that are already successful.
func (a *allocReconciler) cancelUnneededDeployments() {
	// If the job is stopped and there is a non-terminal deployment, cancel it
	if a.job.Stopped() {
		if a.deployment != nil && a.deployment.Active() {
			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
				DeploymentID:      a.deployment.ID,
				Status:            structs.DeploymentStatusCancelled,
				StatusDescription: structs.DeploymentStatusDescriptionStoppedJob,
			})
		}

		// Nothing else to do
		a.oldDeployment = a.deployment
		a.deployment = nil
		return
	}

	d := a.deployment
	if d == nil {
		return
	}

	// Check if the deployment is active and referencing an older job and cancel it
	if d.JobCreateIndex != a.job.CreateIndex || d.JobVersion != a.job.Version {
		if d.Active() {
			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
				DeploymentID:      a.deployment.ID,
				Status:            structs.DeploymentStatusCancelled,
				StatusDescription: structs.DeploymentStatusDescriptionNewerJob,
			})
		}

		a.oldDeployment = d
		a.deployment = nil
	}

	// Clear it as the current deployment if it is successful
	if d.Status == structs.DeploymentStatusSuccessful {
		a.oldDeployment = d
		a.deployment = nil
	}
}

// handleStop marks all allocations to be stopped, handling the lost case
func (a *allocReconciler) handleStop(m allocMatrix) {
	for group, as := range m {
		as = filterByTerminal(as)
		desiredChanges := new(structs.DesiredUpdates)
		desiredChanges.Stop = a.filterAndStopAll(as)
		a.result.desiredTGUpdates[group] = desiredChanges
	}
}

// filterAndStopAll stops all allocations in an allocSet. This is useful when
// stopping an entire job or task group.
func (a *allocReconciler) filterAndStopAll(set allocSet) uint64 {
	untainted, migrate, lost, disconnecting, reconnecting, ignore := set.filterByTainted(a.taintedNodes, a.supportsDisconnectedClients, a.now)
	a.markStop(untainted, "", allocNotNeeded)
	a.markStop(migrate, "", allocNotNeeded)
	a.markStop(lost, structs.AllocClientStatusLost, allocLost)
	a.markStop(disconnecting, "", allocNotNeeded)
	a.markStop(reconnecting, "", allocNotNeeded)
	a.markStop(ignore.filterByClientStatus(structs.AllocClientStatusUnknown), "", allocNotNeeded)
	return uint64(len(set))
}

// markStop is a helper for marking a set of allocations for stop with a
// particular client status and description.
func (a *allocReconciler) markStop(allocs allocSet, clientStatus, statusDescription string) {
	for _, alloc := range allocs {
		a.result.stop = append(a.result.stop, allocStopResult{
			alloc:             alloc,
			clientStatus:      clientStatus,
			statusDescription: statusDescription,
		})
	}
}

// markDelayed does markStop, but optionally includes a FollowupEvalID so that we can update
// the stopped alloc with its delayed rescheduling evalID
func (a *allocReconciler) markDelayed(allocs allocSet, clientStatus, statusDescription string, followupEvals map[string]string) {
	for _, alloc := range allocs {
		a.result.stop = append(a.result.stop, allocStopResult{
			alloc:             alloc,
			clientStatus:      clientStatus,
			statusDescription: statusDescription,
			followupEvalID:    followupEvals[alloc.ID],
		})
	}
}

// computeGroup reconciles state for a particular task group. It returns whether
// the deployment it is for is complete with regards to the task group.
func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool {

	// Create the desired update object for the group
	desiredChanges := new(structs.DesiredUpdates)
	a.result.desiredTGUpdates[groupName] = desiredChanges

	// Get the task group. The task group may be nil if the job was updated such
	// that the task group no longer exists
	tg := a.job.LookupTaskGroup(groupName)

	// If the task group is nil, then the task group has been removed so all we
	// need to do is stop everything
	if tg == nil {
		desiredChanges.Stop = a.filterAndStopAll(all)
		return true
	}

	dstate, existingDeployment := a.initializeDeploymentState(groupName, tg)

	// Filter allocations that do not need to be considered because they are
	// from an older job version and are terminal.
	all, ignore := a.filterOldTerminalAllocs(all)
	desiredChanges.Ignore += uint64(len(ignore))

	canaries, all := a.cancelUnneededCanaries(all, desiredChanges)

	// Determine what set of allocations are on tainted nodes
	untainted, migrate, lost, disconnecting, reconnecting, ignore := all.filterByTainted(a.taintedNodes, a.supportsDisconnectedClients, a.now)
	desiredChanges.Ignore += uint64(len(ignore))

	// Determine what set of terminal allocations need to be rescheduled
	untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, false, a.now, a.evalID, a.deployment)

	// If there are allocations reconnecting we need to reconcile them and
	// their replacements first because there is specific logic when deciding
	// which ones to keep that can only be applied when the client reconnects.
	if len(reconnecting) > 0 {
		// Pass all allocations because the replacements we need to find may be
		// in any state, including themselves being reconnected.
		reconnect, stop := a.reconcileReconnecting(reconnecting, all)

		// Stop the reconciled allocations and remove them from the other sets
		// since they have been already handled.
		desiredChanges.Stop += uint64(len(stop))

		untainted = untainted.difference(stop)
		migrate = migrate.difference(stop)
		lost = lost.difference(stop)
		disconnecting = disconnecting.difference(stop)
		reconnecting = reconnecting.difference(stop)
		ignore = ignore.difference(stop)

		// Validate and add reconnecting allocations to the plan so they are
		// logged.
		a.computeReconnecting(reconnect)

		// The rest of the reconnecting allocations is now untainted and will
		// be further reconciled below.
		untainted = untainted.union(reconnect)
	}

	// Determine what set of disconnecting allocations need to be rescheduled now,
	// which ones later and which ones can't be rescheduled at all.
	timeoutLaterEvals := map[string]string{}
	if len(disconnecting) > 0 {
		untaintedDisconnecting, rescheduleDisconnecting, laterDisconnecting := disconnecting.filterByRescheduleable(a.batch, true, a.now, a.evalID, a.deployment)

		rescheduleNow = rescheduleNow.union(rescheduleDisconnecting)
		untainted = untainted.union(untaintedDisconnecting)
		rescheduleLater = append(rescheduleLater, laterDisconnecting...)

		// Find delays for any disconnecting allocs that have max_client_disconnect,
		// create followup evals, and update the ClientStatus to unknown.
		timeoutLaterEvals = a.createTimeoutLaterEvals(disconnecting, tg.Name)
	}

	// Find delays for any lost allocs that have stop_after_client_disconnect
	lostLaterEvals := map[string]string{}
	lostLater := []*delayedRescheduleInfo{}
	if len(lost) > 0 {
		lostLater = lost.delayByStopAfterClientDisconnect()
		lostLaterEvals = a.createLostLaterEvals(lostLater, tg.Name)
	}

	// Merge disconnecting with the stop_after_client_disconnect set into the
	// lostLaterEvals so that computeStop can add them to the stop set.
	lostLaterEvals = helper.MergeMapStringString(lostLaterEvals, timeoutLaterEvals)

	// Create batched follow-up evaluations for allocations that are
	// reschedulable later and mark the allocations for in place updating
	a.createRescheduleLaterEvals(rescheduleLater, all, tg.Name)

	// Create a structure for choosing names. Seed with the taken names
	// which is the union of untainted, rescheduled, allocs on migrating
	// nodes, and allocs on down nodes (includes canaries)
	nameIndex := newAllocNameIndex(a.jobID, groupName, tg.Count, untainted.union(migrate, rescheduleNow, lost))
	a.result.taskGroupAllocNameIndexes[groupName] = nameIndex

	// Stop any unneeded allocations and update the untainted set to not
	// include stopped allocations.
	isCanarying := dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
	stop := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, isCanarying, lostLaterEvals)

	desiredChanges.Stop += uint64(len(stop))
	untainted = untainted.difference(stop)

	// Do inplace upgrades where possible and capture the set of upgrades that
	// need to be done destructively.
	ignoreUpdates, inplace, destructive := a.computeUpdates(tg, untainted)

	desiredChanges.Ignore += uint64(len(ignoreUpdates))
	desiredChanges.InPlaceUpdate += uint64(len(inplace))
	if !existingDeployment {
		dstate.DesiredTotal += len(destructive) + len(inplace)
	}

	// Remove the canaries now that we have handled rescheduling so that we do
	// not consider them when making placement decisions.
	if isCanarying {
		untainted = untainted.difference(canaries)
	}
	requiresCanaries := a.requiresCanaries(tg, dstate, destructive, canaries)
	if requiresCanaries {
		a.computeCanaries(tg, dstate, destructive, canaries, desiredChanges, nameIndex)
	}

	// Determine how many non-canary allocs we can place
	isCanarying = dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
	underProvisionedBy := a.computeUnderProvisionedBy(tg, untainted, destructive, migrate, isCanarying)

	// Place if:
	// * The deployment is not paused or failed
	// * Not placing any canaries
	// * If there are any canaries that they have been promoted
	// * There is no delayed stop_after_client_disconnect alloc, which delays scheduling for the whole group
	// * An alloc was lost
	var place []allocPlaceResult
	if len(lostLater) == 0 {
		place = a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow, lost, isCanarying)
		if !existingDeployment {
			dstate.DesiredTotal += len(place)
		}
	}

	// deploymentPlaceReady tracks whether the deployment is in a state where
	// placements can be made without any other consideration.
	deploymentPlaceReady := !a.deploymentPaused && !a.deploymentFailed && !isCanarying

	underProvisionedBy = a.computeReplacements(deploymentPlaceReady, desiredChanges, place, rescheduleNow, lost, underProvisionedBy)

	if deploymentPlaceReady {
		a.computeDestructiveUpdates(destructive, underProvisionedBy, desiredChanges, tg)
	} else {
		desiredChanges.Ignore += uint64(len(destructive))
	}

	a.computeMigrations(desiredChanges, migrate, tg, isCanarying)
	a.createDeployment(tg.Name, tg.Update, existingDeployment, dstate, all, destructive)

	// Deployments that are still initializing need to be sent in full in the
	// plan so its internal state can be persisted by the plan applier.
	if a.deployment != nil && a.deployment.Status == structs.DeploymentStatusInitializing {
		a.result.deployment = a.deployment
	}

	deploymentComplete := a.isDeploymentComplete(groupName, destructive, inplace,
		migrate, rescheduleNow, place, rescheduleLater, requiresCanaries)

	return deploymentComplete
}

func (a *allocReconciler) initializeDeploymentState(group string, tg *structs.TaskGroup) (*structs.DeploymentState, bool) {
	var dstate *structs.DeploymentState
	existingDeployment := false

	if a.deployment != nil {
		dstate, existingDeployment = a.deployment.TaskGroups[group]
	}

	if !existingDeployment {
		dstate = &structs.DeploymentState{}
		if !tg.Update.IsEmpty() {
			dstate.AutoRevert = tg.Update.AutoRevert
			dstate.AutoPromote = tg.Update.AutoPromote
			dstate.ProgressDeadline = tg.Update.ProgressDeadline
		}
	}

	return dstate, existingDeployment
}

// requiresCanaries returns true when there are destructive updates and fewer
// canaries than desired, so additional canaries need to be created.
func (a *allocReconciler) requiresCanaries(tg *structs.TaskGroup, dstate *structs.DeploymentState, destructive, canaries allocSet) bool {
	canariesPromoted := dstate != nil && dstate.Promoted
	return tg.Update != nil &&
		len(destructive) != 0 &&
		len(canaries) < tg.Update.Canary &&
		!canariesPromoted
}
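
// Worked example (illustrative values): with tg.Update.Canary = 2, one placed
// canary, pending destructive updates, and an unpromoted deployment,
// requiresCanaries returns true so a second canary is placed; once the
// deployment is promoted or two canaries exist, it returns false.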

func (a *allocReconciler) computeCanaries(tg *structs.TaskGroup, dstate *structs.DeploymentState,
	destructive, canaries allocSet, desiredChanges *structs.DesiredUpdates, nameIndex *allocNameIndex) {
	dstate.DesiredCanaries = tg.Update.Canary

	if !a.deploymentPaused && !a.deploymentFailed {
		desiredChanges.Canary += uint64(tg.Update.Canary - len(canaries))
		for _, name := range nameIndex.NextCanaries(uint(desiredChanges.Canary), canaries, destructive) {
			a.result.place = append(a.result.place, allocPlaceResult{
				name:      name,
				canary:    true,
				taskGroup: tg,
			})
		}
	}
}

// filterOldTerminalAllocs filters allocations that should be ignored since they
// are allocations that are terminal from a previous job version.
func (a *allocReconciler) filterOldTerminalAllocs(all allocSet) (filtered, ignore allocSet) {
	if !a.batch {
		return all, nil
	}

	filtered = filtered.union(all)
	ignored := make(map[string]*structs.Allocation)

	// Ignore terminal batch jobs from older versions
	for id, alloc := range filtered {
		older := alloc.Job.Version < a.job.Version || alloc.Job.CreateIndex < a.job.CreateIndex
		if older && alloc.TerminalStatus() {
			delete(filtered, id)
			ignored[id] = alloc
		}
	}

	return filtered, ignored
}

// cancelUnneededCanaries handles the canaries for the group by stopping the
// unneeded ones and returning the current set of canaries and the updated total
// set of allocs for the group
func (a *allocReconciler) cancelUnneededCanaries(original allocSet, desiredChanges *structs.DesiredUpdates) (canaries, all allocSet) {
	// Stop any canary from an older deployment or from a failed one
	var stop []string

	all = original

	// Cancel any non-promoted canaries from the older deployment
	if a.oldDeployment != nil {
		for _, dstate := range a.oldDeployment.TaskGroups {
			if !dstate.Promoted {
				stop = append(stop, dstate.PlacedCanaries...)
			}
		}
	}

	// Cancel any non-promoted canaries from a failed deployment
	if a.deployment != nil && a.deployment.Status == structs.DeploymentStatusFailed {
		for _, dstate := range a.deployment.TaskGroups {
			if !dstate.Promoted {
				stop = append(stop, dstate.PlacedCanaries...)
			}
		}
	}

	// stopSet is the allocSet that contains the canaries we desire to stop from
	// above.
	stopSet := all.fromKeys(stop)
	a.markStop(stopSet, "", allocNotNeeded)
	desiredChanges.Stop += uint64(len(stopSet))
	all = all.difference(stopSet)

	// Capture our current set of canaries and handle any migrations that are
	// needed by just stopping them.
	if a.deployment != nil {
		var canaryIDs []string
		for _, dstate := range a.deployment.TaskGroups {
			canaryIDs = append(canaryIDs, dstate.PlacedCanaries...)
		}

		canaries = all.fromKeys(canaryIDs)
		untainted, migrate, lost, _, _, _ := canaries.filterByTainted(a.taintedNodes, a.supportsDisconnectedClients, a.now)
		// We don't add these stops to desiredChanges because the deployment is
		// still active. DesiredChanges is used to report deployment progress/final
		// state. These transient failures aren't meaningful.
		a.markStop(migrate, "", allocMigrating)
		a.markStop(lost, structs.AllocClientStatusLost, allocLost)

		canaries = untainted
		all = all.difference(migrate, lost)
	}

	return
}

// computeUnderProvisionedBy returns the number of allocs that still need to be
// placed for a particular group. The inputs are the group definition, the untainted,
// destructive, and migrate allocation sets, and whether we are in a canary state.
func (a *allocReconciler) computeUnderProvisionedBy(group *structs.TaskGroup, untainted, destructive, migrate allocSet, isCanarying bool) int {
	// If no update strategy, nothing is migrating, and nothing is being replaced,
	// allow as many as defined in group.Count
	if group.Update.IsEmpty() || len(destructive)+len(migrate) == 0 {
		return group.Count
	}

	// If the deployment is nil, allow MaxParallel placements
	if a.deployment == nil {
		return group.Update.MaxParallel
	}

	// If the deployment is paused, failed, or we have un-promoted canaries, do not create anything else.
	if a.deploymentPaused ||
		a.deploymentFailed ||
		isCanarying {
		return 0
	}

	underProvisionedBy := group.Update.MaxParallel
	partOf, _ := untainted.filterByDeployment(a.deployment.ID)
	for _, alloc := range partOf {
		// An unhealthy allocation means nothing else should happen.
		if alloc.DeploymentStatus.IsUnhealthy() {
			return 0
		}
		// If not yet explicitly set to healthy (nil) decrement.
		if !alloc.DeploymentStatus.IsHealthy() {
			underProvisionedBy--
		}
	}

	// The limit can be less than zero in the case that the job was changed such
	// that it required destructive changes and the count was scaled up.
	if underProvisionedBy < 0 {
		return 0
	}

	return underProvisionedBy
}
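
// Worked example (illustrative values): with group.Update.MaxParallel = 3 and
// two allocations in the current deployment that are placed but not yet marked
// healthy, underProvisionedBy is 3 - 2 = 1, so one more placement is allowed
// this pass; a single explicitly unhealthy allocation short-circuits the
// result to 0.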

// computePlacements returns the set of allocations to place given the group
// definition, the set of untainted, migrating and reschedule allocations for the group.
//
// Placements will meet or exceed group count.
func (a *allocReconciler) computePlacements(group *structs.TaskGroup,
	nameIndex *allocNameIndex, untainted, migrate, reschedule, lost allocSet,
	isCanarying bool) []allocPlaceResult {

	// Add rescheduled placement results
	var place []allocPlaceResult
	for _, alloc := range reschedule {
		place = append(place, allocPlaceResult{
			name:               alloc.Name,
			taskGroup:          group,
			previousAlloc:      alloc,
			reschedule:         true,
			canary:             alloc.DeploymentStatus.IsCanary(),
			downgradeNonCanary: isCanarying && !alloc.DeploymentStatus.IsCanary(),
			minJobVersion:      alloc.Job.Version,
			lost:               false,
		})
	}

	// Add replacements for disconnected and lost allocs up to group.Count
	existing := len(untainted) + len(migrate) + len(reschedule)

	// Add replacements for lost
	for _, alloc := range lost {
		if existing >= group.Count {
			// Reached desired count, do not replace remaining lost
			// allocs
			break
		}

		existing++
		place = append(place, allocPlaceResult{
			name:               alloc.Name,
			taskGroup:          group,
			previousAlloc:      alloc,
			reschedule:         false,
			canary:             alloc.DeploymentStatus.IsCanary(),
			downgradeNonCanary: isCanarying && !alloc.DeploymentStatus.IsCanary(),
			minJobVersion:      alloc.Job.Version,
			lost:               true,
		})
	}

	// Add remaining placement results
	if existing < group.Count {
		for _, name := range nameIndex.Next(uint(group.Count - existing)) {
			place = append(place, allocPlaceResult{
				name:               name,
				taskGroup:          group,
				downgradeNonCanary: isCanarying,
			})
		}
	}

	return place
}
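
// Worked example (illustrative values): with group.Count = 3, one untainted
// alloc, one alloc to reschedule, and two lost allocs, computePlacements
// returns one reschedule placement plus one lost replacement (existing reaches
// 3); the second lost alloc is not replaced and no additional name-indexed
// placements are generated because the count is already met.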

// computeReplacements either applies the placements calculated by computePlacements,
// or computes more placements based on whether the deployment is ready for placement
// and if the placement is already rescheduling or part of a failed deployment.
// The input deploymentPlaceReady is true when the deployment is not paused, failed, or canarying.
// It returns the number of allocs still needed.
func (a *allocReconciler) computeReplacements(deploymentPlaceReady bool, desiredChanges *structs.DesiredUpdates,
	place []allocPlaceResult, rescheduleNow, lost allocSet, underProvisionedBy int) int {

	// Disconnecting allocs are not failing, but are included in rescheduleNow.
	// Create a new set that only includes the actual failures and compute
	// replacements based off that.
	failed := make(allocSet)
	for id, alloc := range rescheduleNow {
		_, ok := a.result.disconnectUpdates[id]
		if !ok && alloc.ClientStatus != structs.AllocClientStatusUnknown {
			failed[id] = alloc
		}
	}

	// If the deployment is place ready, apply all placements and return
	if deploymentPlaceReady {
		desiredChanges.Place += uint64(len(place))
		// This relies on the computePlacements having built this set, which in
		// turn relies on len(lostLater) == 0.
		a.result.place = append(a.result.place, place...)

		a.markStop(failed, "", allocRescheduled)
		desiredChanges.Stop += uint64(len(failed))

		minimum := min(len(place), underProvisionedBy)
		underProvisionedBy -= minimum
		return underProvisionedBy
	}

	// We do not want to place additional allocations but in the case we
	// have lost allocations or allocations that require rescheduling now,
	// we do so regardless to avoid odd user experiences.

	// If allocs have been lost, determine the number of replacements that are needed
	// and add placements to the result for the lost allocs.
	if len(lost) != 0 {
		allowed := min(len(lost), len(place))
		desiredChanges.Place += uint64(allowed)
		a.result.place = append(a.result.place, place[:allowed]...)
	}

	// if no failures or there are no pending placements return.
	if len(rescheduleNow) == 0 || len(place) == 0 {
		return underProvisionedBy
	}

	// Handle rescheduling of failed allocations even if the deployment is failed.
	// If the placement is rescheduling, and not part of a failed deployment, add
	// to the place set. Add the previous alloc to the stop set unless it is disconnecting.
	for _, p := range place {
		prev := p.PreviousAllocation()
		partOfFailedDeployment := a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID

		if !partOfFailedDeployment && p.IsRescheduling() {
			a.result.place = append(a.result.place, p)
			desiredChanges.Place++

			_, prevIsDisconnecting := a.result.disconnectUpdates[prev.ID]
			if prevIsDisconnecting {
				continue
			}

			a.result.stop = append(a.result.stop, allocStopResult{
				alloc:             prev,
				statusDescription: allocRescheduled,
			})
			desiredChanges.Stop++
		}
	}

	return underProvisionedBy
}

func (a *allocReconciler) computeDestructiveUpdates(destructive allocSet, underProvisionedBy int,
	desiredChanges *structs.DesiredUpdates, tg *structs.TaskGroup) {

	// Do all destructive updates
	minimum := min(len(destructive), underProvisionedBy)
	desiredChanges.DestructiveUpdate += uint64(minimum)
	desiredChanges.Ignore += uint64(len(destructive) - minimum)
	for _, alloc := range destructive.nameOrder()[:minimum] {
		a.result.destructiveUpdate = append(a.result.destructiveUpdate, allocDestructiveResult{
			placeName:             alloc.Name,
			placeTaskGroup:        tg,
			stopAlloc:             alloc,
			stopStatusDescription: allocUpdating,
		})
	}
}
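
// Worked example (illustrative values): with five allocations needing a
// destructive update and underProvisionedBy = 2, only the first two in name
// order are destructively updated this pass; the remaining three are counted
// as Ignore and are expected to be picked up on a later evaluation.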

func (a *allocReconciler) computeMigrations(desiredChanges *structs.DesiredUpdates, migrate allocSet, tg *structs.TaskGroup, isCanarying bool) {
	desiredChanges.Migrate += uint64(len(migrate))
	for _, alloc := range migrate.nameOrder() {
		a.result.stop = append(a.result.stop, allocStopResult{
			alloc:             alloc,
			statusDescription: allocMigrating,
		})
		a.result.place = append(a.result.place, allocPlaceResult{
			name:               alloc.Name,
			canary:             alloc.DeploymentStatus.IsCanary(),
			taskGroup:          tg,
			previousAlloc:      alloc,
			downgradeNonCanary: isCanarying && !alloc.DeploymentStatus.IsCanary(),
			minJobVersion:      alloc.Job.Version,
		})
	}
}

func (a *allocReconciler) createDeployment(groupName string, strategy *structs.UpdateStrategy,
	existingDeployment bool, dstate *structs.DeploymentState, all, destructive allocSet) {
	// Guard the simple cases that require no computation first.
	if existingDeployment ||
		strategy.IsEmpty() ||
		dstate.DesiredTotal == 0 {
		return
	}

	updatingSpec := len(destructive) != 0 || len(a.result.inplaceUpdate) != 0

	hadRunning := false
	for _, alloc := range all {
		if alloc.Job.Version == a.job.Version && alloc.Job.CreateIndex == a.job.CreateIndex {
			hadRunning = true
			break
		}
	}

	// Don't create a deployment if it's not the first time running the job
	// and there are no updates to the spec.
	if hadRunning && !updatingSpec {
		return
	}

	// A previous group may have made the deployment already. If not create one.
	if a.deployment == nil {
		a.deployment = structs.NewDeployment(a.job, a.evalPriority)
		a.result.deployment = a.deployment
	}

	// Attach the groups deployment state to the deployment
	a.deployment.TaskGroups[groupName] = dstate
}

func (a *allocReconciler) isDeploymentComplete(groupName string, destructive, inplace, migrate, rescheduleNow allocSet,
	place []allocPlaceResult, rescheduleLater []*delayedRescheduleInfo, requiresCanaries bool) bool {

	complete := len(destructive)+len(inplace)+len(place)+len(migrate)+len(rescheduleNow)+len(rescheduleLater) == 0 &&
		!requiresCanaries

	if !complete || a.deployment == nil {
		return false
	}

	// Final check to see if the deployment is complete is to ensure everything is healthy
	if dstate, ok := a.deployment.TaskGroups[groupName]; ok {
		if dstate.HealthyAllocs < max(dstate.DesiredTotal, dstate.DesiredCanaries) || // Make sure we have enough healthy allocs
			(dstate.DesiredCanaries > 0 && !dstate.Promoted) { // Make sure we are promoted if we have canaries
			complete = false
		}
	}

	return complete
}
|
|
|
|
|
2017-06-06 21:08:46 +00:00
|
|
|
// computeStop returns the set of allocations that are marked for stopping given
|
2017-08-07 21:13:05 +00:00
|
|
|
// the group definition, the set of allocations in various states and whether we
|
2017-06-06 21:08:46 +00:00
|
|
|
// are canarying.
|
2017-05-31 18:34:46 +00:00
|
|
|
func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *allocNameIndex,
|
2023-03-24 23:38:31 +00:00
|
|
|
untainted, migrate, lost, canaries allocSet, isCanarying bool, followupEvals map[string]string) allocSet {
|
2017-06-01 22:16:24 +00:00
|
|
|
|
2020-12-17 23:21:46 +00:00
|
|
|
// Mark all lost allocations for stop.
|
2017-06-01 22:16:24 +00:00
|
|
|
var stop allocSet
|
|
|
|
stop = stop.union(lost)
|
2020-06-09 21:13:53 +00:00
|
|
|
a.markDelayed(lost, structs.AllocClientStatusLost, allocLost, followupEvals)
|
2017-06-01 22:16:24 +00:00
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
// If we are still deploying or creating canaries, don't stop them
|
2022-02-05 09:54:19 +00:00
|
|
|
if isCanarying {
|
2017-06-01 22:16:24 +00:00
|
|
|
		untainted = untainted.difference(canaries)
	}

	// Remove disconnected allocations so they won't be stopped
	knownUntainted := untainted.filterOutByClientStatus(structs.AllocClientStatusUnknown)

	// Hot path the nothing to do case
	//
	// Note that this path can result in duplicate allocation indexes in a
	// scenario where a destructive job change (e.g. an image update) happens
	// together with an increased group count. Once the canary is replaced, and
	// we compute the next set of stops, the untainted set equals the new group
	// count, which results in missing one removal. The duplicate alloc index is
	// corrected in `computePlacements`.
	remove := len(knownUntainted) + len(migrate) - group.Count
	if remove <= 0 {
		return stop
	}

	// Filter out any terminal allocations from the untainted set
	// This is so that we don't try to mark them as stopped redundantly
	untainted = filterByTerminal(untainted)

	// Prefer stopping any alloc that has the same name as the canaries if we
	// are promoted
	if !isCanarying && len(canaries) != 0 {
		canaryNames := canaries.nameSet()
		for id, alloc := range untainted.difference(canaries) {
			if _, match := canaryNames[alloc.Name]; match {
				stop[id] = alloc
				a.result.stop = append(a.result.stop, allocStopResult{
					alloc:             alloc,
					statusDescription: allocNotNeeded,
				})
				delete(untainted, id)

				remove--
				if remove == 0 {
					return stop
				}
			}
		}
	}

	// Prefer selecting from the migrating set before stopping existing allocs
	if len(migrate) != 0 {
		migratingNames := newAllocNameIndex(a.jobID, group.Name, group.Count, migrate)
		removeNames := migratingNames.Highest(uint(remove))
		for id, alloc := range migrate {
			if _, match := removeNames[alloc.Name]; !match {
				continue
			}
			a.result.stop = append(a.result.stop, allocStopResult{
				alloc:             alloc,
				statusDescription: allocNotNeeded,
			})
			delete(migrate, id)
			stop[id] = alloc
			nameIndex.UnsetIndex(alloc.Index())

			remove--
			if remove == 0 {
				return stop
			}
		}
	}

	// Select the allocs with the highest count to remove
	removeNames := nameIndex.Highest(uint(remove))
	for id, alloc := range untainted {
		if _, ok := removeNames[alloc.Name]; ok {
			stop[id] = alloc
			a.result.stop = append(a.result.stop, allocStopResult{
				alloc:             alloc,
				statusDescription: allocNotNeeded,
			})
			delete(untainted, id)

			remove--
			if remove == 0 {
				return stop
			}
		}
	}

	// It is possible that we didn't stop as many as we should have if there
	// were allocations with duplicate names.
	for id, alloc := range untainted {
		stop[id] = alloc
		a.result.stop = append(a.result.stop, allocStopResult{
			alloc:             alloc,
			statusDescription: allocNotNeeded,
		})
		delete(untainted, id)

		remove--
		if remove == 0 {
			return stop
		}
	}

	return stop
}
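
// The helper below is an illustrative sketch only (it is not called by the
// reconciler) of how the stop budget above is derived: the group is over its
// target by the number of known-untainted plus migrating allocations minus
// the desired count, and a non-positive result means there is nothing to
// stop. The function and parameter names are hypothetical.
func exampleStopBudget(knownUntainted, migrating, groupCount int) int {
	remove := knownUntainted + migrating - groupCount
	if remove <= 0 {
		return 0
	}
	return remove
}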

// reconcileReconnecting receives the set of allocations that are reconnecting
// and all other allocations for the same group and determines which ones to
// reconnect and which ones to stop.
//
// - Every reconnecting allocation MUST be present in one, and only one, of
//   the returned sets.
// - Every replacement allocation that is not preferred MUST be returned in
//   the stop set.
// - Only reconnecting allocations are allowed to be present in the returned
//   reconnect set.
// - If the reconnecting allocation is to be stopped, its replacements may
//   not be present in any of the returned sets. The rest of the reconciler
//   logic will handle them.
func (a *allocReconciler) reconcileReconnecting(reconnecting allocSet, all allocSet) (allocSet, allocSet) {
	stop := make(allocSet)
	reconnect := make(allocSet)

	for _, reconnectingAlloc := range reconnecting {
		// Stop allocations that failed to reconnect.
		reconnectFailed := !reconnectingAlloc.ServerTerminalStatus() &&
			reconnectingAlloc.ClientStatus == structs.AllocClientStatusFailed

		if reconnectFailed {
			stop[reconnectingAlloc.ID] = reconnectingAlloc
			a.result.stop = append(a.result.stop, allocStopResult{
				alloc:             reconnectingAlloc,
				clientStatus:      structs.AllocClientStatusFailed,
				statusDescription: allocRescheduled,
			})
			continue
		}

		// Stop the reconnecting allocation if its desired status is not run,
		// if the user requested a transition (migrate or reschedule), or if
		// it belongs to an older version of the job.
		stopReconnecting := reconnectingAlloc.DesiredStatus != structs.AllocDesiredStatusRun ||
			reconnectingAlloc.DesiredTransition.ShouldMigrate() ||
			reconnectingAlloc.DesiredTransition.ShouldReschedule() ||
			reconnectingAlloc.DesiredTransition.ShouldForceReschedule() ||
			reconnectingAlloc.Job.Version < a.job.Version ||
			reconnectingAlloc.Job.CreateIndex < a.job.CreateIndex

		if stopReconnecting {
			stop[reconnectingAlloc.ID] = reconnectingAlloc
			a.result.stop = append(a.result.stop, allocStopResult{
				alloc:             reconnectingAlloc,
				statusDescription: allocNotNeeded,
			})
			continue
		}

		// Find replacement allocations and decide which one to stop. A
		// reconnecting allocation may have multiple replacements.
		for _, replacementAlloc := range all {

			// Skip allocations that are not a replacement of the one
			// reconnecting.
			isReplacement := replacementAlloc.ID == reconnectingAlloc.NextAllocation

			// Skip allocations that are server terminal.
			// We don't want to replace a reconnecting allocation with one that
			// is or will terminate, and we don't need to stop them since they
			// are already marked as terminal by the servers.
			if !isReplacement || replacementAlloc.ServerTerminalStatus() {
				continue
			}

			// Pick which allocation we want to keep.
			keepAlloc := pickReconnectingAlloc(reconnectingAlloc, replacementAlloc)
			if keepAlloc == replacementAlloc {
				// The replacement allocation is preferred, so stop the one
				// reconnecting if not stopped yet.
				if _, ok := stop[reconnectingAlloc.ID]; !ok {
					stop[reconnectingAlloc.ID] = reconnectingAlloc
					a.result.stop = append(a.result.stop, allocStopResult{
						alloc:             reconnectingAlloc,
						statusDescription: allocNotNeeded,
					})
				}
			} else {
				// The reconnecting allocation is preferred, so stop this
				// replacement, but avoid re-stopping stopped allocs.
				if replacementAlloc.ClientStatus != structs.AllocClientStatusFailed {
					stop[replacementAlloc.ID] = replacementAlloc
					a.result.stop = append(a.result.stop, allocStopResult{
						alloc:             replacementAlloc,
						statusDescription: allocReconnected,
					})
				}
			}
		}
	}

	// Any reconnecting allocation not set to stop must be reconnected.
	for _, alloc := range reconnecting {
		if _, ok := stop[alloc.ID]; !ok {
			reconnect[alloc.ID] = alloc
		}
	}

	return reconnect, stop
}
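
// The helper below is an illustrative sketch only (it is not used by the
// reconciler) of the contract documented above: every reconnecting ID ends up
// in exactly one of the two returned sets, with membership decided by a
// caller-supplied predicate standing in for the stop conditions checked by
// reconcileReconnecting. The names are hypothetical.
func examplePartitionReconnecting(ids []string, shouldStop func(string) bool) (reconnect, stop []string) {
	for _, id := range ids {
		if shouldStop(id) {
			stop = append(stop, id)
		} else {
			reconnect = append(reconnect, id)
		}
	}
	return reconnect, stop
}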

// pickReconnectingAlloc returns the allocation to keep between the original
// one that is reconnecting and one of its replacements.
//
// This function is not commutative, meaning that pickReconnectingAlloc(A, B)
// is not the same as pickReconnectingAlloc(B, A). Preference is given to keep
// the original allocation when possible.
func pickReconnectingAlloc(original *structs.Allocation, replacement *structs.Allocation) *structs.Allocation {
	// Check if the replacement is newer.
	// Always prefer the replacement if true.
	replacementIsNewer := replacement.Job.Version > original.Job.Version ||
		replacement.Job.CreateIndex > original.Job.CreateIndex
	if replacementIsNewer {
		return replacement
	}

	// Check if the replacement has a better placement score.
	// If either score is not available, only pick the replacement if it has
	// a score and the original does not.
	originalMaxScoreMeta := original.Metrics.MaxNormScore()
	replacementMaxScoreMeta := replacement.Metrics.MaxNormScore()

	replacementHasBetterScore := originalMaxScoreMeta == nil && replacementMaxScoreMeta != nil ||
		(originalMaxScoreMeta != nil && replacementMaxScoreMeta != nil &&
			replacementMaxScoreMeta.NormScore > originalMaxScoreMeta.NormScore)

	// Check if the replacement has a better client status.
	// Even with a better placement score make sure we don't replace a running
	// allocation with one that is not.
	replacementIsRunning := replacement.ClientStatus == structs.AllocClientStatusRunning
	originalNotRunning := original.ClientStatus != structs.AllocClientStatusRunning

	if replacementHasBetterScore && (replacementIsRunning || originalNotRunning) {
		return replacement
	}

	return original
}
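
// The helper below is an illustrative sketch only (it is not used by the
// scheduler) of the preference order applied by pickReconnectingAlloc,
// expressed over precomputed booleans: a strictly newer replacement always
// wins, a better score only wins when it does not trade a running original
// for a non-running replacement, and everything else keeps the original.
// The function and parameter names are hypothetical.
func exampleKeepDecision(replacementNewer, replacementBetterScore, replacementRunning, originalRunning bool) string {
	if replacementNewer {
		return "replacement"
	}
	if replacementBetterScore && (replacementRunning || !originalRunning) {
		return "replacement"
	}
	return "original"
}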

// computeUpdates determines which allocations for the passed group require
// updates. Three groups are returned:
// 1. Those that require no upgrades
// 2. Those that can be upgraded in-place. These are added to the results
//    automatically since the function contains the correct state to do so.
// 3. Those that require destructive updates
func (a *allocReconciler) computeUpdates(group *structs.TaskGroup, untainted allocSet) (ignore, inplace, destructive allocSet) {
	// Determine the set of allocations that need to be updated
	ignore = make(map[string]*structs.Allocation)
	inplace = make(map[string]*structs.Allocation)
	destructive = make(map[string]*structs.Allocation)

	for _, alloc := range untainted {
		ignoreChange, destructiveChange, inplaceAlloc := a.allocUpdateFn(alloc, a.job, group)
		if ignoreChange {
			ignore[alloc.ID] = alloc
		} else if destructiveChange {
			destructive[alloc.ID] = alloc
		} else {
			inplace[alloc.ID] = alloc
			a.result.inplaceUpdate = append(a.result.inplaceUpdate, inplaceAlloc)
		}
	}

	return
}
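
// The helper below is an illustrative sketch only (it is not used by the
// scheduler) of the three-way split performed by computeUpdates: a
// hypothetical classify callback stands in for allocUpdateFn, and each ID
// lands in exactly one of the ignore, in-place, or destructive buckets.
func exampleClassifyUpdates(ids []string, classify func(string) (ignore, destructive bool)) (ign, inp, destr []string) {
	for _, id := range ids {
		switch ig, de := classify(id); {
		case ig:
			ign = append(ign, id)
		case de:
			destr = append(destr, id)
		default:
			inp = append(inp, id)
		}
	}
	return ign, inp, destr
}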

// createRescheduleLaterEvals creates batched followup evaluations with the WaitUntil field
// set for allocations that are eligible to be rescheduled later, and marks each alloc with
// the followupEvalID.
func (a *allocReconciler) createRescheduleLaterEvals(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string) {
	// followupEvals are created in the same way as for delayed lost allocs
	allocIDToFollowupEvalID := a.createLostLaterEvals(rescheduleLater, tgName)

	// Create updates that will be applied to the allocs to mark the FollowupEvalID
	for _, laterAlloc := range rescheduleLater {
		existingAlloc := all[laterAlloc.alloc.ID]
		updatedAlloc := existingAlloc.Copy()
		updatedAlloc.FollowupEvalID = allocIDToFollowupEvalID[laterAlloc.alloc.ID]

		// Can't update an allocation that is disconnected
		if _, ok := a.result.disconnectUpdates[laterAlloc.allocID]; !ok {
			a.result.attributeUpdates[laterAlloc.allocID] = updatedAlloc
		} else {
			a.result.disconnectUpdates[laterAlloc.allocID].FollowupEvalID = allocIDToFollowupEvalID[laterAlloc.alloc.ID]
		}
	}
}

// computeReconnecting copies existing allocations in the unknown state, but
// whose nodes have been identified as ready. The allocations' DesiredStatus is
// set to running, and these allocs are appended to the Plan as non-destructive
// updates. Clients are responsible for reconciling the DesiredState with the
// actual state as the node comes back online.
func (a *allocReconciler) computeReconnecting(reconnecting allocSet) {
	if len(reconnecting) == 0 {
		return
	}

	// Create updates that will be appended to the plan.
	for _, alloc := range reconnecting {
		// If the user has defined a DesiredTransition, or the alloc belongs to
		// an older version of the job, don't resume the alloc.
		if alloc.DesiredTransition.ShouldMigrate() ||
			alloc.DesiredTransition.ShouldReschedule() ||
			alloc.DesiredTransition.ShouldForceReschedule() ||
			alloc.Job.Version < a.job.Version ||
			alloc.Job.CreateIndex < a.job.CreateIndex {
			continue
		}

		// If the scheduler has defined a terminal DesiredStatus don't resume the alloc.
		if alloc.DesiredStatus != structs.AllocDesiredStatusRun {
			continue
		}

		// If the alloc is not running on the client, don't reconnect it.
		if alloc.ClientStatus != structs.AllocClientStatusRunning {
			continue
		}

		// Record the new ClientStatus to indicate to future evals that the
		// alloc has already reconnected.
		// Use a copy to prevent mutating the object from statestore.
		reconnectedAlloc := alloc.Copy()
		reconnectedAlloc.AppendState(structs.AllocStateFieldClientStatus, alloc.ClientStatus)
		a.result.reconnectUpdates[reconnectedAlloc.ID] = reconnectedAlloc
	}
}

// createLostLaterEvals creates batched followup evaluations with the WaitUntil field set for
// lost allocations. FollowupEvals are appended to a.result as a side effect; it returns a
// map of alloc IDs to their followupEval IDs.
func (a *allocReconciler) createLostLaterEvals(rescheduleLater []*delayedRescheduleInfo, tgName string) map[string]string {
	if len(rescheduleLater) == 0 {
		return map[string]string{}
	}

	// Sort by time
	sort.Slice(rescheduleLater, func(i, j int) bool {
		return rescheduleLater[i].rescheduleTime.Before(rescheduleLater[j].rescheduleTime)
	})

	var evals []*structs.Evaluation
	nextReschedTime := rescheduleLater[0].rescheduleTime
	allocIDToFollowupEvalID := make(map[string]string, len(rescheduleLater))

	// Create a new eval for the first batch
	eval := &structs.Evaluation{
		ID:                uuid.Generate(),
		Namespace:         a.job.Namespace,
		Priority:          a.evalPriority,
		Type:              a.job.Type,
		TriggeredBy:       structs.EvalTriggerRetryFailedAlloc,
		JobID:             a.job.ID,
		JobModifyIndex:    a.job.ModifyIndex,
		Status:            structs.EvalStatusPending,
		StatusDescription: reschedulingFollowupEvalDesc,
		WaitUntil:         nextReschedTime,
	}
	evals = append(evals, eval)

	for _, allocReschedInfo := range rescheduleLater {
		if allocReschedInfo.rescheduleTime.Sub(nextReschedTime) < batchedFailedAllocWindowSize {
			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
		} else {
			// Start a new batch
			nextReschedTime = allocReschedInfo.rescheduleTime
			// Create a new eval for the new batch
			eval = &structs.Evaluation{
				ID:             uuid.Generate(),
				Namespace:      a.job.Namespace,
				Priority:       a.evalPriority,
				Type:           a.job.Type,
				TriggeredBy:    structs.EvalTriggerRetryFailedAlloc,
				JobID:          a.job.ID,
				JobModifyIndex: a.job.ModifyIndex,
				Status:         structs.EvalStatusPending,
				WaitUntil:      nextReschedTime,
			}
			evals = append(evals, eval)
			// Set the evalID for the first alloc in this new batch
			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
		}
		emitRescheduleInfo(allocReschedInfo.alloc, eval)
	}

	a.appendFollowupEvals(tgName, evals)

	return allocIDToFollowupEvalID
}
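
// The helper below is an illustrative sketch only (it is not used by the
// scheduler) of the batching rule applied above: the input times are assumed
// to be sorted, and a new batch (and therefore a new followup eval) starts
// whenever the next time is at least the batch window away from the time
// that opened the current batch. The function name is hypothetical.
func exampleBatchWindows(times []time.Time, window time.Duration) [][]time.Time {
	if len(times) == 0 {
		return nil
	}
	batches := [][]time.Time{{times[0]}}
	batchStart := times[0]
	for _, t := range times[1:] {
		if t.Sub(batchStart) < window {
			batches[len(batches)-1] = append(batches[len(batches)-1], t)
		} else {
			batchStart = t
			batches = append(batches, []time.Time{t})
		}
	}
	return batches
}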

// createTimeoutLaterEvals creates followup evaluations with the
// WaitUntil field set for allocations in an unknown state on disconnected nodes.
// Followup evals are appended to a.result as a side effect. It returns a map of
// alloc IDs to their associated followup eval IDs.
func (a *allocReconciler) createTimeoutLaterEvals(disconnecting allocSet, tgName string) map[string]string {
	if len(disconnecting) == 0 {
		return map[string]string{}
	}

	timeoutDelays, err := disconnecting.delayByMaxClientDisconnect(a.now)
	if err != nil {
		a.logger.Error("error computing disconnect timeouts for task_group",
			"task_group", tgName, "error", err)
		return map[string]string{}
	}

	// Sort by time
	sort.Slice(timeoutDelays, func(i, j int) bool {
		return timeoutDelays[i].rescheduleTime.Before(timeoutDelays[j].rescheduleTime)
	})

	var evals []*structs.Evaluation
	nextReschedTime := timeoutDelays[0].rescheduleTime
	allocIDToFollowupEvalID := make(map[string]string, len(timeoutDelays))

	eval := &structs.Evaluation{
		ID:                uuid.Generate(),
		Namespace:         a.job.Namespace,
		Priority:          a.evalPriority,
		Type:              a.job.Type,
		TriggeredBy:       structs.EvalTriggerMaxDisconnectTimeout,
		JobID:             a.job.ID,
		JobModifyIndex:    a.job.ModifyIndex,
		Status:            structs.EvalStatusPending,
		StatusDescription: disconnectTimeoutFollowupEvalDesc,
		WaitUntil:         nextReschedTime,
	}
	evals = append(evals, eval)

	// Important to remember that these are sorted. The rescheduleTime can only
	// get farther into the future. If this loop detects that the next delay is
	// at least the batch window (5s) away, it creates another batch.
	for _, timeoutInfo := range timeoutDelays {
		if timeoutInfo.rescheduleTime.Sub(nextReschedTime) < batchedFailedAllocWindowSize {
			allocIDToFollowupEvalID[timeoutInfo.allocID] = eval.ID
		} else {
			// Start a new batch
			nextReschedTime = timeoutInfo.rescheduleTime
			// Create a new eval for the new batch
			eval = &structs.Evaluation{
				ID:                uuid.Generate(),
				Namespace:         a.job.Namespace,
				Priority:          a.evalPriority,
				Type:              a.job.Type,
				TriggeredBy:       structs.EvalTriggerMaxDisconnectTimeout,
				JobID:             a.job.ID,
				JobModifyIndex:    a.job.ModifyIndex,
				Status:            structs.EvalStatusPending,
				StatusDescription: disconnectTimeoutFollowupEvalDesc,
				WaitUntil:         timeoutInfo.rescheduleTime,
			}
			evals = append(evals, eval)
			allocIDToFollowupEvalID[timeoutInfo.allocID] = eval.ID
		}

		emitRescheduleInfo(timeoutInfo.alloc, eval)

		// Create updates that will be applied to the allocs to mark the FollowupEvalID
		// and the unknown ClientStatus and AllocState.
		updatedAlloc := timeoutInfo.alloc.Copy()
		updatedAlloc.ClientStatus = structs.AllocClientStatusUnknown
		updatedAlloc.AppendState(structs.AllocStateFieldClientStatus, structs.AllocClientStatusUnknown)
		updatedAlloc.ClientDescription = allocUnknown
		updatedAlloc.FollowupEvalID = eval.ID
		a.result.disconnectUpdates[updatedAlloc.ID] = updatedAlloc
	}

	a.appendFollowupEvals(tgName, evals)

	return allocIDToFollowupEvalID
}

// appendFollowupEvals appends a set of followup evals for a task group to the
// desiredFollowupEvals map which is later added to the scheduler's followUpEvals set.
func (a *allocReconciler) appendFollowupEvals(tgName string, evals []*structs.Evaluation) {
	// Merge with any pre-existing followup evals for the task group.
	if existingFollowUpEvals, ok := a.result.desiredFollowupEvals[tgName]; ok {
		evals = append(existingFollowUpEvals, evals...)
	}

	a.result.desiredFollowupEvals[tgName] = evals
}

// emitRescheduleInfo emits metrics about the rescheduling decision of an evaluation. If a followup evaluation is
// provided, the waitUntil time is emitted.
func emitRescheduleInfo(alloc *structs.Allocation, followupEval *structs.Evaluation) {
	// Emit short-lived metrics data point. Note, these expire and stop emitting after about a minute.
	baseMetric := []string{"scheduler", "allocs", "reschedule"}
	labels := []metrics.Label{
		{Name: "alloc_id", Value: alloc.ID},
		{Name: "job", Value: alloc.JobID},
		{Name: "namespace", Value: alloc.Namespace},
		{Name: "task_group", Value: alloc.TaskGroup},
	}
	if followupEval != nil {
		labels = append(labels, metrics.Label{Name: "followup_eval_id", Value: followupEval.ID})
		metrics.SetGaugeWithLabels(append(baseMetric, "wait_until"), float32(followupEval.WaitUntil.Unix()), labels)
	}
	attempted, availableAttempts := alloc.RescheduleInfo()
	metrics.SetGaugeWithLabels(append(baseMetric, "attempted"), float32(attempted), labels)
	metrics.SetGaugeWithLabels(append(baseMetric, "limit"), float32(availableAttempts), labels)
}
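
// As a usage note, the gauges emitted above are published under the key parts
// ["scheduler", "allocs", "reschedule", "attempted"],
// ["scheduler", "allocs", "reschedule", "limit"], and, when a followup eval is
// present, ["scheduler", "allocs", "reschedule", "wait_until"], each carrying
// the alloc_id, job, namespace, task_group, and (if set) followup_eval_id
// labels. The exact rendered metric name depends on the configured telemetry
// sink.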