// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package deploymentwatcher

import (
	"context"
	"fmt"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"

	"github.com/hashicorp/nomad/helper/pointer"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"golang.org/x/time/rate"
)

const (
	// perJobEvalBatchPeriod is the batching length before creating an evaluation to
	// trigger the scheduler when allocations are marked as healthy.
	perJobEvalBatchPeriod = 1 * time.Second
)

var (
	// allowRescheduleTransition is the transition that allows failed
	// allocations part of a deployment to be rescheduled. We create a one off
	// variable to avoid creating a new object for every request.
	allowRescheduleTransition = &structs.DesiredTransition{
		Reschedule: pointer.Of(true),
	}
)

// deploymentTriggers are the set of functions required to trigger changes on
// behalf of a deployment
type deploymentTriggers interface {
	// createUpdate is used to create allocation desired transition updates and
	// an evaluation.
	createUpdate(allocs map[string]*structs.DesiredTransition, eval *structs.Evaluation) (uint64, error)

	// upsertJob is used to roll back a job when autoreverting for a deployment
	upsertJob(job *structs.Job) (uint64, error)

	// upsertDeploymentStatusUpdate is used to upsert a deployment status update
	// and an optional evaluation and job to upsert
	upsertDeploymentStatusUpdate(u *structs.DeploymentStatusUpdate, eval *structs.Evaluation, job *structs.Job) (uint64, error)

	// upsertDeploymentPromotion is used to promote canaries in a deployment
	upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error)

	// upsertDeploymentAllocHealth is used to set the health of allocations in a
	// deployment
	upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error)
}

// deploymentWatcher is used to watch a single deployment and trigger the
// scheduler when allocation health transitions.
type deploymentWatcher struct {
	// queryLimiter is used to limit the rate of blocking queries
	queryLimiter *rate.Limiter

	// deploymentTriggers holds the methods required to trigger changes on behalf of the
	// deployment
	deploymentTriggers

	// DeploymentRPC holds methods for interacting with peer regions
	// in enterprise edition
	DeploymentRPC

	// JobRPC holds methods for interacting with peer regions
	// in enterprise edition
	JobRPC

	// state is the state that is watched for state changes.
	state *state.StateStore

	// deploymentID is the deployment's ID being watched
	deploymentID string

	// deploymentUpdateCh is triggered when there is an updated deployment
	deploymentUpdateCh chan struct{}

	// d is the deployment being watched
	d *structs.Deployment

	// j is the job the deployment is for
	j *structs.Job

	// outstandingBatch marks whether an outstanding function exists to create
	// the evaluation. Access should be done through the lock.
	outstandingBatch bool

	// outstandingAllowReplacements is the map of allocations that will be
	// marked as allowing a replacement. Access should be done through the lock.
	outstandingAllowReplacements map[string]*structs.DesiredTransition

	// latestEval is the latest eval for the job. It is updated by the watch
	// loop and any time an evaluation is created. The field should be accessed
	// by holding the lock or using the setter and getter methods.
	latestEval uint64

	logger log.Logger
	ctx    context.Context
	exitFn context.CancelFunc
	l      sync.RWMutex
}

// newDeploymentWatcher returns a deployment watcher that is used to watch
// deployments and trigger the scheduler as needed.
func newDeploymentWatcher(parent context.Context, queryLimiter *rate.Limiter,
	logger log.Logger, state *state.StateStore, d *structs.Deployment,
	j *structs.Job, triggers deploymentTriggers,
	deploymentRPC DeploymentRPC, jobRPC JobRPC) *deploymentWatcher {

	ctx, exitFn := context.WithCancel(parent)
	w := &deploymentWatcher{
		queryLimiter:       queryLimiter,
		deploymentID:       d.ID,
		deploymentUpdateCh: make(chan struct{}, 1),
		d:                  d,
		j:                  j,
		state:              state,
		deploymentTriggers: triggers,
		DeploymentRPC:      deploymentRPC,
		JobRPC:             jobRPC,
		logger:             logger.With("deployment_id", d.ID, "job", j.NamespacedID()),
		ctx:                ctx,
		exitFn:             exitFn,
	}

	// Start the long lived watcher that scans for allocation updates
	go w.watch()

	return w
}

// updateDeployment is used to update the tracked deployment.
func (w *deploymentWatcher) updateDeployment(d *structs.Deployment) {
	w.l.Lock()
	defer w.l.Unlock()

	// Update and trigger
	w.d = d
	select {
	case w.deploymentUpdateCh <- struct{}{}:
	default:
	}
}

// getDeployment returns the tracked deployment.
func (w *deploymentWatcher) getDeployment() *structs.Deployment {
	w.l.RLock()
	defer w.l.RUnlock()
	return w.d
}
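
// SetAllocHealth is used to set the health of allocations for the watched
// deployment. When unhealthy allocations are reported, the deployment is
// marked as failed and the latest stable job may be selected for rollback.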
func (w *deploymentWatcher) SetAllocHealth(
	req *structs.DeploymentAllocHealthRequest,
	resp *structs.DeploymentUpdateResponse) error {

	// If we are failing the deployment, update the status and potentially
	// rollback
	var j *structs.Job
	var u *structs.DeploymentStatusUpdate

	// If there are unhealthy allocations we need to mark the deployment as
	// failed and check if we should roll back to a stable job.
	if l := len(req.UnhealthyAllocationIDs); l != 0 {
		unhealthy := make(map[string]struct{}, l)
		for _, alloc := range req.UnhealthyAllocationIDs {
			unhealthy[alloc] = struct{}{}
		}

		// Get the allocations for the deployment
		snap, err := w.state.Snapshot()
		if err != nil {
			return err
		}

		allocs, err := snap.AllocsByDeployment(nil, req.DeploymentID)
		if err != nil {
			return err
		}

		// Determine if we should autorevert to an older job
		desc := structs.DeploymentStatusDescriptionFailedAllocations
		for _, alloc := range allocs {
			// Check that the alloc has been marked unhealthy
			if _, ok := unhealthy[alloc.ID]; !ok {
				continue
			}

			// Check if the group has autorevert set
			dstate, ok := w.getDeployment().TaskGroups[alloc.TaskGroup]
			if !ok || !dstate.AutoRevert {
				continue
			}

			var err error
			j, err = w.latestStableJob()
			if err != nil {
				return err
			}

			if j != nil {
				j, desc = w.handleRollbackValidity(j, desc)
			}
			break
		}

		u = w.getDeploymentStatusUpdate(structs.DeploymentStatusFailed, desc)
	}

	// Canonicalize the job in case it doesn't have namespace set
	j.Canonicalize()

	// Create the request
	areq := &structs.ApplyDeploymentAllocHealthRequest{
		DeploymentAllocHealthRequest: *req,
		Timestamp:                    time.Now(),
		Eval:                         w.getEval(),
		DeploymentUpdate:             u,
		Job:                          j,
	}

	index, err := w.upsertDeploymentAllocHealth(areq)
	if err != nil {
		return err
	}

	// Build the response
	resp.EvalID = areq.Eval.ID
	resp.EvalCreateIndex = index
	resp.DeploymentModifyIndex = index
	resp.Index = index
	if j != nil {
		resp.RevertedJobVersion = pointer.Of(j.Version)
	}
	return nil
}

// handleRollbackValidity checks if the job being rolled back to has the same spec as the existing job
// Returns a modified description and job accordingly.
func (w *deploymentWatcher) handleRollbackValidity(rollbackJob *structs.Job, desc string) (*structs.Job, string) {
	// Only rollback if job being changed has a different spec.
	// This prevents an infinite revert cycle when a previously stable version of the job fails to start up during a rollback
	// If the job we are trying to rollback to is identical to the current job, we stop because the rollback will not succeed.
	if w.j.SpecChanged(rollbackJob) {
		desc = structs.DeploymentStatusDescriptionRollback(desc, rollbackJob.Version)
	} else {
		desc = structs.DeploymentStatusDescriptionRollbackNoop(desc, rollbackJob.Version)
		rollbackJob = nil
	}
	return rollbackJob, desc
}
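
// PromoteDeployment is used to promote the canary allocations of the watched
// deployment and create an evaluation so the scheduler can continue the
// rollout.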
func (w *deploymentWatcher) PromoteDeployment(
	req *structs.DeploymentPromoteRequest,
	resp *structs.DeploymentUpdateResponse) error {

	// Create the request
	areq := &structs.ApplyDeploymentPromoteRequest{
		DeploymentPromoteRequest: *req,
		Eval:                     w.getEval(),
	}

	index, err := w.upsertDeploymentPromotion(areq)
	if err != nil {
		return err
	}

	// Build the response
	resp.EvalID = areq.Eval.ID
	resp.EvalCreateIndex = index
	resp.DeploymentModifyIndex = index
	resp.Index = index
	return nil
}

// autoPromoteDeployment creates a synthetic promotion request, and upserts it for processing
func (w *deploymentWatcher) autoPromoteDeployment(allocs []*structs.AllocListStub) error {
	d := w.getDeployment()
	if !d.HasPlacedCanaries() || !d.RequiresPromotion() {
		return nil
	}

	// AutoPromote iff every task group with canaries is marked auto_promote and is healthy. The whole
	// job version has been incremented, so we promote together. See also AutoRevert
	for _, dstate := range d.TaskGroups {

		// skip auto promote canary validation if the task group has no canaries
		// to prevent auto promote hanging on mixed canary/non-canary taskgroup deploys
		if dstate.DesiredCanaries < 1 {
			continue
		}

		if !dstate.AutoPromote || len(dstate.PlacedCanaries) < dstate.DesiredCanaries {
			return nil
		}

		healthyCanaries := 0
		// Find the health status of each canary
		for _, c := range dstate.PlacedCanaries {
			for _, a := range allocs {
				if c == a.ID && a.DeploymentStatus.IsHealthy() {
					healthyCanaries += 1
				}
			}
		}
		if healthyCanaries != dstate.DesiredCanaries {
			return nil
		}
	}

	// Send the request
	_, err := w.upsertDeploymentPromotion(&structs.ApplyDeploymentPromoteRequest{
		DeploymentPromoteRequest: structs.DeploymentPromoteRequest{DeploymentID: d.GetID(), All: true},
		Eval:                     w.getEval(),
	})
	return err
}
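
// PauseDeployment is used to pause or resume the watched deployment; an
// evaluation is created only when the deployment is resumed.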
func (w *deploymentWatcher) PauseDeployment(
	req *structs.DeploymentPauseRequest,
	resp *structs.DeploymentUpdateResponse) error {
	// Determine the status we should transition to and if we need to create an
	// evaluation
	status, desc := structs.DeploymentStatusPaused, structs.DeploymentStatusDescriptionPaused
	var eval *structs.Evaluation
	evalID := ""
	if !req.Pause {
		status, desc = structs.DeploymentStatusRunning, structs.DeploymentStatusDescriptionRunning
		eval = w.getEval()
		evalID = eval.ID
	}
	update := w.getDeploymentStatusUpdate(status, desc)

	// Commit the change
	i, err := w.upsertDeploymentStatusUpdate(update, eval, nil)
	if err != nil {
		return err
	}

	// Build the response
	if evalID != "" {
		resp.EvalID = evalID
		resp.EvalCreateIndex = i
	}
	resp.DeploymentModifyIndex = i
	resp.Index = i
	return nil
}
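
// FailDeployment is used to fail the watched deployment on behalf of a user,
// rolling back to the latest stable job when any task group has auto revert
// enabled and a valid rollback target exists.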
func (w *deploymentWatcher) FailDeployment(
	req *structs.DeploymentFailRequest,
	resp *structs.DeploymentUpdateResponse) error {

	status, desc := structs.DeploymentStatusFailed, structs.DeploymentStatusDescriptionFailedByUser

	// Determine if we should rollback
	rollback := false
	for _, dstate := range w.getDeployment().TaskGroups {
		if dstate.AutoRevert {
			rollback = true
			break
		}
	}

	var rollbackJob *structs.Job
	if rollback {
		var err error
		rollbackJob, err = w.latestStableJob()
		if err != nil {
			return err
		}

		if rollbackJob != nil {
			rollbackJob, desc = w.handleRollbackValidity(rollbackJob, desc)
		} else {
			desc = structs.DeploymentStatusDescriptionNoRollbackTarget(desc)
		}
	}

	// Commit the change
	update := w.getDeploymentStatusUpdate(status, desc)
	eval := w.getEval()
	i, err := w.upsertDeploymentStatusUpdate(update, eval, rollbackJob)
	if err != nil {
		return err
	}

	// Build the response
	resp.EvalID = eval.ID
	resp.EvalCreateIndex = i
	resp.DeploymentModifyIndex = i
	resp.Index = i
	if rollbackJob != nil {
		resp.RevertedJobVersion = pointer.Of(rollbackJob.Version)
	}
	return nil
}

// StopWatch stops watching the deployment. This should be called whenever a
// deployment is completed or the watcher is no longer needed.
func (w *deploymentWatcher) StopWatch() {
	w.exitFn()
}

// watch is the long running watcher that watches for both allocation and
// deployment changes. Its function is to create evaluations to trigger the
// scheduler when more progress can be made, to fail the deployment if it has
// failed, and to potentially roll back the job. Progress can be made when an
// allocation transitions to healthy, so we create an eval.
func (w *deploymentWatcher) watch() {
	// Get the deadline. This is likely a zero time to begin with but we need to
	// handle the case that the deployment has already progressed and we are now
	// just starting to watch it. This most likely would occur if there was a
	// leader transition and we are now starting our watcher.
	currentDeadline := w.getDeploymentProgressCutoff(w.getDeployment())
	var deadlineTimer *time.Timer
	if currentDeadline.IsZero() {
		deadlineTimer = time.NewTimer(0)
		if !deadlineTimer.Stop() {
			<-deadlineTimer.C
		}
	} else {
		deadlineTimer = time.NewTimer(time.Until(currentDeadline))
	}
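
	// Start the allocation watch from index 1 so the first blocking query
	// picks up the deployment's existing allocations.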
	allocIndex := uint64(1)
	allocsCh := w.getAllocsCh(allocIndex)
	var updates *allocUpdates

	rollback, deadlineHit := false, false

FAIL:
	for {
		select {
		case <-w.ctx.Done():
			// This is the successful case, and we stop the loop
			return
		case <-deadlineTimer.C:
			// We have hit the progress deadline, so fail the deployment
			// unless we're waiting for manual promotion. We need to determine
			// whether we should roll back the job by inspecting which allocs
			// as part of the deployment are healthy and which aren't. The
			// deadlineHit flag is never reset, so even in the case of a
			// manual promotion, we'll describe any failure as a progress
			// deadline failure at this point.
			deadlineHit = true
			fail, rback, err := w.shouldFail()
			if err != nil {
				w.logger.Error("failed to determine whether to rollback job", "error", err)
			}
			if !fail {
				w.logger.Debug("skipping deadline")
				continue
			}

			w.logger.Debug("deadline hit", "rollback", rback)
			rollback = rback
			err = w.nextRegion(structs.DeploymentStatusFailed)
			if err != nil {
				w.logger.Error("multiregion deployment error", "error", err)
			}
			break FAIL
		case <-w.deploymentUpdateCh:
			// Get the updated deployment and check if we should change the
			// deadline timer
			next := w.getDeploymentProgressCutoff(w.getDeployment())
			if !next.Equal(currentDeadline) {
				prevDeadlineZero := currentDeadline.IsZero()
				currentDeadline = next
				// The most recent deadline can be zero if no allocs were created for this deployment.
				// The deadline timer would have already been stopped once in that case. To prevent
				// deadlocking on the already stopped deadline timer, we only drain the channel if
				// the previous deadline was not zero.
				if !prevDeadlineZero && !deadlineTimer.Stop() {
					select {
					case <-deadlineTimer.C:
					default:
					}
				}

				// If the next deadline is zero, we should not reset the timer
				// as we aren't tracking towards a progress deadline yet. This
				// can happen if you have multiple task groups with progress
				// deadlines and one of the task groups hasn't made any
				// placements. As soon as the other task group finishes its
				// rollout, the next progress deadline becomes zero, so we want
				// to avoid resetting, causing a deployment failure.
				if !next.IsZero() {
					deadlineTimer.Reset(time.Until(next))
					w.logger.Trace("resetting deadline")
				}
			}

			err := w.nextRegion(w.getStatus())
			if err != nil {
				break FAIL
			}

		case updates = <-allocsCh:
			if err := updates.err; err != nil {
				if err == context.Canceled || w.ctx.Err() == context.Canceled {
					return
				}

				w.logger.Error("failed to retrieve allocations", "error", err)
				return
			}
			allocIndex = updates.index

			// We have allocation changes for this deployment so determine the
			// steps to take.
			res, err := w.handleAllocUpdate(updates.allocs)
			if err != nil {
				if err == context.Canceled || w.ctx.Err() == context.Canceled {
					return
				}

				w.logger.Error("failed handling allocation updates", "error", err)
				return
			}

			// The deployment has failed, so break out of the watch loop and
			// handle the failure
			if res.failDeployment {
				rollback = res.rollback
				err := w.nextRegion(structs.DeploymentStatusFailed)
				if err != nil {
					w.logger.Error("multiregion deployment error", "error", err)
				}
				break FAIL
			}

			// If permitted, automatically promote this canary deployment
			err = w.autoPromoteDeployment(updates.allocs)
			if err != nil {
				w.logger.Error("failed to auto promote deployment", "error", err)
			}

			// Create an eval to push the deployment along
			if res.createEval || len(res.allowReplacements) != 0 {
				w.createBatchedUpdate(res.allowReplacements, allocIndex)
			}

			// only start a new blocking query if we haven't returned early
			allocsCh = w.getAllocsCh(allocIndex)
		}
	}

	// Change the deployment's status to failed
	desc := structs.DeploymentStatusDescriptionFailedAllocations
	if deadlineHit {
		desc = structs.DeploymentStatusDescriptionProgressDeadline
	}

	// Rollback to the old job if necessary
	var j *structs.Job
	if rollback {
		var err error
		j, err = w.latestStableJob()
		if err != nil {
			w.logger.Error("failed to lookup latest stable job", "error", err)
		}

		// Description should include that the job is being rolled back to
		// version N
		if j != nil {
			j, desc = w.handleRollbackValidity(j, desc)
		} else {
			desc = structs.DeploymentStatusDescriptionNoRollbackTarget(desc)
		}
	}

	// Update the status of the deployment to failed and create an evaluation.
	e := w.getEval()
	u := w.getDeploymentStatusUpdate(structs.DeploymentStatusFailed, desc)
	if _, err := w.upsertDeploymentStatusUpdate(u, e, j); err != nil {
		w.logger.Error("failed to update deployment status", "error", err)
	}
}

// allocUpdateResult is used to return the desired actions given the newest set
// of allocations for the deployment.
type allocUpdateResult struct {
	createEval        bool
	failDeployment    bool
	rollback          bool
	allowReplacements []string
}

// handleAllocUpdate is used to compute the set of actions to take based on the
// updated allocations for the deployment.
func (w *deploymentWatcher) handleAllocUpdate(allocs []*structs.AllocListStub) (allocUpdateResult, error) {
	var res allocUpdateResult

	// Get the latest evaluation index
	latestEval, err := w.jobEvalStatus()
	if err != nil {
		if err == context.Canceled || w.ctx.Err() == context.Canceled {
			return res, err
		}

		return res, fmt.Errorf("failed to determine last evaluation index for job %q: %v", w.j.ID, err)
	}

	deployment := w.getDeployment()
	for _, alloc := range allocs {
		dstate, ok := deployment.TaskGroups[alloc.TaskGroup]
		if !ok {
			continue
		}

		// Check if we can already fail the deployment
		failDeployment := w.shouldFailEarly(deployment, alloc, dstate)

		// Check if the allocation has failed and we need to mark it for allow
		// replacements
		if alloc.DeploymentStatus.IsUnhealthy() && !failDeployment &&
			deployment.Active() && !alloc.DesiredTransition.ShouldReschedule() {
			res.allowReplacements = append(res.allowReplacements, alloc.ID)
			continue
		}

		// We need to create an eval so the job can progress.
		if alloc.DeploymentStatus.IsHealthy() && alloc.DeploymentStatus.ModifyIndex > latestEval {
			res.createEval = true
		}

		if failDeployment {
			// Check if the group has autorevert set
			if dstate.AutoRevert {
				res.rollback = true
			}

			res.failDeployment = true
		}

		// All conditions have been hit so we can break
		if res.createEval && res.failDeployment && res.rollback {
			break
		}
	}

	return res, nil
}

// shouldFail returns whether the job should be failed and whether it should
// be rolled back to an earlier stable version by examining the allocations in
// the deployment.
func (w *deploymentWatcher) shouldFail() (fail, rollback bool, err error) {
	snap, err := w.state.Snapshot()
	if err != nil {
		return false, false, err
	}

	d, err := snap.DeploymentByID(nil, w.deploymentID)
	if err != nil {
		return false, false, err
	}
	if d == nil {
		// The deployment wasn't in the state store, possibly due to a system gc
		return false, false, fmt.Errorf("deployment id not found: %q", w.deploymentID)
	}

	fail = false
	for tg, dstate := range d.TaskGroups {
		// If we are in a canary state we fail if there aren't enough healthy
		// allocs to satisfy DesiredCanaries
		if dstate.DesiredCanaries > 0 && !dstate.Promoted {
			if dstate.HealthyAllocs >= dstate.DesiredCanaries {
				continue
			}
		} else if dstate.HealthyAllocs >= dstate.DesiredTotal {
			continue
		}

		// We have failed this TG
		fail = true

		// We don't need to autorevert this group
		upd := w.j.LookupTaskGroup(tg).Update
		if upd == nil || !upd.AutoRevert {
			continue
		}

		// Unhealthy allocs and we need to autorevert
		return fail, true, nil
	}

	return fail, false, nil
}
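
// shouldFailEarly returns whether the deployment should be failed before the
// progress deadline expires because an unhealthy allocation cannot be
// replaced: either no progress deadline is configured, or the allocation has
// exhausted its reschedule attempts while the deployment is active.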
func (w *deploymentWatcher) shouldFailEarly(deployment *structs.Deployment, alloc *structs.AllocListStub, dstate *structs.DeploymentState) bool {
	if !alloc.DeploymentStatus.IsUnhealthy() {
		return false
	}

	// Fail on the first unhealthy allocation if no progress deadline is specified.
	if dstate.ProgressDeadline == 0 {
		w.logger.Debug("failing deployment because an allocation failed and the deployment is not progress based", "alloc", alloc.ID)
		return true
	}

	if deployment.Active() {
		reschedulePolicy := w.j.LookupTaskGroup(alloc.TaskGroup).ReschedulePolicy
		isRescheduleEligible := alloc.RescheduleEligible(reschedulePolicy, time.Now())
		if !isRescheduleEligible {
			// We have run out of reschedule attempts: do not wait for the progress deadline to expire because
			// we know that we will not be able to try to get another allocation healthy
			w.logger.Debug("failing deployment because an allocation has failed and the task group has run out of reschedule attempts", "alloc", alloc.ID)
			return true
		}
	}

	return false
}

// getDeploymentProgressCutoff returns the progress cutoff for the given
// deployment
func (w *deploymentWatcher) getDeploymentProgressCutoff(d *structs.Deployment) time.Time {
	var next time.Time
	doneTGs := w.doneGroups(d)
	for name, dstate := range d.TaskGroups {
		// This task group is done so we don't have to concern ourselves with
		// its progress deadline.
		if done, ok := doneTGs[name]; ok && done {
			continue
		}

		if dstate.RequireProgressBy.IsZero() {
			continue
		}

		if next.IsZero() || dstate.RequireProgressBy.Before(next) {
			next = dstate.RequireProgressBy
		}
	}
	return next
}

// doneGroups returns a map of task group to whether the deployment appears to
// be done for the group. A true value doesn't mean no more action will be taken
// in the lifetime of the deployment because there could always be node
// failures, or rescheduling events.
func (w *deploymentWatcher) doneGroups(d *structs.Deployment) map[string]bool {
	if d == nil {
		return nil
	}

	// Collect the allocations by the task group
	snap, err := w.state.Snapshot()
	if err != nil {
		return nil
	}

	allocs, err := snap.AllocsByDeployment(nil, d.ID)
	if err != nil {
		return nil
	}

	// Go through the allocs and count up how many healthy allocs we have
	healthy := make(map[string]int, len(d.TaskGroups))
	for _, a := range allocs {
		if a.TerminalStatus() || !a.DeploymentStatus.IsHealthy() {
			continue
		}
		healthy[a.TaskGroup]++
	}

	// Go through each group and check if it is done
	groups := make(map[string]bool, len(d.TaskGroups))
	for name, dstate := range d.TaskGroups {
		// Requires promotion
		if dstate.DesiredCanaries != 0 && !dstate.Promoted {
			groups[name] = false
			continue
		}

		// Check we have enough healthy currently running allocations
		groups[name] = healthy[name] >= dstate.DesiredTotal
	}

	return groups
}

// latestStableJob returns the latest stable job. It may be nil if none exist
func (w *deploymentWatcher) latestStableJob() (*structs.Job, error) {
	snap, err := w.state.Snapshot()
	if err != nil {
		return nil, err
	}

	versions, err := snap.JobVersionsByID(nil, w.j.Namespace, w.j.ID)
	if err != nil {
		return nil, err
	}

	var stable *structs.Job
	for _, job := range versions {
		if job.Stable {
			stable = job
			break
		}
	}

	return stable, nil
}

// createBatchedUpdate creates an eval for the given index as well as updating
// the given allocations to allow them to reschedule.
func (w *deploymentWatcher) createBatchedUpdate(allowReplacements []string, forIndex uint64) {
	w.l.Lock()
	defer w.l.Unlock()

	// Store the allocations that can be replaced
	for _, allocID := range allowReplacements {
		if w.outstandingAllowReplacements == nil {
			w.outstandingAllowReplacements = make(map[string]*structs.DesiredTransition, len(allowReplacements))
		}
		w.outstandingAllowReplacements[allocID] = allowRescheduleTransition
	}

	if w.outstandingBatch || (forIndex < w.latestEval && len(allowReplacements) == 0) {
		return
	}

	w.outstandingBatch = true

	time.AfterFunc(perJobEvalBatchPeriod, func() {
		// If the timer has been created and then we shut down, we need to no-op
		// the evaluation creation.
		select {
		case <-w.ctx.Done():
			return
		default:
		}

		w.l.Lock()
		replacements := w.outstandingAllowReplacements
		w.outstandingAllowReplacements = nil
		w.outstandingBatch = false
		w.l.Unlock()

		// Create the eval
		if _, err := w.createUpdate(replacements, w.getEval()); err != nil {
			w.logger.Error("failed to create evaluation for deployment", "deployment_id", w.deploymentID, "error", err)
		}
	})
}

// getEval returns an evaluation suitable for the deployment
func (w *deploymentWatcher) getEval() *structs.Evaluation {
	now := time.Now().UTC().UnixNano()

	// During a server upgrade it's possible we end up with deployments created
	// on the previous version that are then "watched" on a leader that's on
	// the new version. This would result in an eval with its priority set to
	// zero which would be bad. This therefore protects against that.
	w.l.Lock()
	priority := w.d.EvalPriority
	if priority == 0 {
		priority = w.j.Priority
	}
	w.l.Unlock()

	return &structs.Evaluation{
		ID:           uuid.Generate(),
		Namespace:    w.j.Namespace,
		Priority:     priority,
		Type:         w.j.Type,
		TriggeredBy:  structs.EvalTriggerDeploymentWatcher,
		JobID:        w.j.ID,
		DeploymentID: w.deploymentID,
		Status:       structs.EvalStatusPending,
		CreateTime:   now,
		ModifyTime:   now,
	}
}

// getDeploymentStatusUpdate returns a deployment status update
func (w *deploymentWatcher) getDeploymentStatusUpdate(status, desc string) *structs.DeploymentStatusUpdate {
	return &structs.DeploymentStatusUpdate{
		DeploymentID:      w.deploymentID,
		Status:            status,
		StatusDescription: desc,
	}
}

// getStatus returns the current status of the deployment
func (w *deploymentWatcher) getStatus() string {
	w.l.RLock()
	defer w.l.RUnlock()
	return w.d.Status
}
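
// allocUpdates carries the result of a single blocking allocation query: the
// allocation stubs, the index the query returned at, and any error.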
type allocUpdates struct {
	allocs []*structs.AllocListStub
	index  uint64
	err    error
}

// getAllocsCh creates a channel and starts a goroutine that
// 1. parks a blocking query for allocations on the state
// 2. reads those and drops them on the channel
// This query runs once here, but watch calls it in a loop
func (w *deploymentWatcher) getAllocsCh(index uint64) <-chan *allocUpdates {
	out := make(chan *allocUpdates, 1)
	go func() {
		allocs, index, err := w.getAllocs(index)
		out <- &allocUpdates{
			allocs: allocs,
			index:  index,
			err:    err,
		}
	}()

	return out
}

// getAllocs retrieves the allocations that are part of the deployment blocking
// at the given index.
func (w *deploymentWatcher) getAllocs(index uint64) ([]*structs.AllocListStub, uint64, error) {
	resp, index, err := w.state.BlockingQuery(w.getAllocsImpl, index, w.ctx)
	if err != nil {
		return nil, 0, err
	}
	if err := w.ctx.Err(); err != nil {
		return nil, 0, err
	}

	return resp.([]*structs.AllocListStub), index, nil
}

// getAllocsImpl retrieves the allocations for the watched deployment from the
// passed state store.
func (w *deploymentWatcher) getAllocsImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	if err := w.queryLimiter.Wait(w.ctx); err != nil {
		return nil, 0, err
	}

	// Capture all the allocations
	allocs, err := state.AllocsByDeployment(ws, w.deploymentID)
	if err != nil {
		return nil, 0, err
	}

	maxIndex := uint64(0)
	stubs := make([]*structs.AllocListStub, 0, len(allocs))
	for _, alloc := range allocs {
		stubs = append(stubs, alloc.Stub(nil))

		if maxIndex < alloc.ModifyIndex {
			maxIndex = alloc.ModifyIndex
		}
	}

	// Use the last index that affected the allocs table
	if len(stubs) == 0 {
		index, err := state.Index("allocs")
		if err != nil {
			return nil, index, err
		}
		maxIndex = index
	}

	return stubs, maxIndex, nil
}

// jobEvalStatus returns the latest eval index for a job. The index is used to
// determine if an allocation update requires an evaluation to be triggered.
func (w *deploymentWatcher) jobEvalStatus() (latestIndex uint64, err error) {
	if err := w.queryLimiter.Wait(w.ctx); err != nil {
		return 0, err
	}

	snap, err := w.state.Snapshot()
	if err != nil {
		return 0, err
	}

	evals, err := snap.EvalsByJob(nil, w.j.Namespace, w.j.ID)
	if err != nil {
		return 0, err
	}

	// If there are no evals for the job, return zero, since we want any
	// allocation change to trigger an evaluation.
	if len(evals) == 0 {
		return 0, nil
	}

	var max uint64
	for _, eval := range evals {
		// A cancelled eval never impacts what the scheduler has seen, so do
		// not use its indexes.
		if eval.Status == structs.EvalStatusCancelled {
			continue
		}

		// Prefer using the snapshot index. Otherwise use the create index
		if eval.SnapshotIndex != 0 && max < eval.SnapshotIndex {
			max = eval.SnapshotIndex
		} else if max < eval.CreateIndex {
			max = eval.CreateIndex
		}
	}

	return max, nil
}