deploymentwatcher: fail early whenever possible (#17341)
Given a deployment with a `progress_deadline`, if a task group runs out of reschedule attempts, allow the deployment to fail at that point instead of waiting until the `progress_deadline` is reached. Fixes: #17260
Parent: 81edceb2de
Commit: 649831c1d3
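As a quick illustration of the decision this change hinges on (not part of the diff below; the values are made up for the example): when a task group's reschedule policy allows no attempts, the reschedule tracker reports a failed allocation as ineligible, which is what lets the new `shouldFailEarly` fail the deployment before the `progress_deadline` expires.

```go
package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

func main() {
	// Illustrative policy: the task group allows no reschedule attempts.
	policy := &structs.ReschedulePolicy{
		Attempts:  0,
		Unlimited: false,
	}

	// A nil tracker means no reschedule events have been recorded yet; the
	// RescheduleEligible method added in this commit handles a nil receiver.
	var tracker *structs.RescheduleTracker

	// With attempts disabled the allocation can never be rescheduled, so the
	// watcher no longer needs to wait for the progress deadline to expire.
	fmt.Println(tracker.RescheduleEligible(policy, time.Now())) // false
}
```

This is the same check that `AllocListStub.RescheduleEligible` delegates to in the diff below.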
@@ -0,0 +1,3 @@
+```release-note:improvement
+deploymentwatcher: Allow deployments to fail early when running out of reschedule attempts
+```
@@ -615,12 +615,12 @@ func (w *deploymentWatcher) handleAllocUpdate(allocs []*structs.AllocListStub) (
 			continue
 		}
 
-		// Determine if the update block for this group is progress based
-		progressBased := dstate.ProgressDeadline != 0
+		// Check if we can already fail the deployment
+		failDeployment := w.shouldFailEarly(deployment, alloc, dstate)
 
 		// Check if the allocation has failed and we need to mark it for allow
 		// replacements
-		if progressBased && alloc.DeploymentStatus.IsUnhealthy() &&
+		if alloc.DeploymentStatus.IsUnhealthy() && !failDeployment &&
 			deployment.Active() && !alloc.DesiredTransition.ShouldReschedule() {
 			res.allowReplacements = append(res.allowReplacements, alloc.ID)
 			continue
@@ -631,19 +631,12 @@ func (w *deploymentWatcher) handleAllocUpdate(allocs []*structs.AllocListStub) (
 			res.createEval = true
 		}
 
-		// If the group is using a progress deadline, we don't have to do anything.
-		if progressBased {
-			continue
-		}
-
-		// Fail on the first bad allocation
-		if alloc.DeploymentStatus.IsUnhealthy() {
+		if failDeployment {
 			// Check if the group has autorevert set
 			if dstate.AutoRevert {
 				res.rollback = true
 			}
 
-			// Since we have an unhealthy allocation, fail the deployment
 			res.failDeployment = true
 		}
 
@@ -702,6 +695,31 @@ func (w *deploymentWatcher) shouldFail() (fail, rollback bool, err error) {
 	return fail, false, nil
 }
 
+func (w *deploymentWatcher) shouldFailEarly(deployment *structs.Deployment, alloc *structs.AllocListStub, dstate *structs.DeploymentState) bool {
+	if !alloc.DeploymentStatus.IsUnhealthy() {
+		return false
+	}
+
+	// Fail on the first unhealthy allocation if no progress deadline is specified.
+	if dstate.ProgressDeadline == 0 {
+		w.logger.Debug("failing deployment because an allocation failed and the deployment is not progress based", "alloc", alloc.ID)
+		return true
+	}
+
+	if deployment.Active() {
+		reschedulePolicy := w.j.LookupTaskGroup(alloc.TaskGroup).ReschedulePolicy
+		isRescheduleEligible := alloc.RescheduleEligible(reschedulePolicy, time.Now())
+		if !isRescheduleEligible {
+			// We have run out of reschedule attempts: do not wait for the progress deadline to expire because
+			// we know that we will not be able to try to get another allocation healthy
+			w.logger.Debug("failing deployment because an allocation has failed and the task group has run out of reschedule attempts", "alloc", alloc.ID)
+			return true
+		}
+	}
+
+	return false
+}
+
 // getDeploymentProgressCutoff returns the progress cutoff for the given
 // deployment
 func (w *deploymentWatcher) getDeploymentProgressCutoff(d *structs.Deployment) time.Time {
@@ -16,6 +16,7 @@ import (
 	"github.com/hashicorp/nomad/nomad/mock"
 	"github.com/hashicorp/nomad/nomad/structs"
 	"github.com/hashicorp/nomad/testutil"
+	"github.com/shoenig/test/must"
 	"github.com/stretchr/testify/assert"
 	mocker "github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
@@ -1784,6 +1785,90 @@ func TestDeploymentWatcher_Watch_StartWithoutProgressDeadline(t *testing.T) {
 	})
 }
 
+// Test that we exit before hitting the Progress Deadline when we run out of reschedule attempts
+// for a failing deployment
+func TestDeploymentWatcher_Watch_FailEarly(t *testing.T) {
+	ci.Parallel(t)
+	w, m := testDeploymentWatcher(t, 1000.0, 1*time.Millisecond)
+
+	// Create a job, alloc, and a deployment
+	j := mock.Job()
+	j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
+	j.TaskGroups[0].Update.MaxParallel = 2
+	j.TaskGroups[0].Update.ProgressDeadline = 500 * time.Millisecond
+	// Allow only 1 allocation for that deployment
+	j.TaskGroups[0].ReschedulePolicy.Attempts = 0
+	j.TaskGroups[0].ReschedulePolicy.Unlimited = false
+	j.Stable = true
+	d := mock.Deployment()
+	d.JobID = j.ID
+	d.TaskGroups["web"].ProgressDeadline = 500 * time.Millisecond
+	d.TaskGroups["web"].RequireProgressBy = time.Now().Add(d.TaskGroups["web"].ProgressDeadline)
+	a := mock.Alloc()
+	now := time.Now()
+	a.CreateTime = now.UnixNano()
+	a.ModifyTime = now.UnixNano()
+	a.DeploymentID = d.ID
+	must.Nil(t, m.state.UpsertJob(structs.MsgTypeTestSetup, m.nextIndex(), nil, j), must.Sprint("UpsertJob"))
+	must.Nil(t, m.state.UpsertDeployment(m.nextIndex(), d), must.Sprint("UpsertDeployment"))
+	must.Nil(t, m.state.UpsertAllocs(structs.MsgTypeTestSetup, m.nextIndex(), []*structs.Allocation{a}), must.Sprint("UpsertAllocs"))
+
+	// require that we get a call to UpsertDeploymentStatusUpdate
+	c := &matchDeploymentStatusUpdateConfig{
+		DeploymentID:      d.ID,
+		Status:            structs.DeploymentStatusFailed,
+		StatusDescription: structs.DeploymentStatusDescriptionFailedAllocations,
+		Eval:              true,
+	}
+	m2 := matchDeploymentStatusUpdateRequest(c)
+	m.On("UpdateDeploymentStatus", mocker.MatchedBy(m2)).Return(nil)
+
+	w.SetEnabled(true, m.state)
+	testutil.WaitForResult(func() (bool, error) { return 1 == watchersCount(w), nil },
+		func(err error) { must.Eq(t, 1, watchersCount(w), must.Sprint("Should have 1 deployment")) })
+
+	// Update the alloc to be unhealthy
+	a2 := a.Copy()
+	a2.DeploymentStatus = &structs.AllocDeploymentStatus{
+		Healthy:   pointer.Of(false),
+		Timestamp: now,
+	}
+	must.Nil(t, m.state.UpdateAllocsFromClient(structs.MsgTypeTestSetup, m.nextIndex(), []*structs.Allocation{a2}))
+
+	// Wait for the deployment to be failed
+	testutil.WaitForResult(func() (bool, error) {
+		d, err := m.state.DeploymentByID(nil, d.ID)
+		if err != nil {
+			return false, err
+		}
+
+		if d.Status != structs.DeploymentStatusFailed {
+			return false, fmt.Errorf("bad status %q", d.Status)
+		}
+
+		return d.StatusDescription == structs.DeploymentStatusDescriptionFailedAllocations, fmt.Errorf("bad status description %q", d.StatusDescription)
+	}, func(err error) {
+		t.Fatal(err)
+	})
+
+	// require there are is only one evaluation
+	testutil.WaitForResult(func() (bool, error) {
+		ws := memdb.NewWatchSet()
+		evals, err := m.state.EvalsByJob(ws, j.Namespace, j.ID)
+		if err != nil {
+			return false, err
+		}
+
+		if l := len(evals); l != 1 {
+			return false, fmt.Errorf("Got %d evals; want 1", l)
+		}
+
+		return true, nil
+	}, func(err error) {
+		t.Fatal(err)
+	})
+}
+
 // Tests that the watcher fails rollback when the spec hasn't changed
 func TestDeploymentWatcher_RollbackFailed(t *testing.T) {
 	ci.Parallel(t)
@@ -10164,6 +10164,46 @@ func (rt *RescheduleTracker) Copy() *RescheduleTracker {
 	return nt
 }
 
+func (rt *RescheduleTracker) RescheduleEligible(reschedulePolicy *ReschedulePolicy, failTime time.Time) bool {
+	if reschedulePolicy == nil {
+		return false
+	}
+	attempts := reschedulePolicy.Attempts
+	enabled := attempts > 0 || reschedulePolicy.Unlimited
+	if !enabled {
+		return false
+	}
+	if reschedulePolicy.Unlimited {
+		return true
+	}
+	// Early return true if there are no attempts yet and the number of allowed attempts is > 0
+	if (rt == nil || len(rt.Events) == 0) && attempts > 0 {
+		return true
+	}
+	attempted, _ := rt.rescheduleInfo(reschedulePolicy, failTime)
+	return attempted < attempts
+}
+
+func (rt *RescheduleTracker) rescheduleInfo(reschedulePolicy *ReschedulePolicy, failTime time.Time) (int, int) {
+	if reschedulePolicy == nil {
+		return 0, 0
+	}
+	attempts := reschedulePolicy.Attempts
+	interval := reschedulePolicy.Interval
+
+	attempted := 0
+	if rt != nil && attempts > 0 {
+		for j := len(rt.Events) - 1; j >= 0; j-- {
+			lastAttempt := rt.Events[j].RescheduleTime
+			timeDiff := failTime.UTC().UnixNano() - lastAttempt
+			if timeDiff < interval.Nanoseconds() {
+				attempted += 1
+			}
+		}
+	}
+	return attempted, attempts
+}
+
 // RescheduleEvent is used to keep track of previous attempts at rescheduling an allocation
 type RescheduleEvent struct {
 	// RescheduleTime is the timestamp of a reschedule attempt
@@ -10598,47 +10638,11 @@ func (a *Allocation) ShouldReschedule(reschedulePolicy *ReschedulePolicy, failTi
 // RescheduleEligible returns if the allocation is eligible to be rescheduled according
 // to its ReschedulePolicy and the current state of its reschedule trackers
 func (a *Allocation) RescheduleEligible(reschedulePolicy *ReschedulePolicy, failTime time.Time) bool {
-	if reschedulePolicy == nil {
-		return false
-	}
-	attempts := reschedulePolicy.Attempts
-	enabled := attempts > 0 || reschedulePolicy.Unlimited
-	if !enabled {
-		return false
-	}
-	if reschedulePolicy.Unlimited {
-		return true
-	}
-	// Early return true if there are no attempts yet and the number of allowed attempts is > 0
-	if (a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0) && attempts > 0 {
-		return true
-	}
-	attempted, _ := a.rescheduleInfo(reschedulePolicy, failTime)
-	return attempted < attempts
-}
-
-func (a *Allocation) rescheduleInfo(reschedulePolicy *ReschedulePolicy, failTime time.Time) (int, int) {
-	if reschedulePolicy == nil {
-		return 0, 0
-	}
-	attempts := reschedulePolicy.Attempts
-	interval := reschedulePolicy.Interval
-
-	attempted := 0
-	if a.RescheduleTracker != nil && attempts > 0 {
-		for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- {
-			lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime
-			timeDiff := failTime.UTC().UnixNano() - lastAttempt
-			if timeDiff < interval.Nanoseconds() {
-				attempted += 1
-			}
-		}
-	}
-	return attempted, attempts
+	return a.RescheduleTracker.RescheduleEligible(reschedulePolicy, failTime)
 }
 
 func (a *Allocation) RescheduleInfo() (int, int) {
-	return a.rescheduleInfo(a.ReschedulePolicy(), a.LastEventTime())
+	return a.RescheduleTracker.rescheduleInfo(a.ReschedulePolicy(), a.LastEventTime())
 }
 
 // LastEventTime is the time of the last task event in the allocation.
@@ -10696,7 +10700,7 @@ func (a *Allocation) nextRescheduleTime(failTime time.Time, reschedulePolicy *Re
 	rescheduleEligible := reschedulePolicy.Unlimited || (reschedulePolicy.Attempts > 0 && a.RescheduleTracker == nil)
 	if reschedulePolicy.Attempts > 0 && a.RescheduleTracker != nil && a.RescheduleTracker.Events != nil {
 		// Check for eligibility based on the interval if max attempts is set
-		attempted, attempts := a.rescheduleInfo(reschedulePolicy, failTime)
+		attempted, attempts := a.RescheduleTracker.rescheduleInfo(reschedulePolicy, failTime)
 		rescheduleEligible = attempted < attempts && nextDelay < reschedulePolicy.Interval
 	}
 	return nextRescheduleTime, rescheduleEligible
@@ -11174,6 +11178,12 @@ func (a *AllocListStub) SetEventDisplayMessages() {
 	setDisplayMsg(a.TaskStates)
 }
 
+// RescheduleEligible returns if the allocation is eligible to be rescheduled according
+// to its ReschedulePolicy and the current state of its reschedule trackers
+func (a *AllocListStub) RescheduleEligible(reschedulePolicy *ReschedulePolicy, failTime time.Time) bool {
+	return a.RescheduleTracker.RescheduleEligible(reschedulePolicy, failTime)
+}
+
 func setDisplayMsg(taskStates map[string]*TaskState) {
 	for _, taskState := range taskStates {
 		for _, event := range taskState.Events {