deploymentwatcher: fail early whenever possible (#17341)

Given a deployment that has a `progress_deadline`, if a task group runs
out of reschedule attempts, fail the deployment at that point instead of
waiting until the `progress_deadline` is reached.

Fixes: #17260
nicoche 2023-06-26 20:01:03 +02:00 committed by GitHub
parent 81edceb2de
commit 649831c1d3
4 changed files with 166 additions and 50 deletions
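
To make the change concrete, here is a minimal sketch of the scenario (not part of this commit), assuming the module's `nomad/mock` and `nomad/structs` packages; the 10-minute deadline is an arbitrary illustrative value.

```go
package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
)

func main() {
	// A task group with a progress deadline whose reschedule policy allows
	// no further attempts.
	j := mock.Job()
	j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	j.TaskGroups[0].Update.ProgressDeadline = 10 * time.Minute
	j.TaskGroups[0].ReschedulePolicy.Attempts = 0
	j.TaskGroups[0].ReschedulePolicy.Unlimited = false

	// With zero attempts allowed, an allocation is never eligible for
	// rescheduling. shouldFailEarly keys off this: previously the watcher
	// waited out the full progress deadline before failing the deployment;
	// now it fails (and optionally auto-reverts) as soon as an unhealthy,
	// non-reschedulable allocation is reported.
	a := mock.Alloc()
	policy := j.TaskGroups[0].ReschedulePolicy
	fmt.Println(a.RescheduleEligible(policy, time.Now())) // false
}
```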

.changelog/17341.txt Normal file

@@ -0,0 +1,3 @@
```release-note:improvement
deploymentwatcher: Allow deployments to fail early when running out of reschedule attempts
```


@@ -615,12 +615,12 @@ func (w *deploymentWatcher) handleAllocUpdate(allocs []*structs.AllocListStub) (
continue
}
// Determine if the update block for this group is progress based
progressBased := dstate.ProgressDeadline != 0
// Check if we can already fail the deployment
failDeployment := w.shouldFailEarly(deployment, alloc, dstate)
// Check if the allocation has failed and we need to mark it for allow
// replacements
if progressBased && alloc.DeploymentStatus.IsUnhealthy() &&
if alloc.DeploymentStatus.IsUnhealthy() && !failDeployment &&
deployment.Active() && !alloc.DesiredTransition.ShouldReschedule() {
res.allowReplacements = append(res.allowReplacements, alloc.ID)
continue
@@ -631,19 +631,12 @@ func (w *deploymentWatcher) handleAllocUpdate(allocs []*structs.AllocListStub) (
res.createEval = true
}
// If the group is using a progress deadline, we don't have to do anything.
if progressBased {
continue
}
// Fail on the first bad allocation
if alloc.DeploymentStatus.IsUnhealthy() {
if failDeployment {
// Check if the group has autorevert set
if dstate.AutoRevert {
res.rollback = true
}
// Since we have an unhealthy allocation, fail the deployment
res.failDeployment = true
}
@@ -702,6 +695,31 @@ func (w *deploymentWatcher) shouldFail() (fail, rollback bool, err error) {
return fail, false, nil
}
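// shouldFailEarly returns true if the deployment can be failed without waiting
// for the progress deadline: either the group has no progress deadline and the
// allocation is unhealthy, or the unhealthy allocation has run out of
// reschedule attempts while the deployment is still active.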
func (w *deploymentWatcher) shouldFailEarly(deployment *structs.Deployment, alloc *structs.AllocListStub, dstate *structs.DeploymentState) bool {
if !alloc.DeploymentStatus.IsUnhealthy() {
return false
}
// Fail on the first unhealthy allocation if no progress deadline is specified.
if dstate.ProgressDeadline == 0 {
w.logger.Debug("failing deployment because an allocation failed and the deployment is not progress based", "alloc", alloc.ID)
return true
}
if deployment.Active() {
reschedulePolicy := w.j.LookupTaskGroup(alloc.TaskGroup).ReschedulePolicy
isRescheduleEligible := alloc.RescheduleEligible(reschedulePolicy, time.Now())
if !isRescheduleEligible {
// We have run out of reschedule attempts: do not wait for the progress deadline to expire because
// we know we will not get another chance to bring up a healthy allocation
w.logger.Debug("failing deployment because an allocation has failed and the task group has run out of reschedule attempts", "alloc", alloc.ID)
return true
}
}
return false
}
// getDeploymentProgressCutoff returns the progress cutoff for the given
// deployment
func (w *deploymentWatcher) getDeploymentProgressCutoff(d *structs.Deployment) time.Time {


@@ -16,6 +16,7 @@ import (
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
"github.com/shoenig/test/must"
"github.com/stretchr/testify/assert"
mocker "github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
@@ -1784,6 +1785,90 @@ func TestDeploymentWatcher_Watch_StartWithoutProgressDeadline(t *testing.T) {
})
}
// Test that we exit before hitting the Progress Deadline when we run out of reschedule attempts
// for a failing deployment
func TestDeploymentWatcher_Watch_FailEarly(t *testing.T) {
ci.Parallel(t)
w, m := testDeploymentWatcher(t, 1000.0, 1*time.Millisecond)
// Create a job, alloc, and a deployment
j := mock.Job()
j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
j.TaskGroups[0].Update.MaxParallel = 2
j.TaskGroups[0].Update.ProgressDeadline = 500 * time.Millisecond
// Allow only 1 allocation for that deployment
j.TaskGroups[0].ReschedulePolicy.Attempts = 0
j.TaskGroups[0].ReschedulePolicy.Unlimited = false
j.Stable = true
d := mock.Deployment()
d.JobID = j.ID
d.TaskGroups["web"].ProgressDeadline = 500 * time.Millisecond
d.TaskGroups["web"].RequireProgressBy = time.Now().Add(d.TaskGroups["web"].ProgressDeadline)
a := mock.Alloc()
now := time.Now()
a.CreateTime = now.UnixNano()
a.ModifyTime = now.UnixNano()
a.DeploymentID = d.ID
must.Nil(t, m.state.UpsertJob(structs.MsgTypeTestSetup, m.nextIndex(), nil, j), must.Sprint("UpsertJob"))
must.Nil(t, m.state.UpsertDeployment(m.nextIndex(), d), must.Sprint("UpsertDeployment"))
must.Nil(t, m.state.UpsertAllocs(structs.MsgTypeTestSetup, m.nextIndex(), []*structs.Allocation{a}), must.Sprint("UpsertAllocs"))
// require that we get a call to UpsertDeploymentStatusUpdate
c := &matchDeploymentStatusUpdateConfig{
DeploymentID: d.ID,
Status: structs.DeploymentStatusFailed,
StatusDescription: structs.DeploymentStatusDescriptionFailedAllocations,
Eval: true,
}
m2 := matchDeploymentStatusUpdateRequest(c)
m.On("UpdateDeploymentStatus", mocker.MatchedBy(m2)).Return(nil)
w.SetEnabled(true, m.state)
testutil.WaitForResult(func() (bool, error) { return 1 == watchersCount(w), nil },
func(err error) { must.Eq(t, 1, watchersCount(w), must.Sprint("Should have 1 deployment")) })
// Update the alloc to be unhealthy
a2 := a.Copy()
a2.DeploymentStatus = &structs.AllocDeploymentStatus{
Healthy: pointer.Of(false),
Timestamp: now,
}
must.Nil(t, m.state.UpdateAllocsFromClient(structs.MsgTypeTestSetup, m.nextIndex(), []*structs.Allocation{a2}))
// Wait for the deployment to be failed
testutil.WaitForResult(func() (bool, error) {
d, err := m.state.DeploymentByID(nil, d.ID)
if err != nil {
return false, err
}
if d.Status != structs.DeploymentStatusFailed {
return false, fmt.Errorf("bad status %q", d.Status)
}
return d.StatusDescription == structs.DeploymentStatusDescriptionFailedAllocations, fmt.Errorf("bad status description %q", d.StatusDescription)
}, func(err error) {
t.Fatal(err)
})
// require there is only one evaluation
testutil.WaitForResult(func() (bool, error) {
ws := memdb.NewWatchSet()
evals, err := m.state.EvalsByJob(ws, j.Namespace, j.ID)
if err != nil {
return false, err
}
if l := len(evals); l != 1 {
return false, fmt.Errorf("Got %d evals; want 1", l)
}
return true, nil
}, func(err error) {
t.Fatal(err)
})
}
// Tests that the watcher fails rollback when the spec hasn't changed
func TestDeploymentWatcher_RollbackFailed(t *testing.T) {
ci.Parallel(t)


@@ -10164,6 +10164,46 @@ func (rt *RescheduleTracker) Copy() *RescheduleTracker {
return nt
}
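// RescheduleEligible returns whether the tracked allocation is eligible for
// another reschedule attempt under the given policy at failTime.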
func (rt *RescheduleTracker) RescheduleEligible(reschedulePolicy *ReschedulePolicy, failTime time.Time) bool {
if reschedulePolicy == nil {
return false
}
attempts := reschedulePolicy.Attempts
enabled := attempts > 0 || reschedulePolicy.Unlimited
if !enabled {
return false
}
if reschedulePolicy.Unlimited {
return true
}
// Early return true if there are no attempts yet and the number of allowed attempts is > 0
if (rt == nil || len(rt.Events) == 0) && attempts > 0 {
return true
}
attempted, _ := rt.rescheduleInfo(reschedulePolicy, failTime)
return attempted < attempts
}
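// rescheduleInfo returns the number of reschedule attempts made within the
// policy interval ending at failTime, and the number of attempts the policy allows.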
func (rt *RescheduleTracker) rescheduleInfo(reschedulePolicy *ReschedulePolicy, failTime time.Time) (int, int) {
if reschedulePolicy == nil {
return 0, 0
}
attempts := reschedulePolicy.Attempts
interval := reschedulePolicy.Interval
attempted := 0
if rt != nil && attempts > 0 {
for j := len(rt.Events) - 1; j >= 0; j-- {
lastAttempt := rt.Events[j].RescheduleTime
timeDiff := failTime.UTC().UnixNano() - lastAttempt
if timeDiff < interval.Nanoseconds() {
attempted += 1
}
}
}
return attempted, attempts
}
// RescheduleEvent is used to keep track of previous attempts at rescheduling an allocation
type RescheduleEvent struct {
// RescheduleTime is the timestamp of a reschedule attempt
@@ -10598,47 +10638,11 @@ func (a *Allocation) ShouldReschedule(reschedulePolicy *ReschedulePolicy, failTi
// RescheduleEligible returns if the allocation is eligible to be rescheduled according
// to its ReschedulePolicy and the current state of its reschedule trackers
func (a *Allocation) RescheduleEligible(reschedulePolicy *ReschedulePolicy, failTime time.Time) bool {
if reschedulePolicy == nil {
return false
}
attempts := reschedulePolicy.Attempts
enabled := attempts > 0 || reschedulePolicy.Unlimited
if !enabled {
return false
}
if reschedulePolicy.Unlimited {
return true
}
// Early return true if there are no attempts yet and the number of allowed attempts is > 0
if (a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0) && attempts > 0 {
return true
}
attempted, _ := a.rescheduleInfo(reschedulePolicy, failTime)
return attempted < attempts
}
func (a *Allocation) rescheduleInfo(reschedulePolicy *ReschedulePolicy, failTime time.Time) (int, int) {
if reschedulePolicy == nil {
return 0, 0
}
attempts := reschedulePolicy.Attempts
interval := reschedulePolicy.Interval
attempted := 0
if a.RescheduleTracker != nil && attempts > 0 {
for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- {
lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime
timeDiff := failTime.UTC().UnixNano() - lastAttempt
if timeDiff < interval.Nanoseconds() {
attempted += 1
}
}
}
return attempted, attempts
return a.RescheduleTracker.RescheduleEligible(reschedulePolicy, failTime)
}
func (a *Allocation) RescheduleInfo() (int, int) {
return a.rescheduleInfo(a.ReschedulePolicy(), a.LastEventTime())
return a.RescheduleTracker.rescheduleInfo(a.ReschedulePolicy(), a.LastEventTime())
}
// LastEventTime is the time of the last task event in the allocation.
@@ -10696,7 +10700,7 @@ func (a *Allocation) nextRescheduleTime(failTime time.Time, reschedulePolicy *Re
rescheduleEligible := reschedulePolicy.Unlimited || (reschedulePolicy.Attempts > 0 && a.RescheduleTracker == nil)
if reschedulePolicy.Attempts > 0 && a.RescheduleTracker != nil && a.RescheduleTracker.Events != nil {
// Check for eligibility based on the interval if max attempts is set
attempted, attempts := a.rescheduleInfo(reschedulePolicy, failTime)
attempted, attempts := a.RescheduleTracker.rescheduleInfo(reschedulePolicy, failTime)
rescheduleEligible = attempted < attempts && nextDelay < reschedulePolicy.Interval
}
return nextRescheduleTime, rescheduleEligible
@@ -11174,6 +11178,12 @@ func (a *AllocListStub) SetEventDisplayMessages() {
setDisplayMsg(a.TaskStates)
}
// RescheduleEligible returns if the allocation is eligible to be rescheduled according
// to its ReschedulePolicy and the current state of its reschedule trackers
func (a *AllocListStub) RescheduleEligible(reschedulePolicy *ReschedulePolicy, failTime time.Time) bool {
return a.RescheduleTracker.RescheduleEligible(reschedulePolicy, failTime)
}
func setDisplayMsg(taskStates map[string]*TaskState) {
for _, taskState := range taskStates {
for _, event := range taskState.Events {