From 50c9af53b770a691c9b21875a368ff31899c995e Mon Sep 17 00:00:00 2001 From: hc-github-team-nomad-core <82989552+hc-github-team-nomad-core@users.noreply.github.com> Date: Fri, 27 Oct 2023 10:20:53 -0500 Subject: [PATCH] backport of commit e8efe2d251bf3628f13c7eb3ce2422eb7e5b85f6 (#18884) Co-authored-by: Juana De La Cuesta --- nomad/structs/structs.go | 15 +- nomad/worker.go | 2 +- scheduler/generic_sched_test.go | 45 +++- scheduler/reconcile.go | 92 +++++--- scheduler/reconcile_test.go | 178 +++++++------- scheduler/reconcile_util.go | 103 ++++++--- scheduler/reconcile_util_test.go | 385 +++++++++++++++++++++++-------- 7 files changed, 560 insertions(+), 260 deletions(-) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 49829b74e..acfdbb954 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -134,6 +134,7 @@ const ( ) const ( + // SystemInitializationType is used for messages that initialize parts of // the system, such as the state store. These messages are not included in // the event stream. @@ -10730,6 +10731,12 @@ func (a *Allocation) MigrateStrategy() *MigrateStrategy { func (a *Allocation) NextRescheduleTime() (time.Time, bool) { failTime := a.LastEventTime() reschedulePolicy := a.ReschedulePolicy() + + //If reschedule is disabled, return early + if reschedulePolicy.Attempts == 0 && !reschedulePolicy.Unlimited { + return time.Time{}, false + } + if a.DesiredStatus == AllocDesiredStatusStop || a.ClientStatus != AllocClientStatusFailed || failTime.IsZero() || reschedulePolicy == nil { return time.Time{}, false } @@ -10749,16 +10756,16 @@ func (a *Allocation) nextRescheduleTime(failTime time.Time, reschedulePolicy *Re return nextRescheduleTime, rescheduleEligible } -// NextRescheduleTimeByFailTime works like NextRescheduleTime but allows callers +// NextRescheduleTimeByTime works like NextRescheduleTime but allows callers // specify a failure time. 
Useful for things like determining whether to reschedule // an alloc on a disconnected node. -func (a *Allocation) NextRescheduleTimeByFailTime(failTime time.Time) (time.Time, bool) { +func (a *Allocation) NextRescheduleTimeByTime(t time.Time) (time.Time, bool) { reschedulePolicy := a.ReschedulePolicy() if reschedulePolicy == nil { return time.Time{}, false } - return a.nextRescheduleTime(failTime, reschedulePolicy) + return a.nextRescheduleTime(t, reschedulePolicy) } // ShouldClientStop tests an alloc for StopAfterClientDisconnect configuration @@ -11098,7 +11105,7 @@ func (a *Allocation) Expired(now time.Time) bool { } expiry := lastUnknown.Add(*tg.MaxClientDisconnect) - return now.UTC().After(expiry) || now.UTC().Equal(expiry) + return expiry.Sub(now) <= 0 } // LastUnknown returns the timestamp for the last time the allocation diff --git a/nomad/worker.go b/nomad/worker.go index 928b73a9a..af4714256 100644 --- a/nomad/worker.go +++ b/nomad/worker.go @@ -786,7 +786,7 @@ SUBMIT: } return err } else { - w.logger.Debug("created evaluation", "eval", log.Fmt("%#v", eval)) + w.logger.Debug("created evaluation", "eval", log.Fmt("%#v", eval), "waitUntil", log.Fmt("%#v", eval.WaitUntil.String())) w.backoffReset() } return nil diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index fdfcb04d4..d8559ca3e 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -6944,6 +6944,7 @@ func TestServiceSched_Client_Disconnect_Creates_Updates_and_Evals(t *testing.T) NodeID: disconnectedNode.ID, Status: structs.EvalStatusPending, }} + nodeStatusUpdateEval := evals[0] require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), evals)) @@ -6953,16 +6954,21 @@ func TestServiceSched_Client_Disconnect_Creates_Updates_and_Evals(t *testing.T) require.Equal(t, structs.EvalStatusComplete, h.Evals[0].Status) require.Len(t, h.Plans, 1, "plan") - // One followup delayed eval created - require.Len(t, h.CreateEvals, 1) - 
followUpEval := h.CreateEvals[0] - require.Equal(t, nodeStatusUpdateEval.ID, followUpEval.PreviousEval) - require.Equal(t, "pending", followUpEval.Status) - require.NotEmpty(t, followUpEval.WaitUntil) + // Two followup delayed eval created + require.Len(t, h.CreateEvals, 2) + followUpEval1 := h.CreateEvals[0] + require.Equal(t, nodeStatusUpdateEval.ID, followUpEval1.PreviousEval) + require.Equal(t, "pending", followUpEval1.Status) + require.NotEmpty(t, followUpEval1.WaitUntil) - // Insert eval in the state store + followUpEval2 := h.CreateEvals[1] + require.Equal(t, nodeStatusUpdateEval.ID, followUpEval2.PreviousEval) + require.Equal(t, "pending", followUpEval2.Status) + require.NotEmpty(t, followUpEval2.WaitUntil) + + // Insert eval1 in the state store testutil.WaitForResult(func() (bool, error) { - found, err := h.State.EvalByID(nil, followUpEval.ID) + found, err := h.State.EvalByID(nil, followUpEval1.ID) if err != nil { return false, err } @@ -6976,12 +6982,34 @@ func TestServiceSched_Client_Disconnect_Creates_Updates_and_Evals(t *testing.T) return true, nil }, func(err error) { + + require.NoError(t, err) + }) + + // Insert eval2 in the state store + testutil.WaitForResult(func() (bool, error) { + found, err := h.State.EvalByID(nil, followUpEval2.ID) + if err != nil { + return false, err + } + if found == nil { + return false, nil + } + + require.Equal(t, nodeStatusUpdateEval.ID, found.PreviousEval) + require.Equal(t, "pending", found.Status) + require.NotEmpty(t, found.WaitUntil) + + return true, nil + }, func(err error) { + require.NoError(t, err) }) // Validate that the ClientStatus updates are part of the plan. require.Len(t, h.Plans[0].NodeAllocation[disconnectedNode.ID], count) // Pending update should have unknown status. 
+ for _, nodeAlloc := range h.Plans[0].NodeAllocation[disconnectedNode.ID] { require.Equal(t, nodeAlloc.ClientStatus, structs.AllocClientStatusUnknown) } @@ -6991,6 +7019,7 @@ func TestServiceSched_Client_Disconnect_Creates_Updates_and_Evals(t *testing.T) require.NoError(t, err, "plan.NodeUpdate") // Validate that the StateStore Upsert applied the ClientStatus we specified. + for _, alloc := range unknownAllocs { alloc, err = h.State.AllocByID(nil, alloc.ID) require.NoError(t, err) diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index ace47e4da..0b41c0ff5 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -230,6 +230,7 @@ func (a *allocReconciler) computeDeploymentComplete(m allocMatrix) bool { groupComplete := a.computeGroup(group, as) complete = complete && groupComplete } + return complete } @@ -398,6 +399,7 @@ func (a *allocReconciler) markDelayed(allocs allocSet, clientStatus, statusDescr // computeGroup reconciles state for a particular task group. It returns whether // the deployment it is for is complete with regards to the task group. 
func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool { + // Create the desired update object for the group desiredChanges := new(structs.DesiredUpdates) a.result.desiredTGUpdates[groupName] = desiredChanges @@ -426,6 +428,9 @@ func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool { untainted, migrate, lost, disconnecting, reconnecting, ignore := all.filterByTainted(a.taintedNodes, a.supportsDisconnectedClients, a.now) desiredChanges.Ignore += uint64(len(ignore)) + // Determine what set of terminal allocations need to be rescheduled + untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, false, a.now, a.evalID, a.deployment) + // If there are allocations reconnecting we need to reconcile them and // their replacements first because there is specific logic when deciding // which ones to keep that can only be applied when the client reconnects. @@ -454,20 +459,28 @@ func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool { untainted = untainted.union(reconnect) } - // Determine what set of terminal allocations need to be rescheduled - untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, false, a.now, a.evalID, a.deployment) + // Determine what set of disconnecting allocations need to be rescheduled now, + // which ones later and which ones can't be rescheduled at all. 
+ timeoutLaterEvals := map[string]string{} + if len(disconnecting) > 0 { + untaintedDisconnecting, rescheduleDisconnecting, laterDisconnecting := disconnecting.filterByRescheduleable(a.batch, true, a.now, a.evalID, a.deployment) - // Determine what set of disconnecting allocations need to be rescheduled - _, rescheduleDisconnecting, _ := disconnecting.filterByRescheduleable(a.batch, true, a.now, a.evalID, a.deployment) - rescheduleNow = rescheduleNow.union(rescheduleDisconnecting) + rescheduleNow = rescheduleNow.union(rescheduleDisconnecting) + untainted = untainted.union(untaintedDisconnecting) + rescheduleLater = append(rescheduleLater, laterDisconnecting...) + + // Find delays for any disconnecting allocs that have max_client_disconnect, + // create followup evals, and update the ClientStatus to unknown. + timeoutLaterEvals = a.createTimeoutLaterEvals(disconnecting, tg.Name) + } // Find delays for any lost allocs that have stop_after_client_disconnect - lostLater := lost.delayByStopAfterClientDisconnect() - lostLaterEvals := a.createLostLaterEvals(lostLater, tg.Name) - - // Find delays for any disconnecting allocs that have max_client_disconnect, - // create followup evals, and update the ClientStatus to unknown. - timeoutLaterEvals := a.createTimeoutLaterEvals(disconnecting, tg.Name) + lostLaterEvals := map[string]string{} + lostLater := []*delayedRescheduleInfo{} + if len(lost) > 0 { + lostLater = lost.delayByStopAfterClientDisconnect() + lostLaterEvals = a.createLostLaterEvals(lostLater, tg.Name) + } // Merge disconnecting with the stop_after_client_disconnect set into the // lostLaterEvals so that computeStop can add them to the stop set. @@ -486,13 +499,15 @@ func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool { // include stopped allocations. 
isCanarying := dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted stop := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, isCanarying, lostLaterEvals) + desiredChanges.Stop += uint64(len(stop)) untainted = untainted.difference(stop) // Do inplace upgrades where possible and capture the set of upgrades that // need to be done destructively. - ignore, inplace, destructive := a.computeUpdates(tg, untainted) - desiredChanges.Ignore += uint64(len(ignore)) + ignoreUpdates, inplace, destructive := a.computeUpdates(tg, untainted) + + desiredChanges.Ignore += uint64(len(ignoreUpdates)) desiredChanges.InPlaceUpdate += uint64(len(inplace)) if !existingDeployment { dstate.DesiredTotal += len(destructive) + len(inplace) @@ -796,7 +811,8 @@ func (a *allocReconciler) computeReplacements(deploymentPlaceReady bool, desired // replacements based off that. failed := make(allocSet) for id, alloc := range rescheduleNow { - if _, ok := a.result.disconnectUpdates[id]; !ok { + _, ok := a.result.disconnectUpdates[id] + if !ok && alloc.ClientStatus != structs.AllocClientStatusUnknown { failed[id] = alloc } } @@ -968,8 +984,11 @@ func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *alloc untainted = untainted.difference(canaries) } + // Remove disconnected allocations so they won't be stopped + knownUntainted := untainted.filterOutByClientStatus(structs.AllocClientStatusUnknown) + // Hot path the nothing to do case - remove := len(untainted) + len(migrate) - group.Count + remove := len(knownUntainted) + len(migrate) - group.Count if remove <= 0 { return stop } @@ -1072,7 +1091,7 @@ func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *alloc // - If the reconnecting allocation is to be stopped, its replacements may // not be present in any of the returned sets. The rest of the reconciler // logic will handle them. 
-func (a *allocReconciler) reconcileReconnecting(reconnecting allocSet, others allocSet) (allocSet, allocSet) { +func (a *allocReconciler) reconcileReconnecting(reconnecting allocSet, all allocSet) (allocSet, allocSet) { stop := make(allocSet) reconnect := make(allocSet) @@ -1111,14 +1130,11 @@ func (a *allocReconciler) reconcileReconnecting(reconnecting allocSet, others al // Find replacement allocations and decide which one to stop. A // reconnecting allocation may have multiple replacements. - for _, replacementAlloc := range others { + for _, replacementAlloc := range all { // Skip allocations that are not a replacement of the one - // reconnecting. Replacement allocations have the same name but a - // higher CreateIndex and a different ID. - isReplacement := replacementAlloc.ID != reconnectingAlloc.ID && - replacementAlloc.Name == reconnectingAlloc.Name && - replacementAlloc.CreateIndex > reconnectingAlloc.CreateIndex + // reconnecting. + isReplacement := replacementAlloc.ID == reconnectingAlloc.NextAllocation // Skip allocations that are server terminal. // We don't want to replace a reconnecting allocation with one that @@ -1142,12 +1158,14 @@ func (a *allocReconciler) reconcileReconnecting(reconnecting allocSet, others al } } else { // The reconnecting allocation is preferred, so stop this - // replacement. 
- stop[replacementAlloc.ID] = replacementAlloc - a.result.stop = append(a.result.stop, allocStopResult{ - alloc: replacementAlloc, - statusDescription: allocReconnected, - }) + // replacement, but avoid re-stopping stopped allocs + if replacementAlloc.ClientStatus != structs.AllocClientStatusFailed { + stop[replacementAlloc.ID] = replacementAlloc + a.result.stop = append(a.result.stop, allocStopResult{ + alloc: replacementAlloc, + statusDescription: allocReconnected, + }) + } } } } @@ -1235,11 +1253,17 @@ func (a *allocReconciler) createRescheduleLaterEvals(rescheduleLater []*delayedR allocIDToFollowupEvalID := a.createLostLaterEvals(rescheduleLater, tgName) // Create updates that will be applied to the allocs to mark the FollowupEvalID - for allocID, evalID := range allocIDToFollowupEvalID { - existingAlloc := all[allocID] + for _, laterAlloc := range rescheduleLater { + existingAlloc := all[laterAlloc.alloc.ID] updatedAlloc := existingAlloc.Copy() - updatedAlloc.FollowupEvalID = evalID - a.result.attributeUpdates[updatedAlloc.ID] = updatedAlloc + updatedAlloc.FollowupEvalID = allocIDToFollowupEvalID[laterAlloc.alloc.ID] + + // Can't updated an allocation that is disconnected + if _, ok := a.result.disconnectUpdates[laterAlloc.allocID]; !ok { + a.result.attributeUpdates[laterAlloc.allocID] = updatedAlloc + } else { + a.result.disconnectUpdates[laterAlloc.allocID].FollowupEvalID = allocIDToFollowupEvalID[laterAlloc.alloc.ID] + } } } @@ -1355,8 +1379,8 @@ func (a *allocReconciler) createTimeoutLaterEvals(disconnecting allocSet, tgName } timeoutDelays, err := disconnecting.delayByMaxClientDisconnect(a.now) - if err != nil || len(timeoutDelays) != len(disconnecting) { - a.logger.Error("error computing disconnecting timeouts for task_group", + if err != nil { + a.logger.Error("error for task_group", "task_group", tgName, "error", err) return map[string]string{} } diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index b1c92e88c..6b84974f6 100644 
--- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -5327,7 +5327,6 @@ func TestReconciler_Disconnected_Client(t *testing.T) { nodeScoreIncrement float64 disconnectedAllocStatus string disconnectedAllocStates []*structs.AllocState - serverDesiredStatus string isBatch bool nodeStatusDisconnected bool replace bool @@ -5342,13 +5341,13 @@ func TestReconciler_Disconnected_Client(t *testing.T) { testCases := []testCase{ { - name: "reconnect-original-no-replacement", - allocCount: 2, - replace: false, - disconnectedAllocCount: 2, - disconnectedAllocStatus: structs.AllocClientStatusRunning, + name: "reconnect-original-no-replacement", + allocCount: 2, + replace: false, + disconnectedAllocCount: 2, + disconnectedAllocStatus: structs.AllocClientStatusRunning, + disconnectedAllocStates: disconnectAllocState, - serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: false, expected: &resultExpectation{ reconnectUpdates: 2, @@ -5360,13 +5359,13 @@ func TestReconciler_Disconnected_Client(t *testing.T) { }, }, { - name: "resume-original-and-stop-replacement", - allocCount: 3, - replace: true, - disconnectedAllocCount: 1, - disconnectedAllocStatus: structs.AllocClientStatusRunning, + name: "resume-original-and-stop-replacement", + allocCount: 3, + replace: true, + disconnectedAllocCount: 1, + disconnectedAllocStatus: structs.AllocClientStatusRunning, + disconnectedAllocStates: disconnectAllocState, - serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: false, expected: &resultExpectation{ stop: 1, @@ -5380,13 +5379,13 @@ func TestReconciler_Disconnected_Client(t *testing.T) { }, }, { - name: "stop-original-with-lower-node-score", - allocCount: 4, - replace: true, - disconnectedAllocCount: 1, - disconnectedAllocStatus: structs.AllocClientStatusRunning, + name: "stop-original-with-lower-node-score", + allocCount: 4, + replace: true, + disconnectedAllocCount: 1, + disconnectedAllocStatus: 
structs.AllocClientStatusRunning, + disconnectedAllocStates: disconnectAllocState, - serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: true, nodeScoreIncrement: 1, expected: &resultExpectation{ @@ -5400,13 +5399,13 @@ func TestReconciler_Disconnected_Client(t *testing.T) { }, }, { - name: "stop-original-failed-on-reconnect", - allocCount: 4, - replace: true, - disconnectedAllocCount: 2, - disconnectedAllocStatus: structs.AllocClientStatusFailed, + name: "stop-original-failed-on-reconnect", + allocCount: 4, + replace: true, + disconnectedAllocCount: 2, + disconnectedAllocStatus: structs.AllocClientStatusFailed, + disconnectedAllocStates: disconnectAllocState, - serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: true, expected: &resultExpectation{ stop: 2, @@ -5419,13 +5418,13 @@ func TestReconciler_Disconnected_Client(t *testing.T) { }, }, { - name: "reschedule-original-failed-if-not-replaced", - allocCount: 4, - replace: false, - disconnectedAllocCount: 2, - disconnectedAllocStatus: structs.AllocClientStatusFailed, + name: "reschedule-original-failed-if-not-replaced", + allocCount: 4, + replace: false, + disconnectedAllocCount: 2, + disconnectedAllocStatus: structs.AllocClientStatusFailed, + disconnectedAllocStates: disconnectAllocState, - serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: true, expected: &resultExpectation{ stop: 2, @@ -5445,13 +5444,15 @@ func TestReconciler_Disconnected_Client(t *testing.T) { replace: false, disconnectedAllocCount: 2, disconnectedAllocStatus: structs.AllocClientStatusComplete, + disconnectedAllocStates: disconnectAllocState, - serverDesiredStatus: structs.AllocDesiredStatusRun, isBatch: true, expected: &resultExpectation{ + place: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { Ignore: 2, + Place: 2, }, }, }, @@ -5463,15 +5464,14 @@ func TestReconciler_Disconnected_Client(t *testing.T) { failReplacement: true, 
disconnectedAllocCount: 2, disconnectedAllocStatus: structs.AllocClientStatusRunning, + disconnectedAllocStates: disconnectAllocState, - serverDesiredStatus: structs.AllocDesiredStatusRun, expected: &resultExpectation{ reconnectUpdates: 2, - stop: 2, + stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { - Ignore: 3, - Stop: 2, + Ignore: 5, }, }, }, @@ -5483,8 +5483,8 @@ func TestReconciler_Disconnected_Client(t *testing.T) { disconnectReplacement: true, disconnectedAllocCount: 1, disconnectedAllocStatus: structs.AllocClientStatusRunning, + disconnectedAllocStates: disconnectAllocState, - serverDesiredStatus: structs.AllocDesiredStatusRun, expected: &resultExpectation{ reconnectUpdates: 1, stop: 1, @@ -5503,8 +5503,8 @@ func TestReconciler_Disconnected_Client(t *testing.T) { taintReplacement: true, disconnectedAllocCount: 2, disconnectedAllocStatus: structs.AllocClientStatusRunning, + disconnectedAllocStates: disconnectAllocState, - serverDesiredStatus: structs.AllocDesiredStatusRun, expected: &resultExpectation{ reconnectUpdates: 2, stop: 2, @@ -5517,13 +5517,13 @@ func TestReconciler_Disconnected_Client(t *testing.T) { }, }, { - name: "stop-original-alloc-with-old-job-version", - allocCount: 5, - replace: true, - disconnectedAllocCount: 2, - disconnectedAllocStatus: structs.AllocClientStatusRunning, + name: "stop-original-alloc-with-old-job-version", + allocCount: 5, + replace: true, + disconnectedAllocCount: 2, + disconnectedAllocStatus: structs.AllocClientStatusRunning, + disconnectedAllocStates: disconnectAllocState, - serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: true, jobVersionIncrement: 1, expected: &resultExpectation{ @@ -5537,13 +5537,13 @@ func TestReconciler_Disconnected_Client(t *testing.T) { }, }, { - name: "stop-original-alloc-with-old-job-version-reconnect-eval", - allocCount: 5, - replace: true, - disconnectedAllocCount: 2, - disconnectedAllocStatus: structs.AllocClientStatusRunning, + name: 
"stop-original-alloc-with-old-job-version-reconnect-eval", + allocCount: 5, + replace: true, + disconnectedAllocCount: 2, + disconnectedAllocStatus: structs.AllocClientStatusRunning, + disconnectedAllocStates: disconnectAllocState, - serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: true, jobVersionIncrement: 1, expected: &resultExpectation{ @@ -5557,35 +5557,36 @@ func TestReconciler_Disconnected_Client(t *testing.T) { }, }, { - name: "stop-original-alloc-with-old-job-version-and-failed-replacements-replaced", - allocCount: 5, - replace: true, - failReplacement: true, - replaceFailedReplacement: true, - disconnectedAllocCount: 2, - disconnectedAllocStatus: structs.AllocClientStatusRunning, + name: "stop-original-alloc-with-old-job-version-and-failed-replacements-replaced", + allocCount: 5, + replace: true, + failReplacement: true, + replaceFailedReplacement: true, + disconnectedAllocCount: 2, + disconnectedAllocStatus: structs.AllocClientStatusRunning, + disconnectedAllocStates: disconnectAllocState, - serverDesiredStatus: structs.AllocDesiredStatusRun, - shouldStopOnDisconnectedNode: true, + shouldStopOnDisconnectedNode: false, jobVersionIncrement: 1, expected: &resultExpectation{ - stop: 2, + stop: 2, + reconnectUpdates: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { Stop: 2, - Ignore: 5, + Ignore: 7, }, }, }, }, { - name: "stop-original-pending-alloc-for-disconnected-node", - allocCount: 2, - replace: true, - disconnectedAllocCount: 1, - disconnectedAllocStatus: structs.AllocClientStatusPending, + name: "stop-original-pending-alloc-for-disconnected-node", + allocCount: 2, + replace: true, + disconnectedAllocCount: 1, + disconnectedAllocStatus: structs.AllocClientStatusPending, + disconnectedAllocStates: disconnectAllocState, - serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: true, nodeStatusDisconnected: true, expected: &resultExpectation{ @@ -5599,23 +5600,23 @@ func 
TestReconciler_Disconnected_Client(t *testing.T) { }, }, { - name: "stop-failed-original-and-failed-replacements-and-place-new", - allocCount: 5, - replace: true, - failReplacement: true, - disconnectedAllocCount: 2, - disconnectedAllocStatus: structs.AllocClientStatusFailed, + name: "stop-failed-original-and-failed-replacements-and-place-new", + allocCount: 5, + replace: true, + failReplacement: true, + disconnectedAllocCount: 2, + disconnectedAllocStatus: structs.AllocClientStatusFailed, + disconnectedAllocStates: disconnectAllocState, - serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: true, expected: &resultExpectation{ - stop: 4, + stop: 2, place: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { - Stop: 4, + Stop: 2, Place: 2, - Ignore: 3, + Ignore: 5, }, }, }, @@ -5627,7 +5628,6 @@ func TestReconciler_Disconnected_Client(t *testing.T) { disconnectedAllocCount: 2, disconnectedAllocStatus: structs.AllocClientStatusUnknown, disconnectedAllocStates: disconnectAllocState, - serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: true, nodeStatusDisconnected: true, maxDisconnect: pointer.Of(2 * time.Second), @@ -5648,7 +5648,6 @@ func TestReconciler_Disconnected_Client(t *testing.T) { disconnectedAllocCount: 2, disconnectedAllocStatus: structs.AllocClientStatusRunning, disconnectedAllocStates: []*structs.AllocState{}, - serverDesiredStatus: structs.AllocDesiredStatusRun, nodeStatusDisconnected: true, expected: &resultExpectation{ place: 2, @@ -5687,7 +5686,7 @@ func TestReconciler_Disconnected_Client(t *testing.T) { // Set alloc state disconnectedAllocCount := tc.disconnectedAllocCount for _, alloc := range allocs { - alloc.DesiredStatus = tc.serverDesiredStatus + alloc.DesiredStatus = structs.AllocDesiredStatusRun if tc.maxDisconnect != nil { alloc.Job.TaskGroups[0].MaxClientDisconnect = tc.maxDisconnect @@ -5699,7 +5698,6 @@ func TestReconciler_Disconnected_Client(t *testing.T) { // Set 
the node id on all the disconnected allocs to the node under test. alloc.NodeID = testNode.ID alloc.NodeName = "disconnected" - disconnectedAllocCount-- } } @@ -5791,6 +5789,12 @@ func TestReconciler_Disconnected_Client(t *testing.T) { // Tests that a client disconnect while a canary is in progress generates the result. func TestReconciler_Client_Disconnect_Canaries(t *testing.T) { + disconnectAllocState := []*structs.AllocState{{ + Field: structs.AllocStateFieldClientStatus, + Value: structs.AllocClientStatusUnknown, + Time: time.Now(), + }} + type testCase struct { name string nodes []string @@ -5883,7 +5887,7 @@ func TestReconciler_Client_Disconnect_Canaries(t *testing.T) { updatedJob.TaskGroups[0].Name: { Place: 3, Canary: 0, - Ignore: 3, + Ignore: 6, }, }, }, @@ -5947,7 +5951,7 @@ func TestReconciler_Client_Disconnect_Canaries(t *testing.T) { updatedJob.TaskGroups[0].Name: { Place: 2, Canary: 0, - Ignore: 4, + Ignore: 7, }, }, }, @@ -6013,7 +6017,7 @@ func TestReconciler_Client_Disconnect_Canaries(t *testing.T) { updatedJob.TaskGroups[0].Name: { Place: 2, Canary: 0, - Ignore: 3, + Ignore: 6, // The 2 stops in this test are transient failures, but // the deployment can still progress. 
We don't include // them in the stop count since DesiredTGUpdates is used @@ -6083,6 +6087,12 @@ func TestReconciler_Client_Disconnect_Canaries(t *testing.T) { if alloc.ClientStatus == structs.AllocClientStatusRunning { alloc.DeploymentStatus.Healthy = pointer.Of(true) } + + if alloc.ClientStatus == structs.AllocClientStatusUnknown { + alloc.AllocStates = disconnectAllocState + alloc.FollowupEvalID = "eval-where-it-was-set-to-unknow" + } + tc.deploymentState.PlacedCanaries = append(tc.deploymentState.PlacedCanaries, alloc.ID) handled[alloc.ID] = allocUpdateFnIgnore canariesConfigured++ diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go index ad971eed9..87e431e0a 100644 --- a/scheduler/reconcile_util.go +++ b/scheduler/reconcile_util.go @@ -9,6 +9,7 @@ package scheduler // all scheduler types before moving it into util.go import ( + "errors" "fmt" "sort" "strings" @@ -264,10 +265,11 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS taintedNode, nodeIsTainted := taintedNodes[alloc.NodeID] if taintedNode != nil { - // Group disconnecting/reconnecting + // Group disconnecting switch taintedNode.Status { case structs.NodeStatusDisconnected: if supportsDisconnectedClients { + // Filter running allocs on a node that is disconnected to be marked as unknown. if alloc.ClientStatus == structs.AllocClientStatusRunning { disconnecting[alloc.ID] = alloc @@ -289,6 +291,7 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS lost[alloc.ID] = alloc continue } + reconnecting[alloc.ID] = alloc continue } @@ -296,9 +299,16 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS } } - // Terminal allocs, if not reconnect, are always untainted as they - // should never be migrated. 
if alloc.TerminalStatus() && !reconnect { + // Terminal allocs, if supportsDisconnectedClient and not reconnect, + // are probably stopped replacements and should be ignored + if supportsDisconnectedClients { + ignore[alloc.ID] = alloc + continue + } + + // Terminal allocs, if not reconnect, are always untainted as they + // should never be migrated. untainted[alloc.ID] = alloc continue } @@ -315,11 +325,11 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS continue } - // Ignore unknown allocs that we want to reconnect eventually. + // Acknowledge unknown allocs that we want to reconnect eventually. if supportsDisconnectedClients && alloc.ClientStatus == structs.AllocClientStatusUnknown && alloc.DesiredStatus == structs.AllocDesiredStatusRun { - ignore[alloc.ID] = alloc + untainted[alloc.ID] = alloc continue } @@ -366,12 +376,11 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS // untainted or a set of allocations that must be rescheduled now. Allocations that can be rescheduled // at a future time are also returned so that we can create follow up evaluations for them. Allocs are // skipped or considered untainted according to logic defined in shouldFilter method. -func (a allocSet) filterByRescheduleable(isBatch, isDisconnecting bool, now time.Time, evalID string, deployment *structs.Deployment) (untainted, rescheduleNow allocSet, rescheduleLater []*delayedRescheduleInfo) { - untainted = make(map[string]*structs.Allocation) - rescheduleNow = make(map[string]*structs.Allocation) +func (a allocSet) filterByRescheduleable(isBatch, isDisconnecting bool, now time.Time, evalID string, deployment *structs.Deployment) (allocSet, allocSet, []*delayedRescheduleInfo) { + untainted := make(map[string]*structs.Allocation) + rescheduleNow := make(map[string]*structs.Allocation) + rescheduleLater := []*delayedRescheduleInfo{} - // When filtering disconnected sets, the untainted set is never populated. 
- // It has no purpose in that context. for _, alloc := range a { // Ignore disconnecting allocs that are already unknown. This can happen // in the case of canaries that are interrupted by a disconnect. @@ -393,25 +402,27 @@ func (a allocSet) filterByRescheduleable(isBatch, isDisconnecting bool, now time if isUntainted && !isDisconnecting { untainted[alloc.ID] = alloc } - if isUntainted || ignore { + + if ignore { continue } - // Only failed allocs with desired state run get to this point - // If the failed alloc is not eligible for rescheduling now we - // add it to the untainted set. Disconnecting delay evals are - // handled by allocReconciler.createTimeoutLaterEvals eligibleNow, eligibleLater, rescheduleTime = updateByReschedulable(alloc, now, evalID, deployment, isDisconnecting) - if !isDisconnecting && !eligibleNow { - untainted[alloc.ID] = alloc - if eligibleLater { - rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime}) - } - } else { + if eligibleNow { rescheduleNow[alloc.ID] = alloc + continue } + + // If the failed alloc is not eligible for rescheduling now we + // add it to the untainted set. + untainted[alloc.ID] = alloc + + if eligibleLater { + rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime}) + } + } - return + return untainted, rescheduleNow, rescheduleLater } // shouldFilter returns whether the alloc should be ignored or considered untainted. 
@@ -436,32 +447,31 @@ func shouldFilter(alloc *structs.Allocation, isBatch bool) (untainted, ignore bo if alloc.RanSuccessfully() { return true, false } + return false, true case structs.AllocDesiredStatusEvict: return false, true - default: } switch alloc.ClientStatus { case structs.AllocClientStatusFailed: - default: - return true, false + return false, false } - return false, false + + return true, false } // Handle service jobs switch alloc.DesiredStatus { case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict: return false, true - default: } switch alloc.ClientStatus { case structs.AllocClientStatusComplete, structs.AllocClientStatusLost: return false, true - default: } + return false, false } @@ -481,9 +491,15 @@ func updateByReschedulable(alloc *structs.Allocation, now time.Time, evalID stri // Reschedule if the eval ID matches the alloc's followup evalID or if its close to its reschedule time var eligible bool - if isDisconnecting { - rescheduleTime, eligible = alloc.NextRescheduleTimeByFailTime(now) - } else { + switch { + case isDisconnecting: + rescheduleTime, eligible = alloc.NextRescheduleTimeByTime(now) + + case alloc.ClientStatus == structs.AllocClientStatusUnknown && alloc.FollowupEvalID == evalID: + lastDisconnectTime := alloc.LastUnknown() + rescheduleTime, eligible = alloc.NextRescheduleTimeByTime(lastDisconnectTime) + + default: rescheduleTime, eligible = alloc.NextRescheduleTime() } @@ -491,9 +507,11 @@ func updateByReschedulable(alloc *structs.Allocation, now time.Time, evalID stri rescheduleNow = true return } - if eligible && alloc.FollowupEvalID == "" { + + if eligible && (alloc.FollowupEvalID == "" || isDisconnecting) { rescheduleLater = true } + return } @@ -547,12 +565,13 @@ func (a allocSet) delayByStopAfterClientDisconnect() (later []*delayedReschedule // delayByMaxClientDisconnect returns a delay for any unknown allocation // that's got a max_client_reconnect configured -func (a allocSet) delayByMaxClientDisconnect(now 
time.Time) (later []*delayedRescheduleInfo, err error) {
+func (a allocSet) delayByMaxClientDisconnect(now time.Time) ([]*delayedRescheduleInfo, error) {
+	var later []*delayedRescheduleInfo
+
 	for _, alloc := range a {
 		timeout := alloc.DisconnectTimeout(now)
 
-		if !timeout.After(now) {
-			continue
+			return nil, errors.New("unable to compute disconnecting timeouts")
 		}
 
 		later = append(later, &delayedRescheduleInfo{
@@ -562,7 +581,19 @@ func (a allocSet) delayByMaxClientDisconnect(now time.Time) (later []*delayedRes
 	})
 	}
 
-	return
+	return later, nil
+}
+
+// filterOutByClientStatus returns all allocs from the set without the specified client status.
+func (a allocSet) filterOutByClientStatus(clientStatus string) allocSet {
+	allocs := make(allocSet)
+	for _, alloc := range a {
+		if alloc.ClientStatus != clientStatus {
+			allocs[alloc.ID] = alloc
+		}
+	}
+
+	return allocs
}

 // filterByClientStatus returns allocs from the set with the specified client status.
diff --git a/scheduler/reconcile_util_test.go b/scheduler/reconcile_util_test.go
index 002ee0d5b..6ee719894 100644
--- a/scheduler/reconcile_util_test.go
+++ b/scheduler/reconcile_util_test.go
@@ -237,7 +237,6 @@ func TestAllocSet_filterByTainted(t *testing.T) {
 		},
 	},
 },
-
{
 	name: "disco-client-disconnect-unset-max-disconnect",
 	supportsDisconnectedClients: true,
@@ -273,7 +272,6 @@ func TestAllocSet_filterByTainted(t *testing.T) {
 		},
 	},
 },
-
// Everything below this line tests the disconnected client mode. 
{ name: "disco-client-untainted-reconnect-failed-and-replaced", @@ -381,10 +379,10 @@ func TestAllocSet_filterByTainted(t *testing.T) { taintedNodes: nodes, skipNilNodeTest: false, all: allocSet{ - // Allocs on reconnected nodes that are complete are untainted - "untainted-reconnect-complete": { - ID: "untainted-reconnect-complete", - Name: "untainted-reconnect-complete", + // Allocs on reconnected nodes that are complete are ignored + "ignored-reconnect-complete": { + ID: "ignored-reconnect-complete", + Name: "ignored-reconnect-complete", ClientStatus: structs.AllocClientStatusComplete, DesiredStatus: structs.AllocDesiredStatusRun, Job: testJob, @@ -405,9 +403,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { AllocStates: unknownAllocState, }, // Lost allocs on reconnected nodes don't get restarted - "untainted-reconnect-lost": { - ID: "untainted-reconnect-lost", - Name: "untainted-reconnect-lost", + "ignored-reconnect-lost": { + ID: "ignored-reconnect-lost", + Name: "ignored-reconnect-lost", ClientStatus: structs.AllocClientStatusLost, DesiredStatus: structs.AllocDesiredStatusStop, Job: testJob, @@ -415,10 +413,10 @@ func TestAllocSet_filterByTainted(t *testing.T) { TaskGroup: "web", AllocStates: unknownAllocState, }, - // Replacement allocs that are complete are untainted - "untainted-reconnect-complete-replacement": { - ID: "untainted-reconnect-complete-replacement", - Name: "untainted-reconnect-complete", + // Replacement allocs that are complete are ignored + "ignored-reconnect-complete-replacement": { + ID: "ignored-reconnect-complete-replacement", + Name: "ignored-reconnect-complete", ClientStatus: structs.AllocClientStatusComplete, DesiredStatus: structs.AllocDesiredStatusRun, Job: testJob, @@ -427,10 +425,10 @@ func TestAllocSet_filterByTainted(t *testing.T) { AllocStates: unknownAllocState, PreviousAllocation: "untainted-reconnect-complete", }, - // Replacement allocs on reconnected nodes that are failed are untainted - 
"untainted-reconnect-failed-replacement": { - ID: "untainted-reconnect-failed-replacement", - Name: "untainted-reconnect-failed", + // Replacement allocs on reconnected nodes that are failed are ignored + "ignored-reconnect-failed-replacement": { + ID: "ignored-reconnect-failed-replacement", + Name: "ignored-reconnect-failed", ClientStatus: structs.AllocClientStatusFailed, DesiredStatus: structs.AllocDesiredStatusStop, Job: testJob, @@ -439,63 +437,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { PreviousAllocation: "reconnecting-failed", }, // Lost replacement allocs on reconnected nodes don't get restarted - "untainted-reconnect-lost-replacement": { - ID: "untainted-reconnect-lost-replacement", - Name: "untainted-reconnect-lost", - ClientStatus: structs.AllocClientStatusLost, - DesiredStatus: structs.AllocDesiredStatusStop, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - AllocStates: unknownAllocState, - PreviousAllocation: "untainted-reconnect-lost", - }, - }, - untainted: allocSet{ - "untainted-reconnect-complete": { - ID: "untainted-reconnect-complete", - Name: "untainted-reconnect-complete", - ClientStatus: structs.AllocClientStatusComplete, - DesiredStatus: structs.AllocDesiredStatusRun, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - AllocStates: unknownAllocState, - }, - "untainted-reconnect-lost": { - ID: "untainted-reconnect-lost", - Name: "untainted-reconnect-lost", - ClientStatus: structs.AllocClientStatusLost, - DesiredStatus: structs.AllocDesiredStatusStop, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - AllocStates: unknownAllocState, - }, - "untainted-reconnect-complete-replacement": { - ID: "untainted-reconnect-complete-replacement", - Name: "untainted-reconnect-complete", - ClientStatus: structs.AllocClientStatusComplete, - DesiredStatus: structs.AllocDesiredStatusRun, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - AllocStates: unknownAllocState, - PreviousAllocation: "untainted-reconnect-complete", 
- }, - "untainted-reconnect-failed-replacement": { - ID: "untainted-reconnect-failed-replacement", - Name: "untainted-reconnect-failed", - ClientStatus: structs.AllocClientStatusFailed, - DesiredStatus: structs.AllocDesiredStatusStop, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - PreviousAllocation: "reconnecting-failed", - }, - "untainted-reconnect-lost-replacement": { - ID: "untainted-reconnect-lost-replacement", - Name: "untainted-reconnect-lost", + "ignored-reconnect-lost-replacement": { + ID: "ignored-reconnect-lost-replacement", + Name: "ignored-reconnect-lost", ClientStatus: structs.AllocClientStatusLost, DesiredStatus: structs.AllocDesiredStatusStop, Job: testJob, @@ -505,6 +449,7 @@ func TestAllocSet_filterByTainted(t *testing.T) { PreviousAllocation: "untainted-reconnect-lost", }, }, + untainted: allocSet{}, migrate: allocSet{}, disconnecting: allocSet{}, reconnecting: allocSet{ @@ -519,8 +464,62 @@ func TestAllocSet_filterByTainted(t *testing.T) { AllocStates: unknownAllocState, }, }, - ignore: allocSet{}, - lost: allocSet{}, + ignore: allocSet{ + + "ignored-reconnect-complete": { + ID: "ignored-reconnect-complete", + Name: "ignored-reconnect-complete", + ClientStatus: structs.AllocClientStatusComplete, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + AllocStates: unknownAllocState, + }, + "ignored-reconnect-lost": { + ID: "ignored-reconnect-lost", + Name: "ignored-reconnect-lost", + ClientStatus: structs.AllocClientStatusLost, + DesiredStatus: structs.AllocDesiredStatusStop, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + AllocStates: unknownAllocState, + }, + "ignored-reconnect-complete-replacement": { + ID: "ignored-reconnect-complete-replacement", + Name: "ignored-reconnect-complete", + ClientStatus: structs.AllocClientStatusComplete, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + AllocStates: unknownAllocState, + 
PreviousAllocation: "untainted-reconnect-complete", + }, + "ignored-reconnect-failed-replacement": { + ID: "ignored-reconnect-failed-replacement", + Name: "ignored-reconnect-failed", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusStop, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + PreviousAllocation: "reconnecting-failed", + }, + "ignored-reconnect-lost-replacement": { + ID: "ignored-reconnect-lost-replacement", + Name: "ignored-reconnect-lost", + ClientStatus: structs.AllocClientStatusLost, + DesiredStatus: structs.AllocDesiredStatusStop, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + AllocStates: unknownAllocState, + PreviousAllocation: "untainted-reconnect-lost", + }, + }, + lost: allocSet{}, }, { name: "disco-client-disconnect", @@ -539,10 +538,10 @@ func TestAllocSet_filterByTainted(t *testing.T) { NodeID: "disconnected", TaskGroup: "web", }, - // Unknown allocs on disconnected nodes are ignored - "ignore-unknown": { - ID: "ignore-unknown", - Name: "ignore-unknown", + // Unknown allocs on disconnected nodes are acknowledge, so they wont be rescheduled again + "untainted-unknown": { + ID: "untainted-unknown", + Name: "untainted-unknown", ClientStatus: structs.AllocClientStatusUnknown, DesiredStatus: structs.AllocDesiredStatusRun, Job: testJob, @@ -595,8 +594,20 @@ func TestAllocSet_filterByTainted(t *testing.T) { AllocStates: unknownAllocState, }, }, - untainted: allocSet{}, - migrate: allocSet{}, + untainted: allocSet{ + // Unknown allocs on disconnected nodes are acknowledge, so they wont be rescheduled again + "untainted-unknown": { + ID: "untainted-unknown", + Name: "untainted-unknown", + ClientStatus: structs.AllocClientStatusUnknown, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "disconnected", + TaskGroup: "web", + AllocStates: unknownAllocState, + }, + }, + migrate: allocSet{}, disconnecting: allocSet{ "disconnect-running": { ID: "disconnect-running", @@ 
-610,17 +621,6 @@ func TestAllocSet_filterByTainted(t *testing.T) { }, reconnecting: allocSet{}, ignore: allocSet{ - // Unknown allocs on disconnected nodes are ignored - "ignore-unknown": { - ID: "ignore-unknown", - Name: "ignore-unknown", - ClientStatus: structs.AllocClientStatusUnknown, - DesiredStatus: structs.AllocDesiredStatusRun, - Job: testJob, - NodeID: "disconnected", - TaskGroup: "web", - AllocStates: unknownAllocState, - }, "ignore-reconnected-failed-stopped": { ID: "ignore-reconnected-failed-stopped", Name: "ignore-reconnected-failed-stopped", @@ -1167,3 +1167,202 @@ func Test_allocNameIndex_Next(t *testing.T) { }) } } + +func TestAllocSet_filterByRescheduleable(t *testing.T) { + ci.Parallel(t) + + noRescheduleJob := mock.Job() + noRescheduleTG := &structs.TaskGroup{ + Name: "noRescheduleTG", + ReschedulePolicy: &structs.ReschedulePolicy{ + Attempts: 0, + Unlimited: false, + }, + } + + noRescheduleJob.TaskGroups[0] = noRescheduleTG + + testJob := mock.Job() + rescheduleTG := &structs.TaskGroup{ + Name: "rescheduleTG", + ReschedulePolicy: &structs.ReschedulePolicy{ + Attempts: 1, + Unlimited: false, + }, + } + testJob.TaskGroups[0] = rescheduleTG + + now := time.Now() + + type testCase struct { + name string + all allocSet + isBatch bool + supportsDisconnectedClients bool + isDisconnecting bool + deployment *structs.Deployment + + // expected results + untainted allocSet + resNow allocSet + resLater []*delayedRescheduleInfo + } + + testCases := []testCase{ + { + name: "batch disconnecting allocation no reschedule", + isDisconnecting: true, + isBatch: true, + all: allocSet{ + "untainted1": { + ID: "untainted1", + ClientStatus: structs.AllocClientStatusRunning, + Job: noRescheduleJob, + TaskGroup: "noRescheduleTG", + }, + }, + untainted: allocSet{ + "untainted1": { + ID: "untainted1", + ClientStatus: structs.AllocClientStatusRunning, + Job: noRescheduleJob, + TaskGroup: "noRescheduleTG", + }, + }, + resNow: allocSet{}, + resLater: 
[]*delayedRescheduleInfo{}, + }, + { + name: "batch ignore unknown disconnecting allocs", + isDisconnecting: true, + isBatch: true, + all: allocSet{ + "disconnecting1": { + ID: "disconnection1", + ClientStatus: structs.AllocClientStatusUnknown, + Job: testJob, + }, + }, + untainted: allocSet{}, + resNow: allocSet{}, + resLater: []*delayedRescheduleInfo{}, + }, + { + name: "batch disconnecting allocation reschedule", + isDisconnecting: true, + isBatch: true, + all: allocSet{ + "rescheduleNow1": { + ID: "rescheduleNow1", + ClientStatus: structs.AllocClientStatusRunning, + Job: testJob, + TaskGroup: "rescheduleTG", + }, + }, + untainted: allocSet{}, + resNow: allocSet{ + "rescheduleNow1": { + ID: "rescheduleNow1", + ClientStatus: structs.AllocClientStatusRunning, + Job: testJob, + TaskGroup: "rescheduleTG", + }, + }, + resLater: []*delayedRescheduleInfo{}, + }, + { + name: "service disconnecting allocation no reschedule", + isDisconnecting: true, + isBatch: false, + all: allocSet{ + "untainted1": { + ID: "untainted1", + ClientStatus: structs.AllocClientStatusRunning, + Job: noRescheduleJob, + TaskGroup: "noRescheduleTG", + }, + }, + untainted: allocSet{ + "untainted1": { + ID: "untainted1", + ClientStatus: structs.AllocClientStatusRunning, + Job: noRescheduleJob, + TaskGroup: "noRescheduleTG", + }, + }, + resNow: allocSet{}, + resLater: []*delayedRescheduleInfo{}, + }, + { + name: "service disconnecting allocation reschedule", + isDisconnecting: true, + isBatch: false, + all: allocSet{ + "rescheduleNow1": { + ID: "rescheduleNow1", + ClientStatus: structs.AllocClientStatusRunning, + Job: testJob, + TaskGroup: "rescheduleTG", + }, + }, + untainted: allocSet{}, + resNow: allocSet{ + "rescheduleNow1": { + ID: "rescheduleNow1", + ClientStatus: structs.AllocClientStatusRunning, + Job: testJob, + TaskGroup: "rescheduleTG", + }, + }, + resLater: []*delayedRescheduleInfo{}, + }, + { + name: "service ignore unknown disconnecting allocs", + isDisconnecting: true, + isBatch: 
false, + all: allocSet{ + "disconnecting1": { + ID: "disconnection1", + ClientStatus: structs.AllocClientStatusUnknown, + Job: testJob, + }, + }, + untainted: allocSet{}, + resNow: allocSet{}, + resLater: []*delayedRescheduleInfo{}, + }, + { + name: "service running allocation no reschedule", + isDisconnecting: false, + isBatch: true, + all: allocSet{ + "untainted1": { + ID: "untainted1", + ClientStatus: structs.AllocClientStatusRunning, + Job: noRescheduleJob, + TaskGroup: "noRescheduleTG", + }, + }, + untainted: allocSet{ + "untainted1": { + ID: "untainted1", + ClientStatus: structs.AllocClientStatusRunning, + Job: noRescheduleJob, + TaskGroup: "noRescheduleTG", + }, + }, + resNow: allocSet{}, + resLater: []*delayedRescheduleInfo{}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + untainted, resNow, resLater := tc.all.filterByRescheduleable(tc.isBatch, + tc.isDisconnecting, now, "evailID", tc.deployment) + must.Eq(t, tc.untainted, untainted, must.Sprintf("with-nodes: untainted")) + must.Eq(t, tc.resNow, resNow, must.Sprintf("with-nodes: reschedule-now")) + must.Eq(t, tc.resLater, resLater, must.Sprintf("with-nodes: rescheduleLater")) + }) + } +}