backport of commit e8efe2d251bf3628f13c7eb3ce2422eb7e5b85f6 (#18884)
Co-authored-by: Juana De La Cuesta <juanita.delacuestamorales@hashicorp.com>
This commit is contained in:
parent
c21331bc21
commit
50c9af53b7
|
@ -134,6 +134,7 @@ const (
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
|
||||||
// SystemInitializationType is used for messages that initialize parts of
|
// SystemInitializationType is used for messages that initialize parts of
|
||||||
// the system, such as the state store. These messages are not included in
|
// the system, such as the state store. These messages are not included in
|
||||||
// the event stream.
|
// the event stream.
|
||||||
|
@ -10730,6 +10731,12 @@ func (a *Allocation) MigrateStrategy() *MigrateStrategy {
|
||||||
func (a *Allocation) NextRescheduleTime() (time.Time, bool) {
|
func (a *Allocation) NextRescheduleTime() (time.Time, bool) {
|
||||||
failTime := a.LastEventTime()
|
failTime := a.LastEventTime()
|
||||||
reschedulePolicy := a.ReschedulePolicy()
|
reschedulePolicy := a.ReschedulePolicy()
|
||||||
|
|
||||||
|
// If reschedule is disabled, return early
|
||||||
|
if reschedulePolicy.Attempts == 0 && !reschedulePolicy.Unlimited {
|
||||||
|
return time.Time{}, false
|
||||||
|
}
|
||||||
|
|
||||||
if a.DesiredStatus == AllocDesiredStatusStop || a.ClientStatus != AllocClientStatusFailed || failTime.IsZero() || reschedulePolicy == nil {
|
if a.DesiredStatus == AllocDesiredStatusStop || a.ClientStatus != AllocClientStatusFailed || failTime.IsZero() || reschedulePolicy == nil {
|
||||||
return time.Time{}, false
|
return time.Time{}, false
|
||||||
}
|
}
|
||||||
|
@ -10749,16 +10756,16 @@ func (a *Allocation) nextRescheduleTime(failTime time.Time, reschedulePolicy *Re
|
||||||
return nextRescheduleTime, rescheduleEligible
|
return nextRescheduleTime, rescheduleEligible
|
||||||
}
|
}
|
||||||
|
|
||||||
// NextRescheduleTimeByFailTime works like NextRescheduleTime but allows callers
|
// NextRescheduleTimeByTime works like NextRescheduleTime but allows callers to
|
||||||
// specify a failure time. Useful for things like determining whether to reschedule
|
// specify a failure time. Useful for things like determining whether to reschedule
|
||||||
// an alloc on a disconnected node.
|
// an alloc on a disconnected node.
|
||||||
func (a *Allocation) NextRescheduleTimeByFailTime(failTime time.Time) (time.Time, bool) {
|
func (a *Allocation) NextRescheduleTimeByTime(t time.Time) (time.Time, bool) {
|
||||||
reschedulePolicy := a.ReschedulePolicy()
|
reschedulePolicy := a.ReschedulePolicy()
|
||||||
if reschedulePolicy == nil {
|
if reschedulePolicy == nil {
|
||||||
return time.Time{}, false
|
return time.Time{}, false
|
||||||
}
|
}
|
||||||
|
|
||||||
return a.nextRescheduleTime(failTime, reschedulePolicy)
|
return a.nextRescheduleTime(t, reschedulePolicy)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ShouldClientStop tests an alloc for StopAfterClientDisconnect configuration
|
// ShouldClientStop tests an alloc for StopAfterClientDisconnect configuration
|
||||||
|
@ -11098,7 +11105,7 @@ func (a *Allocation) Expired(now time.Time) bool {
|
||||||
}
|
}
|
||||||
|
|
||||||
expiry := lastUnknown.Add(*tg.MaxClientDisconnect)
|
expiry := lastUnknown.Add(*tg.MaxClientDisconnect)
|
||||||
return now.UTC().After(expiry) || now.UTC().Equal(expiry)
|
return expiry.Sub(now) <= 0
|
||||||
}
|
}
|
||||||
|
|
||||||
// LastUnknown returns the timestamp for the last time the allocation
|
// LastUnknown returns the timestamp for the last time the allocation
|
||||||
|
|
|
@ -786,7 +786,7 @@ SUBMIT:
|
||||||
}
|
}
|
||||||
return err
|
return err
|
||||||
} else {
|
} else {
|
||||||
w.logger.Debug("created evaluation", "eval", log.Fmt("%#v", eval))
|
w.logger.Debug("created evaluation", "eval", log.Fmt("%#v", eval), "waitUntil", log.Fmt("%#v", eval.WaitUntil.String()))
|
||||||
w.backoffReset()
|
w.backoffReset()
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
|
|
|
@ -6944,6 +6944,7 @@ func TestServiceSched_Client_Disconnect_Creates_Updates_and_Evals(t *testing.T)
|
||||||
NodeID: disconnectedNode.ID,
|
NodeID: disconnectedNode.ID,
|
||||||
Status: structs.EvalStatusPending,
|
Status: structs.EvalStatusPending,
|
||||||
}}
|
}}
|
||||||
|
|
||||||
nodeStatusUpdateEval := evals[0]
|
nodeStatusUpdateEval := evals[0]
|
||||||
require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), evals))
|
require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), evals))
|
||||||
|
|
||||||
|
@ -6953,16 +6954,21 @@ func TestServiceSched_Client_Disconnect_Creates_Updates_and_Evals(t *testing.T)
|
||||||
require.Equal(t, structs.EvalStatusComplete, h.Evals[0].Status)
|
require.Equal(t, structs.EvalStatusComplete, h.Evals[0].Status)
|
||||||
require.Len(t, h.Plans, 1, "plan")
|
require.Len(t, h.Plans, 1, "plan")
|
||||||
|
|
||||||
// One followup delayed eval created
|
// Two followup delayed evals created
|
||||||
require.Len(t, h.CreateEvals, 1)
|
require.Len(t, h.CreateEvals, 2)
|
||||||
followUpEval := h.CreateEvals[0]
|
followUpEval1 := h.CreateEvals[0]
|
||||||
require.Equal(t, nodeStatusUpdateEval.ID, followUpEval.PreviousEval)
|
require.Equal(t, nodeStatusUpdateEval.ID, followUpEval1.PreviousEval)
|
||||||
require.Equal(t, "pending", followUpEval.Status)
|
require.Equal(t, "pending", followUpEval1.Status)
|
||||||
require.NotEmpty(t, followUpEval.WaitUntil)
|
require.NotEmpty(t, followUpEval1.WaitUntil)
|
||||||
|
|
||||||
// Insert eval in the state store
|
followUpEval2 := h.CreateEvals[1]
|
||||||
|
require.Equal(t, nodeStatusUpdateEval.ID, followUpEval2.PreviousEval)
|
||||||
|
require.Equal(t, "pending", followUpEval2.Status)
|
||||||
|
require.NotEmpty(t, followUpEval2.WaitUntil)
|
||||||
|
|
||||||
|
// Insert eval1 in the state store
|
||||||
testutil.WaitForResult(func() (bool, error) {
|
testutil.WaitForResult(func() (bool, error) {
|
||||||
found, err := h.State.EvalByID(nil, followUpEval.ID)
|
found, err := h.State.EvalByID(nil, followUpEval1.ID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false, err
|
return false, err
|
||||||
}
|
}
|
||||||
|
@ -6976,12 +6982,34 @@ func TestServiceSched_Client_Disconnect_Creates_Updates_and_Evals(t *testing.T)
|
||||||
|
|
||||||
return true, nil
|
return true, nil
|
||||||
}, func(err error) {
|
}, func(err error) {
|
||||||
|
|
||||||
|
require.NoError(t, err)
|
||||||
|
})
|
||||||
|
|
||||||
|
// Insert eval2 in the state store
|
||||||
|
testutil.WaitForResult(func() (bool, error) {
|
||||||
|
found, err := h.State.EvalByID(nil, followUpEval2.ID)
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
if found == nil {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
require.Equal(t, nodeStatusUpdateEval.ID, found.PreviousEval)
|
||||||
|
require.Equal(t, "pending", found.Status)
|
||||||
|
require.NotEmpty(t, found.WaitUntil)
|
||||||
|
|
||||||
|
return true, nil
|
||||||
|
}, func(err error) {
|
||||||
|
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
})
|
})
|
||||||
|
|
||||||
// Validate that the ClientStatus updates are part of the plan.
|
// Validate that the ClientStatus updates are part of the plan.
|
||||||
require.Len(t, h.Plans[0].NodeAllocation[disconnectedNode.ID], count)
|
require.Len(t, h.Plans[0].NodeAllocation[disconnectedNode.ID], count)
|
||||||
// Pending update should have unknown status.
|
// Pending update should have unknown status.
|
||||||
|
|
||||||
for _, nodeAlloc := range h.Plans[0].NodeAllocation[disconnectedNode.ID] {
|
for _, nodeAlloc := range h.Plans[0].NodeAllocation[disconnectedNode.ID] {
|
||||||
require.Equal(t, nodeAlloc.ClientStatus, structs.AllocClientStatusUnknown)
|
require.Equal(t, nodeAlloc.ClientStatus, structs.AllocClientStatusUnknown)
|
||||||
}
|
}
|
||||||
|
@ -6991,6 +7019,7 @@ func TestServiceSched_Client_Disconnect_Creates_Updates_and_Evals(t *testing.T)
|
||||||
require.NoError(t, err, "plan.NodeUpdate")
|
require.NoError(t, err, "plan.NodeUpdate")
|
||||||
|
|
||||||
// Validate that the StateStore Upsert applied the ClientStatus we specified.
|
// Validate that the StateStore Upsert applied the ClientStatus we specified.
|
||||||
|
|
||||||
for _, alloc := range unknownAllocs {
|
for _, alloc := range unknownAllocs {
|
||||||
alloc, err = h.State.AllocByID(nil, alloc.ID)
|
alloc, err = h.State.AllocByID(nil, alloc.ID)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
|
@ -230,6 +230,7 @@ func (a *allocReconciler) computeDeploymentComplete(m allocMatrix) bool {
|
||||||
groupComplete := a.computeGroup(group, as)
|
groupComplete := a.computeGroup(group, as)
|
||||||
complete = complete && groupComplete
|
complete = complete && groupComplete
|
||||||
}
|
}
|
||||||
|
|
||||||
return complete
|
return complete
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -398,6 +399,7 @@ func (a *allocReconciler) markDelayed(allocs allocSet, clientStatus, statusDescr
|
||||||
// computeGroup reconciles state for a particular task group. It returns whether
|
// computeGroup reconciles state for a particular task group. It returns whether
|
||||||
// the deployment it is for is complete with regards to the task group.
|
// the deployment it is for is complete with regards to the task group.
|
||||||
func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool {
|
func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool {
|
||||||
|
|
||||||
// Create the desired update object for the group
|
// Create the desired update object for the group
|
||||||
desiredChanges := new(structs.DesiredUpdates)
|
desiredChanges := new(structs.DesiredUpdates)
|
||||||
a.result.desiredTGUpdates[groupName] = desiredChanges
|
a.result.desiredTGUpdates[groupName] = desiredChanges
|
||||||
|
@ -426,6 +428,9 @@ func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool {
|
||||||
untainted, migrate, lost, disconnecting, reconnecting, ignore := all.filterByTainted(a.taintedNodes, a.supportsDisconnectedClients, a.now)
|
untainted, migrate, lost, disconnecting, reconnecting, ignore := all.filterByTainted(a.taintedNodes, a.supportsDisconnectedClients, a.now)
|
||||||
desiredChanges.Ignore += uint64(len(ignore))
|
desiredChanges.Ignore += uint64(len(ignore))
|
||||||
|
|
||||||
|
// Determine what set of terminal allocations need to be rescheduled
|
||||||
|
untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, false, a.now, a.evalID, a.deployment)
|
||||||
|
|
||||||
// If there are allocations reconnecting we need to reconcile them and
|
// If there are allocations reconnecting we need to reconcile them and
|
||||||
// their replacements first because there is specific logic when deciding
|
// their replacements first because there is specific logic when deciding
|
||||||
// which ones to keep that can only be applied when the client reconnects.
|
// which ones to keep that can only be applied when the client reconnects.
|
||||||
|
@ -454,20 +459,28 @@ func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool {
|
||||||
untainted = untainted.union(reconnect)
|
untainted = untainted.union(reconnect)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Determine what set of terminal allocations need to be rescheduled
|
// Determine what set of disconnecting allocations need to be rescheduled now,
|
||||||
untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, false, a.now, a.evalID, a.deployment)
|
// which ones later and which ones can't be rescheduled at all.
|
||||||
|
timeoutLaterEvals := map[string]string{}
|
||||||
|
if len(disconnecting) > 0 {
|
||||||
|
untaintedDisconnecting, rescheduleDisconnecting, laterDisconnecting := disconnecting.filterByRescheduleable(a.batch, true, a.now, a.evalID, a.deployment)
|
||||||
|
|
||||||
// Determine what set of disconnecting allocations need to be rescheduled
|
rescheduleNow = rescheduleNow.union(rescheduleDisconnecting)
|
||||||
_, rescheduleDisconnecting, _ := disconnecting.filterByRescheduleable(a.batch, true, a.now, a.evalID, a.deployment)
|
untainted = untainted.union(untaintedDisconnecting)
|
||||||
rescheduleNow = rescheduleNow.union(rescheduleDisconnecting)
|
rescheduleLater = append(rescheduleLater, laterDisconnecting...)
|
||||||
|
|
||||||
|
// Find delays for any disconnecting allocs that have max_client_disconnect,
|
||||||
|
// create followup evals, and update the ClientStatus to unknown.
|
||||||
|
timeoutLaterEvals = a.createTimeoutLaterEvals(disconnecting, tg.Name)
|
||||||
|
}
|
||||||
|
|
||||||
// Find delays for any lost allocs that have stop_after_client_disconnect
|
// Find delays for any lost allocs that have stop_after_client_disconnect
|
||||||
lostLater := lost.delayByStopAfterClientDisconnect()
|
lostLaterEvals := map[string]string{}
|
||||||
lostLaterEvals := a.createLostLaterEvals(lostLater, tg.Name)
|
lostLater := []*delayedRescheduleInfo{}
|
||||||
|
if len(lost) > 0 {
|
||||||
// Find delays for any disconnecting allocs that have max_client_disconnect,
|
lostLater = lost.delayByStopAfterClientDisconnect()
|
||||||
// create followup evals, and update the ClientStatus to unknown.
|
lostLaterEvals = a.createLostLaterEvals(lostLater, tg.Name)
|
||||||
timeoutLaterEvals := a.createTimeoutLaterEvals(disconnecting, tg.Name)
|
}
|
||||||
|
|
||||||
// Merge disconnecting with the stop_after_client_disconnect set into the
|
// Merge disconnecting with the stop_after_client_disconnect set into the
|
||||||
// lostLaterEvals so that computeStop can add them to the stop set.
|
// lostLaterEvals so that computeStop can add them to the stop set.
|
||||||
|
@ -486,13 +499,15 @@ func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool {
|
||||||
// include stopped allocations.
|
// include stopped allocations.
|
||||||
isCanarying := dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
|
isCanarying := dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
|
||||||
stop := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, isCanarying, lostLaterEvals)
|
stop := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, isCanarying, lostLaterEvals)
|
||||||
|
|
||||||
desiredChanges.Stop += uint64(len(stop))
|
desiredChanges.Stop += uint64(len(stop))
|
||||||
untainted = untainted.difference(stop)
|
untainted = untainted.difference(stop)
|
||||||
|
|
||||||
// Do inplace upgrades where possible and capture the set of upgrades that
|
// Do inplace upgrades where possible and capture the set of upgrades that
|
||||||
// need to be done destructively.
|
// need to be done destructively.
|
||||||
ignore, inplace, destructive := a.computeUpdates(tg, untainted)
|
ignoreUpdates, inplace, destructive := a.computeUpdates(tg, untainted)
|
||||||
desiredChanges.Ignore += uint64(len(ignore))
|
|
||||||
|
desiredChanges.Ignore += uint64(len(ignoreUpdates))
|
||||||
desiredChanges.InPlaceUpdate += uint64(len(inplace))
|
desiredChanges.InPlaceUpdate += uint64(len(inplace))
|
||||||
if !existingDeployment {
|
if !existingDeployment {
|
||||||
dstate.DesiredTotal += len(destructive) + len(inplace)
|
dstate.DesiredTotal += len(destructive) + len(inplace)
|
||||||
|
@ -796,7 +811,8 @@ func (a *allocReconciler) computeReplacements(deploymentPlaceReady bool, desired
|
||||||
// replacements based off that.
|
// replacements based off that.
|
||||||
failed := make(allocSet)
|
failed := make(allocSet)
|
||||||
for id, alloc := range rescheduleNow {
|
for id, alloc := range rescheduleNow {
|
||||||
if _, ok := a.result.disconnectUpdates[id]; !ok {
|
_, ok := a.result.disconnectUpdates[id]
|
||||||
|
if !ok && alloc.ClientStatus != structs.AllocClientStatusUnknown {
|
||||||
failed[id] = alloc
|
failed[id] = alloc
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -968,8 +984,11 @@ func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *alloc
|
||||||
untainted = untainted.difference(canaries)
|
untainted = untainted.difference(canaries)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Remove disconnected allocations so they won't be stopped
|
||||||
|
knownUntainted := untainted.filterOutByClientStatus(structs.AllocClientStatusUnknown)
|
||||||
|
|
||||||
// Hot path the nothing to do case
|
// Hot path the nothing to do case
|
||||||
remove := len(untainted) + len(migrate) - group.Count
|
remove := len(knownUntainted) + len(migrate) - group.Count
|
||||||
if remove <= 0 {
|
if remove <= 0 {
|
||||||
return stop
|
return stop
|
||||||
}
|
}
|
||||||
|
@ -1072,7 +1091,7 @@ func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *alloc
|
||||||
// - If the reconnecting allocation is to be stopped, its replacements may
|
// - If the reconnecting allocation is to be stopped, its replacements may
|
||||||
// not be present in any of the returned sets. The rest of the reconciler
|
// not be present in any of the returned sets. The rest of the reconciler
|
||||||
// logic will handle them.
|
// logic will handle them.
|
||||||
func (a *allocReconciler) reconcileReconnecting(reconnecting allocSet, others allocSet) (allocSet, allocSet) {
|
func (a *allocReconciler) reconcileReconnecting(reconnecting allocSet, all allocSet) (allocSet, allocSet) {
|
||||||
stop := make(allocSet)
|
stop := make(allocSet)
|
||||||
reconnect := make(allocSet)
|
reconnect := make(allocSet)
|
||||||
|
|
||||||
|
@ -1111,14 +1130,11 @@ func (a *allocReconciler) reconcileReconnecting(reconnecting allocSet, others al
|
||||||
|
|
||||||
// Find replacement allocations and decide which one to stop. A
|
// Find replacement allocations and decide which one to stop. A
|
||||||
// reconnecting allocation may have multiple replacements.
|
// reconnecting allocation may have multiple replacements.
|
||||||
for _, replacementAlloc := range others {
|
for _, replacementAlloc := range all {
|
||||||
|
|
||||||
// Skip allocations that are not a replacement of the one
|
// Skip allocations that are not a replacement of the one
|
||||||
// reconnecting. Replacement allocations have the same name but a
|
// reconnecting.
|
||||||
// higher CreateIndex and a different ID.
|
isReplacement := replacementAlloc.ID == reconnectingAlloc.NextAllocation
|
||||||
isReplacement := replacementAlloc.ID != reconnectingAlloc.ID &&
|
|
||||||
replacementAlloc.Name == reconnectingAlloc.Name &&
|
|
||||||
replacementAlloc.CreateIndex > reconnectingAlloc.CreateIndex
|
|
||||||
|
|
||||||
// Skip allocations that are server terminal.
|
// Skip allocations that are server terminal.
|
||||||
// We don't want to replace a reconnecting allocation with one that
|
// We don't want to replace a reconnecting allocation with one that
|
||||||
|
@ -1142,12 +1158,14 @@ func (a *allocReconciler) reconcileReconnecting(reconnecting allocSet, others al
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// The reconnecting allocation is preferred, so stop this
|
// The reconnecting allocation is preferred, so stop this
|
||||||
// replacement.
|
// replacement, but avoid re-stopping stopped allocs
|
||||||
stop[replacementAlloc.ID] = replacementAlloc
|
if replacementAlloc.ClientStatus != structs.AllocClientStatusFailed {
|
||||||
a.result.stop = append(a.result.stop, allocStopResult{
|
stop[replacementAlloc.ID] = replacementAlloc
|
||||||
alloc: replacementAlloc,
|
a.result.stop = append(a.result.stop, allocStopResult{
|
||||||
statusDescription: allocReconnected,
|
alloc: replacementAlloc,
|
||||||
})
|
statusDescription: allocReconnected,
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1235,11 +1253,17 @@ func (a *allocReconciler) createRescheduleLaterEvals(rescheduleLater []*delayedR
|
||||||
allocIDToFollowupEvalID := a.createLostLaterEvals(rescheduleLater, tgName)
|
allocIDToFollowupEvalID := a.createLostLaterEvals(rescheduleLater, tgName)
|
||||||
|
|
||||||
// Create updates that will be applied to the allocs to mark the FollowupEvalID
|
// Create updates that will be applied to the allocs to mark the FollowupEvalID
|
||||||
for allocID, evalID := range allocIDToFollowupEvalID {
|
for _, laterAlloc := range rescheduleLater {
|
||||||
existingAlloc := all[allocID]
|
existingAlloc := all[laterAlloc.alloc.ID]
|
||||||
updatedAlloc := existingAlloc.Copy()
|
updatedAlloc := existingAlloc.Copy()
|
||||||
updatedAlloc.FollowupEvalID = evalID
|
updatedAlloc.FollowupEvalID = allocIDToFollowupEvalID[laterAlloc.alloc.ID]
|
||||||
a.result.attributeUpdates[updatedAlloc.ID] = updatedAlloc
|
|
||||||
|
// Can't update an allocation that is disconnected
|
||||||
|
if _, ok := a.result.disconnectUpdates[laterAlloc.allocID]; !ok {
|
||||||
|
a.result.attributeUpdates[laterAlloc.allocID] = updatedAlloc
|
||||||
|
} else {
|
||||||
|
a.result.disconnectUpdates[laterAlloc.allocID].FollowupEvalID = allocIDToFollowupEvalID[laterAlloc.alloc.ID]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1355,8 +1379,8 @@ func (a *allocReconciler) createTimeoutLaterEvals(disconnecting allocSet, tgName
|
||||||
}
|
}
|
||||||
|
|
||||||
timeoutDelays, err := disconnecting.delayByMaxClientDisconnect(a.now)
|
timeoutDelays, err := disconnecting.delayByMaxClientDisconnect(a.now)
|
||||||
if err != nil || len(timeoutDelays) != len(disconnecting) {
|
if err != nil {
|
||||||
a.logger.Error("error computing disconnecting timeouts for task_group",
|
a.logger.Error("error for task_group",
|
||||||
"task_group", tgName, "error", err)
|
"task_group", tgName, "error", err)
|
||||||
return map[string]string{}
|
return map[string]string{}
|
||||||
}
|
}
|
||||||
|
|
|
@ -5327,7 +5327,6 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
nodeScoreIncrement float64
|
nodeScoreIncrement float64
|
||||||
disconnectedAllocStatus string
|
disconnectedAllocStatus string
|
||||||
disconnectedAllocStates []*structs.AllocState
|
disconnectedAllocStates []*structs.AllocState
|
||||||
serverDesiredStatus string
|
|
||||||
isBatch bool
|
isBatch bool
|
||||||
nodeStatusDisconnected bool
|
nodeStatusDisconnected bool
|
||||||
replace bool
|
replace bool
|
||||||
|
@ -5342,13 +5341,13 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
|
|
||||||
testCases := []testCase{
|
testCases := []testCase{
|
||||||
{
|
{
|
||||||
name: "reconnect-original-no-replacement",
|
name: "reconnect-original-no-replacement",
|
||||||
allocCount: 2,
|
allocCount: 2,
|
||||||
replace: false,
|
replace: false,
|
||||||
disconnectedAllocCount: 2,
|
disconnectedAllocCount: 2,
|
||||||
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
||||||
|
|
||||||
disconnectedAllocStates: disconnectAllocState,
|
disconnectedAllocStates: disconnectAllocState,
|
||||||
serverDesiredStatus: structs.AllocDesiredStatusRun,
|
|
||||||
shouldStopOnDisconnectedNode: false,
|
shouldStopOnDisconnectedNode: false,
|
||||||
expected: &resultExpectation{
|
expected: &resultExpectation{
|
||||||
reconnectUpdates: 2,
|
reconnectUpdates: 2,
|
||||||
|
@ -5360,13 +5359,13 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "resume-original-and-stop-replacement",
|
name: "resume-original-and-stop-replacement",
|
||||||
allocCount: 3,
|
allocCount: 3,
|
||||||
replace: true,
|
replace: true,
|
||||||
disconnectedAllocCount: 1,
|
disconnectedAllocCount: 1,
|
||||||
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
||||||
|
|
||||||
disconnectedAllocStates: disconnectAllocState,
|
disconnectedAllocStates: disconnectAllocState,
|
||||||
serverDesiredStatus: structs.AllocDesiredStatusRun,
|
|
||||||
shouldStopOnDisconnectedNode: false,
|
shouldStopOnDisconnectedNode: false,
|
||||||
expected: &resultExpectation{
|
expected: &resultExpectation{
|
||||||
stop: 1,
|
stop: 1,
|
||||||
|
@ -5380,13 +5379,13 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "stop-original-with-lower-node-score",
|
name: "stop-original-with-lower-node-score",
|
||||||
allocCount: 4,
|
allocCount: 4,
|
||||||
replace: true,
|
replace: true,
|
||||||
disconnectedAllocCount: 1,
|
disconnectedAllocCount: 1,
|
||||||
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
||||||
|
|
||||||
disconnectedAllocStates: disconnectAllocState,
|
disconnectedAllocStates: disconnectAllocState,
|
||||||
serverDesiredStatus: structs.AllocDesiredStatusRun,
|
|
||||||
shouldStopOnDisconnectedNode: true,
|
shouldStopOnDisconnectedNode: true,
|
||||||
nodeScoreIncrement: 1,
|
nodeScoreIncrement: 1,
|
||||||
expected: &resultExpectation{
|
expected: &resultExpectation{
|
||||||
|
@ -5400,13 +5399,13 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "stop-original-failed-on-reconnect",
|
name: "stop-original-failed-on-reconnect",
|
||||||
allocCount: 4,
|
allocCount: 4,
|
||||||
replace: true,
|
replace: true,
|
||||||
disconnectedAllocCount: 2,
|
disconnectedAllocCount: 2,
|
||||||
disconnectedAllocStatus: structs.AllocClientStatusFailed,
|
disconnectedAllocStatus: structs.AllocClientStatusFailed,
|
||||||
|
|
||||||
disconnectedAllocStates: disconnectAllocState,
|
disconnectedAllocStates: disconnectAllocState,
|
||||||
serverDesiredStatus: structs.AllocDesiredStatusRun,
|
|
||||||
shouldStopOnDisconnectedNode: true,
|
shouldStopOnDisconnectedNode: true,
|
||||||
expected: &resultExpectation{
|
expected: &resultExpectation{
|
||||||
stop: 2,
|
stop: 2,
|
||||||
|
@ -5419,13 +5418,13 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "reschedule-original-failed-if-not-replaced",
|
name: "reschedule-original-failed-if-not-replaced",
|
||||||
allocCount: 4,
|
allocCount: 4,
|
||||||
replace: false,
|
replace: false,
|
||||||
disconnectedAllocCount: 2,
|
disconnectedAllocCount: 2,
|
||||||
disconnectedAllocStatus: structs.AllocClientStatusFailed,
|
disconnectedAllocStatus: structs.AllocClientStatusFailed,
|
||||||
|
|
||||||
disconnectedAllocStates: disconnectAllocState,
|
disconnectedAllocStates: disconnectAllocState,
|
||||||
serverDesiredStatus: structs.AllocDesiredStatusRun,
|
|
||||||
shouldStopOnDisconnectedNode: true,
|
shouldStopOnDisconnectedNode: true,
|
||||||
expected: &resultExpectation{
|
expected: &resultExpectation{
|
||||||
stop: 2,
|
stop: 2,
|
||||||
|
@ -5445,13 +5444,15 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
replace: false,
|
replace: false,
|
||||||
disconnectedAllocCount: 2,
|
disconnectedAllocCount: 2,
|
||||||
disconnectedAllocStatus: structs.AllocClientStatusComplete,
|
disconnectedAllocStatus: structs.AllocClientStatusComplete,
|
||||||
|
|
||||||
disconnectedAllocStates: disconnectAllocState,
|
disconnectedAllocStates: disconnectAllocState,
|
||||||
serverDesiredStatus: structs.AllocDesiredStatusRun,
|
|
||||||
isBatch: true,
|
isBatch: true,
|
||||||
expected: &resultExpectation{
|
expected: &resultExpectation{
|
||||||
|
place: 2,
|
||||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||||
"web": {
|
"web": {
|
||||||
Ignore: 2,
|
Ignore: 2,
|
||||||
|
Place: 2,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
@ -5463,15 +5464,14 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
failReplacement: true,
|
failReplacement: true,
|
||||||
disconnectedAllocCount: 2,
|
disconnectedAllocCount: 2,
|
||||||
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
||||||
|
|
||||||
disconnectedAllocStates: disconnectAllocState,
|
disconnectedAllocStates: disconnectAllocState,
|
||||||
serverDesiredStatus: structs.AllocDesiredStatusRun,
|
|
||||||
expected: &resultExpectation{
|
expected: &resultExpectation{
|
||||||
reconnectUpdates: 2,
|
reconnectUpdates: 2,
|
||||||
stop: 2,
|
stop: 0,
|
||||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||||
"web": {
|
"web": {
|
||||||
Ignore: 3,
|
Ignore: 5,
|
||||||
Stop: 2,
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
@ -5483,8 +5483,8 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
disconnectReplacement: true,
|
disconnectReplacement: true,
|
||||||
disconnectedAllocCount: 1,
|
disconnectedAllocCount: 1,
|
||||||
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
||||||
|
|
||||||
disconnectedAllocStates: disconnectAllocState,
|
disconnectedAllocStates: disconnectAllocState,
|
||||||
serverDesiredStatus: structs.AllocDesiredStatusRun,
|
|
||||||
expected: &resultExpectation{
|
expected: &resultExpectation{
|
||||||
reconnectUpdates: 1,
|
reconnectUpdates: 1,
|
||||||
stop: 1,
|
stop: 1,
|
||||||
|
@ -5503,8 +5503,8 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
taintReplacement: true,
|
taintReplacement: true,
|
||||||
disconnectedAllocCount: 2,
|
disconnectedAllocCount: 2,
|
||||||
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
||||||
|
|
||||||
disconnectedAllocStates: disconnectAllocState,
|
disconnectedAllocStates: disconnectAllocState,
|
||||||
serverDesiredStatus: structs.AllocDesiredStatusRun,
|
|
||||||
expected: &resultExpectation{
|
expected: &resultExpectation{
|
||||||
reconnectUpdates: 2,
|
reconnectUpdates: 2,
|
||||||
stop: 2,
|
stop: 2,
|
||||||
|
@ -5517,13 +5517,13 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "stop-original-alloc-with-old-job-version",
|
name: "stop-original-alloc-with-old-job-version",
|
||||||
allocCount: 5,
|
allocCount: 5,
|
||||||
replace: true,
|
replace: true,
|
||||||
disconnectedAllocCount: 2,
|
disconnectedAllocCount: 2,
|
||||||
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
||||||
|
|
||||||
disconnectedAllocStates: disconnectAllocState,
|
disconnectedAllocStates: disconnectAllocState,
|
||||||
serverDesiredStatus: structs.AllocDesiredStatusRun,
|
|
||||||
shouldStopOnDisconnectedNode: true,
|
shouldStopOnDisconnectedNode: true,
|
||||||
jobVersionIncrement: 1,
|
jobVersionIncrement: 1,
|
||||||
expected: &resultExpectation{
|
expected: &resultExpectation{
|
||||||
|
@ -5537,13 +5537,13 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "stop-original-alloc-with-old-job-version-reconnect-eval",
|
name: "stop-original-alloc-with-old-job-version-reconnect-eval",
|
||||||
allocCount: 5,
|
allocCount: 5,
|
||||||
replace: true,
|
replace: true,
|
||||||
disconnectedAllocCount: 2,
|
disconnectedAllocCount: 2,
|
||||||
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
||||||
|
|
||||||
disconnectedAllocStates: disconnectAllocState,
|
disconnectedAllocStates: disconnectAllocState,
|
||||||
serverDesiredStatus: structs.AllocDesiredStatusRun,
|
|
||||||
shouldStopOnDisconnectedNode: true,
|
shouldStopOnDisconnectedNode: true,
|
||||||
jobVersionIncrement: 1,
|
jobVersionIncrement: 1,
|
||||||
expected: &resultExpectation{
|
expected: &resultExpectation{
|
||||||
|
@ -5557,35 +5557,36 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "stop-original-alloc-with-old-job-version-and-failed-replacements-replaced",
|
name: "stop-original-alloc-with-old-job-version-and-failed-replacements-replaced",
|
||||||
allocCount: 5,
|
allocCount: 5,
|
||||||
replace: true,
|
replace: true,
|
||||||
failReplacement: true,
|
failReplacement: true,
|
||||||
replaceFailedReplacement: true,
|
replaceFailedReplacement: true,
|
||||||
disconnectedAllocCount: 2,
|
disconnectedAllocCount: 2,
|
||||||
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
||||||
|
|
||||||
disconnectedAllocStates: disconnectAllocState,
|
disconnectedAllocStates: disconnectAllocState,
|
||||||
serverDesiredStatus: structs.AllocDesiredStatusRun,
|
shouldStopOnDisconnectedNode: false,
|
||||||
shouldStopOnDisconnectedNode: true,
|
|
||||||
jobVersionIncrement: 1,
|
jobVersionIncrement: 1,
|
||||||
expected: &resultExpectation{
|
expected: &resultExpectation{
|
||||||
stop: 2,
|
stop: 2,
|
||||||
|
reconnectUpdates: 2,
|
||||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||||
"web": {
|
"web": {
|
||||||
Stop: 2,
|
Stop: 2,
|
||||||
Ignore: 5,
|
Ignore: 7,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "stop-original-pending-alloc-for-disconnected-node",
|
name: "stop-original-pending-alloc-for-disconnected-node",
|
||||||
allocCount: 2,
|
allocCount: 2,
|
||||||
replace: true,
|
replace: true,
|
||||||
disconnectedAllocCount: 1,
|
disconnectedAllocCount: 1,
|
||||||
disconnectedAllocStatus: structs.AllocClientStatusPending,
|
disconnectedAllocStatus: structs.AllocClientStatusPending,
|
||||||
|
|
||||||
disconnectedAllocStates: disconnectAllocState,
|
disconnectedAllocStates: disconnectAllocState,
|
||||||
serverDesiredStatus: structs.AllocDesiredStatusRun,
|
|
||||||
shouldStopOnDisconnectedNode: true,
|
shouldStopOnDisconnectedNode: true,
|
||||||
nodeStatusDisconnected: true,
|
nodeStatusDisconnected: true,
|
||||||
expected: &resultExpectation{
|
expected: &resultExpectation{
|
||||||
|
@ -5599,23 +5600,23 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "stop-failed-original-and-failed-replacements-and-place-new",
|
name: "stop-failed-original-and-failed-replacements-and-place-new",
|
||||||
allocCount: 5,
|
allocCount: 5,
|
||||||
replace: true,
|
replace: true,
|
||||||
failReplacement: true,
|
failReplacement: true,
|
||||||
disconnectedAllocCount: 2,
|
disconnectedAllocCount: 2,
|
||||||
disconnectedAllocStatus: structs.AllocClientStatusFailed,
|
disconnectedAllocStatus: structs.AllocClientStatusFailed,
|
||||||
|
|
||||||
disconnectedAllocStates: disconnectAllocState,
|
disconnectedAllocStates: disconnectAllocState,
|
||||||
serverDesiredStatus: structs.AllocDesiredStatusRun,
|
|
||||||
shouldStopOnDisconnectedNode: true,
|
shouldStopOnDisconnectedNode: true,
|
||||||
expected: &resultExpectation{
|
expected: &resultExpectation{
|
||||||
stop: 4,
|
stop: 2,
|
||||||
place: 2,
|
place: 2,
|
||||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||||
"web": {
|
"web": {
|
||||||
Stop: 4,
|
Stop: 2,
|
||||||
Place: 2,
|
Place: 2,
|
||||||
Ignore: 3,
|
Ignore: 5,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
@ -5627,7 +5628,6 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
disconnectedAllocCount: 2,
|
disconnectedAllocCount: 2,
|
||||||
disconnectedAllocStatus: structs.AllocClientStatusUnknown,
|
disconnectedAllocStatus: structs.AllocClientStatusUnknown,
|
||||||
disconnectedAllocStates: disconnectAllocState,
|
disconnectedAllocStates: disconnectAllocState,
|
||||||
serverDesiredStatus: structs.AllocDesiredStatusRun,
|
|
||||||
shouldStopOnDisconnectedNode: true,
|
shouldStopOnDisconnectedNode: true,
|
||||||
nodeStatusDisconnected: true,
|
nodeStatusDisconnected: true,
|
||||||
maxDisconnect: pointer.Of(2 * time.Second),
|
maxDisconnect: pointer.Of(2 * time.Second),
|
||||||
|
@ -5648,7 +5648,6 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
disconnectedAllocCount: 2,
|
disconnectedAllocCount: 2,
|
||||||
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
||||||
disconnectedAllocStates: []*structs.AllocState{},
|
disconnectedAllocStates: []*structs.AllocState{},
|
||||||
serverDesiredStatus: structs.AllocDesiredStatusRun,
|
|
||||||
nodeStatusDisconnected: true,
|
nodeStatusDisconnected: true,
|
||||||
expected: &resultExpectation{
|
expected: &resultExpectation{
|
||||||
place: 2,
|
place: 2,
|
||||||
|
@ -5687,7 +5686,7 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
// Set alloc state
|
// Set alloc state
|
||||||
disconnectedAllocCount := tc.disconnectedAllocCount
|
disconnectedAllocCount := tc.disconnectedAllocCount
|
||||||
for _, alloc := range allocs {
|
for _, alloc := range allocs {
|
||||||
alloc.DesiredStatus = tc.serverDesiredStatus
|
alloc.DesiredStatus = structs.AllocDesiredStatusRun
|
||||||
|
|
||||||
if tc.maxDisconnect != nil {
|
if tc.maxDisconnect != nil {
|
||||||
alloc.Job.TaskGroups[0].MaxClientDisconnect = tc.maxDisconnect
|
alloc.Job.TaskGroups[0].MaxClientDisconnect = tc.maxDisconnect
|
||||||
|
@ -5699,7 +5698,6 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
// Set the node id on all the disconnected allocs to the node under test.
|
// Set the node id on all the disconnected allocs to the node under test.
|
||||||
alloc.NodeID = testNode.ID
|
alloc.NodeID = testNode.ID
|
||||||
alloc.NodeName = "disconnected"
|
alloc.NodeName = "disconnected"
|
||||||
|
|
||||||
disconnectedAllocCount--
|
disconnectedAllocCount--
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -5791,6 +5789,12 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||||
// Tests that a client disconnect while a canary is in progress generates the result.
|
// Tests that a client disconnect while a canary is in progress generates the result.
|
||||||
func TestReconciler_Client_Disconnect_Canaries(t *testing.T) {
|
func TestReconciler_Client_Disconnect_Canaries(t *testing.T) {
|
||||||
|
|
||||||
|
disconnectAllocState := []*structs.AllocState{{
|
||||||
|
Field: structs.AllocStateFieldClientStatus,
|
||||||
|
Value: structs.AllocClientStatusUnknown,
|
||||||
|
Time: time.Now(),
|
||||||
|
}}
|
||||||
|
|
||||||
type testCase struct {
|
type testCase struct {
|
||||||
name string
|
name string
|
||||||
nodes []string
|
nodes []string
|
||||||
|
@ -5883,7 +5887,7 @@ func TestReconciler_Client_Disconnect_Canaries(t *testing.T) {
|
||||||
updatedJob.TaskGroups[0].Name: {
|
updatedJob.TaskGroups[0].Name: {
|
||||||
Place: 3,
|
Place: 3,
|
||||||
Canary: 0,
|
Canary: 0,
|
||||||
Ignore: 3,
|
Ignore: 6,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
@ -5947,7 +5951,7 @@ func TestReconciler_Client_Disconnect_Canaries(t *testing.T) {
|
||||||
updatedJob.TaskGroups[0].Name: {
|
updatedJob.TaskGroups[0].Name: {
|
||||||
Place: 2,
|
Place: 2,
|
||||||
Canary: 0,
|
Canary: 0,
|
||||||
Ignore: 4,
|
Ignore: 7,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
@ -6013,7 +6017,7 @@ func TestReconciler_Client_Disconnect_Canaries(t *testing.T) {
|
||||||
updatedJob.TaskGroups[0].Name: {
|
updatedJob.TaskGroups[0].Name: {
|
||||||
Place: 2,
|
Place: 2,
|
||||||
Canary: 0,
|
Canary: 0,
|
||||||
Ignore: 3,
|
Ignore: 6,
|
||||||
// The 2 stops in this test are transient failures, but
|
// The 2 stops in this test are transient failures, but
|
||||||
// the deployment can still progress. We don't include
|
// the deployment can still progress. We don't include
|
||||||
// them in the stop count since DesiredTGUpdates is used
|
// them in the stop count since DesiredTGUpdates is used
|
||||||
|
@ -6083,6 +6087,12 @@ func TestReconciler_Client_Disconnect_Canaries(t *testing.T) {
|
||||||
if alloc.ClientStatus == structs.AllocClientStatusRunning {
|
if alloc.ClientStatus == structs.AllocClientStatusRunning {
|
||||||
alloc.DeploymentStatus.Healthy = pointer.Of(true)
|
alloc.DeploymentStatus.Healthy = pointer.Of(true)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if alloc.ClientStatus == structs.AllocClientStatusUnknown {
|
||||||
|
alloc.AllocStates = disconnectAllocState
|
||||||
|
alloc.FollowupEvalID = "eval-where-it-was-set-to-unknow"
|
||||||
|
}
|
||||||
|
|
||||||
tc.deploymentState.PlacedCanaries = append(tc.deploymentState.PlacedCanaries, alloc.ID)
|
tc.deploymentState.PlacedCanaries = append(tc.deploymentState.PlacedCanaries, alloc.ID)
|
||||||
handled[alloc.ID] = allocUpdateFnIgnore
|
handled[alloc.ID] = allocUpdateFnIgnore
|
||||||
canariesConfigured++
|
canariesConfigured++
|
||||||
|
|
|
@ -9,6 +9,7 @@ package scheduler
|
||||||
// all scheduler types before moving it into util.go
|
// all scheduler types before moving it into util.go
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
|
@ -264,10 +265,11 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS
|
||||||
|
|
||||||
taintedNode, nodeIsTainted := taintedNodes[alloc.NodeID]
|
taintedNode, nodeIsTainted := taintedNodes[alloc.NodeID]
|
||||||
if taintedNode != nil {
|
if taintedNode != nil {
|
||||||
// Group disconnecting/reconnecting
|
// Group disconnecting
|
||||||
switch taintedNode.Status {
|
switch taintedNode.Status {
|
||||||
case structs.NodeStatusDisconnected:
|
case structs.NodeStatusDisconnected:
|
||||||
if supportsDisconnectedClients {
|
if supportsDisconnectedClients {
|
||||||
|
|
||||||
// Filter running allocs on a node that is disconnected to be marked as unknown.
|
// Filter running allocs on a node that is disconnected to be marked as unknown.
|
||||||
if alloc.ClientStatus == structs.AllocClientStatusRunning {
|
if alloc.ClientStatus == structs.AllocClientStatusRunning {
|
||||||
disconnecting[alloc.ID] = alloc
|
disconnecting[alloc.ID] = alloc
|
||||||
|
@ -289,6 +291,7 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS
|
||||||
lost[alloc.ID] = alloc
|
lost[alloc.ID] = alloc
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
reconnecting[alloc.ID] = alloc
|
reconnecting[alloc.ID] = alloc
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
@ -296,9 +299,16 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Terminal allocs, if not reconnect, are always untainted as they
|
|
||||||
// should never be migrated.
|
|
||||||
if alloc.TerminalStatus() && !reconnect {
|
if alloc.TerminalStatus() && !reconnect {
|
||||||
|
// Terminal allocs, if supportsDisconnectedClients and not reconnect,
|
||||||
|
// are probably stopped replacements and should be ignored
|
||||||
|
if supportsDisconnectedClients {
|
||||||
|
ignore[alloc.ID] = alloc
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Terminal allocs, if not reconnect, are always untainted as they
|
||||||
|
// should never be migrated.
|
||||||
untainted[alloc.ID] = alloc
|
untainted[alloc.ID] = alloc
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
@ -315,11 +325,11 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ignore unknown allocs that we want to reconnect eventually.
|
// Acknowledge unknown allocs that we want to reconnect eventually.
|
||||||
if supportsDisconnectedClients &&
|
if supportsDisconnectedClients &&
|
||||||
alloc.ClientStatus == structs.AllocClientStatusUnknown &&
|
alloc.ClientStatus == structs.AllocClientStatusUnknown &&
|
||||||
alloc.DesiredStatus == structs.AllocDesiredStatusRun {
|
alloc.DesiredStatus == structs.AllocDesiredStatusRun {
|
||||||
ignore[alloc.ID] = alloc
|
untainted[alloc.ID] = alloc
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -366,12 +376,11 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS
|
||||||
// untainted or a set of allocations that must be rescheduled now. Allocations that can be rescheduled
|
// untainted or a set of allocations that must be rescheduled now. Allocations that can be rescheduled
|
||||||
// at a future time are also returned so that we can create follow up evaluations for them. Allocs are
|
// at a future time are also returned so that we can create follow up evaluations for them. Allocs are
|
||||||
// skipped or considered untainted according to logic defined in shouldFilter method.
|
// skipped or considered untainted according to logic defined in shouldFilter method.
|
||||||
func (a allocSet) filterByRescheduleable(isBatch, isDisconnecting bool, now time.Time, evalID string, deployment *structs.Deployment) (untainted, rescheduleNow allocSet, rescheduleLater []*delayedRescheduleInfo) {
|
func (a allocSet) filterByRescheduleable(isBatch, isDisconnecting bool, now time.Time, evalID string, deployment *structs.Deployment) (allocSet, allocSet, []*delayedRescheduleInfo) {
|
||||||
untainted = make(map[string]*structs.Allocation)
|
untainted := make(map[string]*structs.Allocation)
|
||||||
rescheduleNow = make(map[string]*structs.Allocation)
|
rescheduleNow := make(map[string]*structs.Allocation)
|
||||||
|
rescheduleLater := []*delayedRescheduleInfo{}
|
||||||
|
|
||||||
// When filtering disconnected sets, the untainted set is never populated.
|
|
||||||
// It has no purpose in that context.
|
|
||||||
for _, alloc := range a {
|
for _, alloc := range a {
|
||||||
// Ignore disconnecting allocs that are already unknown. This can happen
|
// Ignore disconnecting allocs that are already unknown. This can happen
|
||||||
// in the case of canaries that are interrupted by a disconnect.
|
// in the case of canaries that are interrupted by a disconnect.
|
||||||
|
@ -393,25 +402,27 @@ func (a allocSet) filterByRescheduleable(isBatch, isDisconnecting bool, now time
|
||||||
if isUntainted && !isDisconnecting {
|
if isUntainted && !isDisconnecting {
|
||||||
untainted[alloc.ID] = alloc
|
untainted[alloc.ID] = alloc
|
||||||
}
|
}
|
||||||
if isUntainted || ignore {
|
|
||||||
|
if ignore {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// Only failed allocs with desired state run get to this point
|
|
||||||
// If the failed alloc is not eligible for rescheduling now we
|
|
||||||
// add it to the untainted set. Disconnecting delay evals are
|
|
||||||
// handled by allocReconciler.createTimeoutLaterEvals
|
|
||||||
eligibleNow, eligibleLater, rescheduleTime = updateByReschedulable(alloc, now, evalID, deployment, isDisconnecting)
|
eligibleNow, eligibleLater, rescheduleTime = updateByReschedulable(alloc, now, evalID, deployment, isDisconnecting)
|
||||||
if !isDisconnecting && !eligibleNow {
|
if eligibleNow {
|
||||||
untainted[alloc.ID] = alloc
|
|
||||||
if eligibleLater {
|
|
||||||
rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime})
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
rescheduleNow[alloc.ID] = alloc
|
rescheduleNow[alloc.ID] = alloc
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If the failed alloc is not eligible for rescheduling now we
|
||||||
|
// add it to the untainted set.
|
||||||
|
untainted[alloc.ID] = alloc
|
||||||
|
|
||||||
|
if eligibleLater {
|
||||||
|
rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime})
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
return
|
return untainted, rescheduleNow, rescheduleLater
|
||||||
}
|
}
|
||||||
|
|
||||||
// shouldFilter returns whether the alloc should be ignored or considered untainted.
|
// shouldFilter returns whether the alloc should be ignored or considered untainted.
|
||||||
|
@ -436,32 +447,31 @@ func shouldFilter(alloc *structs.Allocation, isBatch bool) (untainted, ignore bo
|
||||||
if alloc.RanSuccessfully() {
|
if alloc.RanSuccessfully() {
|
||||||
return true, false
|
return true, false
|
||||||
}
|
}
|
||||||
|
|
||||||
return false, true
|
return false, true
|
||||||
case structs.AllocDesiredStatusEvict:
|
case structs.AllocDesiredStatusEvict:
|
||||||
return false, true
|
return false, true
|
||||||
default:
|
|
||||||
}
|
}
|
||||||
|
|
||||||
switch alloc.ClientStatus {
|
switch alloc.ClientStatus {
|
||||||
case structs.AllocClientStatusFailed:
|
case structs.AllocClientStatusFailed:
|
||||||
default:
|
return false, false
|
||||||
return true, false
|
|
||||||
}
|
}
|
||||||
return false, false
|
|
||||||
|
return true, false
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle service jobs
|
// Handle service jobs
|
||||||
switch alloc.DesiredStatus {
|
switch alloc.DesiredStatus {
|
||||||
case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
|
case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
|
||||||
return false, true
|
return false, true
|
||||||
default:
|
|
||||||
}
|
}
|
||||||
|
|
||||||
switch alloc.ClientStatus {
|
switch alloc.ClientStatus {
|
||||||
case structs.AllocClientStatusComplete, structs.AllocClientStatusLost:
|
case structs.AllocClientStatusComplete, structs.AllocClientStatusLost:
|
||||||
return false, true
|
return false, true
|
||||||
default:
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return false, false
|
return false, false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -481,9 +491,15 @@ func updateByReschedulable(alloc *structs.Allocation, now time.Time, evalID stri
|
||||||
|
|
||||||
// Reschedule if the eval ID matches the alloc's followup evalID or if it's close to its reschedule time
|
// Reschedule if the eval ID matches the alloc's followup evalID or if it's close to its reschedule time
|
||||||
var eligible bool
|
var eligible bool
|
||||||
if isDisconnecting {
|
switch {
|
||||||
rescheduleTime, eligible = alloc.NextRescheduleTimeByFailTime(now)
|
case isDisconnecting:
|
||||||
} else {
|
rescheduleTime, eligible = alloc.NextRescheduleTimeByTime(now)
|
||||||
|
|
||||||
|
case alloc.ClientStatus == structs.AllocClientStatusUnknown && alloc.FollowupEvalID == evalID:
|
||||||
|
lastDisconnectTime := alloc.LastUnknown()
|
||||||
|
rescheduleTime, eligible = alloc.NextRescheduleTimeByTime(lastDisconnectTime)
|
||||||
|
|
||||||
|
default:
|
||||||
rescheduleTime, eligible = alloc.NextRescheduleTime()
|
rescheduleTime, eligible = alloc.NextRescheduleTime()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -491,9 +507,11 @@ func updateByReschedulable(alloc *structs.Allocation, now time.Time, evalID stri
|
||||||
rescheduleNow = true
|
rescheduleNow = true
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if eligible && alloc.FollowupEvalID == "" {
|
|
||||||
|
if eligible && (alloc.FollowupEvalID == "" || isDisconnecting) {
|
||||||
rescheduleLater = true
|
rescheduleLater = true
|
||||||
}
|
}
|
||||||
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -547,12 +565,13 @@ func (a allocSet) delayByStopAfterClientDisconnect() (later []*delayedReschedule
|
||||||
|
|
||||||
// delayByMaxClientDisconnect returns a delay for any unknown allocation
|
// delayByMaxClientDisconnect returns a delay for any unknown allocation
|
||||||
// that's got a max_client_disconnect configured
|
// that's got a max_client_disconnect configured
|
||||||
func (a allocSet) delayByMaxClientDisconnect(now time.Time) (later []*delayedRescheduleInfo, err error) {
|
func (a allocSet) delayByMaxClientDisconnect(now time.Time) ([]*delayedRescheduleInfo, error) {
|
||||||
|
var later []*delayedRescheduleInfo
|
||||||
|
|
||||||
for _, alloc := range a {
|
for _, alloc := range a {
|
||||||
timeout := alloc.DisconnectTimeout(now)
|
timeout := alloc.DisconnectTimeout(now)
|
||||||
|
|
||||||
if !timeout.After(now) {
|
if !timeout.After(now) {
|
||||||
continue
|
return nil, errors.New("unable to computing disconnecting timeouts")
|
||||||
}
|
}
|
||||||
|
|
||||||
later = append(later, &delayedRescheduleInfo{
|
later = append(later, &delayedRescheduleInfo{
|
||||||
|
@ -562,7 +581,19 @@ func (a allocSet) delayByMaxClientDisconnect(now time.Time) (later []*delayedRes
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
return
|
return later, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// filterOutByClientStatus returns all allocs from the set without the specified client status.
|
||||||
|
func (a allocSet) filterOutByClientStatus(clientStatus string) allocSet {
|
||||||
|
allocs := make(allocSet)
|
||||||
|
for _, alloc := range a {
|
||||||
|
if alloc.ClientStatus != clientStatus {
|
||||||
|
allocs[alloc.ID] = alloc
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return allocs
|
||||||
}
|
}
|
||||||
|
|
||||||
// filterByClientStatus returns allocs from the set with the specified client status.
|
// filterByClientStatus returns allocs from the set with the specified client status.
|
||||||
|
|
|
@ -237,7 +237,6 @@ func TestAllocSet_filterByTainted(t *testing.T) {
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
name: "disco-client-disconnect-unset-max-disconnect",
|
name: "disco-client-disconnect-unset-max-disconnect",
|
||||||
supportsDisconnectedClients: true,
|
supportsDisconnectedClients: true,
|
||||||
|
@ -273,7 +272,6 @@ func TestAllocSet_filterByTainted(t *testing.T) {
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|
||||||
// Everything below this line tests the disconnected client mode.
|
// Everything below this line tests the disconnected client mode.
|
||||||
{
|
{
|
||||||
name: "disco-client-untainted-reconnect-failed-and-replaced",
|
name: "disco-client-untainted-reconnect-failed-and-replaced",
|
||||||
|
@ -381,10 +379,10 @@ func TestAllocSet_filterByTainted(t *testing.T) {
|
||||||
taintedNodes: nodes,
|
taintedNodes: nodes,
|
||||||
skipNilNodeTest: false,
|
skipNilNodeTest: false,
|
||||||
all: allocSet{
|
all: allocSet{
|
||||||
// Allocs on reconnected nodes that are complete are untainted
|
// Allocs on reconnected nodes that are complete are ignored
|
||||||
"untainted-reconnect-complete": {
|
"ignored-reconnect-complete": {
|
||||||
ID: "untainted-reconnect-complete",
|
ID: "ignored-reconnect-complete",
|
||||||
Name: "untainted-reconnect-complete",
|
Name: "ignored-reconnect-complete",
|
||||||
ClientStatus: structs.AllocClientStatusComplete,
|
ClientStatus: structs.AllocClientStatusComplete,
|
||||||
DesiredStatus: structs.AllocDesiredStatusRun,
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
||||||
Job: testJob,
|
Job: testJob,
|
||||||
|
@ -405,9 +403,9 @@ func TestAllocSet_filterByTainted(t *testing.T) {
|
||||||
AllocStates: unknownAllocState,
|
AllocStates: unknownAllocState,
|
||||||
},
|
},
|
||||||
// Lost allocs on reconnected nodes don't get restarted
|
// Lost allocs on reconnected nodes don't get restarted
|
||||||
"untainted-reconnect-lost": {
|
"ignored-reconnect-lost": {
|
||||||
ID: "untainted-reconnect-lost",
|
ID: "ignored-reconnect-lost",
|
||||||
Name: "untainted-reconnect-lost",
|
Name: "ignored-reconnect-lost",
|
||||||
ClientStatus: structs.AllocClientStatusLost,
|
ClientStatus: structs.AllocClientStatusLost,
|
||||||
DesiredStatus: structs.AllocDesiredStatusStop,
|
DesiredStatus: structs.AllocDesiredStatusStop,
|
||||||
Job: testJob,
|
Job: testJob,
|
||||||
|
@ -415,10 +413,10 @@ func TestAllocSet_filterByTainted(t *testing.T) {
|
||||||
TaskGroup: "web",
|
TaskGroup: "web",
|
||||||
AllocStates: unknownAllocState,
|
AllocStates: unknownAllocState,
|
||||||
},
|
},
|
||||||
// Replacement allocs that are complete are untainted
|
// Replacement allocs that are complete are ignored
|
||||||
"untainted-reconnect-complete-replacement": {
|
"ignored-reconnect-complete-replacement": {
|
||||||
ID: "untainted-reconnect-complete-replacement",
|
ID: "ignored-reconnect-complete-replacement",
|
||||||
Name: "untainted-reconnect-complete",
|
Name: "ignored-reconnect-complete",
|
||||||
ClientStatus: structs.AllocClientStatusComplete,
|
ClientStatus: structs.AllocClientStatusComplete,
|
||||||
DesiredStatus: structs.AllocDesiredStatusRun,
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
||||||
Job: testJob,
|
Job: testJob,
|
||||||
|
@ -427,10 +425,10 @@ func TestAllocSet_filterByTainted(t *testing.T) {
|
||||||
AllocStates: unknownAllocState,
|
AllocStates: unknownAllocState,
|
||||||
PreviousAllocation: "untainted-reconnect-complete",
|
PreviousAllocation: "untainted-reconnect-complete",
|
||||||
},
|
},
|
||||||
// Replacement allocs on reconnected nodes that are failed are untainted
|
// Replacement allocs on reconnected nodes that are failed are ignored
|
||||||
"untainted-reconnect-failed-replacement": {
|
"ignored-reconnect-failed-replacement": {
|
||||||
ID: "untainted-reconnect-failed-replacement",
|
ID: "ignored-reconnect-failed-replacement",
|
||||||
Name: "untainted-reconnect-failed",
|
Name: "ignored-reconnect-failed",
|
||||||
ClientStatus: structs.AllocClientStatusFailed,
|
ClientStatus: structs.AllocClientStatusFailed,
|
||||||
DesiredStatus: structs.AllocDesiredStatusStop,
|
DesiredStatus: structs.AllocDesiredStatusStop,
|
||||||
Job: testJob,
|
Job: testJob,
|
||||||
|
@ -439,63 +437,9 @@ func TestAllocSet_filterByTainted(t *testing.T) {
|
||||||
PreviousAllocation: "reconnecting-failed",
|
PreviousAllocation: "reconnecting-failed",
|
||||||
},
|
},
|
||||||
// Lost replacement allocs on reconnected nodes don't get restarted
|
// Lost replacement allocs on reconnected nodes don't get restarted
|
||||||
"untainted-reconnect-lost-replacement": {
|
"ignored-reconnect-lost-replacement": {
|
||||||
ID: "untainted-reconnect-lost-replacement",
|
ID: "ignored-reconnect-lost-replacement",
|
||||||
Name: "untainted-reconnect-lost",
|
Name: "ignored-reconnect-lost",
|
||||||
ClientStatus: structs.AllocClientStatusLost,
|
|
||||||
DesiredStatus: structs.AllocDesiredStatusStop,
|
|
||||||
Job: testJob,
|
|
||||||
NodeID: "normal",
|
|
||||||
TaskGroup: "web",
|
|
||||||
AllocStates: unknownAllocState,
|
|
||||||
PreviousAllocation: "untainted-reconnect-lost",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
untainted: allocSet{
|
|
||||||
"untainted-reconnect-complete": {
|
|
||||||
ID: "untainted-reconnect-complete",
|
|
||||||
Name: "untainted-reconnect-complete",
|
|
||||||
ClientStatus: structs.AllocClientStatusComplete,
|
|
||||||
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
||||||
Job: testJob,
|
|
||||||
NodeID: "normal",
|
|
||||||
TaskGroup: "web",
|
|
||||||
AllocStates: unknownAllocState,
|
|
||||||
},
|
|
||||||
"untainted-reconnect-lost": {
|
|
||||||
ID: "untainted-reconnect-lost",
|
|
||||||
Name: "untainted-reconnect-lost",
|
|
||||||
ClientStatus: structs.AllocClientStatusLost,
|
|
||||||
DesiredStatus: structs.AllocDesiredStatusStop,
|
|
||||||
Job: testJob,
|
|
||||||
NodeID: "normal",
|
|
||||||
TaskGroup: "web",
|
|
||||||
AllocStates: unknownAllocState,
|
|
||||||
},
|
|
||||||
"untainted-reconnect-complete-replacement": {
|
|
||||||
ID: "untainted-reconnect-complete-replacement",
|
|
||||||
Name: "untainted-reconnect-complete",
|
|
||||||
ClientStatus: structs.AllocClientStatusComplete,
|
|
||||||
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
||||||
Job: testJob,
|
|
||||||
NodeID: "normal",
|
|
||||||
TaskGroup: "web",
|
|
||||||
AllocStates: unknownAllocState,
|
|
||||||
PreviousAllocation: "untainted-reconnect-complete",
|
|
||||||
},
|
|
||||||
"untainted-reconnect-failed-replacement": {
|
|
||||||
ID: "untainted-reconnect-failed-replacement",
|
|
||||||
Name: "untainted-reconnect-failed",
|
|
||||||
ClientStatus: structs.AllocClientStatusFailed,
|
|
||||||
DesiredStatus: structs.AllocDesiredStatusStop,
|
|
||||||
Job: testJob,
|
|
||||||
NodeID: "normal",
|
|
||||||
TaskGroup: "web",
|
|
||||||
PreviousAllocation: "reconnecting-failed",
|
|
||||||
},
|
|
||||||
"untainted-reconnect-lost-replacement": {
|
|
||||||
ID: "untainted-reconnect-lost-replacement",
|
|
||||||
Name: "untainted-reconnect-lost",
|
|
||||||
ClientStatus: structs.AllocClientStatusLost,
|
ClientStatus: structs.AllocClientStatusLost,
|
||||||
DesiredStatus: structs.AllocDesiredStatusStop,
|
DesiredStatus: structs.AllocDesiredStatusStop,
|
||||||
Job: testJob,
|
Job: testJob,
|
||||||
|
@ -505,6 +449,7 @@ func TestAllocSet_filterByTainted(t *testing.T) {
|
||||||
PreviousAllocation: "untainted-reconnect-lost",
|
PreviousAllocation: "untainted-reconnect-lost",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
untainted: allocSet{},
|
||||||
migrate: allocSet{},
|
migrate: allocSet{},
|
||||||
disconnecting: allocSet{},
|
disconnecting: allocSet{},
|
||||||
reconnecting: allocSet{
|
reconnecting: allocSet{
|
||||||
|
@ -519,8 +464,62 @@ func TestAllocSet_filterByTainted(t *testing.T) {
|
||||||
AllocStates: unknownAllocState,
|
AllocStates: unknownAllocState,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
ignore: allocSet{},
|
ignore: allocSet{
|
||||||
lost: allocSet{},
|
|
||||||
|
"ignored-reconnect-complete": {
|
||||||
|
ID: "ignored-reconnect-complete",
|
||||||
|
Name: "ignored-reconnect-complete",
|
||||||
|
ClientStatus: structs.AllocClientStatusComplete,
|
||||||
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
||||||
|
Job: testJob,
|
||||||
|
NodeID: "normal",
|
||||||
|
TaskGroup: "web",
|
||||||
|
AllocStates: unknownAllocState,
|
||||||
|
},
|
||||||
|
"ignored-reconnect-lost": {
|
||||||
|
ID: "ignored-reconnect-lost",
|
||||||
|
Name: "ignored-reconnect-lost",
|
||||||
|
ClientStatus: structs.AllocClientStatusLost,
|
||||||
|
DesiredStatus: structs.AllocDesiredStatusStop,
|
||||||
|
Job: testJob,
|
||||||
|
NodeID: "normal",
|
||||||
|
TaskGroup: "web",
|
||||||
|
AllocStates: unknownAllocState,
|
||||||
|
},
|
||||||
|
"ignored-reconnect-complete-replacement": {
|
||||||
|
ID: "ignored-reconnect-complete-replacement",
|
||||||
|
Name: "ignored-reconnect-complete",
|
||||||
|
ClientStatus: structs.AllocClientStatusComplete,
|
||||||
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
||||||
|
Job: testJob,
|
||||||
|
NodeID: "normal",
|
||||||
|
TaskGroup: "web",
|
||||||
|
AllocStates: unknownAllocState,
|
||||||
|
PreviousAllocation: "untainted-reconnect-complete",
|
||||||
|
},
|
||||||
|
"ignored-reconnect-failed-replacement": {
|
||||||
|
ID: "ignored-reconnect-failed-replacement",
|
||||||
|
Name: "ignored-reconnect-failed",
|
||||||
|
ClientStatus: structs.AllocClientStatusFailed,
|
||||||
|
DesiredStatus: structs.AllocDesiredStatusStop,
|
||||||
|
Job: testJob,
|
||||||
|
NodeID: "normal",
|
||||||
|
TaskGroup: "web",
|
||||||
|
PreviousAllocation: "reconnecting-failed",
|
||||||
|
},
|
||||||
|
"ignored-reconnect-lost-replacement": {
|
||||||
|
ID: "ignored-reconnect-lost-replacement",
|
||||||
|
Name: "ignored-reconnect-lost",
|
||||||
|
ClientStatus: structs.AllocClientStatusLost,
|
||||||
|
DesiredStatus: structs.AllocDesiredStatusStop,
|
||||||
|
Job: testJob,
|
||||||
|
NodeID: "normal",
|
||||||
|
TaskGroup: "web",
|
||||||
|
AllocStates: unknownAllocState,
|
||||||
|
PreviousAllocation: "untainted-reconnect-lost",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
lost: allocSet{},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "disco-client-disconnect",
|
name: "disco-client-disconnect",
|
||||||
|
@ -539,10 +538,10 @@ func TestAllocSet_filterByTainted(t *testing.T) {
|
||||||
NodeID: "disconnected",
|
NodeID: "disconnected",
|
||||||
TaskGroup: "web",
|
TaskGroup: "web",
|
||||||
},
|
},
|
||||||
// Unknown allocs on disconnected nodes are ignored
|
// Unknown allocs on disconnected nodes are acknowledged, so they won't be rescheduled again
|
||||||
"ignore-unknown": {
|
"untainted-unknown": {
|
||||||
ID: "ignore-unknown",
|
ID: "untainted-unknown",
|
||||||
Name: "ignore-unknown",
|
Name: "untainted-unknown",
|
||||||
ClientStatus: structs.AllocClientStatusUnknown,
|
ClientStatus: structs.AllocClientStatusUnknown,
|
||||||
DesiredStatus: structs.AllocDesiredStatusRun,
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
||||||
Job: testJob,
|
Job: testJob,
|
||||||
|
@ -595,8 +594,20 @@ func TestAllocSet_filterByTainted(t *testing.T) {
|
||||||
AllocStates: unknownAllocState,
|
AllocStates: unknownAllocState,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
untainted: allocSet{},
|
untainted: allocSet{
|
||||||
migrate: allocSet{},
|
// Unknown allocs on disconnected nodes are acknowledged, so they won't be rescheduled again
|
||||||
|
"untainted-unknown": {
|
||||||
|
ID: "untainted-unknown",
|
||||||
|
Name: "untainted-unknown",
|
||||||
|
ClientStatus: structs.AllocClientStatusUnknown,
|
||||||
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
||||||
|
Job: testJob,
|
||||||
|
NodeID: "disconnected",
|
||||||
|
TaskGroup: "web",
|
||||||
|
AllocStates: unknownAllocState,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
migrate: allocSet{},
|
||||||
disconnecting: allocSet{
|
disconnecting: allocSet{
|
||||||
"disconnect-running": {
|
"disconnect-running": {
|
||||||
ID: "disconnect-running",
|
ID: "disconnect-running",
|
||||||
|
@ -610,17 +621,6 @@ func TestAllocSet_filterByTainted(t *testing.T) {
|
||||||
},
|
},
|
||||||
reconnecting: allocSet{},
|
reconnecting: allocSet{},
|
||||||
ignore: allocSet{
|
ignore: allocSet{
|
||||||
// Unknown allocs on disconnected nodes are ignored
|
|
||||||
"ignore-unknown": {
|
|
||||||
ID: "ignore-unknown",
|
|
||||||
Name: "ignore-unknown",
|
|
||||||
ClientStatus: structs.AllocClientStatusUnknown,
|
|
||||||
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
||||||
Job: testJob,
|
|
||||||
NodeID: "disconnected",
|
|
||||||
TaskGroup: "web",
|
|
||||||
AllocStates: unknownAllocState,
|
|
||||||
},
|
|
||||||
"ignore-reconnected-failed-stopped": {
|
"ignore-reconnected-failed-stopped": {
|
||||||
ID: "ignore-reconnected-failed-stopped",
|
ID: "ignore-reconnected-failed-stopped",
|
||||||
Name: "ignore-reconnected-failed-stopped",
|
Name: "ignore-reconnected-failed-stopped",
|
||||||
|
@ -1167,3 +1167,202 @@ func Test_allocNameIndex_Next(t *testing.T) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestAllocSet_filterByRescheduleable(t *testing.T) {
|
||||||
|
ci.Parallel(t)
|
||||||
|
|
||||||
|
noRescheduleJob := mock.Job()
|
||||||
|
noRescheduleTG := &structs.TaskGroup{
|
||||||
|
Name: "noRescheduleTG",
|
||||||
|
ReschedulePolicy: &structs.ReschedulePolicy{
|
||||||
|
Attempts: 0,
|
||||||
|
Unlimited: false,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
noRescheduleJob.TaskGroups[0] = noRescheduleTG
|
||||||
|
|
||||||
|
testJob := mock.Job()
|
||||||
|
rescheduleTG := &structs.TaskGroup{
|
||||||
|
Name: "rescheduleTG",
|
||||||
|
ReschedulePolicy: &structs.ReschedulePolicy{
|
||||||
|
Attempts: 1,
|
||||||
|
Unlimited: false,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
testJob.TaskGroups[0] = rescheduleTG
|
||||||
|
|
||||||
|
now := time.Now()
|
||||||
|
|
||||||
|
type testCase struct {
|
||||||
|
name string
|
||||||
|
all allocSet
|
||||||
|
isBatch bool
|
||||||
|
supportsDisconnectedClients bool
|
||||||
|
isDisconnecting bool
|
||||||
|
deployment *structs.Deployment
|
||||||
|
|
||||||
|
// expected results
|
||||||
|
untainted allocSet
|
||||||
|
resNow allocSet
|
||||||
|
resLater []*delayedRescheduleInfo
|
||||||
|
}
|
||||||
|
|
||||||
|
testCases := []testCase{
|
||||||
|
{
|
||||||
|
name: "batch disconnecting allocation no reschedule",
|
||||||
|
isDisconnecting: true,
|
||||||
|
isBatch: true,
|
||||||
|
all: allocSet{
|
||||||
|
"untainted1": {
|
||||||
|
ID: "untainted1",
|
||||||
|
ClientStatus: structs.AllocClientStatusRunning,
|
||||||
|
Job: noRescheduleJob,
|
||||||
|
TaskGroup: "noRescheduleTG",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
untainted: allocSet{
|
||||||
|
"untainted1": {
|
||||||
|
ID: "untainted1",
|
||||||
|
ClientStatus: structs.AllocClientStatusRunning,
|
||||||
|
Job: noRescheduleJob,
|
||||||
|
TaskGroup: "noRescheduleTG",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
resNow: allocSet{},
|
||||||
|
resLater: []*delayedRescheduleInfo{},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "batch ignore unknown disconnecting allocs",
|
||||||
|
isDisconnecting: true,
|
||||||
|
isBatch: true,
|
||||||
|
all: allocSet{
|
||||||
|
"disconnecting1": {
|
||||||
|
ID: "disconnection1",
|
||||||
|
ClientStatus: structs.AllocClientStatusUnknown,
|
||||||
|
Job: testJob,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
untainted: allocSet{},
|
||||||
|
resNow: allocSet{},
|
||||||
|
resLater: []*delayedRescheduleInfo{},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "batch disconnecting allocation reschedule",
|
||||||
|
isDisconnecting: true,
|
||||||
|
isBatch: true,
|
||||||
|
all: allocSet{
|
||||||
|
"rescheduleNow1": {
|
||||||
|
ID: "rescheduleNow1",
|
||||||
|
ClientStatus: structs.AllocClientStatusRunning,
|
||||||
|
Job: testJob,
|
||||||
|
TaskGroup: "rescheduleTG",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
untainted: allocSet{},
|
||||||
|
resNow: allocSet{
|
||||||
|
"rescheduleNow1": {
|
||||||
|
ID: "rescheduleNow1",
|
||||||
|
ClientStatus: structs.AllocClientStatusRunning,
|
||||||
|
Job: testJob,
|
||||||
|
TaskGroup: "rescheduleTG",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
resLater: []*delayedRescheduleInfo{},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "service disconnecting allocation no reschedule",
|
||||||
|
isDisconnecting: true,
|
||||||
|
isBatch: false,
|
||||||
|
all: allocSet{
|
||||||
|
"untainted1": {
|
||||||
|
ID: "untainted1",
|
||||||
|
ClientStatus: structs.AllocClientStatusRunning,
|
||||||
|
Job: noRescheduleJob,
|
||||||
|
TaskGroup: "noRescheduleTG",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
untainted: allocSet{
|
||||||
|
"untainted1": {
|
||||||
|
ID: "untainted1",
|
||||||
|
ClientStatus: structs.AllocClientStatusRunning,
|
||||||
|
Job: noRescheduleJob,
|
||||||
|
TaskGroup: "noRescheduleTG",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
resNow: allocSet{},
|
||||||
|
resLater: []*delayedRescheduleInfo{},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "service disconnecting allocation reschedule",
|
||||||
|
isDisconnecting: true,
|
||||||
|
isBatch: false,
|
||||||
|
all: allocSet{
|
||||||
|
"rescheduleNow1": {
|
||||||
|
ID: "rescheduleNow1",
|
||||||
|
ClientStatus: structs.AllocClientStatusRunning,
|
||||||
|
Job: testJob,
|
||||||
|
TaskGroup: "rescheduleTG",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
untainted: allocSet{},
|
||||||
|
resNow: allocSet{
|
||||||
|
"rescheduleNow1": {
|
||||||
|
ID: "rescheduleNow1",
|
||||||
|
ClientStatus: structs.AllocClientStatusRunning,
|
||||||
|
Job: testJob,
|
||||||
|
TaskGroup: "rescheduleTG",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
resLater: []*delayedRescheduleInfo{},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "service ignore unknown disconnecting allocs",
|
||||||
|
isDisconnecting: true,
|
||||||
|
isBatch: false,
|
||||||
|
all: allocSet{
|
||||||
|
"disconnecting1": {
|
||||||
|
ID: "disconnection1",
|
||||||
|
ClientStatus: structs.AllocClientStatusUnknown,
|
||||||
|
Job: testJob,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
untainted: allocSet{},
|
||||||
|
resNow: allocSet{},
|
||||||
|
resLater: []*delayedRescheduleInfo{},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "service running allocation no reschedule",
|
||||||
|
isDisconnecting: false,
|
||||||
|
isBatch: true,
|
||||||
|
all: allocSet{
|
||||||
|
"untainted1": {
|
||||||
|
ID: "untainted1",
|
||||||
|
ClientStatus: structs.AllocClientStatusRunning,
|
||||||
|
Job: noRescheduleJob,
|
||||||
|
TaskGroup: "noRescheduleTG",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
untainted: allocSet{
|
||||||
|
"untainted1": {
|
||||||
|
ID: "untainted1",
|
||||||
|
ClientStatus: structs.AllocClientStatusRunning,
|
||||||
|
Job: noRescheduleJob,
|
||||||
|
TaskGroup: "noRescheduleTG",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
resNow: allocSet{},
|
||||||
|
resLater: []*delayedRescheduleInfo{},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range testCases {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
untainted, resNow, resLater := tc.all.filterByRescheduleable(tc.isBatch,
|
||||||
|
tc.isDisconnecting, now, "evailID", tc.deployment)
|
||||||
|
must.Eq(t, tc.untainted, untainted, must.Sprintf("with-nodes: untainted"))
|
||||||
|
must.Eq(t, tc.resNow, resNow, must.Sprintf("with-nodes: reschedule-now"))
|
||||||
|
must.Eq(t, tc.resLater, resLater, must.Sprintf("with-nodes: rescheduleLater"))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue