From 5e309f3f33d12c1646fa37db06363ddfdb709c0a Mon Sep 17 00:00:00 2001 From: Derek Strickland <1111455+DerekStrickland@users.noreply.github.com> Date: Thu, 21 Apr 2022 10:05:58 -0400 Subject: [PATCH] reconciler: Handle canaries when client disconnects (#12539) * plan_apply: Allow node updates in disconnected node plans * plan: Keep the job when persisting unknown allocs * reconciler: stop unknown allocs when stopping all * reconcile_util: reorder filtering to handle canaries; skip rescheduling unknown * heartbeat: Fix bug in node heartbeating --- nomad/heartbeat.go | 5 +- nomad/node_endpoint.go | 3 +- nomad/plan_apply.go | 4 - nomad/structs/structs.go | 2 - scheduler/reconcile.go | 7 +- scheduler/reconcile_test.go | 364 +++++++++++++++++++++++++++++++ scheduler/reconcile_util.go | 89 +++++--- scheduler/reconcile_util_test.go | 351 ++++++++++++++++++----------- 8 files changed, 645 insertions(+), 180 deletions(-) diff --git a/nomad/heartbeat.go b/nomad/heartbeat.go index e6dbb655f..e76b39541 100644 --- a/nomad/heartbeat.go +++ b/nomad/heartbeat.go @@ -167,7 +167,6 @@ func (h *nodeHeartbeater) invalidateHeartbeat(id string) { if canDisconnect && hasPendingReconnects { req.Status = structs.NodeStatusDisconnected } - var resp structs.NodeUpdateResponse if err := h.staticEndpoints.Node.UpdateStatus(&req, &resp); err != nil { h.logger.Error("update node status failed", "error", err) @@ -181,8 +180,8 @@ func (h *nodeHeartbeater) disconnectState(id string) (bool, bool) { return false, false } - // Exit if this is the node already in a state other than ready. - if node.Status != structs.NodeStatusReady { + // Exit if the node is already down or just initializing. + if node.Status == structs.NodeStatusDown || node.Status == structs.NodeStatusInit { return false, false } diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index fc3aa81cc..f1dbb04df 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -573,7 +573,8 @@ func nodeStatusTransitionRequiresEval(newStatus, oldStatus string) bool { initToReady := oldStatus == structs.NodeStatusInit && newStatus == structs.NodeStatusReady terminalToReady := oldStatus == structs.NodeStatusDown && newStatus == structs.NodeStatusReady disconnectedToOther := oldStatus == structs.NodeStatusDisconnected && newStatus != structs.NodeStatusDisconnected - return initToReady || terminalToReady || disconnectedToOther + otherToDisconnected := oldStatus != structs.NodeStatusDisconnected && newStatus == structs.NodeStatusDisconnected + return initToReady || terminalToReady || disconnectedToOther || otherToDisconnected } // UpdateDrain is used to update the drain mode of a client node diff --git a/nomad/plan_apply.go b/nomad/plan_apply.go index 555f235cd..c86e854a4 100644 --- a/nomad/plan_apply.go +++ b/nomad/plan_apply.go @@ -705,10 +705,6 @@ func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID stri // The plan is only valid for disconnected nodes if it only contains // updates to mark allocations as unknown. func isValidForDisconnectedNode(plan *structs.Plan, nodeID string) bool { - if len(plan.NodeUpdate[nodeID]) != 0 || len(plan.NodePreemptions[nodeID]) != 0 { - return false - } - for _, alloc := range plan.NodeAllocation[nodeID] { if alloc.ClientStatus != structs.AllocClientStatusUnknown { return false diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index ae8a87b5c..3a49f1322 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -11305,8 +11305,6 @@ func (p *Plan) AppendPreemptedAlloc(alloc *Allocation, preemptingAllocID string) // AppendUnknownAlloc marks an allocation as unknown. func (p *Plan) AppendUnknownAlloc(alloc *Allocation) { - // Strip the job as it's set once on the ApplyPlanResultRequest. - alloc.Job = nil // Strip the resources as they can be rebuilt. alloc.Resources = nil diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index d363912bb..cfdc0996d 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -344,12 +344,13 @@ func (a *allocReconciler) handleStop(m allocMatrix) { // filterAndStopAll stops all allocations in an allocSet. This is useful in when // stopping an entire job or task group. func (a *allocReconciler) filterAndStopAll(set allocSet) uint64 { - untainted, migrate, lost, disconnecting, reconnecting, _ := set.filterByTainted(a.taintedNodes, a.supportsDisconnectedClients, a.now) + untainted, migrate, lost, disconnecting, reconnecting, ignore := set.filterByTainted(a.taintedNodes, a.supportsDisconnectedClients, a.now) a.markStop(untainted, "", allocNotNeeded) a.markStop(migrate, "", allocNotNeeded) a.markStop(lost, structs.AllocClientStatusLost, allocLost) a.markStop(disconnecting, "", allocNotNeeded) a.markStop(reconnecting, "", allocNotNeeded) + a.markStop(ignore.filterByClientStatus(structs.AllocClientStatusUnknown), "", allocNotNeeded) return uint64(len(set)) } @@ -462,7 +463,6 @@ func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool { if isCanarying { untainted = untainted.difference(canaries) } - requiresCanaries := a.requiresCanaries(tg, dstate, destructive, canaries) if requiresCanaries { a.computeCanaries(tg, dstate, destructive, canaries, desiredChanges, nameIndex) @@ -619,6 +619,9 @@ func (a *allocReconciler) cancelUnneededCanaries(original allocSet, desiredChang canaries = all.fromKeys(canaryIDs) untainted, migrate, lost, _, _, _ := canaries.filterByTainted(a.taintedNodes, a.supportsDisconnectedClients, a.now) + // We don't add these stops to desiredChanges because the deployment is + // still active. DesiredChanges is used to report deployment progress/final + // state. These transient failures aren't meaningful. a.markStop(migrate, "", allocMigrating) a.markStop(lost, structs.AllocClientStatusLost, allocLost) diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index 783970042..3cb29cb90 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -5549,6 +5549,7 @@ func TestReconciler_Disconnected_Client(t *testing.T) { alloc.ClientStatus = tc.disconnectedAllocStatus // Set the node id on all the disconnected allocs to the node under test. alloc.NodeID = testNode.ID + alloc.NodeName = "disconnected" alloc.AllocStates = []*structs.AllocState{{ Field: structs.AllocStateFieldClientStatus, @@ -5631,3 +5632,366 @@ func TestReconciler_Disconnected_Client(t *testing.T) { }) } } + +// Tests that a client disconnect while a canary is in progress generates the result. +func TestReconciler_Client_Disconnect_Canaries(t *testing.T) { + + type testCase struct { + name string + nodes []string + deploymentState *structs.DeploymentState + deployedAllocs map[*structs.Node][]*structs.Allocation + canaryAllocs map[*structs.Node][]*structs.Allocation + expectedResult *resultExpectation + } + + running := structs.AllocClientStatusRunning + complete := structs.AllocClientStatusComplete + unknown := structs.AllocClientStatusUnknown + pending := structs.AllocClientStatusPending + run := structs.AllocDesiredStatusRun + stop := structs.AllocDesiredStatusStop + + maxClientDisconnect := 10 * time.Minute + + readyNode := mock.Node() + readyNode.Name = "ready-" + readyNode.ID + readyNode.Status = structs.NodeStatusReady + + disconnectedNode := mock.Node() + disconnectedNode.Name = "disconnected-" + disconnectedNode.ID + disconnectedNode.Status = structs.NodeStatusDisconnected + + // Job with allocations and max_client_disconnect + job := mock.Job() + + updatedJob := job.Copy() + updatedJob.Version = updatedJob.Version + 1 + + testCases := []testCase{ + { + name: "3-placed-1-disconnect", + deploymentState: &structs.DeploymentState{ + AutoRevert: false, + AutoPromote: false, + Promoted: false, + ProgressDeadline: 5 * time.Minute, + RequireProgressBy: time.Now().Add(5 * time.Minute), + PlacedCanaries: []string{}, + DesiredCanaries: 1, + DesiredTotal: 6, + PlacedAllocs: 3, + HealthyAllocs: 2, + UnhealthyAllocs: 0, + }, + deployedAllocs: map[*structs.Node][]*structs.Allocation{ + readyNode: { + // filtered as terminal + {Name: "my-job.web[0]", ClientStatus: complete, DesiredStatus: stop}, + // Ignored + {Name: "my-job.web[2]", ClientStatus: running, DesiredStatus: stop}, + // destructive, but discarded because canarying + {Name: "my-job.web[4]", ClientStatus: running, DesiredStatus: run}, + }, + disconnectedNode: { + // filtered as terminal + {Name: "my-job.web[1]", ClientStatus: complete, DesiredStatus: stop}, + // Gets a placement, and a disconnect update + {Name: "my-job.web[3]", ClientStatus: running, DesiredStatus: run}, + // Gets a placement, and a disconnect update + {Name: "my-job.web[5]", ClientStatus: running, DesiredStatus: run}, + }, + }, + canaryAllocs: map[*structs.Node][]*structs.Allocation{ + readyNode: { + // Ignored + {Name: "my-job.web[0]", ClientStatus: running, DesiredStatus: run}, + // Ignored + {Name: "my-job.web[2]", ClientStatus: pending, DesiredStatus: run}, + }, + disconnectedNode: { + // Gets a placement, and a disconnect update + {Name: "my-job.web[1]", ClientStatus: running, DesiredStatus: run}, + }, + }, + expectedResult: &resultExpectation{ + createDeployment: nil, + deploymentUpdates: nil, + place: 3, + destructive: 0, + stop: 0, + inplace: 0, + attributeUpdates: 0, + disconnectUpdates: 3, + reconnectUpdates: 0, + desiredTGUpdates: map[string]*structs.DesiredUpdates{ + updatedJob.TaskGroups[0].Name: { + Place: 3, + Canary: 0, + Ignore: 3, + }, + }, + }, + }, + { + name: "ignore-unknown", + deploymentState: &structs.DeploymentState{ + AutoRevert: false, + AutoPromote: false, + Promoted: false, + ProgressDeadline: 5 * time.Minute, + RequireProgressBy: time.Now().Add(5 * time.Minute), + PlacedCanaries: []string{}, + DesiredCanaries: 1, + DesiredTotal: 6, + PlacedAllocs: 3, + HealthyAllocs: 2, + UnhealthyAllocs: 0, + }, + deployedAllocs: map[*structs.Node][]*structs.Allocation{ + readyNode: { + // filtered as terminal + {Name: "my-job.web[0]", ClientStatus: complete, DesiredStatus: stop}, + // Ignored + {Name: "my-job.web[2]", ClientStatus: running, DesiredStatus: stop}, + // destructive, but discarded because canarying + {Name: "my-job.web[4]", ClientStatus: running, DesiredStatus: run}, + }, + disconnectedNode: { + // filtered as terminal + {Name: "my-job.web[1]", ClientStatus: complete, DesiredStatus: stop}, + // Gets a placement, and a disconnect update + {Name: "my-job.web[3]", ClientStatus: running, DesiredStatus: run}, + // Gets a placement, and a disconnect update + {Name: "my-job.web[5]", ClientStatus: running, DesiredStatus: run}, + }, + }, + canaryAllocs: map[*structs.Node][]*structs.Allocation{ + readyNode: { + // Ignored + {Name: "my-job.web[0]", ClientStatus: running, DesiredStatus: run}, + // Ignored + {Name: "my-job.web[2]", ClientStatus: pending, DesiredStatus: run}, + }, + disconnectedNode: { + // Ignored + {Name: "my-job.web[1]", ClientStatus: unknown, DesiredStatus: run}, + }, + }, + expectedResult: &resultExpectation{ + createDeployment: nil, + deploymentUpdates: nil, + place: 2, + destructive: 0, + stop: 0, + inplace: 0, + attributeUpdates: 0, + disconnectUpdates: 2, + reconnectUpdates: 0, + desiredTGUpdates: map[string]*structs.DesiredUpdates{ + updatedJob.TaskGroups[0].Name: { + Place: 2, + Canary: 0, + Ignore: 4, + }, + }, + }, + }, + { + name: "4-placed-2-pending-lost", + deploymentState: &structs.DeploymentState{ + AutoRevert: false, + AutoPromote: false, + Promoted: false, + ProgressDeadline: 5 * time.Minute, + RequireProgressBy: time.Now().Add(5 * time.Minute), + PlacedCanaries: []string{}, + DesiredCanaries: 2, + DesiredTotal: 6, + PlacedAllocs: 4, + HealthyAllocs: 2, + UnhealthyAllocs: 0, + }, + deployedAllocs: map[*structs.Node][]*structs.Allocation{ + readyNode: { + // filtered as terminal + {Name: "my-job.web[0]", ClientStatus: complete, DesiredStatus: stop}, + // filtered as terminal + {Name: "my-job.web[2]", ClientStatus: complete, DesiredStatus: stop}, + // destructive, but discarded because canarying + {Name: "my-job.web[4]", ClientStatus: running, DesiredStatus: run}, + }, + disconnectedNode: { + // filtered as terminal + {Name: "my-job.web[1]", ClientStatus: complete, DesiredStatus: stop}, + // Gets a placement, and a disconnect update + {Name: "my-job.web[3]", ClientStatus: running, DesiredStatus: run}, + // Gets a placement, and a disconnect update + {Name: "my-job.web[5]", ClientStatus: running, DesiredStatus: run}, + }, + }, + canaryAllocs: map[*structs.Node][]*structs.Allocation{ + readyNode: { + // Ignored + {Name: "my-job.web[0]", ClientStatus: running, DesiredStatus: run}, + // Ignored + {Name: "my-job.web[2]", ClientStatus: running, DesiredStatus: run}, + }, + disconnectedNode: { + // Stop/Lost because pending + {Name: "my-job.web[1]", ClientStatus: pending, DesiredStatus: run}, + // Stop/Lost because pending + {Name: "my-job.web[3]", ClientStatus: pending, DesiredStatus: run}, + }, + }, + expectedResult: &resultExpectation{ + createDeployment: nil, + deploymentUpdates: nil, + place: 2, + destructive: 0, + stop: 2, + inplace: 0, + attributeUpdates: 0, + disconnectUpdates: 2, + reconnectUpdates: 0, + desiredTGUpdates: map[string]*structs.DesiredUpdates{ + updatedJob.TaskGroups[0].Name: { + Place: 2, + Canary: 0, + Ignore: 3, + // The 2 stops in this test are transient failures, but + // the deployment can still progress. We don't include + // them in the stop count since DesiredTGUpdates is used + // to report deployment progress or final deployment state. + Stop: 0, + }, + }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Set the count dynamically to the number from the original deployment. + job.TaskGroups[0].Count = len(tc.deployedAllocs[readyNode]) + len(tc.deployedAllocs[disconnectedNode]) + job.TaskGroups[0].MaxClientDisconnect = &maxClientDisconnect + job.TaskGroups[0].Update = &structs.UpdateStrategy{ + MaxParallel: 1, + Canary: tc.deploymentState.DesiredCanaries, + MinHealthyTime: 3 * time.Second, + HealthyDeadline: 20 * time.Second, + AutoRevert: true, + AutoPromote: true, + } + + updatedJob.TaskGroups[0].Count = len(tc.deployedAllocs[readyNode]) + len(tc.deployedAllocs[disconnectedNode]) + updatedJob.TaskGroups[0].MaxClientDisconnect = &maxClientDisconnect + updatedJob.TaskGroups[0].Update = &structs.UpdateStrategy{ + MaxParallel: 1, + Canary: tc.deploymentState.DesiredCanaries, + MinHealthyTime: 3 * time.Second, + HealthyDeadline: 20 * time.Second, + AutoRevert: true, + AutoPromote: true, + } + + // Populate Alloc IDS, Node IDs, Job on deployed allocs + allocsConfigured := 0 + for node, allocs := range tc.deployedAllocs { + for _, alloc := range allocs { + alloc.ID = uuid.Generate() + alloc.NodeID = node.ID + alloc.NodeName = node.Name + alloc.JobID = job.ID + alloc.Job = job + alloc.TaskGroup = job.TaskGroups[0].Name + allocsConfigured++ + } + } + + require.Equal(t, tc.deploymentState.DesiredTotal, allocsConfigured, "invalid alloc configuration: expect %d got %d", tc.deploymentState.DesiredTotal, allocsConfigured) + + // Populate Alloc IDS, Node IDs, Job on canaries + canariesConfigured := 0 + handled := make(map[string]allocUpdateType) + for node, allocs := range tc.canaryAllocs { + for _, alloc := range allocs { + alloc.ID = uuid.Generate() + alloc.NodeID = node.ID + alloc.NodeName = node.Name + alloc.JobID = updatedJob.ID + alloc.Job = updatedJob + alloc.TaskGroup = updatedJob.TaskGroups[0].Name + alloc.DeploymentStatus = &structs.AllocDeploymentStatus{ + Canary: true, + } + if alloc.ClientStatus == structs.AllocClientStatusRunning { + alloc.DeploymentStatus.Healthy = helper.BoolToPtr(true) + } + tc.deploymentState.PlacedCanaries = append(tc.deploymentState.PlacedCanaries, alloc.ID) + handled[alloc.ID] = allocUpdateFnIgnore + canariesConfigured++ + } + } + + // Validate tc.canaryAllocs against tc.deploymentState + require.Equal(t, tc.deploymentState.PlacedAllocs, canariesConfigured, "invalid canary configuration: expect %d got %d", tc.deploymentState.PlacedAllocs, canariesConfigured) + + deployment := structs.NewDeployment(updatedJob, 50) + deployment.TaskGroups[updatedJob.TaskGroups[0].Name] = tc.deploymentState + + // Build a map of tainted nodes that contains the last canary + tainted := make(map[string]*structs.Node, 1) + tainted[disconnectedNode.ID] = disconnectedNode + + allocs := make([]*structs.Allocation, 0) + + allocs = append(allocs, tc.deployedAllocs[readyNode]...) + allocs = append(allocs, tc.deployedAllocs[disconnectedNode]...) + allocs = append(allocs, tc.canaryAllocs[readyNode]...) + allocs = append(allocs, tc.canaryAllocs[disconnectedNode]...) + + mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, updatedJob.ID, updatedJob, + deployment, allocs, tainted, "", 50, true) + result := reconciler.Compute() + + // Assert the correct results + assertResults(t, result, tc.expectedResult) + + // Validate that placements are either for placed canaries for the + // updated job, or for disconnected allocs for the original job + // and that they have a disconnect update. + for _, placeResult := range result.place { + found := false + require.NotNil(t, placeResult.previousAlloc) + for _, deployed := range tc.deployedAllocs[disconnectedNode] { + if deployed.ID == placeResult.previousAlloc.ID { + found = true + require.Equal(t, job.Version, placeResult.previousAlloc.Job.Version) + require.Equal(t, disconnectedNode.ID, placeResult.previousAlloc.NodeID) + _, exists := result.disconnectUpdates[placeResult.previousAlloc.ID] + require.True(t, exists) + break + } + } + for _, canary := range tc.canaryAllocs[disconnectedNode] { + if canary.ID == placeResult.previousAlloc.ID { + found = true + require.Equal(t, updatedJob.Version, placeResult.previousAlloc.Job.Version) + require.Equal(t, disconnectedNode.ID, placeResult.previousAlloc.NodeID) + _, exists := result.disconnectUpdates[placeResult.previousAlloc.ID] + require.True(t, exists) + break + } + } + require.True(t, found) + } + + // Validate that stops are for pending disconnects + for _, stopResult := range result.stop { + require.Equal(t, pending, stopResult.alloc.ClientStatus) + } + }) + } +} diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go index 395998ff2..4d0cfb7a8 100644 --- a/scheduler/reconcile_util.go +++ b/scheduler/reconcile_util.go @@ -225,7 +225,6 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS ignore = make(map[string]*structs.Allocation) for _, alloc := range a { - // make sure we don't apply any reconnect logic to task groups // without max_client_disconnect supportsDisconnectedClients := alloc.SupportsDisconnectedClients(serverSupportsDisconnectedClients) @@ -251,6 +250,40 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS continue } + taintedNode, nodeIsTainted := taintedNodes[alloc.NodeID] + if taintedNode != nil { + // Group disconnecting/reconnecting + switch taintedNode.Status { + case structs.NodeStatusDisconnected: + if supportsDisconnectedClients { + // Filter running allocs on a node that is disconnected to be marked as unknown. + if alloc.ClientStatus == structs.AllocClientStatusRunning { + disconnecting[alloc.ID] = alloc + continue + } + // Filter pending allocs on a node that is disconnected to be marked as lost. + if alloc.ClientStatus == structs.AllocClientStatusPending { + lost[alloc.ID] = alloc + continue + } + } else { + lost[alloc.ID] = alloc + continue + } + case structs.NodeStatusReady: + // Filter reconnecting allocs on a node that is now connected. + if reconnected { + if expired { + lost[alloc.ID] = alloc + continue + } + reconnecting[alloc.ID] = alloc + continue + } + default: + } + } + // Terminal allocs, if not reconnected, are always untainted as they // should never be migrated. if alloc.TerminalStatus() && !reconnected { @@ -287,8 +320,7 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS continue } - taintedNode, ok := taintedNodes[alloc.NodeID] - if !ok { + if !nodeIsTainted { // Filter allocs on a node that is now re-connected to be resumed. if reconnected { if expired { @@ -304,39 +336,6 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS continue } - if taintedNode != nil { - // Group disconnecting/reconnecting - switch taintedNode.Status { - case structs.NodeStatusDisconnected: - if supportsDisconnectedClients { - // Filter running allocs on a node that is disconnected to be marked as unknown. - if alloc.ClientStatus == structs.AllocClientStatusRunning { - disconnecting[alloc.ID] = alloc - continue - } - // Filter pending allocs on a node that is disconnected to be marked as lost. - if alloc.ClientStatus == structs.AllocClientStatusPending { - lost[alloc.ID] = alloc - continue - } - } else { - lost[alloc.ID] = alloc - continue - } - case structs.NodeStatusReady: - // Filter reconnecting allocs with replacements on a node that is now connected. - if reconnected { - if expired { - lost[alloc.ID] = alloc - continue - } - reconnecting[alloc.ID] = alloc - continue - } - default: - } - } - // Allocs on GC'd (nil) or lost nodes are Lost if taintedNode == nil || taintedNode.TerminalStatus() { lost[alloc.ID] = alloc @@ -361,6 +360,12 @@ func (a allocSet) filterByRescheduleable(isBatch, isDisconnecting bool, now time // When filtering disconnected sets, the untainted set is never populated. // It has no purpose in that context. for _, alloc := range a { + // Ignore disconnecting allocs that are already unknown. This can happen + // in the case of canaries that are interrupted by a disconnect. + if isDisconnecting && alloc.ClientStatus == structs.AllocClientStatusUnknown { + continue + } + var eligibleNow, eligibleLater bool var rescheduleTime time.Time @@ -557,6 +562,18 @@ func (a allocSet) delayByMaxClientDisconnect(now time.Time) (later []*delayedRes return } +// filterByClientStatus returns allocs from the set with the specified client status. +func (a allocSet) filterByClientStatus(clientStatus string) allocSet { + allocs := make(allocSet) + for _, alloc := range a { + if alloc.ClientStatus == clientStatus { + allocs[alloc.ID] = alloc + } + } + + return allocs +} + // allocNameIndex is used to select allocation names for placement or removal // given an existing set of placed allocations. type allocNameIndex struct { diff --git a/scheduler/reconcile_util_test.go b/scheduler/reconcile_util_test.go index 0b2dc0331..12d1c6b09 100644 --- a/scheduler/reconcile_util_test.go +++ b/scheduler/reconcile_util_test.go @@ -266,23 +266,13 @@ func TestAllocSet_filterByTainted(t *testing.T) { all: allocSet{ // Non-terminal allocs on disconnected nodes w/o max-disconnect are lost "lost-running": { - ID: "lost-running", - Name: "lost-running", - ClientStatus: structs.AllocClientStatusRunning, - Job: testJobNoMaxDisconnect, - NodeID: "disconnected", - TaskGroup: "web", - }, - // Unknown allocs on disconnected nodes w/o max-disconnect are lost - "lost-unknown": { - ID: "lost-unknown", - Name: "lost-unknown", - ClientStatus: structs.AllocClientStatusUnknown, + ID: "lost-running", + Name: "lost-running", + ClientStatus: structs.AllocClientStatusRunning, DesiredStatus: structs.AllocDesiredStatusRun, Job: testJobNoMaxDisconnect, NodeID: "disconnected", TaskGroup: "web", - AllocStates: unknownAllocState, }, }, untainted: allocSet{}, @@ -292,22 +282,13 @@ func TestAllocSet_filterByTainted(t *testing.T) { ignore: allocSet{}, lost: allocSet{ "lost-running": { - ID: "lost-running", - Name: "lost-running", - ClientStatus: structs.AllocClientStatusRunning, - Job: testJobNoMaxDisconnect, - NodeID: "disconnected", - TaskGroup: "web", - }, - "lost-unknown": { - ID: "lost-unknown", - Name: "lost-unknown", - ClientStatus: structs.AllocClientStatusUnknown, + ID: "lost-running", + Name: "lost-running", + ClientStatus: structs.AllocClientStatusRunning, DesiredStatus: structs.AllocDesiredStatusRun, Job: testJobNoMaxDisconnect, NodeID: "disconnected", TaskGroup: "web", - AllocStates: unknownAllocState, }, }, }, @@ -324,6 +305,7 @@ func TestAllocSet_filterByTainted(t *testing.T) { ID: "running-replacement", Name: "web", ClientStatus: structs.AllocClientStatusRunning, + DesiredStatus: structs.AllocDesiredStatusRun, Job: testJob, NodeID: "normal", TaskGroup: "web", @@ -349,6 +331,7 @@ func TestAllocSet_filterByTainted(t *testing.T) { ID: "running-replacement", Name: "web", ClientStatus: structs.AllocClientStatusRunning, + DesiredStatus: structs.AllocDesiredStatusRun, Job: testJob, NodeID: "normal", TaskGroup: "web", @@ -384,14 +367,15 @@ func TestAllocSet_filterByTainted(t *testing.T) { // Node.UpdateStatus has already handled syncing client state so this // should be a noop. "reconnecting-running-no-replacement": { - ID: "reconnecting-running-no-replacement", - Name: "web", - ClientStatus: structs.AllocClientStatusRunning, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - AllocStates: unknownAllocState, - TaskStates: reconnectTaskState, + ID: "reconnecting-running-no-replacement", + Name: "web", + ClientStatus: structs.AllocClientStatusRunning, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + AllocStates: unknownAllocState, + TaskStates: reconnectTaskState, }, }, untainted: allocSet{}, @@ -399,14 +383,15 @@ func TestAllocSet_filterByTainted(t *testing.T) { disconnecting: allocSet{}, reconnecting: allocSet{ "reconnecting-running-no-replacement": { - ID: "reconnecting-running-no-replacement", - Name: "web", - ClientStatus: structs.AllocClientStatusRunning, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - AllocStates: unknownAllocState, - TaskStates: reconnectTaskState, + ID: "reconnecting-running-no-replacement", + Name: "web", + ClientStatus: structs.AllocClientStatusRunning, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + AllocStates: unknownAllocState, + TaskStates: reconnectTaskState, }, }, ignore: allocSet{}, @@ -421,43 +406,47 @@ func TestAllocSet_filterByTainted(t *testing.T) { all: allocSet{ // Allocs on reconnected nodes that are complete are untainted "untainted-reconnect-complete": { - ID: "untainted-reconnect-complete", - Name: "untainted-reconnect-complete", - ClientStatus: structs.AllocClientStatusComplete, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - AllocStates: unknownAllocState, - TaskStates: reconnectTaskState, + ID: "untainted-reconnect-complete", + Name: "untainted-reconnect-complete", + ClientStatus: structs.AllocClientStatusComplete, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + AllocStates: unknownAllocState, + TaskStates: reconnectTaskState, }, // Failed allocs on reconnected nodes are in reconnecting so that // they be marked with desired status stop at the server. "reconnecting-failed": { - ID: "reconnecting-failed", - Name: "reconnecting-failed", - ClientStatus: structs.AllocClientStatusFailed, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - AllocStates: unknownAllocState, - TaskStates: reconnectTaskState, + ID: "reconnecting-failed", + Name: "reconnecting-failed", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + AllocStates: unknownAllocState, + TaskStates: reconnectTaskState, }, // Lost allocs on reconnected nodes don't get restarted "untainted-reconnect-lost": { - ID: "untainted-reconnect-lost", - Name: "untainted-reconnect-lost", - ClientStatus: structs.AllocClientStatusLost, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - AllocStates: unknownAllocState, - TaskStates: reconnectTaskState, + ID: "untainted-reconnect-lost", + Name: "untainted-reconnect-lost", + ClientStatus: structs.AllocClientStatusLost, + DesiredStatus: structs.AllocDesiredStatusStop, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + AllocStates: unknownAllocState, + TaskStates: reconnectTaskState, }, // Replacement allocs that are complete are untainted "untainted-reconnect-complete-replacement": { ID: "untainted-reconnect-complete-replacement", Name: "untainted-reconnect-complete", ClientStatus: structs.AllocClientStatusComplete, + DesiredStatus: structs.AllocDesiredStatusRun, Job: testJob, NodeID: "normal", TaskGroup: "web", @@ -469,6 +458,7 @@ func TestAllocSet_filterByTainted(t *testing.T) { ID: "untainted-reconnect-failed-replacement", Name: "untainted-reconnect-failed", ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusStop, Job: testJob, NodeID: "normal", TaskGroup: "web", @@ -480,6 +470,7 @@ func TestAllocSet_filterByTainted(t *testing.T) { ID: "untainted-reconnect-lost-replacement", Name: "untainted-reconnect-lost", ClientStatus: structs.AllocClientStatusLost, + DesiredStatus: structs.AllocDesiredStatusStop, Job: testJob, NodeID: "normal", TaskGroup: "web", @@ -489,29 +480,32 @@ func TestAllocSet_filterByTainted(t *testing.T) { }, untainted: allocSet{ "untainted-reconnect-complete": { - ID: "untainted-reconnect-complete", - Name: "untainted-reconnect-complete", - ClientStatus: structs.AllocClientStatusComplete, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - AllocStates: unknownAllocState, - TaskStates: reconnectTaskState, + ID: "untainted-reconnect-complete", + Name: "untainted-reconnect-complete", + ClientStatus: structs.AllocClientStatusComplete, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + AllocStates: unknownAllocState, + TaskStates: reconnectTaskState, }, "untainted-reconnect-lost": { - ID: "untainted-reconnect-lost", - Name: "untainted-reconnect-lost", - ClientStatus: structs.AllocClientStatusLost, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - AllocStates: unknownAllocState, - TaskStates: reconnectTaskState, + ID: "untainted-reconnect-lost", + Name: "untainted-reconnect-lost", + ClientStatus: structs.AllocClientStatusLost, + DesiredStatus: structs.AllocDesiredStatusStop, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + AllocStates: unknownAllocState, + TaskStates: reconnectTaskState, }, "untainted-reconnect-complete-replacement": { ID: "untainted-reconnect-complete-replacement", Name: "untainted-reconnect-complete", ClientStatus: structs.AllocClientStatusComplete, + DesiredStatus: structs.AllocDesiredStatusRun, Job: testJob, NodeID: "normal", TaskGroup: "web", @@ -522,6 +516,7 @@ func TestAllocSet_filterByTainted(t *testing.T) { ID: "untainted-reconnect-failed-replacement", Name: "untainted-reconnect-failed", ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusStop, Job: testJob, NodeID: "normal", TaskGroup: "web", @@ -532,6 +527,7 @@ func TestAllocSet_filterByTainted(t *testing.T) { ID: "untainted-reconnect-lost-replacement", Name: "untainted-reconnect-lost", ClientStatus: structs.AllocClientStatusLost, + DesiredStatus: structs.AllocDesiredStatusStop, Job: testJob, NodeID: "normal", TaskGroup: "web", @@ -543,14 +539,15 @@ func TestAllocSet_filterByTainted(t *testing.T) { disconnecting: allocSet{}, reconnecting: allocSet{ "reconnecting-failed": { - ID: "reconnecting-failed", - Name: "reconnecting-failed", - ClientStatus: structs.AllocClientStatusFailed, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - AllocStates: unknownAllocState, - TaskStates: reconnectTaskState, + ID: "reconnecting-failed", + Name: "reconnecting-failed", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + AllocStates: unknownAllocState, + TaskStates: reconnectTaskState, }, }, ignore: allocSet{}, @@ -565,12 +562,13 @@ func TestAllocSet_filterByTainted(t *testing.T) { all: allocSet{ // Non-terminal allocs on disconnected nodes are disconnecting "disconnect-running": { - ID: "disconnect-running", - Name: "disconnect-running", - ClientStatus: structs.AllocClientStatusRunning, - Job: testJob, - NodeID: "disconnected", - TaskGroup: "web", + ID: "disconnect-running", + Name: "disconnect-running", + ClientStatus: structs.AllocClientStatusRunning, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "disconnected", + TaskGroup: "web", }, // Unknown allocs on disconnected nodes are ignored "ignore-unknown": { @@ -585,13 +583,37 @@ func TestAllocSet_filterByTainted(t *testing.T) { }, // Unknown allocs on disconnected nodes are lost when expired "lost-unknown": { - ID: "lost-unknown", - Name: "lost-unknown", - ClientStatus: structs.AllocClientStatusUnknown, - Job: testJob, - NodeID: "disconnected", - TaskGroup: "web", - AllocStates: expiredAllocState, + ID: "lost-unknown", + Name: "lost-unknown", + ClientStatus: structs.AllocClientStatusUnknown, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "disconnected", + TaskGroup: "web", + AllocStates: expiredAllocState, + }, + // Pending allocs on disconnected nodes are lost + "lost-pending": { + ID: "lost-pending", + Name: "lost-pending", + ClientStatus: structs.AllocClientStatusPending, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "disconnected", + TaskGroup: "web", + }, + // Expired allocs on reconnected clients are lost + // Pending allocs on disconnected nodes are lost + "lost-expired": { + ID: "lost-expired", + Name: "lost-expired", + ClientStatus: structs.AllocClientStatusUnknown, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + TaskStates: reconnectTaskState, + AllocStates: expiredAllocState, }, // Failed and stopped allocs on disconnected nodes are ignored "ignore-reconnected-failed-stopped": { @@ -603,19 +625,20 @@ func TestAllocSet_filterByTainted(t *testing.T) { NodeID: "disconnected", TaskGroup: "web", TaskStates: reconnectTaskState, - AllocStates: unknownAllocState, // TODO really? + AllocStates: unknownAllocState, }, }, untainted: allocSet{}, migrate: allocSet{}, disconnecting: allocSet{ "disconnect-running": { - ID: "disconnect-running", - Name: "disconnect-running", - ClientStatus: structs.AllocClientStatusRunning, - Job: testJob, - NodeID: "disconnected", - TaskGroup: "web", + ID: "disconnect-running", + Name: "disconnect-running", + ClientStatus: structs.AllocClientStatusRunning, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "disconnected", + TaskGroup: "web", }, }, reconnecting: allocSet{}, @@ -645,13 +668,73 @@ func TestAllocSet_filterByTainted(t *testing.T) { }, lost: allocSet{ "lost-unknown": { - ID: "lost-unknown", - Name: "lost-unknown", - ClientStatus: structs.AllocClientStatusUnknown, - Job: testJob, - NodeID: "disconnected", - TaskGroup: "web", - AllocStates: expiredAllocState, + ID: "lost-unknown", + Name: "lost-unknown", + ClientStatus: structs.AllocClientStatusUnknown, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "disconnected", + TaskGroup: "web", + AllocStates: expiredAllocState, + }, + "lost-pending": { + ID: "lost-pending", + Name: "lost-pending", + ClientStatus: structs.AllocClientStatusPending, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "disconnected", + TaskGroup: "web", + }, + "lost-expired": { + ID: "lost-expired", + Name: "lost-expired", + ClientStatus: structs.AllocClientStatusUnknown, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + TaskStates: reconnectTaskState, + AllocStates: expiredAllocState, + }, + }, + }, + { + name: "disco-client-reconnect", + supportsDisconnectedClients: true, + now: time.Now(), + taintedNodes: nodes, + skipNilNodeTest: false, + all: allocSet{ + // Expired allocs on reconnected clients are lost + "lost-expired-reconnect": { + ID: "lost-expired-reconnect", + Name: "lost-expired-reconnect", + ClientStatus: structs.AllocClientStatusUnknown, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + TaskStates: reconnectTaskState, + AllocStates: expiredAllocState, + }, + }, + untainted: allocSet{}, + migrate: allocSet{}, + disconnecting: allocSet{}, + reconnecting: allocSet{}, + ignore: allocSet{}, + lost: allocSet{ + "lost-expired-reconnect": { + ID: "lost-expired-reconnect", + Name: "lost-expired-reconnect", + ClientStatus: structs.AllocClientStatusUnknown, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + TaskStates: reconnectTaskState, + AllocStates: expiredAllocState, }, }, }, @@ -666,6 +749,7 @@ func TestAllocSet_filterByTainted(t *testing.T) { ID: "running-replacement", Name: "web", ClientStatus: structs.AllocClientStatusRunning, + DesiredStatus: structs.AllocDesiredStatusRun, Job: testJob, NodeID: "normal", TaskGroup: "web", @@ -673,14 +757,15 @@ func TestAllocSet_filterByTainted(t *testing.T) { }, // Running and replaced allocs on reconnected nodes are reconnecting "running-original": { - ID: "running-original", - Name: "web", - ClientStatus: structs.AllocClientStatusRunning, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - AllocStates: unknownAllocState, - TaskStates: reconnectTaskState, + ID: "running-original", + Name: "web", + ClientStatus: structs.AllocClientStatusRunning, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + AllocStates: unknownAllocState, + TaskStates: reconnectTaskState, }, }, untainted: allocSet{ @@ -688,6 +773,7 @@ func TestAllocSet_filterByTainted(t *testing.T) { ID: "running-replacement", Name: "web", ClientStatus: structs.AllocClientStatusRunning, + DesiredStatus: structs.AllocDesiredStatusRun, Job: testJob, NodeID: "normal", TaskGroup: "web", @@ -698,14 +784,15 @@ func TestAllocSet_filterByTainted(t *testing.T) { disconnecting: allocSet{}, reconnecting: allocSet{ "running-original": { - ID: "running-original", - Name: "web", - ClientStatus: structs.AllocClientStatusRunning, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - AllocStates: unknownAllocState, - TaskStates: reconnectTaskState, + ID: "running-original", + Name: "web", + ClientStatus: structs.AllocClientStatusRunning, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + AllocStates: unknownAllocState, + TaskStates: reconnectTaskState, }, }, ignore: allocSet{},