reconciler: Handle canaries when client disconnects (#12539)

* plan_apply: Allow node updates in disconnected node plans
* plan: Keep the job when persisting unknown allocs
* reconciler: stop unknown allocs when stopping all
* reconcile_util: reorder filtering to handle canaries; skip rescheduling unknown
* heartbeat: Fix bug in node heartbeating
This commit is contained in:
Derek Strickland 2022-04-21 10:05:58 -04:00 committed by GitHub
parent 2ad9f6bc5f
commit 5e309f3f33
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 645 additions and 180 deletions

View File

@ -167,7 +167,6 @@ func (h *nodeHeartbeater) invalidateHeartbeat(id string) {
if canDisconnect && hasPendingReconnects {
req.Status = structs.NodeStatusDisconnected
}
var resp structs.NodeUpdateResponse
if err := h.staticEndpoints.Node.UpdateStatus(&req, &resp); err != nil {
h.logger.Error("update node status failed", "error", err)
@ -181,8 +180,8 @@ func (h *nodeHeartbeater) disconnectState(id string) (bool, bool) {
return false, false
}
// Exit if this is the node already in a state other than ready.
if node.Status != structs.NodeStatusReady {
// Exit if the node is already down or just initializing.
if node.Status == structs.NodeStatusDown || node.Status == structs.NodeStatusInit {
return false, false
}

View File

@ -573,7 +573,8 @@ func nodeStatusTransitionRequiresEval(newStatus, oldStatus string) bool {
initToReady := oldStatus == structs.NodeStatusInit && newStatus == structs.NodeStatusReady
terminalToReady := oldStatus == structs.NodeStatusDown && newStatus == structs.NodeStatusReady
disconnectedToOther := oldStatus == structs.NodeStatusDisconnected && newStatus != structs.NodeStatusDisconnected
return initToReady || terminalToReady || disconnectedToOther
otherToDisconnected := oldStatus != structs.NodeStatusDisconnected && newStatus == structs.NodeStatusDisconnected
return initToReady || terminalToReady || disconnectedToOther || otherToDisconnected
}
// UpdateDrain is used to update the drain mode of a client node

View File

@ -705,10 +705,6 @@ func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID stri
// The plan is only valid for disconnected nodes if it only contains
// updates to mark allocations as unknown.
func isValidForDisconnectedNode(plan *structs.Plan, nodeID string) bool {
if len(plan.NodeUpdate[nodeID]) != 0 || len(plan.NodePreemptions[nodeID]) != 0 {
return false
}
for _, alloc := range plan.NodeAllocation[nodeID] {
if alloc.ClientStatus != structs.AllocClientStatusUnknown {
return false

View File

@ -11305,8 +11305,6 @@ func (p *Plan) AppendPreemptedAlloc(alloc *Allocation, preemptingAllocID string)
// AppendUnknownAlloc marks an allocation as unknown.
func (p *Plan) AppendUnknownAlloc(alloc *Allocation) {
// Strip the job as it's set once on the ApplyPlanResultRequest.
alloc.Job = nil
// Strip the resources as they can be rebuilt.
alloc.Resources = nil

View File

@ -344,12 +344,13 @@ func (a *allocReconciler) handleStop(m allocMatrix) {
// filterAndStopAll stops all allocations in an allocSet. This is useful when
// stopping an entire job or task group.
//
// The set is partitioned once with filterByTainted and each resulting group is
// handed to markStop:
//   - lost allocs are passed with structs.AllocClientStatusLost and allocLost;
//   - untainted, migrate, disconnecting and reconnecting allocs are passed
//     with an empty client status and allocNotNeeded;
//   - from the ignore group, only allocs whose client status is unknown are
//     passed (also with an empty client status and allocNotNeeded).
//
// The return value is the size of the input set, not the number of allocs
// that were handed to markStop.
func (a *allocReconciler) filterAndStopAll(set allocSet) uint64 {
	// Partition exactly once; the ignore group is needed so that unknown
	// allocs are stopped along with everything else.
	untainted, migrate, lost, disconnecting, reconnecting, ignore := set.filterByTainted(a.taintedNodes, a.supportsDisconnectedClients, a.now)
	a.markStop(untainted, "", allocNotNeeded)
	a.markStop(migrate, "", allocNotNeeded)
	a.markStop(lost, structs.AllocClientStatusLost, allocLost)
	a.markStop(disconnecting, "", allocNotNeeded)
	a.markStop(reconnecting, "", allocNotNeeded)
	a.markStop(ignore.filterByClientStatus(structs.AllocClientStatusUnknown), "", allocNotNeeded)
	return uint64(len(set))
}
@ -462,7 +463,6 @@ func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool {
if isCanarying {
untainted = untainted.difference(canaries)
}
requiresCanaries := a.requiresCanaries(tg, dstate, destructive, canaries)
if requiresCanaries {
a.computeCanaries(tg, dstate, destructive, canaries, desiredChanges, nameIndex)
@ -619,6 +619,9 @@ func (a *allocReconciler) cancelUnneededCanaries(original allocSet, desiredChang
canaries = all.fromKeys(canaryIDs)
untainted, migrate, lost, _, _, _ := canaries.filterByTainted(a.taintedNodes, a.supportsDisconnectedClients, a.now)
// We don't add these stops to desiredChanges because the deployment is
// still active. DesiredChanges is used to report deployment progress/final
// state. These transient failures aren't meaningful.
a.markStop(migrate, "", allocMigrating)
a.markStop(lost, structs.AllocClientStatusLost, allocLost)

View File

@ -5549,6 +5549,7 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
alloc.ClientStatus = tc.disconnectedAllocStatus
// Set the node id on all the disconnected allocs to the node under test.
alloc.NodeID = testNode.ID
alloc.NodeName = "disconnected"
alloc.AllocStates = []*structs.AllocState{{
Field: structs.AllocStateFieldClientStatus,
@ -5631,3 +5632,366 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
})
}
}
// TestReconciler_Client_Disconnect_Canaries verifies what the reconciler
// computes when a node disconnects while a canary deployment is in progress.
// For each case the counts of placements, stops and disconnect updates and the
// desired task group updates are compared against the case's expectation, and
// every placement and stop in the result is checked individually.
func TestReconciler_Client_Disconnect_Canaries(t *testing.T) {
type testCase struct {
name string
// nodes is not set by any of the cases below.
nodes []string
deploymentState *structs.DeploymentState
// deployedAllocs are assigned the original job in the subtest, keyed by
// the node they are placed on.
deployedAllocs map[*structs.Node][]*structs.Allocation
// canaryAllocs are assigned the updated job and flagged as canaries in
// the subtest, keyed by the node they are placed on.
canaryAllocs map[*structs.Node][]*structs.Allocation
expectedResult *resultExpectation
}
running := structs.AllocClientStatusRunning
complete := structs.AllocClientStatusComplete
unknown := structs.AllocClientStatusUnknown
pending := structs.AllocClientStatusPending
run := structs.AllocDesiredStatusRun
stop := structs.AllocDesiredStatusStop
maxClientDisconnect := 10 * time.Minute
readyNode := mock.Node()
readyNode.Name = "ready-" + readyNode.ID
readyNode.Status = structs.NodeStatusReady
disconnectedNode := mock.Node()
disconnectedNode.Name = "disconnected-" + disconnectedNode.ID
disconnectedNode.Status = structs.NodeStatusDisconnected
// Job with allocations and max_client_disconnect. updatedJob is a copy with
// the version bumped by one; the canaries below belong to it.
job := mock.Job()
updatedJob := job.Copy()
updatedJob.Version = updatedJob.Version + 1
testCases := []testCase{
{
name: "3-placed-1-disconnect",
deploymentState: &structs.DeploymentState{
AutoRevert: false,
AutoPromote: false,
Promoted: false,
ProgressDeadline: 5 * time.Minute,
RequireProgressBy: time.Now().Add(5 * time.Minute),
PlacedCanaries: []string{},
DesiredCanaries: 1,
DesiredTotal: 6,
PlacedAllocs: 3,
HealthyAllocs: 2,
UnhealthyAllocs: 0,
},
deployedAllocs: map[*structs.Node][]*structs.Allocation{
readyNode: {
// filtered as terminal
{Name: "my-job.web[0]", ClientStatus: complete, DesiredStatus: stop},
// Ignored
{Name: "my-job.web[2]", ClientStatus: running, DesiredStatus: stop},
// destructive, but discarded because canarying
{Name: "my-job.web[4]", ClientStatus: running, DesiredStatus: run},
},
disconnectedNode: {
// filtered as terminal
{Name: "my-job.web[1]", ClientStatus: complete, DesiredStatus: stop},
// Gets a placement, and a disconnect update
{Name: "my-job.web[3]", ClientStatus: running, DesiredStatus: run},
// Gets a placement, and a disconnect update
{Name: "my-job.web[5]", ClientStatus: running, DesiredStatus: run},
},
},
canaryAllocs: map[*structs.Node][]*structs.Allocation{
readyNode: {
// Ignored
{Name: "my-job.web[0]", ClientStatus: running, DesiredStatus: run},
// Ignored
{Name: "my-job.web[2]", ClientStatus: pending, DesiredStatus: run},
},
disconnectedNode: {
// Gets a placement, and a disconnect update
{Name: "my-job.web[1]", ClientStatus: running, DesiredStatus: run},
},
},
expectedResult: &resultExpectation{
createDeployment: nil,
deploymentUpdates: nil,
place: 3,
destructive: 0,
stop: 0,
inplace: 0,
attributeUpdates: 0,
disconnectUpdates: 3,
reconnectUpdates: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
updatedJob.TaskGroups[0].Name: {
Place: 3,
Canary: 0,
Ignore: 3,
},
},
},
},
{
name: "ignore-unknown",
deploymentState: &structs.DeploymentState{
AutoRevert: false,
AutoPromote: false,
Promoted: false,
ProgressDeadline: 5 * time.Minute,
RequireProgressBy: time.Now().Add(5 * time.Minute),
PlacedCanaries: []string{},
DesiredCanaries: 1,
DesiredTotal: 6,
PlacedAllocs: 3,
HealthyAllocs: 2,
UnhealthyAllocs: 0,
},
deployedAllocs: map[*structs.Node][]*structs.Allocation{
readyNode: {
// filtered as terminal
{Name: "my-job.web[0]", ClientStatus: complete, DesiredStatus: stop},
// Ignored
{Name: "my-job.web[2]", ClientStatus: running, DesiredStatus: stop},
// destructive, but discarded because canarying
{Name: "my-job.web[4]", ClientStatus: running, DesiredStatus: run},
},
disconnectedNode: {
// filtered as terminal
{Name: "my-job.web[1]", ClientStatus: complete, DesiredStatus: stop},
// Gets a placement, and a disconnect update
{Name: "my-job.web[3]", ClientStatus: running, DesiredStatus: run},
// Gets a placement, and a disconnect update
{Name: "my-job.web[5]", ClientStatus: running, DesiredStatus: run},
},
},
canaryAllocs: map[*structs.Node][]*structs.Allocation{
readyNode: {
// Ignored
{Name: "my-job.web[0]", ClientStatus: running, DesiredStatus: run},
// Ignored
{Name: "my-job.web[2]", ClientStatus: pending, DesiredStatus: run},
},
disconnectedNode: {
// Ignored
{Name: "my-job.web[1]", ClientStatus: unknown, DesiredStatus: run},
},
},
expectedResult: &resultExpectation{
createDeployment: nil,
deploymentUpdates: nil,
place: 2,
destructive: 0,
stop: 0,
inplace: 0,
attributeUpdates: 0,
disconnectUpdates: 2,
reconnectUpdates: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
updatedJob.TaskGroups[0].Name: {
Place: 2,
Canary: 0,
Ignore: 4,
},
},
},
},
{
name: "4-placed-2-pending-lost",
deploymentState: &structs.DeploymentState{
AutoRevert: false,
AutoPromote: false,
Promoted: false,
ProgressDeadline: 5 * time.Minute,
RequireProgressBy: time.Now().Add(5 * time.Minute),
PlacedCanaries: []string{},
DesiredCanaries: 2,
DesiredTotal: 6,
PlacedAllocs: 4,
HealthyAllocs: 2,
UnhealthyAllocs: 0,
},
deployedAllocs: map[*structs.Node][]*structs.Allocation{
readyNode: {
// filtered as terminal
{Name: "my-job.web[0]", ClientStatus: complete, DesiredStatus: stop},
// filtered as terminal
{Name: "my-job.web[2]", ClientStatus: complete, DesiredStatus: stop},
// destructive, but discarded because canarying
{Name: "my-job.web[4]", ClientStatus: running, DesiredStatus: run},
},
disconnectedNode: {
// filtered as terminal
{Name: "my-job.web[1]", ClientStatus: complete, DesiredStatus: stop},
// Gets a placement, and a disconnect update
{Name: "my-job.web[3]", ClientStatus: running, DesiredStatus: run},
// Gets a placement, and a disconnect update
{Name: "my-job.web[5]", ClientStatus: running, DesiredStatus: run},
},
},
canaryAllocs: map[*structs.Node][]*structs.Allocation{
readyNode: {
// Ignored
{Name: "my-job.web[0]", ClientStatus: running, DesiredStatus: run},
// Ignored
{Name: "my-job.web[2]", ClientStatus: running, DesiredStatus: run},
},
disconnectedNode: {
// Stop/Lost because pending
{Name: "my-job.web[1]", ClientStatus: pending, DesiredStatus: run},
// Stop/Lost because pending
{Name: "my-job.web[3]", ClientStatus: pending, DesiredStatus: run},
},
},
expectedResult: &resultExpectation{
createDeployment: nil,
deploymentUpdates: nil,
place: 2,
destructive: 0,
stop: 2,
inplace: 0,
attributeUpdates: 0,
disconnectUpdates: 2,
reconnectUpdates: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
updatedJob.TaskGroups[0].Name: {
Place: 2,
Canary: 0,
Ignore: 3,
// The 2 stops in this test are transient failures, but
// the deployment can still progress. We don't include
// them in the stop count since DesiredTGUpdates is used
// to report deployment progress or final deployment state.
Stop: 0,
},
},
},
},
}
// job and updatedJob are shared by every case; each subtest reconfigures
// their first task group before building its allocations.
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
// Set the count dynamically to the number from the original deployment.
job.TaskGroups[0].Count = len(tc.deployedAllocs[readyNode]) + len(tc.deployedAllocs[disconnectedNode])
job.TaskGroups[0].MaxClientDisconnect = &maxClientDisconnect
job.TaskGroups[0].Update = &structs.UpdateStrategy{
MaxParallel: 1,
Canary: tc.deploymentState.DesiredCanaries,
MinHealthyTime: 3 * time.Second,
HealthyDeadline: 20 * time.Second,
AutoRevert: true,
AutoPromote: true,
}
updatedJob.TaskGroups[0].Count = len(tc.deployedAllocs[readyNode]) + len(tc.deployedAllocs[disconnectedNode])
updatedJob.TaskGroups[0].MaxClientDisconnect = &maxClientDisconnect
updatedJob.TaskGroups[0].Update = &structs.UpdateStrategy{
MaxParallel: 1,
Canary: tc.deploymentState.DesiredCanaries,
MinHealthyTime: 3 * time.Second,
HealthyDeadline: 20 * time.Second,
AutoRevert: true,
AutoPromote: true,
}
// Populate alloc IDs, node IDs and the original job on deployed allocs
allocsConfigured := 0
for node, allocs := range tc.deployedAllocs {
for _, alloc := range allocs {
alloc.ID = uuid.Generate()
alloc.NodeID = node.ID
alloc.NodeName = node.Name
alloc.JobID = job.ID
alloc.Job = job
alloc.TaskGroup = job.TaskGroups[0].Name
allocsConfigured++
}
}
// Sanity-check the fixture: the deployed allocs must add up to DesiredTotal.
require.Equal(t, tc.deploymentState.DesiredTotal, allocsConfigured, "invalid alloc configuration: expect %d got %d", tc.deploymentState.DesiredTotal, allocsConfigured)
// Populate alloc IDs, node IDs and the updated job on canaries, and
// record each canary in the deployment state and in handled.
canariesConfigured := 0
handled := make(map[string]allocUpdateType)
for node, allocs := range tc.canaryAllocs {
for _, alloc := range allocs {
alloc.ID = uuid.Generate()
alloc.NodeID = node.ID
alloc.NodeName = node.Name
alloc.JobID = updatedJob.ID
alloc.Job = updatedJob
alloc.TaskGroup = updatedJob.TaskGroups[0].Name
alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
Canary: true,
}
// Only running canaries are reported healthy.
if alloc.ClientStatus == structs.AllocClientStatusRunning {
alloc.DeploymentStatus.Healthy = helper.BoolToPtr(true)
}
tc.deploymentState.PlacedCanaries = append(tc.deploymentState.PlacedCanaries, alloc.ID)
handled[alloc.ID] = allocUpdateFnIgnore
canariesConfigured++
}
}
// Sanity-check the fixture: the number of canaries must equal
// tc.deploymentState.PlacedAllocs.
require.Equal(t, tc.deploymentState.PlacedAllocs, canariesConfigured, "invalid canary configuration: expect %d got %d", tc.deploymentState.PlacedAllocs, canariesConfigured)
deployment := structs.NewDeployment(updatedJob, 50)
deployment.TaskGroups[updatedJob.TaskGroups[0].Name] = tc.deploymentState
// Build the map of tainted nodes; it contains only the disconnected node.
tainted := make(map[string]*structs.Node, 1)
tainted[disconnectedNode.ID] = disconnectedNode
allocs := make([]*structs.Allocation, 0)
allocs = append(allocs, tc.deployedAllocs[readyNode]...)
allocs = append(allocs, tc.deployedAllocs[disconnectedNode]...)
allocs = append(allocs, tc.canaryAllocs[readyNode]...)
allocs = append(allocs, tc.canaryAllocs[disconnectedNode]...)
// The mock update fn receives the canary IDs (mapped to
// allocUpdateFnIgnore) and allocUpdateFnDestructive as its second argument.
mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive)
reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, updatedJob.ID, updatedJob,
deployment, allocs, tainted, "", 50, true)
result := reconciler.Compute()
// Assert the correct results
assertResults(t, result, tc.expectedResult)
// Validate that every placement replaces an alloc on the disconnected
// node: either an alloc of the original job or a canary of the updated
// job. In both cases the replaced alloc must have a disconnect update.
for _, placeResult := range result.place {
found := false
require.NotNil(t, placeResult.previousAlloc)
for _, deployed := range tc.deployedAllocs[disconnectedNode] {
if deployed.ID == placeResult.previousAlloc.ID {
found = true
require.Equal(t, job.Version, placeResult.previousAlloc.Job.Version)
require.Equal(t, disconnectedNode.ID, placeResult.previousAlloc.NodeID)
_, exists := result.disconnectUpdates[placeResult.previousAlloc.ID]
require.True(t, exists)
break
}
}
for _, canary := range tc.canaryAllocs[disconnectedNode] {
if canary.ID == placeResult.previousAlloc.ID {
found = true
require.Equal(t, updatedJob.Version, placeResult.previousAlloc.Job.Version)
require.Equal(t, disconnectedNode.ID, placeResult.previousAlloc.NodeID)
_, exists := result.disconnectUpdates[placeResult.previousAlloc.ID]
require.True(t, exists)
break
}
}
require.True(t, found)
}
// Validate that stops are for pending disconnects
for _, stopResult := range result.stop {
require.Equal(t, pending, stopResult.alloc.ClientStatus)
}
})
}
}

View File

@ -225,7 +225,6 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS
ignore = make(map[string]*structs.Allocation)
for _, alloc := range a {
// make sure we don't apply any reconnect logic to task groups
// without max_client_disconnect
supportsDisconnectedClients := alloc.SupportsDisconnectedClients(serverSupportsDisconnectedClients)
@ -251,6 +250,40 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS
continue
}
taintedNode, nodeIsTainted := taintedNodes[alloc.NodeID]
if taintedNode != nil {
// Group disconnecting/reconnecting
switch taintedNode.Status {
case structs.NodeStatusDisconnected:
if supportsDisconnectedClients {
// Filter running allocs on a node that is disconnected to be marked as unknown.
if alloc.ClientStatus == structs.AllocClientStatusRunning {
disconnecting[alloc.ID] = alloc
continue
}
// Filter pending allocs on a node that is disconnected to be marked as lost.
if alloc.ClientStatus == structs.AllocClientStatusPending {
lost[alloc.ID] = alloc
continue
}
} else {
lost[alloc.ID] = alloc
continue
}
case structs.NodeStatusReady:
// Filter reconnecting allocs on a node that is now connected.
if reconnected {
if expired {
lost[alloc.ID] = alloc
continue
}
reconnecting[alloc.ID] = alloc
continue
}
default:
}
}
// Terminal allocs, if not reconnected, are always untainted as they
// should never be migrated.
if alloc.TerminalStatus() && !reconnected {
@ -287,8 +320,7 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS
continue
}
taintedNode, ok := taintedNodes[alloc.NodeID]
if !ok {
if !nodeIsTainted {
// Filter allocs on a node that is now re-connected to be resumed.
if reconnected {
if expired {
@ -304,39 +336,6 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS
continue
}
if taintedNode != nil {
// Group disconnecting/reconnecting
switch taintedNode.Status {
case structs.NodeStatusDisconnected:
if supportsDisconnectedClients {
// Filter running allocs on a node that is disconnected to be marked as unknown.
if alloc.ClientStatus == structs.AllocClientStatusRunning {
disconnecting[alloc.ID] = alloc
continue
}
// Filter pending allocs on a node that is disconnected to be marked as lost.
if alloc.ClientStatus == structs.AllocClientStatusPending {
lost[alloc.ID] = alloc
continue
}
} else {
lost[alloc.ID] = alloc
continue
}
case structs.NodeStatusReady:
// Filter reconnecting allocs with replacements on a node that is now connected.
if reconnected {
if expired {
lost[alloc.ID] = alloc
continue
}
reconnecting[alloc.ID] = alloc
continue
}
default:
}
}
// Allocs on GC'd (nil) or lost nodes are Lost
if taintedNode == nil || taintedNode.TerminalStatus() {
lost[alloc.ID] = alloc
@ -361,6 +360,12 @@ func (a allocSet) filterByRescheduleable(isBatch, isDisconnecting bool, now time
// When filtering disconnected sets, the untainted set is never populated.
// It has no purpose in that context.
for _, alloc := range a {
// Ignore disconnecting allocs that are already unknown. This can happen
// in the case of canaries that are interrupted by a disconnect.
if isDisconnecting && alloc.ClientStatus == structs.AllocClientStatusUnknown {
continue
}
var eligibleNow, eligibleLater bool
var rescheduleTime time.Time
@ -557,6 +562,18 @@ func (a allocSet) delayByMaxClientDisconnect(now time.Time) (later []*delayedRes
return
}
// filterByClientStatus builds a new allocSet holding only the members of the
// receiver whose ClientStatus equals clientStatus. The receiver is left
// untouched; the result is keyed by each allocation's ID and is an empty,
// non-nil set when nothing matches.
func (a allocSet) filterByClientStatus(clientStatus string) allocSet {
	matched := make(allocSet)
	for _, candidate := range a {
		// Skip anything that does not carry the requested status.
		if candidate.ClientStatus != clientStatus {
			continue
		}
		matched[candidate.ID] = candidate
	}
	return matched
}
// allocNameIndex is used to select allocation names for placement or removal
// given an existing set of placed allocations.
type allocNameIndex struct {

View File

@ -266,23 +266,13 @@ func TestAllocSet_filterByTainted(t *testing.T) {
all: allocSet{
// Non-terminal allocs on disconnected nodes w/o max-disconnect are lost
"lost-running": {
ID: "lost-running",
Name: "lost-running",
ClientStatus: structs.AllocClientStatusRunning,
Job: testJobNoMaxDisconnect,
NodeID: "disconnected",
TaskGroup: "web",
},
// Unknown allocs on disconnected nodes w/o max-disconnect are lost
"lost-unknown": {
ID: "lost-unknown",
Name: "lost-unknown",
ClientStatus: structs.AllocClientStatusUnknown,
ID: "lost-running",
Name: "lost-running",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJobNoMaxDisconnect,
NodeID: "disconnected",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
},
untainted: allocSet{},
@ -292,22 +282,13 @@ func TestAllocSet_filterByTainted(t *testing.T) {
ignore: allocSet{},
lost: allocSet{
"lost-running": {
ID: "lost-running",
Name: "lost-running",
ClientStatus: structs.AllocClientStatusRunning,
Job: testJobNoMaxDisconnect,
NodeID: "disconnected",
TaskGroup: "web",
},
"lost-unknown": {
ID: "lost-unknown",
Name: "lost-unknown",
ClientStatus: structs.AllocClientStatusUnknown,
ID: "lost-running",
Name: "lost-running",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJobNoMaxDisconnect,
NodeID: "disconnected",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
},
},
@ -324,6 +305,7 @@ func TestAllocSet_filterByTainted(t *testing.T) {
ID: "running-replacement",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
@ -349,6 +331,7 @@ func TestAllocSet_filterByTainted(t *testing.T) {
ID: "running-replacement",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
@ -384,14 +367,15 @@ func TestAllocSet_filterByTainted(t *testing.T) {
// Node.UpdateStatus has already handled syncing client state so this
// should be a noop.
"reconnecting-running-no-replacement": {
ID: "reconnecting-running-no-replacement",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
ID: "reconnecting-running-no-replacement",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
},
},
untainted: allocSet{},
@ -399,14 +383,15 @@ func TestAllocSet_filterByTainted(t *testing.T) {
disconnecting: allocSet{},
reconnecting: allocSet{
"reconnecting-running-no-replacement": {
ID: "reconnecting-running-no-replacement",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
ID: "reconnecting-running-no-replacement",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
},
},
ignore: allocSet{},
@ -421,43 +406,47 @@ func TestAllocSet_filterByTainted(t *testing.T) {
all: allocSet{
// Allocs on reconnected nodes that are complete are untainted
"untainted-reconnect-complete": {
ID: "untainted-reconnect-complete",
Name: "untainted-reconnect-complete",
ClientStatus: structs.AllocClientStatusComplete,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
ID: "untainted-reconnect-complete",
Name: "untainted-reconnect-complete",
ClientStatus: structs.AllocClientStatusComplete,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
},
// Failed allocs on reconnected nodes are in reconnecting so that
// they can be marked with desired status stop at the server.
"reconnecting-failed": {
ID: "reconnecting-failed",
Name: "reconnecting-failed",
ClientStatus: structs.AllocClientStatusFailed,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
ID: "reconnecting-failed",
Name: "reconnecting-failed",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
},
// Lost allocs on reconnected nodes don't get restarted
"untainted-reconnect-lost": {
ID: "untainted-reconnect-lost",
Name: "untainted-reconnect-lost",
ClientStatus: structs.AllocClientStatusLost,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
ID: "untainted-reconnect-lost",
Name: "untainted-reconnect-lost",
ClientStatus: structs.AllocClientStatusLost,
DesiredStatus: structs.AllocDesiredStatusStop,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
},
// Replacement allocs that are complete are untainted
"untainted-reconnect-complete-replacement": {
ID: "untainted-reconnect-complete-replacement",
Name: "untainted-reconnect-complete",
ClientStatus: structs.AllocClientStatusComplete,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
@ -469,6 +458,7 @@ func TestAllocSet_filterByTainted(t *testing.T) {
ID: "untainted-reconnect-failed-replacement",
Name: "untainted-reconnect-failed",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusStop,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
@ -480,6 +470,7 @@ func TestAllocSet_filterByTainted(t *testing.T) {
ID: "untainted-reconnect-lost-replacement",
Name: "untainted-reconnect-lost",
ClientStatus: structs.AllocClientStatusLost,
DesiredStatus: structs.AllocDesiredStatusStop,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
@ -489,29 +480,32 @@ func TestAllocSet_filterByTainted(t *testing.T) {
},
untainted: allocSet{
"untainted-reconnect-complete": {
ID: "untainted-reconnect-complete",
Name: "untainted-reconnect-complete",
ClientStatus: structs.AllocClientStatusComplete,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
ID: "untainted-reconnect-complete",
Name: "untainted-reconnect-complete",
ClientStatus: structs.AllocClientStatusComplete,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
},
"untainted-reconnect-lost": {
ID: "untainted-reconnect-lost",
Name: "untainted-reconnect-lost",
ClientStatus: structs.AllocClientStatusLost,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
ID: "untainted-reconnect-lost",
Name: "untainted-reconnect-lost",
ClientStatus: structs.AllocClientStatusLost,
DesiredStatus: structs.AllocDesiredStatusStop,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
},
"untainted-reconnect-complete-replacement": {
ID: "untainted-reconnect-complete-replacement",
Name: "untainted-reconnect-complete",
ClientStatus: structs.AllocClientStatusComplete,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
@ -522,6 +516,7 @@ func TestAllocSet_filterByTainted(t *testing.T) {
ID: "untainted-reconnect-failed-replacement",
Name: "untainted-reconnect-failed",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusStop,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
@ -532,6 +527,7 @@ func TestAllocSet_filterByTainted(t *testing.T) {
ID: "untainted-reconnect-lost-replacement",
Name: "untainted-reconnect-lost",
ClientStatus: structs.AllocClientStatusLost,
DesiredStatus: structs.AllocDesiredStatusStop,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
@ -543,14 +539,15 @@ func TestAllocSet_filterByTainted(t *testing.T) {
disconnecting: allocSet{},
reconnecting: allocSet{
"reconnecting-failed": {
ID: "reconnecting-failed",
Name: "reconnecting-failed",
ClientStatus: structs.AllocClientStatusFailed,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
ID: "reconnecting-failed",
Name: "reconnecting-failed",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
},
},
ignore: allocSet{},
@ -565,12 +562,13 @@ func TestAllocSet_filterByTainted(t *testing.T) {
all: allocSet{
// Non-terminal allocs on disconnected nodes are disconnecting
"disconnect-running": {
ID: "disconnect-running",
Name: "disconnect-running",
ClientStatus: structs.AllocClientStatusRunning,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
ID: "disconnect-running",
Name: "disconnect-running",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
},
// Unknown allocs on disconnected nodes are ignored
"ignore-unknown": {
@ -585,13 +583,37 @@ func TestAllocSet_filterByTainted(t *testing.T) {
},
// Unknown allocs on disconnected nodes are lost when expired
"lost-unknown": {
ID: "lost-unknown",
Name: "lost-unknown",
ClientStatus: structs.AllocClientStatusUnknown,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
AllocStates: expiredAllocState,
ID: "lost-unknown",
Name: "lost-unknown",
ClientStatus: structs.AllocClientStatusUnknown,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
AllocStates: expiredAllocState,
},
// Pending allocs on disconnected nodes are lost
"lost-pending": {
ID: "lost-pending",
Name: "lost-pending",
ClientStatus: structs.AllocClientStatusPending,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
},
// Expired allocs on reconnected clients are lost
"lost-expired": {
ID: "lost-expired",
Name: "lost-expired",
ClientStatus: structs.AllocClientStatusUnknown,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
TaskStates: reconnectTaskState,
AllocStates: expiredAllocState,
},
// Failed and stopped allocs on disconnected nodes are ignored
"ignore-reconnected-failed-stopped": {
@ -603,19 +625,20 @@ func TestAllocSet_filterByTainted(t *testing.T) {
NodeID: "disconnected",
TaskGroup: "web",
TaskStates: reconnectTaskState,
AllocStates: unknownAllocState, // TODO really?
AllocStates: unknownAllocState,
},
},
untainted: allocSet{},
migrate: allocSet{},
disconnecting: allocSet{
"disconnect-running": {
ID: "disconnect-running",
Name: "disconnect-running",
ClientStatus: structs.AllocClientStatusRunning,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
ID: "disconnect-running",
Name: "disconnect-running",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
},
},
reconnecting: allocSet{},
@ -645,13 +668,73 @@ func TestAllocSet_filterByTainted(t *testing.T) {
},
lost: allocSet{
"lost-unknown": {
ID: "lost-unknown",
Name: "lost-unknown",
ClientStatus: structs.AllocClientStatusUnknown,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
AllocStates: expiredAllocState,
ID: "lost-unknown",
Name: "lost-unknown",
ClientStatus: structs.AllocClientStatusUnknown,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
AllocStates: expiredAllocState,
},
"lost-pending": {
ID: "lost-pending",
Name: "lost-pending",
ClientStatus: structs.AllocClientStatusPending,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
},
"lost-expired": {
ID: "lost-expired",
Name: "lost-expired",
ClientStatus: structs.AllocClientStatusUnknown,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
TaskStates: reconnectTaskState,
AllocStates: expiredAllocState,
},
},
},
{
name: "disco-client-reconnect",
supportsDisconnectedClients: true,
now: time.Now(),
taintedNodes: nodes,
skipNilNodeTest: false,
all: allocSet{
// Expired allocs on reconnected clients are lost
"lost-expired-reconnect": {
ID: "lost-expired-reconnect",
Name: "lost-expired-reconnect",
ClientStatus: structs.AllocClientStatusUnknown,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
TaskStates: reconnectTaskState,
AllocStates: expiredAllocState,
},
},
untainted: allocSet{},
migrate: allocSet{},
disconnecting: allocSet{},
reconnecting: allocSet{},
ignore: allocSet{},
lost: allocSet{
"lost-expired-reconnect": {
ID: "lost-expired-reconnect",
Name: "lost-expired-reconnect",
ClientStatus: structs.AllocClientStatusUnknown,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
TaskStates: reconnectTaskState,
AllocStates: expiredAllocState,
},
},
},
@ -666,6 +749,7 @@ func TestAllocSet_filterByTainted(t *testing.T) {
ID: "running-replacement",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
@ -673,14 +757,15 @@ func TestAllocSet_filterByTainted(t *testing.T) {
},
// Running and replaced allocs on reconnected nodes are reconnecting
"running-original": {
ID: "running-original",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
ID: "running-original",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
},
},
untainted: allocSet{
@ -688,6 +773,7 @@ func TestAllocSet_filterByTainted(t *testing.T) {
ID: "running-replacement",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
@ -698,14 +784,15 @@ func TestAllocSet_filterByTainted(t *testing.T) {
disconnecting: allocSet{},
reconnecting: allocSet{
"running-original": {
ID: "running-original",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
ID: "running-original",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
},
},
ignore: allocSet{},