package scheduler

import (
	"fmt"
	"reflect"
	"regexp"
	"strconv"
	"testing"
	"time"

	"github.com/hashicorp/nomad/ci"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/testlog"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/kr/pretty"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

var (
	canaryUpdate = &structs.UpdateStrategy{
		Canary:          2,
		MaxParallel:     2,
		HealthCheck:     structs.UpdateStrategyHealthCheck_Checks,
		MinHealthyTime:  10 * time.Second,
		HealthyDeadline: 10 * time.Minute,
		Stagger:         31 * time.Second,
	}

	noCanaryUpdate = &structs.UpdateStrategy{
		MaxParallel:     4,
		HealthCheck:     structs.UpdateStrategyHealthCheck_Checks,
		MinHealthyTime:  10 * time.Second,
		HealthyDeadline: 10 * time.Minute,
		Stagger:         31 * time.Second,
	}
)

func allocUpdateFnIgnore(*structs.Allocation, *structs.Job, *structs.TaskGroup) (bool, bool, *structs.Allocation) {
	return true, false, nil
}

func allocUpdateFnDestructive(*structs.Allocation, *structs.Job, *structs.TaskGroup) (bool, bool, *structs.Allocation) {
	return false, true, nil
}

func allocUpdateFnInplace(existing *structs.Allocation, _ *structs.Job, newTG *structs.TaskGroup) (bool, bool, *structs.Allocation) {
	// Create a shallow copy
	newAlloc := existing.CopySkipJob()
	newAlloc.AllocatedResources = &structs.AllocatedResources{
		Tasks: map[string]*structs.AllocatedTaskResources{},
		Shared: structs.AllocatedSharedResources{
			DiskMB: int64(newTG.EphemeralDisk.SizeMB),
		},
	}

	// Use the new task resources but keep the network from the old
	for _, task := range newTG.Tasks {
		networks := existing.AllocatedResources.Tasks[task.Name].Copy().Networks
		newAlloc.AllocatedResources.Tasks[task.Name] = &structs.AllocatedTaskResources{
			Cpu: structs.AllocatedCpuResources{
				CpuShares: int64(task.Resources.CPU),
			},
			Memory: structs.AllocatedMemoryResources{
				MemoryMB: int64(task.Resources.MemoryMB),
			},
			Networks: networks,
		}
	}

	return false, false, newAlloc
}

func allocUpdateFnMock(handled map[string]allocUpdateType, unhandled allocUpdateType) allocUpdateType {
	return func(existing *structs.Allocation, newJob *structs.Job, newTG *structs.TaskGroup) (bool, bool, *structs.Allocation) {
		if fn, ok := handled[existing.ID]; ok {
			return fn(existing, newJob, newTG)
		}

		return unhandled(existing, newJob, newTG)
	}
}

var (
	// AllocationIndexRegex is a regular expression to find the allocation index.
	allocationIndexRegex = regexp.MustCompile(".+\\[(\\d+)\\]$")
)

// allocNameToIndex returns the index of the allocation.
func allocNameToIndex(name string) uint { matches := allocationIndexRegex.FindStringSubmatch(name) if len(matches) != 2 { return 0 } index, err := strconv.Atoi(matches[1]) if err != nil { return 0 } return uint(index) } func assertNamesHaveIndexes(t *testing.T, indexes []int, names []string) { t.Helper() m := make(map[uint]int) for _, i := range indexes { m[uint(i)] += 1 } for _, n := range names { index := allocNameToIndex(n) val, contained := m[index] if !contained { t.Fatalf("Unexpected index %d from name %s\nAll names: %v", index, n, names) } val-- if val < 0 { t.Fatalf("Index %d repeated too many times\nAll names: %v", index, names) } m[index] = val } for k, remainder := range m { if remainder != 0 { t.Fatalf("Index %d has %d remaining uses expected\nAll names: %v", k, remainder, names) } } } func assertNoCanariesStopped(t *testing.T, d *structs.Deployment, stop []allocStopResult) { t.Helper() canaryIndex := make(map[string]struct{}) for _, state := range d.TaskGroups { for _, c := range state.PlacedCanaries { canaryIndex[c] = struct{}{} } } for _, s := range stop { if _, ok := canaryIndex[s.alloc.ID]; ok { t.Fatalf("Stopping canary alloc %q %q", s.alloc.ID, s.alloc.Name) } } } func assertPlaceResultsHavePreviousAllocs(t *testing.T, numPrevious int, place []allocPlaceResult) { t.Helper() names := make(map[string]struct{}, numPrevious) found := 0 for _, p := range place { if _, ok := names[p.name]; ok { t.Fatalf("Name %q already placed", p.name) } names[p.name] = struct{}{} if p.previousAlloc == nil { continue } if act := p.previousAlloc.Name; p.name != act { t.Fatalf("Name mismatch on previous alloc; got %q; want %q", act, p.name) } found++ } if numPrevious != found { t.Fatalf("wanted %d; got %d placements with previous allocs", numPrevious, found) } } func assertPlacementsAreRescheduled(t *testing.T, numRescheduled int, place []allocPlaceResult) { t.Helper() names := make(map[string]struct{}, numRescheduled) found := 0 for _, p := range place { if _, ok := names[p.name]; ok { t.Fatalf("Name %q already placed", p.name) } names[p.name] = struct{}{} if p.previousAlloc == nil { continue } if p.reschedule { found++ } } if numRescheduled != found { t.Fatalf("wanted %d; got %d placements that are rescheduled", numRescheduled, found) } } func intRange(pairs ...int) []int { if len(pairs)%2 != 0 { return nil } var r []int for i := 0; i < len(pairs); i += 2 { for j := pairs[i]; j <= pairs[i+1]; j++ { r = append(r, j) } } return r } func placeResultsToNames(place []allocPlaceResult) []string { names := make([]string, 0, len(place)) for _, p := range place { names = append(names, p.name) } return names } func destructiveResultsToNames(destructive []allocDestructiveResult) []string { names := make([]string, 0, len(destructive)) for _, d := range destructive { names = append(names, d.placeName) } return names } func stopResultsToNames(stop []allocStopResult) []string { names := make([]string, 0, len(stop)) for _, s := range stop { names = append(names, s.alloc.Name) } return names } func attributeUpdatesToNames(attributeUpdates map[string]*structs.Allocation) []string { names := make([]string, 0, len(attributeUpdates)) for _, a := range attributeUpdates { names = append(names, a.Name) } return names } func allocsToNames(allocs []*structs.Allocation) []string { names := make([]string, 0, len(allocs)) for _, a := range allocs { names = append(names, a.Name) } return names } type resultExpectation struct { createDeployment *structs.Deployment deploymentUpdates []*structs.DeploymentStatusUpdate place 
int destructive int inplace int attributeUpdates int disconnectUpdates int reconnectUpdates int desiredTGUpdates map[string]*structs.DesiredUpdates stop int } func assertResults(t *testing.T, r *reconcileResults, exp *resultExpectation) { t.Helper() assertion := assert.New(t) if exp.createDeployment != nil && r.deployment == nil { t.Errorf("Expect a created deployment got none") } else if exp.createDeployment == nil && r.deployment != nil { t.Errorf("Expect no created deployment; got %#v", r.deployment) } else if exp.createDeployment != nil && r.deployment != nil { // Clear the deployment ID r.deployment.ID, exp.createDeployment.ID = "", "" if !reflect.DeepEqual(r.deployment, exp.createDeployment) { t.Errorf("Unexpected createdDeployment; got\n %#v\nwant\n%#v\nDiff: %v", r.deployment, exp.createDeployment, pretty.Diff(r.deployment, exp.createDeployment)) } } assertion.EqualValues(exp.deploymentUpdates, r.deploymentUpdates, "Expected Deployment Updates") assertion.Len(r.place, exp.place, "Expected Placements") assertion.Len(r.destructiveUpdate, exp.destructive, "Expected Destructive") assertion.Len(r.inplaceUpdate, exp.inplace, "Expected Inplace Updates") assertion.Len(r.attributeUpdates, exp.attributeUpdates, "Expected Attribute Updates") assertion.Len(r.reconnectUpdates, exp.reconnectUpdates, "Expected Reconnect Updates") assertion.Len(r.disconnectUpdates, exp.disconnectUpdates, "Expected Disconnect Updates") assertion.Len(r.stop, exp.stop, "Expected Stops") assertion.EqualValues(exp.desiredTGUpdates, r.desiredTGUpdates, "Expected Desired TG Update Annotations") } func buildAllocations(job *structs.Job, count int, clientStatus, desiredStatus string, nodeScore float64) []*structs.Allocation { allocs := make([]*structs.Allocation, 0) for i := 0; i < count; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.ClientStatus = clientStatus alloc.DesiredStatus = desiredStatus alloc.Metrics = &structs.AllocMetric{ ScoreMetaData: []*structs.NodeScoreMeta{ { NodeID: alloc.NodeID, NormScore: nodeScore, Scores: map[string]float64{ alloc.NodeID: nodeScore, }, }, }, } allocs = append(allocs, alloc) } return allocs } func buildDisconnectedNodes(allocs []*structs.Allocation, count int) map[string]*structs.Node { tainted := make(map[string]*structs.Node, count) for i := 0; i < count; i++ { n := mock.Node() n.ID = allocs[i].NodeID n.Status = structs.NodeStatusDisconnected tainted[n.ID] = n } return tainted } func buildResumableAllocations(count int, clientStatus, desiredStatus string, nodeScore float64) (*structs.Job, []*structs.Allocation) { job := mock.Job() job.TaskGroups[0].MaxClientDisconnect = helper.TimeToPtr(5 * time.Minute) job.TaskGroups[0].Count = count return job, buildAllocations(job, count, clientStatus, desiredStatus, nodeScore) } // Tests the reconciler properly handles placements for a job that has no // existing allocations func TestReconciler_Place_NoExisting(t *testing.T) { ci.Parallel(t) job := mock.Job() reconciler := NewAllocReconciler( testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, nil, nil, "", job.Priority, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 10, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 10, }, }, }) assertNamesHaveIndexes(t, intRange(0, 9), 
	placeResultsToNames(r.place))
}

// Tests the reconciler properly handles placements for a job that has some
// existing allocations
func TestReconciler_Place_Existing(t *testing.T) {
	ci.Parallel(t)

	job := mock.Job()

	// Create 5 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 5; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             5,
		inplace:           0,
		stop:              0,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Place:  5,
				Ignore: 5,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(5, 9), placeResultsToNames(r.place))
}

// Tests the reconciler properly handles stopping allocations for a job that has
// scaled down
func TestReconciler_ScaleDown_Partial(t *testing.T) {
	ci.Parallel(t)

	// Has desired 10
	job := mock.Job()

	// Create 20 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 20; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             0,
		inplace:           0,
		stop:              10,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Ignore: 10,
				Stop:   10,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(10, 19), stopResultsToNames(r.stop))
}

// Tests the reconciler properly handles stopping allocations for a job that has
// scaled down to zero desired
func TestReconciler_ScaleDown_Zero(t *testing.T) {
	ci.Parallel(t)

	// Set desired 0
	job := mock.Job()
	job.TaskGroups[0].Count = 0

	// Create 20 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 20; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             0,
		inplace:           0,
		stop:              20,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Stop: 20,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(0, 19), stopResultsToNames(r.stop))
}

// Tests the reconciler properly handles stopping allocations for a job that has
// scaled down to zero desired where allocs have duplicate names
func TestReconciler_ScaleDown_Zero_DuplicateNames(t *testing.T) {
	ci.Parallel(t)

	// Set desired 0
	job := mock.Job()
	job.TaskGroups[0].Count = 0

	// Create 20 existing allocations
	var allocs []*structs.Allocation
	var expectedStopped []int
	for i := 0; i < 20; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i%2))
		allocs =
append(allocs, alloc) expectedStopped = append(expectedStopped, i%2) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 0, inplace: 0, stop: 20, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Stop: 20, }, }, }) assertNamesHaveIndexes(t, expectedStopped, stopResultsToNames(r.stop)) } // Tests the reconciler properly handles inplace upgrading allocations func TestReconciler_Inplace(t *testing.T) { ci.Parallel(t) job := mock.Job() // Create 10 existing allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 0, inplace: 10, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { InPlaceUpdate: 10, }, }, }) assertNamesHaveIndexes(t, intRange(0, 9), allocsToNames(r.inplaceUpdate)) } // Tests the reconciler properly handles inplace upgrading allocations while // scaling up func TestReconciler_Inplace_ScaleUp(t *testing.T) { ci.Parallel(t) // Set desired 15 job := mock.Job() job.TaskGroups[0].Count = 15 // Create 10 existing allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 5, inplace: 10, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 5, InPlaceUpdate: 10, }, }, }) assertNamesHaveIndexes(t, intRange(0, 9), allocsToNames(r.inplaceUpdate)) assertNamesHaveIndexes(t, intRange(10, 14), placeResultsToNames(r.place)) } // Tests the reconciler properly handles inplace upgrading allocations while // scaling down func TestReconciler_Inplace_ScaleDown(t *testing.T) { ci.Parallel(t) // Set desired 5 job := mock.Job() job.TaskGroups[0].Count = 5 // Create 10 existing allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 0, inplace: 5, stop: 5, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Stop: 5, InPlaceUpdate: 5, }, }, }) assertNamesHaveIndexes(t, intRange(0, 4), allocsToNames(r.inplaceUpdate)) 
assertNamesHaveIndexes(t, intRange(5, 9), stopResultsToNames(r.stop)) } // TestReconciler_Inplace_Rollback tests that a rollback to a previous version // generates the expected placements for any already-running allocations of // that version. func TestReconciler_Inplace_Rollback(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Count = 4 job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ DelayFunction: "exponential", Interval: time.Second * 30, Delay: time.Hour * 1, Attempts: 3, Unlimited: true, } // Create 3 existing allocations var allocs []*structs.Allocation for i := 0; i < 3; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) } // allocs[0] is an allocation from version 0 allocs[0].ClientStatus = structs.AllocClientStatusRunning // allocs[1] and allocs[2] are failed allocations for version 1 with // different rescheduling states allocs[1].ClientStatus = structs.AllocClientStatusFailed allocs[1].TaskStates = map[string]*structs.TaskState{ "web": {FinishedAt: time.Now().Add(-10 * time.Minute)}} allocs[2].ClientStatus = structs.AllocClientStatusFailed // job is rolled back, we expect allocs[0] to be updated in-place allocUpdateFn := allocUpdateFnMock(map[string]allocUpdateType{ allocs[0].ID: allocUpdateFnInplace, }, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFn, false, job.ID, job, nil, allocs, nil, uuid.Generate(), 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 2, inplace: 1, stop: 1, destructive: 1, attributeUpdates: 1, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 2, Stop: 1, InPlaceUpdate: 1, DestructiveUpdate: 1, }, }, }) assert.Len(t, r.desiredFollowupEvals, 1, "expected 1 follow-up eval") assertNamesHaveIndexes(t, intRange(0, 0), allocsToNames(r.inplaceUpdate)) assertNamesHaveIndexes(t, intRange(2, 2), stopResultsToNames(r.stop)) assertNamesHaveIndexes(t, intRange(2, 3), placeResultsToNames(r.place)) } // Tests the reconciler properly handles destructive upgrading allocations func TestReconciler_Destructive(t *testing.T) { ci.Parallel(t) job := mock.Job() // Create 10 existing allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, destructive: 10, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { DestructiveUpdate: 10, }, }, }) assertNamesHaveIndexes(t, intRange(0, 9), destructiveResultsToNames(r.destructiveUpdate)) } // Tests the reconciler properly handles destructive upgrading allocations when max_parallel=0 func TestReconciler_DestructiveMaxParallel(t *testing.T) { ci.Parallel(t) job := mock.MaxParallelJob() // Create 10 existing allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() 
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, destructive: 10, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { DestructiveUpdate: 10, }, }, }) assertNamesHaveIndexes(t, intRange(0, 9), destructiveResultsToNames(r.destructiveUpdate)) } // Tests the reconciler properly handles destructive upgrading allocations while // scaling up func TestReconciler_Destructive_ScaleUp(t *testing.T) { ci.Parallel(t) // Set desired 15 job := mock.Job() job.TaskGroups[0].Count = 15 // Create 10 existing allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 5, destructive: 10, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 5, DestructiveUpdate: 10, }, }, }) assertNamesHaveIndexes(t, intRange(0, 9), destructiveResultsToNames(r.destructiveUpdate)) assertNamesHaveIndexes(t, intRange(10, 14), placeResultsToNames(r.place)) } // Tests the reconciler properly handles destructive upgrading allocations while // scaling down func TestReconciler_Destructive_ScaleDown(t *testing.T) { ci.Parallel(t) // Set desired 5 job := mock.Job() job.TaskGroups[0].Count = 5 // Create 10 existing allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, destructive: 5, stop: 5, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Stop: 5, DestructiveUpdate: 5, }, }, }) assertNamesHaveIndexes(t, intRange(5, 9), stopResultsToNames(r.stop)) assertNamesHaveIndexes(t, intRange(0, 4), destructiveResultsToNames(r.destructiveUpdate)) } // Tests the reconciler properly handles lost nodes with allocations func TestReconciler_LostNode(t *testing.T) { ci.Parallel(t) job := mock.Job() // Create 10 existing allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) } // Build a map of tainted nodes tainted := make(map[string]*structs.Node, 2) for i := 0; i < 2; i++ { n := mock.Node() n.ID = allocs[i].NodeID n.Status = structs.NodeStatusDown tainted[n.ID] = n } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, 
tainted, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 2, inplace: 0, stop: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 2, Stop: 2, Ignore: 8, }, }, }) assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop)) assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place)) } // Tests the reconciler properly handles lost nodes with allocations while // scaling up func TestReconciler_LostNode_ScaleUp(t *testing.T) { ci.Parallel(t) // Set desired 15 job := mock.Job() job.TaskGroups[0].Count = 15 // Create 10 existing allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) } // Build a map of tainted nodes tainted := make(map[string]*structs.Node, 2) for i := 0; i < 2; i++ { n := mock.Node() n.ID = allocs[i].NodeID n.Status = structs.NodeStatusDown tainted[n.ID] = n } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 7, inplace: 0, stop: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 7, Stop: 2, Ignore: 8, }, }, }) assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop)) assertNamesHaveIndexes(t, intRange(0, 1, 10, 14), placeResultsToNames(r.place)) } // Tests the reconciler properly handles lost nodes with allocations while // scaling down func TestReconciler_LostNode_ScaleDown(t *testing.T) { ci.Parallel(t) // Set desired 5 job := mock.Job() job.TaskGroups[0].Count = 5 // Create 10 existing allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) } // Build a map of tainted nodes tainted := make(map[string]*structs.Node, 2) for i := 0; i < 2; i++ { n := mock.Node() n.ID = allocs[i].NodeID n.Status = structs.NodeStatusDown tainted[n.ID] = n } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 0, inplace: 0, stop: 5, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Stop: 5, Ignore: 5, }, }, }) assertNamesHaveIndexes(t, intRange(0, 1, 7, 9), stopResultsToNames(r.stop)) } // Tests the reconciler properly handles draining nodes with allocations func TestReconciler_DrainNode(t *testing.T) { ci.Parallel(t) job := mock.Job() // Create 10 existing allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) } // Build a map of tainted nodes tainted := make(map[string]*structs.Node, 2) for i := 0; i < 2; i++ { n := mock.DrainNode() n.ID = allocs[i].NodeID 
allocs[i].DesiredTransition.Migrate = helper.BoolToPtr(true) tainted[n.ID] = n } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 2, inplace: 0, stop: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Migrate: 2, Ignore: 8, }, }, }) assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop)) assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place)) assertPlaceResultsHavePreviousAllocs(t, 2, r.place) // These should not have the reschedule field set assertPlacementsAreRescheduled(t, 0, r.place) } // Tests the reconciler properly handles draining nodes with allocations while // scaling up func TestReconciler_DrainNode_ScaleUp(t *testing.T) { ci.Parallel(t) // Set desired 15 job := mock.Job() job.TaskGroups[0].Count = 15 // Create 10 existing allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) } // Build a map of tainted nodes tainted := make(map[string]*structs.Node, 2) for i := 0; i < 2; i++ { n := mock.DrainNode() n.ID = allocs[i].NodeID allocs[i].DesiredTransition.Migrate = helper.BoolToPtr(true) tainted[n.ID] = n } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 7, inplace: 0, stop: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 5, Migrate: 2, Ignore: 8, }, }, }) assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop)) assertNamesHaveIndexes(t, intRange(0, 1, 10, 14), placeResultsToNames(r.place)) assertPlaceResultsHavePreviousAllocs(t, 2, r.place) // These should not have the reschedule field set assertPlacementsAreRescheduled(t, 0, r.place) } // Tests the reconciler properly handles draining nodes with allocations while // scaling down func TestReconciler_DrainNode_ScaleDown(t *testing.T) { ci.Parallel(t) // Set desired 8 job := mock.Job() job.TaskGroups[0].Count = 8 // Create 10 existing allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) } // Build a map of tainted nodes tainted := make(map[string]*structs.Node, 3) for i := 0; i < 3; i++ { n := mock.DrainNode() n.ID = allocs[i].NodeID allocs[i].DesiredTransition.Migrate = helper.BoolToPtr(true) tainted[n.ID] = n } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 1, inplace: 0, stop: 3, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Migrate: 1, Stop: 2, Ignore: 7, }, }, }) assertNamesHaveIndexes(t, intRange(0, 2), stopResultsToNames(r.stop)) assertNamesHaveIndexes(t, intRange(0, 0), 
placeResultsToNames(r.place)) assertPlaceResultsHavePreviousAllocs(t, 1, r.place) // These should not have the reschedule field set assertPlacementsAreRescheduled(t, 0, r.place) } // Tests the reconciler properly handles a task group being removed func TestReconciler_RemovedTG(t *testing.T) { ci.Parallel(t) job := mock.Job() // Create 10 allocations for a tg that no longer exists var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) } oldName := job.TaskGroups[0].Name newName := "different" job.TaskGroups[0].Name = newName reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 10, inplace: 0, stop: 10, desiredTGUpdates: map[string]*structs.DesiredUpdates{ oldName: { Stop: 10, }, newName: { Place: 10, }, }, }) assertNamesHaveIndexes(t, intRange(0, 9), stopResultsToNames(r.stop)) assertNamesHaveIndexes(t, intRange(0, 9), placeResultsToNames(r.place)) } // Tests the reconciler properly handles a job in stopped states func TestReconciler_JobStopped(t *testing.T) { ci.Parallel(t) job := mock.Job() job.Stop = true cases := []struct { name string job *structs.Job jobID, taskGroup string }{ { name: "stopped job", job: job, jobID: job.ID, taskGroup: job.TaskGroups[0].Name, }, { name: "nil job", job: nil, jobID: "foo", taskGroup: "bar", }, } for _, c := range cases { t.Run(c.name, func(t *testing.T) { // Create 10 allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = c.job alloc.JobID = c.jobID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(c.jobID, c.taskGroup, uint(i)) alloc.TaskGroup = c.taskGroup allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, c.jobID, c.job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 0, inplace: 0, stop: 10, desiredTGUpdates: map[string]*structs.DesiredUpdates{ c.taskGroup: { Stop: 10, }, }, }) assertNamesHaveIndexes(t, intRange(0, 9), stopResultsToNames(r.stop)) }) } } // Tests the reconciler doesn't update allocs in terminal state // when job is stopped or nil func TestReconciler_JobStopped_TerminalAllocs(t *testing.T) { ci.Parallel(t) job := mock.Job() job.Stop = true cases := []struct { name string job *structs.Job jobID, taskGroup string }{ { name: "stopped job", job: job, jobID: job.ID, taskGroup: job.TaskGroups[0].Name, }, { name: "nil job", job: nil, jobID: "foo", taskGroup: "bar", }, } for _, c := range cases { t.Run(c.name, func(t *testing.T) { // Create 10 terminal allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = c.job alloc.JobID = c.jobID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(c.jobID, c.taskGroup, uint(i)) alloc.TaskGroup = c.taskGroup if i%2 == 0 { alloc.DesiredStatus = structs.AllocDesiredStatusStop } else { alloc.ClientStatus = structs.AllocClientStatusFailed } allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, c.jobID, c.job, nil, allocs, 
nil, "", 50, true) r := reconciler.Compute() require.Len(t, r.stop, 0) // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 0, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ c.taskGroup: {}, }, }) }) } } // Tests the reconciler properly handles jobs with multiple task groups func TestReconciler_MultiTG(t *testing.T) { ci.Parallel(t) job := mock.Job() tg2 := job.TaskGroups[0].Copy() tg2.Name = "foo" job.TaskGroups = append(job.TaskGroups, tg2) // Create 2 existing allocations for the first tg var allocs []*structs.Allocation for i := 0; i < 2; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 18, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 8, Ignore: 2, }, tg2.Name: { Place: 10, }, }, }) assertNamesHaveIndexes(t, intRange(2, 9, 0, 9), placeResultsToNames(r.place)) } // Tests the reconciler properly handles jobs with multiple task groups with // only one having an update stanza and a deployment already being created func TestReconciler_MultiTG_SingleUpdateStanza(t *testing.T) { ci.Parallel(t) job := mock.Job() tg2 := job.TaskGroups[0].Copy() tg2.Name = "foo" job.TaskGroups = append(job.TaskGroups, tg2) job.TaskGroups[0].Update = noCanaryUpdate // Create all the allocs var allocs []*structs.Allocation for i := 0; i < 2; i++ { for j := 0; j < 10; j++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[i].Name, uint(j)) alloc.TaskGroup = job.TaskGroups[i].Name allocs = append(allocs, alloc) } } d := structs.NewDeployment(job, 50) d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ DesiredTotal: 10, } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 0, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Ignore: 10, }, tg2.Name: { Ignore: 10, }, }, }) } // Tests delayed rescheduling of failed batch allocations func TestReconciler_RescheduleLater_Batch(t *testing.T) { ci.Parallel(t) require := require.New(t) // Set desired 4 job := mock.Job() job.TaskGroups[0].Count = 4 now := time.Now() // Set up reschedule policy delayDur := 15 * time.Second job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 3, Interval: 24 * time.Hour, Delay: delayDur, DelayFunction: "constant"} tgName := job.TaskGroups[0].Name // Create 6 existing allocations - 2 running, 1 complete and 3 failed var allocs []*structs.Allocation for i := 0; i < 6; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) alloc.ClientStatus = structs.AllocClientStatusRunning } // Mark 3 as failed with restart tracking info 
	allocs[0].ClientStatus = structs.AllocClientStatusFailed
	allocs[0].NextAllocation = allocs[1].ID

	allocs[1].ClientStatus = structs.AllocClientStatusFailed
	allocs[1].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
		{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
			PrevAllocID: allocs[0].ID,
			PrevNodeID:  uuid.Generate(),
		},
	}}
	allocs[1].NextAllocation = allocs[2].ID

	allocs[2].ClientStatus = structs.AllocClientStatusFailed
	allocs[2].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
		StartedAt:  now.Add(-1 * time.Hour),
		FinishedAt: now}}
	allocs[2].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
		{RescheduleTime: time.Now().Add(-2 * time.Hour).UTC().UnixNano(),
			PrevAllocID: allocs[0].ID,
			PrevNodeID:  uuid.Generate(),
		},
		{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
			PrevAllocID: allocs[1].ID,
			PrevNodeID:  uuid.Generate(),
		},
	}}

	// Mark one as complete
	allocs[5].ClientStatus = structs.AllocClientStatusComplete

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, uuid.Generate(), 50, true)
	r := reconciler.Compute()

	// Two reschedule attempts were already made, one more can be made at a future time
	// Verify that the follow up eval has the expected waitUntil time
	evals := r.desiredFollowupEvals[tgName]
	require.NotNil(evals)
	require.Equal(1, len(evals))
	require.Equal(now.Add(delayDur), evals[0].WaitUntil)

	// Alloc 5 should not be replaced because it is terminal
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             0,
		inplace:           0,
		attributeUpdates:  1,
		stop:              0,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Place:         0,
				InPlaceUpdate: 0,
				Ignore:        4,
				Stop:          0,
			},
		},
	})
	assertNamesHaveIndexes(t, intRange(2, 2), attributeUpdatesToNames(r.attributeUpdates))

	// Verify that the followup evalID field is set correctly
	var annotated *structs.Allocation
	for _, a := range r.attributeUpdates {
		annotated = a
	}
	require.Equal(evals[0].ID, annotated.FollowupEvalID)
}

// Tests delayed rescheduling of failed batch allocations and batching of allocs
// with fail times that are close together
func TestReconciler_RescheduleLaterWithBatchedEvals_Batch(t *testing.T) {
	ci.Parallel(t)
	require := require.New(t)

	// Set desired 10
	job := mock.Job()
	job.TaskGroups[0].Count = 10
	now := time.Now()

	// Set up reschedule policy
	delayDur := 15 * time.Second
	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 3, Interval: 24 * time.Hour, Delay: delayDur, DelayFunction: "constant"}
	tgName := job.TaskGroups[0].Name

	// Create 10 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 10; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
		alloc.ClientStatus = structs.AllocClientStatusRunning
	}

	// Mark 5 as failed with fail times very close together
	for i := 0; i < 5; i++ {
		allocs[i].ClientStatus = structs.AllocClientStatusFailed
		allocs[i].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
			StartedAt:  now.Add(-1 * time.Hour),
			FinishedAt: now.Add(time.Duration(50*i) * time.Millisecond)}}
	}

	// Mark two more as failed several seconds later
	for i := 5; i < 7; i++ {
		allocs[i].ClientStatus = structs.AllocClientStatusFailed
		allocs[i].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
			StartedAt:  now.Add(-1 * time.Hour),
			FinishedAt: now.Add(10 * time.Second)}}
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, uuid.Generate(), 50, true)
	r := reconciler.Compute()

	// Verify that two follow up evals were created
	evals := r.desiredFollowupEvals[tgName]
	require.NotNil(evals)
	require.Equal(2, len(evals))

	// Verify expected WaitUntil values for both batched evals
	require.Equal(now.Add(delayDur), evals[0].WaitUntil)
	secondBatchDuration := delayDur + 10*time.Second
	require.Equal(now.Add(secondBatchDuration), evals[1].WaitUntil)

	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             0,
		inplace:           0,
		attributeUpdates:  7,
		stop:              0,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Place:         0,
				InPlaceUpdate: 0,
				Ignore:        10,
				Stop:          0,
			},
		},
	})
	assertNamesHaveIndexes(t, intRange(0, 6), attributeUpdatesToNames(r.attributeUpdates))

	// Verify that the followup evalID field is set correctly
	for _, alloc := range r.attributeUpdates {
		if allocNameToIndex(alloc.Name) < 5 {
			require.Equal(evals[0].ID, alloc.FollowupEvalID)
		} else if allocNameToIndex(alloc.Name) < 7 {
			require.Equal(evals[1].ID, alloc.FollowupEvalID)
		} else {
			t.Fatalf("Unexpected alloc name in Inplace results %v", alloc.Name)
		}
	}
}

// Tests rescheduling failed batch allocations
func TestReconciler_RescheduleNow_Batch(t *testing.T) {
	ci.Parallel(t)
	require := require.New(t)

	// Set desired 4
	job := mock.Job()
	job.TaskGroups[0].Count = 4
	now := time.Now()

	// Set up reschedule policy
	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 3, Interval: 24 * time.Hour, Delay: 5 * time.Second, DelayFunction: "constant"}
	tgName := job.TaskGroups[0].Name

	// Create 6 existing allocations - 2 running, 1 complete and 3 failed
	var allocs []*structs.Allocation
	for i := 0; i < 6; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
		alloc.ClientStatus = structs.AllocClientStatusRunning
	}

	// Mark 3 as failed with restart tracking info
	allocs[0].ClientStatus = structs.AllocClientStatusFailed
	allocs[0].NextAllocation = allocs[1].ID

	allocs[1].ClientStatus = structs.AllocClientStatusFailed
	allocs[1].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
		{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
			PrevAllocID: allocs[0].ID,
			PrevNodeID:  uuid.Generate(),
		},
	}}
	allocs[1].NextAllocation = allocs[2].ID

	allocs[2].ClientStatus = structs.AllocClientStatusFailed
	allocs[2].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
		StartedAt:  now.Add(-1 * time.Hour),
		FinishedAt: now.Add(-5 * time.Second)}}
	allocs[2].FollowupEvalID = uuid.Generate()
	allocs[2].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
		{RescheduleTime: time.Now().Add(-2 * time.Hour).UTC().UnixNano(),
			PrevAllocID: allocs[0].ID,
			PrevNodeID:  uuid.Generate(),
		},
		{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
			PrevAllocID: allocs[1].ID,
			PrevNodeID:  uuid.Generate(),
		},
	}}

	// Mark one as complete
	allocs[5].ClientStatus = structs.AllocClientStatusComplete

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, "", 50, true)
	reconciler.now = now
	r := reconciler.Compute()

	// Verify that no follow up evals were
created evals := r.desiredFollowupEvals[tgName] require.Nil(evals) // Two reschedule attempts were made, one more can be made now // Alloc 5 should not be replaced because it is terminal assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 1, stop: 1, inplace: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, Stop: 1, Ignore: 3, }, }, }) assertNamesHaveIndexes(t, intRange(2, 2), placeResultsToNames(r.place)) assertPlaceResultsHavePreviousAllocs(t, 1, r.place) assertPlacementsAreRescheduled(t, 1, r.place) } // Tests rescheduling failed service allocations with desired state stop func TestReconciler_RescheduleLater_Service(t *testing.T) { ci.Parallel(t) require := require.New(t) // Set desired 5 job := mock.Job() job.TaskGroups[0].Count = 5 tgName := job.TaskGroups[0].Name now := time.Now() // Set up reschedule policy delayDur := 15 * time.Second job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 1, Interval: 24 * time.Hour, Delay: delayDur, MaxDelay: 1 * time.Hour} // Create 5 existing allocations var allocs []*structs.Allocation for i := 0; i < 5; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) alloc.ClientStatus = structs.AllocClientStatusRunning } // Mark two as failed allocs[0].ClientStatus = structs.AllocClientStatusFailed // Mark one of them as already rescheduled once allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{ {RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(), PrevAllocID: uuid.Generate(), PrevNodeID: uuid.Generate(), }, }} allocs[1].TaskStates = map[string]*structs.TaskState{tgName: {State: "start", StartedAt: now.Add(-1 * time.Hour), FinishedAt: now}} allocs[1].ClientStatus = structs.AllocClientStatusFailed // Mark one as desired state stop allocs[4].DesiredStatus = structs.AllocDesiredStatusStop reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, uuid.Generate(), 50, true) r := reconciler.Compute() // Should place a new placement and create a follow up eval for the delayed reschedule // Verify that the follow up eval has the expected waitUntil time evals := r.desiredFollowupEvals[tgName] require.NotNil(evals) require.Equal(1, len(evals)) require.Equal(now.Add(delayDur), evals[0].WaitUntil) assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 1, inplace: 0, attributeUpdates: 1, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, InPlaceUpdate: 0, Ignore: 4, Stop: 0, }, }, }) assertNamesHaveIndexes(t, intRange(4, 4), placeResultsToNames(r.place)) assertNamesHaveIndexes(t, intRange(1, 1), attributeUpdatesToNames(r.attributeUpdates)) // Verify that the followup evalID field is set correctly var annotated *structs.Allocation for _, a := range r.attributeUpdates { annotated = a } require.Equal(evals[0].ID, annotated.FollowupEvalID) } // Tests service allocations with client status complete func TestReconciler_Service_ClientStatusComplete(t *testing.T) { ci.Parallel(t) // Set desired 5 job := mock.Job() job.TaskGroups[0].Count = 5 // Set up reschedule policy delayDur := 15 * time.Second job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ Attempts: 1, Interval: 24 * time.Hour, Delay: delayDur, MaxDelay: 1 * 
time.Hour, } // Create 5 existing allocations var allocs []*structs.Allocation for i := 0; i < 5; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) alloc.ClientStatus = structs.AllocClientStatusRunning alloc.DesiredStatus = structs.AllocDesiredStatusRun } // Mark one as client status complete allocs[4].ClientStatus = structs.AllocClientStatusComplete reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() // Should place a new placement for the alloc that was marked complete assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 1, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, InPlaceUpdate: 0, Ignore: 4, }, }, }) assertNamesHaveIndexes(t, intRange(4, 4), placeResultsToNames(r.place)) } // Tests service job placement with desired stop and client status complete func TestReconciler_Service_DesiredStop_ClientStatusComplete(t *testing.T) { ci.Parallel(t) // Set desired 5 job := mock.Job() job.TaskGroups[0].Count = 5 // Set up reschedule policy delayDur := 15 * time.Second job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ Attempts: 1, Interval: 24 * time.Hour, Delay: delayDur, MaxDelay: 1 * time.Hour, } // Create 5 existing allocations var allocs []*structs.Allocation for i := 0; i < 5; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) alloc.ClientStatus = structs.AllocClientStatusRunning alloc.DesiredStatus = structs.AllocDesiredStatusRun } // Mark one as failed but with desired status stop // Should not trigger rescheduling logic but should trigger a placement allocs[4].ClientStatus = structs.AllocClientStatusFailed allocs[4].DesiredStatus = structs.AllocDesiredStatusStop reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() // Should place a new placement for the alloc that was marked stopped assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 1, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, InPlaceUpdate: 0, Ignore: 4, }, }, }) assertNamesHaveIndexes(t, intRange(4, 4), placeResultsToNames(r.place)) // Should not have any follow up evals created require := require.New(t) require.Equal(0, len(r.desiredFollowupEvals)) } // Tests rescheduling failed service allocations with desired state stop func TestReconciler_RescheduleNow_Service(t *testing.T) { ci.Parallel(t) require := require.New(t) // Set desired 5 job := mock.Job() job.TaskGroups[0].Count = 5 tgName := job.TaskGroups[0].Name now := time.Now() // Set up reschedule policy and update stanza job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ Attempts: 1, Interval: 24 * time.Hour, Delay: 5 * time.Second, DelayFunction: "", MaxDelay: 1 * time.Hour, Unlimited: false, } job.TaskGroups[0].Update = noCanaryUpdate // Create 5 existing allocations var allocs []*structs.Allocation for i := 0; i < 5; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = 
			structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
		alloc.ClientStatus = structs.AllocClientStatusRunning
	}

	// Mark two as failed
	allocs[0].ClientStatus = structs.AllocClientStatusFailed

	// Mark one of them as already rescheduled once
	allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
		{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
			PrevAllocID: uuid.Generate(),
			PrevNodeID:  uuid.Generate(),
		},
	}}

	allocs[1].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
		StartedAt:  now.Add(-1 * time.Hour),
		FinishedAt: now.Add(-10 * time.Second)}}
	allocs[1].ClientStatus = structs.AllocClientStatusFailed

	// Mark one as desired state stop
	allocs[4].DesiredStatus = structs.AllocDesiredStatusStop

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "", 50, true)
	r := reconciler.Compute()

	// Verify that no follow up evals were created
	evals := r.desiredFollowupEvals[tgName]
	require.Nil(evals)

	// Verify that one rescheduled alloc and one replacement for terminal alloc were placed
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             2,
		inplace:           0,
		stop:              1,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Place:  2,
				Ignore: 3,
				Stop:   1,
			},
		},
	})

	// Rescheduled allocs should have previous allocs
	assertNamesHaveIndexes(t, intRange(1, 1, 4, 4), placeResultsToNames(r.place))
	assertPlaceResultsHavePreviousAllocs(t, 1, r.place)
	assertPlacementsAreRescheduled(t, 1, r.place)
}

// Tests rescheduling failed service allocations when there's clock drift (up to a second)
func TestReconciler_RescheduleNow_WithinAllowedTimeWindow(t *testing.T) {
	ci.Parallel(t)
	require := require.New(t)

	// Set desired 5
	job := mock.Job()
	job.TaskGroups[0].Count = 5
	tgName := job.TaskGroups[0].Name
	now := time.Now()

	// Set up reschedule policy and update stanza
	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
		Attempts:      1,
		Interval:      24 * time.Hour,
		Delay:         5 * time.Second,
		DelayFunction: "",
		MaxDelay:      1 * time.Hour,
		Unlimited:     false,
	}
	job.TaskGroups[0].Update = noCanaryUpdate

	// Create 5 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 5; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
		alloc.ClientStatus = structs.AllocClientStatusRunning
	}

	// Mark one as failed
	allocs[0].ClientStatus = structs.AllocClientStatusFailed

	// Mark one of them as already rescheduled once
	allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
		{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
			PrevAllocID: uuid.Generate(),
			PrevNodeID:  uuid.Generate(),
		},
	}}

	// Set fail time to 4 seconds ago which falls within the reschedule window
	allocs[1].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
		StartedAt:  now.Add(-1 * time.Hour),
		FinishedAt: now.Add(-4 * time.Second)}}
	allocs[1].ClientStatus = structs.AllocClientStatusFailed

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "", 50, true)
	reconciler.now = now
	r := reconciler.Compute()

	// Verify that no follow up evals were created
	evals := r.desiredFollowupEvals[tgName]
	require.Nil(evals)

	// Verify that one rescheduled alloc was placed
	assertResults(t, r, &resultExpectation{
createDeployment: nil, deploymentUpdates: nil, place: 1, inplace: 0, stop: 1, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, Stop: 1, Ignore: 4, }, }, }) // Rescheduled allocs should have previous allocs assertNamesHaveIndexes(t, intRange(1, 1), placeResultsToNames(r.place)) assertPlaceResultsHavePreviousAllocs(t, 1, r.place) assertPlacementsAreRescheduled(t, 1, r.place) } // Tests rescheduling failed service allocations when the eval ID matches and there's a large clock drift func TestReconciler_RescheduleNow_EvalIDMatch(t *testing.T) { ci.Parallel(t) require := require.New(t) // Set desired 5 job := mock.Job() job.TaskGroups[0].Count = 5 tgName := job.TaskGroups[0].Name now := time.Now() // Set up reschedule policy and update stanza job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ Attempts: 1, Interval: 24 * time.Hour, Delay: 5 * time.Second, DelayFunction: "", MaxDelay: 1 * time.Hour, Unlimited: false, } job.TaskGroups[0].Update = noCanaryUpdate // Create 5 existing allocations var allocs []*structs.Allocation for i := 0; i < 5; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) alloc.ClientStatus = structs.AllocClientStatusRunning } // Mark one as failed allocs[0].ClientStatus = structs.AllocClientStatusFailed // Mark one of them as already rescheduled once allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{ {RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(), PrevAllocID: uuid.Generate(), PrevNodeID: uuid.Generate(), }, }} // Set fail time to 5 seconds ago and eval ID evalID := uuid.Generate() allocs[1].TaskStates = map[string]*structs.TaskState{tgName: {State: "start", StartedAt: now.Add(-1 * time.Hour), FinishedAt: now.Add(-5 * time.Second)}} allocs[1].ClientStatus = structs.AllocClientStatusFailed allocs[1].FollowupEvalID = evalID reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, evalID, 50, true) reconciler.now = now.Add(-30 * time.Second) r := reconciler.Compute() // Verify that no follow up evals were created evals := r.desiredFollowupEvals[tgName] require.Nil(evals) // Verify that one rescheduled alloc was placed assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 1, stop: 1, inplace: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, Stop: 1, Ignore: 4, }, }, }) // Rescheduled allocs should have previous allocs assertNamesHaveIndexes(t, intRange(1, 1), placeResultsToNames(r.place)) assertPlaceResultsHavePreviousAllocs(t, 1, r.place) assertPlacementsAreRescheduled(t, 1, r.place) } // Tests rescheduling failed service allocations when there are canaries func TestReconciler_RescheduleNow_Service_WithCanaries(t *testing.T) { ci.Parallel(t) require := require.New(t) // Set desired 5 job := mock.Job() job.TaskGroups[0].Count = 5 tgName := job.TaskGroups[0].Name now := time.Now() // Set up reschedule policy and update stanza job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ Attempts: 1, Interval: 24 * time.Hour, Delay: 5 * time.Second, DelayFunction: "", MaxDelay: 1 * time.Hour, Unlimited: false, } job.TaskGroups[0].Update = canaryUpdate job2 := job.Copy() job2.Version++ d := structs.NewDeployment(job2, 50) d.StatusDescription = 
structs.DeploymentStatusDescriptionRunningNeedsPromotion s := &structs.DeploymentState{ DesiredCanaries: 2, DesiredTotal: 5, } d.TaskGroups[job.TaskGroups[0].Name] = s // Create 5 existing allocations var allocs []*structs.Allocation for i := 0; i < 5; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) alloc.ClientStatus = structs.AllocClientStatusRunning } // Mark three as failed allocs[0].ClientStatus = structs.AllocClientStatusFailed // Mark one of them as already rescheduled once allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{ {RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(), PrevAllocID: uuid.Generate(), PrevNodeID: uuid.Generate(), }, }} allocs[1].TaskStates = map[string]*structs.TaskState{tgName: {State: "start", StartedAt: now.Add(-1 * time.Hour), FinishedAt: now.Add(-10 * time.Second)}} allocs[1].ClientStatus = structs.AllocClientStatusFailed // Mark a third one as failed allocs[4].ClientStatus = structs.AllocClientStatusFailed // Create 2 canary allocations for i := 0; i < 2; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.ClientStatus = structs.AllocClientStatusRunning alloc.DeploymentID = d.ID alloc.DeploymentStatus = &structs.AllocDeploymentStatus{ Canary: true, Healthy: helper.BoolToPtr(false), } s.PlacedCanaries = append(s.PlacedCanaries, alloc.ID) allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job2, d, allocs, nil, "", 50, true) r := reconciler.Compute() // Verify that no follow up evals were created evals := r.desiredFollowupEvals[tgName] require.Nil(evals) // Verify that one rescheduled alloc and one replacement for terminal alloc were placed assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 2, stop: 2, inplace: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 2, Stop: 2, Ignore: 5, }, }, }) // Rescheduled allocs should have previous allocs assertNamesHaveIndexes(t, intRange(1, 1, 4, 4), placeResultsToNames(r.place)) assertPlaceResultsHavePreviousAllocs(t, 2, r.place) assertPlacementsAreRescheduled(t, 2, r.place) } // Tests rescheduling failed canary service allocations func TestReconciler_RescheduleNow_Service_Canaries(t *testing.T) { ci.Parallel(t) require := require.New(t) // Set desired 5 job := mock.Job() job.TaskGroups[0].Count = 5 tgName := job.TaskGroups[0].Name now := time.Now() // Set up reschedule policy and update stanza job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ Delay: 5 * time.Second, DelayFunction: "constant", MaxDelay: 1 * time.Hour, Unlimited: true, } job.TaskGroups[0].Update = canaryUpdate job2 := job.Copy() job2.Version++ d := structs.NewDeployment(job2, 50) d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion s := &structs.DeploymentState{ DesiredCanaries: 2, DesiredTotal: 5, } d.TaskGroups[job.TaskGroups[0].Name] = s // Create 5 existing allocations var allocs []*structs.Allocation for i := 0; i < 5; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) alloc.ClientStatus 
= structs.AllocClientStatusRunning } // Create 2 healthy canary allocations for i := 0; i < 2; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.ClientStatus = structs.AllocClientStatusRunning alloc.DeploymentID = d.ID alloc.DeploymentStatus = &structs.AllocDeploymentStatus{ Canary: true, Healthy: helper.BoolToPtr(false), } s.PlacedCanaries = append(s.PlacedCanaries, alloc.ID) allocs = append(allocs, alloc) } // Mark the canaries as failed allocs[5].ClientStatus = structs.AllocClientStatusFailed allocs[5].DesiredTransition.Reschedule = helper.BoolToPtr(true) // Mark one of them as already rescheduled once allocs[5].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{ {RescheduleTime: now.Add(-1 * time.Hour).UTC().UnixNano(), PrevAllocID: uuid.Generate(), PrevNodeID: uuid.Generate(), }, }} allocs[6].TaskStates = map[string]*structs.TaskState{tgName: {State: "start", StartedAt: now.Add(-1 * time.Hour), FinishedAt: now.Add(-10 * time.Second)}} allocs[6].ClientStatus = structs.AllocClientStatusFailed allocs[6].DesiredTransition.Reschedule = helper.BoolToPtr(true) // Create 4 unhealthy canary allocations that have already been replaced for i := 0; i < 4; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i%2)) alloc.ClientStatus = structs.AllocClientStatusFailed alloc.DeploymentID = d.ID alloc.DeploymentStatus = &structs.AllocDeploymentStatus{ Canary: true, Healthy: helper.BoolToPtr(false), } s.PlacedCanaries = append(s.PlacedCanaries, alloc.ID) allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job2, d, allocs, nil, "", 50, true) reconciler.now = now r := reconciler.Compute() // Verify that no follow up evals were created evals := r.desiredFollowupEvals[tgName] require.Nil(evals) // Verify that one rescheduled alloc and one replacement for terminal alloc were placed assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 2, stop: 2, inplace: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 2, Stop: 2, Ignore: 9, }, }, }) // Rescheduled allocs should have previous allocs assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place)) assertPlaceResultsHavePreviousAllocs(t, 2, r.place) assertPlacementsAreRescheduled(t, 2, r.place) } // Tests rescheduling failed canary service allocations when one has reached its // reschedule limit func TestReconciler_RescheduleNow_Service_Canaries_Limit(t *testing.T) { ci.Parallel(t) require := require.New(t) // Set desired 5 job := mock.Job() job.TaskGroups[0].Count = 5 tgName := job.TaskGroups[0].Name now := time.Now() // Set up reschedule policy and update stanza job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ Attempts: 1, Interval: 24 * time.Hour, Delay: 5 * time.Second, DelayFunction: "", MaxDelay: 1 * time.Hour, Unlimited: false, } job.TaskGroups[0].Update = canaryUpdate job2 := job.Copy() job2.Version++ d := structs.NewDeployment(job2, 50) d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion s := &structs.DeploymentState{ DesiredCanaries: 2, DesiredTotal: 5, } d.TaskGroups[job.TaskGroups[0].Name] = s // Create 5 existing allocations var allocs []*structs.Allocation for i := 0; i < 5; i++ { alloc 
:= mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) alloc.ClientStatus = structs.AllocClientStatusRunning } // Create 2 healthy canary allocations for i := 0; i < 2; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.ClientStatus = structs.AllocClientStatusRunning alloc.DeploymentID = d.ID alloc.DeploymentStatus = &structs.AllocDeploymentStatus{ Canary: true, Healthy: helper.BoolToPtr(false), } s.PlacedCanaries = append(s.PlacedCanaries, alloc.ID) allocs = append(allocs, alloc) } // Mark the canaries as failed allocs[5].ClientStatus = structs.AllocClientStatusFailed allocs[5].DesiredTransition.Reschedule = helper.BoolToPtr(true) // Mark one of them as already rescheduled once allocs[5].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{ {RescheduleTime: now.Add(-1 * time.Hour).UTC().UnixNano(), PrevAllocID: uuid.Generate(), PrevNodeID: uuid.Generate(), }, }} allocs[6].TaskStates = map[string]*structs.TaskState{tgName: {State: "start", StartedAt: now.Add(-1 * time.Hour), FinishedAt: now.Add(-10 * time.Second)}} allocs[6].ClientStatus = structs.AllocClientStatusFailed allocs[6].DesiredTransition.Reschedule = helper.BoolToPtr(true) // Create 4 unhealthy canary allocations that have already been replaced for i := 0; i < 4; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i%2)) alloc.ClientStatus = structs.AllocClientStatusFailed alloc.DeploymentID = d.ID alloc.DeploymentStatus = &structs.AllocDeploymentStatus{ Canary: true, Healthy: helper.BoolToPtr(false), } s.PlacedCanaries = append(s.PlacedCanaries, alloc.ID) allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job2, d, allocs, nil, "", 50, true) reconciler.now = now r := reconciler.Compute() // Verify that no follow up evals were created evals := r.desiredFollowupEvals[tgName] require.Nil(evals) // Verify that one rescheduled alloc and one replacement for terminal alloc were placed assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 1, stop: 1, inplace: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, Stop: 1, Ignore: 10, }, }, }) // Rescheduled allocs should have previous allocs assertNamesHaveIndexes(t, intRange(1, 1), placeResultsToNames(r.place)) assertPlaceResultsHavePreviousAllocs(t, 1, r.place) assertPlacementsAreRescheduled(t, 1, r.place) } // Tests failed service allocations that were already rescheduled won't be rescheduled again func TestReconciler_DontReschedule_PreviouslyRescheduled(t *testing.T) { ci.Parallel(t) // Set desired 5 job := mock.Job() job.TaskGroups[0].Count = 5 // Set up reschedule policy job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 5, Interval: 24 * time.Hour} // Create 7 existing allocations var allocs []*structs.Allocation for i := 0; i < 7; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) alloc.ClientStatus = structs.AllocClientStatusRunning } // Mark two as failed and rescheduled 
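// Note on the setup below (an interpretation of this test, not an authoritative claim): the reconciler keys its allocation sets by ID, so giving allocs[0] the same ID as allocs[1] collapses the two failed allocs into a single entry that already carries a RescheduleTracker and a NextAllocation. That entry is not rescheduled again, which is presumably why the single replacement placement is free to reuse name index 0.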
allocs[0].ClientStatus = structs.AllocClientStatusFailed allocs[0].ID = allocs[1].ID allocs[1].ClientStatus = structs.AllocClientStatusFailed allocs[1].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{ {RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(), PrevAllocID: uuid.Generate(), PrevNodeID: uuid.Generate(), }, }} allocs[1].NextAllocation = allocs[2].ID // Mark one as desired state stop allocs[4].DesiredStatus = structs.AllocDesiredStatusStop reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() // Should place 1 - one is a new placement to make up the desired count of 5 // failing allocs are not rescheduled assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 1, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, Ignore: 4, }, }, }) // name index 0 is used for the replacement because its assertNamesHaveIndexes(t, intRange(0, 0), placeResultsToNames(r.place)) } // Tests the reconciler cancels an old deployment when the job is being stopped func TestReconciler_CancelDeployment_JobStop(t *testing.T) { ci.Parallel(t) job := mock.Job() job.Stop = true running := structs.NewDeployment(job, 50) failed := structs.NewDeployment(job, 50) failed.Status = structs.DeploymentStatusFailed cases := []struct { name string job *structs.Job jobID, taskGroup string deployment *structs.Deployment cancel bool }{ { name: "stopped job, running deployment", job: job, jobID: job.ID, taskGroup: job.TaskGroups[0].Name, deployment: running, cancel: true, }, { name: "nil job, running deployment", job: nil, jobID: "foo", taskGroup: "bar", deployment: running, cancel: true, }, { name: "stopped job, failed deployment", job: job, jobID: job.ID, taskGroup: job.TaskGroups[0].Name, deployment: failed, cancel: false, }, { name: "nil job, failed deployment", job: nil, jobID: "foo", taskGroup: "bar", deployment: failed, cancel: false, }, } for _, c := range cases { t.Run(c.name, func(t *testing.T) { // Create 10 allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = c.job alloc.JobID = c.jobID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(c.jobID, c.taskGroup, uint(i)) alloc.TaskGroup = c.taskGroup allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, c.jobID, c.job, c.deployment, allocs, nil, "", 50, true) r := reconciler.Compute() var updates []*structs.DeploymentStatusUpdate if c.cancel { updates = []*structs.DeploymentStatusUpdate{ { DeploymentID: c.deployment.ID, Status: structs.DeploymentStatusCancelled, StatusDescription: structs.DeploymentStatusDescriptionStoppedJob, }, } } // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: updates, place: 0, inplace: 0, stop: 10, desiredTGUpdates: map[string]*structs.DesiredUpdates{ c.taskGroup: { Stop: 10, }, }, }) assertNamesHaveIndexes(t, intRange(0, 9), stopResultsToNames(r.stop)) }) } } // Tests the reconciler cancels an old deployment when the job is updated func TestReconciler_CancelDeployment_JobUpdate(t *testing.T) { ci.Parallel(t) // Create a base job job := mock.Job() // Create two deployments running := structs.NewDeployment(job, 50) failed := structs.NewDeployment(job, 50) failed.Status = structs.DeploymentStatusFailed // Make the job newer 
than the deployment job.Version += 10 cases := []struct { name string deployment *structs.Deployment cancel bool }{ { name: "running deployment", deployment: running, cancel: true, }, { name: "failed deployment", deployment: failed, cancel: false, }, } for _, c := range cases { t.Run(c.name, func(t *testing.T) { // Create 10 allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, c.deployment, allocs, nil, "", 50, true) r := reconciler.Compute() var updates []*structs.DeploymentStatusUpdate if c.cancel { updates = []*structs.DeploymentStatusUpdate{ { DeploymentID: c.deployment.ID, Status: structs.DeploymentStatusCancelled, StatusDescription: structs.DeploymentStatusDescriptionNewerJob, }, } } // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: updates, place: 0, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Ignore: 10, }, }, }) }) } } // Tests the reconciler creates a deployment and does a rolling upgrade with // destructive changes func TestReconciler_CreateDeployment_RollingUpgrade_Destructive(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = noCanaryUpdate // Create 10 allocations from the old job var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() d := structs.NewDeployment(job, 50) d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ DesiredTotal: 10, } // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: d, deploymentUpdates: nil, destructive: 4, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { DestructiveUpdate: 4, Ignore: 6, }, }, }) assertNamesHaveIndexes(t, intRange(0, 3), destructiveResultsToNames(r.destructiveUpdate)) } // Tests the reconciler creates a deployment for inplace updates func TestReconciler_CreateDeployment_RollingUpgrade_Inplace(t *testing.T) { ci.Parallel(t) jobOld := mock.Job() job := jobOld.Copy() job.Version++ job.TaskGroups[0].Update = noCanaryUpdate // Create 10 allocations from the old job var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = jobOld alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() d := structs.NewDeployment(job, 50) d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ DesiredTotal: 10, } // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: d, deploymentUpdates: nil, place: 0, inplace: 10, stop: 0, 
desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { InPlaceUpdate: 10, }, }, }) } // Tests the reconciler creates a deployment when the job has a newer create index func TestReconciler_CreateDeployment_NewerCreateIndex(t *testing.T) { ci.Parallel(t) jobOld := mock.Job() job := jobOld.Copy() job.TaskGroups[0].Update = noCanaryUpdate job.CreateIndex += 100 // Create 5 allocations from the old job var allocs []*structs.Allocation for i := 0; i < 5; i++ { alloc := mock.Alloc() alloc.Job = jobOld alloc.JobID = jobOld.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() d := structs.NewDeployment(job, 50) d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ DesiredTotal: 5, } // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: d, deploymentUpdates: nil, place: 5, destructive: 0, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { InPlaceUpdate: 0, Ignore: 5, Place: 5, DestructiveUpdate: 0, }, }, }) } // Tests the reconciler doesn't create a deployment if there are no changes func TestReconciler_DontCreateDeployment_NoChanges(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = noCanaryUpdate // Create 10 allocations from the job var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 0, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { DestructiveUpdate: 0, Ignore: 10, }, }, }) } // Tests the reconciler doesn't place any more canaries when the deployment is // paused or failed func TestReconciler_PausedOrFailedDeployment_NoMoreCanaries(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = canaryUpdate cases := []struct { name string deploymentStatus string stop uint64 }{ { name: "paused deployment", deploymentStatus: structs.DeploymentStatusPaused, stop: 0, }, { name: "failed deployment", deploymentStatus: structs.DeploymentStatusFailed, stop: 1, }, } for _, c := range cases { t.Run(c.name, func(t *testing.T) { // Create a deployment that is paused/failed and has placed some canaries d := structs.NewDeployment(job, 50) d.Status = c.deploymentStatus d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ Promoted: false, DesiredCanaries: 2, DesiredTotal: 10, PlacedAllocs: 1, } // Create 10 allocations for the original job var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } // Create one canary canary := mock.Alloc() canary.Job = job canary.JobID = 
job.ID canary.NodeID = uuid.Generate() canary.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, 0) canary.TaskGroup = job.TaskGroups[0].Name canary.DeploymentID = d.ID allocs = append(allocs, canary) d.TaskGroups[canary.TaskGroup].PlacedCanaries = []string{canary.ID} mockUpdateFn := allocUpdateFnMock(map[string]allocUpdateType{canary.ID: allocUpdateFnIgnore}, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 0, inplace: 0, stop: int(c.stop), desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Ignore: 11 - c.stop, Stop: c.stop, }, }, }) }) } } // Tests the reconciler doesn't place any more allocs when the deployment is // paused or failed func TestReconciler_PausedOrFailedDeployment_NoMorePlacements(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = noCanaryUpdate job.TaskGroups[0].Count = 15 cases := []struct { name string deploymentStatus string }{ { name: "paused deployment", deploymentStatus: structs.DeploymentStatusPaused, }, { name: "failed deployment", deploymentStatus: structs.DeploymentStatusFailed, }, } for _, c := range cases { t.Run(c.name, func(t *testing.T) { // Create a deployment that is paused and has placed some canaries d := structs.NewDeployment(job, 50) d.Status = c.deploymentStatus d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ Promoted: false, DesiredTotal: 15, PlacedAllocs: 10, } // Create 10 allocations for the new job var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 0, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Ignore: 10, }, }, }) }) } } // Tests the reconciler doesn't do any more destructive updates when the // deployment is paused or failed func TestReconciler_PausedOrFailedDeployment_NoMoreDestructiveUpdates(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = noCanaryUpdate cases := []struct { name string deploymentStatus string }{ { name: "paused deployment", deploymentStatus: structs.DeploymentStatusPaused, }, { name: "failed deployment", deploymentStatus: structs.DeploymentStatusFailed, }, } for _, c := range cases { t.Run(c.name, func(t *testing.T) { // Create a deployment that is paused and has placed some canaries d := structs.NewDeployment(job, 50) d.Status = c.deploymentStatus d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ Promoted: false, DesiredTotal: 10, PlacedAllocs: 1, } // Create 9 allocations for the original job var allocs []*structs.Allocation for i := 1; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } // Create one for 
the new job newAlloc := mock.Alloc() newAlloc.Job = job newAlloc.JobID = job.ID newAlloc.NodeID = uuid.Generate() newAlloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, 0) newAlloc.TaskGroup = job.TaskGroups[0].Name newAlloc.DeploymentID = d.ID allocs = append(allocs, newAlloc) mockUpdateFn := allocUpdateFnMock(map[string]allocUpdateType{newAlloc.ID: allocUpdateFnIgnore}, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 0, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Ignore: 10, }, }, }) }) } } // Tests the reconciler handles migrating a canary correctly on a draining node func TestReconciler_DrainNode_Canary(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = canaryUpdate // Create a deployment that is paused and has placed some canaries d := structs.NewDeployment(job, 50) s := &structs.DeploymentState{ Promoted: false, DesiredTotal: 10, DesiredCanaries: 2, PlacedAllocs: 2, } d.TaskGroups[job.TaskGroups[0].Name] = s // Create 10 allocations from the old job var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } // Create two canaries for the new job handled := make(map[string]allocUpdateType) for i := 0; i < 2; i++ { // Create one canary canary := mock.Alloc() canary.Job = job canary.JobID = job.ID canary.NodeID = uuid.Generate() canary.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) canary.TaskGroup = job.TaskGroups[0].Name canary.DeploymentID = d.ID s.PlacedCanaries = append(s.PlacedCanaries, canary.ID) allocs = append(allocs, canary) handled[canary.ID] = allocUpdateFnIgnore } // Build a map of tainted nodes that contains the last canary tainted := make(map[string]*structs.Node, 1) n := mock.DrainNode() n.ID = allocs[11].NodeID allocs[11].DesiredTransition.Migrate = helper.BoolToPtr(true) tainted[n.ID] = n mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 1, inplace: 0, stop: 1, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Canary: 1, Ignore: 11, }, }, }) assertNamesHaveIndexes(t, intRange(1, 1), stopResultsToNames(r.stop)) assertNamesHaveIndexes(t, intRange(1, 1), placeResultsToNames(r.place)) } // Tests the reconciler handles migrating a canary correctly on a lost node func TestReconciler_LostNode_Canary(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = canaryUpdate // Create a deployment that is paused and has placed some canaries d := structs.NewDeployment(job, 50) s := &structs.DeploymentState{ Promoted: false, DesiredTotal: 10, DesiredCanaries: 2, PlacedAllocs: 2, } d.TaskGroups[job.TaskGroups[0].Name] = s // Create 10 allocations from the old job var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID 
= job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } // Create two canaries for the new job handled := make(map[string]allocUpdateType) for i := 0; i < 2; i++ { // Create one canary canary := mock.Alloc() canary.Job = job canary.JobID = job.ID canary.NodeID = uuid.Generate() canary.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) canary.TaskGroup = job.TaskGroups[0].Name s.PlacedCanaries = append(s.PlacedCanaries, canary.ID) canary.DeploymentID = d.ID allocs = append(allocs, canary) handled[canary.ID] = allocUpdateFnIgnore } // Build a map of tainted nodes that contains the last canary tainted := make(map[string]*structs.Node, 1) n := mock.Node() n.ID = allocs[11].NodeID n.Status = structs.NodeStatusDown tainted[n.ID] = n mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 1, inplace: 0, stop: 1, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Canary: 1, Ignore: 11, }, }, }) assertNamesHaveIndexes(t, intRange(1, 1), stopResultsToNames(r.stop)) assertNamesHaveIndexes(t, intRange(1, 1), placeResultsToNames(r.place)) } // Tests the reconciler handles stopping canaries from older deployments func TestReconciler_StopOldCanaries(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = canaryUpdate // Create an old deployment that has placed some canaries d := structs.NewDeployment(job, 50) s := &structs.DeploymentState{ Promoted: false, DesiredTotal: 10, DesiredCanaries: 2, PlacedAllocs: 2, } d.TaskGroups[job.TaskGroups[0].Name] = s // Update the job job.Version += 10 // Create 10 allocations from the old job var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } // Create canaries for i := 0; i < 2; i++ { // Create one canary canary := mock.Alloc() canary.Job = job canary.JobID = job.ID canary.NodeID = uuid.Generate() canary.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) canary.TaskGroup = job.TaskGroups[0].Name s.PlacedCanaries = append(s.PlacedCanaries, canary.ID) canary.DeploymentID = d.ID allocs = append(allocs, canary) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "", 50, true) r := reconciler.Compute() newD := structs.NewDeployment(job, 50) newD.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion newD.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ DesiredCanaries: 2, DesiredTotal: 10, } // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: newD, deploymentUpdates: []*structs.DeploymentStatusUpdate{ { DeploymentID: d.ID, Status: structs.DeploymentStatusCancelled, StatusDescription: structs.DeploymentStatusDescriptionNewerJob, }, }, place: 2, inplace: 0, stop: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Canary: 2, Stop: 2, Ignore: 10, }, }, }) assertNamesHaveIndexes(t, 
intRange(0, 1), stopResultsToNames(r.stop)) assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place)) } // Tests the reconciler creates new canaries when the job changes func TestReconciler_NewCanaries(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = canaryUpdate // Create 10 allocations from the old job var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() newD := structs.NewDeployment(job, 50) newD.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion newD.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ DesiredCanaries: 2, DesiredTotal: 10, } // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: newD, deploymentUpdates: nil, place: 2, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Canary: 2, Ignore: 10, }, }, }) assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place)) } // Tests the reconciler creates new canaries when the job changes and the // canary count is greater than the task group count func TestReconciler_NewCanaries_CountGreater(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Count = 3 job.TaskGroups[0].Update = canaryUpdate.Copy() job.TaskGroups[0].Update.Canary = 7 // Create 3 allocations from the old job var allocs []*structs.Allocation for i := 0; i < 3; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() newD := structs.NewDeployment(job, 50) newD.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion state := &structs.DeploymentState{ DesiredCanaries: 7, DesiredTotal: 3, } newD.TaskGroups[job.TaskGroups[0].Name] = state // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: newD, deploymentUpdates: nil, place: 7, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Canary: 7, Ignore: 3, }, }, }) assertNamesHaveIndexes(t, intRange(0, 2, 3, 6), placeResultsToNames(r.place)) } // Tests the reconciler creates new canaries when the job changes for multiple // task groups func TestReconciler_NewCanaries_MultiTG(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = canaryUpdate job.TaskGroups = append(job.TaskGroups, job.TaskGroups[0].Copy()) job.TaskGroups[0].Name = "tg2" // Create 10 allocations from the old job for each tg var allocs []*structs.Allocation for j := 0; j < 2; j++ { for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[j].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[j].Name allocs = append(allocs, alloc) } } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, 
job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() newD := structs.NewDeployment(job, 50) newD.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion state := &structs.DeploymentState{ DesiredCanaries: 2, DesiredTotal: 10, } newD.TaskGroups[job.TaskGroups[0].Name] = state newD.TaskGroups[job.TaskGroups[1].Name] = state.Copy() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: newD, deploymentUpdates: nil, place: 4, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Canary: 2, Ignore: 10, }, job.TaskGroups[1].Name: { Canary: 2, Ignore: 10, }, }, }) assertNamesHaveIndexes(t, intRange(0, 1, 0, 1), placeResultsToNames(r.place)) } // Tests the reconciler creates new canaries when the job changes and scales up func TestReconciler_NewCanaries_ScaleUp(t *testing.T) { ci.Parallel(t) // Scale the job up to 15 job := mock.Job() job.TaskGroups[0].Update = canaryUpdate job.TaskGroups[0].Count = 15 // Create 10 allocations from the old job var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() newD := structs.NewDeployment(job, 50) newD.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion newD.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ DesiredCanaries: 2, DesiredTotal: 15, } // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: newD, deploymentUpdates: nil, place: 2, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Canary: 2, Ignore: 10, }, }, }) assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place)) } // Tests the reconciler creates new canaries when the job changes and scales // down func TestReconciler_NewCanaries_ScaleDown(t *testing.T) { ci.Parallel(t) // Scale the job down to 5 job := mock.Job() job.TaskGroups[0].Update = canaryUpdate job.TaskGroups[0].Count = 5 // Create 10 allocations from the old job var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() newD := structs.NewDeployment(job, 50) newD.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion newD.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ DesiredCanaries: 2, DesiredTotal: 5, } // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: newD, deploymentUpdates: nil, place: 2, inplace: 0, stop: 5, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Canary: 2, Stop: 5, Ignore: 5, }, }, }) assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place)) assertNamesHaveIndexes(t, intRange(5, 9), stopResultsToNames(r.stop)) } // Tests the reconciler handles filling the names of 
partially placed canaries func TestReconciler_NewCanaries_FillNames(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = &structs.UpdateStrategy{ Canary: 4, MaxParallel: 2, HealthCheck: structs.UpdateStrategyHealthCheck_Checks, MinHealthyTime: 10 * time.Second, HealthyDeadline: 10 * time.Minute, } // Create an existing deployment that has placed some canaries d := structs.NewDeployment(job, 50) s := &structs.DeploymentState{ Promoted: false, DesiredTotal: 10, DesiredCanaries: 4, PlacedAllocs: 2, } d.TaskGroups[job.TaskGroups[0].Name] = s // Create 10 allocations from the old job var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } // Create canaries but pick names at the ends for i := 0; i < 4; i += 3 { // Create one canary canary := mock.Alloc() canary.Job = job canary.JobID = job.ID canary.NodeID = uuid.Generate() canary.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) canary.TaskGroup = job.TaskGroups[0].Name s.PlacedCanaries = append(s.PlacedCanaries, canary.ID) canary.DeploymentID = d.ID allocs = append(allocs, canary) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 2, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Canary: 2, Ignore: 12, }, }, }) assertNamesHaveIndexes(t, intRange(1, 2), placeResultsToNames(r.place)) } // Tests the reconciler handles canary promotion by unblocking max_parallel func TestReconciler_PromoteCanaries_Unblock(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = canaryUpdate // Create an existing deployment that has placed some canaries and mark them // promoted d := structs.NewDeployment(job, 50) s := &structs.DeploymentState{ Promoted: true, DesiredTotal: 10, DesiredCanaries: 2, PlacedAllocs: 2, } d.TaskGroups[job.TaskGroups[0].Name] = s // Create 10 allocations from the old job var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } // Create the canaries handled := make(map[string]allocUpdateType) for i := 0; i < 2; i++ { // Create one canary canary := mock.Alloc() canary.Job = job canary.JobID = job.ID canary.NodeID = uuid.Generate() canary.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) canary.TaskGroup = job.TaskGroups[0].Name s.PlacedCanaries = append(s.PlacedCanaries, canary.ID) canary.DeploymentID = d.ID canary.DeploymentStatus = &structs.AllocDeploymentStatus{ Healthy: helper.BoolToPtr(true), } allocs = append(allocs, canary) handled[canary.ID] = allocUpdateFnIgnore } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, destructive: 2, stop: 
2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Stop: 2, DestructiveUpdate: 2, Ignore: 8, }, }, }) assertNoCanariesStopped(t, d, r.stop) assertNamesHaveIndexes(t, intRange(2, 3), destructiveResultsToNames(r.destructiveUpdate)) assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop)) } // Tests the reconciler handles canary promotion when the canary count equals // the total correctly func TestReconciler_PromoteCanaries_CanariesEqualCount(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = canaryUpdate job.TaskGroups[0].Count = 2 // Create an existing deployment that has placed some canaries and mark them // promoted d := structs.NewDeployment(job, 50) s := &structs.DeploymentState{ Promoted: true, DesiredTotal: 2, DesiredCanaries: 2, PlacedAllocs: 2, HealthyAllocs: 2, } d.TaskGroups[job.TaskGroups[0].Name] = s // Create 2 allocations from the old job var allocs []*structs.Allocation for i := 0; i < 2; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } // Create the canaries handled := make(map[string]allocUpdateType) for i := 0; i < 2; i++ { // Create one canary canary := mock.Alloc() canary.Job = job canary.JobID = job.ID canary.NodeID = uuid.Generate() canary.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) canary.TaskGroup = job.TaskGroups[0].Name s.PlacedCanaries = append(s.PlacedCanaries, canary.ID) canary.DeploymentID = d.ID canary.DeploymentStatus = &structs.AllocDeploymentStatus{ Healthy: helper.BoolToPtr(true), } allocs = append(allocs, canary) handled[canary.ID] = allocUpdateFnIgnore } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "", 50, true) r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ { DeploymentID: d.ID, Status: structs.DeploymentStatusSuccessful, StatusDescription: structs.DeploymentStatusDescriptionSuccessful, }, } // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: updates, place: 0, inplace: 0, stop: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Stop: 2, Ignore: 2, }, }, }) assertNoCanariesStopped(t, d, r.stop) assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop)) } // Tests the reconciler checks the health of placed allocs to determine the // limit func TestReconciler_DeploymentLimit_HealthAccounting(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = noCanaryUpdate cases := []struct { healthy int }{ { healthy: 0, }, { healthy: 1, }, { healthy: 2, }, { healthy: 3, }, { healthy: 4, }, } for _, c := range cases { t.Run(fmt.Sprintf("%d healthy", c.healthy), func(t *testing.T) { // Create an existing deployment that has placed some canaries and mark them // promoted d := structs.NewDeployment(job, 50) d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ Promoted: true, DesiredTotal: 10, PlacedAllocs: 4, } // Create 6 allocations from the old job var allocs []*structs.Allocation for i := 4; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = 
job.TaskGroups[0].Name allocs = append(allocs, alloc) } // Create the new allocs handled := make(map[string]allocUpdateType) for i := 0; i < 4; i++ { new := mock.Alloc() new.Job = job new.JobID = job.ID new.NodeID = uuid.Generate() new.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) new.TaskGroup = job.TaskGroups[0].Name new.DeploymentID = d.ID if i < c.healthy { new.DeploymentStatus = &structs.AllocDeploymentStatus{ Healthy: helper.BoolToPtr(true), } } allocs = append(allocs, new) handled[new.ID] = allocUpdateFnIgnore } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, destructive: c.healthy, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { DestructiveUpdate: uint64(c.healthy), Ignore: uint64(10 - c.healthy), }, }, }) if c.healthy != 0 { assertNamesHaveIndexes(t, intRange(4, 3+c.healthy), destructiveResultsToNames(r.destructiveUpdate)) } }) } } // Tests the reconciler handles an alloc on a tainted node during a rolling // update func TestReconciler_TaintedNode_RollingUpgrade(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = noCanaryUpdate // Create an existing deployment that has some placed allocs d := structs.NewDeployment(job, 50) d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ Promoted: true, DesiredTotal: 10, PlacedAllocs: 7, } // Create 2 allocations from the old job var allocs []*structs.Allocation for i := 8; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } // Create the healthy replacements handled := make(map[string]allocUpdateType) for i := 0; i < 8; i++ { new := mock.Alloc() new.Job = job new.JobID = job.ID new.NodeID = uuid.Generate() new.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) new.TaskGroup = job.TaskGroups[0].Name new.DeploymentID = d.ID new.DeploymentStatus = &structs.AllocDeploymentStatus{ Healthy: helper.BoolToPtr(true), } allocs = append(allocs, new) handled[new.ID] = allocUpdateFnIgnore } // Build a map of tainted nodes tainted := make(map[string]*structs.Node, 3) for i := 0; i < 3; i++ { n := mock.Node() n.ID = allocs[2+i].NodeID if i == 0 { n.Status = structs.NodeStatusDown } else { n.DrainStrategy = mock.DrainNode().DrainStrategy allocs[2+i].DesiredTransition.Migrate = helper.BoolToPtr(true) } tainted[n.ID] = n } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 3, destructive: 2, stop: 3, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, // Place the lost Stop: 1, // Stop the lost Migrate: 2, // Migrate the tainted DestructiveUpdate: 2, Ignore: 5, }, }, }) assertNamesHaveIndexes(t, intRange(8, 9), destructiveResultsToNames(r.destructiveUpdate)) assertNamesHaveIndexes(t, intRange(0, 2), placeResultsToNames(r.place)) assertNamesHaveIndexes(t, intRange(0, 
2), stopResultsToNames(r.stop)) } // Tests the reconciler handles a failed deployment with allocs on tainted // nodes func TestReconciler_FailedDeployment_TaintedNodes(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = noCanaryUpdate // Create an existing failed deployment that has some placed allocs d := structs.NewDeployment(job, 50) d.Status = structs.DeploymentStatusFailed d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ Promoted: true, DesiredTotal: 10, PlacedAllocs: 4, } // Create 6 allocations from the old job var allocs []*structs.Allocation for i := 4; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } // Create the healthy replacements handled := make(map[string]allocUpdateType) for i := 0; i < 4; i++ { new := mock.Alloc() new.Job = job new.JobID = job.ID new.NodeID = uuid.Generate() new.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) new.TaskGroup = job.TaskGroups[0].Name new.DeploymentID = d.ID new.DeploymentStatus = &structs.AllocDeploymentStatus{ Healthy: helper.BoolToPtr(true), } allocs = append(allocs, new) handled[new.ID] = allocUpdateFnIgnore } // Build a map of tainted nodes tainted := make(map[string]*structs.Node, 2) for i := 0; i < 2; i++ { n := mock.Node() n.ID = allocs[6+i].NodeID if i == 0 { n.Status = structs.NodeStatusDown } else { n.DrainStrategy = mock.DrainNode().DrainStrategy allocs[6+i].DesiredTransition.Migrate = helper.BoolToPtr(true) } tainted[n.ID] = n } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 2, inplace: 0, stop: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, Migrate: 1, Stop: 1, Ignore: 8, }, }, }) assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place)) assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop)) } // Tests the reconciler handles a run after a deployment is complete // successfully. 
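// With the deployment already marked successful and every allocation healthy, the reconciler is expected to make no changes at all: no new deployment, no placements or stops, and all ten allocations ignored.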
func TestReconciler_CompleteDeployment(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = canaryUpdate d := structs.NewDeployment(job, 50) d.Status = structs.DeploymentStatusSuccessful d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ Promoted: true, DesiredTotal: 10, DesiredCanaries: 2, PlacedAllocs: 10, HealthyAllocs: 10, } // Create allocations from the old job var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name alloc.DeploymentID = d.ID alloc.DeploymentStatus = &structs.AllocDeploymentStatus{ Healthy: helper.BoolToPtr(true), } allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 0, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Ignore: 10, }, }, }) } // Tests that the reconciler marks a deployment as complete once there is // nothing left to place even if there are failed allocations that are part of // the deployment. func TestReconciler_MarkDeploymentComplete_FailedAllocations(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = noCanaryUpdate d := structs.NewDeployment(job, 50) d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ DesiredTotal: 10, PlacedAllocs: 20, HealthyAllocs: 10, } // Create 10 healthy allocs and 10 allocs that are failed var allocs []*structs.Allocation for i := 0; i < 20; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i%10)) alloc.TaskGroup = job.TaskGroups[0].Name alloc.DeploymentID = d.ID alloc.DeploymentStatus = &structs.AllocDeploymentStatus{} if i < 10 { alloc.ClientStatus = structs.AllocClientStatusRunning alloc.DeploymentStatus.Healthy = helper.BoolToPtr(true) } else { alloc.DesiredStatus = structs.AllocDesiredStatusStop alloc.ClientStatus = structs.AllocClientStatusFailed alloc.DeploymentStatus.Healthy = helper.BoolToPtr(false) } allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "", 50, true) r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ { DeploymentID: d.ID, Status: structs.DeploymentStatusSuccessful, StatusDescription: structs.DeploymentStatusDescriptionSuccessful, }, } // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: updates, place: 0, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Ignore: 10, }, }, }) } // Test that a failed deployment cancels non-promoted canaries func TestReconciler_FailedDeployment_CancelCanaries(t *testing.T) { ci.Parallel(t) // Create a job with two task groups job := mock.Job() job.TaskGroups[0].Update = canaryUpdate job.TaskGroups = append(job.TaskGroups, job.TaskGroups[0].Copy()) job.TaskGroups[1].Name = "two" // Create an existing failed deployment that has promoted one task group d := structs.NewDeployment(job, 50) d.Status = structs.DeploymentStatusFailed s0 := &structs.DeploymentState{ 
Promoted: true, DesiredTotal: 10, DesiredCanaries: 2, PlacedAllocs: 4, } s1 := &structs.DeploymentState{ Promoted: false, DesiredTotal: 10, DesiredCanaries: 2, PlacedAllocs: 2, } d.TaskGroups[job.TaskGroups[0].Name] = s0 d.TaskGroups[job.TaskGroups[1].Name] = s1 // Create 6 allocations from the old job var allocs []*structs.Allocation handled := make(map[string]allocUpdateType) for _, group := range []int{0, 1} { replacements := 4 state := s0 if group == 1 { replacements = 2 state = s1 } // Create the healthy replacements for i := 0; i < replacements; i++ { new := mock.Alloc() new.Job = job new.JobID = job.ID new.NodeID = uuid.Generate() new.Name = structs.AllocName(job.ID, job.TaskGroups[group].Name, uint(i)) new.TaskGroup = job.TaskGroups[group].Name new.DeploymentID = d.ID new.DeploymentStatus = &structs.AllocDeploymentStatus{ Healthy: helper.BoolToPtr(true), } allocs = append(allocs, new) handled[new.ID] = allocUpdateFnIgnore // Add the alloc to the canary list if i < 2 { state.PlacedCanaries = append(state.PlacedCanaries, new.ID) } } for i := replacements; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[group].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[group].Name allocs = append(allocs, alloc) } } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 0, inplace: 0, stop: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Ignore: 10, }, job.TaskGroups[1].Name: { Stop: 2, Ignore: 8, }, }, }) assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop)) } // Test that a failed deployment and updated job works func TestReconciler_FailedDeployment_NewJob(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = noCanaryUpdate // Create an existing failed deployment that has some placed allocs d := structs.NewDeployment(job, 50) d.Status = structs.DeploymentStatusFailed d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ Promoted: true, DesiredTotal: 10, PlacedAllocs: 4, } // Create 6 allocations from the old job var allocs []*structs.Allocation for i := 4; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } // Create the healthy replacements for i := 0; i < 4; i++ { new := mock.Alloc() new.Job = job new.JobID = job.ID new.NodeID = uuid.Generate() new.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) new.TaskGroup = job.TaskGroups[0].Name new.DeploymentID = d.ID new.DeploymentStatus = &structs.AllocDeploymentStatus{ Healthy: helper.BoolToPtr(true), } allocs = append(allocs, new) } // Up the job version jobNew := job.Copy() jobNew.Version += 100 reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, jobNew, d, allocs, nil, "", 50, true) r := reconciler.Compute() dnew := structs.NewDeployment(jobNew, 50) dnew.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ DesiredTotal: 10, } // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: dnew, 
deploymentUpdates: nil, destructive: 4, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { DestructiveUpdate: 4, Ignore: 6, }, }, }) assertNamesHaveIndexes(t, intRange(0, 3), destructiveResultsToNames(r.destructiveUpdate)) } // Tests the reconciler marks a deployment as complete func TestReconciler_MarkDeploymentComplete(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = noCanaryUpdate d := structs.NewDeployment(job, 50) d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ Promoted: true, DesiredTotal: 10, PlacedAllocs: 10, HealthyAllocs: 10, } // Create 10 healthy allocations attached to the deployment var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name alloc.DeploymentID = d.ID alloc.DeploymentStatus = &structs.AllocDeploymentStatus{ Healthy: helper.BoolToPtr(true), } allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "", 50, true) r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ { DeploymentID: d.ID, Status: structs.DeploymentStatusSuccessful, StatusDescription: structs.DeploymentStatusDescriptionSuccessful, }, } // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: updates, place: 0, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Ignore: 10, }, }, }) } // Tests the reconciler handles changing a job such that a deployment is created // while doing a scale up but as the second eval. func TestReconciler_JobChange_ScaleUp_SecondEval(t *testing.T) { ci.Parallel(t) // Scale the job up to 30 job := mock.Job() job.TaskGroups[0].Update = noCanaryUpdate job.TaskGroups[0].Count = 30 // Create a deployment that has placed 20 of the 30 desired allocations d := structs.NewDeployment(job, 50) d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ Promoted: false, DesiredTotal: 30, PlacedAllocs: 20, } // Create 10 allocations from the old job var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } // Create 20 allocations from the new job handled := make(map[string]allocUpdateType) for i := 10; i < 30; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.DeploymentID = d.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) handled[alloc.ID] = allocUpdateFnIgnore } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { // All should be ignored because nothing has been marked as // healthy.
Ignore: 30, }, }, }) } // Tests the reconciler doesn't stop allocations when doing a rolling upgrade // where the count of the old job allocs is < desired count. func TestReconciler_RollingUpgrade_MissingAllocs(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = noCanaryUpdate // Create 7 allocations from the old job var allocs []*structs.Allocation for i := 0; i < 7; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() d := structs.NewDeployment(job, 50) d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ DesiredTotal: 10, } // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: d, deploymentUpdates: nil, place: 3, destructive: 1, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 3, DestructiveUpdate: 1, Ignore: 6, }, }, }) assertNamesHaveIndexes(t, intRange(7, 9), placeResultsToNames(r.place)) assertNamesHaveIndexes(t, intRange(0, 0), destructiveResultsToNames(r.destructiveUpdate)) } // Tests that the reconciler handles rerunning a batch job in the case that the // allocations are from an older instance of the job. func TestReconciler_Batch_Rerun(t *testing.T) { ci.Parallel(t) job := mock.Job() job.Type = structs.JobTypeBatch job.TaskGroups[0].Update = nil // Create 10 allocations from the old job and have them be complete var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name alloc.ClientStatus = structs.AllocClientStatusComplete alloc.DesiredStatus = structs.AllocDesiredStatusStop allocs = append(allocs, alloc) } // Create a copy of the job that is "new" job2 := job.Copy() job2.CreateIndex++ reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job2.ID, job2, nil, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 10, destructive: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 10, DestructiveUpdate: 0, Ignore: 10, }, }, }) assertNamesHaveIndexes(t, intRange(0, 9), placeResultsToNames(r.place)) } // Test that a failed deployment will not result in rescheduling failed allocations func TestReconciler_FailedDeployment_DontReschedule(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = noCanaryUpdate tgName := job.TaskGroups[0].Name now := time.Now() // Create an existing failed deployment that has some placed allocs d := structs.NewDeployment(job, 50) d.Status = structs.DeploymentStatusFailed d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ Promoted: true, DesiredTotal: 5, PlacedAllocs: 4, } // Create 4 allocations and mark two as failed var allocs []*structs.Allocation for i := 0; i < 4; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name 
alloc.DeploymentID = d.ID allocs = append(allocs, alloc) } // Mark some allocations as failed so that they are reschedulable now allocs[2].ClientStatus = structs.AllocClientStatusFailed allocs[2].TaskStates = map[string]*structs.TaskState{tgName: {State: "start", StartedAt: now.Add(-1 * time.Hour), FinishedAt: now.Add(-10 * time.Second)}} allocs[3].ClientStatus = structs.AllocClientStatusFailed allocs[3].TaskStates = map[string]*structs.TaskState{tgName: {State: "start", StartedAt: now.Add(-1 * time.Hour), FinishedAt: now.Add(-10 * time.Second)}} reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert that no rescheduled placements were created assertResults(t, r, &resultExpectation{ place: 0, createDeployment: nil, deploymentUpdates: nil, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Ignore: 2, }, }, }) } // Test that a running deployment with failed allocs will not result in // rescheduling failed allocations unless they are marked as reschedulable. func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = noCanaryUpdate tgName := job.TaskGroups[0].Name now := time.Now() // Mock deployment with failed allocs, but deployment watcher hasn't marked it as failed yet d := structs.NewDeployment(job, 50) d.Status = structs.DeploymentStatusRunning d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ Promoted: false, DesiredTotal: 10, PlacedAllocs: 10, } // Create 10 allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name alloc.DeploymentID = d.ID alloc.ClientStatus = structs.AllocClientStatusFailed alloc.TaskStates = map[string]*structs.TaskState{tgName: {State: "start", StartedAt: now.Add(-1 * time.Hour), FinishedAt: now.Add(-10 * time.Second)}} allocs = append(allocs, alloc) } // Mark half of them as reschedulable for i := 0; i < 5; i++ { allocs[i].DesiredTransition.Reschedule = helper.BoolToPtr(true) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert that only the allocs marked as reschedulable were replaced assertResults(t, r, &resultExpectation{ place: 5, stop: 5, createDeployment: nil, deploymentUpdates: nil, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 5, Stop: 5, Ignore: 5, }, }, }) } // Test that a failed deployment cancels non-promoted canaries func TestReconciler_FailedDeployment_AutoRevert_CancelCanaries(t *testing.T) { ci.Parallel(t) // Create a job job := mock.Job() job.TaskGroups[0].Count = 3 job.TaskGroups[0].Update = &structs.UpdateStrategy{ Canary: 3, MaxParallel: 2, HealthCheck: structs.UpdateStrategyHealthCheck_Checks, MinHealthyTime: 10 * time.Second, HealthyDeadline: 10 * time.Minute, Stagger: 31 * time.Second, } // Create v1 of the job jobv1 := job.Copy() jobv1.Version = 1 jobv1.TaskGroups[0].Meta = map[string]string{"version": "1"} // Create v2 of the job jobv2 := job.Copy() jobv2.Version = 2 jobv2.TaskGroups[0].Meta = map[string]string{"version": "2"} d := structs.NewDeployment(jobv2, 50) state := &structs.DeploymentState{ Promoted: true, DesiredTotal: 3, PlacedAllocs: 3, HealthyAllocs: 3, }
d.TaskGroups[job.TaskGroups[0].Name] = state // Create the original var allocs []*structs.Allocation for i := 0; i < 3; i++ { new := mock.Alloc() new.Job = jobv2 new.JobID = job.ID new.NodeID = uuid.Generate() new.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) new.TaskGroup = job.TaskGroups[0].Name new.DeploymentID = d.ID new.DeploymentStatus = &structs.AllocDeploymentStatus{ Healthy: helper.BoolToPtr(true), } new.ClientStatus = structs.AllocClientStatusRunning allocs = append(allocs, new) } for i := 0; i < 3; i++ { new := mock.Alloc() new.Job = jobv1 new.JobID = jobv1.ID new.NodeID = uuid.Generate() new.Name = structs.AllocName(jobv1.ID, jobv1.TaskGroups[0].Name, uint(i)) new.TaskGroup = job.TaskGroups[0].Name new.DeploymentID = uuid.Generate() new.DeploymentStatus = &structs.AllocDeploymentStatus{ Healthy: helper.BoolToPtr(false), } new.DesiredStatus = structs.AllocDesiredStatusStop new.ClientStatus = structs.AllocClientStatusFailed allocs = append(allocs, new) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, jobv2, d, allocs, nil, "", 50, true) r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ { DeploymentID: d.ID, Status: structs.DeploymentStatusSuccessful, StatusDescription: structs.DeploymentStatusDescriptionSuccessful, }, } // Assert the correct results assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: updates, place: 0, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Stop: 0, InPlaceUpdate: 0, Ignore: 3, }, }, }) } // Test that a successful deployment with failed allocs will result in // rescheduling failed allocations func TestReconciler_SuccessfulDeploymentWithFailedAllocs_Reschedule(t *testing.T) { ci.Parallel(t) job := mock.Job() job.TaskGroups[0].Update = noCanaryUpdate tgName := job.TaskGroups[0].Name now := time.Now() // Mock deployment with failed allocs, but deployment watcher hasn't marked it as failed yet d := structs.NewDeployment(job, 50) d.Status = structs.DeploymentStatusSuccessful d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ Promoted: false, DesiredTotal: 10, PlacedAllocs: 10, } // Create 10 allocations var allocs []*structs.Allocation for i := 0; i < 10; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) alloc.TaskGroup = job.TaskGroups[0].Name alloc.DeploymentID = d.ID alloc.ClientStatus = structs.AllocClientStatusFailed alloc.TaskStates = map[string]*structs.TaskState{tgName: {State: "start", StartedAt: now.Add(-1 * time.Hour), FinishedAt: now.Add(-10 * time.Second)}} allocs = append(allocs, alloc) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "", 50, true) r := reconciler.Compute() // Assert that rescheduled placements were created assertResults(t, r, &resultExpectation{ place: 10, stop: 10, createDeployment: nil, deploymentUpdates: nil, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 10, Stop: 10, Ignore: 0, }, }, }) assertPlaceResultsHavePreviousAllocs(t, 10, r.place) } // Tests force rescheduling a failed alloc that is past its reschedule limit func TestReconciler_ForceReschedule_Service(t *testing.T) { ci.Parallel(t) require := require.New(t) // Set desired 5 job := mock.Job() job.TaskGroups[0].Count = 5 tgName := job.TaskGroups[0].Name // Set up 
reschedule policy and update stanza job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ Attempts: 1, Interval: 24 * time.Hour, Delay: 5 * time.Second, DelayFunction: "", MaxDelay: 1 * time.Hour, Unlimited: false, } job.TaskGroups[0].Update = noCanaryUpdate // Create 5 existing allocations var allocs []*structs.Allocation for i := 0; i < 5; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) alloc.ClientStatus = structs.AllocClientStatusRunning } // Mark one as failed and past its reschedule limit so not eligible to reschedule allocs[0].ClientStatus = structs.AllocClientStatusFailed allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{ {RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(), PrevAllocID: uuid.Generate(), PrevNodeID: uuid.Generate(), }, }} // Mark DesiredTransition ForceReschedule allocs[0].DesiredTransition = structs.DesiredTransition{ForceReschedule: helper.BoolToPtr(true)} reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() // Verify that no follow up evals were created evals := r.desiredFollowupEvals[tgName] require.Nil(evals) // Verify that one rescheduled alloc was created because of the forced reschedule assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 1, stop: 1, inplace: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, Stop: 1, Ignore: 4, }, }, }) // Rescheduled allocs should have previous allocs assertNamesHaveIndexes(t, intRange(0, 0), placeResultsToNames(r.place)) assertPlaceResultsHavePreviousAllocs(t, 1, r.place) assertPlacementsAreRescheduled(t, 1, r.place) } // Tests behavior of service failure with rescheduling policy preventing rescheduling: // new allocs should be placed to satisfy the job count, and current allocations are // left unmodified func TestReconciler_RescheduleNot_Service(t *testing.T) { ci.Parallel(t) require := require.New(t) // Set desired 5 job := mock.Job() job.TaskGroups[0].Count = 5 tgName := job.TaskGroups[0].Name now := time.Now() // Set up reschedule policy and update stanza job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ Attempts: 0, Interval: 24 * time.Hour, Delay: 5 * time.Second, DelayFunction: "", MaxDelay: 1 * time.Hour, Unlimited: false, } job.TaskGroups[0].Update = noCanaryUpdate // Create 5 existing allocations var allocs []*structs.Allocation for i := 0; i < 5; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) alloc.ClientStatus = structs.AllocClientStatusRunning } // Mark two as failed allocs[0].ClientStatus = structs.AllocClientStatusFailed // Mark one of them as already rescheduled once allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{ {RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(), PrevAllocID: uuid.Generate(), PrevNodeID: uuid.Generate(), }, }} allocs[1].TaskStates = map[string]*structs.TaskState{tgName: {State: "start", StartedAt: now.Add(-1 * time.Hour), FinishedAt: now.Add(-10 * time.Second)}} allocs[1].ClientStatus = structs.AllocClientStatusFailed // Mark one as desired state stop allocs[4].DesiredStatus 
= structs.AllocDesiredStatusStop reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "", 50, true) r := reconciler.Compute() // Verify that no follow up evals were created evals := r.desiredFollowupEvals[tgName] require.Nil(evals) // no rescheduling, ignore all 4 allocs // but place one to substitute allocs[4] that was stopped explicitly assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 1, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, Ignore: 4, Stop: 0, }, }, }) // none of the placement should have preallocs or rescheduled assertPlaceResultsHavePreviousAllocs(t, 0, r.place) assertPlacementsAreRescheduled(t, 0, r.place) } // Tests behavior of batch failure with rescheduling policy preventing rescheduling: // current allocations are left unmodified and no follow up func TestReconciler_RescheduleNot_Batch(t *testing.T) { ci.Parallel(t) require := require.New(t) // Set desired 4 job := mock.Job() job.TaskGroups[0].Count = 4 now := time.Now() // Set up reschedule policy job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ Attempts: 0, Interval: 24 * time.Hour, Delay: 5 * time.Second, DelayFunction: "constant", } tgName := job.TaskGroups[0].Name // Create 6 existing allocations - 2 running, 1 complete and 3 failed var allocs []*structs.Allocation for i := 0; i < 6; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID alloc.NodeID = uuid.Generate() alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) allocs = append(allocs, alloc) alloc.ClientStatus = structs.AllocClientStatusRunning } // Mark 3 as failed with restart tracking info allocs[0].ClientStatus = structs.AllocClientStatusFailed allocs[0].NextAllocation = allocs[1].ID allocs[1].ClientStatus = structs.AllocClientStatusFailed allocs[1].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{ {RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(), PrevAllocID: allocs[0].ID, PrevNodeID: uuid.Generate(), }, }} allocs[1].NextAllocation = allocs[2].ID allocs[2].ClientStatus = structs.AllocClientStatusFailed allocs[2].TaskStates = map[string]*structs.TaskState{tgName: {State: "start", StartedAt: now.Add(-1 * time.Hour), FinishedAt: now.Add(-5 * time.Second)}} allocs[2].FollowupEvalID = uuid.Generate() allocs[2].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{ {RescheduleTime: time.Now().Add(-2 * time.Hour).UTC().UnixNano(), PrevAllocID: allocs[0].ID, PrevNodeID: uuid.Generate(), }, {RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(), PrevAllocID: allocs[1].ID, PrevNodeID: uuid.Generate(), }, }} // Mark one as complete allocs[5].ClientStatus = structs.AllocClientStatusComplete reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, "", 50, true) reconciler.now = now r := reconciler.Compute() // Verify that no follow up evals were created evals := r.desiredFollowupEvals[tgName] require.Nil(evals) // No reschedule attempts were made and all allocs are untouched assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 0, stop: 0, inplace: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 0, Stop: 0, Ignore: 4, }, }, }) } // Tests that when a node disconnects running allocations are queued to transition to unknown. 
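// Note: the buildResumableAllocations and buildDisconnectedNodes helpers used
// below are assumed to be defined elsewhere in this package; the former is
// expected to return a job plus allocations configured with
// max_client_disconnect, and the latter a map of node IDs to nodes in the
// disconnected state covering the requested number of allocations.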
func TestReconciler_Node_Disconnect_Updates_Alloc_To_Unknown(t *testing.T) { job, allocs := buildResumableAllocations(3, structs.AllocClientStatusRunning, structs.AllocDesiredStatusRun, 2) // Build a map of disconnected nodes nodes := buildDisconnectedNodes(allocs, 2) reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nodes, "", 50, true) reconciler.now = time.Now().UTC() results := reconciler.Compute() // Verify that 1 follow up eval was created with the values we expect. evals := results.desiredFollowupEvals[job.TaskGroups[0].Name] require.Len(t, evals, 1) expectedTime := reconciler.now.Add(5 * time.Minute) eval := evals[0] require.NotNil(t, eval.WaitUntil) require.Equal(t, expectedTime, eval.WaitUntil) // Validate that the queued disconnectUpdates have the right client status, // and that they have a valid FollowUpdEvalID. for _, disconnectUpdate := range results.disconnectUpdates { require.Equal(t, structs.AllocClientStatusUnknown, disconnectUpdate.ClientStatus) require.NotEmpty(t, disconnectUpdate.FollowupEvalID) require.Equal(t, eval.ID, disconnectUpdate.FollowupEvalID) } // 2 to place, 2 to update, 1 to ignore assertResults(t, results, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 2, stop: 0, inplace: 0, disconnectUpdates: 2, // 2 to place and 1 to ignore desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 2, Stop: 0, Ignore: 1, InPlaceUpdate: 0, }, }, }) } // Tests that when a node disconnects/reconnects allocations for that node are // reconciled according to the business rules. func TestReconciler_Disconnected_Client(t *testing.T) { type testCase struct { name string allocCount int disconnectedAllocCount int jobVersionIncrement uint64 nodeScoreIncrement float64 disconnectedAllocStatus string serverDesiredStatus string isBatch bool nodeStatusDisconnected bool replace bool failReplacement bool shouldStopOnDisconnectedNode bool maxDisconnect *time.Duration expected *resultExpectation } testCases := []testCase{ { name: "reconnect-original-no-replacement", allocCount: 2, replace: false, disconnectedAllocCount: 2, disconnectedAllocStatus: structs.AllocClientStatusRunning, serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: false, expected: &resultExpectation{ reconnectUpdates: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { Ignore: 2, }, }, }, }, { name: "resume-original-and-stop-replacement", allocCount: 3, replace: true, disconnectedAllocCount: 1, disconnectedAllocStatus: structs.AllocClientStatusRunning, serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: false, expected: &resultExpectation{ stop: 1, reconnectUpdates: 1, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { Stop: 1, Ignore: 3, }, }, }, }, { name: "stop-original-with-lower-node-score", allocCount: 4, replace: true, disconnectedAllocCount: 1, disconnectedAllocStatus: structs.AllocClientStatusRunning, serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: true, nodeScoreIncrement: 1, expected: &resultExpectation{ stop: 1, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { Stop: 1, Ignore: 4, }, }, }, }, { name: "stop-original-failed-on-reconnect", allocCount: 4, replace: true, disconnectedAllocCount: 2, disconnectedAllocStatus: structs.AllocClientStatusFailed, serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: true, expected: &resultExpectation{ 
stop: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { Stop: 2, Ignore: 4, }, }, }, }, { name: "reschedule-original-failed-if-not-replaced", allocCount: 4, replace: false, disconnectedAllocCount: 2, disconnectedAllocStatus: structs.AllocClientStatusFailed, serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: true, expected: &resultExpectation{ stop: 2, place: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { Ignore: 2, Place: 2, Stop: 2, }, }, }, }, { name: "ignore-reconnect-completed", allocCount: 2, replace: false, disconnectedAllocCount: 2, disconnectedAllocStatus: structs.AllocClientStatusComplete, serverDesiredStatus: structs.AllocDesiredStatusRun, isBatch: true, expected: &resultExpectation{ desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { Ignore: 2, }, }, }, }, { name: "stop-original-alloc-with-old-job-version", allocCount: 5, replace: true, disconnectedAllocCount: 2, disconnectedAllocStatus: structs.AllocClientStatusRunning, serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: true, jobVersionIncrement: 1, expected: &resultExpectation{ stop: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { Ignore: 5, Stop: 2, }, }, }, }, { name: "stop-original-alloc-with-old-job-version-reconnect-eval", allocCount: 5, replace: true, disconnectedAllocCount: 2, disconnectedAllocStatus: structs.AllocClientStatusRunning, serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: true, jobVersionIncrement: 1, expected: &resultExpectation{ stop: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { Stop: 2, Ignore: 5, }, }, }, }, { name: "stop-original-alloc-with-old-job-version-and-failed-replacements", allocCount: 5, replace: true, disconnectedAllocCount: 2, disconnectedAllocStatus: structs.AllocClientStatusRunning, serverDesiredStatus: structs.AllocDesiredStatusRun, failReplacement: true, shouldStopOnDisconnectedNode: true, jobVersionIncrement: 1, expected: &resultExpectation{ stop: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { Stop: 2, Ignore: 5, }, }, }, }, { name: "stop-original-pending-alloc-for-disconnected-node", allocCount: 2, replace: true, disconnectedAllocCount: 1, disconnectedAllocStatus: structs.AllocClientStatusPending, serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: true, nodeStatusDisconnected: true, expected: &resultExpectation{ stop: 1, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { Stop: 1, Ignore: 2, }, }, }, }, { name: "stop-expired-allocs", allocCount: 5, replace: true, disconnectedAllocCount: 2, disconnectedAllocStatus: structs.AllocClientStatusUnknown, serverDesiredStatus: structs.AllocDesiredStatusRun, shouldStopOnDisconnectedNode: true, nodeStatusDisconnected: true, maxDisconnect: helper.TimeToPtr(2 * time.Second), expected: &resultExpectation{ stop: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { Stop: 2, Ignore: 5, }, }, }, }, { name: "replace-allocs-on-disconnected-node", allocCount: 5, replace: false, disconnectedAllocCount: 2, disconnectedAllocStatus: structs.AllocClientStatusRunning, serverDesiredStatus: structs.AllocDesiredStatusRun, nodeStatusDisconnected: true, expected: &resultExpectation{ place: 2, disconnectUpdates: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { Place: 2, Ignore: 3, }, }, }, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { require.NotEqual(t, 0, tc.allocCount, 
"invalid test case: alloc count must be greater than zero") testNode := mock.Node() if tc.nodeStatusDisconnected == true { testNode.Status = structs.NodeStatusDisconnected } // Create resumable allocs job, allocs := buildResumableAllocations(tc.allocCount, structs.AllocClientStatusRunning, structs.AllocDesiredStatusRun, 2) if tc.isBatch { job.Type = structs.JobTypeBatch } // Set alloc state disconnectedAllocCount := tc.disconnectedAllocCount for _, alloc := range allocs { alloc.DesiredStatus = tc.serverDesiredStatus if tc.maxDisconnect != nil { alloc.Job.TaskGroups[0].MaxClientDisconnect = tc.maxDisconnect } if disconnectedAllocCount > 0 { alloc.ClientStatus = tc.disconnectedAllocStatus // Set the node id on all the disconnected allocs to the node under test. alloc.NodeID = testNode.ID alloc.NodeName = "disconnected" alloc.AllocStates = []*structs.AllocState{{ Field: structs.AllocStateFieldClientStatus, Value: structs.AllocClientStatusUnknown, Time: time.Now(), }} event := structs.NewTaskEvent(structs.TaskClientReconnected) event.Time = time.Now().UnixNano() alloc.TaskStates = map[string]*structs.TaskState{ alloc.Job.TaskGroups[0].Tasks[0].Name: { Events: []*structs.TaskEvent{event}, }, } disconnectedAllocCount-- } } // Place the allocs on another node. if tc.replace { replacements := make([]*structs.Allocation, 0) for _, alloc := range allocs { if alloc.NodeID != testNode.ID { continue } replacement := alloc.Copy() replacement.ID = uuid.Generate() replacement.NodeID = uuid.Generate() replacement.ClientStatus = structs.AllocClientStatusRunning replacement.PreviousAllocation = alloc.ID replacement.AllocStates = nil replacement.TaskStates = nil alloc.NextAllocation = replacement.ID if tc.jobVersionIncrement != 0 { replacement.Job.Version = replacement.Job.Version + tc.jobVersionIncrement } if tc.nodeScoreIncrement != 0 { replacement.Metrics.ScoreMetaData[0].NormScore = replacement.Metrics.ScoreMetaData[0].NormScore + tc.nodeScoreIncrement } replacements = append(replacements, replacement) // If we want to test intermediate replacement failures simulate that. if tc.failReplacement { replacement.ClientStatus = structs.AllocClientStatusFailed nextReplacement := replacement.Copy() nextReplacement.ID = uuid.Generate() nextReplacement.ClientStatus = structs.AllocClientStatusRunning nextReplacement.PreviousAllocation = replacement.ID replacement.NextAllocation = nextReplacement.ID replacements = append(replacements, nextReplacement) } } allocs = append(allocs, replacements...) } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, tc.isBatch, job.ID, job, nil, allocs, map[string]*structs.Node{testNode.ID: testNode}, "", 50, true) reconciler.now = time.Now() if tc.maxDisconnect != nil { reconciler.now = time.Now().Add(*tc.maxDisconnect * 20) } results := reconciler.Compute() assertResults(t, results, tc.expected) for _, stopResult := range results.stop { if tc.shouldStopOnDisconnectedNode { require.Equal(t, testNode.ID, stopResult.alloc.NodeID) } else { require.NotEqual(t, testNode.ID, stopResult.alloc.NodeID) } require.Equal(t, job.Version, stopResult.alloc.Job.Version) } }) } } // Tests that a client disconnect while a canary is in progress generates the result. 
func TestReconciler_Client_Disconnect_Canaries(t *testing.T) { type testCase struct { name string nodes []string deploymentState *structs.DeploymentState deployedAllocs map[*structs.Node][]*structs.Allocation canaryAllocs map[*structs.Node][]*structs.Allocation expectedResult *resultExpectation } running := structs.AllocClientStatusRunning complete := structs.AllocClientStatusComplete unknown := structs.AllocClientStatusUnknown pending := structs.AllocClientStatusPending run := structs.AllocDesiredStatusRun stop := structs.AllocDesiredStatusStop maxClientDisconnect := 10 * time.Minute readyNode := mock.Node() readyNode.Name = "ready-" + readyNode.ID readyNode.Status = structs.NodeStatusReady disconnectedNode := mock.Node() disconnectedNode.Name = "disconnected-" + disconnectedNode.ID disconnectedNode.Status = structs.NodeStatusDisconnected // Job with allocations and max_client_disconnect job := mock.Job() updatedJob := job.Copy() updatedJob.Version = updatedJob.Version + 1 testCases := []testCase{ { name: "3-placed-1-disconnect", deploymentState: &structs.DeploymentState{ AutoRevert: false, AutoPromote: false, Promoted: false, ProgressDeadline: 5 * time.Minute, RequireProgressBy: time.Now().Add(5 * time.Minute), PlacedCanaries: []string{}, DesiredCanaries: 1, DesiredTotal: 6, PlacedAllocs: 3, HealthyAllocs: 2, UnhealthyAllocs: 0, }, deployedAllocs: map[*structs.Node][]*structs.Allocation{ readyNode: { // filtered as terminal {Name: "my-job.web[0]", ClientStatus: complete, DesiredStatus: stop}, // Ignored {Name: "my-job.web[2]", ClientStatus: running, DesiredStatus: stop}, // destructive, but discarded because canarying {Name: "my-job.web[4]", ClientStatus: running, DesiredStatus: run}, }, disconnectedNode: { // filtered as terminal {Name: "my-job.web[1]", ClientStatus: complete, DesiredStatus: stop}, // Gets a placement, and a disconnect update {Name: "my-job.web[3]", ClientStatus: running, DesiredStatus: run}, // Gets a placement, and a disconnect update {Name: "my-job.web[5]", ClientStatus: running, DesiredStatus: run}, }, }, canaryAllocs: map[*structs.Node][]*structs.Allocation{ readyNode: { // Ignored {Name: "my-job.web[0]", ClientStatus: running, DesiredStatus: run}, // Ignored {Name: "my-job.web[2]", ClientStatus: pending, DesiredStatus: run}, }, disconnectedNode: { // Gets a placement, and a disconnect update {Name: "my-job.web[1]", ClientStatus: running, DesiredStatus: run}, }, }, expectedResult: &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 3, destructive: 0, stop: 0, inplace: 0, attributeUpdates: 0, disconnectUpdates: 3, reconnectUpdates: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ updatedJob.TaskGroups[0].Name: { Place: 3, Canary: 0, Ignore: 3, }, }, }, }, { name: "ignore-unknown", deploymentState: &structs.DeploymentState{ AutoRevert: false, AutoPromote: false, Promoted: false, ProgressDeadline: 5 * time.Minute, RequireProgressBy: time.Now().Add(5 * time.Minute), PlacedCanaries: []string{}, DesiredCanaries: 1, DesiredTotal: 6, PlacedAllocs: 3, HealthyAllocs: 2, UnhealthyAllocs: 0, }, deployedAllocs: map[*structs.Node][]*structs.Allocation{ readyNode: { // filtered as terminal {Name: "my-job.web[0]", ClientStatus: complete, DesiredStatus: stop}, // Ignored {Name: "my-job.web[2]", ClientStatus: running, DesiredStatus: stop}, // destructive, but discarded because canarying {Name: "my-job.web[4]", ClientStatus: running, DesiredStatus: run}, }, disconnectedNode: { // filtered as terminal {Name: "my-job.web[1]", ClientStatus: complete, 
DesiredStatus: stop}, // Gets a placement, and a disconnect update {Name: "my-job.web[3]", ClientStatus: running, DesiredStatus: run}, // Gets a placement, and a disconnect update {Name: "my-job.web[5]", ClientStatus: running, DesiredStatus: run}, }, }, canaryAllocs: map[*structs.Node][]*structs.Allocation{ readyNode: { // Ignored {Name: "my-job.web[0]", ClientStatus: running, DesiredStatus: run}, // Ignored {Name: "my-job.web[2]", ClientStatus: pending, DesiredStatus: run}, }, disconnectedNode: { // Ignored {Name: "my-job.web[1]", ClientStatus: unknown, DesiredStatus: run}, }, }, expectedResult: &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 2, destructive: 0, stop: 0, inplace: 0, attributeUpdates: 0, disconnectUpdates: 2, reconnectUpdates: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ updatedJob.TaskGroups[0].Name: { Place: 2, Canary: 0, Ignore: 4, }, }, }, }, { name: "4-placed-2-pending-lost", deploymentState: &structs.DeploymentState{ AutoRevert: false, AutoPromote: false, Promoted: false, ProgressDeadline: 5 * time.Minute, RequireProgressBy: time.Now().Add(5 * time.Minute), PlacedCanaries: []string{}, DesiredCanaries: 2, DesiredTotal: 6, PlacedAllocs: 4, HealthyAllocs: 2, UnhealthyAllocs: 0, }, deployedAllocs: map[*structs.Node][]*structs.Allocation{ readyNode: { // filtered as terminal {Name: "my-job.web[0]", ClientStatus: complete, DesiredStatus: stop}, // filtered as terminal {Name: "my-job.web[2]", ClientStatus: complete, DesiredStatus: stop}, // destructive, but discarded because canarying {Name: "my-job.web[4]", ClientStatus: running, DesiredStatus: run}, }, disconnectedNode: { // filtered as terminal {Name: "my-job.web[1]", ClientStatus: complete, DesiredStatus: stop}, // Gets a placement, and a disconnect update {Name: "my-job.web[3]", ClientStatus: running, DesiredStatus: run}, // Gets a placement, and a disconnect update {Name: "my-job.web[5]", ClientStatus: running, DesiredStatus: run}, }, }, canaryAllocs: map[*structs.Node][]*structs.Allocation{ readyNode: { // Ignored {Name: "my-job.web[0]", ClientStatus: running, DesiredStatus: run}, // Ignored {Name: "my-job.web[2]", ClientStatus: running, DesiredStatus: run}, }, disconnectedNode: { // Stop/Lost because pending {Name: "my-job.web[1]", ClientStatus: pending, DesiredStatus: run}, // Stop/Lost because pending {Name: "my-job.web[3]", ClientStatus: pending, DesiredStatus: run}, }, }, expectedResult: &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, place: 2, destructive: 0, stop: 2, inplace: 0, attributeUpdates: 0, disconnectUpdates: 2, reconnectUpdates: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ updatedJob.TaskGroups[0].Name: { Place: 2, Canary: 0, Ignore: 3, // The 2 stops in this test are transient failures, but // the deployment can still progress. We don't include // them in the stop count since DesiredTGUpdates is used // to report deployment progress or final deployment state. Stop: 0, }, }, }, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { // Set the count dynamically to the number from the original deployment. 
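// Both the original and updated jobs are given identical counts, disconnect
// timeouts, and update strategies below, so the only difference the
// reconciler should observe between them is the job Version.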
job.TaskGroups[0].Count = len(tc.deployedAllocs[readyNode]) + len(tc.deployedAllocs[disconnectedNode]) job.TaskGroups[0].MaxClientDisconnect = &maxClientDisconnect job.TaskGroups[0].Update = &structs.UpdateStrategy{ MaxParallel: 1, Canary: tc.deploymentState.DesiredCanaries, MinHealthyTime: 3 * time.Second, HealthyDeadline: 20 * time.Second, AutoRevert: true, AutoPromote: true, } updatedJob.TaskGroups[0].Count = len(tc.deployedAllocs[readyNode]) + len(tc.deployedAllocs[disconnectedNode]) updatedJob.TaskGroups[0].MaxClientDisconnect = &maxClientDisconnect updatedJob.TaskGroups[0].Update = &structs.UpdateStrategy{ MaxParallel: 1, Canary: tc.deploymentState.DesiredCanaries, MinHealthyTime: 3 * time.Second, HealthyDeadline: 20 * time.Second, AutoRevert: true, AutoPromote: true, } // Populate Alloc IDS, Node IDs, Job on deployed allocs allocsConfigured := 0 for node, allocs := range tc.deployedAllocs { for _, alloc := range allocs { alloc.ID = uuid.Generate() alloc.NodeID = node.ID alloc.NodeName = node.Name alloc.JobID = job.ID alloc.Job = job alloc.TaskGroup = job.TaskGroups[0].Name allocsConfigured++ } } require.Equal(t, tc.deploymentState.DesiredTotal, allocsConfigured, "invalid alloc configuration: expect %d got %d", tc.deploymentState.DesiredTotal, allocsConfigured) // Populate Alloc IDS, Node IDs, Job on canaries canariesConfigured := 0 handled := make(map[string]allocUpdateType) for node, allocs := range tc.canaryAllocs { for _, alloc := range allocs { alloc.ID = uuid.Generate() alloc.NodeID = node.ID alloc.NodeName = node.Name alloc.JobID = updatedJob.ID alloc.Job = updatedJob alloc.TaskGroup = updatedJob.TaskGroups[0].Name alloc.DeploymentStatus = &structs.AllocDeploymentStatus{ Canary: true, } if alloc.ClientStatus == structs.AllocClientStatusRunning { alloc.DeploymentStatus.Healthy = helper.BoolToPtr(true) } tc.deploymentState.PlacedCanaries = append(tc.deploymentState.PlacedCanaries, alloc.ID) handled[alloc.ID] = allocUpdateFnIgnore canariesConfigured++ } } // Validate tc.canaryAllocs against tc.deploymentState require.Equal(t, tc.deploymentState.PlacedAllocs, canariesConfigured, "invalid canary configuration: expect %d got %d", tc.deploymentState.PlacedAllocs, canariesConfigured) deployment := structs.NewDeployment(updatedJob, 50) deployment.TaskGroups[updatedJob.TaskGroups[0].Name] = tc.deploymentState // Build a map of tainted nodes that contains the last canary tainted := make(map[string]*structs.Node, 1) tainted[disconnectedNode.ID] = disconnectedNode allocs := make([]*structs.Allocation, 0) allocs = append(allocs, tc.deployedAllocs[readyNode]...) allocs = append(allocs, tc.deployedAllocs[disconnectedNode]...) allocs = append(allocs, tc.canaryAllocs[readyNode]...) allocs = append(allocs, tc.canaryAllocs[disconnectedNode]...) mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, updatedJob.ID, updatedJob, deployment, allocs, tainted, "", 50, true) result := reconciler.Compute() // Assert the correct results assertResults(t, result, tc.expectedResult) // Validate that placements are either for placed canaries for the // updated job, or for disconnected allocs for the original job // and that they have a disconnect update. 
for _, placeResult := range result.place { found := false require.NotNil(t, placeResult.previousAlloc) for _, deployed := range tc.deployedAllocs[disconnectedNode] { if deployed.ID == placeResult.previousAlloc.ID { found = true require.Equal(t, job.Version, placeResult.previousAlloc.Job.Version) require.Equal(t, disconnectedNode.ID, placeResult.previousAlloc.NodeID) _, exists := result.disconnectUpdates[placeResult.previousAlloc.ID] require.True(t, exists) break } } for _, canary := range tc.canaryAllocs[disconnectedNode] { if canary.ID == placeResult.previousAlloc.ID { found = true require.Equal(t, updatedJob.Version, placeResult.previousAlloc.Job.Version) require.Equal(t, disconnectedNode.ID, placeResult.previousAlloc.NodeID) _, exists := result.disconnectUpdates[placeResult.previousAlloc.ID] require.True(t, exists) break } } require.True(t, found) } // Validate that stops are for pending disconnects for _, stopResult := range result.stop { require.Equal(t, pending, stopResult.alloc.ClientStatus) } }) } }
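// A minimal sketch of exercising one of the disconnect scenarios above in
// isolation; the package path is an assumption about the surrounding
// repository layout, not something this file defines:
//
//	go test ./scheduler -run 'TestReconciler_Disconnected_Client/stop-expired-allocs' -v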