// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package scheduler

import (
	"fmt"
	"reflect"
	"regexp"
	"strconv"
	"testing"
	"time"

	"github.com/hashicorp/go-set"
	"github.com/hashicorp/nomad/ci"
	"github.com/hashicorp/nomad/helper/pointer"
	"github.com/hashicorp/nomad/helper/testlog"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/kr/pretty"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)
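
// canaryUpdate and noCanaryUpdate are reusable update strategies shared by the
// reconciler tests: canaryUpdate requires two canaries per task group, while
// noCanaryUpdate performs a plain rolling update.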
var (
	canaryUpdate = &structs.UpdateStrategy{
		Canary:          2,
		MaxParallel:     2,
		HealthCheck:     structs.UpdateStrategyHealthCheck_Checks,
		MinHealthyTime:  10 * time.Second,
		HealthyDeadline: 10 * time.Minute,
		Stagger:         31 * time.Second,
	}

	noCanaryUpdate = &structs.UpdateStrategy{
		MaxParallel:     4,
		HealthCheck:     structs.UpdateStrategyHealthCheck_Checks,
		MinHealthyTime:  10 * time.Second,
		HealthyDeadline: 10 * time.Minute,
		Stagger:         31 * time.Second,
	}
)
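
// allocUpdateFnIgnore is an allocUpdateType that marks every allocation as
// needing no update at all (ignore).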
func allocUpdateFnIgnore(*structs.Allocation, *structs.Job, *structs.TaskGroup) (bool, bool, *structs.Allocation) {
	return true, false, nil
}
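
// allocUpdateFnDestructive is an allocUpdateType that marks every allocation
// as needing a destructive (stop and replace) update.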
func allocUpdateFnDestructive(*structs.Allocation, *structs.Job, *structs.TaskGroup) (bool, bool, *structs.Allocation) {
	return false, true, nil
}
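
// allocUpdateFnInplace is an allocUpdateType that returns an in-place updated
// copy of the existing allocation, taking the new task group's resources but
// carrying over the old networks.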
func allocUpdateFnInplace(existing *structs.Allocation, _ *structs.Job, newTG *structs.TaskGroup) (bool, bool, *structs.Allocation) {
	// Create a shallow copy
	newAlloc := existing.CopySkipJob()
	newAlloc.AllocatedResources = &structs.AllocatedResources{
		Tasks: map[string]*structs.AllocatedTaskResources{},
		Shared: structs.AllocatedSharedResources{
			DiskMB: int64(newTG.EphemeralDisk.SizeMB),
		},
	}

	// Use the new task resources but keep the network from the old
	for _, task := range newTG.Tasks {
		networks := existing.AllocatedResources.Tasks[task.Name].Copy().Networks
		newAlloc.AllocatedResources.Tasks[task.Name] = &structs.AllocatedTaskResources{
			Cpu: structs.AllocatedCpuResources{
				CpuShares: int64(task.Resources.CPU),
			},
			Memory: structs.AllocatedMemoryResources{
				MemoryMB: int64(task.Resources.MemoryMB),
			},
			Networks: networks,
		}
	}

	return false, false, newAlloc
}
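
// allocUpdateFnMock dispatches to a per-allocation update function when the
// allocation's ID is present in handled, and falls back to unhandled otherwise.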
func allocUpdateFnMock(handled map[string]allocUpdateType, unhandled allocUpdateType) allocUpdateType {
	return func(existing *structs.Allocation, newJob *structs.Job, newTG *structs.TaskGroup) (bool, bool, *structs.Allocation) {
		if fn, ok := handled[existing.ID]; ok {
			return fn(existing, newJob, newTG)
		}

		return unhandled(existing, newJob, newTG)
	}
}

var (
	// allocationIndexRegex is a regular expression to find the allocation index.
	allocationIndexRegex = regexp.MustCompile(".+\\[(\\d+)\\]$")
)

// allocNameToIndex returns the index of the allocation.
func allocNameToIndex(name string) uint {
	matches := allocationIndexRegex.FindStringSubmatch(name)
	if len(matches) != 2 {
		return 0
	}

	index, err := strconv.Atoi(matches[1])
	if err != nil {
		return 0
	}

	return uint(index)
}
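
// assertNamesHaveIndexes asserts that the given allocation names carry exactly
// the expected multiset of allocation indexes.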
func assertNamesHaveIndexes(t *testing.T, indexes []int, names []string) {
	t.Helper()
	m := make(map[uint]int)
	for _, i := range indexes {
		m[uint(i)] += 1
	}

	for _, n := range names {
		index := allocNameToIndex(n)
		val, contained := m[index]
		if !contained {
			t.Fatalf("Unexpected index %d from name %s\nAll names: %v", index, n, names)
		}

		val--
		if val < 0 {
			t.Fatalf("Index %d repeated too many times\nAll names: %v", index, names)
		}
		m[index] = val
	}

	for k, remainder := range m {
		if remainder != 0 {
			t.Fatalf("Index %d has %d expected uses remaining\nAll names: %v", k, remainder, names)
		}
	}
}
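
// assertNoCanariesStopped fails the test if any allocation being stopped is a
// placed canary of the deployment.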
func assertNoCanariesStopped(t *testing.T, d *structs.Deployment, stop []allocStopResult) {
	t.Helper()
	canaryIndex := make(map[string]struct{})
	for _, state := range d.TaskGroups {
		for _, c := range state.PlacedCanaries {
			canaryIndex[c] = struct{}{}
		}
	}

	for _, s := range stop {
		if _, ok := canaryIndex[s.alloc.ID]; ok {
			t.Fatalf("Stopping canary alloc %q %q", s.alloc.ID, s.alloc.Name)
		}
	}
}
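
// assertPlaceResultsHavePreviousAllocs asserts that exactly numPrevious of the
// placements are linked to a previous allocation with a matching name.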
func assertPlaceResultsHavePreviousAllocs(t *testing.T, numPrevious int, place []allocPlaceResult) {
	t.Helper()
	names := make(map[string]struct{}, numPrevious)

	found := 0
	for _, p := range place {
		if _, ok := names[p.name]; ok {
			t.Fatalf("Name %q already placed", p.name)
		}
		names[p.name] = struct{}{}

		if p.previousAlloc == nil {
			continue
		}

		if act := p.previousAlloc.Name; p.name != act {
			t.Fatalf("Name mismatch on previous alloc; got %q; want %q", act, p.name)
		}
		found++
	}
	if numPrevious != found {
		t.Fatalf("wanted %d; got %d placements with previous allocs", numPrevious, found)
	}
}
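
// assertPlacementsAreRescheduled asserts that exactly numRescheduled of the
// placements that replace a previous allocation are marked as reschedules.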
func assertPlacementsAreRescheduled(t *testing.T, numRescheduled int, place []allocPlaceResult) {
	t.Helper()
	names := make(map[string]struct{}, numRescheduled)

	found := 0
	for _, p := range place {
		if _, ok := names[p.name]; ok {
			t.Fatalf("Name %q already placed", p.name)
		}
		names[p.name] = struct{}{}

		if p.previousAlloc == nil {
			continue
		}
		if p.reschedule {
			found++
		}
	}
	if numRescheduled != found {
		t.Fatalf("wanted %d; got %d placements that are rescheduled", numRescheduled, found)
	}
}
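
// intRange expands pairs of inclusive bounds into a flat slice of ints, e.g.
// intRange(0, 2, 5, 6) yields [0 1 2 5 6]. It returns nil when given an odd
// number of arguments.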
func intRange(pairs ...int) []int {
	if len(pairs)%2 != 0 {
		return nil
	}

	var r []int
	for i := 0; i < len(pairs); i += 2 {
		for j := pairs[i]; j <= pairs[i+1]; j++ {
			r = append(r, j)
		}
	}
	return r
}
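
// The *ToNames helpers below project reconciler results back to allocation
// names so tests can assert on name/index sets.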
func placeResultsToNames(place []allocPlaceResult) []string {
	names := make([]string, 0, len(place))
	for _, p := range place {
		names = append(names, p.name)
	}
	return names
}

func destructiveResultsToNames(destructive []allocDestructiveResult) []string {
	names := make([]string, 0, len(destructive))
	for _, d := range destructive {
		names = append(names, d.placeName)
	}
	return names
}

func stopResultsToNames(stop []allocStopResult) []string {
	names := make([]string, 0, len(stop))
	for _, s := range stop {
		names = append(names, s.alloc.Name)
	}
	return names
}

func attributeUpdatesToNames(attributeUpdates map[string]*structs.Allocation) []string {
	names := make([]string, 0, len(attributeUpdates))
	for _, a := range attributeUpdates {
		names = append(names, a.Name)
	}
	return names
}

func allocsToNames(allocs []*structs.Allocation) []string {
	names := make([]string, 0, len(allocs))
	for _, a := range allocs {
		names = append(names, a.Name)
	}
	return names
}
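
// resultExpectation captures the counts and desired-update annotations a test
// expects from a single reconciler run.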
type resultExpectation struct {
	createDeployment  *structs.Deployment
	deploymentUpdates []*structs.DeploymentStatusUpdate
	place             int
	destructive       int
	inplace           int
	attributeUpdates  int
	disconnectUpdates int
	reconnectUpdates  int
	desiredTGUpdates  map[string]*structs.DesiredUpdates
	stop              int
}
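
// assertResults compares a reconcileResults against a resultExpectation and
// reports any mismatches.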
func assertResults(t *testing.T, r *reconcileResults, exp *resultExpectation) {
	t.Helper()
	assertion := assert.New(t)

	if exp.createDeployment != nil && r.deployment == nil {
		t.Errorf("Expected a created deployment, got none")
	} else if exp.createDeployment == nil && r.deployment != nil {
		t.Errorf("Expected no created deployment; got %#v", r.deployment)
	} else if exp.createDeployment != nil && r.deployment != nil {
		// Clear the deployment ID
		r.deployment.ID, exp.createDeployment.ID = "", ""
		if !reflect.DeepEqual(r.deployment, exp.createDeployment) {
			t.Errorf("Unexpected createdDeployment; got\n %#v\nwant\n%#v\nDiff: %v",
				r.deployment, exp.createDeployment, pretty.Diff(r.deployment, exp.createDeployment))
		}
	}

	assertion.EqualValues(exp.deploymentUpdates, r.deploymentUpdates, "Expected Deployment Updates")
	assertion.Len(r.place, exp.place, "Expected Placements")
	assertion.Len(r.destructiveUpdate, exp.destructive, "Expected Destructive")
	assertion.Len(r.inplaceUpdate, exp.inplace, "Expected Inplace Updates")
	assertion.Len(r.attributeUpdates, exp.attributeUpdates, "Expected Attribute Updates")
	assertion.Len(r.reconnectUpdates, exp.reconnectUpdates, "Expected Reconnect Updates")
	assertion.Len(r.disconnectUpdates, exp.disconnectUpdates, "Expected Disconnect Updates")
	assertion.Len(r.stop, exp.stop, "Expected Stops")
	assertion.EqualValues(exp.desiredTGUpdates, r.desiredTGUpdates, "Expected Desired TG Update Annotations")
}
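
// buildAllocations creates count allocations for job with the given client and
// desired statuses, each on its own node with a single node score entry.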
func buildAllocations(job *structs.Job, count int, clientStatus, desiredStatus string, nodeScore float64) []*structs.Allocation {
	allocs := make([]*structs.Allocation, 0)

	for i := 0; i < count; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		alloc.ClientStatus = clientStatus
		alloc.DesiredStatus = desiredStatus

		alloc.Metrics = &structs.AllocMetric{
			ScoreMetaData: []*structs.NodeScoreMeta{
				{
					NodeID:    alloc.NodeID,
					NormScore: nodeScore,
					Scores: map[string]float64{
						alloc.NodeID: nodeScore,
					},
				},
			},
		}

		allocs = append(allocs, alloc)
	}

	return allocs
}
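
// buildDisconnectedNodes returns a tainted-node map that marks the nodes of
// the first count allocations as disconnected.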
func buildDisconnectedNodes(allocs []*structs.Allocation, count int) map[string]*structs.Node {
	tainted := make(map[string]*structs.Node, count)
	for i := 0; i < count; i++ {
		n := mock.Node()
		n.ID = allocs[i].NodeID
		n.Status = structs.NodeStatusDisconnected
		tainted[n.ID] = n
	}
	return tainted
}
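
// buildResumableAllocations builds a job with max_client_disconnect set and a
// matching set of allocations, for exercising disconnect/reconnect handling.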
func buildResumableAllocations(count int, clientStatus, desiredStatus string, nodeScore float64) (*structs.Job, []*structs.Allocation) {
	job := mock.Job()
	job.TaskGroups[0].MaxClientDisconnect = pointer.Of(5 * time.Minute)
	job.TaskGroups[0].Count = count

	return job, buildAllocations(job, count, clientStatus, desiredStatus, nodeScore)
}

// Tests the reconciler properly handles placements for a job that has no
// existing allocations
func TestReconciler_Place_NoExisting(t *testing.T) {
	ci.Parallel(t)

	job := mock.Job()
	reconciler := NewAllocReconciler(
		testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
		nil, nil, nil, "", job.Priority, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             10,
		inplace:           0,
		stop:              0,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Place: 10,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(0, 9), placeResultsToNames(r.place))
}

// Tests the reconciler properly handles placements for a job that has some
// existing allocations
func TestReconciler_Place_Existing(t *testing.T) {
	ci.Parallel(t)

	job := mock.Job()

	// Create 5 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 5; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
		nil, allocs, nil, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             5,
		inplace:           0,
		stop:              0,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Place:  5,
				Ignore: 5,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(5, 9), placeResultsToNames(r.place))
}

// Tests the reconciler properly handles stopping allocations for a job that has
// scaled down
func TestReconciler_ScaleDown_Partial(t *testing.T) {
	ci.Parallel(t)

	// Has desired 10
	job := mock.Job()

	// Create 20 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 20; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
		nil, allocs, nil, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             0,
		inplace:           0,
		stop:              10,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Ignore: 10,
				Stop:   10,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(10, 19), stopResultsToNames(r.stop))
}

// Tests the reconciler properly handles stopping allocations for a job that has
// scaled down to zero desired
func TestReconciler_ScaleDown_Zero(t *testing.T) {
	ci.Parallel(t)

	// Set desired 0
	job := mock.Job()
	job.TaskGroups[0].Count = 0

	// Create 20 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 20; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
		nil, allocs, nil, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             0,
		inplace:           0,
		stop:              20,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Stop: 20,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(0, 19), stopResultsToNames(r.stop))
}

// Tests the reconciler properly handles stopping allocations for a job that has
// scaled down to zero desired where allocs have duplicate names
func TestReconciler_ScaleDown_Zero_DuplicateNames(t *testing.T) {
	ci.Parallel(t)

	// Set desired 0
	job := mock.Job()
	job.TaskGroups[0].Count = 0

	// Create 20 existing allocations
	var allocs []*structs.Allocation
	var expectedStopped []int
	for i := 0; i < 20; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i%2))
		allocs = append(allocs, alloc)
		expectedStopped = append(expectedStopped, i%2)
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
		nil, allocs, nil, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             0,
		inplace:           0,
		stop:              20,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Stop: 20,
			},
		},
	})

	assertNamesHaveIndexes(t, expectedStopped, stopResultsToNames(r.stop))
}

// Tests the reconciler properly handles inplace upgrading allocations
func TestReconciler_Inplace(t *testing.T) {
	ci.Parallel(t)

	job := mock.Job()

	// Create 10 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 10; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job,
		nil, allocs, nil, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             0,
		inplace:           10,
		stop:              0,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				InPlaceUpdate: 10,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(0, 9), allocsToNames(r.inplaceUpdate))
}

// Tests the reconciler properly handles inplace upgrading allocations while
// scaling up
func TestReconciler_Inplace_ScaleUp(t *testing.T) {
	ci.Parallel(t)

	// Set desired 15
	job := mock.Job()
	job.TaskGroups[0].Count = 15

	// Create 10 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 10; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job,
		nil, allocs, nil, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             5,
		inplace:           10,
		stop:              0,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Place:         5,
				InPlaceUpdate: 10,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(0, 9), allocsToNames(r.inplaceUpdate))
	assertNamesHaveIndexes(t, intRange(10, 14), placeResultsToNames(r.place))
}

// Tests the reconciler properly handles inplace upgrading allocations while
// scaling down
func TestReconciler_Inplace_ScaleDown(t *testing.T) {
	ci.Parallel(t)

	// Set desired 5
	job := mock.Job()
	job.TaskGroups[0].Count = 5

	// Create 10 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 10; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job,
		nil, allocs, nil, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             0,
		inplace:           5,
		stop:              5,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Stop:          5,
				InPlaceUpdate: 5,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(0, 4), allocsToNames(r.inplaceUpdate))
	assertNamesHaveIndexes(t, intRange(5, 9), stopResultsToNames(r.stop))
}

// TestReconciler_Inplace_Rollback tests that a rollback to a previous version
// generates the expected placements for any already-running allocations of
// that version.
func TestReconciler_Inplace_Rollback(t *testing.T) {
	ci.Parallel(t)

	job := mock.Job()
	job.TaskGroups[0].Count = 4
	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
		DelayFunction: "exponential",
		Interval:      time.Second * 30,
		Delay:         time.Hour * 1,
		Attempts:      3,
		Unlimited:     true,
	}

	// Create 3 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 3; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}
	// allocs[0] is an allocation from version 0
	allocs[0].ClientStatus = structs.AllocClientStatusRunning

	// allocs[1] and allocs[2] are failed allocations for version 1 with
	// different rescheduling states
	allocs[1].ClientStatus = structs.AllocClientStatusFailed
	allocs[1].TaskStates = map[string]*structs.TaskState{
		"web": {FinishedAt: time.Now().Add(-10 * time.Minute)}}
	allocs[2].ClientStatus = structs.AllocClientStatusFailed

	// The job is rolled back; we expect allocs[0] to be updated in-place
	allocUpdateFn := allocUpdateFnMock(map[string]allocUpdateType{
		allocs[0].ID: allocUpdateFnInplace,
	}, allocUpdateFnDestructive)

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFn,
		false, job.ID, job, nil, allocs, nil, uuid.Generate(), 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             2,
		inplace:           1,
		stop:              1,
		destructive:       1,
		attributeUpdates:  1,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Place:             2,
				Stop:              1,
				InPlaceUpdate:     1,
				DestructiveUpdate: 1,
			},
		},
	})

	assert.Len(t, r.desiredFollowupEvals, 1, "expected 1 follow-up eval")
	assertNamesHaveIndexes(t, intRange(0, 0), allocsToNames(r.inplaceUpdate))
	assertNamesHaveIndexes(t, intRange(2, 2), stopResultsToNames(r.stop))
	assertNamesHaveIndexes(t, intRange(2, 3), placeResultsToNames(r.place))
}

// Tests the reconciler properly handles destructive upgrading allocations
func TestReconciler_Destructive(t *testing.T) {
	ci.Parallel(t)

	job := mock.Job()

	// Create 10 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 10; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job,
		nil, allocs, nil, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		destructive:       10,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				DestructiveUpdate: 10,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(0, 9), destructiveResultsToNames(r.destructiveUpdate))
}

// Tests the reconciler properly handles destructive upgrading allocations when max_parallel=0
func TestReconciler_DestructiveMaxParallel(t *testing.T) {
	ci.Parallel(t)

	job := mock.MaxParallelJob()

	// Create 10 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 10; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job,
		nil, allocs, nil, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		destructive:       10,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				DestructiveUpdate: 10,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(0, 9), destructiveResultsToNames(r.destructiveUpdate))
}

// Tests the reconciler properly handles destructive upgrading allocations while
// scaling up
func TestReconciler_Destructive_ScaleUp(t *testing.T) {
	ci.Parallel(t)

	// Set desired 15
	job := mock.Job()
	job.TaskGroups[0].Count = 15

	// Create 10 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 10; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job,
		nil, allocs, nil, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             5,
		destructive:       10,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Place:             5,
				DestructiveUpdate: 10,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(0, 9), destructiveResultsToNames(r.destructiveUpdate))
	assertNamesHaveIndexes(t, intRange(10, 14), placeResultsToNames(r.place))
}

// Tests the reconciler properly handles destructive upgrading allocations while
// scaling down
func TestReconciler_Destructive_ScaleDown(t *testing.T) {
	ci.Parallel(t)

	// Set desired 5
	job := mock.Job()
	job.TaskGroups[0].Count = 5

	// Create 10 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 10; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job,
		nil, allocs, nil, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		destructive:       5,
		stop:              5,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Stop:              5,
				DestructiveUpdate: 5,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(5, 9), stopResultsToNames(r.stop))
	assertNamesHaveIndexes(t, intRange(0, 4), destructiveResultsToNames(r.destructiveUpdate))
}

// Tests the reconciler properly handles lost nodes with allocations
func TestReconciler_LostNode(t *testing.T) {
	ci.Parallel(t)

	job := mock.Job()

	// Create 10 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 10; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	// Build a map of tainted nodes
	tainted := make(map[string]*structs.Node, 2)
	for i := 0; i < 2; i++ {
		n := mock.Node()
		n.ID = allocs[i].NodeID
		n.Status = structs.NodeStatusDown
		tainted[n.ID] = n
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
		nil, allocs, tainted, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             2,
		inplace:           0,
		stop:              2,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Place:  2,
				Stop:   2,
				Ignore: 8,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop))
	assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place))
}

// Tests the reconciler properly handles lost nodes with allocations while
// scaling up
func TestReconciler_LostNode_ScaleUp(t *testing.T) {
	ci.Parallel(t)

	// Set desired 15
	job := mock.Job()
	job.TaskGroups[0].Count = 15

	// Create 10 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 10; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	// Build a map of tainted nodes
	tainted := make(map[string]*structs.Node, 2)
	for i := 0; i < 2; i++ {
		n := mock.Node()
		n.ID = allocs[i].NodeID
		n.Status = structs.NodeStatusDown
		tainted[n.ID] = n
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
		nil, allocs, tainted, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             7,
		inplace:           0,
		stop:              2,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Place:  7,
				Stop:   2,
				Ignore: 8,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop))
	assertNamesHaveIndexes(t, intRange(0, 1, 10, 14), placeResultsToNames(r.place))
}

// Tests the reconciler properly handles lost nodes with allocations while
// scaling down
func TestReconciler_LostNode_ScaleDown(t *testing.T) {
	ci.Parallel(t)

	// Set desired 5
	job := mock.Job()
	job.TaskGroups[0].Count = 5

	// Create 10 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 10; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	// Build a map of tainted nodes
	tainted := make(map[string]*structs.Node, 2)
	for i := 0; i < 2; i++ {
		n := mock.Node()
		n.ID = allocs[i].NodeID
		n.Status = structs.NodeStatusDown
		tainted[n.ID] = n
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
		nil, allocs, tainted, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             0,
		inplace:           0,
		stop:              5,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Stop:   5,
				Ignore: 5,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(0, 1, 7, 9), stopResultsToNames(r.stop))
}

// Tests the reconciler properly handles draining nodes with allocations
func TestReconciler_DrainNode(t *testing.T) {
	ci.Parallel(t)

	job := mock.Job()

	// Create 10 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 10; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	// Build a map of tainted nodes
	tainted := make(map[string]*structs.Node, 2)
	for i := 0; i < 2; i++ {
		n := mock.DrainNode()
		n.ID = allocs[i].NodeID
		allocs[i].DesiredTransition.Migrate = pointer.Of(true)
		tainted[n.ID] = n
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
		nil, allocs, tainted, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             2,
		inplace:           0,
		stop:              2,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Migrate: 2,
				Ignore:  8,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop))
	assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place))
	assertPlaceResultsHavePreviousAllocs(t, 2, r.place)
	// These should not have the reschedule field set
	assertPlacementsAreRescheduled(t, 0, r.place)
}

// Tests the reconciler properly handles draining nodes with allocations while
// scaling up
func TestReconciler_DrainNode_ScaleUp(t *testing.T) {
	ci.Parallel(t)

	// Set desired 15
	job := mock.Job()
	job.TaskGroups[0].Count = 15

	// Create 10 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 10; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	// Build a map of tainted nodes
	tainted := make(map[string]*structs.Node, 2)
	for i := 0; i < 2; i++ {
		n := mock.DrainNode()
		n.ID = allocs[i].NodeID
		allocs[i].DesiredTransition.Migrate = pointer.Of(true)
		tainted[n.ID] = n
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
		nil, allocs, tainted, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             7,
		inplace:           0,
		stop:              2,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Place:   5,
				Migrate: 2,
				Ignore:  8,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop))
	assertNamesHaveIndexes(t, intRange(0, 1, 10, 14), placeResultsToNames(r.place))
	assertPlaceResultsHavePreviousAllocs(t, 2, r.place)
	// These should not have the reschedule field set
	assertPlacementsAreRescheduled(t, 0, r.place)
}

// Tests the reconciler properly handles draining nodes with allocations while
// scaling down
func TestReconciler_DrainNode_ScaleDown(t *testing.T) {
	ci.Parallel(t)

	// Set desired 8
	job := mock.Job()
	job.TaskGroups[0].Count = 8

	// Create 10 existing allocations
	var allocs []*structs.Allocation
	for i := 0; i < 10; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	// Build a map of tainted nodes
	tainted := make(map[string]*structs.Node, 3)
	for i := 0; i < 3; i++ {
		n := mock.DrainNode()
		n.ID = allocs[i].NodeID
		allocs[i].DesiredTransition.Migrate = pointer.Of(true)
		tainted[n.ID] = n
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
		nil, allocs, tainted, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             1,
		inplace:           0,
		stop:              3,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Migrate: 1,
				Stop:    2,
				Ignore:  7,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(0, 2), stopResultsToNames(r.stop))
	assertNamesHaveIndexes(t, intRange(0, 0), placeResultsToNames(r.place))
	assertPlaceResultsHavePreviousAllocs(t, 1, r.place)
	// These should not have the reschedule field set
	assertPlacementsAreRescheduled(t, 0, r.place)
}

// Tests the reconciler properly handles a task group being removed
func TestReconciler_RemovedTG(t *testing.T) {
	ci.Parallel(t)

	job := mock.Job()

	// Create 10 allocations for a tg that no longer exists
	var allocs []*structs.Allocation
	for i := 0; i < 10; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	oldName := job.TaskGroups[0].Name
	newName := "different"
	job.TaskGroups[0].Name = newName

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
		nil, allocs, nil, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             10,
		inplace:           0,
		stop:              10,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			oldName: {
				Stop: 10,
			},
			newName: {
				Place: 10,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(0, 9), stopResultsToNames(r.stop))
	assertNamesHaveIndexes(t, intRange(0, 9), placeResultsToNames(r.place))
}

// Tests the reconciler properly handles a job that is stopped or nil
func TestReconciler_JobStopped(t *testing.T) {
	ci.Parallel(t)

	job := mock.Job()
	job.Stop = true

	cases := []struct {
		name             string
		job              *structs.Job
		jobID, taskGroup string
	}{
		{
			name:      "stopped job",
			job:       job,
			jobID:     job.ID,
			taskGroup: job.TaskGroups[0].Name,
		},
		{
			name:      "nil job",
			job:       nil,
			jobID:     "foo",
			taskGroup: "bar",
		},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			// Create 10 allocations
			var allocs []*structs.Allocation
			for i := 0; i < 10; i++ {
				alloc := mock.Alloc()
				alloc.Job = c.job
				alloc.JobID = c.jobID
				alloc.NodeID = uuid.Generate()
				alloc.Name = structs.AllocName(c.jobID, c.taskGroup, uint(i))
				alloc.TaskGroup = c.taskGroup
				allocs = append(allocs, alloc)
			}

			reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, c.jobID, c.job,
				nil, allocs, nil, "", 50, true)
			r := reconciler.Compute()

			// Assert the correct results
			assertResults(t, r, &resultExpectation{
				createDeployment:  nil,
				deploymentUpdates: nil,
				place:             0,
				inplace:           0,
				stop:              10,
				desiredTGUpdates: map[string]*structs.DesiredUpdates{
					c.taskGroup: {
						Stop: 10,
					},
				},
			})

			assertNamesHaveIndexes(t, intRange(0, 9), stopResultsToNames(r.stop))
		})
	}
}

// Tests the reconciler doesn't update allocs in terminal state
// when job is stopped or nil
func TestReconciler_JobStopped_TerminalAllocs(t *testing.T) {
	ci.Parallel(t)

	job := mock.Job()
	job.Stop = true

	cases := []struct {
		name             string
		job              *structs.Job
		jobID, taskGroup string
	}{
		{
			name:      "stopped job",
			job:       job,
			jobID:     job.ID,
			taskGroup: job.TaskGroups[0].Name,
		},
		{
			name:      "nil job",
			job:       nil,
			jobID:     "foo",
			taskGroup: "bar",
		},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			// Create 10 terminal allocations
			var allocs []*structs.Allocation
			for i := 0; i < 10; i++ {
				alloc := mock.Alloc()
				alloc.Job = c.job
				alloc.JobID = c.jobID
				alloc.NodeID = uuid.Generate()
				alloc.Name = structs.AllocName(c.jobID, c.taskGroup, uint(i))
				alloc.TaskGroup = c.taskGroup
				if i%2 == 0 {
					alloc.DesiredStatus = structs.AllocDesiredStatusStop
				} else {
					alloc.ClientStatus = structs.AllocClientStatusFailed
				}
				allocs = append(allocs, alloc)
			}

			reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, c.jobID, c.job,
				nil, allocs, nil, "", 50, true)
			r := reconciler.Compute()
			require.Len(t, r.stop, 0)
			// Assert the correct results
			assertResults(t, r, &resultExpectation{
				createDeployment:  nil,
				deploymentUpdates: nil,
				place:             0,
				inplace:           0,
				stop:              0,
				desiredTGUpdates: map[string]*structs.DesiredUpdates{
					c.taskGroup: {},
				},
			})
		})
	}
}

// Tests the reconciler properly handles jobs with multiple task groups
func TestReconciler_MultiTG(t *testing.T) {
	ci.Parallel(t)

	job := mock.Job()
	tg2 := job.TaskGroups[0].Copy()
	tg2.Name = "foo"
	job.TaskGroups = append(job.TaskGroups, tg2)

	// Create 2 existing allocations for the first tg
	var allocs []*structs.Allocation
	for i := 0; i < 2; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		allocs = append(allocs, alloc)
	}

	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
		nil, allocs, nil, "", 50, true)
	r := reconciler.Compute()

	// Assert the correct results
	assertResults(t, r, &resultExpectation{
		createDeployment:  nil,
		deploymentUpdates: nil,
		place:             18,
		inplace:           0,
		stop:              0,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Place:  8,
				Ignore: 2,
			},
			tg2.Name: {
				Place: 10,
			},
		},
	})

	assertNamesHaveIndexes(t, intRange(2, 9, 0, 9), placeResultsToNames(r.place))
}
|
2017-06-02 23:11:29 +00:00
|
|
|
|
2018-04-25 23:43:44 +00:00
|
|
|
// Tests the reconciler properly handles jobs with multiple task groups with
|
2023-01-30 14:48:43 +00:00
|
|
|
// only one having an update block and a deployment already being created
|
|
|
|
func TestReconciler_MultiTG_SingleUpdateBlock(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-04-25 23:43:44 +00:00
|
|
|
job := mock.Job()
|
|
|
|
tg2 := job.TaskGroups[0].Copy()
|
|
|
|
tg2.Name = "foo"
|
|
|
|
job.TaskGroups = append(job.TaskGroups, tg2)
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
|
|
|
|
// Create all the allocs
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 2; i++ {
|
|
|
|
for j := 0; j < 10; j++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[i].Name, uint(j))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[i].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2018-04-25 23:43:44 +00:00
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
DesiredTotal: 10,
|
|
|
|
}
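	// The deployment above already covers the first group's full desired count,
	// and tg2 has no update block, so the reconciler should leave every
	// allocation in place.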
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2018-04-25 23:43:44 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 0,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Ignore: 10,
|
|
|
|
},
|
|
|
|
tg2.Name: {
|
|
|
|
Ignore: 10,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
// Tests delayed rescheduling of failed batch allocations
|
|
|
|
func TestReconciler_RescheduleLater_Batch(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
require := require.New(t)
|
2018-03-23 23:55:21 +00:00
|
|
|
|
2018-01-19 23:13:11 +00:00
|
|
|
// Set desired 4
|
2018-01-19 17:58:59 +00:00
|
|
|
job := mock.Job()
|
2018-01-19 23:13:11 +00:00
|
|
|
job.TaskGroups[0].Count = 4
|
2018-03-02 00:23:44 +00:00
|
|
|
now := time.Now()
|
2018-03-23 23:55:21 +00:00
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
// Set up reschedule policy
|
|
|
|
delayDur := 15 * time.Second
|
2018-03-26 19:45:09 +00:00
|
|
|
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 3, Interval: 24 * time.Hour, Delay: delayDur, DelayFunction: "constant"}
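	// With the "constant" delay function every reschedule attempt waits the same
	// delayDur (15s) after the failure, which is the WaitUntil value asserted on
	// the follow-up eval below.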
|
2018-03-02 00:23:44 +00:00
|
|
|
tgName := job.TaskGroups[0].Name
|
2018-03-23 23:55:21 +00:00
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
// Create 6 existing allocations - 2 running, 1 complete and 3 failed
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 6; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
}
|
2018-03-23 23:55:21 +00:00
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
// Mark 3 as failed with restart tracking info
|
|
|
|
allocs[0].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
allocs[0].NextAllocation = allocs[1].ID
|
|
|
|
allocs[1].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
allocs[1].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
|
|
|
|
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
|
|
|
|
PrevAllocID: allocs[0].ID,
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
}}
|
|
|
|
allocs[1].NextAllocation = allocs[2].ID
|
|
|
|
allocs[2].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
allocs[2].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
|
|
|
|
StartedAt: now.Add(-1 * time.Hour),
|
|
|
|
FinishedAt: now}}
|
|
|
|
allocs[2].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
|
|
|
|
{RescheduleTime: time.Now().Add(-2 * time.Hour).UTC().UnixNano(),
|
|
|
|
PrevAllocID: allocs[0].ID,
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
|
|
|
|
PrevAllocID: allocs[1].ID,
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
}}
|
2018-03-23 23:55:21 +00:00
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
// Mark one as complete
|
|
|
|
allocs[5].ClientStatus = structs.AllocClientStatusComplete
|
2018-01-19 17:58:59 +00:00
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, uuid.Generate(), 50, true)
|
2018-03-02 00:23:44 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Two reschedule attempts were already made, one more can be made at a future time
|
|
|
|
// Verify that the follow up eval has the expected waitUntil time
|
|
|
|
evals := r.desiredFollowupEvals[tgName]
|
|
|
|
require.NotNil(evals)
|
|
|
|
require.Equal(1, len(evals))
|
|
|
|
require.Equal(now.Add(delayDur), evals[0].WaitUntil)
|
|
|
|
|
|
|
|
// Alloc 5 should not be replaced because it is terminal
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 0,
|
2018-03-23 23:55:21 +00:00
|
|
|
inplace: 0,
|
2018-03-26 18:06:21 +00:00
|
|
|
attributeUpdates: 1,
|
2019-06-13 13:37:18 +00:00
|
|
|
stop: 0,
|
2018-03-02 00:23:44 +00:00
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 0,
|
2018-03-23 21:36:05 +00:00
|
|
|
InPlaceUpdate: 0,
|
|
|
|
Ignore: 4,
|
2019-06-13 13:37:18 +00:00
|
|
|
Stop: 0,
|
2018-03-02 00:23:44 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
2018-03-26 18:06:21 +00:00
|
|
|
assertNamesHaveIndexes(t, intRange(2, 2), attributeUpdatesToNames(r.attributeUpdates))
|
2018-03-23 23:55:21 +00:00
|
|
|
|
|
|
|
// Verify that the followup evalID field is set correctly
|
|
|
|
var annotated *structs.Allocation
|
2018-03-26 18:06:21 +00:00
|
|
|
for _, a := range r.attributeUpdates {
|
2018-03-23 23:55:21 +00:00
|
|
|
annotated = a
|
|
|
|
}
|
|
|
|
require.Equal(evals[0].ID, annotated.FollowupEvalID)
|
2018-03-02 00:23:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Tests delayed rescheduling of failed batch allocations and batching of allocs
|
|
|
|
// with fail times that are close together
|
|
|
|
func TestReconciler_RescheduleLaterWithBatchedEvals_Batch(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
require := require.New(t)
|
2018-03-23 23:55:21 +00:00
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
	// Set desired 10
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Count = 10
|
|
|
|
now := time.Now()
|
2018-03-23 23:55:21 +00:00
|
|
|
|
2018-01-19 17:58:59 +00:00
|
|
|
// Set up reschedule policy
|
2018-03-02 00:23:44 +00:00
|
|
|
delayDur := 15 * time.Second
|
2018-03-26 19:45:09 +00:00
|
|
|
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 3, Interval: 24 * time.Hour, Delay: delayDur, DelayFunction: "constant"}
|
2018-03-02 00:23:44 +00:00
|
|
|
tgName := job.TaskGroups[0].Name
|
2018-03-23 23:55:21 +00:00
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
// Create 10 existing allocations
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
}
|
2018-03-23 23:55:21 +00:00
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
// Mark 5 as failed with fail times very close together
|
|
|
|
for i := 0; i < 5; i++ {
|
|
|
|
allocs[i].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
allocs[i].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
|
|
|
|
StartedAt: now.Add(-1 * time.Hour),
|
|
|
|
FinishedAt: now.Add(time.Duration(50*i) * time.Millisecond)}}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Mark two more as failed several seconds later
|
|
|
|
for i := 5; i < 7; i++ {
|
|
|
|
allocs[i].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
allocs[i].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
|
|
|
|
StartedAt: now.Add(-1 * time.Hour),
|
|
|
|
FinishedAt: now.Add(10 * time.Second)}}
|
|
|
|
}
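	// Failures whose fail times fall within a small batching window share a
	// single follow-up eval, so the five closely spaced failures should map to
	// one eval and the two failures 10 seconds later to a second eval.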
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, uuid.Generate(), 50, true)
|
2018-03-02 00:23:44 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Verify that two follow up evals were created
|
|
|
|
evals := r.desiredFollowupEvals[tgName]
|
|
|
|
require.NotNil(evals)
|
|
|
|
require.Equal(2, len(evals))
|
|
|
|
|
|
|
|
// Verify expected WaitUntil values for both batched evals
|
|
|
|
require.Equal(now.Add(delayDur), evals[0].WaitUntil)
|
|
|
|
secondBatchDuration := delayDur + 10*time.Second
|
|
|
|
require.Equal(now.Add(secondBatchDuration), evals[1].WaitUntil)
|
2018-01-19 17:58:59 +00:00
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
	// Nothing should be placed or stopped yet; the failed allocs are only annotated for delayed rescheduling
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 0,
|
2018-03-23 23:55:21 +00:00
|
|
|
inplace: 0,
|
2018-03-26 18:06:21 +00:00
|
|
|
attributeUpdates: 7,
|
2019-06-13 13:37:18 +00:00
|
|
|
stop: 0,
|
2018-03-02 00:23:44 +00:00
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 0,
|
2018-03-23 21:36:05 +00:00
|
|
|
InPlaceUpdate: 0,
|
|
|
|
Ignore: 10,
|
2019-06-13 13:37:18 +00:00
|
|
|
Stop: 0,
|
2018-03-02 00:23:44 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
2018-03-26 18:06:21 +00:00
|
|
|
assertNamesHaveIndexes(t, intRange(0, 6), attributeUpdatesToNames(r.attributeUpdates))
|
2018-03-23 23:55:21 +00:00
|
|
|
|
|
|
|
// Verify that the followup evalID field is set correctly
|
2018-03-26 18:06:21 +00:00
|
|
|
for _, alloc := range r.attributeUpdates {
|
2018-03-02 00:23:44 +00:00
|
|
|
if allocNameToIndex(alloc.Name) < 5 {
|
|
|
|
require.Equal(evals[0].ID, alloc.FollowupEvalID)
|
|
|
|
} else if allocNameToIndex(alloc.Name) < 7 {
|
|
|
|
require.Equal(evals[1].ID, alloc.FollowupEvalID)
|
|
|
|
} else {
|
|
|
|
t.Fatalf("Unexpected alloc name in Inplace results %v", alloc.Name)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tests rescheduling failed batch allocations
|
|
|
|
func TestReconciler_RescheduleNow_Batch(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
require := require.New(t)
|
|
|
|
// Set desired 4
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Count = 4
|
|
|
|
now := time.Now()
|
|
|
|
// Set up reschedule policy
|
2018-03-26 19:45:09 +00:00
|
|
|
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 3, Interval: 24 * time.Hour, Delay: 5 * time.Second, DelayFunction: "constant"}
|
2018-03-02 00:23:44 +00:00
|
|
|
tgName := job.TaskGroups[0].Name
|
2018-01-19 23:13:11 +00:00
|
|
|
// Create 6 existing allocations - 2 running, 1 complete and 3 failed
|
2018-01-19 17:58:59 +00:00
|
|
|
var allocs []*structs.Allocation
|
2018-01-19 23:13:11 +00:00
|
|
|
for i := 0; i < 6; i++ {
|
2018-01-19 17:58:59 +00:00
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
}
|
2018-01-19 23:13:11 +00:00
|
|
|
// Mark 3 as failed with restart tracking info
|
2018-01-19 17:58:59 +00:00
|
|
|
allocs[0].ClientStatus = structs.AllocClientStatusFailed
|
2018-02-19 12:53:14 +00:00
|
|
|
allocs[0].NextAllocation = allocs[1].ID
|
2018-01-19 23:13:11 +00:00
|
|
|
allocs[1].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
allocs[1].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
|
|
|
|
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
|
|
|
|
PrevAllocID: allocs[0].ID,
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
}}
|
2018-02-19 12:53:14 +00:00
|
|
|
allocs[1].NextAllocation = allocs[2].ID
|
2018-01-19 23:13:11 +00:00
|
|
|
allocs[2].ClientStatus = structs.AllocClientStatusFailed
|
2018-03-02 00:23:44 +00:00
|
|
|
allocs[2].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
|
|
|
|
StartedAt: now.Add(-1 * time.Hour),
|
2018-04-03 20:49:18 +00:00
|
|
|
FinishedAt: now.Add(-5 * time.Second)}}
|
|
|
|
allocs[2].FollowupEvalID = uuid.Generate()
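	// The recorded follow-up eval is stale; the full 5s reschedule delay has
	// already elapsed, so this alloc should be replaced in the current pass.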
|
2018-01-19 23:13:11 +00:00
|
|
|
allocs[2].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
|
|
|
|
{RescheduleTime: time.Now().Add(-2 * time.Hour).UTC().UnixNano(),
|
|
|
|
PrevAllocID: allocs[0].ID,
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
|
|
|
|
PrevAllocID: allocs[1].ID,
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
}}
|
2018-01-19 17:58:59 +00:00
|
|
|
// Mark one as complete
|
2018-01-19 23:13:11 +00:00
|
|
|
allocs[5].ClientStatus = structs.AllocClientStatusComplete
|
2018-01-19 17:58:59 +00:00
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2018-04-03 20:49:18 +00:00
|
|
|
reconciler.now = now
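	// Pinning the clock above makes the failure that finished exactly 5 seconds
	// ago (the full reschedule delay) deterministically eligible right now.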
|
2018-01-19 17:58:59 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
// Verify that no follow up evals were created
|
|
|
|
evals := r.desiredFollowupEvals[tgName]
|
|
|
|
require.Nil(evals)
|
|
|
|
|
|
|
|
// Two reschedule attempts were made, one more can be made now
|
2018-01-19 23:13:11 +00:00
|
|
|
// Alloc 5 should not be replaced because it is terminal
|
2018-01-19 17:58:59 +00:00
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 1,
|
2019-06-06 19:04:32 +00:00
|
|
|
stop: 1,
|
2018-01-19 17:58:59 +00:00
|
|
|
inplace: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 1,
|
2019-06-06 19:04:32 +00:00
|
|
|
Stop: 1,
|
2018-01-19 23:13:11 +00:00
|
|
|
Ignore: 3,
|
2018-01-19 17:58:59 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
2018-03-02 00:23:44 +00:00
|
|
|
|
2018-01-19 23:13:11 +00:00
|
|
|
assertNamesHaveIndexes(t, intRange(2, 2), placeResultsToNames(r.place))
|
2018-01-19 17:58:59 +00:00
|
|
|
assertPlaceResultsHavePreviousAllocs(t, 1, r.place)
|
2018-01-19 19:21:50 +00:00
|
|
|
assertPlacementsAreRescheduled(t, 1, r.place)
|
2018-03-02 00:23:44 +00:00
|
|
|
|
2018-01-19 17:58:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Tests delayed rescheduling of failed service allocations
|
2018-03-02 00:23:44 +00:00
|
|
|
func TestReconciler_RescheduleLater_Service(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
require := require.New(t)
|
2018-03-23 23:55:21 +00:00
|
|
|
|
2018-01-19 17:58:59 +00:00
|
|
|
// Set desired 5
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Count = 5
|
2018-03-02 00:23:44 +00:00
|
|
|
tgName := job.TaskGroups[0].Name
|
|
|
|
now := time.Now()
|
2018-03-23 23:55:21 +00:00
|
|
|
|
2018-01-19 17:58:59 +00:00
|
|
|
// Set up reschedule policy
|
2018-03-02 00:23:44 +00:00
|
|
|
delayDur := 15 * time.Second
|
2018-03-13 15:06:26 +00:00
|
|
|
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 1, Interval: 24 * time.Hour, Delay: delayDur, MaxDelay: 1 * time.Hour}
|
2018-01-19 17:58:59 +00:00
|
|
|
|
|
|
|
// Create 5 existing allocations
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 5; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
}
|
2018-03-23 23:55:21 +00:00
|
|
|
|
2018-01-19 17:58:59 +00:00
|
|
|
// Mark two as failed
|
|
|
|
allocs[0].ClientStatus = structs.AllocClientStatusFailed
|
2018-03-23 23:55:21 +00:00
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
// Mark one of them as already rescheduled once
|
|
|
|
allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
|
|
|
|
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
|
|
|
|
PrevAllocID: uuid.Generate(),
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
}}
|
|
|
|
allocs[1].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
|
|
|
|
StartedAt: now.Add(-1 * time.Hour),
|
|
|
|
FinishedAt: now}}
|
2018-01-19 17:58:59 +00:00
|
|
|
allocs[1].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
// Mark one as desired state stop
|
|
|
|
allocs[4].DesiredStatus = structs.AllocDesiredStatusStop
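	// The stopped alloc should get a straight replacement placed now, while the
	// recently failed alloc is only annotated with a follow-up eval for its
	// delayed reschedule.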
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, uuid.Generate(), 50, true)
|
2018-03-02 00:23:44 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Should place a new placement and create a follow up eval for the delayed reschedule
|
|
|
|
// Verify that the follow up eval has the expected waitUntil time
|
|
|
|
evals := r.desiredFollowupEvals[tgName]
|
|
|
|
require.NotNil(evals)
|
|
|
|
require.Equal(1, len(evals))
|
|
|
|
require.Equal(now.Add(delayDur), evals[0].WaitUntil)
|
|
|
|
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 1,
|
2018-03-23 23:55:21 +00:00
|
|
|
inplace: 0,
|
2018-03-26 18:06:21 +00:00
|
|
|
attributeUpdates: 1,
|
2019-06-13 13:37:18 +00:00
|
|
|
stop: 0,
|
2018-03-02 00:23:44 +00:00
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 1,
|
2018-03-23 21:36:05 +00:00
|
|
|
InPlaceUpdate: 0,
|
|
|
|
Ignore: 4,
|
2019-06-13 13:37:18 +00:00
|
|
|
Stop: 0,
|
2018-03-02 00:23:44 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
assertNamesHaveIndexes(t, intRange(4, 4), placeResultsToNames(r.place))
|
2018-03-26 18:06:21 +00:00
|
|
|
assertNamesHaveIndexes(t, intRange(1, 1), attributeUpdatesToNames(r.attributeUpdates))
|
2018-03-23 23:55:21 +00:00
|
|
|
|
|
|
|
// Verify that the followup evalID field is set correctly
|
|
|
|
var annotated *structs.Allocation
|
2018-03-26 18:06:21 +00:00
|
|
|
for _, a := range r.attributeUpdates {
|
2018-03-23 23:55:21 +00:00
|
|
|
annotated = a
|
|
|
|
}
|
|
|
|
require.Equal(evals[0].ID, annotated.FollowupEvalID)
|
2018-03-02 00:23:44 +00:00
|
|
|
}
|
|
|
|
|
2018-03-23 23:41:00 +00:00
|
|
|
// Tests service allocations with client status complete
|
|
|
|
func TestReconciler_Service_ClientStatusComplete(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-03-23 23:41:00 +00:00
|
|
|
// Set desired 5
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Count = 5
|
|
|
|
|
|
|
|
// Set up reschedule policy
|
|
|
|
delayDur := 15 * time.Second
|
2018-03-26 18:13:21 +00:00
|
|
|
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
|
|
|
|
Attempts: 1,
|
|
|
|
Interval: 24 * time.Hour,
|
|
|
|
Delay: delayDur,
|
|
|
|
MaxDelay: 1 * time.Hour,
|
|
|
|
}
|
2018-03-23 23:41:00 +00:00
|
|
|
|
|
|
|
// Create 5 existing allocations
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 5; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
alloc.DesiredStatus = structs.AllocDesiredStatusRun
|
|
|
|
}
|
|
|
|
|
|
|
|
// Mark one as client status complete
|
|
|
|
allocs[4].ClientStatus = structs.AllocClientStatusComplete
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2018-03-23 23:41:00 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Should place a new placement for the alloc that was marked complete
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 1,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 1,
|
|
|
|
InPlaceUpdate: 0,
|
|
|
|
Ignore: 4,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
assertNamesHaveIndexes(t, intRange(4, 4), placeResultsToNames(r.place))
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2018-03-29 14:28:52 +00:00
|
|
|
// Tests service job placement when an allocation has desired state stop and a failed client status
|
|
|
|
func TestReconciler_Service_DesiredStop_ClientStatusComplete(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-03-29 14:28:52 +00:00
|
|
|
// Set desired 5
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Count = 5
|
|
|
|
|
|
|
|
// Set up reschedule policy
|
|
|
|
delayDur := 15 * time.Second
|
|
|
|
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
|
|
|
|
Attempts: 1,
|
|
|
|
Interval: 24 * time.Hour,
|
|
|
|
Delay: delayDur,
|
|
|
|
MaxDelay: 1 * time.Hour,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create 5 existing allocations
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 5; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
alloc.DesiredStatus = structs.AllocDesiredStatusRun
|
|
|
|
}
|
|
|
|
|
|
|
|
// Mark one as failed but with desired status stop
|
|
|
|
// Should not trigger rescheduling logic but should trigger a placement
|
|
|
|
allocs[4].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
allocs[4].DesiredStatus = structs.AllocDesiredStatusStop
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2018-03-29 14:28:52 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Should place a new placement for the alloc that was marked stopped
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 1,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 1,
|
|
|
|
InPlaceUpdate: 0,
|
|
|
|
Ignore: 4,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
assertNamesHaveIndexes(t, intRange(4, 4), placeResultsToNames(r.place))
|
|
|
|
|
|
|
|
// Should not have any follow up evals created
|
|
|
|
require := require.New(t)
|
|
|
|
require.Equal(0, len(r.desiredFollowupEvals))
|
|
|
|
}
|
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
// Tests rescheduling failed service allocations with desired state stop
|
|
|
|
func TestReconciler_RescheduleNow_Service(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
require := require.New(t)
|
2018-03-26 18:06:21 +00:00
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
// Set desired 5
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Count = 5
|
|
|
|
tgName := job.TaskGroups[0].Name
|
|
|
|
now := time.Now()
|
2018-03-26 18:06:21 +00:00
|
|
|
|
2023-01-30 14:48:43 +00:00
|
|
|
// Set up reschedule policy and update block
|
2018-03-26 18:06:21 +00:00
|
|
|
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
|
|
|
|
Attempts: 1,
|
|
|
|
Interval: 24 * time.Hour,
|
|
|
|
Delay: 5 * time.Second,
|
|
|
|
DelayFunction: "",
|
|
|
|
MaxDelay: 1 * time.Hour,
|
|
|
|
Unlimited: false,
|
|
|
|
}
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
2018-03-02 00:23:44 +00:00
|
|
|
|
|
|
|
// Create 5 existing allocations
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 5; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
}
|
2018-03-26 18:06:21 +00:00
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
// Mark two as failed
|
|
|
|
allocs[0].ClientStatus = structs.AllocClientStatusFailed
|
2018-03-26 18:06:21 +00:00
|
|
|
|
2018-01-19 21:20:00 +00:00
|
|
|
// Mark one of them as already rescheduled once
|
2018-03-02 00:23:44 +00:00
|
|
|
allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
|
2018-01-19 21:20:00 +00:00
|
|
|
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
|
|
|
|
PrevAllocID: uuid.Generate(),
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
}}
|
2018-03-02 00:23:44 +00:00
|
|
|
allocs[1].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
|
|
|
|
StartedAt: now.Add(-1 * time.Hour),
|
|
|
|
FinishedAt: now.Add(-10 * time.Second)}}
|
|
|
|
allocs[1].ClientStatus = structs.AllocClientStatusFailed
|
2018-01-19 21:20:00 +00:00
|
|
|
|
2018-01-19 17:58:59 +00:00
|
|
|
// Mark one as desired state stop
|
|
|
|
allocs[4].DesiredStatus = structs.AllocDesiredStatusStop
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2018-01-19 17:58:59 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
// Verify that no follow up evals were created
|
|
|
|
evals := r.desiredFollowupEvals[tgName]
|
|
|
|
require.Nil(evals)
|
|
|
|
|
|
|
|
// Verify that one rescheduled alloc and one replacement for terminal alloc were placed
|
2018-01-19 17:58:59 +00:00
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
2018-01-19 21:20:00 +00:00
|
|
|
place: 2,
|
2018-01-19 17:58:59 +00:00
|
|
|
inplace: 0,
|
2019-06-06 19:04:32 +00:00
|
|
|
stop: 1,
|
2018-01-19 17:58:59 +00:00
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
2018-01-19 21:20:00 +00:00
|
|
|
Place: 2,
|
|
|
|
Ignore: 3,
|
2019-06-06 19:04:32 +00:00
|
|
|
Stop: 1,
|
2018-01-19 17:58:59 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
// Rescheduled allocs should have previous allocs
|
2018-03-26 18:06:21 +00:00
|
|
|
assertNamesHaveIndexes(t, intRange(1, 1, 4, 4), placeResultsToNames(r.place))
|
2018-01-19 21:20:00 +00:00
|
|
|
assertPlaceResultsHavePreviousAllocs(t, 1, r.place)
|
|
|
|
assertPlacementsAreRescheduled(t, 1, r.place)
|
2018-01-19 17:58:59 +00:00
|
|
|
}
|
|
|
|
|
2018-04-03 20:49:18 +00:00
|
|
|
// Tests rescheduling failed service allocations when there's clock drift (up to a second)
|
|
|
|
func TestReconciler_RescheduleNow_WithinAllowedTimeWindow(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-04-03 20:49:18 +00:00
|
|
|
require := require.New(t)
|
|
|
|
|
|
|
|
// Set desired 5
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Count = 5
|
|
|
|
tgName := job.TaskGroups[0].Name
|
|
|
|
now := time.Now()
|
|
|
|
|
2023-01-30 14:48:43 +00:00
|
|
|
// Set up reschedule policy and update block
|
2018-04-03 20:49:18 +00:00
|
|
|
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
|
|
|
|
Attempts: 1,
|
|
|
|
Interval: 24 * time.Hour,
|
|
|
|
Delay: 5 * time.Second,
|
|
|
|
DelayFunction: "",
|
|
|
|
MaxDelay: 1 * time.Hour,
|
|
|
|
Unlimited: false,
|
|
|
|
}
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
|
|
|
|
// Create 5 existing allocations
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 5; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
}
|
|
|
|
|
|
|
|
// Mark one as failed
|
|
|
|
allocs[0].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
|
|
|
|
// Mark one of them as already rescheduled once
|
|
|
|
allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
|
|
|
|
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
|
|
|
|
PrevAllocID: uuid.Generate(),
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
}}
|
|
|
|
// Set fail time to 4 seconds ago which falls within the reschedule window
|
|
|
|
allocs[1].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
|
|
|
|
StartedAt: now.Add(-1 * time.Hour),
|
|
|
|
FinishedAt: now.Add(-4 * time.Second)}}
|
|
|
|
allocs[1].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2018-04-03 20:49:18 +00:00
|
|
|
reconciler.now = now
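	// With the clock pinned, the failure is within one second of its 5s
	// reschedule delay, which the reconciler treats as close enough to
	// reschedule immediately instead of emitting a follow-up eval.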
|
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Verify that no follow up evals were created
|
|
|
|
evals := r.desiredFollowupEvals[tgName]
|
|
|
|
require.Nil(evals)
|
|
|
|
|
|
|
|
// Verify that one rescheduled alloc was placed
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 1,
|
|
|
|
inplace: 0,
|
2019-06-06 19:04:32 +00:00
|
|
|
stop: 1,
|
2018-04-03 20:49:18 +00:00
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 1,
|
2019-06-06 19:04:32 +00:00
|
|
|
Stop: 1,
|
2018-04-03 20:49:18 +00:00
|
|
|
Ignore: 4,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
// Rescheduled allocs should have previous allocs
|
|
|
|
assertNamesHaveIndexes(t, intRange(1, 1), placeResultsToNames(r.place))
|
|
|
|
assertPlaceResultsHavePreviousAllocs(t, 1, r.place)
|
|
|
|
assertPlacementsAreRescheduled(t, 1, r.place)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tests rescheduling failed service allocations when the eval ID matches and there's a large clock drift
|
|
|
|
func TestReconciler_RescheduleNow_EvalIDMatch(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-04-03 20:49:18 +00:00
|
|
|
require := require.New(t)
|
|
|
|
|
|
|
|
// Set desired 5
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Count = 5
|
|
|
|
tgName := job.TaskGroups[0].Name
|
|
|
|
now := time.Now()
|
|
|
|
|
2023-01-30 14:48:43 +00:00
|
|
|
// Set up reschedule policy and update block
|
2018-04-03 20:49:18 +00:00
|
|
|
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
|
|
|
|
Attempts: 1,
|
|
|
|
Interval: 24 * time.Hour,
|
|
|
|
Delay: 5 * time.Second,
|
|
|
|
DelayFunction: "",
|
|
|
|
MaxDelay: 1 * time.Hour,
|
|
|
|
Unlimited: false,
|
|
|
|
}
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
|
|
|
|
// Create 5 existing allocations
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 5; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
}
|
|
|
|
|
|
|
|
// Mark one as failed
|
|
|
|
allocs[0].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
|
|
|
|
// Mark one of them as already rescheduled once
|
|
|
|
allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
|
|
|
|
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
|
|
|
|
PrevAllocID: uuid.Generate(),
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
}}
|
|
|
|
// Set fail time to 5 seconds ago and eval ID
|
|
|
|
evalID := uuid.Generate()
|
|
|
|
allocs[1].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
|
|
|
|
StartedAt: now.Add(-1 * time.Hour),
|
|
|
|
FinishedAt: now.Add(-5 * time.Second)}}
|
|
|
|
allocs[1].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
allocs[1].FollowupEvalID = evalID
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, evalID, 50, true)
|
2018-04-03 20:49:18 +00:00
|
|
|
reconciler.now = now.Add(-30 * time.Second)
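	// Winding the reconciler's clock back 30 seconds simulates heavy clock
	// drift; the alloc should still be placed now because its FollowupEvalID
	// matches the eval being processed.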
|
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Verify that no follow up evals were created
|
|
|
|
evals := r.desiredFollowupEvals[tgName]
|
|
|
|
require.Nil(evals)
|
|
|
|
|
|
|
|
// Verify that one rescheduled alloc was placed
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 1,
|
2019-06-06 19:04:32 +00:00
|
|
|
stop: 1,
|
2018-04-03 20:49:18 +00:00
|
|
|
inplace: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 1,
|
2019-06-06 19:04:32 +00:00
|
|
|
Stop: 1,
|
2018-04-03 20:49:18 +00:00
|
|
|
Ignore: 4,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
// Rescheduled allocs should have previous allocs
|
|
|
|
assertNamesHaveIndexes(t, intRange(1, 1), placeResultsToNames(r.place))
|
|
|
|
assertPlaceResultsHavePreviousAllocs(t, 1, r.place)
|
|
|
|
assertPlacementsAreRescheduled(t, 1, r.place)
|
|
|
|
}
|
|
|
|
|
2018-04-20 01:04:32 +00:00
|
|
|
// Tests rescheduling failed service allocations when there are canaries
|
|
|
|
func TestReconciler_RescheduleNow_Service_WithCanaries(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-04-20 01:04:32 +00:00
|
|
|
require := require.New(t)
|
|
|
|
|
|
|
|
// Set desired 5
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Count = 5
|
|
|
|
tgName := job.TaskGroups[0].Name
|
|
|
|
now := time.Now()
|
|
|
|
|
2023-01-30 14:48:43 +00:00
|
|
|
// Set up reschedule policy and update block
|
2018-04-20 01:04:32 +00:00
|
|
|
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
|
|
|
|
Attempts: 1,
|
|
|
|
Interval: 24 * time.Hour,
|
|
|
|
Delay: 5 * time.Second,
|
|
|
|
DelayFunction: "",
|
|
|
|
MaxDelay: 1 * time.Hour,
|
|
|
|
Unlimited: false,
|
|
|
|
}
|
|
|
|
job.TaskGroups[0].Update = canaryUpdate
|
|
|
|
|
|
|
|
job2 := job.Copy()
|
|
|
|
job2.Version++
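	// The existing allocs below are created from the old job version, so the
	// deployment belongs to job2 and requires canaries before promotion.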
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job2, 50)
|
2018-04-20 01:04:32 +00:00
|
|
|
d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
|
|
|
|
s := &structs.DeploymentState{
|
|
|
|
DesiredCanaries: 2,
|
|
|
|
DesiredTotal: 5,
|
|
|
|
}
|
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = s
|
|
|
|
|
|
|
|
// Create 5 existing allocations
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 5; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
}
|
|
|
|
|
|
|
|
// Mark three as failed
|
|
|
|
allocs[0].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
|
|
|
|
// Mark one of them as already rescheduled once
|
|
|
|
allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
|
|
|
|
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
|
|
|
|
PrevAllocID: uuid.Generate(),
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
}}
|
|
|
|
allocs[1].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
|
|
|
|
StartedAt: now.Add(-1 * time.Hour),
|
|
|
|
FinishedAt: now.Add(-10 * time.Second)}}
|
|
|
|
allocs[1].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
|
|
|
|
	// Mark a third one as failed
|
|
|
|
allocs[4].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
|
|
|
|
// Create 2 canary allocations
|
|
|
|
for i := 0; i < 2; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
alloc.DeploymentID = d.ID
|
|
|
|
alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
|
|
|
|
Canary: true,
|
2022-08-17 16:26:34 +00:00
|
|
|
Healthy: pointer.Of(false),
|
2018-04-20 01:04:32 +00:00
|
|
|
}
|
|
|
|
s.PlacedCanaries = append(s.PlacedCanaries, alloc.ID)
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job2,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2018-04-20 01:04:32 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Verify that no follow up evals were created
|
|
|
|
evals := r.desiredFollowupEvals[tgName]
|
|
|
|
require.Nil(evals)
|
|
|
|
|
|
|
|
	// Verify that the two failed allocs (name indexes 1 and 4) were rescheduled
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 2,
|
2019-06-06 19:04:32 +00:00
|
|
|
stop: 2,
|
2018-04-20 01:04:32 +00:00
|
|
|
inplace: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 2,
|
2019-06-06 19:04:32 +00:00
|
|
|
Stop: 2,
|
2018-04-20 01:04:32 +00:00
|
|
|
Ignore: 5,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
// Rescheduled allocs should have previous allocs
|
|
|
|
assertNamesHaveIndexes(t, intRange(1, 1, 4, 4), placeResultsToNames(r.place))
|
|
|
|
assertPlaceResultsHavePreviousAllocs(t, 2, r.place)
|
|
|
|
assertPlacementsAreRescheduled(t, 2, r.place)
|
|
|
|
}
|
|
|
|
|
2018-04-20 21:59:08 +00:00
|
|
|
// Tests rescheduling failed canary service allocations
|
|
|
|
func TestReconciler_RescheduleNow_Service_Canaries(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-04-20 21:59:08 +00:00
|
|
|
require := require.New(t)
|
|
|
|
|
|
|
|
// Set desired 5
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Count = 5
|
|
|
|
tgName := job.TaskGroups[0].Name
|
|
|
|
now := time.Now()
|
|
|
|
|
2023-01-30 14:48:43 +00:00
|
|
|
// Set up reschedule policy and update block
|
2018-04-23 23:35:25 +00:00
|
|
|
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
|
|
|
|
Delay: 5 * time.Second,
|
|
|
|
DelayFunction: "constant",
|
|
|
|
MaxDelay: 1 * time.Hour,
|
|
|
|
Unlimited: true,
|
|
|
|
}
|
|
|
|
job.TaskGroups[0].Update = canaryUpdate
|
|
|
|
|
|
|
|
job2 := job.Copy()
|
|
|
|
job2.Version++
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job2, 50)
|
2018-04-23 23:35:25 +00:00
|
|
|
d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
|
|
|
|
s := &structs.DeploymentState{
|
|
|
|
DesiredCanaries: 2,
|
|
|
|
DesiredTotal: 5,
|
|
|
|
}
|
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = s
|
|
|
|
|
|
|
|
// Create 5 existing allocations
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 5; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
}
|
|
|
|
|
|
|
|
	// Create 2 running canary allocations (not yet marked healthy)
|
|
|
|
for i := 0; i < 2; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
alloc.DeploymentID = d.ID
|
|
|
|
alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
|
|
|
|
Canary: true,
|
2022-08-17 16:26:34 +00:00
|
|
|
Healthy: pointer.Of(false),
|
2018-04-23 23:35:25 +00:00
|
|
|
}
|
|
|
|
s.PlacedCanaries = append(s.PlacedCanaries, alloc.ID)
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Mark the canaries as failed
|
|
|
|
allocs[5].ClientStatus = structs.AllocClientStatusFailed
|
2022-08-17 16:26:34 +00:00
|
|
|
allocs[5].DesiredTransition.Reschedule = pointer.Of(true)
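	// DesiredTransition.Reschedule is the server-set hint that explicitly asks
	// for a failed canary to be rescheduled.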
|
2018-04-23 23:35:25 +00:00
|
|
|
|
|
|
|
// Mark one of them as already rescheduled once
|
|
|
|
allocs[5].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
|
|
|
|
{RescheduleTime: now.Add(-1 * time.Hour).UTC().UnixNano(),
|
|
|
|
PrevAllocID: uuid.Generate(),
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
}}
|
|
|
|
|
|
|
|
allocs[6].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
|
|
|
|
StartedAt: now.Add(-1 * time.Hour),
|
|
|
|
FinishedAt: now.Add(-10 * time.Second)}}
|
|
|
|
allocs[6].ClientStatus = structs.AllocClientStatusFailed
|
2022-08-17 16:26:34 +00:00
|
|
|
allocs[6].DesiredTransition.Reschedule = pointer.Of(true)
|
2018-04-23 23:35:25 +00:00
|
|
|
|
|
|
|
// Create 4 unhealthy canary allocations that have already been replaced
|
|
|
|
for i := 0; i < 4; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i%2))
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
alloc.DeploymentID = d.ID
|
|
|
|
alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
|
|
|
|
Canary: true,
|
2022-08-17 16:26:34 +00:00
|
|
|
Healthy: pointer.Of(false),
|
2018-04-23 23:35:25 +00:00
|
|
|
}
|
|
|
|
s.PlacedCanaries = append(s.PlacedCanaries, alloc.ID)
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
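	// The already-replaced canaries reuse name indexes 0 and 1; they should not
	// produce any placements beyond the two failed canaries handled above.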
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job2,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2018-04-23 23:35:25 +00:00
|
|
|
reconciler.now = now
|
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Verify that no follow up evals were created
|
|
|
|
evals := r.desiredFollowupEvals[tgName]
|
|
|
|
require.Nil(evals)
|
|
|
|
|
|
|
|
	// Verify that both failed canaries were rescheduled
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 2,
|
2019-06-06 19:04:32 +00:00
|
|
|
stop: 2,
|
2018-04-23 23:35:25 +00:00
|
|
|
inplace: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 2,
|
2019-06-06 19:04:32 +00:00
|
|
|
Stop: 2,
|
2018-04-23 23:35:25 +00:00
|
|
|
Ignore: 9,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
// Rescheduled allocs should have previous allocs
|
|
|
|
assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place))
|
|
|
|
assertPlaceResultsHavePreviousAllocs(t, 2, r.place)
|
|
|
|
assertPlacementsAreRescheduled(t, 2, r.place)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tests rescheduling failed canary service allocations when one has reached its
|
|
|
|
// reschedule limit
|
|
|
|
func TestReconciler_RescheduleNow_Service_Canaries_Limit(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-04-23 23:35:25 +00:00
|
|
|
require := require.New(t)
|
|
|
|
|
|
|
|
// Set desired 5
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Count = 5
|
|
|
|
tgName := job.TaskGroups[0].Name
|
|
|
|
now := time.Now()
|
|
|
|
|
2023-01-30 14:48:43 +00:00
|
|
|
// Set up reschedule policy and update block
|
2018-04-20 21:59:08 +00:00
|
|
|
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
|
|
|
|
Attempts: 1,
|
|
|
|
Interval: 24 * time.Hour,
|
|
|
|
Delay: 5 * time.Second,
|
|
|
|
DelayFunction: "",
|
|
|
|
MaxDelay: 1 * time.Hour,
|
|
|
|
Unlimited: false,
|
|
|
|
}
|
|
|
|
job.TaskGroups[0].Update = canaryUpdate
|
|
|
|
|
|
|
|
job2 := job.Copy()
|
|
|
|
job2.Version++
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job2, 50)
|
2018-04-20 21:59:08 +00:00
|
|
|
d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
|
|
|
|
s := &structs.DeploymentState{
|
|
|
|
DesiredCanaries: 2,
|
|
|
|
DesiredTotal: 5,
|
|
|
|
}
|
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = s
|
|
|
|
|
|
|
|
// Create 5 existing allocations
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 5; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
}
|
|
|
|
|
2018-04-23 23:35:25 +00:00
|
|
|
	// Create 2 running canary allocations (not yet marked healthy)
|
2018-04-20 21:59:08 +00:00
|
|
|
for i := 0; i < 2; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
alloc.DeploymentID = d.ID
|
|
|
|
alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
|
|
|
|
Canary: true,
|
2022-08-17 16:26:34 +00:00
|
|
|
Healthy: pointer.Of(false),
|
2018-04-20 21:59:08 +00:00
|
|
|
}
|
|
|
|
s.PlacedCanaries = append(s.PlacedCanaries, alloc.ID)
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Mark the canaries as failed
|
|
|
|
allocs[5].ClientStatus = structs.AllocClientStatusFailed
|
2022-08-17 16:26:34 +00:00
|
|
|
allocs[5].DesiredTransition.Reschedule = pointer.Of(true)
|
2018-04-20 21:59:08 +00:00
|
|
|
|
|
|
|
// Mark one of them as already rescheduled once
|
|
|
|
allocs[5].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
|
2018-04-23 23:35:25 +00:00
|
|
|
{RescheduleTime: now.Add(-1 * time.Hour).UTC().UnixNano(),
|
2018-04-20 21:59:08 +00:00
|
|
|
PrevAllocID: uuid.Generate(),
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
}}
|
2018-04-23 23:35:25 +00:00
|
|
|
|
2018-04-20 21:59:08 +00:00
|
|
|
allocs[6].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
|
|
|
|
StartedAt: now.Add(-1 * time.Hour),
|
|
|
|
FinishedAt: now.Add(-10 * time.Second)}}
|
|
|
|
allocs[6].ClientStatus = structs.AllocClientStatusFailed
|
2022-08-17 16:26:34 +00:00
|
|
|
allocs[6].DesiredTransition.Reschedule = pointer.Of(true)
|
2018-04-23 23:35:25 +00:00
|
|
|
|
|
|
|
// Create 4 unhealthy canary allocations that have already been replaced
|
|
|
|
for i := 0; i < 4; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i%2))
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
alloc.DeploymentID = d.ID
|
|
|
|
alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
|
|
|
|
Canary: true,
|
2022-08-17 16:26:34 +00:00
|
|
|
Healthy: pointer.Of(false),
|
2018-04-23 23:35:25 +00:00
|
|
|
}
|
|
|
|
s.PlacedCanaries = append(s.PlacedCanaries, alloc.ID)
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
2018-04-20 21:59:08 +00:00
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job2,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2018-04-23 23:35:25 +00:00
|
|
|
reconciler.now = now
|
2018-04-20 21:59:08 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Verify that no follow up evals were created
|
|
|
|
evals := r.desiredFollowupEvals[tgName]
|
|
|
|
require.Nil(evals)
|
|
|
|
|
|
|
|
	// Verify that only the canary still under its reschedule limit was replaced
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
2018-04-23 23:35:25 +00:00
|
|
|
place: 1,
|
2019-06-06 19:04:32 +00:00
|
|
|
stop: 1,
|
2018-04-20 21:59:08 +00:00
|
|
|
inplace: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
2018-04-23 23:35:25 +00:00
|
|
|
Place: 1,
|
2019-06-06 19:04:32 +00:00
|
|
|
Stop: 1,
|
2018-04-23 23:35:25 +00:00
|
|
|
Ignore: 10,
|
2018-04-20 21:59:08 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
// Rescheduled allocs should have previous allocs
|
|
|
|
assertNamesHaveIndexes(t, intRange(1, 1), placeResultsToNames(r.place))
|
|
|
|
assertPlaceResultsHavePreviousAllocs(t, 1, r.place)
|
|
|
|
assertPlacementsAreRescheduled(t, 1, r.place)
|
|
|
|
}
|
|
|
|
|
2018-02-19 12:53:14 +00:00
|
|
|
// Tests that failed service allocations which were already rescheduled are not rescheduled again
|
|
|
|
func TestReconciler_DontReschedule_PreviouslyRescheduled(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-02-19 12:53:14 +00:00
|
|
|
// Set desired 5
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Count = 5
|
|
|
|
|
|
|
|
// Set up reschedule policy
|
|
|
|
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 5, Interval: 24 * time.Hour}
|
|
|
|
|
|
|
|
// Create 7 existing allocations
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 7; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
}
|
|
|
|
// Mark two as failed and rescheduled
|
|
|
|
allocs[0].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
allocs[0].ID = allocs[1].ID
|
|
|
|
allocs[1].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
allocs[1].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
|
|
|
|
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
|
|
|
|
PrevAllocID: uuid.Generate(),
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
}}
|
|
|
|
allocs[1].NextAllocation = allocs[2].ID
|
|
|
|
|
|
|
|
// Mark one as desired state stop
|
|
|
|
allocs[4].DesiredStatus = structs.AllocDesiredStatusStop
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2018-02-19 12:53:14 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Should place 1 - one is a new placement to make up the desired count of 5
|
|
|
|
	// the previously rescheduled failing allocs are not rescheduled again
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 1,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 1,
|
|
|
|
Ignore: 4,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
	// name index 0 is used for the replacement because allocs[0] was given alloc 1's ID, so index 0 is no longer counted as in use
|
|
|
|
assertNamesHaveIndexes(t, intRange(0, 0), placeResultsToNames(r.place))
|
|
|
|
}
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
// Tests the reconciler cancels an old deployment when the job is being stopped
|
|
|
|
func TestReconciler_CancelDeployment_JobStop(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.Stop = true
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
running := structs.NewDeployment(job, 50)
|
|
|
|
failed := structs.NewDeployment(job, 50)
|
2017-06-02 23:11:29 +00:00
|
|
|
failed.Status = structs.DeploymentStatusFailed
|
|
|
|
|
|
|
|
cases := []struct {
|
|
|
|
name string
|
|
|
|
job *structs.Job
|
|
|
|
jobID, taskGroup string
|
|
|
|
deployment *structs.Deployment
|
|
|
|
cancel bool
|
|
|
|
}{
|
|
|
|
{
|
|
|
|
name: "stopped job, running deployment",
|
|
|
|
job: job,
|
|
|
|
jobID: job.ID,
|
|
|
|
taskGroup: job.TaskGroups[0].Name,
|
|
|
|
deployment: running,
|
|
|
|
cancel: true,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "nil job, running deployment",
|
|
|
|
job: nil,
|
|
|
|
jobID: "foo",
|
|
|
|
taskGroup: "bar",
|
|
|
|
deployment: running,
|
|
|
|
cancel: true,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "stopped job, failed deployment",
|
|
|
|
job: job,
|
|
|
|
jobID: job.ID,
|
|
|
|
taskGroup: job.TaskGroups[0].Name,
|
|
|
|
deployment: failed,
|
|
|
|
cancel: false,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "nil job, failed deployment",
|
|
|
|
job: nil,
|
|
|
|
jobID: "foo",
|
|
|
|
taskGroup: "bar",
|
|
|
|
deployment: failed,
|
|
|
|
cancel: false,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, c := range cases {
|
|
|
|
t.Run(c.name, func(t *testing.T) {
|
|
|
|
// Create 10 allocations
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = c.job
|
|
|
|
alloc.JobID = c.jobID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(c.jobID, c.taskGroup, uint(i))
|
|
|
|
alloc.TaskGroup = c.taskGroup
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, c.jobID, c.job,
|
2022-03-07 18:40:57 +00:00
|
|
|
c.deployment, allocs, nil, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
var updates []*structs.DeploymentStatusUpdate
|
|
|
|
if c.cancel {
|
|
|
|
updates = []*structs.DeploymentStatusUpdate{
|
|
|
|
{
|
|
|
|
DeploymentID: c.deployment.ID,
|
|
|
|
Status: structs.DeploymentStatusCancelled,
|
|
|
|
StatusDescription: structs.DeploymentStatusDescriptionStoppedJob,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: updates,
|
|
|
|
place: 0,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 10,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
c.taskGroup: {
|
|
|
|
Stop: 10,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
assertNamesHaveIndexes(t, intRange(0, 9), stopResultsToNames(r.stop))
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tests the reconciler cancels an old deployment when the job is updated
|
|
|
|
func TestReconciler_CancelDeployment_JobUpdate(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
// Create a base job
|
|
|
|
job := mock.Job()
|
|
|
|
|
|
|
|
// Create two deployments
|
2021-11-23 08:23:31 +00:00
|
|
|
running := structs.NewDeployment(job, 50)
|
|
|
|
failed := structs.NewDeployment(job, 50)
|
2017-06-02 23:11:29 +00:00
|
|
|
failed.Status = structs.DeploymentStatusFailed
|
|
|
|
|
|
|
|
// Make the job newer than the deployment
|
2017-07-07 02:55:58 +00:00
|
|
|
job.Version += 10
|
2017-06-02 23:11:29 +00:00
|
|
|
|
|
|
|
cases := []struct {
|
|
|
|
name string
|
|
|
|
deployment *structs.Deployment
|
|
|
|
cancel bool
|
|
|
|
}{
|
|
|
|
{
|
|
|
|
name: "running deployment",
|
|
|
|
deployment: running,
|
|
|
|
cancel: true,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "failed deployment",
|
|
|
|
deployment: failed,
|
|
|
|
cancel: false,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, c := range cases {
|
|
|
|
t.Run(c.name, func(t *testing.T) {
|
|
|
|
// Create 10 allocations
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
c.deployment, allocs, nil, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
var updates []*structs.DeploymentStatusUpdate
|
|
|
|
if c.cancel {
|
|
|
|
updates = []*structs.DeploymentStatusUpdate{
|
|
|
|
{
|
|
|
|
DeploymentID: c.deployment.ID,
|
|
|
|
Status: structs.DeploymentStatusCancelled,
|
|
|
|
StatusDescription: structs.DeploymentStatusDescriptionNewerJob,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: updates,
|
|
|
|
place: 0,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Ignore: 10,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
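// The two cases above come down to a simple rule: an active deployment that
// belongs to an older job version gets cancelled, while a terminal (failed)
// deployment is left alone. A simplified sketch of that predicate follows;
// the name is hypothetical and the real reconciler also compares job create
// indexes before deciding.
func deploymentOutdatedByJob(d *structs.Deployment, job *structs.Job) bool {
	if d == nil || !d.Active() {
		return false
	}
	return job.Version > d.JobVersion
}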
|
|
|
|
|
2017-06-06 21:08:46 +00:00
|
|
|
// Tests the reconciler creates a deployment and does a rolling upgrade with
|
|
|
|
// destructive changes
|
|
|
|
func TestReconciler_CreateDeployment_RollingUpgrade_Destructive(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
|
|
|
|
// Create 10 allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-06-02 23:11:29 +00:00
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
DesiredTotal: 10,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: d,
|
|
|
|
deploymentUpdates: nil,
|
2017-07-15 23:31:33 +00:00
|
|
|
destructive: 4,
|
2017-06-02 23:11:29 +00:00
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
DestructiveUpdate: 4,
|
|
|
|
Ignore: 6,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
2017-07-15 23:31:33 +00:00
|
|
|
assertNamesHaveIndexes(t, intRange(0, 3), destructiveResultsToNames(r.destructiveUpdate))
|
2017-06-02 23:11:29 +00:00
|
|
|
}
|
|
|
|
|
2017-06-06 21:08:46 +00:00
|
|
|
// Tests the reconciler creates a deployment for inplace updates
|
|
|
|
func TestReconciler_CreateDeployment_RollingUpgrade_Inplace(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-03-23 23:55:21 +00:00
|
|
|
jobOld := mock.Job()
|
|
|
|
job := jobOld.Copy()
|
|
|
|
job.Version++
|
2017-06-06 21:08:46 +00:00
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
|
|
|
|
// Create 10 allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
2018-03-23 23:55:21 +00:00
|
|
|
alloc.Job = jobOld
|
2017-06-06 21:08:46 +00:00
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-06 21:08:46 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2017-06-06 21:08:46 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-06-06 21:08:46 +00:00
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
DesiredTotal: 10,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: d,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 0,
|
|
|
|
inplace: 10,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
InPlaceUpdate: 10,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2018-06-05 18:58:53 +00:00
|
|
|
// Tests the reconciler creates a deployment when the job has a newer create index
|
|
|
|
func TestReconciler_CreateDeployment_NewerCreateIndex(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-06-05 18:58:53 +00:00
|
|
|
jobOld := mock.Job()
|
|
|
|
job := jobOld.Copy()
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
2018-06-05 22:29:59 +00:00
|
|
|
job.CreateIndex += 100
|
2018-06-05 18:58:53 +00:00
|
|
|
|
|
|
|
// Create 5 allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 5; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = jobOld
|
|
|
|
alloc.JobID = jobOld.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2018-06-05 18:58:53 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2018-06-05 18:58:53 +00:00
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
DesiredTotal: 5,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: d,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 5,
|
|
|
|
destructive: 0,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
InPlaceUpdate: 0,
|
|
|
|
Ignore: 5,
|
|
|
|
Place: 5,
|
|
|
|
DestructiveUpdate: 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
}
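// Note on the expectations above: the five existing allocations reference a
// job with an older CreateIndex, so the reconciler treats them as belonging
// to a previous registration of the job and ignores them rather than updating
// them in place or destructively. Five fresh placements bring the group up to
// its count of ten, and the new deployment only tracks those placements
// (DesiredTotal: 5).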
|
|
|
|
|
2017-06-06 21:08:46 +00:00
|
|
|
// Tests the reconciler doesn't create a deployment if there are no changes
|
|
|
|
func TestReconciler_DontCreateDeployment_NoChanges(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-06 21:08:46 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
|
|
|
|
// Create 10 allocations from the job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-06 21:08:46 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2017-06-06 21:08:46 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 0,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
DestructiveUpdate: 0,
|
|
|
|
Ignore: 10,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
// Tests the reconciler doesn't place any more canaries when the deployment is
|
|
|
|
// paused or failed
|
|
|
|
func TestReconciler_PausedOrFailedDeployment_NoMoreCanaries(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = canaryUpdate
|
|
|
|
|
|
|
|
cases := []struct {
|
|
|
|
name string
|
|
|
|
deploymentStatus string
|
2017-07-05 19:50:40 +00:00
|
|
|
stop uint64
|
2017-06-02 23:11:29 +00:00
|
|
|
}{
|
|
|
|
{
|
|
|
|
name: "paused deployment",
|
|
|
|
deploymentStatus: structs.DeploymentStatusPaused,
|
2017-07-05 19:50:40 +00:00
|
|
|
stop: 0,
|
2017-06-02 23:11:29 +00:00
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "failed deployment",
|
|
|
|
deploymentStatus: structs.DeploymentStatusFailed,
|
2017-07-05 19:50:40 +00:00
|
|
|
stop: 1,
|
2017-06-02 23:11:29 +00:00
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, c := range cases {
|
|
|
|
t.Run(c.name, func(t *testing.T) {
|
2017-07-05 19:50:40 +00:00
|
|
|
// Create a deployment that is paused/failed and has placed some canaries
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-06-02 23:11:29 +00:00
|
|
|
d.Status = c.deploymentStatus
|
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
Promoted: false,
|
|
|
|
DesiredCanaries: 2,
|
|
|
|
DesiredTotal: 10,
|
|
|
|
PlacedAllocs: 1,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create 10 allocations for the original job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create one canary
|
|
|
|
canary := mock.Alloc()
|
|
|
|
canary.Job = job
|
|
|
|
canary.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
canary.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
canary.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, 0)
|
|
|
|
canary.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
canary.DeploymentID = d.ID
|
|
|
|
allocs = append(allocs, canary)
|
2017-07-05 19:50:40 +00:00
|
|
|
d.TaskGroups[canary.TaskGroup].PlacedCanaries = []string{canary.ID}
|
2017-06-02 23:11:29 +00:00
|
|
|
|
|
|
|
mockUpdateFn := allocUpdateFnMock(map[string]allocUpdateType{canary.ID: allocUpdateFnIgnore}, allocUpdateFnDestructive)
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 0,
|
|
|
|
inplace: 0,
|
2017-07-05 19:50:40 +00:00
|
|
|
stop: int(c.stop),
|
2017-06-02 23:11:29 +00:00
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
2017-07-05 19:50:40 +00:00
|
|
|
Ignore: 11 - c.stop,
|
|
|
|
Stop: c.stop,
|
2017-06-02 23:11:29 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
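// Note on the expectations above: eleven allocations exist (ten from the
// original job plus one canary). A paused deployment leaves everything alone
// (Ignore: 11), while a failed deployment stops its canary (Stop: 1) and
// ignores the remaining ten, which is why Ignore is written as 11 - c.stop.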
|
|
|
|
|
|
|
|
// Tests the reconciler doesn't place any more allocs when the deployment is
|
|
|
|
// paused or failed
|
|
|
|
func TestReconciler_PausedOrFailedDeployment_NoMorePlacements(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
job.TaskGroups[0].Count = 15
|
|
|
|
|
|
|
|
cases := []struct {
|
|
|
|
name string
|
|
|
|
deploymentStatus string
|
|
|
|
}{
|
|
|
|
{
|
|
|
|
name: "paused deployment",
|
|
|
|
deploymentStatus: structs.DeploymentStatusPaused,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "failed deployment",
|
|
|
|
deploymentStatus: structs.DeploymentStatusFailed,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, c := range cases {
|
|
|
|
t.Run(c.name, func(t *testing.T) {
|
|
|
|
// Create a deployment that is paused/failed and has placed some allocs
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-06-02 23:11:29 +00:00
|
|
|
d.Status = c.deploymentStatus
|
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
Promoted: false,
|
|
|
|
DesiredTotal: 15,
|
|
|
|
PlacedAllocs: 10,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create 10 allocations for the new job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 0,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Ignore: 10,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tests the reconciler doesn't do any more destructive updates when the
|
|
|
|
// deployment is paused or failed
|
|
|
|
func TestReconciler_PausedOrFailedDeployment_NoMoreDestructiveUpdates(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
|
|
|
|
cases := []struct {
|
|
|
|
name string
|
|
|
|
deploymentStatus string
|
|
|
|
}{
|
|
|
|
{
|
|
|
|
name: "paused deployment",
|
|
|
|
deploymentStatus: structs.DeploymentStatusPaused,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "failed deployment",
|
|
|
|
deploymentStatus: structs.DeploymentStatusFailed,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, c := range cases {
|
|
|
|
t.Run(c.name, func(t *testing.T) {
|
|
|
|
// Create a deployment that is paused/failed and has placed an alloc
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-06-02 23:11:29 +00:00
|
|
|
d.Status = c.deploymentStatus
|
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
Promoted: false,
|
|
|
|
DesiredTotal: 10,
|
|
|
|
PlacedAllocs: 1,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create 9 allocations for the original job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 1; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create one for the new job
|
|
|
|
newAlloc := mock.Alloc()
|
|
|
|
newAlloc.Job = job
|
|
|
|
newAlloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
newAlloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
newAlloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, 0)
|
|
|
|
newAlloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
newAlloc.DeploymentID = d.ID
|
|
|
|
allocs = append(allocs, newAlloc)
|
|
|
|
|
|
|
|
mockUpdateFn := allocUpdateFnMock(map[string]allocUpdateType{newAlloc.ID: allocUpdateFnIgnore}, allocUpdateFnDestructive)
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 0,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Ignore: 10,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tests the reconciler handles migrating a canary correctly on a draining node
|
|
|
|
func TestReconciler_DrainNode_Canary(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = canaryUpdate
|
|
|
|
|
|
|
|
// Create a deployment that has placed some canaries
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-07-05 19:50:40 +00:00
|
|
|
s := &structs.DeploymentState{
|
2017-06-02 23:11:29 +00:00
|
|
|
Promoted: false,
|
|
|
|
DesiredTotal: 10,
|
|
|
|
DesiredCanaries: 2,
|
|
|
|
PlacedAllocs: 2,
|
|
|
|
}
|
2017-07-05 19:50:40 +00:00
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = s
|
2017-06-02 23:11:29 +00:00
|
|
|
|
|
|
|
// Create 10 allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create two canaries for the new job
|
|
|
|
handled := make(map[string]allocUpdateType)
|
|
|
|
for i := 0; i < 2; i++ {
|
|
|
|
// Create one canary
|
|
|
|
canary := mock.Alloc()
|
|
|
|
canary.Job = job
|
|
|
|
canary.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
canary.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
canary.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
canary.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
canary.DeploymentID = d.ID
|
2017-07-05 19:50:40 +00:00
|
|
|
s.PlacedCanaries = append(s.PlacedCanaries, canary.ID)
|
2017-06-02 23:11:29 +00:00
|
|
|
allocs = append(allocs, canary)
|
|
|
|
handled[canary.ID] = allocUpdateFnIgnore
|
|
|
|
}
|
|
|
|
|
|
|
|
// Build a map of tainted nodes that contains the last canary
|
|
|
|
tainted := make(map[string]*structs.Node, 1)
|
2021-02-11 15:40:59 +00:00
|
|
|
n := mock.DrainNode()
|
2017-06-02 23:11:29 +00:00
|
|
|
n.ID = allocs[11].NodeID
|
2022-08-17 16:26:34 +00:00
|
|
|
allocs[11].DesiredTransition.Migrate = pointer.Of(true)
|
2017-06-02 23:11:29 +00:00
|
|
|
tainted[n.ID] = n
|
|
|
|
|
|
|
|
mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive)
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, tainted, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 1,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 1,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Canary: 1,
|
|
|
|
Ignore: 11,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
assertNamesHaveIndexes(t, intRange(1, 1), stopResultsToNames(r.stop))
|
|
|
|
assertNamesHaveIndexes(t, intRange(1, 1), placeResultsToNames(r.place))
|
|
|
|
}
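// The drain and lost-node tests above and below all build their tainted-node
// maps by hand. A shared helper along these lines could make the two flavors
// explicit; the name and shape are hypothetical and shown only as a sketch.
func taintNodeFor(allocs []*structs.Allocation, idx int, down bool) map[string]*structs.Node {
	var n *structs.Node
	if down {
		// A down node loses its allocations outright.
		n = mock.Node()
		n.Status = structs.NodeStatusDown
	} else {
		// A draining node migrates its allocations instead.
		n = mock.DrainNode()
		allocs[idx].DesiredTransition.Migrate = pointer.Of(true)
	}
	n.ID = allocs[idx].NodeID
	return map[string]*structs.Node{n.ID: n}
}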
|
|
|
|
|
|
|
|
// Tests the reconciler handles replacing a canary correctly on a lost node
|
|
|
|
func TestReconciler_LostNode_Canary(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = canaryUpdate
|
|
|
|
|
|
|
|
// Create a deployment that has placed some canaries
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-07-05 19:50:40 +00:00
|
|
|
s := &structs.DeploymentState{
|
2017-06-02 23:11:29 +00:00
|
|
|
Promoted: false,
|
|
|
|
DesiredTotal: 10,
|
|
|
|
DesiredCanaries: 2,
|
|
|
|
PlacedAllocs: 2,
|
|
|
|
}
|
2017-07-05 19:50:40 +00:00
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = s
|
2017-06-02 23:11:29 +00:00
|
|
|
|
|
|
|
// Create 10 allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create two canaries for the new job
|
|
|
|
handled := make(map[string]allocUpdateType)
|
|
|
|
for i := 0; i < 2; i++ {
|
|
|
|
// Create one canary
|
|
|
|
canary := mock.Alloc()
|
|
|
|
canary.Job = job
|
|
|
|
canary.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
canary.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
canary.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
canary.TaskGroup = job.TaskGroups[0].Name
|
2017-07-05 19:50:40 +00:00
|
|
|
s.PlacedCanaries = append(s.PlacedCanaries, canary.ID)
|
2017-06-02 23:11:29 +00:00
|
|
|
canary.DeploymentID = d.ID
|
|
|
|
allocs = append(allocs, canary)
|
|
|
|
handled[canary.ID] = allocUpdateFnIgnore
|
|
|
|
}
|
|
|
|
|
|
|
|
// Build a map of tainted nodes that contains the last canary
|
|
|
|
tainted := make(map[string]*structs.Node, 1)
|
|
|
|
n := mock.Node()
|
|
|
|
n.ID = allocs[11].NodeID
|
|
|
|
n.Status = structs.NodeStatusDown
|
|
|
|
tainted[n.ID] = n
|
|
|
|
|
|
|
|
mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive)
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, tainted, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 1,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 1,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Canary: 1,
|
|
|
|
Ignore: 11,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
assertNamesHaveIndexes(t, intRange(1, 1), stopResultsToNames(r.stop))
|
|
|
|
assertNamesHaveIndexes(t, intRange(1, 1), placeResultsToNames(r.place))
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tests the reconciler handles stopping canaries from older deployments
|
|
|
|
func TestReconciler_StopOldCanaries(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = canaryUpdate
|
|
|
|
|
|
|
|
// Create an old deployment that has placed some canaries
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-07-05 19:50:40 +00:00
|
|
|
s := &structs.DeploymentState{
|
2017-06-02 23:11:29 +00:00
|
|
|
Promoted: false,
|
|
|
|
DesiredTotal: 10,
|
|
|
|
DesiredCanaries: 2,
|
|
|
|
PlacedAllocs: 2,
|
|
|
|
}
|
2017-07-05 19:50:40 +00:00
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = s
|
2017-06-02 23:11:29 +00:00
|
|
|
|
|
|
|
// Update the job
|
2017-07-07 02:55:58 +00:00
|
|
|
job.Version += 10
|
2017-06-02 23:11:29 +00:00
|
|
|
|
|
|
|
// Create 10 allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create canaries
|
|
|
|
for i := 0; i < 2; i++ {
|
|
|
|
// Create one canary
|
|
|
|
canary := mock.Alloc()
|
|
|
|
canary.Job = job
|
|
|
|
canary.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
canary.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
canary.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
canary.TaskGroup = job.TaskGroups[0].Name
|
2017-07-05 19:50:40 +00:00
|
|
|
s.PlacedCanaries = append(s.PlacedCanaries, canary.ID)
|
2017-06-02 23:11:29 +00:00
|
|
|
canary.DeploymentID = d.ID
|
|
|
|
allocs = append(allocs, canary)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, d,
|
2022-03-07 18:40:57 +00:00
|
|
|
allocs, nil, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
newD := structs.NewDeployment(job, 50)
|
2017-07-07 06:30:46 +00:00
|
|
|
newD.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
|
2017-06-02 23:11:29 +00:00
|
|
|
newD.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
DesiredCanaries: 2,
|
|
|
|
DesiredTotal: 10,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: newD,
|
|
|
|
deploymentUpdates: []*structs.DeploymentStatusUpdate{
|
|
|
|
{
|
|
|
|
DeploymentID: d.ID,
|
|
|
|
Status: structs.DeploymentStatusCancelled,
|
|
|
|
StatusDescription: structs.DeploymentStatusDescriptionNewerJob,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
place: 2,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 2,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Canary: 2,
|
|
|
|
Stop: 2,
|
|
|
|
Ignore: 10,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop))
|
|
|
|
assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place))
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tests the reconciler creates new canaries when the job changes
|
|
|
|
func TestReconciler_NewCanaries(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = canaryUpdate
|
|
|
|
|
|
|
|
// Create 10 allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
newD := structs.NewDeployment(job, 50)
|
2017-07-07 06:30:46 +00:00
|
|
|
newD.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
|
2017-06-02 23:11:29 +00:00
|
|
|
newD.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
DesiredCanaries: 2,
|
|
|
|
DesiredTotal: 10,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: newD,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 2,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Canary: 2,
|
|
|
|
Ignore: 10,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place))
|
|
|
|
}
|
|
|
|
|
2018-04-20 00:08:24 +00:00
|
|
|
// Tests the reconciler creates new canaries when the job changes and the
|
|
|
|
// canary count is greater than the task group count
|
|
|
|
func TestReconciler_NewCanaries_CountGreater(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-04-20 00:08:24 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Count = 3
|
|
|
|
job.TaskGroups[0].Update = canaryUpdate.Copy()
|
|
|
|
job.TaskGroups[0].Update.Canary = 7
|
|
|
|
|
|
|
|
// Create 3 allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 3; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2018-04-20 00:08:24 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
newD := structs.NewDeployment(job, 50)
|
2018-04-20 00:08:24 +00:00
|
|
|
newD.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
|
|
|
|
state := &structs.DeploymentState{
|
|
|
|
DesiredCanaries: 7,
|
|
|
|
DesiredTotal: 3,
|
|
|
|
}
|
|
|
|
newD.TaskGroups[job.TaskGroups[0].Name] = state
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: newD,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 7,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Canary: 7,
|
|
|
|
Ignore: 3,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
assertNamesHaveIndexes(t, intRange(0, 2, 3, 6), placeResultsToNames(r.place))
|
|
|
|
}
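// Note on the expectations above: the group count is 3 but the update block
// asks for 7 canaries, so the reconciler still places all 7 and records
// DesiredCanaries: 7 against DesiredTotal: 3. The canary names use indexes
// 0 through 6, which is what the two ranges in the placement assertion cover.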
|
|
|
|
|
2017-07-25 18:27:47 +00:00
|
|
|
// Tests the reconciler creates new canaries when the job changes for multiple
|
|
|
|
// task groups
|
|
|
|
func TestReconciler_NewCanaries_MultiTG(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-07-25 18:27:47 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = canaryUpdate
|
|
|
|
job.TaskGroups = append(job.TaskGroups, job.TaskGroups[0].Copy())
|
|
|
|
job.TaskGroups[0].Name = "tg2"
|
|
|
|
|
|
|
|
// Create 10 allocations from the old job for each tg
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for j := 0; j < 2; j++ {
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-07-25 18:27:47 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[j].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[j].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2017-07-25 18:27:47 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
newD := structs.NewDeployment(job, 50)
|
2017-07-25 18:27:47 +00:00
|
|
|
newD.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
|
|
|
|
state := &structs.DeploymentState{
|
|
|
|
DesiredCanaries: 2,
|
|
|
|
DesiredTotal: 10,
|
|
|
|
}
|
|
|
|
newD.TaskGroups[job.TaskGroups[0].Name] = state
|
|
|
|
newD.TaskGroups[job.TaskGroups[1].Name] = state.Copy()
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: newD,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 4,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Canary: 2,
|
|
|
|
Ignore: 10,
|
|
|
|
},
|
|
|
|
job.TaskGroups[1].Name: {
|
|
|
|
Canary: 2,
|
|
|
|
Ignore: 10,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
assertNamesHaveIndexes(t, intRange(0, 1, 0, 1), placeResultsToNames(r.place))
|
|
|
|
}
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
// Tests the reconciler creates new canaries when the job changes and scales up
|
|
|
|
func TestReconciler_NewCanaries_ScaleUp(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
// Scale the job up to 15
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = canaryUpdate
|
|
|
|
job.TaskGroups[0].Count = 15
|
|
|
|
|
|
|
|
// Create 10 allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
newD := structs.NewDeployment(job, 50)
|
2017-07-07 06:30:46 +00:00
|
|
|
newD.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
|
2017-06-02 23:11:29 +00:00
|
|
|
newD.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
DesiredCanaries: 2,
|
|
|
|
DesiredTotal: 15,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: newD,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 2,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Canary: 2,
|
|
|
|
Ignore: 10,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place))
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tests the reconciler creates new canaries when the job changes and scales
|
|
|
|
// down
|
|
|
|
func TestReconciler_NewCanaries_ScaleDown(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
// Scale the job down to 5
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = canaryUpdate
|
|
|
|
job.TaskGroups[0].Count = 5
|
|
|
|
|
|
|
|
// Create 10 allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
newD := structs.NewDeployment(job, 50)
|
2017-07-07 06:30:46 +00:00
|
|
|
newD.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
|
2017-06-02 23:11:29 +00:00
|
|
|
newD.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
DesiredCanaries: 2,
|
|
|
|
DesiredTotal: 5,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: newD,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 2,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 5,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Canary: 2,
|
|
|
|
Stop: 5,
|
|
|
|
Ignore: 5,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place))
|
|
|
|
assertNamesHaveIndexes(t, intRange(5, 9), stopResultsToNames(r.stop))
|
|
|
|
}
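// Note on the expectations above: the count drops from 10 to 5 while the
// update introduces canaries, so the reconciler places 2 canaries for the new
// version (indexes 0-1), stops the 5 highest-indexed old allocations (5-9) to
// honor the new count, and ignores the remaining 5 until promotion.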
|
|
|
|
|
|
|
|
// Tests the reconciler handles filling the names of partially placed canaries
|
|
|
|
func TestReconciler_NewCanaries_FillNames(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = &structs.UpdateStrategy{
|
|
|
|
Canary: 4,
|
|
|
|
MaxParallel: 2,
|
|
|
|
HealthCheck: structs.UpdateStrategyHealthCheck_Checks,
|
|
|
|
MinHealthyTime: 10 * time.Second,
|
|
|
|
HealthyDeadline: 10 * time.Minute,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create an existing deployment that has placed some canaries
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-07-05 19:50:40 +00:00
|
|
|
s := &structs.DeploymentState{
|
2017-06-02 23:11:29 +00:00
|
|
|
Promoted: false,
|
|
|
|
DesiredTotal: 10,
|
|
|
|
DesiredCanaries: 4,
|
|
|
|
PlacedAllocs: 2,
|
|
|
|
}
|
2017-07-05 19:50:40 +00:00
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = s
|
2017-06-02 23:11:29 +00:00
|
|
|
|
|
|
|
// Create 10 allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create two canaries, picking names at the ends (indexes 0 and 3)
|
|
|
|
for i := 0; i < 4; i += 3 {
|
|
|
|
// Create one canary
|
|
|
|
canary := mock.Alloc()
|
|
|
|
canary.Job = job
|
|
|
|
canary.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
canary.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
canary.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
canary.TaskGroup = job.TaskGroups[0].Name
|
2017-07-05 19:50:40 +00:00
|
|
|
s.PlacedCanaries = append(s.PlacedCanaries, canary.ID)
|
2017-06-02 23:11:29 +00:00
|
|
|
canary.DeploymentID = d.ID
|
|
|
|
allocs = append(allocs, canary)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 2,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Canary: 2,
|
|
|
|
Ignore: 12,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
assertNamesHaveIndexes(t, intRange(1, 2), placeResultsToNames(r.place))
|
|
|
|
}
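// Note on the expectations above: four canary names are desired and the two
// existing canaries already occupy indexes 0 and 3, so the reconciler fills
// in the missing names at indexes 1 and 2, which is exactly what the
// placement assertion checks.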
|
|
|
|
|
|
|
|
// Tests the reconciler handles canary promotion by unblocking max_parallel
|
|
|
|
func TestReconciler_PromoteCanaries_Unblock(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = canaryUpdate
|
|
|
|
|
|
|
|
// Create an existing deployment that has placed some canaries and mark them
|
|
|
|
// promoted
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-07-05 19:50:40 +00:00
|
|
|
s := &structs.DeploymentState{
|
2017-06-02 23:11:29 +00:00
|
|
|
Promoted: true,
|
|
|
|
DesiredTotal: 10,
|
|
|
|
DesiredCanaries: 2,
|
|
|
|
PlacedAllocs: 2,
|
|
|
|
}
|
2017-07-05 19:50:40 +00:00
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = s
|
2017-06-02 23:11:29 +00:00
|
|
|
|
|
|
|
// Create 10 allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create the canaries
|
|
|
|
handled := make(map[string]allocUpdateType)
|
|
|
|
for i := 0; i < 2; i++ {
|
|
|
|
// Create one canary
|
|
|
|
canary := mock.Alloc()
|
|
|
|
canary.Job = job
|
|
|
|
canary.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
canary.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
canary.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
canary.TaskGroup = job.TaskGroups[0].Name
|
2017-07-05 19:50:40 +00:00
|
|
|
s.PlacedCanaries = append(s.PlacedCanaries, canary.ID)
|
2017-06-02 23:11:29 +00:00
|
|
|
canary.DeploymentID = d.ID
|
|
|
|
canary.DeploymentStatus = &structs.AllocDeploymentStatus{
|
2022-08-17 16:26:34 +00:00
|
|
|
Healthy: pointer.Of(true),
|
2017-06-02 23:11:29 +00:00
|
|
|
}
|
|
|
|
allocs = append(allocs, canary)
|
|
|
|
handled[canary.ID] = allocUpdateFnIgnore
|
|
|
|
}
|
|
|
|
|
|
|
|
mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive)
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
2017-07-15 23:31:33 +00:00
|
|
|
destructive: 2,
|
|
|
|
stop: 2,
|
2017-06-02 23:11:29 +00:00
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Stop: 2,
|
|
|
|
DestructiveUpdate: 2,
|
|
|
|
Ignore: 8,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
2017-07-06 16:55:39 +00:00
|
|
|
assertNoCanariesStopped(t, d, r.stop)
|
2017-07-15 23:31:33 +00:00
|
|
|
assertNamesHaveIndexes(t, intRange(2, 3), destructiveResultsToNames(r.destructiveUpdate))
|
|
|
|
assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop))
|
2017-06-02 23:11:29 +00:00
|
|
|
}
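// Note on the expectations above: once the canaries are promoted the group is
// unblocked, so max_parallel (2) worth of destructive updates proceed on the
// next old allocations (indexes 2 and 3), and the two old allocations the
// canaries replaced (indexes 0 and 1) are stopped. The canaries themselves
// are never stopped, which assertNoCanariesStopped verifies.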
|
|
|
|
|
|
|
|
// Tests the reconciler correctly handles canary promotion when the canary
// count equals the group count
|
|
|
|
func TestReconciler_PromoteCanaries_CanariesEqualCount(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = canaryUpdate
|
|
|
|
job.TaskGroups[0].Count = 2
|
|
|
|
|
|
|
|
// Create an existing deployment that has placed some canaries and mark them
|
|
|
|
// promoted
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-07-05 19:50:40 +00:00
|
|
|
s := &structs.DeploymentState{
|
2017-06-02 23:11:29 +00:00
|
|
|
Promoted: true,
|
|
|
|
DesiredTotal: 2,
|
|
|
|
DesiredCanaries: 2,
|
|
|
|
PlacedAllocs: 2,
|
2018-04-08 23:09:14 +00:00
|
|
|
HealthyAllocs: 2,
|
2017-06-02 23:11:29 +00:00
|
|
|
}
|
2017-07-05 19:50:40 +00:00
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = s
|
2017-06-02 23:11:29 +00:00
|
|
|
|
|
|
|
// Create 2 allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 2; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create the canaries
|
|
|
|
handled := make(map[string]allocUpdateType)
|
|
|
|
for i := 0; i < 2; i++ {
|
|
|
|
// Create one canary
|
|
|
|
canary := mock.Alloc()
|
|
|
|
canary.Job = job
|
|
|
|
canary.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
canary.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
canary.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
canary.TaskGroup = job.TaskGroups[0].Name
|
2017-07-05 19:50:40 +00:00
|
|
|
s.PlacedCanaries = append(s.PlacedCanaries, canary.ID)
|
2017-06-02 23:11:29 +00:00
|
|
|
canary.DeploymentID = d.ID
|
|
|
|
canary.DeploymentStatus = &structs.AllocDeploymentStatus{
|
2022-08-17 16:26:34 +00:00
|
|
|
Healthy: pointer.Of(true),
|
2017-06-02 23:11:29 +00:00
|
|
|
}
|
|
|
|
allocs = append(allocs, canary)
|
|
|
|
handled[canary.ID] = allocUpdateFnIgnore
|
|
|
|
}
|
|
|
|
|
|
|
|
mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive)
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
2017-07-06 15:39:16 +00:00
|
|
|
updates := []*structs.DeploymentStatusUpdate{
|
|
|
|
{
|
|
|
|
DeploymentID: d.ID,
|
|
|
|
Status: structs.DeploymentStatusSuccessful,
|
|
|
|
StatusDescription: structs.DeploymentStatusDescriptionSuccessful,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
2017-07-06 15:39:16 +00:00
|
|
|
deploymentUpdates: updates,
|
2017-06-02 23:11:29 +00:00
|
|
|
place: 0,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 2,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Stop: 2,
|
|
|
|
Ignore: 2,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
2017-07-06 16:55:39 +00:00
|
|
|
assertNoCanariesStopped(t, d, r.stop)
|
2017-06-02 23:11:29 +00:00
|
|
|
assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop))
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tests the reconciler checks the health of placed allocs to determine the
|
|
|
|
// limit
|
|
|
|
func TestReconciler_DeploymentLimit_HealthAccounting(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
|
|
|
|
cases := []struct {
|
|
|
|
healthy int
|
|
|
|
}{
|
|
|
|
{
|
|
|
|
healthy: 0,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
healthy: 1,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
healthy: 2,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
healthy: 3,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
healthy: 4,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, c := range cases {
|
|
|
|
t.Run(fmt.Sprintf("%d healthy", c.healthy), func(t *testing.T) {
|
|
|
|
// Create an existing deployment that has placed some allocs and mark it
// promoted
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-06-02 23:11:29 +00:00
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
Promoted: true,
|
|
|
|
DesiredTotal: 10,
|
|
|
|
PlacedAllocs: 4,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create 6 allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 4; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create the new allocs
|
|
|
|
handled := make(map[string]allocUpdateType)
|
|
|
|
for i := 0; i < 4; i++ {
|
|
|
|
new := mock.Alloc()
|
|
|
|
new.Job = job
|
|
|
|
new.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
new.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
new.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
new.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
new.DeploymentID = d.ID
|
|
|
|
if i < c.healthy {
|
|
|
|
new.DeploymentStatus = &structs.AllocDeploymentStatus{
|
2022-08-17 16:26:34 +00:00
|
|
|
Healthy: pointer.Of(true),
|
2017-06-02 23:11:29 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
allocs = append(allocs, new)
|
|
|
|
handled[new.ID] = allocUpdateFnIgnore
|
|
|
|
}
|
|
|
|
|
|
|
|
mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive)
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
2017-07-15 23:31:33 +00:00
|
|
|
destructive: c.healthy,
|
2017-06-02 23:11:29 +00:00
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
DestructiveUpdate: uint64(c.healthy),
|
|
|
|
Ignore: uint64(10 - c.healthy),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
if c.healthy != 0 {
|
2017-07-15 23:31:33 +00:00
|
|
|
assertNamesHaveIndexes(t, intRange(4, 3+c.healthy), destructiveResultsToNames(r.destructiveUpdate))
|
2017-06-02 23:11:29 +00:00
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
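// The deployment tests above repeatedly mark an allocation healthy and tie it
// to a deployment by hand. A tiny helper could make that pattern explicit;
// the name is hypothetical and this is only a sketch.
func markDeploymentHealthy(alloc *structs.Allocation, d *structs.Deployment) {
	alloc.DeploymentID = d.ID
	alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
		Healthy: pointer.Of(true),
	}
}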
|
|
|
|
|
|
|
|
// Tests the reconciler handles an alloc on a tainted node during a rolling
|
|
|
|
// update
|
|
|
|
func TestReconciler_TaintedNode_RollingUpgrade(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
|
|
|
|
// Create an existing deployment that has some placed allocs
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-06-02 23:11:29 +00:00
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
Promoted: true,
|
|
|
|
DesiredTotal: 10,
|
2017-07-15 23:31:33 +00:00
|
|
|
PlacedAllocs: 7,
|
2017-06-02 23:11:29 +00:00
|
|
|
}
|
|
|
|
|
2017-08-21 19:41:19 +00:00
|
|
|
// Create 2 allocations from the old job
|
2017-06-02 23:11:29 +00:00
|
|
|
var allocs []*structs.Allocation
|
2017-08-21 19:41:19 +00:00
|
|
|
for i := 8; i < 10; i++ {
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create the healthy replacements
|
|
|
|
handled := make(map[string]allocUpdateType)
|
2017-08-21 19:41:19 +00:00
|
|
|
for i := 0; i < 8; i++ {
|
2017-06-02 23:11:29 +00:00
|
|
|
new := mock.Alloc()
|
|
|
|
new.Job = job
|
|
|
|
new.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
new.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
new.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
new.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
new.DeploymentID = d.ID
|
|
|
|
new.DeploymentStatus = &structs.AllocDeploymentStatus{
|
2022-08-17 16:26:34 +00:00
|
|
|
Healthy: pointer.Of(true),
|
2017-06-02 23:11:29 +00:00
|
|
|
}
|
|
|
|
allocs = append(allocs, new)
|
|
|
|
handled[new.ID] = allocUpdateFnIgnore
|
|
|
|
}
|
|
|
|
|
|
|
|
// Build a map of tainted nodes
|
2017-07-07 18:42:51 +00:00
|
|
|
tainted := make(map[string]*structs.Node, 3)
|
|
|
|
for i := 0; i < 3; i++ {
|
2017-06-02 23:11:29 +00:00
|
|
|
n := mock.Node()
|
2017-08-21 19:41:19 +00:00
|
|
|
n.ID = allocs[2+i].NodeID
|
2017-06-02 23:11:29 +00:00
|
|
|
if i == 0 {
|
|
|
|
n.Status = structs.NodeStatusDown
|
|
|
|
} else {
|
2021-02-11 15:40:59 +00:00
|
|
|
n.DrainStrategy = mock.DrainNode().DrainStrategy
|
2022-08-17 16:26:34 +00:00
|
|
|
allocs[2+i].DesiredTransition.Migrate = pointer.Of(true)
|
2017-06-02 23:11:29 +00:00
|
|
|
}
|
|
|
|
tainted[n.ID] = n
|
|
|
|
}
|
|
|
|
|
|
|
|
mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive)
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, tainted, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
2018-03-12 20:44:33 +00:00
|
|
|
place: 3,
|
2017-08-21 19:41:19 +00:00
|
|
|
destructive: 2,
|
2018-03-12 20:44:33 +00:00
|
|
|
stop: 3,
|
2017-06-02 23:11:29 +00:00
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 1, // Place the lost
|
|
|
|
Stop: 1, // Stop the lost
|
2018-03-12 20:44:33 +00:00
|
|
|
Migrate: 2, // Migrate the tainted
|
2017-08-21 19:41:19 +00:00
|
|
|
DestructiveUpdate: 2,
|
2018-03-12 20:44:33 +00:00
|
|
|
Ignore: 5,
|
2017-06-02 23:11:29 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
2017-08-21 19:41:19 +00:00
|
|
|
assertNamesHaveIndexes(t, intRange(8, 9), destructiveResultsToNames(r.destructiveUpdate))
|
2018-03-12 20:44:33 +00:00
|
|
|
assertNamesHaveIndexes(t, intRange(0, 2), placeResultsToNames(r.place))
|
|
|
|
assertNamesHaveIndexes(t, intRange(0, 2), stopResultsToNames(r.stop))
|
2017-06-02 23:11:29 +00:00
|
|
|
}
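// Note on the expectations above: the two allocations still on the old job
// (names 8 and 9) receive destructive updates, the downed node loses its
// allocation (stop and place for name 0), the two draining nodes migrate
// theirs (names 1 and 2, also surfacing as stop and place), and the five
// untouched healthy replacements (names 3-7) are ignored.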
|
|
|
|
|
2018-09-10 22:28:45 +00:00
|
|
|
// Tests the reconciler handles a failed deployment with allocs on tainted
|
|
|
|
// nodes
|
|
|
|
func TestReconciler_FailedDeployment_TaintedNodes(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
|
|
|
|
// Create an existing failed deployment that has some placed allocs
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-06-02 23:11:29 +00:00
|
|
|
d.Status = structs.DeploymentStatusFailed
|
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
Promoted: true,
|
|
|
|
DesiredTotal: 10,
|
|
|
|
PlacedAllocs: 4,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create 6 allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 4; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create the healthy replacements
|
|
|
|
handled := make(map[string]allocUpdateType)
|
|
|
|
for i := 0; i < 4; i++ {
|
|
|
|
new := mock.Alloc()
|
|
|
|
new.Job = job
|
|
|
|
new.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
new.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
new.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
new.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
new.DeploymentID = d.ID
|
|
|
|
new.DeploymentStatus = &structs.AllocDeploymentStatus{
|
2022-08-17 16:26:34 +00:00
|
|
|
Healthy: pointer.Of(true),
|
2017-06-02 23:11:29 +00:00
|
|
|
}
|
|
|
|
allocs = append(allocs, new)
|
|
|
|
handled[new.ID] = allocUpdateFnIgnore
|
|
|
|
}
|
|
|
|
|
|
|
|
// Build a map of tainted nodes
|
|
|
|
tainted := make(map[string]*structs.Node, 2)
|
|
|
|
for i := 0; i < 2; i++ {
|
|
|
|
n := mock.Node()
|
|
|
|
n.ID = allocs[6+i].NodeID
|
|
|
|
if i == 0 {
|
|
|
|
n.Status = structs.NodeStatusDown
|
|
|
|
} else {
|
2021-02-11 15:40:59 +00:00
|
|
|
n.DrainStrategy = mock.DrainNode().DrainStrategy
|
2022-08-17 16:26:34 +00:00
|
|
|
allocs[6+i].DesiredTransition.Migrate = pointer.Of(true)
|
2017-06-02 23:11:29 +00:00
|
|
|
}
|
|
|
|
tainted[n.ID] = n
|
|
|
|
}
|
|
|
|
|
|
|
|
mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive)
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, tainted, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
2018-09-10 22:28:45 +00:00
|
|
|
place: 2,
|
2017-06-02 23:11:29 +00:00
|
|
|
inplace: 0,
|
|
|
|
stop: 2,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
2018-09-10 22:28:45 +00:00
|
|
|
Place: 1,
|
|
|
|
Migrate: 1,
|
|
|
|
Stop: 1,
|
|
|
|
Ignore: 8,
|
2017-06-02 23:11:29 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
2018-09-10 22:28:45 +00:00
|
|
|
assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place))
|
2017-06-02 23:11:29 +00:00
|
|
|
assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop))
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tests the reconciler handles a run after a deployment has completed
|
|
|
|
// successfully.
|
|
|
|
func TestReconciler_CompleteDeployment(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = canaryUpdate
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-07-05 19:50:40 +00:00
|
|
|
d.Status = structs.DeploymentStatusSuccessful
|
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
Promoted: true,
|
|
|
|
DesiredTotal: 10,
|
|
|
|
DesiredCanaries: 2,
|
|
|
|
PlacedAllocs: 10,
|
|
|
|
HealthyAllocs: 10,
|
|
|
|
}
|
|
|
|
|
2017-06-02 23:11:29 +00:00
|
|
|
// Create allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-06-02 23:11:29 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
2017-07-05 19:50:40 +00:00
|
|
|
alloc.DeploymentID = d.ID
|
|
|
|
alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
|
2022-08-17 16:26:34 +00:00
|
|
|
Healthy: pointer.Of(true),
|
2017-06-02 23:11:29 +00:00
|
|
|
}
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2017-06-02 23:11:29 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 0,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Ignore: 10,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
}
|
2017-07-05 19:50:40 +00:00
|
|
|
|
2018-04-10 20:27:48 +00:00
|
|
|
// Tests that the reconciler marks a deployment as complete once there is
|
|
|
|
// nothing left to place even if there are failed allocations that are part of
|
|
|
|
// the deployment.
|
|
|
|
func TestReconciler_MarkDeploymentComplete_FailedAllocations(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-04-10 20:27:48 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2018-04-10 20:27:48 +00:00
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
DesiredTotal: 10,
|
|
|
|
PlacedAllocs: 20,
|
|
|
|
HealthyAllocs: 10,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create 10 healthy allocs and 10 allocs that are failed
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 20; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i%10))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
alloc.DeploymentID = d.ID
|
|
|
|
alloc.DeploymentStatus = &structs.AllocDeploymentStatus{}
|
|
|
|
if i < 10 {
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
2022-08-17 16:26:34 +00:00
|
|
|
alloc.DeploymentStatus.Healthy = pointer.Of(true)
|
2018-04-10 20:27:48 +00:00
|
|
|
} else {
|
|
|
|
alloc.DesiredStatus = structs.AllocDesiredStatusStop
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusFailed
|
2022-08-17 16:26:34 +00:00
|
|
|
alloc.DeploymentStatus.Healthy = pointer.Of(false)
|
2018-04-10 20:27:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID,
|
2022-03-07 18:40:57 +00:00
|
|
|
job, d, allocs, nil, "", 50, true)
|
2018-04-10 20:27:48 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
updates := []*structs.DeploymentStatusUpdate{
|
|
|
|
{
|
|
|
|
DeploymentID: d.ID,
|
|
|
|
Status: structs.DeploymentStatusSuccessful,
|
|
|
|
StatusDescription: structs.DeploymentStatusDescriptionSuccessful,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: updates,
|
|
|
|
place: 0,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Ignore: 10,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2017-07-05 19:50:40 +00:00
|
|
|
// Test that a failed deployment cancels non-promoted canaries
|
|
|
|
func TestReconciler_FailedDeployment_CancelCanaries(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-07-05 19:50:40 +00:00
|
|
|
// Create a job with two task groups
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = canaryUpdate
|
|
|
|
job.TaskGroups = append(job.TaskGroups, job.TaskGroups[0].Copy())
|
|
|
|
job.TaskGroups[1].Name = "two"
|
|
|
|
|
|
|
|
// Create an existing failed deployment that has promoted one task group
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-07-05 19:50:40 +00:00
|
|
|
d.Status = structs.DeploymentStatusFailed
|
|
|
|
s0 := &structs.DeploymentState{
|
|
|
|
Promoted: true,
|
|
|
|
DesiredTotal: 10,
|
|
|
|
DesiredCanaries: 2,
|
|
|
|
PlacedAllocs: 4,
|
|
|
|
}
|
|
|
|
s1 := &structs.DeploymentState{
|
|
|
|
Promoted: false,
|
|
|
|
DesiredTotal: 10,
|
|
|
|
DesiredCanaries: 2,
|
|
|
|
PlacedAllocs: 2,
|
|
|
|
}
|
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = s0
|
|
|
|
d.TaskGroups[job.TaskGroups[1].Name] = s1
|
|
|
|
|
|
|
|
// Create the allocations for both task groups
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
handled := make(map[string]allocUpdateType)
|
|
|
|
for _, group := range []int{0, 1} {
|
|
|
|
replacements := 4
|
|
|
|
state := s0
|
|
|
|
if group == 1 {
|
|
|
|
replacements = 2
|
|
|
|
state = s1
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create the healthy replacements
|
|
|
|
for i := 0; i < replacements; i++ {
|
|
|
|
new := mock.Alloc()
|
|
|
|
new.Job = job
|
|
|
|
new.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
new.NodeID = uuid.Generate()
|
2017-07-05 19:50:40 +00:00
|
|
|
new.Name = structs.AllocName(job.ID, job.TaskGroups[group].Name, uint(i))
|
|
|
|
new.TaskGroup = job.TaskGroups[group].Name
|
|
|
|
new.DeploymentID = d.ID
|
|
|
|
new.DeploymentStatus = &structs.AllocDeploymentStatus{
|
2022-08-17 16:26:34 +00:00
|
|
|
Healthy: pointer.Of(true),
|
2017-07-05 19:50:40 +00:00
|
|
|
}
|
|
|
|
allocs = append(allocs, new)
|
|
|
|
handled[new.ID] = allocUpdateFnIgnore
|
|
|
|
|
|
|
|
// Add the alloc to the canary list
|
|
|
|
if i < 2 {
|
|
|
|
state.PlacedCanaries = append(state.PlacedCanaries, new.ID)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for i := replacements; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-07-05 19:50:40 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[group].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[group].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive)
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2017-07-05 19:50:40 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 0,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 2,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Ignore: 10,
|
|
|
|
},
|
|
|
|
job.TaskGroups[1].Name: {
|
|
|
|
Stop: 2,
|
|
|
|
Ignore: 8,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop))
|
|
|
|
}
|
2017-07-05 19:55:51 +00:00
|
|
|
|
2017-07-06 02:46:57 +00:00
|
|
|
// Test that a failed deployment followed by an updated job creates a new deployment
|
|
|
|
func TestReconciler_FailedDeployment_NewJob(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-07-06 02:46:57 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
|
|
|
|
// Create an existing failed deployment that has some placed allocs
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-07-06 02:46:57 +00:00
|
|
|
d.Status = structs.DeploymentStatusFailed
|
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
Promoted: true,
|
|
|
|
DesiredTotal: 10,
|
|
|
|
PlacedAllocs: 4,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create 6 allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 4; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-07-06 02:46:57 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create the healthy replacements
|
|
|
|
for i := 0; i < 4; i++ {
|
|
|
|
new := mock.Alloc()
|
|
|
|
new.Job = job
|
|
|
|
new.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
new.NodeID = uuid.Generate()
|
2017-07-06 02:46:57 +00:00
|
|
|
new.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
new.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
new.DeploymentID = d.ID
|
|
|
|
new.DeploymentStatus = &structs.AllocDeploymentStatus{
|
2022-08-17 16:26:34 +00:00
|
|
|
Healthy: pointer.Of(true),
|
2017-07-06 02:46:57 +00:00
|
|
|
}
|
|
|
|
allocs = append(allocs, new)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Up the job version
|
|
|
|
jobNew := job.Copy()
|
2017-07-07 02:55:58 +00:00
|
|
|
jobNew.Version += 100
|
2017-07-06 02:46:57 +00:00
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, jobNew,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2017-07-06 02:46:57 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
dnew := structs.NewDeployment(jobNew, 50)
|
2017-07-06 02:46:57 +00:00
|
|
|
dnew.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
DesiredTotal: 10,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: dnew,
|
|
|
|
deploymentUpdates: nil,
|
2017-07-15 23:31:33 +00:00
|
|
|
destructive: 4,
|
2017-07-06 02:46:57 +00:00
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
DestructiveUpdate: 4,
|
|
|
|
Ignore: 6,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
2017-07-15 23:31:33 +00:00
|
|
|
assertNamesHaveIndexes(t, intRange(0, 3), destructiveResultsToNames(r.destructiveUpdate))
|
2017-07-06 02:46:57 +00:00
|
|
|
}
|
2017-07-06 15:39:16 +00:00
|
|
|
|
|
|
|
// Tests the reconciler marks a deployment as complete
|
|
|
|
func TestReconciler_MarkDeploymentComplete(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-07-06 15:39:16 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-07-06 15:39:16 +00:00
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
Promoted: true,
|
|
|
|
DesiredTotal: 10,
|
|
|
|
PlacedAllocs: 10,
|
|
|
|
HealthyAllocs: 10,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-07-06 15:39:16 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
alloc.DeploymentID = d.ID
|
|
|
|
alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
|
2022-08-17 16:26:34 +00:00
|
|
|
Healthy: pointer.Of(true),
|
2017-07-06 15:39:16 +00:00
|
|
|
}
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2017-07-06 15:39:16 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
updates := []*structs.DeploymentStatusUpdate{
|
|
|
|
{
|
|
|
|
DeploymentID: d.ID,
|
|
|
|
Status: structs.DeploymentStatusSuccessful,
|
|
|
|
StatusDescription: structs.DeploymentStatusDescriptionSuccessful,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: updates,
|
|
|
|
place: 0,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Ignore: 10,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
}
|
2017-07-07 18:42:51 +00:00
|
|
|
|
2017-07-19 18:08:45 +00:00
|
|
|
// Tests the reconciler handles changing a job such that a deployment is created
|
|
|
|
// while doing a scale up but as the second eval.
|
|
|
|
func TestReconciler_JobChange_ScaleUp_SecondEval(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-07-19 18:08:45 +00:00
|
|
|
// Scale the job up to 30
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
job.TaskGroups[0].Count = 30
|
|
|
|
|
|
|
|
// Create a deployment that has placed some allocs from the new job
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-07-19 18:08:45 +00:00
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
Promoted: false,
|
|
|
|
DesiredTotal: 30,
|
|
|
|
PlacedAllocs: 20,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create 10 allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-07-19 18:08:45 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create 20 from new job
|
|
|
|
handled := make(map[string]allocUpdateType)
|
|
|
|
for i := 10; i < 30; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.DeploymentID = d.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-07-19 18:08:45 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
handled[alloc.ID] = allocUpdateFnIgnore
|
|
|
|
}
|
|
|
|
|
|
|
|
mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive)
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2017-07-19 18:08:45 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
2017-08-07 21:13:05 +00:00
|
|
|
// All should be ignored because nothing has been marked as
|
2017-07-19 18:08:45 +00:00
|
|
|
// healthy.
|
|
|
|
Ignore: 30,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
}
|
2017-08-21 19:41:19 +00:00
|
|
|
|
|
|
|
// Tests the reconciler doesn't stop allocations when doing a rolling upgrade
|
|
|
|
// where the count of the old job allocs is < desired count.
|
|
|
|
func TestReconciler_RollingUpgrade_MissingAllocs(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-08-21 19:41:19 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
|
|
|
|
// Create 7 allocations from the old job
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 7; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
2017-09-29 16:58:48 +00:00
|
|
|
alloc.NodeID = uuid.Generate()
|
2017-08-21 19:41:19 +00:00
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2017-08-21 19:41:19 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2017-08-21 19:41:19 +00:00
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
DesiredTotal: 10,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: d,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 3,
|
|
|
|
destructive: 1,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 3,
|
|
|
|
DestructiveUpdate: 1,
|
|
|
|
Ignore: 6,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
assertNamesHaveIndexes(t, intRange(7, 9), placeResultsToNames(r.place))
|
|
|
|
assertNamesHaveIndexes(t, intRange(0, 0), destructiveResultsToNames(r.destructiveUpdate))
|
|
|
|
}
|
2017-10-12 23:29:07 +00:00
|
|
|
|
|
|
|
// Tests that the reconciler handles rerunning a batch job in the case that the
|
|
|
|
// allocations are from an older instance of the job.
|
|
|
|
func TestReconciler_Batch_Rerun(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2017-10-12 23:29:07 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.Type = structs.JobTypeBatch
|
|
|
|
job.TaskGroups[0].Update = nil
|
|
|
|
|
|
|
|
// Create 10 allocations from the old job and have them be complete
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusComplete
|
|
|
|
alloc.DesiredStatus = structs.AllocDesiredStatusStop
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create a copy of the job that is "new"
|
|
|
|
job2 := job.Copy()
|
|
|
|
job2.CreateIndex++
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job2.ID, job2,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2017-10-12 23:29:07 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 10,
|
|
|
|
destructive: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 10,
|
|
|
|
DestructiveUpdate: 0,
|
|
|
|
Ignore: 10,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
assertNamesHaveIndexes(t, intRange(0, 9), placeResultsToNames(r.place))
|
|
|
|
}
|
2018-01-19 17:58:59 +00:00
|
|
|
|
|
|
|
// Test that a failed deployment will not result in rescheduling failed allocations
|
|
|
|
func TestReconciler_FailedDeployment_DontReschedule(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-01-19 17:58:59 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
|
2018-03-02 00:23:44 +00:00
|
|
|
tgName := job.TaskGroups[0].Name
|
|
|
|
now := time.Now()
|
2018-01-19 17:58:59 +00:00
|
|
|
// Create an existing failed deployment that has some placed allocs
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2018-01-19 17:58:59 +00:00
|
|
|
d.Status = structs.DeploymentStatusFailed
|
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
Promoted: true,
|
|
|
|
DesiredTotal: 5,
|
|
|
|
PlacedAllocs: 4,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create 4 allocations and mark two as failed
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 4; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
2018-04-23 23:35:25 +00:00
|
|
|
alloc.DeploymentID = d.ID
|
2018-01-19 17:58:59 +00:00
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
2018-03-02 00:23:44 +00:00
|
|
|
|
|
|
|
// Create some allocations that are reschedulable now
|
2018-01-19 17:58:59 +00:00
|
|
|
allocs[2].ClientStatus = structs.AllocClientStatusFailed
|
2018-03-02 00:23:44 +00:00
|
|
|
allocs[2].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
|
|
|
|
StartedAt: now.Add(-1 * time.Hour),
|
|
|
|
FinishedAt: now.Add(-10 * time.Second)}}
|
|
|
|
|
2018-01-19 17:58:59 +00:00
|
|
|
allocs[3].ClientStatus = structs.AllocClientStatusFailed
|
2018-03-02 00:23:44 +00:00
|
|
|
allocs[3].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
|
|
|
|
StartedAt: now.Add(-1 * time.Hour),
|
|
|
|
FinishedAt: now.Add(-10 * time.Second)}}
|
2018-01-19 17:58:59 +00:00
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2018-01-19 17:58:59 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert that no rescheduled placements were created
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
place: 0,
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Ignore: 2,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
}
|
2018-02-02 23:22:37 +00:00
|
|
|
|
2018-04-08 23:09:14 +00:00
|
|
|
// Test that a running deployment with failed allocs will not result in
|
|
|
|
// rescheduling failed allocations unless they are marked as reschedulable.
|
2018-02-02 23:22:37 +00:00
|
|
|
func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-02-02 23:22:37 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
2018-03-02 00:23:44 +00:00
|
|
|
tgName := job.TaskGroups[0].Name
|
|
|
|
now := time.Now()
|
2018-02-02 23:22:37 +00:00
|
|
|
|
|
|
|
// Mock deployment with failed allocs, but deployment watcher hasn't marked it as failed yet
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2018-02-02 23:22:37 +00:00
|
|
|
d.Status = structs.DeploymentStatusRunning
|
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
Promoted: false,
|
2018-04-08 23:09:14 +00:00
|
|
|
DesiredTotal: 10,
|
|
|
|
PlacedAllocs: 10,
|
2018-02-02 23:22:37 +00:00
|
|
|
}
|
|
|
|
|
2018-04-08 23:09:14 +00:00
|
|
|
// Create 10 allocations
|
2018-02-02 23:22:37 +00:00
|
|
|
var allocs []*structs.Allocation
|
2018-04-08 23:09:14 +00:00
|
|
|
for i := 0; i < 10; i++ {
|
2018-02-02 23:22:37 +00:00
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
alloc.DeploymentID = d.ID
|
2018-04-08 23:09:14 +00:00
|
|
|
alloc.ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
alloc.TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
|
|
|
|
StartedAt: now.Add(-1 * time.Hour),
|
|
|
|
FinishedAt: now.Add(-10 * time.Second)}}
|
2018-02-02 23:22:37 +00:00
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
2018-03-02 00:23:44 +00:00
|
|
|
|
2018-04-08 23:09:14 +00:00
|
|
|
// Mark half of them as reschedulable
|
|
|
|
for i := 0; i < 5; i++ {
|
2022-08-17 16:26:34 +00:00
|
|
|
allocs[i].DesiredTransition.Reschedule = pointer.Of(true)
|
2018-04-08 23:09:14 +00:00
|
|
|
}
|
2018-02-02 23:22:37 +00:00
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2018-02-02 23:22:37 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert that only the allocs marked as reschedulable are replaced
|
|
|
|
assertResults(t, r, &resultExpectation{
|
2018-04-08 23:09:14 +00:00
|
|
|
place: 5,
|
2019-06-06 19:04:32 +00:00
|
|
|
stop: 5,
|
2018-02-02 23:22:37 +00:00
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
2018-04-08 23:09:14 +00:00
|
|
|
Place: 5,
|
2019-06-06 19:04:32 +00:00
|
|
|
Stop: 5,
|
2018-04-08 23:09:14 +00:00
|
|
|
Ignore: 5,
|
2018-02-02 23:22:37 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
}
|
2018-03-29 14:28:52 +00:00
|
|
|
|
|
|
|
// Test that a promoted, healthy deployment from an auto-revert is marked
// successful and the old failed allocs are not rescheduled
|
|
|
|
func TestReconciler_FailedDeployment_AutoRevert_CancelCanaries(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-03-29 14:28:52 +00:00
|
|
|
// Create a job
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Count = 3
|
|
|
|
job.TaskGroups[0].Update = &structs.UpdateStrategy{
|
|
|
|
Canary: 3,
|
|
|
|
MaxParallel: 2,
|
|
|
|
HealthCheck: structs.UpdateStrategyHealthCheck_Checks,
|
|
|
|
MinHealthyTime: 10 * time.Second,
|
|
|
|
HealthyDeadline: 10 * time.Minute,
|
|
|
|
Stagger: 31 * time.Second,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create v1 of the job
|
|
|
|
jobv1 := job.Copy()
|
|
|
|
jobv1.Version = 1
|
|
|
|
jobv1.TaskGroups[0].Meta = map[string]string{"version": "1"}
|
|
|
|
|
|
|
|
// Create v2 of the job
|
|
|
|
jobv2 := job.Copy()
|
|
|
|
jobv2.Version = 2
|
|
|
|
jobv2.TaskGroups[0].Meta = map[string]string{"version": "2"}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(jobv2, 50)
|
2018-03-29 14:28:52 +00:00
|
|
|
state := &structs.DeploymentState{
|
2018-04-08 23:09:14 +00:00
|
|
|
Promoted: true,
|
|
|
|
DesiredTotal: 3,
|
|
|
|
PlacedAllocs: 3,
|
|
|
|
HealthyAllocs: 3,
|
2018-03-29 14:28:52 +00:00
|
|
|
}
|
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = state
|
|
|
|
|
|
|
|
// Create the healthy allocations from the new job version (v2)
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 3; i++ {
|
|
|
|
new := mock.Alloc()
|
|
|
|
new.Job = jobv2
|
|
|
|
new.JobID = job.ID
|
|
|
|
new.NodeID = uuid.Generate()
|
|
|
|
new.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
new.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
new.DeploymentID = d.ID
|
|
|
|
new.DeploymentStatus = &structs.AllocDeploymentStatus{
|
2022-08-17 16:26:34 +00:00
|
|
|
Healthy: pointer.Of(true),
|
2018-03-29 14:28:52 +00:00
|
|
|
}
|
|
|
|
new.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
allocs = append(allocs, new)
|
|
|
|
|
|
|
|
}
|
|
|
|
for i := 0; i < 3; i++ {
|
|
|
|
new := mock.Alloc()
|
|
|
|
new.Job = jobv1
|
|
|
|
new.JobID = jobv1.ID
|
|
|
|
new.NodeID = uuid.Generate()
|
|
|
|
new.Name = structs.AllocName(jobv1.ID, jobv1.TaskGroups[0].Name, uint(i))
|
|
|
|
new.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
new.DeploymentID = uuid.Generate()
|
|
|
|
new.DeploymentStatus = &structs.AllocDeploymentStatus{
|
2022-08-17 16:26:34 +00:00
|
|
|
Healthy: pointer.Of(false),
|
2018-03-29 14:28:52 +00:00
|
|
|
}
|
|
|
|
new.DesiredStatus = structs.AllocDesiredStatusStop
|
|
|
|
new.ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
allocs = append(allocs, new)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, jobv2,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2018-03-29 14:28:52 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
updates := []*structs.DeploymentStatusUpdate{
|
|
|
|
{
|
|
|
|
DeploymentID: d.ID,
|
|
|
|
Status: structs.DeploymentStatusSuccessful,
|
|
|
|
StatusDescription: structs.DeploymentStatusDescriptionSuccessful,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
// Assert the correct results
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: updates,
|
|
|
|
place: 0,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Stop: 0,
|
|
|
|
InPlaceUpdate: 0,
|
2018-03-29 18:28:37 +00:00
|
|
|
Ignore: 3,
|
2018-03-29 14:28:52 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
}
|
2018-04-17 19:41:36 +00:00
|
|
|
|
|
|
|
// Test that a successful deployment with failed allocs will result in
|
|
|
|
// rescheduling failed allocations
|
|
|
|
func TestReconciler_SuccessfulDeploymentWithFailedAllocs_Reschedule(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-04-17 19:41:36 +00:00
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
tgName := job.TaskGroups[0].Name
|
|
|
|
now := time.Now()
|
|
|
|
|
|
|
|
// Mock deployment with failed allocs, but deployment watcher hasn't marked it as failed yet
|
2021-11-23 08:23:31 +00:00
|
|
|
d := structs.NewDeployment(job, 50)
|
2018-04-17 19:41:36 +00:00
|
|
|
d.Status = structs.DeploymentStatusSuccessful
|
|
|
|
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
|
|
|
|
Promoted: false,
|
|
|
|
DesiredTotal: 10,
|
|
|
|
PlacedAllocs: 10,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create 10 allocations
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
alloc.TaskGroup = job.TaskGroups[0].Name
|
|
|
|
alloc.DeploymentID = d.ID
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
alloc.TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
|
|
|
|
StartedAt: now.Add(-1 * time.Hour),
|
|
|
|
FinishedAt: now.Add(-10 * time.Second)}}
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
}
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
d, allocs, nil, "", 50, true)
|
2018-04-17 19:41:36 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert that rescheduled placements were created
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
place: 10,
|
2019-06-06 19:04:32 +00:00
|
|
|
stop: 10,
|
2018-04-17 19:41:36 +00:00
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 10,
|
2019-06-06 19:04:32 +00:00
|
|
|
Stop: 10,
|
2018-04-17 19:41:36 +00:00
|
|
|
Ignore: 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
assertPlaceResultsHavePreviousAllocs(t, 10, r.place)
|
|
|
|
}
|
2018-05-09 16:30:42 +00:00
|
|
|
|
2018-05-09 21:01:34 +00:00
|
|
|
// Tests force rescheduling a failed alloc that is past its reschedule limit
|
2018-05-09 16:30:42 +00:00
|
|
|
func TestReconciler_ForceReschedule_Service(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-05-09 16:30:42 +00:00
|
|
|
require := require.New(t)
|
|
|
|
|
|
|
|
// Set desired 5
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Count = 5
|
|
|
|
tgName := job.TaskGroups[0].Name
|
|
|
|
|
2023-01-30 14:48:43 +00:00
|
|
|
// Set up reschedule policy and update block
|
2018-05-09 16:30:42 +00:00
|
|
|
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
|
|
|
|
Attempts: 1,
|
|
|
|
Interval: 24 * time.Hour,
|
|
|
|
Delay: 5 * time.Second,
|
|
|
|
DelayFunction: "",
|
|
|
|
MaxDelay: 1 * time.Hour,
|
|
|
|
Unlimited: false,
|
|
|
|
}
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
|
|
|
|
// Create 5 existing allocations
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 5; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
}
|
|
|
|
|
|
|
|
// Mark one as failed and past its reschedule limit so not eligible to reschedule
|
|
|
|
allocs[0].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
|
|
|
|
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
|
|
|
|
PrevAllocID: uuid.Generate(),
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
}}
|
|
|
|
|
|
|
|
// Mark DesiredTransition ForceReschedule
|
2022-08-17 16:26:34 +00:00
|
|
|
allocs[0].DesiredTransition = structs.DesiredTransition{ForceReschedule: pointer.Of(true)}
|
2018-05-09 16:30:42 +00:00
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2018-05-09 16:30:42 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Verify that no follow up evals were created
|
|
|
|
evals := r.desiredFollowupEvals[tgName]
|
|
|
|
require.Nil(evals)
|
|
|
|
|
|
|
|
// Verify that one rescheduled alloc was created because of the forced reschedule
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 1,
|
2019-06-06 19:04:32 +00:00
|
|
|
stop: 1,
|
2018-05-09 16:30:42 +00:00
|
|
|
inplace: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 1,
|
2019-06-06 19:04:32 +00:00
|
|
|
Stop: 1,
|
2018-05-09 16:30:42 +00:00
|
|
|
Ignore: 4,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
// Rescheduled allocs should have previous allocs
|
|
|
|
assertNamesHaveIndexes(t, intRange(0, 0), placeResultsToNames(r.place))
|
|
|
|
assertPlaceResultsHavePreviousAllocs(t, 1, r.place)
|
|
|
|
assertPlacementsAreRescheduled(t, 1, r.place)
|
|
|
|
}
|
2019-06-13 20:41:19 +00:00
|
|
|
|
|
|
|
// Tests behavior of service failure with rescheduling policy preventing rescheduling:
|
|
|
|
// new allocs should be placed to satisfy the job count, and current allocations are
|
|
|
|
// left unmodified
|
|
|
|
func TestReconciler_RescheduleNot_Service(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2019-06-13 20:41:19 +00:00
|
|
|
require := require.New(t)
|
|
|
|
|
|
|
|
// Set desired 5
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Count = 5
|
|
|
|
tgName := job.TaskGroups[0].Name
|
|
|
|
now := time.Now()
|
|
|
|
|
2023-01-30 14:48:43 +00:00
|
|
|
// Set up reschedule policy and update block
|
2019-06-13 20:41:19 +00:00
|
|
|
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
|
|
|
|
Attempts: 0,
|
|
|
|
Interval: 24 * time.Hour,
|
|
|
|
Delay: 5 * time.Second,
|
|
|
|
DelayFunction: "",
|
|
|
|
MaxDelay: 1 * time.Hour,
|
|
|
|
Unlimited: false,
|
|
|
|
}
|
|
|
|
job.TaskGroups[0].Update = noCanaryUpdate
|
|
|
|
|
|
|
|
// Create 5 existing allocations
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 5; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
}
|
|
|
|
|
|
|
|
// Mark two as failed
|
|
|
|
allocs[0].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
|
|
|
|
// Mark one of them as already rescheduled once
|
|
|
|
allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
|
|
|
|
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
|
|
|
|
PrevAllocID: uuid.Generate(),
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
}}
|
|
|
|
allocs[1].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
|
|
|
|
StartedAt: now.Add(-1 * time.Hour),
|
|
|
|
FinishedAt: now.Add(-10 * time.Second)}}
|
|
|
|
allocs[1].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
|
|
|
|
// Mark one as desired state stop
|
|
|
|
allocs[4].DesiredStatus = structs.AllocDesiredStatusStop
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2019-06-13 20:41:19 +00:00
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Verify that no follow up evals were created
|
|
|
|
evals := r.desiredFollowupEvals[tgName]
|
|
|
|
require.Nil(evals)
|
|
|
|
|
|
|
|
// no rescheduling, ignore all 4 allocs
|
|
|
|
// but place one to replace allocs[4], which was stopped explicitly
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 1,
|
|
|
|
inplace: 0,
|
|
|
|
stop: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 1,
|
|
|
|
Ignore: 4,
|
|
|
|
Stop: 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
// None of the placements should have previous allocs or be marked as rescheduled
|
|
|
|
assertPlaceResultsHavePreviousAllocs(t, 0, r.place)
|
|
|
|
assertPlacementsAreRescheduled(t, 0, r.place)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tests behavior of batch failure with rescheduling policy preventing rescheduling:
|
|
|
|
// current allocations are left unmodified and no follow-up evals are created
|
|
|
|
func TestReconciler_RescheduleNot_Batch(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2019-06-13 20:41:19 +00:00
|
|
|
require := require.New(t)
|
|
|
|
// Set desired 4
|
|
|
|
job := mock.Job()
|
|
|
|
job.TaskGroups[0].Count = 4
|
|
|
|
now := time.Now()
|
|
|
|
// Set up reschedule policy
|
|
|
|
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
|
|
|
|
Attempts: 0,
|
|
|
|
Interval: 24 * time.Hour,
|
|
|
|
Delay: 5 * time.Second,
|
|
|
|
DelayFunction: "constant",
|
|
|
|
}
|
|
|
|
tgName := job.TaskGroups[0].Name
|
|
|
|
// Create 6 existing allocations - 2 running, 1 complete and 3 failed
|
|
|
|
var allocs []*structs.Allocation
|
|
|
|
for i := 0; i < 6; i++ {
|
|
|
|
alloc := mock.Alloc()
|
|
|
|
alloc.Job = job
|
|
|
|
alloc.JobID = job.ID
|
|
|
|
alloc.NodeID = uuid.Generate()
|
|
|
|
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
|
|
|
|
allocs = append(allocs, alloc)
|
|
|
|
alloc.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
}
|
|
|
|
// Mark 3 as failed with reschedule tracking info
|
|
|
|
allocs[0].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
allocs[0].NextAllocation = allocs[1].ID
|
|
|
|
allocs[1].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
allocs[1].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
|
|
|
|
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
|
|
|
|
PrevAllocID: allocs[0].ID,
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
}}
|
|
|
|
allocs[1].NextAllocation = allocs[2].ID
|
|
|
|
allocs[2].ClientStatus = structs.AllocClientStatusFailed
|
|
|
|
allocs[2].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
|
|
|
|
StartedAt: now.Add(-1 * time.Hour),
|
|
|
|
FinishedAt: now.Add(-5 * time.Second)}}
|
|
|
|
allocs[2].FollowupEvalID = uuid.Generate()
|
|
|
|
allocs[2].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
|
|
|
|
{RescheduleTime: time.Now().Add(-2 * time.Hour).UTC().UnixNano(),
|
|
|
|
PrevAllocID: allocs[0].ID,
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
|
|
|
|
PrevAllocID: allocs[1].ID,
|
|
|
|
PrevNodeID: uuid.Generate(),
|
|
|
|
},
|
|
|
|
}}
|
|
|
|
// Mark one as complete
|
|
|
|
allocs[5].ClientStatus = structs.AllocClientStatusComplete
|
|
|
|
|
2021-11-23 08:23:31 +00:00
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nil, "", 50, true)
|
2019-06-13 20:41:19 +00:00
|
|
|
reconciler.now = now
|
|
|
|
r := reconciler.Compute()
|
|
|
|
|
|
|
|
// Verify that no follow up evals were created
|
|
|
|
evals := r.desiredFollowupEvals[tgName]
|
|
|
|
require.Nil(evals)
|
|
|
|
|
|
|
|
// No reschedule attempts were made and all allocs are untouched
|
|
|
|
assertResults(t, r, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 0,
|
|
|
|
stop: 0,
|
|
|
|
inplace: 0,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 0,
|
|
|
|
Stop: 0,
|
|
|
|
Ignore: 4,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
}
|
2022-02-16 18:50:20 +00:00
|
|
|
|
|
|
|
// Tests that when a node disconnects, running allocations are queued to transition to unknown.
|
|
|
|
func TestReconciler_Node_Disconnect_Updates_Alloc_To_Unknown(t *testing.T) {
|
|
|
|
job, allocs := buildResumableAllocations(3, structs.AllocClientStatusRunning, structs.AllocDesiredStatusRun, 2)
|
|
|
|
// Build a map of disconnected nodes
|
|
|
|
nodes := buildDisconnectedNodes(allocs, 2)
|
|
|
|
|
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
|
2022-03-07 18:40:57 +00:00
|
|
|
nil, allocs, nodes, "", 50, true)
|
2022-02-16 18:50:20 +00:00
|
|
|
reconciler.now = time.Now().UTC()
|
|
|
|
results := reconciler.Compute()
|
|
|
|
|
|
|
|
// Verify that 1 follow up eval was created with the values we expect.
|
|
|
|
evals := results.desiredFollowupEvals[job.TaskGroups[0].Name]
|
|
|
|
require.Len(t, evals, 1)
|
|
|
|
expectedTime := reconciler.now.Add(5 * time.Minute)
|
|
|
|
|
|
|
|
eval := evals[0]
|
|
|
|
require.NotNil(t, eval.WaitUntil)
|
|
|
|
require.Equal(t, expectedTime, eval.WaitUntil)
|
|
|
|
|
|
|
|
// Validate that the queued disconnectUpdates have the right client status,
|
|
|
|
// and that they have a valid FollowupEvalID.
|
|
|
|
for _, disconnectUpdate := range results.disconnectUpdates {
|
|
|
|
require.Equal(t, structs.AllocClientStatusUnknown, disconnectUpdate.ClientStatus)
|
|
|
|
require.NotEmpty(t, disconnectUpdate.FollowupEvalID)
|
|
|
|
require.Equal(t, eval.ID, disconnectUpdate.FollowupEvalID)
|
|
|
|
}
|
|
|
|
|
|
|
|
// 2 to place, 2 to update, 1 to ignore
|
|
|
|
assertResults(t, results, &resultExpectation{
|
|
|
|
createDeployment: nil,
|
|
|
|
deploymentUpdates: nil,
|
|
|
|
place: 2,
|
|
|
|
stop: 0,
|
|
|
|
inplace: 0,
|
|
|
|
disconnectUpdates: 2,
|
|
|
|
|
|
|
|
// 2 to place and 1 to ignore
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
Place: 2,
|
|
|
|
Stop: 0,
|
|
|
|
Ignore: 1,
|
|
|
|
InPlaceUpdate: 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
Update alloc after reconnect and enforce client heartbeat order (#15068)
* scheduler: allow updates after alloc reconnects
When an allocation reconnects to a cluster the scheduler needs to run
special logic to handle the reconnection, check if a replacement was
created and stop one of them.
If the allocation kept running while the node was disconnected, it will
be reconnected with `ClientStatus: running` and the node will have
`Status: ready`. This combination is the same as the normal steady state
of an allocation, where everything is running as expected.
In order to differentiate between the two states (an allocation that is
reconnecting and one that is just running) the scheduler needs an extra
piece of state.
The current implementation uses the presence of a
`TaskClientReconnected` task event to detect when the allocation has
reconnected and thus must go through the reconnection process. But this
event remains even after the allocation is reconnected, causing all
future evals to consider the allocation as still reconnecting.
This commit changes the reconnect logic to use an `AllocState` to
register when the allocation was reconnected. This provides the
following benefits:
- Only a limited number of task states are kept, and they are used for
many other events. It's possible that, upon reconnecting, several
actions are triggered that could cause the `TaskClientReconnected`
event to be dropped.
- Task events are set by clients and so their timestamps are subject
to time skew from servers. This prevents using time to determine if
an allocation reconnected after a disconnect event.
- Disconnect events are already stored as `AllocState` and so storing
reconnects there as well makes it the only source of information
required.
With the new logic, the reconnection logic is only triggered if the
last `AllocState` is a disconnect event, meaning that the allocation has
not been reconnected yet. After the reconnection is handled, the new
`ClientStatus` is stored in `AllocState`, allowing future evals to skip
the reconnection logic.
* scheduler: prevent spurious placement on reconnect
When a client reconnects it makes two independent RPC calls:
- `Node.UpdateStatus` to heartbeat and set its status as `ready`.
- `Node.UpdateAlloc` to update the status of its allocations.
These two calls can happen in any order, and in case the allocations are
updated before a heartbeat it causes the state to be the same as a node
being disconnected: the node status will still be `disconnected` while
the allocation `ClientStatus` is set to `running`.
The current implementation did not handle this order of events properly,
and the scheduler would create an unnecessary placement since it
considered the allocation was being disconnected. This extra allocation
would then be quickly stopped by the heartbeat eval.
This commit adds a new code path to handle this order of events. If the
node is `disconnected` and the allocation `ClientStatus` is `running`
the scheduler will check if the allocation is actually reconnecting
using its `AllocState` events.
* rpc: only allow alloc updates from `ready` nodes
Clients interact with servers using three main RPC methods:
- `Node.GetAllocs` reads allocation data from the server and writes it
to the client.
- `Node.UpdateAlloc` reads allocations from the client and writes
them to the server.
- `Node.UpdateStatus` writes the client status to the server and is
used as the heartbeat mechanism.
These three methods are called periodically by the clients and are done
so independently from each other, meaning that there can't be any
assumptions in their ordering.
This can generate scenarios that are hard to reason about and to code
for. For example, when a client misses too many heartbeats it will be
considered `down` or `disconnected` and the allocations it was running
are set to `lost` or `unknown`.
When connectivity is restored to the rest of the cluster, the natural
mental model is to think that the client will heartbeat first and then
update its allocations status into the servers.
But since there's no inherent order in these calls, the reverse is just as
possible: the client updates the alloc status and then heartbeats. This
results in a state where allocs are, for example, `running` while the
client is still `disconnected`.
This commit adds a new verification to the `Node.UpdateAlloc` method to
reject updates from nodes that are not `ready`, forcing clients to
heartbeat first. Since this check is done server-side there is no need
to coordinate operations client-side: they can continue sending these
requests independently and alloc update will succeed after the heartbeat
is done.
* changelog: add entry for #15068
* code review
* client: skip terminal allocations on reconnect
When the client reconnects with the server it synchronizes the state of
its allocations by sending data using the `Node.UpdateAlloc` RPC and
fetching data using the `Node.GetClientAllocs` RPC.
If the data fetch happens before the data write, `unknown` allocations
will still be in this state and would trigger the
`allocRunner.Reconnect` flow.
But when the server `DesiredStatus` for the allocation is `stop` the
client should not reconnect the allocation.
* apply more code review changes
* scheduler: persist changes to reconnected allocs
Reconnected allocs have a new AllocState entry that must be persisted by
the plan applier.
* rpc: read node ID from allocs in UpdateAlloc
The AllocUpdateRequest struct is used in three disjoint use cases:
1. Stripped allocs from clients Node.UpdateAlloc RPC using the Allocs,
and WriteRequest fields
2. Raft log message using the Allocs, Evals, and WriteRequest fields
3. Plan updates using the AllocsStopped, AllocsUpdated, and Job fields
Adding a new field that would only be used in one of these cases (1) made
things more confusing and error prone. While in theory an
AllocUpdateRequest could send allocations from different nodes, in
practice this never actually happens since only clients call this method
with their own allocations.
* scheduler: remove logic to handle exceptional case
This condition could only be hit if, somehow, the allocation status was
set to "running" while the client was "unknown". This was addressed by
enforcing an order in "Node.UpdateStatus" and "Node.UpdateAlloc" RPC
calls, so this scenario is not expected to happen.
Adding unnecessary code to the scheduler makes it harder to read and
reason about it.
* more code review
* remove another unused test
2022-11-04 20:25:11 +00:00
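// The commit message above describes the reconnect handling added in #15068:
// an allocation only goes through the reconnect path when its most recent
// AllocState entry recorded the "unknown" (disconnected) client status. The
// helper below is a minimal illustrative sketch of that check, not the
// reconciler's actual implementation; the function name is hypothetical.
func lastAllocStateIsDisconnect(alloc *structs.Allocation) bool {
	states := alloc.AllocStates
	if len(states) == 0 {
		// No AllocState history, so the alloc never disconnected.
		return false
	}
	last := states[len(states)-1]
	return last.Field == structs.AllocStateFieldClientStatus &&
		last.Value == structs.AllocClientStatusUnknown
}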
|
|
|
func TestReconciler_Disconnect_UpdateJobAfterReconnect(t *testing.T) {
|
|
|
|
ci.Parallel(t)
|
|
|
|
|
|
|
|
// Create 2 allocs and simulate one having been previously disconnected and
|
|
|
|
// then reconnected.
|
|
|
|
job, allocs := buildResumableAllocations(2, structs.AllocClientStatusRunning, structs.AllocDesiredStatusRun, 2)
|
|
|
|
allocs[0].AllocStates = []*structs.AllocState{
|
|
|
|
{
|
|
|
|
Field: structs.AllocStateFieldClientStatus,
|
|
|
|
Value: structs.AllocClientStatusUnknown,
|
|
|
|
Time: time.Now().Add(-5 * time.Minute),
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Field: structs.AllocStateFieldClientStatus,
|
|
|
|
Value: structs.AllocClientStatusRunning,
|
|
|
|
Time: time.Now(),
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job,
|
|
|
|
nil, allocs, nil, "", 50, true)
|
|
|
|
results := reconciler.Compute()
|
|
|
|
|
|
|
|
// Assert both allocations will be updated.
|
|
|
|
assertResults(t, results, &resultExpectation{
|
|
|
|
inplace: 2,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
job.TaskGroups[0].Name: {
|
|
|
|
InPlaceUpdate: 2,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2022-03-31 15:32:18 +00:00
|
|
|
// Tests that when a node disconnects/reconnects, allocations for that node are
|
|
|
|
// reconciled according to the business rules.
|
|
|
|
func TestReconciler_Disconnected_Client(t *testing.T) {
|
2022-11-04 20:25:11 +00:00
|
|
|
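	// disconnectAllocState mirrors the AllocState entry recorded when a
	// client misses its heartbeats: the allocation's client status is
	// marked unknown at the time of the disconnect.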
	disconnectAllocState := []*structs.AllocState{{
		Field: structs.AllocStateFieldClientStatus,
		Value: structs.AllocClientStatusUnknown,
		Time:  time.Now(),
	}}

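	// Each test case describes the cluster state at the moment a client
	// reconnects and the reconciler results expected for it.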
	type testCase struct {
		name                         string
		allocCount                   int
		disconnectedAllocCount       int
		jobVersionIncrement          uint64
		nodeScoreIncrement           float64
		disconnectedAllocStatus      string
		disconnectedAllocStates      []*structs.AllocState
		isBatch                      bool
		nodeStatusDisconnected       bool
		replace                      bool
		failReplacement              bool
		taintReplacement             bool
		disconnectReplacement        bool
		replaceFailedReplacement     bool
		shouldStopOnDisconnectedNode bool
		maxDisconnect                *time.Duration
		expected                     *resultExpectation
	}

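	// Table-driven scenarios covering allocations that reconnect after
	// their client was disconnected.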
	testCases := []testCase{
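		// Originals that come back running with no replacement only need a
		// reconnect update; everything else is ignored.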
		{
			name:                         "reconnect-original-no-replacement",
			allocCount:                   2,
			replace:                      false,
			disconnectedAllocCount:       2,
			disconnectedAllocStatus:      structs.AllocClientStatusRunning,
			disconnectedAllocStates:      disconnectAllocState,
			shouldStopOnDisconnectedNode: false,
			expected: &resultExpectation{
				reconnectUpdates: 2,
				desiredTGUpdates: map[string]*structs.DesiredUpdates{
					"web": {
						Ignore: 2,
					},
				},
			},
		},
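		// The original reconnects while still running: resume it and stop
		// the replacement that was scheduled during the disconnect.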
		{
			name:                         "resume-original-and-stop-replacement",
			allocCount:                   3,
			replace:                      true,
			disconnectedAllocCount:       1,
			disconnectedAllocStatus:      structs.AllocClientStatusRunning,
			disconnectedAllocStates:      disconnectAllocState,
			shouldStopOnDisconnectedNode: false,
			expected: &resultExpectation{
				stop:             1,
				reconnectUpdates: 1,
				desiredTGUpdates: map[string]*structs.DesiredUpdates{
					"web": {
						Stop:   1,
						Ignore: 3,
					},
				},
			},
		},
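		// The replacement landed on a node with a higher score, so the
		// reconnected original is the one that gets stopped.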
		{
			name:                         "stop-original-with-lower-node-score",
			allocCount:                   4,
			replace:                      true,
			disconnectedAllocCount:       1,
			disconnectedAllocStatus:      structs.AllocClientStatusRunning,
			disconnectedAllocStates:      disconnectAllocState,
			shouldStopOnDisconnectedNode: true,
			nodeScoreIncrement:           1,
			expected: &resultExpectation{
				stop: 1,
				desiredTGUpdates: map[string]*structs.DesiredUpdates{
					"web": {
						Stop:   1,
						Ignore: 4,
					},
				},
			},
		},
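		// Originals that reconnect in a failed state are stopped in favor
		// of their replacements.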
		{
			name:                         "stop-original-failed-on-reconnect",
			allocCount:                   4,
			replace:                      true,
			disconnectedAllocCount:       2,
			disconnectedAllocStatus:      structs.AllocClientStatusFailed,
			disconnectedAllocStates:      disconnectAllocState,
			shouldStopOnDisconnectedNode: true,
			expected: &resultExpectation{
				stop: 2,
				desiredTGUpdates: map[string]*structs.DesiredUpdates{
					"web": {
						Stop:   2,
						Ignore: 4,
					},
				},
			},
		},
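		// Failed originals with no replacement are stopped and rescheduled
		// onto new placements.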
		{
			name:                         "reschedule-original-failed-if-not-replaced",
			allocCount:                   4,
			replace:                      false,
			disconnectedAllocCount:       2,
			disconnectedAllocStatus:      structs.AllocClientStatusFailed,
			disconnectedAllocStates:      disconnectAllocState,
			shouldStopOnDisconnectedNode: true,
			expected: &resultExpectation{
				stop:  2,
				place: 2,
				desiredTGUpdates: map[string]*structs.DesiredUpdates{
					"web": {
						Ignore: 2,
						Place:  2,
						Stop:   2,
					},
				},
			},
		},
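		// Batch allocations that completed while disconnected are ignored
		// on reconnect; the group still places replacements.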
		{
			name:                    "ignore-reconnect-completed",
			allocCount:              2,
			replace:                 false,
			disconnectedAllocCount:  2,
			disconnectedAllocStatus: structs.AllocClientStatusComplete,
			disconnectedAllocStates: disconnectAllocState,
			isBatch:                 true,
			expected: &resultExpectation{
				place: 2,
				desiredTGUpdates: map[string]*structs.DesiredUpdates{
					"web": {
						Ignore: 2,
						Place:  2,
					},
				},
			},
		},
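		// The original reconnects while its replacement has failed: keep
		// and update the original; the failed replacement needs no extra
		// stop.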
		{
			name:                    "keep-original-alloc-and-stop-failed-replacement",
			allocCount:              3,
			replace:                 true,
			failReplacement:         true,
			disconnectedAllocCount:  2,
			disconnectedAllocStatus: structs.AllocClientStatusRunning,
			disconnectedAllocStates: disconnectAllocState,
			expected: &resultExpectation{
				reconnectUpdates: 2,
				stop:             0,
				desiredTGUpdates: map[string]*structs.DesiredUpdates{
					"web": {
						Ignore: 5,
					},
				},
			},
		},
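		// When the replacement itself is disconnected, keep the reconnected
		// original and stop the replacement.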
		{
			name:                    "keep-original-and-stop-reconnecting-replacement",
			allocCount:              2,
			replace:                 true,
			disconnectReplacement:   true,
			disconnectedAllocCount:  1,
			disconnectedAllocStatus: structs.AllocClientStatusRunning,
			disconnectedAllocStates: disconnectAllocState,
			expected: &resultExpectation{
				reconnectUpdates: 1,
				stop:             1,
				desiredTGUpdates: map[string]*structs.DesiredUpdates{
					"web": {
						Ignore: 2,
						Stop:   1,
					},
				},
			},
		},
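		// Replacements running on tainted nodes are stopped in favor of the
		// reconnected originals.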
		{
			name:                    "keep-original-and-stop-tainted-replacement",
			allocCount:              3,
			replace:                 true,
			taintReplacement:        true,
			disconnectedAllocCount:  2,
			disconnectedAllocStatus: structs.AllocClientStatusRunning,
			disconnectedAllocStates: disconnectAllocState,
			expected: &resultExpectation{
				reconnectUpdates: 2,
				stop:             2,
				desiredTGUpdates: map[string]*structs.DesiredUpdates{
					"web": {
						Ignore: 3,
						Stop:   2,
					},
				},
			},
		},
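		// Originals that reconnect while running an outdated job version
		// should be stopped rather than resumed.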
		{
			name:                    "stop-original-alloc-with-old-job-version",
			allocCount:              5,
			replace:                 true,
			disconnectedAllocCount:  2,
			disconnectedAllocStatus: structs.AllocClientStatusRunning,
2022-11-04 20:25:11 +00:00
|
|
|
disconnectedAllocStates: disconnectAllocState,
|
2022-03-31 15:32:18 +00:00
|
|
|
shouldStopOnDisconnectedNode: true,
|
|
|
|
jobVersionIncrement: 1,
|
|
|
|
expected: &resultExpectation{
|
|
|
|
stop: 2,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
"web": {
|
|
|
|
Ignore: 5,
|
|
|
|
Stop: 2,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
2023-10-27 15:20:53 +00:00
|
|
|
name: "stop-original-alloc-with-old-job-version-reconnect-eval",
|
|
|
|
allocCount: 5,
|
|
|
|
replace: true,
|
|
|
|
disconnectedAllocCount: 2,
|
|
|
|
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
|
|
|
|
2022-11-04 20:25:11 +00:00
|
|
|
disconnectedAllocStates: disconnectAllocState,
|
2022-03-31 15:32:18 +00:00
|
|
|
shouldStopOnDisconnectedNode: true,
|
|
|
|
jobVersionIncrement: 1,
|
|
|
|
expected: &resultExpectation{
|
|
|
|
stop: 2,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
"web": {
|
|
|
|
Stop: 2,
|
|
|
|
Ignore: 5,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
2023-10-27 15:20:53 +00:00
|
|
|
name: "stop-original-alloc-with-old-job-version-and-failed-replacements-replaced",
|
|
|
|
allocCount: 5,
|
|
|
|
replace: true,
|
|
|
|
failReplacement: true,
|
|
|
|
replaceFailedReplacement: true,
|
|
|
|
disconnectedAllocCount: 2,
|
|
|
|
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
|
|
|
|
2022-11-04 20:25:11 +00:00
|
|
|
disconnectedAllocStates: disconnectAllocState,
|
2023-10-27 15:20:53 +00:00
|
|
|
shouldStopOnDisconnectedNode: false,
|
2022-03-31 15:32:18 +00:00
|
|
|
jobVersionIncrement: 1,
|
|
|
|
expected: &resultExpectation{
|
2023-10-27 15:20:53 +00:00
|
|
|
stop: 2,
|
|
|
|
reconnectUpdates: 2,
|
2022-03-31 15:32:18 +00:00
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
"web": {
|
|
|
|
Stop: 2,
|
2023-10-27 15:20:53 +00:00
|
|
|
Ignore: 7,
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
2023-10-27 15:20:53 +00:00
|
|
|
name: "stop-original-pending-alloc-for-disconnected-node",
|
|
|
|
allocCount: 2,
|
|
|
|
replace: true,
|
|
|
|
disconnectedAllocCount: 1,
|
|
|
|
disconnectedAllocStatus: structs.AllocClientStatusPending,
|
|
|
|
|
2022-11-04 20:25:11 +00:00
|
|
|
disconnectedAllocStates: disconnectAllocState,
|
2022-03-31 15:32:18 +00:00
|
|
|
shouldStopOnDisconnectedNode: true,
|
|
|
|
nodeStatusDisconnected: true,
|
|
|
|
expected: &resultExpectation{
|
|
|
|
stop: 1,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
"web": {
|
|
|
|
Stop: 1,
|
|
|
|
Ignore: 2,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
2023-03-24 23:38:31 +00:00
|
|
|
{
|
2023-10-27 15:20:53 +00:00
|
|
|
name: "stop-failed-original-and-failed-replacements-and-place-new",
|
|
|
|
allocCount: 5,
|
|
|
|
replace: true,
|
|
|
|
failReplacement: true,
|
|
|
|
disconnectedAllocCount: 2,
|
|
|
|
disconnectedAllocStatus: structs.AllocClientStatusFailed,
|
|
|
|
|
2023-03-24 23:38:31 +00:00
|
|
|
disconnectedAllocStates: disconnectAllocState,
|
|
|
|
shouldStopOnDisconnectedNode: true,
|
|
|
|
expected: &resultExpectation{
|
2023-10-27 15:20:53 +00:00
|
|
|
stop: 2,
|
2023-03-24 23:38:31 +00:00
|
|
|
place: 2,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
"web": {
|
2023-10-27 15:20:53 +00:00
|
|
|
Stop: 2,
|
2023-03-24 23:38:31 +00:00
|
|
|
Place: 2,
|
2023-10-27 15:20:53 +00:00
|
|
|
Ignore: 5,
|
2023-03-24 23:38:31 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
2022-03-31 15:32:18 +00:00
|
|
|
{
|
|
|
|
name: "stop-expired-allocs",
|
|
|
|
allocCount: 5,
|
|
|
|
replace: true,
|
|
|
|
disconnectedAllocCount: 2,
|
|
|
|
disconnectedAllocStatus: structs.AllocClientStatusUnknown,
|
2022-11-04 20:25:11 +00:00
|
|
|
disconnectedAllocStates: disconnectAllocState,
|
2022-03-31 15:32:18 +00:00
|
|
|
shouldStopOnDisconnectedNode: true,
|
|
|
|
nodeStatusDisconnected: true,
|
2022-08-17 16:26:34 +00:00
|
|
|
maxDisconnect: pointer.Of(2 * time.Second),
|
2022-03-31 15:32:18 +00:00
|
|
|
expected: &resultExpectation{
|
|
|
|
stop: 2,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
"web": {
|
|
|
|
Stop: 2,
|
|
|
|
Ignore: 5,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "replace-allocs-on-disconnected-node",
|
|
|
|
allocCount: 5,
|
|
|
|
replace: false,
|
|
|
|
disconnectedAllocCount: 2,
|
|
|
|
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
2022-11-04 20:25:11 +00:00
|
|
|
disconnectedAllocStates: []*structs.AllocState{},
|
2022-03-31 15:32:18 +00:00
|
|
|
nodeStatusDisconnected: true,
|
|
|
|
expected: &resultExpectation{
|
|
|
|
place: 2,
|
|
|
|
disconnectUpdates: 2,
|
|
|
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
|
|
|
"web": {
|
|
|
|
Place: 2,
|
|
|
|
Ignore: 3,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
}
|
2022-02-16 18:50:20 +00:00
|
|
|
|
2022-03-31 15:32:18 +00:00
|
|
|
for _, tc := range testCases {
|
|
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
|
|
require.NotEqual(t, 0, tc.allocCount, "invalid test case: alloc count must be greater than zero")
|
2022-02-16 18:50:20 +00:00
|
|
|
|
2022-03-31 15:32:18 +00:00
|
|
|
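// Create the node under test and mark it disconnected when the test case requires it.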
testNode := mock.Node()
|
|
|
|
if tc.nodeStatusDisconnected {
|
|
|
|
testNode.Status = structs.NodeStatusDisconnected
|
|
|
|
}
|
2022-02-16 18:50:20 +00:00
|
|
|
|
2022-03-31 15:32:18 +00:00
|
|
|
// Create resumable allocs
|
|
|
|
job, allocs := buildResumableAllocations(tc.allocCount, structs.AllocClientStatusRunning, structs.AllocDesiredStatusRun, 2)
|
2022-02-16 18:50:20 +00:00
|
|
|
|
2023-03-24 23:38:31 +00:00
|
|
|
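// Record the IDs of the original allocations so the assertions below can
// tell them apart from replacement allocations.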
origAllocs := set.New[string](len(allocs))
|
|
|
|
for _, alloc := range allocs {
|
|
|
|
origAllocs.Insert(alloc.ID)
|
|
|
|
}
|
|
|
|
|
2022-03-31 15:32:18 +00:00
|
|
|
if tc.isBatch {
|
|
|
|
job.Type = structs.JobTypeBatch
|
|
|
|
}
|
2022-02-16 18:50:20 +00:00
|
|
|
|
2022-03-31 15:32:18 +00:00
|
|
|
// Set alloc state
|
|
|
|
disconnectedAllocCount := tc.disconnectedAllocCount
|
|
|
|
for _, alloc := range allocs {
|
2023-10-27 15:20:53 +00:00
|
|
|
alloc.DesiredStatus = structs.AllocDesiredStatusRun
|
2022-04-06 13:33:32 +00:00
|
|
|
|
2022-03-31 15:32:18 +00:00
|
|
|
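// Apply the test case's max client disconnect window to the task group.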
if tc.maxDisconnect != nil {
|
|
|
|
alloc.Job.TaskGroups[0].MaxClientDisconnect = tc.maxDisconnect
|
|
|
|
}
|
2022-02-16 18:50:20 +00:00
|
|
|
|
2022-03-31 15:32:18 +00:00
|
|
|
if disconnectedAllocCount > 0 {
|
|
|
|
alloc.ClientStatus = tc.disconnectedAllocStatus
|
2022-11-04 20:25:11 +00:00
|
|
|
alloc.AllocStates = tc.disconnectedAllocStates
|
2022-03-31 15:32:18 +00:00
|
|
|
// Assign all of the disconnected allocs to the node under test.
|
|
|
|
alloc.NodeID = testNode.ID
|
2022-04-21 14:05:58 +00:00
|
|
|
alloc.NodeName = "disconnected"
|
2022-03-31 15:32:18 +00:00
|
|
|
disconnectedAllocCount--
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Place the allocs on another node.
|
|
|
|
if tc.replace {
|
|
|
|
replacements := make([]*structs.Allocation, 0)
|
|
|
|
for _, alloc := range allocs {
|
|
|
|
if alloc.NodeID != testNode.ID {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
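// Build a replacement allocation on a different node and link it to the
// original via PreviousAllocation/NextAllocation.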
replacement := alloc.Copy()
|
|
|
|
replacement.ID = uuid.Generate()
|
|
|
|
replacement.NodeID = uuid.Generate()
|
|
|
|
replacement.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
replacement.PreviousAllocation = alloc.ID
|
|
|
|
replacement.AllocStates = nil
|
|
|
|
replacement.TaskStates = nil
|
2023-03-24 23:38:31 +00:00
|
|
|
replacement.CreateIndex += 1
|
2022-03-31 15:32:18 +00:00
|
|
|
alloc.NextAllocation = replacement.ID
|
|
|
|
|
|
|
|
if tc.jobVersionIncrement != 0 {
|
|
|
|
replacement.Job.Version = replacement.Job.Version + tc.jobVersionIncrement
|
|
|
|
}
|
|
|
|
if tc.nodeScoreIncrement != 0 {
|
|
|
|
replacement.Metrics.ScoreMetaData[0].NormScore = replacement.Metrics.ScoreMetaData[0].NormScore + tc.nodeScoreIncrement
|
|
|
|
}
|
2023-03-24 23:38:31 +00:00
|
|
|
if tc.taintReplacement {
|
|
|
|
replacement.DesiredTransition.Migrate = pointer.Of(true)
|
|
|
|
}
|
|
|
|
if tc.disconnectReplacement {
|
|
|
|
replacement.AllocStates = tc.disconnectedAllocStates
|
|
|
|
}
|
2022-03-31 15:32:18 +00:00
|
|
|
|
|
|
|
// If we want to test intermediate replacement failures, simulate that.
|
|
|
|
if tc.failReplacement {
|
|
|
|
replacement.ClientStatus = structs.AllocClientStatusFailed
|
2023-03-24 23:38:31 +00:00
|
|
|
|
|
|
|
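// When the test replaces the failed replacement, chain a second healthy
// replacement and mark the failed one for stopping.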
if tc.replaceFailedReplacement {
|
|
|
|
nextReplacement := replacement.Copy()
|
|
|
|
nextReplacement.ID = uuid.Generate()
|
|
|
|
nextReplacement.ClientStatus = structs.AllocClientStatusRunning
|
|
|
|
nextReplacement.DesiredStatus = structs.AllocDesiredStatusRun
|
|
|
|
nextReplacement.PreviousAllocation = replacement.ID
|
|
|
|
nextReplacement.CreateIndex += 1
|
|
|
|
|
|
|
|
replacement.NextAllocation = nextReplacement.ID
|
|
|
|
replacement.DesiredStatus = structs.AllocDesiredStatusStop
|
|
|
|
|
|
|
|
replacements = append(replacements, nextReplacement)
|
|
|
|
}
|
2022-03-31 15:32:18 +00:00
|
|
|
}
|
2023-03-24 23:38:31 +00:00
|
|
|
|
|
|
|
replacements = append(replacements, replacement)
|
2022-03-31 15:32:18 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
allocs = append(allocs, replacements...)
|
|
|
|
}
|
|
|
|
|
|
|
|
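// Build the reconciler with the node under test and the generated allocations.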
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, tc.isBatch, job.ID, job,
|
|
|
|
nil, allocs, map[string]*structs.Node{testNode.ID: testNode}, "", 50, true)
|
|
|
|
|
|
|
|
reconciler.now = time.Now()
|
|
|
|
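// Advance the reconciler's clock well past the disconnect window so the
// disconnected allocations are treated as expired.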
if tc.maxDisconnect != nil {
|
|
|
|
reconciler.now = time.Now().Add(*tc.maxDisconnect * 20)
|
|
|
|
}
|
|
|
|
|
|
|
|
results := reconciler.Compute()
|
2022-04-06 13:33:32 +00:00
|
|
|
assertResults(t, results, tc.expected)
|
2022-03-31 15:32:18 +00:00
|
|
|
|
|
|
|
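// Verify that the stopped allocations belong to the expected node and job version.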
for _, stopResult := range results.stop {
|
2023-03-24 23:38:31 +00:00
|
|
|
// Skip replacement allocs.
|
|
|
|
if !origAllocs.Contains(stopResult.alloc.ID) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2022-03-31 15:32:18 +00:00
|
|
|
if tc.shouldStopOnDisconnectedNode {
|
|
|
|
require.Equal(t, testNode.ID, stopResult.alloc.NodeID)
|
|
|
|
} else {
|
|
|
|
require.NotEqual(t, testNode.ID, stopResult.alloc.NodeID)
|
|
|
|
}
|
|
|
|
|
|
|
|
require.Equal(t, job.Version, stopResult.alloc.Job.Version)
|
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
2022-02-16 18:50:20 +00:00
|
|
|
}
|
2022-04-21 14:05:58 +00:00
|
|
|
|
|
|
|
// Tests that a client disconnect while a canary is in progress generates the expected results.
|
|
|
|
func TestReconciler_Client_Disconnect_Canaries(t *testing.T) {
|
|
|
|
|
2023-10-27 15:20:53 +00:00
|
|
|
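// AllocState entry recording the transition to client status "unknown" at disconnect time.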
disconnectAllocState := []*structs.AllocState{{
|
|
|
|
Field: structs.AllocStateFieldClientStatus,
|
|
|
|
Value: structs.AllocClientStatusUnknown,
|
|
|
|
Time: time.Now(),
|
|
|
|
}}
|
|
|
|
|
2022-04-21 14:05:58 +00:00
|
|
|
type testCase struct {
|
|
|
|
name string
|
|
|
|
nodes []string
|
|
|
|
deploymentState *structs.DeploymentState
|
|
|
|
deployedAllocs map[*structs.Node][]*structs.Allocation
|
|
|
|
canaryAllocs map[*structs.Node][]*structs.Allocation
|
|
|
|
expectedResult *resultExpectation
|
|
|
|
}
|
|
|
|
|
|
|
|
running := structs.AllocClientStatusRunning
|
|
|
|
complete := structs.AllocClientStatusComplete
|
|
|
|
unknown := structs.AllocClientStatusUnknown
|
|
|
|
pending := structs.AllocClientStatusPending
|
|
|
|
run := structs.AllocDesiredStatusRun
|
|
|
|
stop := structs.AllocDesiredStatusStop
|
|
|
|
|
|
|
|
maxClientDisconnect := 10 * time.Minute
|
|
|
|
|
|
|
|
readyNode := mock.Node()
|
|
|
|
readyNode.Name = "ready-" + readyNode.ID
|
|
|
|
readyNode.Status = structs.NodeStatusReady
|
|
|
|
|
|
|
|
disconnectedNode := mock.Node()
|
|
|
|
disconnectedNode.Name = "disconnected-" + disconnectedNode.ID
|
|
|
|
disconnectedNode.Status = structs.NodeStatusDisconnected
|
|
|
|
|
|
|
|
// Job with allocations and max_client_disconnect
|
|
|
|
job := mock.Job()
|
|
|
|
|
|
|
|
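// Copy the job and bump its version to represent the update being deployed.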
updatedJob := job.Copy()
|
|
|
|
updatedJob.Version = updatedJob.Version + 1
|
|
|
|
|
|
|
|
	testCases := []testCase{
		{
			name: "3-placed-1-disconnect",
			deploymentState: &structs.DeploymentState{
				AutoRevert:        false,
				AutoPromote:       false,
				Promoted:          false,
				ProgressDeadline:  5 * time.Minute,
				RequireProgressBy: time.Now().Add(5 * time.Minute),
				PlacedCanaries:    []string{},
				DesiredCanaries:   1,
				DesiredTotal:      6,
				PlacedAllocs:      3,
				HealthyAllocs:     2,
				UnhealthyAllocs:   0,
			},
			deployedAllocs: map[*structs.Node][]*structs.Allocation{
				readyNode: {
					// filtered as terminal
					{Name: "my-job.web[0]", ClientStatus: complete, DesiredStatus: stop},
					// Ignored
					{Name: "my-job.web[2]", ClientStatus: running, DesiredStatus: stop},
					// destructive, but discarded because canarying
					{Name: "my-job.web[4]", ClientStatus: running, DesiredStatus: run},
				},
				disconnectedNode: {
					// filtered as terminal
					{Name: "my-job.web[1]", ClientStatus: complete, DesiredStatus: stop},
					// Gets a placement, and a disconnect update
					{Name: "my-job.web[3]", ClientStatus: running, DesiredStatus: run},
					// Gets a placement, and a disconnect update
					{Name: "my-job.web[5]", ClientStatus: running, DesiredStatus: run},
				},
			},
			canaryAllocs: map[*structs.Node][]*structs.Allocation{
				readyNode: {
					// Ignored
					{Name: "my-job.web[0]", ClientStatus: running, DesiredStatus: run},
					// Ignored
					{Name: "my-job.web[2]", ClientStatus: pending, DesiredStatus: run},
				},
				disconnectedNode: {
					// Gets a placement, and a disconnect update
					{Name: "my-job.web[1]", ClientStatus: running, DesiredStatus: run},
				},
			},
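			// Three non-terminal allocs sit on the disconnected node (two from
			// the original job plus one canary); each gets a replacement
			// placement and a disconnect update, everything else is ignored.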
			expectedResult: &resultExpectation{
				createDeployment:  nil,
				deploymentUpdates: nil,
				place:             3,
				destructive:       0,
				stop:              0,
				inplace:           0,
				attributeUpdates:  0,
				disconnectUpdates: 3,
				reconnectUpdates:  0,
				desiredTGUpdates: map[string]*structs.DesiredUpdates{
					updatedJob.TaskGroups[0].Name: {
						Place:  3,
						Canary: 0,
						Ignore: 6,
					},
				},
			},
		},
		{
			name: "ignore-unknown",
			deploymentState: &structs.DeploymentState{
				AutoRevert:        false,
				AutoPromote:       false,
				Promoted:          false,
				ProgressDeadline:  5 * time.Minute,
				RequireProgressBy: time.Now().Add(5 * time.Minute),
				PlacedCanaries:    []string{},
				DesiredCanaries:   1,
				DesiredTotal:      6,
				PlacedAllocs:      3,
				HealthyAllocs:     2,
				UnhealthyAllocs:   0,
			},
			deployedAllocs: map[*structs.Node][]*structs.Allocation{
				readyNode: {
					// filtered as terminal
					{Name: "my-job.web[0]", ClientStatus: complete, DesiredStatus: stop},
					// Ignored
					{Name: "my-job.web[2]", ClientStatus: running, DesiredStatus: stop},
					// destructive, but discarded because canarying
					{Name: "my-job.web[4]", ClientStatus: running, DesiredStatus: run},
				},
				disconnectedNode: {
					// filtered as terminal
					{Name: "my-job.web[1]", ClientStatus: complete, DesiredStatus: stop},
					// Gets a placement, and a disconnect update
					{Name: "my-job.web[3]", ClientStatus: running, DesiredStatus: run},
					// Gets a placement, and a disconnect update
					{Name: "my-job.web[5]", ClientStatus: running, DesiredStatus: run},
				},
			},
			canaryAllocs: map[*structs.Node][]*structs.Allocation{
				readyNode: {
					// Ignored
					{Name: "my-job.web[0]", ClientStatus: running, DesiredStatus: run},
					// Ignored
					{Name: "my-job.web[2]", ClientStatus: pending, DesiredStatus: run},
				},
				disconnectedNode: {
					// Ignored
					{Name: "my-job.web[1]", ClientStatus: unknown, DesiredStatus: run},
				},
			},
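			// The canary on the disconnected node is already in client status
			// "unknown", so it is ignored rather than updated again; only the
			// two running allocs from the original job are replaced.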
			expectedResult: &resultExpectation{
				createDeployment:  nil,
				deploymentUpdates: nil,
				place:             2,
				destructive:       0,
				stop:              0,
				inplace:           0,
				attributeUpdates:  0,
				disconnectUpdates: 2,
				reconnectUpdates:  0,
				desiredTGUpdates: map[string]*structs.DesiredUpdates{
					updatedJob.TaskGroups[0].Name: {
						Place:  2,
						Canary: 0,
						Ignore: 7,
					},
				},
			},
		},
		{
			name: "4-placed-2-pending-lost",
			deploymentState: &structs.DeploymentState{
				AutoRevert:        false,
				AutoPromote:       false,
				Promoted:          false,
				ProgressDeadline:  5 * time.Minute,
				RequireProgressBy: time.Now().Add(5 * time.Minute),
				PlacedCanaries:    []string{},
				DesiredCanaries:   2,
				DesiredTotal:      6,
				PlacedAllocs:      4,
				HealthyAllocs:     2,
				UnhealthyAllocs:   0,
			},
			deployedAllocs: map[*structs.Node][]*structs.Allocation{
				readyNode: {
					// filtered as terminal
					{Name: "my-job.web[0]", ClientStatus: complete, DesiredStatus: stop},
					// filtered as terminal
					{Name: "my-job.web[2]", ClientStatus: complete, DesiredStatus: stop},
					// destructive, but discarded because canarying
					{Name: "my-job.web[4]", ClientStatus: running, DesiredStatus: run},
				},
				disconnectedNode: {
					// filtered as terminal
					{Name: "my-job.web[1]", ClientStatus: complete, DesiredStatus: stop},
					// Gets a placement, and a disconnect update
					{Name: "my-job.web[3]", ClientStatus: running, DesiredStatus: run},
					// Gets a placement, and a disconnect update
					{Name: "my-job.web[5]", ClientStatus: running, DesiredStatus: run},
				},
			},
			canaryAllocs: map[*structs.Node][]*structs.Allocation{
				readyNode: {
					// Ignored
					{Name: "my-job.web[0]", ClientStatus: running, DesiredStatus: run},
					// Ignored
					{Name: "my-job.web[2]", ClientStatus: running, DesiredStatus: run},
				},
				disconnectedNode: {
					// Stop/Lost because pending
					{Name: "my-job.web[1]", ClientStatus: pending, DesiredStatus: run},
					// Stop/Lost because pending
					{Name: "my-job.web[3]", ClientStatus: pending, DesiredStatus: run},
				},
			},
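			// Pending canaries on the disconnected node are stopped as lost
			// rather than marked unknown, while the two running original-job
			// allocs still get replacements and disconnect updates.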
			expectedResult: &resultExpectation{
				createDeployment:  nil,
				deploymentUpdates: nil,
				place:             2,
				destructive:       0,
				stop:              2,
				inplace:           0,
				attributeUpdates:  0,
				disconnectUpdates: 2,
				reconnectUpdates:  0,
				desiredTGUpdates: map[string]*structs.DesiredUpdates{
					updatedJob.TaskGroups[0].Name: {
						Place:  2,
						Canary: 0,
						Ignore: 6,
						// The 2 stops in this test are transient failures, but
						// the deployment can still progress. We don't include
						// them in the stop count since DesiredTGUpdates is used
						// to report deployment progress or final deployment state.
						Stop: 0,
					},
				},
			},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// Set the count dynamically to the number of allocs in the original deployment.
			job.TaskGroups[0].Count = len(tc.deployedAllocs[readyNode]) + len(tc.deployedAllocs[disconnectedNode])
			job.TaskGroups[0].MaxClientDisconnect = &maxClientDisconnect
			job.TaskGroups[0].Update = &structs.UpdateStrategy{
				MaxParallel:     1,
				Canary:          tc.deploymentState.DesiredCanaries,
				MinHealthyTime:  3 * time.Second,
				HealthyDeadline: 20 * time.Second,
				AutoRevert:      true,
				AutoPromote:     true,
			}

			updatedJob.TaskGroups[0].Count = len(tc.deployedAllocs[readyNode]) + len(tc.deployedAllocs[disconnectedNode])
			updatedJob.TaskGroups[0].MaxClientDisconnect = &maxClientDisconnect
			updatedJob.TaskGroups[0].Update = &structs.UpdateStrategy{
				MaxParallel:     1,
				Canary:          tc.deploymentState.DesiredCanaries,
				MinHealthyTime:  3 * time.Second,
				HealthyDeadline: 20 * time.Second,
				AutoRevert:      true,
				AutoPromote:     true,
			}

			// Populate alloc IDs, node IDs, and the job on the deployed allocs
			allocsConfigured := 0
			for node, allocs := range tc.deployedAllocs {
				for _, alloc := range allocs {
					alloc.ID = uuid.Generate()
					alloc.NodeID = node.ID
					alloc.NodeName = node.Name
					alloc.JobID = job.ID
					alloc.Job = job
					alloc.TaskGroup = job.TaskGroups[0].Name
					allocsConfigured++
				}
			}

			require.Equal(t, tc.deploymentState.DesiredTotal, allocsConfigured, "invalid alloc configuration: expect %d got %d", tc.deploymentState.DesiredTotal, allocsConfigured)
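
			// The canaries below belong to updatedJob; each is registered in
			// the handled map as an ignore update, so the mock update function
			// treats only the original job's allocs as destructive updates.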

			// Populate alloc IDs, node IDs, and the job on the canaries
			canariesConfigured := 0
			handled := make(map[string]allocUpdateType)
			for node, allocs := range tc.canaryAllocs {
				for _, alloc := range allocs {
					alloc.ID = uuid.Generate()
					alloc.NodeID = node.ID
					alloc.NodeName = node.Name
					alloc.JobID = updatedJob.ID
					alloc.Job = updatedJob
					alloc.TaskGroup = updatedJob.TaskGroups[0].Name
					alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
						Canary: true,
					}
					if alloc.ClientStatus == structs.AllocClientStatusRunning {
						alloc.DeploymentStatus.Healthy = pointer.Of(true)
					}

					if alloc.ClientStatus == structs.AllocClientStatusUnknown {
						alloc.AllocStates = disconnectAllocState
						alloc.FollowupEvalID = "eval-where-it-was-set-to-unknown"
					}

					tc.deploymentState.PlacedCanaries = append(tc.deploymentState.PlacedCanaries, alloc.ID)
					handled[alloc.ID] = allocUpdateFnIgnore
					canariesConfigured++
				}
			}

			// Validate tc.canaryAllocs against tc.deploymentState
			require.Equal(t, tc.deploymentState.PlacedAllocs, canariesConfigured, "invalid canary configuration: expect %d got %d", tc.deploymentState.PlacedAllocs, canariesConfigured)
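
			// The deployment is built for updatedJob at eval priority 50 and
			// wired to the per-case DeploymentState so the canary accounting
			// matches what each case declares.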
			deployment := structs.NewDeployment(updatedJob, 50)
			deployment.TaskGroups[updatedJob.TaskGroups[0].Name] = tc.deploymentState

			// Build a map of tainted nodes that contains the disconnected node
			tainted := make(map[string]*structs.Node, 1)
			tainted[disconnectedNode.ID] = disconnectedNode

			allocs := make([]*structs.Allocation, 0)

			allocs = append(allocs, tc.deployedAllocs[readyNode]...)
			allocs = append(allocs, tc.deployedAllocs[disconnectedNode]...)
			allocs = append(allocs, tc.canaryAllocs[readyNode]...)
			allocs = append(allocs, tc.canaryAllocs[disconnectedNode]...)
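
			// The reconciler runs as a service (non-batch) job with the
			// disconnected node passed in as tainted and with support for
			// disconnected clients enabled via the trailing boolean.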
			mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive)
			reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, updatedJob.ID, updatedJob,
				deployment, allocs, tainted, "", 50, true)
			result := reconciler.Compute()

			// Assert the correct results
			assertResults(t, result, tc.expectedResult)

			// Validate that each placement is either for a placed canary of
			// the updated job or for a disconnected alloc of the original job,
			// and that it comes with a matching disconnect update.
			for _, placeResult := range result.place {
				found := false
				require.NotNil(t, placeResult.previousAlloc)
				for _, deployed := range tc.deployedAllocs[disconnectedNode] {
					if deployed.ID == placeResult.previousAlloc.ID {
						found = true
						require.Equal(t, job.Version, placeResult.previousAlloc.Job.Version)
						require.Equal(t, disconnectedNode.ID, placeResult.previousAlloc.NodeID)
						_, exists := result.disconnectUpdates[placeResult.previousAlloc.ID]
						require.True(t, exists)
						break
					}
				}
				for _, canary := range tc.canaryAllocs[disconnectedNode] {
					if canary.ID == placeResult.previousAlloc.ID {
						found = true
						require.Equal(t, updatedJob.Version, placeResult.previousAlloc.Job.Version)
						require.Equal(t, disconnectedNode.ID, placeResult.previousAlloc.NodeID)
						_, exists := result.disconnectUpdates[placeResult.previousAlloc.ID]
						require.True(t, exists)
						break
					}
				}
				require.True(t, found)
			}

			// Validate that stops are for pending disconnects
			for _, stopResult := range result.stop {
				require.Equal(t, pending, stopResult.alloc.ClientStatus)
			}
		})
	}
}

// Tests that the reconciler properly handles the computeDeploymentPaused
// logic for various job types.
func TestReconciler_ComputeDeploymentPaused(t *testing.T) {
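	// Deployments are only created for the multiregion service cases below;
	// every other case hands the reconciler a nil deployment, which must not
	// be reported as paused.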
	ci.Parallel(t)

	type testCase struct {
		name            string
		jobType         string
		isMultiregion   bool
		isPeriodic      bool
		isParameterized bool
		expected        bool
	}

	multiregionCfg := mock.MultiregionJob().Multiregion
	periodicCfg := mock.PeriodicJob().Periodic
	parameterizedCfg := &structs.ParameterizedJobConfig{
		Payload: structs.DispatchPayloadRequired,
	}
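
	// Batch jobs (including periodic and parameterized ones) don't use
	// deployments, so none of the batch cases below ever create one and they
	// can never be paused.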

	testCases := []testCase{
		{
			name:            "single region service is not paused",
			jobType:         structs.JobTypeService,
			isMultiregion:   false,
			isPeriodic:      false,
			isParameterized: false,
			expected:        false,
		},
		{
			name:            "multiregion service is paused",
			jobType:         structs.JobTypeService,
			isMultiregion:   true,
			isPeriodic:      false,
			isParameterized: false,
			expected:        true,
		},
		{
			name:            "single region batch job is not paused",
			jobType:         structs.JobTypeBatch,
			isMultiregion:   false,
			isPeriodic:      false,
			isParameterized: false,
			expected:        false,
		},
		{
			name:            "multiregion batch job is not paused",
			jobType:         structs.JobTypeBatch,
			isMultiregion:   true,
			isPeriodic:      false,
			isParameterized: false,
			expected:        false,
		},
		{
			name:            "multiregion parameterized batch is not paused",
			jobType:         structs.JobTypeBatch,
			isMultiregion:   true,
			isPeriodic:      false,
			isParameterized: true,
			expected:        false,
		},
		{
			name:            "multiregion periodic batch is not paused",
			jobType:         structs.JobTypeBatch,
			isMultiregion:   true,
			isPeriodic:      true,
			isParameterized: false,
			expected:        false,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			var job *structs.Job

			if tc.jobType == structs.JobTypeService {
				job = mock.Job()
			} else if tc.jobType == structs.JobTypeBatch {
				job = mock.BatchJob()
			}

			require.NotNil(t, job, "invalid job type", tc.jobType)
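
			// A multiregion deployment is created in an initializing/pending
			// state and is only unblocked once the peer regions are ready, so
			// the reconciler should report it as paused until then.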
			var deployment *structs.Deployment
			if tc.isMultiregion {
				job.Multiregion = multiregionCfg

				// This deployment is created by the Job.Register RPC and
				// fetched by the scheduler before handing it to the
				// reconciler.
				if job.UsesDeployments() {
					deployment = structs.NewDeployment(job, 100)
					deployment.Status = structs.DeploymentStatusInitializing
					deployment.StatusDescription = structs.DeploymentStatusDescriptionPendingForPeer
				}
			}

			if tc.isPeriodic {
				job.Periodic = periodicCfg
			}

			if tc.isParameterized {
				job.ParameterizedJob = parameterizedCfg
			}

			reconciler := NewAllocReconciler(
				testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, deployment,
				nil, nil, "", job.Priority, true)

			_ = reconciler.Compute()
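
			// Compute is run only for its side effects here; the assertion
			// below checks the deploymentPaused flag it derives from the
			// deployment's status.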
			require.Equal(t, tc.expected, reconciler.deploymentPaused)
		})
	}
}