open-nomad/scheduler/reconcile_util_test.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package scheduler
import (
"testing"
"time"
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/helper/pointer"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/shoenig/test/must"
)
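// TestAllocSet_filterByTainted exercises allocSet.filterByTainted, which
// splits an alloc set into untainted, migrate, lost, disconnecting,
// reconnecting, and ignore sets based on the supplied tainted-node map and
// whether disconnected clients are supported.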
func TestAllocSet_filterByTainted(t *testing.T) {
ci.Parallel(t)
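// Fixture nodes covering the taint states exercised below: draining, down
// (lost), missing from state (nil), ready, and disconnected.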
nodes := map[string]*structs.Node{
"draining": {
ID: "draining",
DrainStrategy: mock.DrainNode().DrainStrategy,
},
"lost": {
ID: "lost",
Status: structs.NodeStatusDown,
},
"nil": nil,
"normal": {
ID: "normal",
Status: structs.NodeStatusReady,
},
"disconnected": {
ID: "disconnected",
Status: structs.NodeStatusDisconnected,
},
}
testJob := mock.Job()
testJob.TaskGroups[0].MaxClientDisconnect = pointer.Of(5 * time.Second)
now := time.Now()
testJobNoMaxDisconnect := mock.Job()
testJobNoMaxDisconnect.TaskGroups[0].MaxClientDisconnect = nil
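// unknownAllocState records a client-status transition to unknown at "now".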
unknownAllocState := []*structs.AllocState{{
Field: structs.AllocStateFieldClientStatus,
Value: structs.AllocClientStatusUnknown,
Time: now,
}}
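// expiredAllocState records the unknown transition 60s in the past, beyond the
// 5s max_client_disconnect configured on testJob, so the alloc counts as expired.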
expiredAllocState := []*structs.AllocState{{
Field: structs.AllocStateFieldClientStatus,
Value: structs.AllocClientStatusUnknown,
Time: now.Add(-60 * time.Second),
}}
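// reconnectedAllocState records an unknown transition followed by a running
// transition, i.e. the client has already reconnected.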
reconnectedAllocState := []*structs.AllocState{
{
Field: structs.AllocStateFieldClientStatus,
Value: structs.AllocClientStatusUnknown,
Time: now.Add(-time.Second),
},
{
Field: structs.AllocStateFieldClientStatus,
Value: structs.AllocClientStatusRunning,
Time: now,
},
}
type testCase struct {
name string
all allocSet
taintedNodes map[string]*structs.Node
supportsDisconnectedClients bool
skipNilNodeTest bool
now time.Time
// expected results
untainted allocSet
migrate allocSet
lost allocSet
disconnecting allocSet
reconnecting allocSet
ignore allocSet
}
testCases := []testCase{
// These two cases test that we maintain parity with pre-disconnected-clients behavior.
{
name: "lost-client",
supportsDisconnectedClients: false,
now: time.Now(),
taintedNodes: nodes,
skipNilNodeTest: false,
all: allocSet{
"untainted1": {
ID: "untainted1",
ClientStatus: structs.AllocClientStatusRunning,
Job: testJob,
NodeID: "normal",
},
// Terminal allocs are always untainted
"untainted2": {
ID: "untainted2",
ClientStatus: structs.AllocClientStatusComplete,
Job: testJob,
NodeID: "normal",
},
// Terminal allocs are always untainted, even on draining nodes
"untainted3": {
ID: "untainted3",
ClientStatus: structs.AllocClientStatusComplete,
Job: testJob,
NodeID: "draining",
},
// Terminal allocs are always untainted, even on lost nodes
"untainted4": {
ID: "untainted4",
ClientStatus: structs.AllocClientStatusComplete,
Job: testJob,
NodeID: "lost",
},
// Non-terminal alloc with migrate=true should migrate on a draining node
"migrating1": {
ID: "migrating1",
ClientStatus: structs.AllocClientStatusRunning,
DesiredTransition: structs.DesiredTransition{Migrate: pointer.Of(true)},
Job: testJob,
NodeID: "draining",
},
// Non-terminal alloc with migrate=true should migrate on an unknown node
"migrating2": {
ID: "migrating2",
ClientStatus: structs.AllocClientStatusRunning,
DesiredTransition: structs.DesiredTransition{Migrate: pointer.Of(true)},
Job: testJob,
NodeID: "nil",
},
},
untainted: allocSet{
"untainted1": {
ID: "untainted1",
ClientStatus: structs.AllocClientStatusRunning,
Job: testJob,
NodeID: "normal",
},
// Terminal allocs are always untainted
"untainted2": {
ID: "untainted2",
ClientStatus: structs.AllocClientStatusComplete,
Job: testJob,
NodeID: "normal",
},
// Terminal allocs are always untainted, even on draining nodes
"untainted3": {
ID: "untainted3",
ClientStatus: structs.AllocClientStatusComplete,
Job: testJob,
NodeID: "draining",
},
// Terminal allocs are always untainted, even on lost nodes
"untainted4": {
ID: "untainted4",
ClientStatus: structs.AllocClientStatusComplete,
Job: testJob,
NodeID: "lost",
},
},
migrate: allocSet{
// Non-terminal alloc with migrate=true should migrate on a draining node
"migrating1": {
ID: "migrating1",
ClientStatus: structs.AllocClientStatusRunning,
DesiredTransition: structs.DesiredTransition{Migrate: pointer.Of(true)},
Job: testJob,
NodeID: "draining",
},
// Non-terminal alloc with migrate=true should migrate on an unknown node
"migrating2": {
ID: "migrating2",
ClientStatus: structs.AllocClientStatusRunning,
DesiredTransition: structs.DesiredTransition{Migrate: pointer.Of(true)},
Job: testJob,
NodeID: "nil",
},
},
disconnecting: allocSet{},
reconnecting: allocSet{},
ignore: allocSet{},
lost: allocSet{},
},
{
name: "lost-client-only-tainted-nodes",
supportsDisconnectedClients: false,
now: time.Now(),
taintedNodes: nodes,
// The logic associated with this test case can only trigger if there
// is a tainted node. Therefore, testing with a nil node set produces
// false failures, so don't perform that test in this case.
skipNilNodeTest: true,
all: allocSet{
// Non-terminal allocs on lost nodes are lost
"lost1": {
ID: "lost1",
ClientStatus: structs.AllocClientStatusPending,
Job: testJob,
NodeID: "lost",
},
// Non-terminal allocs on lost nodes are lost
"lost2": {
ID: "lost2",
ClientStatus: structs.AllocClientStatusRunning,
Job: testJob,
NodeID: "lost",
},
},
untainted: allocSet{},
migrate: allocSet{},
disconnecting: allocSet{},
reconnecting: allocSet{},
ignore: allocSet{},
lost: allocSet{
// Non-terminal allocs on lost nodes are lost
"lost1": {
ID: "lost1",
ClientStatus: structs.AllocClientStatusPending,
Job: testJob,
NodeID: "lost",
},
// Non-terminal allocs on lost nodes are lost
"lost2": {
ID: "lost2",
ClientStatus: structs.AllocClientStatusRunning,
Job: testJob,
NodeID: "lost",
},
},
},
{
name: "disco-client-disconnect-unset-max-disconnect",
supportsDisconnectedClients: true,
now: time.Now(),
taintedNodes: nodes,
skipNilNodeTest: true,
all: allocSet{
// Non-terminal allocs on disconnected nodes w/o max-disconnect are lost
"lost-running": {
ID: "lost-running",
Name: "lost-running",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJobNoMaxDisconnect,
NodeID: "disconnected",
TaskGroup: "web",
},
},
untainted: allocSet{},
migrate: allocSet{},
disconnecting: allocSet{},
reconnecting: allocSet{},
ignore: allocSet{},
lost: allocSet{
"lost-running": {
ID: "lost-running",
Name: "lost-running",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJobNoMaxDisconnect,
NodeID: "disconnected",
TaskGroup: "web",
},
},
},
// Everything below this line tests the disconnected client mode.
{
name: "disco-client-untainted-reconnect-failed-and-replaced",
supportsDisconnectedClients: true,
now: time.Now(),
taintedNodes: nodes,
skipNilNodeTest: false,
all: allocSet{
"running-replacement": {
ID: "running-replacement",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
PreviousAllocation: "failed-original",
},
// Failed and replaced allocs on reconnected nodes
// that are still desired-running are tracked as
// reconnecting so we can stop them
"failed-original": {
ID: "failed-original",
Name: "web",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
},
untainted: allocSet{
"running-replacement": {
ID: "running-replacement",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
PreviousAllocation: "failed-original",
},
},
migrate: allocSet{},
disconnecting: allocSet{},
reconnecting: allocSet{
"failed-original": {
ID: "failed-original",
Name: "web",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
},
ignore: allocSet{},
lost: allocSet{},
},
{
name: "disco-client-reconnecting-running-no-replacement",
supportsDisconnectedClients: true,
now: time.Now(),
taintedNodes: nodes,
skipNilNodeTest: false,
all: allocSet{
// Running allocs on reconnected nodes with no replacement are reconnecting.
// Node.UpdateStatus has already handled syncing client state so this
// should be a noop.
"reconnecting-running-no-replacement": {
ID: "reconnecting-running-no-replacement",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
},
untainted: allocSet{},
migrate: allocSet{},
disconnecting: allocSet{},
reconnecting: allocSet{
"reconnecting-running-no-replacement": {
ID: "reconnecting-running-no-replacement",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
},
ignore: allocSet{},
lost: allocSet{},
},
{
name: "disco-client-terminal",
supportsDisconnectedClients: true,
now: time.Now(),
taintedNodes: nodes,
skipNilNodeTest: false,
all: allocSet{
// Allocs on reconnected nodes that are complete are ignored
"ignored-reconnect-complete": {
ID: "ignored-reconnect-complete",
Name: "ignored-reconnect-complete",
ClientStatus: structs.AllocClientStatusComplete,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
// Failed allocs on reconnected nodes are placed in reconnecting so that
// they can be marked with desired status stop at the server.
"reconnecting-failed": {
ID: "reconnecting-failed",
Name: "reconnecting-failed",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
// Lost allocs on reconnected nodes don't get restarted
"ignored-reconnect-lost": {
ID: "ignored-reconnect-lost",
Name: "ignored-reconnect-lost",
ClientStatus: structs.AllocClientStatusLost,
DesiredStatus: structs.AllocDesiredStatusStop,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
// Replacement allocs that are complete are ignored
"ignored-reconnect-complete-replacement": {
ID: "ignored-reconnect-complete-replacement",
Name: "ignored-reconnect-complete",
ClientStatus: structs.AllocClientStatusComplete,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
PreviousAllocation: "untainted-reconnect-complete",
},
// Replacement allocs on reconnected nodes that are failed are ignored
"ignored-reconnect-failed-replacement": {
ID: "ignored-reconnect-failed-replacement",
Name: "ignored-reconnect-failed",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusStop,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
PreviousAllocation: "reconnecting-failed",
},
// Lost replacement allocs on reconnected nodes don't get restarted
"ignored-reconnect-lost-replacement": {
ID: "ignored-reconnect-lost-replacement",
Name: "ignored-reconnect-lost",
ClientStatus: structs.AllocClientStatusLost,
DesiredStatus: structs.AllocDesiredStatusStop,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
PreviousAllocation: "untainted-reconnect-lost",
},
},
untainted: allocSet{},
migrate: allocSet{},
disconnecting: allocSet{},
reconnecting: allocSet{
"reconnecting-failed": {
ID: "reconnecting-failed",
Name: "reconnecting-failed",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
},
ignore: allocSet{
"ignored-reconnect-complete": {
ID: "ignored-reconnect-complete",
Name: "ignored-reconnect-complete",
ClientStatus: structs.AllocClientStatusComplete,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
"ignored-reconnect-lost": {
ID: "ignored-reconnect-lost",
Name: "ignored-reconnect-lost",
ClientStatus: structs.AllocClientStatusLost,
DesiredStatus: structs.AllocDesiredStatusStop,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
"ignored-reconnect-complete-replacement": {
ID: "ignored-reconnect-complete-replacement",
Name: "ignored-reconnect-complete",
ClientStatus: structs.AllocClientStatusComplete,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
PreviousAllocation: "untainted-reconnect-complete",
},
"ignored-reconnect-failed-replacement": {
ID: "ignored-reconnect-failed-replacement",
Name: "ignored-reconnect-failed",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusStop,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
PreviousAllocation: "reconnecting-failed",
},
"ignored-reconnect-lost-replacement": {
ID: "ignored-reconnect-lost-replacement",
Name: "ignored-reconnect-lost",
ClientStatus: structs.AllocClientStatusLost,
DesiredStatus: structs.AllocDesiredStatusStop,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
PreviousAllocation: "untainted-reconnect-lost",
},
},
lost: allocSet{},
},
{
name: "disco-client-disconnect",
supportsDisconnectedClients: true,
now: time.Now(),
taintedNodes: nodes,
skipNilNodeTest: true,
all: allocSet{
// Non-terminal allocs on disconnected nodes are disconnecting
"disconnect-running": {
ID: "disconnect-running",
Name: "disconnect-running",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
},
// Unknown allocs on disconnected nodes are acknowledged, so they won't be rescheduled again
"untainted-unknown": {
ID: "untainted-unknown",
Name: "untainted-unknown",
ClientStatus: structs.AllocClientStatusUnknown,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
// Unknown allocs on disconnected nodes are lost when expired
"lost-unknown": {
ID: "lost-unknown",
Name: "lost-unknown",
ClientStatus: structs.AllocClientStatusUnknown,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
AllocStates: expiredAllocState,
},
// Pending allocs on disconnected nodes are lost
"lost-pending": {
ID: "lost-pending",
Name: "lost-pending",
ClientStatus: structs.AllocClientStatusPending,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
},
// Expired allocs on reconnected clients are lost
"lost-expired": {
ID: "lost-expired",
Name: "lost-expired",
ClientStatus: structs.AllocClientStatusUnknown,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: expiredAllocState,
},
// Failed and stopped allocs on disconnected nodes are ignored
"ignore-reconnected-failed-stopped": {
ID: "ignore-reconnected-failed-stopped",
Name: "ignore-reconnected-failed-stopped",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusStop,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
},
untainted: allocSet{
// Unknown allocs on disconnected nodes are acknowledged, so they won't be rescheduled again
"untainted-unknown": {
ID: "untainted-unknown",
Name: "untainted-unknown",
ClientStatus: structs.AllocClientStatusUnknown,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
},
migrate: allocSet{},
disconnecting: allocSet{
"disconnect-running": {
ID: "disconnect-running",
Name: "disconnect-running",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
},
},
reconnecting: allocSet{},
ignore: allocSet{
"ignore-reconnected-failed-stopped": {
ID: "ignore-reconnected-failed-stopped",
Name: "ignore-reconnected-failed-stopped",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusStop,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
},
lost: allocSet{
"lost-unknown": {
ID: "lost-unknown",
Name: "lost-unknown",
ClientStatus: structs.AllocClientStatusUnknown,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
AllocStates: expiredAllocState,
},
"lost-pending": {
ID: "lost-pending",
Name: "lost-pending",
ClientStatus: structs.AllocClientStatusPending,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
},
"lost-expired": {
ID: "lost-expired",
Name: "lost-expired",
ClientStatus: structs.AllocClientStatusUnknown,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: expiredAllocState,
},
},
},
{
name: "disco-client-reconnect",
supportsDisconnectedClients: true,
now: time.Now(),
taintedNodes: nodes,
skipNilNodeTest: false,
all: allocSet{
// Expired allocs on reconnected clients are lost
"lost-expired-reconnect": {
ID: "lost-expired-reconnect",
Name: "lost-expired-reconnect",
ClientStatus: structs.AllocClientStatusUnknown,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: expiredAllocState,
},
},
untainted: allocSet{},
migrate: allocSet{},
disconnecting: allocSet{},
reconnecting: allocSet{},
ignore: allocSet{},
lost: allocSet{
"lost-expired-reconnect": {
ID: "lost-expired-reconnect",
Name: "lost-expired-reconnect",
ClientStatus: structs.AllocClientStatusUnknown,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: expiredAllocState,
},
},
},
{
name: "disco-client-running-reconnecting-and-replacement-untainted",
supportsDisconnectedClients: true,
now: time.Now(),
taintedNodes: nodes,
skipNilNodeTest: false,
all: allocSet{
"running-replacement": {
ID: "running-replacement",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
PreviousAllocation: "running-original",
},
// Running and replaced allocs on reconnected nodes are reconnecting
"running-original": {
ID: "running-original",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
},
untainted: allocSet{
"running-replacement": {
ID: "running-replacement",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
PreviousAllocation: "running-original",
},
},
migrate: allocSet{},
disconnecting: allocSet{},
reconnecting: allocSet{
"running-original": {
ID: "running-original",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
},
ignore: allocSet{},
lost: allocSet{},
},
{
// After an alloc is reconnected, it should be considered
// "untainted" instead of "reconnecting" to allow changes such as
// job updates to be applied properly.
name: "disco-client-reconnected-alloc-untainted",
supportsDisconnectedClients: true,
now: time.Now(),
taintedNodes: nodes,
skipNilNodeTest: false,
all: allocSet{
"running-reconnected": {
ID: "running-reconnected",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: reconnectedAllocState,
},
},
untainted: allocSet{
"running-reconnected": {
ID: "running-reconnected",
Name: "web",
ClientStatus: structs.AllocClientStatusRunning,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: reconnectedAllocState,
},
},
migrate: allocSet{},
disconnecting: allocSet{},
reconnecting: allocSet{},
ignore: allocSet{},
lost: allocSet{},
},
}
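// Run each case twice: first against the tainted node map, then (unless the
// case opts out via skipNilNodeTest) against a nil node map.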
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
// With tainted nodes
untainted, migrate, lost, disconnecting, reconnecting, ignore := tc.all.filterByTainted(tc.taintedNodes, tc.supportsDisconnectedClients, tc.now)
must.Eq(t, tc.untainted, untainted, must.Sprintf("with-nodes: untainted"))
must.Eq(t, tc.migrate, migrate, must.Sprintf("with-nodes: migrate"))
must.Eq(t, tc.lost, lost, must.Sprintf("with-nodes: lost"))
must.Eq(t, tc.disconnecting, disconnecting, must.Sprintf("with-nodes: disconnecting"))
must.Eq(t, tc.reconnecting, reconnecting, must.Sprintf("with-nodes: reconnecting"))
must.Eq(t, tc.ignore, ignore, must.Sprintf("with-nodes: ignore"))
if tc.skipNilNodeTest {
return
}
// Now again with nodes nil
untainted, migrate, lost, disconnecting, reconnecting, ignore = tc.all.filterByTainted(nil, tc.supportsDisconnectedClients, tc.now)
must.Eq(t, tc.untainted, untainted, must.Sprintf("with-nodes: untainted"))
must.Eq(t, tc.migrate, migrate, must.Sprintf("with-nodes: migrate"))
must.Eq(t, tc.lost, lost, must.Sprintf("with-nodes: lost"))
must.Eq(t, tc.disconnecting, disconnecting, must.Sprintf("with-nodes: disconnecting"))
must.Eq(t, tc.reconnecting, reconnecting, must.Sprintf("with-nodes: reconnecting"))
must.Eq(t, tc.ignore, ignore, must.Sprintf("with-nodes: ignore"))
})
}
}
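// TestReconcile_shouldFilter exercises shouldFilter, which decides whether an
// allocation should be kept as untainted or ignored, based on whether the job
// is batch and on the alloc's desired and client status.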
func TestReconcile_shouldFilter(t *testing.T) {
testCases := []struct {
description string
batch bool
failed bool
desiredStatus string
clientStatus string
untainted bool
ignore bool
}{
{
description: "batch running",
batch: true,
failed: false,
desiredStatus: structs.AllocDesiredStatusRun,
clientStatus: structs.AllocClientStatusRunning,
untainted: true,
ignore: false,
},
{
description: "batch stopped success",
batch: true,
failed: false,
desiredStatus: structs.AllocDesiredStatusStop,
clientStatus: structs.AllocClientStatusRunning,
untainted: true,
ignore: false,
},
{
description: "batch stopped failed",
batch: true,
failed: true,
desiredStatus: structs.AllocDesiredStatusStop,
clientStatus: structs.AllocClientStatusComplete,
untainted: false,
ignore: true,
},
{
description: "batch evicted",
batch: true,
desiredStatus: structs.AllocDesiredStatusEvict,
clientStatus: structs.AllocClientStatusComplete,
untainted: false,
ignore: true,
},
{
description: "batch failed",
batch: true,
desiredStatus: structs.AllocDesiredStatusRun,
clientStatus: structs.AllocClientStatusFailed,
untainted: false,
ignore: false,
},
{
description: "service running",
batch: false,
failed: false,
desiredStatus: structs.AllocDesiredStatusRun,
clientStatus: structs.AllocClientStatusRunning,
untainted: false,
ignore: false,
},
{
description: "service stopped",
batch: false,
failed: false,
desiredStatus: structs.AllocDesiredStatusStop,
clientStatus: structs.AllocClientStatusComplete,
untainted: false,
ignore: true,
},
{
description: "service evicted",
batch: false,
failed: false,
desiredStatus: structs.AllocDesiredStatusEvict,
clientStatus: structs.AllocClientStatusComplete,
untainted: false,
ignore: true,
},
{
description: "service client complete",
batch: false,
failed: false,
desiredStatus: structs.AllocDesiredStatusRun,
clientStatus: structs.AllocClientStatusComplete,
untainted: false,
ignore: true,
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
alloc := &structs.Allocation{
DesiredStatus: tc.desiredStatus,
TaskStates: map[string]*structs.TaskState{"task": {State: structs.TaskStateDead, Failed: tc.failed}},
ClientStatus: tc.clientStatus,
}
untainted, ignore := shouldFilter(alloc, tc.batch)
must.Eq(t, tc.untainted, untainted)
must.Eq(t, tc.ignore, ignore)
})
}
}
// Test that we properly create the bitmap even when the alloc set includes an
// allocation with a higher count than the current min count and it is byte
// aligned.
// Ensure no regression from: https://github.com/hashicorp/nomad/issues/3008
func TestBitmapFrom(t *testing.T) {
ci.Parallel(t)
input := map[string]*structs.Allocation{
"8": {
JobID: "foo",
TaskGroup: "bar",
Name: "foo.bar[8]",
},
}
b, dups := bitmapFrom(input, 1)
must.Eq(t, 16, b.Size())
must.MapEmpty(t, dups)
b, dups = bitmapFrom(input, 8)
must.Eq(t, 16, b.Size())
must.MapEmpty(t, dups)
}
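// Test_allocNameIndex_Highest checks that Highest returns the names holding
// the highest n allocation indexes, capped at the number of indexes in use.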
func Test_allocNameIndex_Highest(t *testing.T) {
ci.Parallel(t)
testCases := []struct {
name string
inputAllocNameIndex *allocNameIndex
inputN uint
expectedOutput map[string]struct{}
}{
{
name: "select 1",
inputAllocNameIndex: newAllocNameIndex(
"example", "cache", 3, map[string]*structs.Allocation{
"6b255fa3-c2cb-94de-5ddd-41aac25a6851": {
Name: "example.cache[0]",
JobID: "example",
TaskGroup: "cache",
},
"e24771e6-8900-5d2d-ec93-e7076284774a": {
Name: "example.cache[1]",
JobID: "example",
TaskGroup: "cache",
},
"d7842822-32c4-1a1c-bac8-66c3f20dfb0f": {
Name: "example.cache[2]",
JobID: "example",
TaskGroup: "cache",
},
}),
inputN: 1,
expectedOutput: map[string]struct{}{
"example.cache[2]": {},
},
},
{
name: "select all",
inputAllocNameIndex: newAllocNameIndex(
"example", "cache", 3, map[string]*structs.Allocation{
"6b255fa3-c2cb-94de-5ddd-41aac25a6851": {
Name: "example.cache[0]",
JobID: "example",
TaskGroup: "cache",
},
"e24771e6-8900-5d2d-ec93-e7076284774a": {
Name: "example.cache[1]",
JobID: "example",
TaskGroup: "cache",
},
"d7842822-32c4-1a1c-bac8-66c3f20dfb0f": {
Name: "example.cache[2]",
JobID: "example",
TaskGroup: "cache",
},
}),
inputN: 3,
expectedOutput: map[string]struct{}{
"example.cache[2]": {},
"example.cache[1]": {},
"example.cache[0]": {},
},
},
{
name: "select too many",
inputAllocNameIndex: newAllocNameIndex(
"example", "cache", 3, map[string]*structs.Allocation{
"6b255fa3-c2cb-94de-5ddd-41aac25a6851": {
Name: "example.cache[0]",
JobID: "example",
TaskGroup: "cache",
},
"e24771e6-8900-5d2d-ec93-e7076284774a": {
Name: "example.cache[1]",
JobID: "example",
TaskGroup: "cache",
},
"d7842822-32c4-1a1c-bac8-66c3f20dfb0f": {
Name: "example.cache[2]",
JobID: "example",
TaskGroup: "cache",
},
}),
inputN: 13,
expectedOutput: map[string]struct{}{
"example.cache[2]": {},
"example.cache[1]": {},
"example.cache[0]": {},
},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
must.Eq(t, tc.expectedOutput, tc.inputAllocNameIndex.Highest(tc.inputN))
})
}
}
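// Test_allocNameIndex_NextCanaries checks that NextCanaries returns the
// expected canary names given the existing and destructively-updated alloc sets.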
func Test_allocNameIndex_NextCanaries(t *testing.T) {
ci.Parallel(t)
testCases := []struct {
name string
inputAllocNameIndex *allocNameIndex
inputN uint
inputExisting allocSet
inputDestructive allocSet
expectedOutput []string
}{
{
name: "single canary",
inputAllocNameIndex: newAllocNameIndex(
"example", "cache", 3, map[string]*structs.Allocation{
"6b255fa3-c2cb-94de-5ddd-41aac25a6851": {
Name: "example.cache[0]",
JobID: "example",
TaskGroup: "cache",
},
"e24771e6-8900-5d2d-ec93-e7076284774a": {
Name: "example.cache[1]",
JobID: "example",
TaskGroup: "cache",
},
"d7842822-32c4-1a1c-bac8-66c3f20dfb0f": {
Name: "example.cache[2]",
JobID: "example",
TaskGroup: "cache",
},
}),
inputN: 1,
inputExisting: nil,
inputDestructive: map[string]*structs.Allocation{
"6b255fa3-c2cb-94de-5ddd-41aac25a6851": {
Name: "example.cache[0]",
JobID: "example",
TaskGroup: "cache",
},
"e24771e6-8900-5d2d-ec93-e7076284774a": {
Name: "example.cache[1]",
JobID: "example",
TaskGroup: "cache",
},
"d7842822-32c4-1a1c-bac8-66c3f20dfb0f": {
Name: "example.cache[2]",
JobID: "example",
TaskGroup: "cache",
},
},
expectedOutput: []string{
"example.cache[0]",
},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
must.SliceContainsAll(
t, tc.expectedOutput,
tc.inputAllocNameIndex.NextCanaries(tc.inputN, tc.inputExisting, tc.inputDestructive))
})
}
}
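// Test_allocNameIndex_Next checks the alloc names handed out by Next for both
// an empty index and an index already populated with existing allocations.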
func Test_allocNameIndex_Next(t *testing.T) {
ci.Parallel(t)
testCases := []struct {
name string
inputAllocNameIndex *allocNameIndex
inputN uint
expectedOutput []string
}{
{
name: "empty existing bitmap",
inputAllocNameIndex: newAllocNameIndex("example", "cache", 3, nil),
inputN: 3,
expectedOutput: []string{
"example.cache[0]", "example.cache[1]", "example.cache[2]",
},
},
{
name: "non-empty existing bitmap simple",
inputAllocNameIndex: newAllocNameIndex(
"example", "cache", 3, map[string]*structs.Allocation{
"6b255fa3-c2cb-94de-5ddd-41aac25a6851": {
Name: "example.cache[0]",
JobID: "example",
TaskGroup: "cache",
},
"e24771e6-8900-5d2d-ec93-e7076284774a": {
Name: "example.cache[1]",
JobID: "example",
TaskGroup: "cache",
},
"d7842822-32c4-1a1c-bac8-66c3f20dfb0f": {
Name: "example.cache[2]",
JobID: "example",
TaskGroup: "cache",
},
}),
inputN: 3,
expectedOutput: []string{
"example.cache[0]", "example.cache[1]", "example.cache[2]",
},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
must.SliceContainsAll(t, tc.expectedOutput, tc.inputAllocNameIndex.Next(tc.inputN))
})
}
}
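// Test_allocNameIndex_Duplicates checks duplicate index tracking: the tracker
// flags duplicate alloc name indexes, UnsetIndex clears the duplicate entry
// without clearing the underlying bitmap bit, and Next then hands out an
// unused index.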
func Test_allocNameIndex_Duplicates(t *testing.T) {
ci.Parallel(t)
inputAllocSet := map[string]*structs.Allocation{
"6b255fa3-c2cb-94de-5ddd-41aac25a6851": {
Name: "example.cache[0]",
JobID: "example",
TaskGroup: "cache",
},
"e24771e6-8900-5d2d-ec93-e7076284774a": {
Name: "example.cache[1]",
JobID: "example",
TaskGroup: "cache",
},
"d7842822-32c4-1a1c-bac8-66c3f20dfb0f": {
Name: "example.cache[2]",
JobID: "example",
TaskGroup: "cache",
},
"76a6a487-016b-2fc2-8295-d811473ca93d": {
Name: "example.cache[0]",
JobID: "example",
TaskGroup: "cache",
},
}
// Build the tracker, and check some key information.
allocNameIndexTracker := newAllocNameIndex("example", "cache", 4, inputAllocSet)
must.Eq(t, 8, allocNameIndexTracker.b.Size())
must.MapLen(t, 1, allocNameIndexTracker.duplicates)
must.True(t, allocNameIndexTracker.IsDuplicate(0))
// Unsetting the index should remove the duplicate entry, but not the entry
// from the underlying bitmap.
allocNameIndexTracker.UnsetIndex(0)
must.MapLen(t, 0, allocNameIndexTracker.duplicates)
must.True(t, allocNameIndexTracker.b.Check(0))
// If we now select a new index, having previously checked for a duplicate,
// we should get a non-duplicate.
nextAllocNames := allocNameIndexTracker.Next(1)
must.Len(t, 1, nextAllocNames)
must.Eq(t, "example.cache[3]", nextAllocNames[0])
}
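// TestAllocSet_filterByRescheduleable exercises allocSet.filterByRescheduleable,
// which splits an alloc set into untainted allocs, allocs to reschedule now,
// and allocs to reschedule later, honoring the group's reschedule policy and
// whether the allocs are currently disconnecting.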
func TestAllocSet_filterByRescheduleable(t *testing.T) {
ci.Parallel(t)
noRescheduleJob := mock.Job()
noRescheduleTG := &structs.TaskGroup{
Name: "noRescheduleTG",
ReschedulePolicy: &structs.ReschedulePolicy{
Attempts: 0,
Unlimited: false,
},
}
noRescheduleJob.TaskGroups[0] = noRescheduleTG
testJob := mock.Job()
rescheduleTG := &structs.TaskGroup{
Name: "rescheduleTG",
ReschedulePolicy: &structs.ReschedulePolicy{
Attempts: 1,
Unlimited: false,
},
}
testJob.TaskGroups[0] = rescheduleTG
now := time.Now()
type testCase struct {
name string
all allocSet
isBatch bool
supportsDisconnectedClients bool
isDisconnecting bool
deployment *structs.Deployment
// expected results
untainted allocSet
resNow allocSet
resLater []*delayedRescheduleInfo
}
testCases := []testCase{
{
name: "batch disconnecting allocation no reschedule",
isDisconnecting: true,
isBatch: true,
all: allocSet{
"untainted1": {
ID: "untainted1",
ClientStatus: structs.AllocClientStatusRunning,
Job: noRescheduleJob,
TaskGroup: "noRescheduleTG",
},
},
untainted: allocSet{
"untainted1": {
ID: "untainted1",
ClientStatus: structs.AllocClientStatusRunning,
Job: noRescheduleJob,
TaskGroup: "noRescheduleTG",
},
},
resNow: allocSet{},
resLater: []*delayedRescheduleInfo{},
},
{
name: "batch ignore unknown disconnecting allocs",
isDisconnecting: true,
isBatch: true,
all: allocSet{
"disconnecting1": {
ID: "disconnection1",
ClientStatus: structs.AllocClientStatusUnknown,
Job: testJob,
},
},
untainted: allocSet{},
resNow: allocSet{},
resLater: []*delayedRescheduleInfo{},
},
{
name: "batch disconnecting allocation reschedule",
isDisconnecting: true,
isBatch: true,
all: allocSet{
"rescheduleNow1": {
ID: "rescheduleNow1",
ClientStatus: structs.AllocClientStatusRunning,
Job: testJob,
TaskGroup: "rescheduleTG",
},
},
untainted: allocSet{},
resNow: allocSet{
"rescheduleNow1": {
ID: "rescheduleNow1",
ClientStatus: structs.AllocClientStatusRunning,
Job: testJob,
TaskGroup: "rescheduleTG",
},
},
resLater: []*delayedRescheduleInfo{},
},
{
name: "service disconnecting allocation no reschedule",
isDisconnecting: true,
isBatch: false,
all: allocSet{
"untainted1": {
ID: "untainted1",
ClientStatus: structs.AllocClientStatusRunning,
Job: noRescheduleJob,
TaskGroup: "noRescheduleTG",
},
},
untainted: allocSet{
"untainted1": {
ID: "untainted1",
ClientStatus: structs.AllocClientStatusRunning,
Job: noRescheduleJob,
TaskGroup: "noRescheduleTG",
},
},
resNow: allocSet{},
resLater: []*delayedRescheduleInfo{},
},
{
name: "service disconnecting allocation reschedule",
isDisconnecting: true,
isBatch: false,
all: allocSet{
"rescheduleNow1": {
ID: "rescheduleNow1",
ClientStatus: structs.AllocClientStatusRunning,
Job: testJob,
TaskGroup: "rescheduleTG",
},
},
untainted: allocSet{},
resNow: allocSet{
"rescheduleNow1": {
ID: "rescheduleNow1",
ClientStatus: structs.AllocClientStatusRunning,
Job: testJob,
TaskGroup: "rescheduleTG",
},
},
resLater: []*delayedRescheduleInfo{},
},
{
name: "service ignore unknown disconnecting allocs",
isDisconnecting: true,
isBatch: false,
all: allocSet{
"disconnecting1": {
ID: "disconnection1",
ClientStatus: structs.AllocClientStatusUnknown,
Job: testJob,
},
},
untainted: allocSet{},
resNow: allocSet{},
resLater: []*delayedRescheduleInfo{},
},
{
name: "service running allocation no reschedule",
isDisconnecting: false,
isBatch: true,
all: allocSet{
"untainted1": {
ID: "untainted1",
ClientStatus: structs.AllocClientStatusRunning,
Job: noRescheduleJob,
TaskGroup: "noRescheduleTG",
},
},
untainted: allocSet{
"untainted1": {
ID: "untainted1",
ClientStatus: structs.AllocClientStatusRunning,
Job: noRescheduleJob,
TaskGroup: "noRescheduleTG",
},
},
resNow: allocSet{},
resLater: []*delayedRescheduleInfo{},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
untainted, resNow, resLater := tc.all.filterByRescheduleable(tc.isBatch,
tc.isDisconnecting, now, "evalID", tc.deployment)
must.Eq(t, tc.untainted, untainted, must.Sprintf("untainted"))
must.Eq(t, tc.resNow, resNow, must.Sprintf("reschedule-now"))
must.Eq(t, tc.resLater, resLater, must.Sprintf("reschedule-later"))
})
}
}