allocs without max_client_disconnect should be lost on disconnect (#12529)

In the reconciler's filtering for tainted nodes, we use whether the
server supports disconnected clients as a gate for much of the logic,
but this doesn't account for jobs that don't set
`max_client_disconnect`. The only real consequence of this appears to
be that allocs on disconnected nodes are marked "complete" instead of
"lost".
Tim Gross 2022-04-11 11:24:49 -04:00 committed by GitHub
parent fecf4b46eb
commit 57b3a0028f
2 changed files with 142 additions and 38 deletions
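
The gist of the fix, as a minimal standalone sketch: disconnect handling has to be gated per allocation, on both the server capability and the allocation's task group having `max_client_disconnect` set. The `gate` helper and the `main` driver below are hypothetical illustrations, not functions in the codebase; only the Nomad `structs` and `helper` packages they use are real.

package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

// gate reports whether disconnected-client handling applies to a single
// alloc: the server must support it AND the alloc's task group must set
// max_client_disconnect. This mirrors the per-alloc check added to
// filterByTainted in the diff below.
func gate(serverSupportsDisconnectedClients bool, alloc *structs.Allocation) bool {
	supports := serverSupportsDisconnectedClients
	if alloc.Job != nil {
		if tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup); tg != nil {
			supports = serverSupportsDisconnectedClients && tg.MaxClientDisconnect != nil
		}
	}
	return supports
}

func main() {
	job := &structs.Job{TaskGroups: []*structs.TaskGroup{{Name: "web"}}}
	alloc := &structs.Allocation{Job: job, TaskGroup: "web"}

	// Without max_client_disconnect the gate stays closed, so allocs on a
	// disconnected node fall through to "lost" rather than "unknown".
	fmt.Println(gate(true, alloc)) // false

	job.TaskGroups[0].MaxClientDisconnect = helper.TimeToPtr(5 * time.Second)
	fmt.Println(gate(true, alloc)) // true
}

Keeping the server-level flag but re-deriving it per task group means jobs that never opted into disconnect handling retain the original "lost on disconnect" behavior.
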


@@ -216,7 +216,7 @@ func (a allocSet) fromKeys(keys ...[]string) allocSet {
// 4. Those that are on nodes that are disconnected, but have not had their ClientState set to unknown
// 5. Those that are on a node that has reconnected.
// 6. Those that are in a state that results in a noop.
func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, supportsDisconnectedClients bool, now time.Time) (untainted, migrate, lost, disconnecting, reconnecting, ignore allocSet) {
func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverSupportsDisconnectedClients bool, now time.Time) (untainted, migrate, lost, disconnecting, reconnecting, ignore allocSet) {
untainted = make(map[string]*structs.Allocation)
migrate = make(map[string]*structs.Allocation)
lost = make(map[string]*structs.Allocation)
@@ -224,7 +224,20 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, support
reconnecting = make(map[string]*structs.Allocation)
ignore = make(map[string]*structs.Allocation)
supportsDisconnectedClients := serverSupportsDisconnectedClients
for _, alloc := range a {
// make sure we don't apply any reconnect logic to task groups
// without max_client_disconnect
if alloc.Job != nil {
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
if tg != nil {
supportsDisconnectedClients = serverSupportsDisconnectedClients &&
tg.MaxClientDisconnect != nil
}
}
reconnected := false
expired := false
@@ -303,13 +316,18 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, support
// Group disconnecting/reconnecting
switch taintedNode.Status {
case structs.NodeStatusDisconnected:
// Filter running allocs on a node that is disconnected to be marked as unknown.
if supportsDisconnectedClients && alloc.ClientStatus == structs.AllocClientStatusRunning {
disconnecting[alloc.ID] = alloc
continue
}
// Filter pending allocs on a node that is disconnected to be marked as lost.
if alloc.ClientStatus == structs.AllocClientStatusPending {
if supportsDisconnectedClients {
// Filter running allocs on a node that is disconnected to be marked as unknown.
if alloc.ClientStatus == structs.AllocClientStatusRunning {
disconnecting[alloc.ID] = alloc
continue
}
// Filter pending allocs on a node that is disconnected to be marked as lost.
if alloc.ClientStatus == structs.AllocClientStatusPending {
lost[alloc.ID] = alloc
continue
}
} else {
lost[alloc.ID] = alloc
continue
}


@@ -3,12 +3,13 @@ package scheduler
import (
"testing"
"time"
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/stretchr/testify/require"
"time"
)
// Test that we properly create the bitmap even when the alloc set includes an
@@ -64,6 +65,9 @@ func TestAllocSet_filterByTainted(t *testing.T) {
testJob.TaskGroups[0].MaxClientDisconnect = helper.TimeToPtr(5 * time.Second)
now := time.Now()
testJobNoMaxDisconnect := mock.Job()
testJobNoMaxDisconnect.TaskGroups[0].MaxClientDisconnect = nil
unknownAllocState := []*structs.AllocState{{
Field: structs.AllocStateFieldClientStatus,
Value: structs.AllocClientStatusUnknown,
@@ -253,6 +257,61 @@ func TestAllocSet_filterByTainted(t *testing.T) {
},
},
{
name: "disco-client-disconnect-unset-max-disconnect",
supportsDisconnectedClients: true,
now: time.Now(),
taintedNodes: nodes,
skipNilNodeTest: true,
all: allocSet{
// Non-terminal allocs on disconnected nodes w/o max-disconnect are lost
"lost-running": {
ID: "lost-running",
Name: "lost-running",
ClientStatus: structs.AllocClientStatusRunning,
Job: testJobNoMaxDisconnect,
NodeID: "disconnected",
TaskGroup: "web",
},
// Unknown allocs on disconnected nodes w/o max-disconnect are lost
"lost-unknown": {
ID: "lost-unknown",
Name: "lost-unknown",
ClientStatus: structs.AllocClientStatusUnknown,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJobNoMaxDisconnect,
NodeID: "disconnected",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
},
untainted: allocSet{},
migrate: allocSet{},
disconnecting: allocSet{},
reconnecting: allocSet{},
ignore: allocSet{},
lost: allocSet{
"lost-running": {
ID: "lost-running",
Name: "lost-running",
ClientStatus: structs.AllocClientStatusRunning,
Job: testJobNoMaxDisconnect,
NodeID: "disconnected",
TaskGroup: "web",
},
"lost-unknown": {
ID: "lost-unknown",
Name: "lost-unknown",
ClientStatus: structs.AllocClientStatusUnknown,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJobNoMaxDisconnect,
NodeID: "disconnected",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
},
},
// Everything below this line tests the disconnected client mode.
{
name: "disco-client-untainted-reconnect-failed-and-replaced",
@@ -270,16 +329,19 @@ func TestAllocSet_filterByTainted(t *testing.T) {
TaskGroup: "web",
PreviousAllocation: "failed-original",
},
// Failed and replaced allocs on reconnected nodes are untainted
// Failed and replaced allocs on reconnected nodes
// that are still desired-running are reconnected so
// we can stop them
"failed-original": {
ID: "failed-original",
Name: "web",
ClientStatus: structs.AllocClientStatusFailed,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
ID: "failed-original",
Name: "web",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
},
},
untainted: allocSet{
@@ -297,14 +359,15 @@ func TestAllocSet_filterByTainted(t *testing.T) {
disconnecting: allocSet{},
reconnecting: allocSet{
"failed-original": {
ID: "failed-original",
Name: "web",
ClientStatus: structs.AllocClientStatusFailed,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
ID: "failed-original",
Name: "web",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
TaskStates: reconnectTaskState,
},
},
ignore: allocSet{},
@@ -530,6 +593,18 @@ func TestAllocSet_filterByTainted(t *testing.T) {
TaskGroup: "web",
AllocStates: expiredAllocState,
},
// Failed and stopped allocs on disconnected nodes are ignored
"ignore-reconnected-failed-stopped": {
ID: "ignore-reconnected-failed-stopped",
Name: "ignore-reconnected-failed-stopped",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusStop,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
TaskStates: reconnectTaskState,
AllocStates: unknownAllocState, // TODO really?
},
},
untainted: allocSet{},
migrate: allocSet{},
@@ -556,6 +631,17 @@ func TestAllocSet_filterByTainted(t *testing.T) {
TaskGroup: "web",
AllocStates: unknownAllocState,
},
"ignore-reconnected-failed-stopped": {
ID: "ignore-reconnected-failed-stopped",
Name: "ignore-reconnected-failed-stopped",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusStop,
Job: testJob,
NodeID: "disconnected",
TaskGroup: "web",
TaskStates: reconnectTaskState,
AllocStates: unknownAllocState,
},
},
lost: allocSet{
"lost-unknown": {
@@ -631,12 +717,12 @@ func TestAllocSet_filterByTainted(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
// With tainted nodes
untainted, migrate, lost, disconnecting, reconnecting, ignore := tc.all.filterByTainted(tc.taintedNodes, tc.supportsDisconnectedClients, tc.now)
require.Equal(t, tc.untainted, untainted, "with-nodes", "untainted")
require.Equal(t, tc.migrate, migrate, "with-nodes", "migrate")
require.Equal(t, tc.lost, lost, "with-nodes", "lost")
require.Equal(t, tc.disconnecting, disconnecting, "with-nodes", "disconnecting")
require.Equal(t, tc.reconnecting, reconnecting, "with-nodes", "reconnecting")
require.Equal(t, tc.ignore, ignore, "with-nodes", "ignore")
require.Equal(t, tc.untainted, untainted, "with-nodes: %s", "untainted")
require.Equal(t, tc.migrate, migrate, "with-nodes: %s", "migrate")
require.Equal(t, tc.lost, lost, "with-nodes: %s", "lost")
require.Equal(t, tc.disconnecting, disconnecting, "with-nodes: %s", "disconnecting")
require.Equal(t, tc.reconnecting, reconnecting, "with-nodes: %s", "reconnecting")
require.Equal(t, tc.ignore, ignore, "with-nodes: %s", "ignore")
if tc.skipNilNodeTest {
return
@@ -644,12 +730,12 @@ func TestAllocSet_filterByTainted(t *testing.T) {
// Now again with nodes nil
untainted, migrate, lost, disconnecting, reconnecting, ignore = tc.all.filterByTainted(nil, tc.supportsDisconnectedClients, tc.now)
require.Equal(t, tc.untainted, untainted, "nodes-nil", "untainted")
require.Equal(t, tc.migrate, migrate, "nodes-nil", "migrate")
require.Equal(t, tc.lost, lost, "nodes-nil", "lost")
require.Equal(t, tc.disconnecting, disconnecting, "nodes-nil", "disconnecting")
require.Equal(t, tc.reconnecting, reconnecting, "nodes-nil", "reconnecting")
require.Equal(t, tc.ignore, ignore, "nodes-nil", "ignore")
require.Equal(t, tc.untainted, untainted, "nodes-nil: %s", "untainted")
require.Equal(t, tc.migrate, migrate, "nodes-nil: %s", "migrate")
require.Equal(t, tc.lost, lost, "nodes-nil: %s", "lost")
require.Equal(t, tc.disconnecting, disconnecting, "nodes-nil: %s", "disconnecting")
require.Equal(t, tc.reconnecting, reconnecting, "nodes-nil: %s", "reconnecting")
require.Equal(t, tc.ignore, ignore, "nodes-nil: %s", "ignore")
})
}
}