2023-04-10 15:36:59 +00:00
|
|
|
// Copyright (c) HashiCorp, Inc.
|
|
|
|
// SPDX-License-Identifier: MPL-2.0
|
|
|
|
|
2017-08-12 22:37:02 +00:00
|
|
|
package scheduler
|
|
|
|
|
|
|
|
import (
|
|
|
|
"testing"
|
2022-04-11 15:24:49 +00:00
|
|
|
"time"
|
|
|
|
|
2022-03-15 12:42:43 +00:00
|
|
|
"github.com/hashicorp/nomad/ci"
|
2022-08-17 16:26:34 +00:00
|
|
|
"github.com/hashicorp/nomad/helper/pointer"
|
2021-02-11 15:40:59 +00:00
|
|
|
"github.com/hashicorp/nomad/nomad/mock"
|
2017-08-12 22:37:02 +00:00
|
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
2023-08-03 14:44:19 +00:00
|
|
|
"github.com/shoenig/test/must"
|
2017-08-12 22:37:02 +00:00
|
|
|
)
|
|
|
|
|
2018-02-24 00:45:57 +00:00
|
|
|
func TestAllocSet_filterByTainted(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
|
|
|
|
2018-02-24 00:45:57 +00:00
|
|
|
nodes := map[string]*structs.Node{
|
2018-03-20 18:31:55 +00:00
|
|
|
"draining": {
|
2021-02-11 15:40:59 +00:00
|
|
|
ID: "draining",
|
|
|
|
DrainStrategy: mock.DrainNode().DrainStrategy,
|
2018-02-24 00:45:57 +00:00
|
|
|
},
|
2018-03-20 18:31:55 +00:00
|
|
|
"lost": {
|
2018-02-24 00:45:57 +00:00
|
|
|
ID: "lost",
|
|
|
|
Status: structs.NodeStatusDown,
|
|
|
|
},
|
|
|
|
"nil": nil,
|
2018-03-20 18:31:55 +00:00
|
|
|
"normal": {
|
2018-02-24 00:45:57 +00:00
|
|
|
ID: "normal",
|
|
|
|
Status: structs.NodeStatusReady,
|
|
|
|
},
|
2022-02-16 18:50:20 +00:00
|
|
|
"disconnected": {
|
|
|
|
ID: "disconnected",
|
|
|
|
Status: structs.NodeStatusDisconnected,
|
|
|
|
},
|
2018-02-24 00:45:57 +00:00
|
|
|
}
|
|
|
|
|
2022-03-31 15:32:18 +00:00
|
|
|
testJob := mock.Job()
|
2022-08-17 16:26:34 +00:00
|
|
|
testJob.TaskGroups[0].MaxClientDisconnect = pointer.Of(5 * time.Second)
|
2022-03-31 15:32:18 +00:00
|
|
|
now := time.Now()
|
2018-02-24 00:45:57 +00:00
|
|
|
|
2022-04-11 15:24:49 +00:00
|
|
|
testJobNoMaxDisconnect := mock.Job()
|
|
|
|
testJobNoMaxDisconnect.TaskGroups[0].MaxClientDisconnect = nil
|
|
|
|
|
2022-03-31 15:32:18 +00:00
|
|
|
unknownAllocState := []*structs.AllocState{{
|
|
|
|
Field: structs.AllocStateFieldClientStatus,
|
|
|
|
Value: structs.AllocClientStatusUnknown,
|
|
|
|
Time: now,
|
|
|
|
}}
|
|
|
|
|
|
|
|
expiredAllocState := []*structs.AllocState{{
|
|
|
|
Field: structs.AllocStateFieldClientStatus,
|
|
|
|
Value: structs.AllocClientStatusUnknown,
|
|
|
|
Time: now.Add(-60 * time.Second),
|
|
|
|
}}
|
|
|
|
|
Update alloc after reconnect and enforce client heartbeat order (#15068)
* scheduler: allow updates after alloc reconnects
When an allocation reconnects to a cluster the scheduler needs to run
special logic to handle the reconnection, check if a replacement was
created and stop one of them.
If the allocation kept running while the node was disconnected, it will
be reconnected with `ClientStatus: running` and the node will have
`Status: ready`. This combination is the same as the normal steady state
of allocation, where everything is running as expected.
In order to differentiate between the two states (an allocation that is
reconnecting and one that is just running) the scheduler needs an extra
piece of state.
The current implementation uses the presence of a
`TaskClientReconnected` task event to detect when the allocation has
reconnected and thus must go through the reconnection process. But this
event remains even after the allocation is reconnected, causing all
future evals to consider the allocation as still reconnecting.
This commit changes the reconnect logic to use an `AllocState` to
register when the allocation was reconnected. This provides the
following benefits:
- Only a limited number of task states are kept, and they are used for
many other events. It's possible that, upon reconnecting, several
actions are triggered that could cause the `TaskClientReconnected`
event to be dropped.
- Task events are set by clients and so their timestamps are subject
to time skew from servers. This prevents using time to determine if
an allocation reconnected after a disconnect event.
- Disconnect events are already stored as `AllocState` and so storing
reconnects there as well makes it the only source of information
required.
With the new logic, the reconnection logic is only triggered if the
last `AllocState` is a disconnect event, meaning that the allocation has
not been reconnected yet. After the reconnection is handled, the new
`ClientStatus` is stored in `AllocState` allowing future evals to skip
the reconnection logic.
* scheduler: prevent spurious placement on reconnect
When a client reconnects it makes two independent RPC calls:
- `Node.UpdateStatus` to heartbeat and set its status as `ready`.
- `Node.UpdateAlloc` to update the status of its allocations.
These two calls can happen in any order, and in case the allocations are
updated before a heartbeat it causes the state to be the same as a node
being disconnected: the node status will still be `disconnected` while
the allocation `ClientStatus` is set to `running`.
The current implementation did not handle this order of events properly,
and the scheduler would create an unnecessary placement since it
considered the allocation was being disconnected. This extra allocation
would then be quickly stopped by the heartbeat eval.
This commit adds a new code path to handle this order of events. If the
node is `disconnected` and the allocation `ClientStatus` is `running`
the scheduler will check if the allocation is actually reconnecting
using its `AllocState` events.
* rpc: only allow alloc updates from `ready` nodes
Clients interact with servers using three main RPC methods:
- `Node.GetAllocs` reads allocation data from the server and writes it
to the client.
- `Node.UpdateAlloc` reads allocations from the client and writes
them to the server.
- `Node.UpdateStatus` writes the client status to the server and is
used as the heartbeat mechanism.
These three methods are called periodically by the clients and are done
so independently from each other, meaning that there can't be any
assumptions in their ordering.
This can generate scenarios that are hard to reason about and to code
for. For example, when a client misses too many heartbeats it will be
considered `down` or `disconnected` and the allocations it was running
are set to `lost` or `unknown`.
When connectivity is restored to the rest of the cluster, the natural
mental model is to think that the client will heartbeat first and then
update its allocations status into the servers.
But since there's no inherent order in these calls the reverse is just as
possible: the client updates the alloc status and then heartbeats. This
results in a state where allocs are, for example, `running` while the
client is still `disconnected`.
This commit adds a new verification to the `Node.UpdateAlloc` method to
reject updates from nodes that are not `ready`, forcing clients to
heartbeat first. Since this check is done server-side there is no need
to coordinate operations client-side: they can continue sending these
requests independently and alloc update will succeed after the heartbeat
is done.
* changelog: add entry for #15068
* code review
* client: skip terminal allocations on reconnect
When the client reconnects with the server it synchronizes the state of
its allocations by sending data using the `Node.UpdateAlloc` RPC and
fetching data using the `Node.GetClientAllocs` RPC.
If the data fetch happens before the data write, `unknown` allocations
will still be in this state and would trigger the
`allocRunner.Reconnect` flow.
But when the server `DesiredStatus` for the allocation is `stop` the
client should not reconnect the allocation.
* apply more code review changes
* scheduler: persist changes to reconnected allocs
Reconnected allocs have a new AllocState entry that must be persisted by
the plan applier.
* rpc: read node ID from allocs in UpdateAlloc
The AllocUpdateRequest struct is used in three disjoint use cases:
1. Stripped allocs from clients Node.UpdateAlloc RPC using the Allocs,
and WriteRequest fields
2. Raft log message using the Allocs, Evals, and WriteRequest fields
3. Plan updates using the AllocsStopped, AllocsUpdated, and Job fields
Adding a new field that would only be used in one of these cases (1) made
things more confusing and error prone. While in theory an
AllocUpdateRequest could send allocations from different nodes, in
practice this never actually happens since only clients call this method
with their own allocations.
* scheduler: remove logic to handle exceptional case
This condition could only be hit if, somehow, the allocation status was
set to "running" while the client was "unknown". This was addressed by
enforcing an order in "Node.UpdateStatus" and "Node.UpdateAlloc" RPC
calls, so this scenario is not expected to happen.
Adding unnecessary code to the scheduler makes it harder to read and
reason about it.
* more code review
* remove another unused test
2022-11-04 20:25:11 +00:00
|
|
|
reconnectedAllocState := []*structs.AllocState{
|
|
|
|
{
|
|
|
|
Field: structs.AllocStateFieldClientStatus,
|
|
|
|
Value: structs.AllocClientStatusUnknown,
|
|
|
|
Time: now.Add(-time.Second),
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Field: structs.AllocStateFieldClientStatus,
|
|
|
|
Value: structs.AllocClientStatusRunning,
|
|
|
|
Time: now,
|
2022-02-16 18:50:20 +00:00
|
|
|
},
|
2022-03-31 15:32:18 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
type testCase struct {
|
|
|
|
name string
|
|
|
|
all allocSet
|
|
|
|
taintedNodes map[string]*structs.Node
|
|
|
|
supportsDisconnectedClients bool
|
|
|
|
skipNilNodeTest bool
|
|
|
|
now time.Time
|
|
|
|
// expected results
|
|
|
|
untainted allocSet
|
|
|
|
migrate allocSet
|
|
|
|
lost allocSet
|
|
|
|
disconnecting allocSet
|
|
|
|
reconnecting allocSet
|
|
|
|
ignore allocSet
|
|
|
|
}
|
|
|
|
|
|
|
|
testCases := []testCase{
|
|
|
|
// These two cases test that we maintain parity with pre-disconnected-clients behavior.
|
|
|
|
{
|
|
|
|
name: "lost-client",
|
|
|
|
supportsDisconnectedClients: false,
|
|
|
|
now: time.Now(),
|
|
|
|
taintedNodes: nodes,
|
|
|
|
skipNilNodeTest: false,
|
|
|
|
all: allocSet{
|
|
|
|
"untainted1": {
|
|
|
|
ID: "untainted1",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
},
|
|
|
|
// Terminal allocs are always untainted
|
|
|
|
"untainted2": {
|
|
|
|
ID: "untainted2",
|
|
|
|
ClientStatus: structs.AllocClientStatusComplete,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
},
|
|
|
|
// Terminal allocs are always untainted, even on draining nodes
|
|
|
|
"untainted3": {
|
|
|
|
ID: "untainted3",
|
|
|
|
ClientStatus: structs.AllocClientStatusComplete,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "draining",
|
|
|
|
},
|
|
|
|
// Terminal allocs are always untainted, even on lost nodes
|
|
|
|
"untainted4": {
|
|
|
|
ID: "untainted4",
|
|
|
|
ClientStatus: structs.AllocClientStatusComplete,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "lost",
|
|
|
|
},
|
|
|
|
// Non-terminal alloc with migrate=true should migrate on a draining node
|
|
|
|
"migrating1": {
|
|
|
|
ID: "migrating1",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
2022-08-17 16:26:34 +00:00
|
|
|
DesiredTransition: structs.DesiredTransition{Migrate: pointer.Of(true)},
|
2022-03-31 15:32:18 +00:00
|
|
|
Job: testJob,
|
|
|
|
NodeID: "draining",
|
|
|
|
},
|
|
|
|
// Non-terminal alloc with migrate=true should migrate on an unknown node
|
|
|
|
"migrating2": {
|
|
|
|
ID: "migrating2",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
2022-08-17 16:26:34 +00:00
|
|
|
DesiredTransition: structs.DesiredTransition{Migrate: pointer.Of(true)},
|
2022-03-31 15:32:18 +00:00
|
|
|
Job: testJob,
|
|
|
|
NodeID: "nil",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
untainted: allocSet{
|
|
|
|
"untainted1": {
|
|
|
|
ID: "untainted1",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
},
|
|
|
|
// Terminal allocs are always untainted
|
|
|
|
"untainted2": {
|
|
|
|
ID: "untainted2",
|
|
|
|
ClientStatus: structs.AllocClientStatusComplete,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
},
|
|
|
|
// Terminal allocs are always untainted, even on draining nodes
|
|
|
|
"untainted3": {
|
|
|
|
ID: "untainted3",
|
|
|
|
ClientStatus: structs.AllocClientStatusComplete,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "draining",
|
|
|
|
},
|
|
|
|
// Terminal allocs are always untainted, even on lost nodes
|
|
|
|
"untainted4": {
|
|
|
|
ID: "untainted4",
|
|
|
|
ClientStatus: structs.AllocClientStatusComplete,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "lost",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
migrate: allocSet{
|
|
|
|
// Non-terminal alloc with migrate=true should migrate on a draining node
|
|
|
|
"migrating1": {
|
|
|
|
ID: "migrating1",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
2022-08-17 16:26:34 +00:00
|
|
|
DesiredTransition: structs.DesiredTransition{Migrate: pointer.Of(true)},
|
2022-03-31 15:32:18 +00:00
|
|
|
Job: testJob,
|
|
|
|
NodeID: "draining",
|
|
|
|
},
|
|
|
|
// Non-terminal alloc with migrate=true should migrate on an unknown node
|
|
|
|
"migrating2": {
|
|
|
|
ID: "migrating2",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
2022-08-17 16:26:34 +00:00
|
|
|
DesiredTransition: structs.DesiredTransition{Migrate: pointer.Of(true)},
|
2022-03-31 15:32:18 +00:00
|
|
|
Job: testJob,
|
|
|
|
NodeID: "nil",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
disconnecting: allocSet{},
|
|
|
|
reconnecting: allocSet{},
|
|
|
|
ignore: allocSet{},
|
|
|
|
lost: allocSet{},
|
2022-02-16 18:50:20 +00:00
|
|
|
},
|
2022-03-31 15:32:18 +00:00
|
|
|
{
|
|
|
|
name: "lost-client-only-tainted-nodes",
|
|
|
|
supportsDisconnectedClients: false,
|
|
|
|
now: time.Now(),
|
|
|
|
taintedNodes: nodes,
|
|
|
|
// The logic associated with this test case can only trigger if there
|
|
|
|
// is a tainted node. Therefore, testing with a nil node set produces
|
|
|
|
// false failures, so don't perform that test in this case.
|
|
|
|
skipNilNodeTest: true,
|
|
|
|
all: allocSet{
|
|
|
|
// Non-terminal allocs on lost nodes are lost
|
|
|
|
"lost1": {
|
|
|
|
ID: "lost1",
|
|
|
|
ClientStatus: structs.AllocClientStatusPending,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "lost",
|
|
|
|
},
|
|
|
|
// Non-terminal allocs on lost nodes are lost
|
|
|
|
"lost2": {
|
|
|
|
ID: "lost2",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "lost",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
untainted: allocSet{},
|
|
|
|
migrate: allocSet{},
|
|
|
|
disconnecting: allocSet{},
|
|
|
|
reconnecting: allocSet{},
|
|
|
|
ignore: allocSet{},
|
|
|
|
lost: allocSet{
|
|
|
|
// Non-terminal allocs on lost nodes are lost
|
|
|
|
"lost1": {
|
|
|
|
ID: "lost1",
|
|
|
|
ClientStatus: structs.AllocClientStatusPending,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "lost",
|
|
|
|
},
|
|
|
|
// Non-terminal allocs on lost nodes are lost
|
|
|
|
"lost2": {
|
|
|
|
ID: "lost2",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "lost",
|
|
|
|
},
|
|
|
|
},
|
2022-02-16 18:50:20 +00:00
|
|
|
},
|
2022-04-11 15:24:49 +00:00
|
|
|
{
|
|
|
|
name: "disco-client-disconnect-unset-max-disconnect",
|
|
|
|
supportsDisconnectedClients: true,
|
|
|
|
now: time.Now(),
|
|
|
|
taintedNodes: nodes,
|
|
|
|
skipNilNodeTest: true,
|
|
|
|
all: allocSet{
|
|
|
|
// Non-terminal allocs on disconnected nodes w/o max-disconnect are lost
|
|
|
|
"lost-running": {
|
2022-04-21 14:05:58 +00:00
|
|
|
ID: "lost-running",
|
|
|
|
Name: "lost-running",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
2022-04-11 15:24:49 +00:00
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJobNoMaxDisconnect,
|
|
|
|
NodeID: "disconnected",
|
|
|
|
TaskGroup: "web",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
untainted: allocSet{},
|
|
|
|
migrate: allocSet{},
|
|
|
|
disconnecting: allocSet{},
|
|
|
|
reconnecting: allocSet{},
|
|
|
|
ignore: allocSet{},
|
|
|
|
lost: allocSet{
|
|
|
|
"lost-running": {
|
2022-04-21 14:05:58 +00:00
|
|
|
ID: "lost-running",
|
|
|
|
Name: "lost-running",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
2022-04-11 15:24:49 +00:00
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJobNoMaxDisconnect,
|
|
|
|
NodeID: "disconnected",
|
|
|
|
TaskGroup: "web",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
2022-03-31 15:32:18 +00:00
|
|
|
// Everything below this line tests the disconnected client mode.
|
|
|
|
{
|
|
|
|
name: "disco-client-untainted-reconnect-failed-and-replaced",
|
|
|
|
supportsDisconnectedClients: true,
|
|
|
|
now: time.Now(),
|
|
|
|
taintedNodes: nodes,
|
|
|
|
skipNilNodeTest: false,
|
|
|
|
all: allocSet{
|
|
|
|
"running-replacement": {
|
|
|
|
ID: "running-replacement",
|
|
|
|
Name: "web",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
2022-04-21 14:05:58 +00:00
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
2022-03-31 15:32:18 +00:00
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
PreviousAllocation: "failed-original",
|
|
|
|
},
|
2022-04-11 15:24:49 +00:00
|
|
|
// Failed and replaced allocs on reconnected nodes
|
|
|
|
// that are still desired-running are reconnected so
|
|
|
|
// we can stop them
|
2022-03-31 15:32:18 +00:00
|
|
|
"failed-original": {
|
2022-04-11 15:24:49 +00:00
|
|
|
ID: "failed-original",
|
|
|
|
Name: "web",
|
|
|
|
ClientStatus: structs.AllocClientStatusFailed,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
untainted: allocSet{
|
|
|
|
"running-replacement": {
|
|
|
|
ID: "running-replacement",
|
|
|
|
Name: "web",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
2022-04-21 14:05:58 +00:00
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
2022-03-31 15:32:18 +00:00
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
PreviousAllocation: "failed-original",
|
|
|
|
},
|
2022-04-06 13:33:32 +00:00
|
|
|
},
|
|
|
|
migrate: allocSet{},
|
|
|
|
disconnecting: allocSet{},
|
|
|
|
reconnecting: allocSet{
|
2022-03-31 15:32:18 +00:00
|
|
|
"failed-original": {
|
2022-04-11 15:24:49 +00:00
|
|
|
ID: "failed-original",
|
|
|
|
Name: "web",
|
|
|
|
ClientStatus: structs.AllocClientStatusFailed,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
|
|
|
},
|
2022-04-06 13:33:32 +00:00
|
|
|
ignore: allocSet{},
|
|
|
|
lost: allocSet{},
|
2022-02-16 18:50:20 +00:00
|
|
|
},
|
2022-03-31 15:32:18 +00:00
|
|
|
{
|
|
|
|
name: "disco-client-reconnecting-running-no-replacement",
|
|
|
|
supportsDisconnectedClients: true,
|
|
|
|
now: time.Now(),
|
|
|
|
taintedNodes: nodes,
|
|
|
|
skipNilNodeTest: false,
|
|
|
|
all: allocSet{
|
|
|
|
// Running allocs on reconnected nodes with no replacement are reconnecting.
|
|
|
|
// Node.UpdateStatus has already handled syncing client state so this
|
|
|
|
// should be a noop.
|
|
|
|
"reconnecting-running-no-replacement": {
|
2022-04-21 14:05:58 +00:00
|
|
|
ID: "reconnecting-running-no-replacement",
|
|
|
|
Name: "web",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
untainted: allocSet{},
|
|
|
|
migrate: allocSet{},
|
|
|
|
disconnecting: allocSet{},
|
|
|
|
reconnecting: allocSet{
|
|
|
|
"reconnecting-running-no-replacement": {
|
2022-04-21 14:05:58 +00:00
|
|
|
ID: "reconnecting-running-no-replacement",
|
|
|
|
Name: "web",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
ignore: allocSet{},
|
|
|
|
lost: allocSet{},
|
2022-02-16 18:50:20 +00:00
|
|
|
},
|
2022-03-31 15:32:18 +00:00
|
|
|
{
|
|
|
|
name: "disco-client-terminal",
|
|
|
|
supportsDisconnectedClients: true,
|
|
|
|
now: time.Now(),
|
|
|
|
taintedNodes: nodes,
|
|
|
|
skipNilNodeTest: false,
|
|
|
|
all: allocSet{
|
2023-10-27 15:20:53 +00:00
|
|
|
// Allocs on reconnected nodes that are complete are ignored
|
|
|
|
"ignored-reconnect-complete": {
|
|
|
|
ID: "ignored-reconnect-complete",
|
|
|
|
Name: "ignored-reconnect-complete",
|
2022-04-21 14:05:58 +00:00
|
|
|
ClientStatus: structs.AllocClientStatusComplete,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
2022-04-06 13:33:32 +00:00
|
|
|
// Failed allocs on reconnected nodes are in reconnecting so that
|
|
|
|
// they can be marked with desired status stop at the server.
|
|
|
|
"reconnecting-failed": {
|
2022-04-21 14:05:58 +00:00
|
|
|
ID: "reconnecting-failed",
|
|
|
|
Name: "reconnecting-failed",
|
|
|
|
ClientStatus: structs.AllocClientStatusFailed,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
|
|
|
// Lost allocs on reconnected nodes don't get restarted
|
2023-10-27 15:20:53 +00:00
|
|
|
"ignored-reconnect-lost": {
|
|
|
|
ID: "ignored-reconnect-lost",
|
|
|
|
Name: "ignored-reconnect-lost",
|
2022-04-21 14:05:58 +00:00
|
|
|
ClientStatus: structs.AllocClientStatusLost,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusStop,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
2023-10-27 15:20:53 +00:00
|
|
|
// Replacement allocs that are complete are ignored
|
|
|
|
"ignored-reconnect-complete-replacement": {
|
|
|
|
ID: "ignored-reconnect-complete-replacement",
|
|
|
|
Name: "ignored-reconnect-complete",
|
2022-03-31 15:32:18 +00:00
|
|
|
ClientStatus: structs.AllocClientStatusComplete,
|
2022-04-21 14:05:58 +00:00
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
2022-03-31 15:32:18 +00:00
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
|
|
|
PreviousAllocation: "untainted-reconnect-complete",
|
|
|
|
},
|
2023-10-27 15:20:53 +00:00
|
|
|
// Replacement allocs on reconnected nodes that are failed are ignored
|
|
|
|
"ignored-reconnect-failed-replacement": {
|
|
|
|
ID: "ignored-reconnect-failed-replacement",
|
|
|
|
Name: "ignored-reconnect-failed",
|
2022-03-31 15:32:18 +00:00
|
|
|
ClientStatus: structs.AllocClientStatusFailed,
|
2022-04-21 14:05:58 +00:00
|
|
|
DesiredStatus: structs.AllocDesiredStatusStop,
|
2022-03-31 15:32:18 +00:00
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
2022-04-06 13:33:32 +00:00
|
|
|
PreviousAllocation: "reconnecting-failed",
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
|
|
|
// Lost replacement allocs on reconnected nodes don't get restarted
|
2023-10-27 15:20:53 +00:00
|
|
|
"ignored-reconnect-lost-replacement": {
|
|
|
|
ID: "ignored-reconnect-lost-replacement",
|
|
|
|
Name: "ignored-reconnect-lost",
|
2022-03-31 15:32:18 +00:00
|
|
|
ClientStatus: structs.AllocClientStatusLost,
|
2022-04-21 14:05:58 +00:00
|
|
|
DesiredStatus: structs.AllocDesiredStatusStop,
|
2022-03-31 15:32:18 +00:00
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
|
|
|
PreviousAllocation: "untainted-reconnect-lost",
|
|
|
|
},
|
|
|
|
},
|
2023-10-27 15:20:53 +00:00
|
|
|
untainted: allocSet{},
|
|
|
|
migrate: allocSet{},
|
|
|
|
disconnecting: allocSet{},
|
|
|
|
reconnecting: allocSet{
|
|
|
|
"reconnecting-failed": {
|
|
|
|
ID: "reconnecting-failed",
|
|
|
|
Name: "reconnecting-failed",
|
|
|
|
ClientStatus: structs.AllocClientStatusFailed,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
ignore: allocSet{
|
|
|
|
|
|
|
|
"ignored-reconnect-complete": {
|
|
|
|
ID: "ignored-reconnect-complete",
|
|
|
|
Name: "ignored-reconnect-complete",
|
2022-04-21 14:05:58 +00:00
|
|
|
ClientStatus: structs.AllocClientStatusComplete,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
2023-10-27 15:20:53 +00:00
|
|
|
"ignored-reconnect-lost": {
|
|
|
|
ID: "ignored-reconnect-lost",
|
|
|
|
Name: "ignored-reconnect-lost",
|
2022-04-21 14:05:58 +00:00
|
|
|
ClientStatus: structs.AllocClientStatusLost,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusStop,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
2023-10-27 15:20:53 +00:00
|
|
|
"ignored-reconnect-complete-replacement": {
|
|
|
|
ID: "ignored-reconnect-complete-replacement",
|
|
|
|
Name: "ignored-reconnect-complete",
|
2022-03-31 15:32:18 +00:00
|
|
|
ClientStatus: structs.AllocClientStatusComplete,
|
2022-04-21 14:05:58 +00:00
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
2022-03-31 15:32:18 +00:00
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
|
|
|
PreviousAllocation: "untainted-reconnect-complete",
|
|
|
|
},
|
2023-10-27 15:20:53 +00:00
|
|
|
"ignored-reconnect-failed-replacement": {
|
|
|
|
ID: "ignored-reconnect-failed-replacement",
|
|
|
|
Name: "ignored-reconnect-failed",
|
2022-03-31 15:32:18 +00:00
|
|
|
ClientStatus: structs.AllocClientStatusFailed,
|
2022-04-21 14:05:58 +00:00
|
|
|
DesiredStatus: structs.AllocDesiredStatusStop,
|
2022-03-31 15:32:18 +00:00
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
2022-04-06 13:33:32 +00:00
|
|
|
PreviousAllocation: "reconnecting-failed",
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
2023-10-27 15:20:53 +00:00
|
|
|
"ignored-reconnect-lost-replacement": {
|
|
|
|
ID: "ignored-reconnect-lost-replacement",
|
|
|
|
Name: "ignored-reconnect-lost",
|
2022-03-31 15:32:18 +00:00
|
|
|
ClientStatus: structs.AllocClientStatusLost,
|
2022-04-21 14:05:58 +00:00
|
|
|
DesiredStatus: structs.AllocDesiredStatusStop,
|
2022-03-31 15:32:18 +00:00
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
|
|
|
PreviousAllocation: "untainted-reconnect-lost",
|
|
|
|
},
|
|
|
|
},
|
2023-10-27 15:20:53 +00:00
|
|
|
lost: allocSet{},
|
2022-02-16 18:50:20 +00:00
|
|
|
},
|
2022-03-31 15:32:18 +00:00
|
|
|
{
|
|
|
|
name: "disco-client-disconnect",
|
|
|
|
supportsDisconnectedClients: true,
|
|
|
|
now: time.Now(),
|
|
|
|
taintedNodes: nodes,
|
|
|
|
skipNilNodeTest: true,
|
|
|
|
all: allocSet{
|
|
|
|
// Non-terminal allocs on disconnected nodes are disconnecting
|
|
|
|
"disconnect-running": {
|
2022-04-21 14:05:58 +00:00
|
|
|
ID: "disconnect-running",
|
|
|
|
Name: "disconnect-running",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "disconnected",
|
|
|
|
TaskGroup: "web",
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
2023-10-27 15:20:53 +00:00
|
|
|
// Unknown allocs on disconnected nodes are acknowledged, so they won't be rescheduled again
|
|
|
|
"untainted-unknown": {
|
|
|
|
ID: "untainted-unknown",
|
|
|
|
Name: "untainted-unknown",
|
2022-04-06 13:33:32 +00:00
|
|
|
ClientStatus: structs.AllocClientStatusUnknown,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "disconnected",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
|
|
|
// Unknown allocs on disconnected nodes are lost when expired
|
|
|
|
"lost-unknown": {
|
2022-04-21 14:05:58 +00:00
|
|
|
ID: "lost-unknown",
|
|
|
|
Name: "lost-unknown",
|
|
|
|
ClientStatus: structs.AllocClientStatusUnknown,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "disconnected",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: expiredAllocState,
|
|
|
|
},
|
|
|
|
// Pending allocs on disconnected nodes are lost
|
|
|
|
"lost-pending": {
|
|
|
|
ID: "lost-pending",
|
|
|
|
Name: "lost-pending",
|
|
|
|
ClientStatus: structs.AllocClientStatusPending,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "disconnected",
|
|
|
|
TaskGroup: "web",
|
|
|
|
},
|
|
|
|
// Expired allocs on reconnected clients are lost
|
|
|
|
// Pending allocs on disconnected nodes are lost
|
|
|
|
"lost-expired": {
|
|
|
|
ID: "lost-expired",
|
|
|
|
Name: "lost-expired",
|
|
|
|
ClientStatus: structs.AllocClientStatusUnknown,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: expiredAllocState,
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
2022-04-11 15:24:49 +00:00
|
|
|
// Failed and stopped allocs on disconnected nodes are ignored
|
|
|
|
"ignore-reconnected-failed-stopped": {
|
|
|
|
ID: "ignore-reconnected-failed-stopped",
|
|
|
|
Name: "ignore-reconnected-failed-stopped",
|
|
|
|
ClientStatus: structs.AllocClientStatusFailed,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusStop,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "disconnected",
|
|
|
|
TaskGroup: "web",
|
2022-04-21 14:05:58 +00:00
|
|
|
AllocStates: unknownAllocState,
|
2022-04-11 15:24:49 +00:00
|
|
|
},
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
2023-10-27 15:20:53 +00:00
|
|
|
untainted: allocSet{
|
|
|
|
// Unknown allocs on disconnected nodes are acknowledged, so they won't be rescheduled again
|
|
|
|
"untainted-unknown": {
|
|
|
|
ID: "untainted-unknown",
|
|
|
|
Name: "untainted-unknown",
|
|
|
|
ClientStatus: structs.AllocClientStatusUnknown,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "disconnected",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
migrate: allocSet{},
|
2022-03-31 15:32:18 +00:00
|
|
|
disconnecting: allocSet{
|
|
|
|
"disconnect-running": {
|
2022-04-21 14:05:58 +00:00
|
|
|
ID: "disconnect-running",
|
|
|
|
Name: "disconnect-running",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "disconnected",
|
|
|
|
TaskGroup: "web",
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
reconnecting: allocSet{},
|
|
|
|
ignore: allocSet{
|
2022-04-11 15:24:49 +00:00
|
|
|
"ignore-reconnected-failed-stopped": {
|
|
|
|
ID: "ignore-reconnected-failed-stopped",
|
|
|
|
Name: "ignore-reconnected-failed-stopped",
|
|
|
|
ClientStatus: structs.AllocClientStatusFailed,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusStop,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "disconnected",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
|
|
|
},
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
|
|
|
lost: allocSet{
|
|
|
|
"lost-unknown": {
|
2022-04-21 14:05:58 +00:00
|
|
|
ID: "lost-unknown",
|
|
|
|
Name: "lost-unknown",
|
|
|
|
ClientStatus: structs.AllocClientStatusUnknown,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "disconnected",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: expiredAllocState,
|
|
|
|
},
|
|
|
|
"lost-pending": {
|
|
|
|
ID: "lost-pending",
|
|
|
|
Name: "lost-pending",
|
|
|
|
ClientStatus: structs.AllocClientStatusPending,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "disconnected",
|
|
|
|
TaskGroup: "web",
|
|
|
|
},
|
|
|
|
"lost-expired": {
|
|
|
|
ID: "lost-expired",
|
|
|
|
Name: "lost-expired",
|
|
|
|
ClientStatus: structs.AllocClientStatusUnknown,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: expiredAllocState,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "disco-client-reconnect",
|
|
|
|
supportsDisconnectedClients: true,
|
|
|
|
now: time.Now(),
|
|
|
|
taintedNodes: nodes,
|
|
|
|
skipNilNodeTest: false,
|
|
|
|
all: allocSet{
|
|
|
|
// Expired allocs on reconnected clients are lost
|
|
|
|
"lost-expired-reconnect": {
|
|
|
|
ID: "lost-expired-reconnect",
|
|
|
|
Name: "lost-expired-reconnect",
|
|
|
|
ClientStatus: structs.AllocClientStatusUnknown,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: expiredAllocState,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
untainted: allocSet{},
|
|
|
|
migrate: allocSet{},
|
|
|
|
disconnecting: allocSet{},
|
|
|
|
reconnecting: allocSet{},
|
|
|
|
ignore: allocSet{},
|
|
|
|
lost: allocSet{
|
|
|
|
"lost-expired-reconnect": {
|
|
|
|
ID: "lost-expired-reconnect",
|
|
|
|
Name: "lost-expired-reconnect",
|
|
|
|
ClientStatus: structs.AllocClientStatusUnknown,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: expiredAllocState,
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
|
|
|
},
|
2022-02-16 18:50:20 +00:00
|
|
|
},
|
2022-03-31 15:32:18 +00:00
|
|
|
{
|
|
|
|
name: "disco-client-running-reconnecting-and-replacement-untainted",
|
|
|
|
supportsDisconnectedClients: true,
|
|
|
|
now: time.Now(),
|
|
|
|
taintedNodes: nodes,
|
|
|
|
skipNilNodeTest: false,
|
|
|
|
all: allocSet{
|
|
|
|
"running-replacement": {
|
|
|
|
ID: "running-replacement",
|
|
|
|
Name: "web",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
2022-04-21 14:05:58 +00:00
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
2022-03-31 15:32:18 +00:00
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
PreviousAllocation: "running-original",
|
|
|
|
},
|
|
|
|
// Running and replaced allocs on reconnected nodes are reconnecting
|
|
|
|
"running-original": {
|
2022-04-21 14:05:58 +00:00
|
|
|
ID: "running-original",
|
|
|
|
Name: "web",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
untainted: allocSet{
|
|
|
|
"running-replacement": {
|
|
|
|
ID: "running-replacement",
|
|
|
|
Name: "web",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
2022-04-21 14:05:58 +00:00
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
2022-03-31 15:32:18 +00:00
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
PreviousAllocation: "running-original",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
migrate: allocSet{},
|
|
|
|
disconnecting: allocSet{},
|
|
|
|
reconnecting: allocSet{
|
|
|
|
"running-original": {
|
2022-04-21 14:05:58 +00:00
|
|
|
ID: "running-original",
|
|
|
|
Name: "web",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: unknownAllocState,
|
2022-03-31 15:32:18 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
ignore: allocSet{},
|
|
|
|
lost: allocSet{},
|
2022-02-16 18:50:20 +00:00
|
|
|
},
|
Update alloc after reconnect and enforce client heartbeat order (#15068)
* scheduler: allow updates after alloc reconnects
When an allocation reconnects to a cluster the scheduler needs to run
special logic to handle the reconnection, check if a replacement was
created and stop one of them.
If the allocation kept running while the node was disconnected, it will
be reconnected with `ClientStatus: running` and the node will have
`Status: ready`. This combination is the same as the normal steady state
of allocation, where everything is running as expected.
In order to differentiate between the two states (an allocation that is
reconnecting and one that is just running) the scheduler needs an extra
piece of state.
The current implementation uses the presence of a
`TaskClientReconnected` task event to detect when the allocation has
reconnected and thus must go through the reconnection process. But this
event remains even after the allocation is reconnected, causing all
future evals to consider the allocation as still reconnecting.
This commit changes the reconnect logic to use an `AllocState` to
register when the allocation was reconnected. This provides the
following benefits:
- Only a limited number of task states are kept, and they are used for
many other events. It's possible that, upon reconnecting, several
actions are triggered that could cause the `TaskClientReconnected`
event to be dropped.
- Task events are set by clients and so their timestamps are subject
to time skew from servers. This prevents using time to determine if
an allocation reconnected after a disconnect event.
- Disconnect events are already stored as `AllocState` and so storing
reconnects there as well makes it the only source of information
required.
With the new logic, the reconnection logic is only triggered if the
last `AllocState` is a disconnect event, meaning that the allocation has
not been reconnected yet. After the reconnection is handled, the new
`ClientStatus` is stored in `AllocState` allowing future evals to skip
the reconnection logic.
* scheduler: prevent spurious placement on reconnect
When a client reconnects it makes two independent RPC calls:
- `Node.UpdateStatus` to heartbeat and set its status as `ready`.
- `Node.UpdateAlloc` to update the status of its allocations.
These two calls can happen in any order, and in case the allocations are
updated before a heartbeat it causes the state to be the same as a node
being disconnected: the node status will still be `disconnected` while
the allocation `ClientStatus` is set to `running`.
The current implementation did not handle this order of events properly,
and the scheduler would create an unnecessary placement since it
considered the allocation was being disconnected. This extra allocation
would then be quickly stopped by the heartbeat eval.
This commit adds a new code path to handle this order of events. If the
node is `disconnected` and the allocation `ClientStatus` is `running`
the scheduler will check if the allocation is actually reconnecting
using its `AllocState` events.
* rpc: only allow alloc updates from `ready` nodes
Clients interact with servers using three main RPC methods:
- `Node.GetAllocs` reads allocation data from the server and writes it
to the client.
- `Node.UpdateAlloc` reads allocations from the client and writes
them to the server.
- `Node.UpdateStatus` writes the client status to the server and is
used as the heartbeat mechanism.
These three methods are called periodically by the clients and are done
so independently from each other, meaning that there can't be any
assumptions in their ordering.
This can generate scenarios that are hard to reason about and to code
for. For example, when a client misses too many heartbeats it will be
considered `down` or `disconnected` and the allocations it was running
are set to `lost` or `unknown`.
When connectivity is restored to the rest of the cluster, the natural
mental model is to think that the client will heartbeat first and then
update its allocations status into the servers.
But since there's no inherent order in these calls the reverse is just as
possible: the client updates the alloc status and then heartbeats. This
results in a state where allocs are, for example, `running` while the
client is still `disconnected`.
This commit adds a new verification to the `Node.UpdateAlloc` method to
reject updates from nodes that are not `ready`, forcing clients to
heartbeat first. Since this check is done server-side there is no need
to coordinate operations client-side: they can continue sending these
requests independently and alloc update will succeed after the heartbeat
is done.
* changelog: add entry for #15068
* code review
* client: skip terminal allocations on reconnect
When the client reconnects with the server it synchronizes the state of
its allocations by sending data using the `Node.UpdateAlloc` RPC and
fetching data using the `Node.GetClientAllocs` RPC.
If the data fetch happens before the data write, `unknown` allocations
will still be in this state and would trigger the
`allocRunner.Reconnect` flow.
But when the server `DesiredStatus` for the allocation is `stop` the
client should not reconnect the allocation.
* apply more code review changes
* scheduler: persist changes to reconnected allocs
Reconnected allocs have a new AllocState entry that must be persisted by
the plan applier.
* rpc: read node ID from allocs in UpdateAlloc
The AllocUpdateRequest struct is used in three disjoint use cases:
1. Stripped allocs from clients Node.UpdateAlloc RPC using the Allocs,
and WriteRequest fields
2. Raft log message using the Allocs, Evals, and WriteRequest fields
3. Plan updates using the AllocsStopped, AllocsUpdated, and Job fields
Adding a new field that would only be used in one of these cases (1) made
things more confusing and error prone. While in theory an
AllocUpdateRequest could send allocations from different nodes, in
practice this never actually happens since only clients call this method
with their own allocations.
* scheduler: remove logic to handle exceptional case
This condition could only be hit if, somehow, the allocation status was
set to "running" while the client was "unknown". This was addressed by
enforcing an order in "Node.UpdateStatus" and "Node.UpdateAlloc" RPC
calls, so this scenario is not expected to happen.
Adding unnecessary code to the scheduler makes it harder to read and
reason about it.
* more code review
* remove another unused test
2022-11-04 20:25:11 +00:00
|
|
|
{
|
|
|
|
// After an alloc is reconnected, it should be considered
|
|
|
|
// "untainted" instead of "reconnecting" to allow changes such as
|
|
|
|
// job updates to be applied properly.
|
|
|
|
name: "disco-client-reconnected-alloc-untainted",
|
|
|
|
supportsDisconnectedClients: true,
|
|
|
|
now: time.Now(),
|
|
|
|
taintedNodes: nodes,
|
|
|
|
skipNilNodeTest: false,
|
|
|
|
all: allocSet{
|
|
|
|
"running-reconnected": {
|
|
|
|
ID: "running-reconnected",
|
|
|
|
Name: "web",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: reconnectedAllocState,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
untainted: allocSet{
|
|
|
|
"running-reconnected": {
|
|
|
|
ID: "running-reconnected",
|
|
|
|
Name: "web",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
DesiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
Job: testJob,
|
|
|
|
NodeID: "normal",
|
|
|
|
TaskGroup: "web",
|
|
|
|
AllocStates: reconnectedAllocState,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
migrate: allocSet{},
|
|
|
|
disconnecting: allocSet{},
|
|
|
|
reconnecting: allocSet{},
|
|
|
|
ignore: allocSet{},
|
|
|
|
lost: allocSet{},
|
|
|
|
},
|
2018-02-24 00:45:57 +00:00
|
|
|
}
|
|
|
|
|
2022-03-31 15:32:18 +00:00
|
|
|
for _, tc := range testCases {
|
|
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
|
|
// With tainted nodes
|
|
|
|
untainted, migrate, lost, disconnecting, reconnecting, ignore := tc.all.filterByTainted(tc.taintedNodes, tc.supportsDisconnectedClients, tc.now)
|
2023-08-03 14:44:19 +00:00
|
|
|
must.Eq(t, tc.untainted, untainted, must.Sprintf("with-nodes: untainted"))
|
|
|
|
must.Eq(t, tc.migrate, migrate, must.Sprintf("with-nodes: migrate"))
|
|
|
|
must.Eq(t, tc.lost, lost, must.Sprintf("with-nodes: lost"))
|
|
|
|
must.Eq(t, tc.disconnecting, disconnecting, must.Sprintf("with-nodes: disconnecting"))
|
|
|
|
must.Eq(t, tc.reconnecting, reconnecting, must.Sprintf("with-nodes: reconnecting"))
|
|
|
|
must.Eq(t, tc.ignore, ignore, must.Sprintf("with-nodes: ignore"))
|
2022-03-31 15:32:18 +00:00
|
|
|
|
|
|
|
if tc.skipNilNodeTest {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// Now again with nodes nil
|
|
|
|
untainted, migrate, lost, disconnecting, reconnecting, ignore = tc.all.filterByTainted(nil, tc.supportsDisconnectedClients, tc.now)
|
2023-08-03 14:44:19 +00:00
|
|
|
must.Eq(t, tc.untainted, untainted, must.Sprintf("with-nodes: untainted"))
|
|
|
|
must.Eq(t, tc.migrate, migrate, must.Sprintf("with-nodes: migrate"))
|
|
|
|
must.Eq(t, tc.lost, lost, must.Sprintf("with-nodes: lost"))
|
|
|
|
must.Eq(t, tc.disconnecting, disconnecting, must.Sprintf("with-nodes: disconnecting"))
|
|
|
|
must.Eq(t, tc.reconnecting, reconnecting, must.Sprintf("with-nodes: reconnecting"))
|
|
|
|
must.Eq(t, tc.ignore, ignore, must.Sprintf("with-nodes: ignore"))
|
2022-03-31 15:32:18 +00:00
|
|
|
})
|
|
|
|
}
|
2018-02-24 00:45:57 +00:00
|
|
|
}
|
2022-06-02 16:42:15 +00:00
|
|
|
|
|
|
|
func TestReconcile_shouldFilter(t *testing.T) {
|
|
|
|
testCases := []struct {
|
|
|
|
description string
|
|
|
|
batch bool
|
|
|
|
failed bool
|
|
|
|
desiredStatus string
|
|
|
|
clientStatus string
|
|
|
|
|
|
|
|
untainted bool
|
|
|
|
ignore bool
|
|
|
|
}{
|
|
|
|
{
|
|
|
|
description: "batch running",
|
|
|
|
batch: true,
|
|
|
|
failed: false,
|
|
|
|
desiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
clientStatus: structs.AllocClientStatusRunning,
|
|
|
|
untainted: true,
|
|
|
|
ignore: false,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
description: "batch stopped success",
|
|
|
|
batch: true,
|
|
|
|
failed: false,
|
|
|
|
desiredStatus: structs.AllocDesiredStatusStop,
|
|
|
|
clientStatus: structs.AllocClientStatusRunning,
|
|
|
|
untainted: true,
|
|
|
|
ignore: false,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
description: "batch stopped failed",
|
|
|
|
batch: true,
|
|
|
|
failed: true,
|
|
|
|
desiredStatus: structs.AllocDesiredStatusStop,
|
|
|
|
clientStatus: structs.AllocClientStatusComplete,
|
|
|
|
untainted: false,
|
|
|
|
ignore: true,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
description: "batch evicted",
|
|
|
|
batch: true,
|
|
|
|
desiredStatus: structs.AllocDesiredStatusEvict,
|
|
|
|
clientStatus: structs.AllocClientStatusComplete,
|
|
|
|
untainted: false,
|
|
|
|
ignore: true,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
description: "batch failed",
|
|
|
|
batch: true,
|
|
|
|
desiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
clientStatus: structs.AllocClientStatusFailed,
|
|
|
|
untainted: false,
|
|
|
|
ignore: false,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
description: "service running",
|
|
|
|
batch: false,
|
|
|
|
failed: false,
|
|
|
|
desiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
clientStatus: structs.AllocClientStatusRunning,
|
|
|
|
untainted: false,
|
|
|
|
ignore: false,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
description: "service stopped",
|
|
|
|
batch: false,
|
|
|
|
failed: false,
|
|
|
|
desiredStatus: structs.AllocDesiredStatusStop,
|
|
|
|
clientStatus: structs.AllocClientStatusComplete,
|
|
|
|
untainted: false,
|
|
|
|
ignore: true,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
description: "service evicted",
|
|
|
|
batch: false,
|
|
|
|
failed: false,
|
|
|
|
desiredStatus: structs.AllocDesiredStatusEvict,
|
|
|
|
clientStatus: structs.AllocClientStatusComplete,
|
|
|
|
untainted: false,
|
|
|
|
ignore: true,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
description: "service client complete",
|
|
|
|
batch: false,
|
|
|
|
failed: false,
|
|
|
|
desiredStatus: structs.AllocDesiredStatusRun,
|
|
|
|
clientStatus: structs.AllocClientStatusComplete,
|
|
|
|
untainted: false,
|
|
|
|
ignore: true,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, tc := range testCases {
|
|
|
|
t.Run(tc.description, func(t *testing.T) {
|
|
|
|
alloc := &structs.Allocation{
|
|
|
|
DesiredStatus: tc.desiredStatus,
|
|
|
|
TaskStates: map[string]*structs.TaskState{"task": {State: structs.TaskStateDead, Failed: tc.failed}},
|
|
|
|
ClientStatus: tc.clientStatus,
|
|
|
|
}
|
|
|
|
|
|
|
|
untainted, ignore := shouldFilter(alloc, tc.batch)
|
2023-08-03 14:44:19 +00:00
|
|
|
must.Eq(t, tc.untainted, untainted)
|
|
|
|
must.Eq(t, tc.ignore, ignore)
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Test that we properly create the bitmap even when the alloc set includes an
|
|
|
|
// allocation with a higher count than the current min count and it is byte
|
|
|
|
// aligned.
|
|
|
|
// Ensure no regression from: https://github.com/hashicorp/nomad/issues/3008
|
|
|
|
func TestBitmapFrom(t *testing.T) {
|
|
|
|
ci.Parallel(t)
|
|
|
|
|
|
|
|
input := map[string]*structs.Allocation{
|
|
|
|
"8": {
|
|
|
|
JobID: "foo",
|
|
|
|
TaskGroup: "bar",
|
|
|
|
Name: "foo.bar[8]",
|
|
|
|
},
|
|
|
|
}
|
2023-10-27 16:04:04 +00:00
|
|
|
b, dups := bitmapFrom(input, 1)
|
2023-08-03 14:44:19 +00:00
|
|
|
must.Eq(t, 16, b.Size())
|
2023-10-27 16:04:04 +00:00
|
|
|
must.MapEmpty(t, dups)
|
2023-08-03 14:44:19 +00:00
|
|
|
|
2023-10-27 16:04:04 +00:00
|
|
|
b, dups = bitmapFrom(input, 8)
|
2023-08-03 14:44:19 +00:00
|
|
|
must.Eq(t, 16, b.Size())
|
2023-10-27 16:04:04 +00:00
|
|
|
must.MapEmpty(t, dups)
|
2023-08-03 14:44:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func Test_allocNameIndex_Highest(t *testing.T) {
|
|
|
|
ci.Parallel(t)
|
|
|
|
|
|
|
|
testCases := []struct {
|
|
|
|
name string
|
|
|
|
inputAllocNameIndex *allocNameIndex
|
|
|
|
inputN uint
|
|
|
|
expectedOutput map[string]struct{}
|
|
|
|
}{
|
|
|
|
{
|
|
|
|
name: "select 1",
|
|
|
|
inputAllocNameIndex: newAllocNameIndex(
|
|
|
|
"example", "cache", 3, map[string]*structs.Allocation{
|
|
|
|
"6b255fa3-c2cb-94de-5ddd-41aac25a6851": {
|
|
|
|
Name: "example.cache[0]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
"e24771e6-8900-5d2d-ec93-e7076284774a": {
|
|
|
|
Name: "example.cache[1]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
"d7842822-32c4-1a1c-bac8-66c3f20dfb0f": {
|
|
|
|
Name: "example.cache[2]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
}),
|
|
|
|
inputN: 1,
|
|
|
|
expectedOutput: map[string]struct{}{
|
|
|
|
"example.cache[2]": {},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "select all",
|
|
|
|
inputAllocNameIndex: newAllocNameIndex(
|
|
|
|
"example", "cache", 3, map[string]*structs.Allocation{
|
|
|
|
"6b255fa3-c2cb-94de-5ddd-41aac25a6851": {
|
|
|
|
Name: "example.cache[0]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
"e24771e6-8900-5d2d-ec93-e7076284774a": {
|
|
|
|
Name: "example.cache[1]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
"d7842822-32c4-1a1c-bac8-66c3f20dfb0f": {
|
|
|
|
Name: "example.cache[2]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
}),
|
|
|
|
inputN: 3,
|
|
|
|
expectedOutput: map[string]struct{}{
|
|
|
|
"example.cache[2]": {},
|
|
|
|
"example.cache[1]": {},
|
|
|
|
"example.cache[0]": {},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "select too many",
|
|
|
|
inputAllocNameIndex: newAllocNameIndex(
|
|
|
|
"example", "cache", 3, map[string]*structs.Allocation{
|
|
|
|
"6b255fa3-c2cb-94de-5ddd-41aac25a6851": {
|
|
|
|
Name: "example.cache[0]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
"e24771e6-8900-5d2d-ec93-e7076284774a": {
|
|
|
|
Name: "example.cache[1]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
"d7842822-32c4-1a1c-bac8-66c3f20dfb0f": {
|
|
|
|
Name: "example.cache[2]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
}),
|
|
|
|
inputN: 13,
|
|
|
|
expectedOutput: map[string]struct{}{
|
|
|
|
"example.cache[2]": {},
|
|
|
|
"example.cache[1]": {},
|
|
|
|
"example.cache[0]": {},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, tc := range testCases {
|
|
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
|
|
must.Eq(t, tc.expectedOutput, tc.inputAllocNameIndex.Highest(tc.inputN))
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func Test_allocNameIndex_NextCanaries(t *testing.T) {
|
|
|
|
ci.Parallel(t)
|
|
|
|
|
|
|
|
testCases := []struct {
|
|
|
|
name string
|
|
|
|
inputAllocNameIndex *allocNameIndex
|
|
|
|
inputN uint
|
|
|
|
inputExisting allocSet
|
|
|
|
inputDestructive allocSet
|
|
|
|
expectedOutput []string
|
|
|
|
}{
|
|
|
|
{
|
|
|
|
name: "single canary",
|
|
|
|
inputAllocNameIndex: newAllocNameIndex(
|
|
|
|
"example", "cache", 3, map[string]*structs.Allocation{
|
|
|
|
"6b255fa3-c2cb-94de-5ddd-41aac25a6851": {
|
|
|
|
Name: "example.cache[0]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
"e24771e6-8900-5d2d-ec93-e7076284774a": {
|
|
|
|
Name: "example.cache[1]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
"d7842822-32c4-1a1c-bac8-66c3f20dfb0f": {
|
|
|
|
Name: "example.cache[2]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
}),
|
|
|
|
inputN: 1,
|
|
|
|
inputExisting: nil,
|
|
|
|
inputDestructive: map[string]*structs.Allocation{
|
|
|
|
"6b255fa3-c2cb-94de-5ddd-41aac25a6851": {
|
|
|
|
Name: "example.cache[0]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
"e24771e6-8900-5d2d-ec93-e7076284774a": {
|
|
|
|
Name: "example.cache[1]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
"d7842822-32c4-1a1c-bac8-66c3f20dfb0f": {
|
|
|
|
Name: "example.cache[2]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
expectedOutput: []string{
|
|
|
|
"example.cache[0]",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, tc := range testCases {
|
|
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
|
|
must.SliceContainsAll(
|
|
|
|
t, tc.expectedOutput,
|
|
|
|
tc.inputAllocNameIndex.NextCanaries(tc.inputN, tc.inputExisting, tc.inputDestructive))
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func Test_allocNameIndex_Next(t *testing.T) {
|
|
|
|
ci.Parallel(t)
|
|
|
|
|
|
|
|
testCases := []struct {
|
|
|
|
name string
|
|
|
|
inputAllocNameIndex *allocNameIndex
|
|
|
|
inputN uint
|
|
|
|
expectedOutput []string
|
|
|
|
}{
|
|
|
|
{
|
|
|
|
name: "empty existing bitmap",
|
|
|
|
inputAllocNameIndex: newAllocNameIndex("example", "cache", 3, nil),
|
|
|
|
inputN: 3,
|
|
|
|
expectedOutput: []string{
|
|
|
|
"example.cache[0]", "example.cache[1]", "example.cache[2]",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "non-empty existing bitmap simple",
|
|
|
|
inputAllocNameIndex: newAllocNameIndex(
|
|
|
|
"example", "cache", 3, map[string]*structs.Allocation{
|
|
|
|
"6b255fa3-c2cb-94de-5ddd-41aac25a6851": {
|
|
|
|
Name: "example.cache[0]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
"e24771e6-8900-5d2d-ec93-e7076284774a": {
|
|
|
|
Name: "example.cache[1]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
"d7842822-32c4-1a1c-bac8-66c3f20dfb0f": {
|
|
|
|
Name: "example.cache[2]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
}),
|
|
|
|
inputN: 3,
|
|
|
|
expectedOutput: []string{
|
|
|
|
"example.cache[0]", "example.cache[1]", "example.cache[2]",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, tc := range testCases {
|
|
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
|
|
must.SliceContainsAll(t, tc.expectedOutput, tc.inputAllocNameIndex.Next(tc.inputN))
|
2022-06-02 16:42:15 +00:00
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
2023-10-27 15:20:53 +00:00
|
|
|
|
2023-10-27 16:04:04 +00:00
|
|
|
func Test_allocNameIndex_Duplicates(t *testing.T) {
|
|
|
|
ci.Parallel(t)
|
|
|
|
|
|
|
|
inputAllocSet := map[string]*structs.Allocation{
|
|
|
|
"6b255fa3-c2cb-94de-5ddd-41aac25a6851": {
|
|
|
|
Name: "example.cache[0]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
"e24771e6-8900-5d2d-ec93-e7076284774a": {
|
|
|
|
Name: "example.cache[1]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
"d7842822-32c4-1a1c-bac8-66c3f20dfb0f": {
|
|
|
|
Name: "example.cache[2]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
"76a6a487-016b-2fc2-8295-d811473ca93d": {
|
|
|
|
Name: "example.cache[0]",
|
|
|
|
JobID: "example",
|
|
|
|
TaskGroup: "cache",
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
// Build the tracker, and check some key information.
|
|
|
|
allocNameIndexTracker := newAllocNameIndex("example", "cache", 4, inputAllocSet)
|
|
|
|
must.Eq(t, 8, allocNameIndexTracker.b.Size())
|
|
|
|
must.MapLen(t, 1, allocNameIndexTracker.duplicates)
|
|
|
|
must.True(t, allocNameIndexTracker.IsDuplicate(0))
|
|
|
|
|
|
|
|
// Unsetting the index should remove the duplicate entry, but not the entry
|
|
|
|
// from the underlying bitmap.
|
|
|
|
allocNameIndexTracker.UnsetIndex(0)
|
|
|
|
must.MapLen(t, 0, allocNameIndexTracker.duplicates)
|
|
|
|
must.True(t, allocNameIndexTracker.b.Check(0))
|
|
|
|
|
|
|
|
// If we now select a new index, having previously checked for a duplicate,
|
|
|
|
// we should get a non-duplicate.
|
|
|
|
nextAllocNames := allocNameIndexTracker.Next(1)
|
|
|
|
must.Len(t, 1, nextAllocNames)
|
|
|
|
must.Eq(t, "example.cache[3]", nextAllocNames[0])
|
|
|
|
}
|
|
|
|
|
2023-10-27 15:20:53 +00:00
|
|
|
func TestAllocSet_filterByRescheduleable(t *testing.T) {
|
|
|
|
ci.Parallel(t)
|
|
|
|
|
|
|
|
noRescheduleJob := mock.Job()
|
|
|
|
noRescheduleTG := &structs.TaskGroup{
|
|
|
|
Name: "noRescheduleTG",
|
|
|
|
ReschedulePolicy: &structs.ReschedulePolicy{
|
|
|
|
Attempts: 0,
|
|
|
|
Unlimited: false,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
noRescheduleJob.TaskGroups[0] = noRescheduleTG
|
|
|
|
|
|
|
|
testJob := mock.Job()
|
|
|
|
rescheduleTG := &structs.TaskGroup{
|
|
|
|
Name: "rescheduleTG",
|
|
|
|
ReschedulePolicy: &structs.ReschedulePolicy{
|
|
|
|
Attempts: 1,
|
|
|
|
Unlimited: false,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
testJob.TaskGroups[0] = rescheduleTG
|
|
|
|
|
|
|
|
now := time.Now()
|
|
|
|
|
|
|
|
type testCase struct {
|
|
|
|
name string
|
|
|
|
all allocSet
|
|
|
|
isBatch bool
|
|
|
|
supportsDisconnectedClients bool
|
|
|
|
isDisconnecting bool
|
|
|
|
deployment *structs.Deployment
|
|
|
|
|
|
|
|
// expected results
|
|
|
|
untainted allocSet
|
|
|
|
resNow allocSet
|
|
|
|
resLater []*delayedRescheduleInfo
|
|
|
|
}
|
|
|
|
|
|
|
|
testCases := []testCase{
|
|
|
|
{
|
|
|
|
name: "batch disconnecting allocation no reschedule",
|
|
|
|
isDisconnecting: true,
|
|
|
|
isBatch: true,
|
|
|
|
all: allocSet{
|
|
|
|
"untainted1": {
|
|
|
|
ID: "untainted1",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
Job: noRescheduleJob,
|
|
|
|
TaskGroup: "noRescheduleTG",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
untainted: allocSet{
|
|
|
|
"untainted1": {
|
|
|
|
ID: "untainted1",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
Job: noRescheduleJob,
|
|
|
|
TaskGroup: "noRescheduleTG",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
resNow: allocSet{},
|
|
|
|
resLater: []*delayedRescheduleInfo{},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "batch ignore unknown disconnecting allocs",
|
|
|
|
isDisconnecting: true,
|
|
|
|
isBatch: true,
|
|
|
|
all: allocSet{
|
|
|
|
"disconnecting1": {
|
|
|
|
ID: "disconnection1",
|
|
|
|
ClientStatus: structs.AllocClientStatusUnknown,
|
|
|
|
Job: testJob,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
untainted: allocSet{},
|
|
|
|
resNow: allocSet{},
|
|
|
|
resLater: []*delayedRescheduleInfo{},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "batch disconnecting allocation reschedule",
|
|
|
|
isDisconnecting: true,
|
|
|
|
isBatch: true,
|
|
|
|
all: allocSet{
|
|
|
|
"rescheduleNow1": {
|
|
|
|
ID: "rescheduleNow1",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
Job: testJob,
|
|
|
|
TaskGroup: "rescheduleTG",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
untainted: allocSet{},
|
|
|
|
resNow: allocSet{
|
|
|
|
"rescheduleNow1": {
|
|
|
|
ID: "rescheduleNow1",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
Job: testJob,
|
|
|
|
TaskGroup: "rescheduleTG",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
resLater: []*delayedRescheduleInfo{},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "service disconnecting allocation no reschedule",
|
|
|
|
isDisconnecting: true,
|
|
|
|
isBatch: false,
|
|
|
|
all: allocSet{
|
|
|
|
"untainted1": {
|
|
|
|
ID: "untainted1",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
Job: noRescheduleJob,
|
|
|
|
TaskGroup: "noRescheduleTG",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
untainted: allocSet{
|
|
|
|
"untainted1": {
|
|
|
|
ID: "untainted1",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
Job: noRescheduleJob,
|
|
|
|
TaskGroup: "noRescheduleTG",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
resNow: allocSet{},
|
|
|
|
resLater: []*delayedRescheduleInfo{},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "service disconnecting allocation reschedule",
|
|
|
|
isDisconnecting: true,
|
|
|
|
isBatch: false,
|
|
|
|
all: allocSet{
|
|
|
|
"rescheduleNow1": {
|
|
|
|
ID: "rescheduleNow1",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
Job: testJob,
|
|
|
|
TaskGroup: "rescheduleTG",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
untainted: allocSet{},
|
|
|
|
resNow: allocSet{
|
|
|
|
"rescheduleNow1": {
|
|
|
|
ID: "rescheduleNow1",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
Job: testJob,
|
|
|
|
TaskGroup: "rescheduleTG",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
resLater: []*delayedRescheduleInfo{},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "service ignore unknown disconnecting allocs",
|
|
|
|
isDisconnecting: true,
|
|
|
|
isBatch: false,
|
|
|
|
all: allocSet{
|
|
|
|
"disconnecting1": {
|
|
|
|
ID: "disconnection1",
|
|
|
|
ClientStatus: structs.AllocClientStatusUnknown,
|
|
|
|
Job: testJob,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
untainted: allocSet{},
|
|
|
|
resNow: allocSet{},
|
|
|
|
resLater: []*delayedRescheduleInfo{},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "service running allocation no reschedule",
|
|
|
|
isDisconnecting: false,
|
|
|
|
isBatch: true,
|
|
|
|
all: allocSet{
|
|
|
|
"untainted1": {
|
|
|
|
ID: "untainted1",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
Job: noRescheduleJob,
|
|
|
|
TaskGroup: "noRescheduleTG",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
untainted: allocSet{
|
|
|
|
"untainted1": {
|
|
|
|
ID: "untainted1",
|
|
|
|
ClientStatus: structs.AllocClientStatusRunning,
|
|
|
|
Job: noRescheduleJob,
|
|
|
|
TaskGroup: "noRescheduleTG",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
resNow: allocSet{},
|
|
|
|
resLater: []*delayedRescheduleInfo{},
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, tc := range testCases {
|
|
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
|
|
untainted, resNow, resLater := tc.all.filterByRescheduleable(tc.isBatch,
|
|
|
|
tc.isDisconnecting, now, "evailID", tc.deployment)
|
|
|
|
must.Eq(t, tc.untainted, untainted, must.Sprintf("with-nodes: untainted"))
|
|
|
|
must.Eq(t, tc.resNow, resNow, must.Sprintf("with-nodes: reschedule-now"))
|
|
|
|
must.Eq(t, tc.resLater, resLater, must.Sprintf("with-nodes: rescheduleLater"))
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|