2015-08-13 23:25:59 +00:00
|
|
|
package scheduler
|
|
|
|
|
|
|
|
import (
|
2022-02-08 17:16:33 +00:00
|
|
|
"encoding/binary"
|
2015-08-13 23:25:59 +00:00
|
|
|
"fmt"
|
2015-09-07 18:23:38 +00:00
|
|
|
"math/rand"
|
2015-09-07 19:25:23 +00:00
|
|
|
"reflect"
|
2022-04-15 13:31:32 +00:00
|
|
|
"time"
|
2015-08-13 23:25:59 +00:00
|
|
|
|
2018-09-15 23:23:13 +00:00
|
|
|
log "github.com/hashicorp/go-hclog"
|
2017-02-08 04:31:23 +00:00
|
|
|
memdb "github.com/hashicorp/go-memdb"
|
2015-08-13 23:25:59 +00:00
|
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
2022-09-21 19:53:25 +00:00
|
|
|
"golang.org/x/exp/slices"
|
2015-08-13 23:25:59 +00:00
|
|
|
)
|
|
|
|
|
2015-08-14 01:16:32 +00:00
|
|
|
// allocTuple is a tuple of the allocation name and potential alloc ID
|
|
|
|
type allocTuple struct {
|
|
|
|
Name string
|
|
|
|
TaskGroup *structs.TaskGroup
|
|
|
|
Alloc *structs.Allocation
|
2015-08-13 23:25:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// materializeTaskGroups is used to materialize all the task groups
|
|
|
|
// a job requires. This is used to do the count expansion.
|
|
|
|
func materializeTaskGroups(job *structs.Job) map[string]*structs.TaskGroup {
|
|
|
|
out := make(map[string]*structs.TaskGroup)
|
2017-04-19 17:54:03 +00:00
|
|
|
if job.Stopped() {
|
2015-10-15 20:14:44 +00:00
|
|
|
return out
|
2015-08-13 23:25:59 +00:00
|
|
|
}
|
|
|
|
|
2015-10-14 23:43:06 +00:00
|
|
|
for _, tg := range job.TaskGroups {
|
2015-10-15 20:14:44 +00:00
|
|
|
for i := 0; i < tg.Count; i++ {
|
|
|
|
name := fmt.Sprintf("%s.%s[%d]", job.Name, tg.Name, i)
|
2015-10-14 23:43:06 +00:00
|
|
|
out[name] = tg
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return out
|
|
|
|
}
|
|
|
|
|
2015-08-14 01:28:09 +00:00
|
|
|
// diffResult is used to return the sets that result from the diff
|
|
|
|
type diffResult struct {
|
2022-04-15 13:31:32 +00:00
|
|
|
place, update, migrate, stop, ignore, lost, disconnecting, reconnecting []allocTuple
|
2015-08-14 01:28:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (d *diffResult) GoString() string {
|
2022-04-15 13:31:32 +00:00
|
|
|
return fmt.Sprintf("allocs: (place %d) (update %d) (migrate %d) (stop %d) (ignore %d) (lost %d) (disconnecting %d) (reconnecting %d)",
|
|
|
|
len(d.place), len(d.update), len(d.migrate), len(d.stop), len(d.ignore), len(d.lost), len(d.disconnecting), len(d.reconnecting))
|
2015-08-14 01:28:09 +00:00
|
|
|
}
|
|
|
|
|
2015-10-15 20:14:44 +00:00
|
|
|
func (d *diffResult) Append(other *diffResult) {
|
|
|
|
d.place = append(d.place, other.place...)
|
|
|
|
d.update = append(d.update, other.update...)
|
|
|
|
d.migrate = append(d.migrate, other.migrate...)
|
|
|
|
d.stop = append(d.stop, other.stop...)
|
|
|
|
d.ignore = append(d.ignore, other.ignore...)
|
2016-08-03 22:45:42 +00:00
|
|
|
d.lost = append(d.lost, other.lost...)
|
2022-04-15 13:31:32 +00:00
|
|
|
d.disconnecting = append(d.disconnecting, other.disconnecting...)
|
|
|
|
d.reconnecting = append(d.reconnecting, other.reconnecting...)
|
2015-10-15 20:14:44 +00:00
|
|
|
}
|
|
|
|
|
2020-01-30 16:37:14 +00:00
|
|
|
// diffSystemAllocsForNode is used to do a set difference between the target allocations
|
2022-04-15 13:31:32 +00:00
|
|
|
// and the existing allocations for a particular node. This returns 8 sets of results,
|
2020-01-30 16:37:14 +00:00
|
|
|
// the list of named task groups that need to be placed (no existing allocation), the
|
2015-08-13 23:47:39 +00:00
|
|
|
// allocations that need to be updated (job definition is newer), allocs that
|
|
|
|
// need to be migrated (node is draining), the allocs that need to be evicted
|
2022-04-15 13:31:32 +00:00
|
|
|
// (no longer required), those that should be ignored, those that are lost
|
|
|
|
// that need to be replaced (running on a lost node), those that are running on
|
|
|
|
// a disconnected node but may resume, and those that may still be running on
|
|
|
|
// a node that has reconnected.
|
2020-10-09 21:31:38 +00:00
|
|
|
func diffSystemAllocsForNode(
|
|
|
|
job *structs.Job, // job whose allocs are going to be diff-ed
|
|
|
|
nodeID string,
|
|
|
|
eligibleNodes map[string]*structs.Node,
|
2021-10-27 14:04:13 +00:00
|
|
|
notReadyNodes map[string]struct{}, // nodes that are not ready, e.g. draining
|
|
|
|
taintedNodes map[string]*structs.Node, // nodes which are down (by node id)
|
2020-10-09 21:31:38 +00:00
|
|
|
required map[string]*structs.TaskGroup, // set of allocations that must exist
|
|
|
|
allocs []*structs.Allocation, // non-terminal allocations that exist
|
2021-10-27 14:04:13 +00:00
|
|
|
terminal structs.TerminalByNodeByName, // latest terminal allocations (by node, id)
|
2022-04-15 13:31:32 +00:00
|
|
|
serverSupportsDisconnectedClients bool, // flag indicating whether to apply disconnected client logic
|
2020-10-09 21:31:38 +00:00
|
|
|
) *diffResult {
|
|
|
|
result := new(diffResult)
|
2015-08-14 01:18:32 +00:00
|
|
|
|
2015-08-13 23:25:59 +00:00
|
|
|
// Scan the existing updates
|
2020-10-09 21:31:38 +00:00
|
|
|
existing := make(map[string]struct{}) // set of alloc names
|
2015-08-14 01:20:55 +00:00
|
|
|
for _, exist := range allocs {
|
|
|
|
// Index the existing node
|
|
|
|
name := exist.Name
|
|
|
|
existing[name] = struct{}{}
|
2015-08-13 23:25:59 +00:00
|
|
|
|
2015-08-14 01:20:55 +00:00
|
|
|
// Check for the definition in the required set
|
|
|
|
tg, ok := required[name]
|
2015-08-13 23:25:59 +00:00
|
|
|
|
2015-08-26 00:06:06 +00:00
|
|
|
// If not required, we stop the alloc
|
2015-08-14 01:20:55 +00:00
|
|
|
if !ok {
|
2015-10-16 18:43:09 +00:00
|
|
|
result.stop = append(result.stop, allocTuple{
|
2015-08-14 01:20:55 +00:00
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
|
|
|
Alloc: exist,
|
|
|
|
})
|
|
|
|
continue
|
|
|
|
}
|
2015-08-13 23:47:39 +00:00
|
|
|
|
2022-04-15 13:31:32 +00:00
|
|
|
supportsDisconnectedClients := exist.SupportsDisconnectedClients(serverSupportsDisconnectedClients)
|
|
|
|
|
Update alloc after reconnect and enforece client heartbeat order (#15068)
* scheduler: allow updates after alloc reconnects
When an allocation reconnects to a cluster the scheduler needs to run
special logic to handle the reconnection, check if a replacement was
create and stop one of them.
If the allocation kept running while the node was disconnected, it will
be reconnected with `ClientStatus: running` and the node will have
`Status: ready`. This combination is the same as the normal steady state
of allocation, where everything is running as expected.
In order to differentiate between the two states (an allocation that is
reconnecting and one that is just running) the scheduler needs an extra
piece of state.
The current implementation uses the presence of a
`TaskClientReconnected` task event to detect when the allocation has
reconnected and thus must go through the reconnection process. But this
event remains even after the allocation is reconnected, causing all
future evals to consider the allocation as still reconnecting.
This commit changes the reconnect logic to use an `AllocState` to
register when the allocation was reconnected. This provides the
following benefits:
- Only a limited number of task states are kept, and they are used for
many other events. It's possible that, upon reconnecting, several
actions are triggered that could cause the `TaskClientReconnected`
event to be dropped.
- Task events are set by clients and so their timestamps are subject
to time skew from servers. This prevents using time to determine if
an allocation reconnected after a disconnect event.
- Disconnect events are already stored as `AllocState` and so storing
reconnects there as well makes it the only source of information
required.
With the new logic, the reconnection logic is only triggered if the
last `AllocState` is a disconnect event, meaning that the allocation has
not been reconnected yet. After the reconnection is handled, the new
`ClientStatus` is store in `AllocState` allowing future evals to skip
the reconnection logic.
* scheduler: prevent spurious placement on reconnect
When a client reconnects it makes two independent RPC calls:
- `Node.UpdateStatus` to heartbeat and set its status as `ready`.
- `Node.UpdateAlloc` to update the status of its allocations.
These two calls can happen in any order, and in case the allocations are
updated before a heartbeat it causes the state to be the same as a node
being disconnected: the node status will still be `disconnected` while
the allocation `ClientStatus` is set to `running`.
The current implementation did not handle this order of events properly,
and the scheduler would create an unnecessary placement since it
considered the allocation was being disconnected. This extra allocation
would then be quickly stopped by the heartbeat eval.
This commit adds a new code path to handle this order of events. If the
node is `disconnected` and the allocation `ClientStatus` is `running`
the scheduler will check if the allocation is actually reconnecting
using its `AllocState` events.
* rpc: only allow alloc updates from `ready` nodes
Clients interact with servers using three main RPC methods:
- `Node.GetAllocs` reads allocation data from the server and writes it
to the client.
- `Node.UpdateAlloc` reads allocation from from the client and writes
them to the server.
- `Node.UpdateStatus` writes the client status to the server and is
used as the heartbeat mechanism.
These three methods are called periodically by the clients and are done
so independently from each other, meaning that there can't be any
assumptions in their ordering.
This can generate scenarios that are hard to reason about and to code
for. For example, when a client misses too many heartbeats it will be
considered `down` or `disconnected` and the allocations it was running
are set to `lost` or `unknown`.
When connectivity is restored the to rest of the cluster, the natural
mental model is to think that the client will heartbeat first and then
update its allocations status into the servers.
But since there's no inherit order in these calls the reverse is just as
possible: the client updates the alloc status and then heartbeats. This
results in a state where allocs are, for example, `running` while the
client is still `disconnected`.
This commit adds a new verification to the `Node.UpdateAlloc` method to
reject updates from nodes that are not `ready`, forcing clients to
heartbeat first. Since this check is done server-side there is no need
to coordinate operations client-side: they can continue sending these
requests independently and alloc update will succeed after the heartbeat
is done.
* chagelog: add entry for #15068
* code review
* client: skip terminal allocations on reconnect
When the client reconnects with the server it synchronizes the state of
its allocations by sending data using the `Node.UpdateAlloc` RPC and
fetching data using the `Node.GetClientAllocs` RPC.
If the data fetch happens before the data write, `unknown` allocations
will still be in this state and would trigger the
`allocRunner.Reconnect` flow.
But when the server `DesiredStatus` for the allocation is `stop` the
client should not reconnect the allocation.
* apply more code review changes
* scheduler: persist changes to reconnected allocs
Reconnected allocs have a new AllocState entry that must be persisted by
the plan applier.
* rpc: read node ID from allocs in UpdateAlloc
The AllocUpdateRequest struct is used in three disjoint use cases:
1. Stripped allocs from clients Node.UpdateAlloc RPC using the Allocs,
and WriteRequest fields
2. Raft log message using the Allocs, Evals, and WriteRequest fields
3. Plan updates using the AllocsStopped, AllocsUpdated, and Job fields
Adding a new field that would only be used in one these cases (1) made
things more confusing and error prone. While in theory an
AllocUpdateRequest could send allocations from different nodes, in
practice this never actually happens since only clients call this method
with their own allocations.
* scheduler: remove logic to handle exceptional case
This condition could only be hit if, somehow, the allocation status was
set to "running" while the client was "unknown". This was addressed by
enforcing an order in "Node.UpdateStatus" and "Node.UpdateAlloc" RPC
calls, so this scenario is not expected to happen.
Adding unnecessary code to the scheduler makes it harder to read and
reason about it.
* more code review
* remove another unused test
2022-11-04 20:25:11 +00:00
|
|
|
reconnect := false
|
|
|
|
expired := false
|
|
|
|
|
|
|
|
// Only compute reconnect for unknown and running since they need to go
|
|
|
|
// through the reconnect process.
|
2022-04-15 13:31:32 +00:00
|
|
|
if supportsDisconnectedClients &&
|
|
|
|
(exist.ClientStatus == structs.AllocClientStatusUnknown ||
|
|
|
|
exist.ClientStatus == structs.AllocClientStatusRunning) {
|
Update alloc after reconnect and enforece client heartbeat order (#15068)
* scheduler: allow updates after alloc reconnects
When an allocation reconnects to a cluster the scheduler needs to run
special logic to handle the reconnection, check if a replacement was
create and stop one of them.
If the allocation kept running while the node was disconnected, it will
be reconnected with `ClientStatus: running` and the node will have
`Status: ready`. This combination is the same as the normal steady state
of allocation, where everything is running as expected.
In order to differentiate between the two states (an allocation that is
reconnecting and one that is just running) the scheduler needs an extra
piece of state.
The current implementation uses the presence of a
`TaskClientReconnected` task event to detect when the allocation has
reconnected and thus must go through the reconnection process. But this
event remains even after the allocation is reconnected, causing all
future evals to consider the allocation as still reconnecting.
This commit changes the reconnect logic to use an `AllocState` to
register when the allocation was reconnected. This provides the
following benefits:
- Only a limited number of task states are kept, and they are used for
many other events. It's possible that, upon reconnecting, several
actions are triggered that could cause the `TaskClientReconnected`
event to be dropped.
- Task events are set by clients and so their timestamps are subject
to time skew from servers. This prevents using time to determine if
an allocation reconnected after a disconnect event.
- Disconnect events are already stored as `AllocState` and so storing
reconnects there as well makes it the only source of information
required.
With the new logic, the reconnection logic is only triggered if the
last `AllocState` is a disconnect event, meaning that the allocation has
not been reconnected yet. After the reconnection is handled, the new
`ClientStatus` is store in `AllocState` allowing future evals to skip
the reconnection logic.
* scheduler: prevent spurious placement on reconnect
When a client reconnects it makes two independent RPC calls:
- `Node.UpdateStatus` to heartbeat and set its status as `ready`.
- `Node.UpdateAlloc` to update the status of its allocations.
These two calls can happen in any order, and in case the allocations are
updated before a heartbeat it causes the state to be the same as a node
being disconnected: the node status will still be `disconnected` while
the allocation `ClientStatus` is set to `running`.
The current implementation did not handle this order of events properly,
and the scheduler would create an unnecessary placement since it
considered the allocation was being disconnected. This extra allocation
would then be quickly stopped by the heartbeat eval.
This commit adds a new code path to handle this order of events. If the
node is `disconnected` and the allocation `ClientStatus` is `running`
the scheduler will check if the allocation is actually reconnecting
using its `AllocState` events.
* rpc: only allow alloc updates from `ready` nodes
Clients interact with servers using three main RPC methods:
- `Node.GetAllocs` reads allocation data from the server and writes it
to the client.
- `Node.UpdateAlloc` reads allocation from from the client and writes
them to the server.
- `Node.UpdateStatus` writes the client status to the server and is
used as the heartbeat mechanism.
These three methods are called periodically by the clients and are done
so independently from each other, meaning that there can't be any
assumptions in their ordering.
This can generate scenarios that are hard to reason about and to code
for. For example, when a client misses too many heartbeats it will be
considered `down` or `disconnected` and the allocations it was running
are set to `lost` or `unknown`.
When connectivity is restored the to rest of the cluster, the natural
mental model is to think that the client will heartbeat first and then
update its allocations status into the servers.
But since there's no inherit order in these calls the reverse is just as
possible: the client updates the alloc status and then heartbeats. This
results in a state where allocs are, for example, `running` while the
client is still `disconnected`.
This commit adds a new verification to the `Node.UpdateAlloc` method to
reject updates from nodes that are not `ready`, forcing clients to
heartbeat first. Since this check is done server-side there is no need
to coordinate operations client-side: they can continue sending these
requests independently and alloc update will succeed after the heartbeat
is done.
* chagelog: add entry for #15068
* code review
* client: skip terminal allocations on reconnect
When the client reconnects with the server it synchronizes the state of
its allocations by sending data using the `Node.UpdateAlloc` RPC and
fetching data using the `Node.GetClientAllocs` RPC.
If the data fetch happens before the data write, `unknown` allocations
will still be in this state and would trigger the
`allocRunner.Reconnect` flow.
But when the server `DesiredStatus` for the allocation is `stop` the
client should not reconnect the allocation.
* apply more code review changes
* scheduler: persist changes to reconnected allocs
Reconnected allocs have a new AllocState entry that must be persisted by
the plan applier.
* rpc: read node ID from allocs in UpdateAlloc
The AllocUpdateRequest struct is used in three disjoint use cases:
1. Stripped allocs from clients Node.UpdateAlloc RPC using the Allocs,
and WriteRequest fields
2. Raft log message using the Allocs, Evals, and WriteRequest fields
3. Plan updates using the AllocsStopped, AllocsUpdated, and Job fields
Adding a new field that would only be used in one these cases (1) made
things more confusing and error prone. While in theory an
AllocUpdateRequest could send allocations from different nodes, in
practice this never actually happens since only clients call this method
with their own allocations.
* scheduler: remove logic to handle exceptional case
This condition could only be hit if, somehow, the allocation status was
set to "running" while the client was "unknown". This was addressed by
enforcing an order in "Node.UpdateStatus" and "Node.UpdateAlloc" RPC
calls, so this scenario is not expected to happen.
Adding unnecessary code to the scheduler makes it harder to read and
reason about it.
* more code review
* remove another unused test
2022-11-04 20:25:11 +00:00
|
|
|
reconnect = exist.NeedsToReconnect()
|
|
|
|
if reconnect {
|
|
|
|
expired = exist.Expired(time.Now())
|
|
|
|
}
|
2022-04-15 13:31:32 +00:00
|
|
|
}
|
|
|
|
|
2018-03-28 18:57:47 +00:00
|
|
|
// If we have been marked for migration and aren't terminal, migrate
|
|
|
|
if !exist.TerminalStatus() && exist.DesiredTransition.ShouldMigrate() {
|
|
|
|
result.migrate = append(result.migrate, allocTuple{
|
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
|
|
|
Alloc: exist,
|
|
|
|
})
|
|
|
|
continue
|
|
|
|
}
|
2020-10-09 21:31:38 +00:00
|
|
|
|
|
|
|
// If we are a sysbatch job and terminal, ignore (or stop?) the alloc
|
|
|
|
if job.Type == structs.JobTypeSysBatch && exist.TerminalStatus() {
|
|
|
|
result.ignore = append(result.ignore, allocTuple{
|
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
|
|
|
Alloc: exist,
|
|
|
|
})
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2022-04-15 13:31:32 +00:00
|
|
|
// Expired unknown allocs are lost. Expired checks that status is unknown.
|
Update alloc after reconnect and enforece client heartbeat order (#15068)
* scheduler: allow updates after alloc reconnects
When an allocation reconnects to a cluster the scheduler needs to run
special logic to handle the reconnection, check if a replacement was
create and stop one of them.
If the allocation kept running while the node was disconnected, it will
be reconnected with `ClientStatus: running` and the node will have
`Status: ready`. This combination is the same as the normal steady state
of allocation, where everything is running as expected.
In order to differentiate between the two states (an allocation that is
reconnecting and one that is just running) the scheduler needs an extra
piece of state.
The current implementation uses the presence of a
`TaskClientReconnected` task event to detect when the allocation has
reconnected and thus must go through the reconnection process. But this
event remains even after the allocation is reconnected, causing all
future evals to consider the allocation as still reconnecting.
This commit changes the reconnect logic to use an `AllocState` to
register when the allocation was reconnected. This provides the
following benefits:
- Only a limited number of task states are kept, and they are used for
many other events. It's possible that, upon reconnecting, several
actions are triggered that could cause the `TaskClientReconnected`
event to be dropped.
- Task events are set by clients and so their timestamps are subject
to time skew from servers. This prevents using time to determine if
an allocation reconnected after a disconnect event.
- Disconnect events are already stored as `AllocState` and so storing
reconnects there as well makes it the only source of information
required.
With the new logic, the reconnection logic is only triggered if the
last `AllocState` is a disconnect event, meaning that the allocation has
not been reconnected yet. After the reconnection is handled, the new
`ClientStatus` is store in `AllocState` allowing future evals to skip
the reconnection logic.
* scheduler: prevent spurious placement on reconnect
When a client reconnects it makes two independent RPC calls:
- `Node.UpdateStatus` to heartbeat and set its status as `ready`.
- `Node.UpdateAlloc` to update the status of its allocations.
These two calls can happen in any order, and in case the allocations are
updated before a heartbeat it causes the state to be the same as a node
being disconnected: the node status will still be `disconnected` while
the allocation `ClientStatus` is set to `running`.
The current implementation did not handle this order of events properly,
and the scheduler would create an unnecessary placement since it
considered the allocation was being disconnected. This extra allocation
would then be quickly stopped by the heartbeat eval.
This commit adds a new code path to handle this order of events. If the
node is `disconnected` and the allocation `ClientStatus` is `running`
the scheduler will check if the allocation is actually reconnecting
using its `AllocState` events.
* rpc: only allow alloc updates from `ready` nodes
Clients interact with servers using three main RPC methods:
- `Node.GetAllocs` reads allocation data from the server and writes it
to the client.
- `Node.UpdateAlloc` reads allocation from from the client and writes
them to the server.
- `Node.UpdateStatus` writes the client status to the server and is
used as the heartbeat mechanism.
These three methods are called periodically by the clients and are done
so independently from each other, meaning that there can't be any
assumptions in their ordering.
This can generate scenarios that are hard to reason about and to code
for. For example, when a client misses too many heartbeats it will be
considered `down` or `disconnected` and the allocations it was running
are set to `lost` or `unknown`.
When connectivity is restored the to rest of the cluster, the natural
mental model is to think that the client will heartbeat first and then
update its allocations status into the servers.
But since there's no inherit order in these calls the reverse is just as
possible: the client updates the alloc status and then heartbeats. This
results in a state where allocs are, for example, `running` while the
client is still `disconnected`.
This commit adds a new verification to the `Node.UpdateAlloc` method to
reject updates from nodes that are not `ready`, forcing clients to
heartbeat first. Since this check is done server-side there is no need
to coordinate operations client-side: they can continue sending these
requests independently and alloc update will succeed after the heartbeat
is done.
* chagelog: add entry for #15068
* code review
* client: skip terminal allocations on reconnect
When the client reconnects with the server it synchronizes the state of
its allocations by sending data using the `Node.UpdateAlloc` RPC and
fetching data using the `Node.GetClientAllocs` RPC.
If the data fetch happens before the data write, `unknown` allocations
will still be in this state and would trigger the
`allocRunner.Reconnect` flow.
But when the server `DesiredStatus` for the allocation is `stop` the
client should not reconnect the allocation.
* apply more code review changes
* scheduler: persist changes to reconnected allocs
Reconnected allocs have a new AllocState entry that must be persisted by
the plan applier.
* rpc: read node ID from allocs in UpdateAlloc
The AllocUpdateRequest struct is used in three disjoint use cases:
1. Stripped allocs from clients Node.UpdateAlloc RPC using the Allocs,
and WriteRequest fields
2. Raft log message using the Allocs, Evals, and WriteRequest fields
3. Plan updates using the AllocsStopped, AllocsUpdated, and Job fields
Adding a new field that would only be used in one these cases (1) made
things more confusing and error prone. While in theory an
AllocUpdateRequest could send allocations from different nodes, in
practice this never actually happens since only clients call this method
with their own allocations.
* scheduler: remove logic to handle exceptional case
This condition could only be hit if, somehow, the allocation status was
set to "running" while the client was "unknown". This was addressed by
enforcing an order in "Node.UpdateStatus" and "Node.UpdateAlloc" RPC
calls, so this scenario is not expected to happen.
Adding unnecessary code to the scheduler makes it harder to read and
reason about it.
* more code review
* remove another unused test
2022-11-04 20:25:11 +00:00
|
|
|
if supportsDisconnectedClients && expired {
|
2022-04-15 13:31:32 +00:00
|
|
|
result.lost = append(result.lost, allocTuple{
|
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
|
|
|
Alloc: exist,
|
|
|
|
})
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// Ignore unknown allocs that we want to reconnect eventually.
|
|
|
|
if supportsDisconnectedClients &&
|
|
|
|
exist.ClientStatus == structs.AllocClientStatusUnknown &&
|
|
|
|
exist.DesiredStatus == structs.AllocDesiredStatusRun {
|
|
|
|
result.ignore = append(result.ignore, allocTuple{
|
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
|
|
|
Alloc: exist,
|
|
|
|
})
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
node, nodeIsTainted := taintedNodes[exist.NodeID]
|
|
|
|
|
|
|
|
// Filter allocs on a node that is now re-connected to reconnecting.
|
|
|
|
if supportsDisconnectedClients &&
|
|
|
|
!nodeIsTainted &&
|
Update alloc after reconnect and enforece client heartbeat order (#15068)
* scheduler: allow updates after alloc reconnects
When an allocation reconnects to a cluster the scheduler needs to run
special logic to handle the reconnection, check if a replacement was
create and stop one of them.
If the allocation kept running while the node was disconnected, it will
be reconnected with `ClientStatus: running` and the node will have
`Status: ready`. This combination is the same as the normal steady state
of allocation, where everything is running as expected.
In order to differentiate between the two states (an allocation that is
reconnecting and one that is just running) the scheduler needs an extra
piece of state.
The current implementation uses the presence of a
`TaskClientReconnected` task event to detect when the allocation has
reconnected and thus must go through the reconnection process. But this
event remains even after the allocation is reconnected, causing all
future evals to consider the allocation as still reconnecting.
This commit changes the reconnect logic to use an `AllocState` to
register when the allocation was reconnected. This provides the
following benefits:
- Only a limited number of task states are kept, and they are used for
many other events. It's possible that, upon reconnecting, several
actions are triggered that could cause the `TaskClientReconnected`
event to be dropped.
- Task events are set by clients and so their timestamps are subject
to time skew from servers. This prevents using time to determine if
an allocation reconnected after a disconnect event.
- Disconnect events are already stored as `AllocState` and so storing
reconnects there as well makes it the only source of information
required.
With the new logic, the reconnection logic is only triggered if the
last `AllocState` is a disconnect event, meaning that the allocation has
not been reconnected yet. After the reconnection is handled, the new
`ClientStatus` is store in `AllocState` allowing future evals to skip
the reconnection logic.
* scheduler: prevent spurious placement on reconnect
When a client reconnects it makes two independent RPC calls:
- `Node.UpdateStatus` to heartbeat and set its status as `ready`.
- `Node.UpdateAlloc` to update the status of its allocations.
These two calls can happen in any order, and in case the allocations are
updated before a heartbeat it causes the state to be the same as a node
being disconnected: the node status will still be `disconnected` while
the allocation `ClientStatus` is set to `running`.
The current implementation did not handle this order of events properly,
and the scheduler would create an unnecessary placement since it
considered the allocation was being disconnected. This extra allocation
would then be quickly stopped by the heartbeat eval.
This commit adds a new code path to handle this order of events. If the
node is `disconnected` and the allocation `ClientStatus` is `running`
the scheduler will check if the allocation is actually reconnecting
using its `AllocState` events.
* rpc: only allow alloc updates from `ready` nodes
Clients interact with servers using three main RPC methods:
- `Node.GetAllocs` reads allocation data from the server and writes it
to the client.
- `Node.UpdateAlloc` reads allocations from the client and writes
them to the server.
- `Node.UpdateStatus` writes the client status to the server and is
used as the heartbeat mechanism.
These three methods are called periodically by the clients and are done
so independently from each other, meaning that there can't be any
assumptions in their ordering.
This can generate scenarios that are hard to reason about and to code
for. For example, when a client misses too many heartbeats it will be
considered `down` or `disconnected` and the allocations it was running
are set to `lost` or `unknown`.
When connectivity is restored to the rest of the cluster, the natural
mental model is to think that the client will heartbeat first and then
update its allocations status into the servers.
But since there's no inherent order in these calls the reverse is just as
possible: the client updates the alloc status and then heartbeats. This
results in a state where allocs are, for example, `running` while the
client is still `disconnected`.
This commit adds a new verification to the `Node.UpdateAlloc` method to
reject updates from nodes that are not `ready`, forcing clients to
heartbeat first. Since this check is done server-side there is no need
to coordinate operations client-side: they can continue sending these
requests independently and alloc update will succeed after the heartbeat
is done.
* changelog: add entry for #15068
* code review
* client: skip terminal allocations on reconnect
When the client reconnects with the server it synchronizes the state of
its allocations by sending data using the `Node.UpdateAlloc` RPC and
fetching data using the `Node.GetClientAllocs` RPC.
If the data fetch happens before the data write, `unknown` allocations
will still be in this state and would trigger the
`allocRunner.Reconnect` flow.
But when the server `DesiredStatus` for the allocation is `stop` the
client should not reconnect the allocation.
* apply more code review changes
* scheduler: persist changes to reconnected allocs
Reconnected allocs have a new AllocState entry that must be persisted by
the plan applier.
* rpc: read node ID from allocs in UpdateAlloc
The AllocUpdateRequest struct is used in three disjoint use cases:
1. Stripped allocs from clients Node.UpdateAlloc RPC using the Allocs,
and WriteRequest fields
2. Raft log message using the Allocs, Evals, and WriteRequest fields
3. Plan updates using the AllocsStopped, AllocsUpdated, and Job fields
Adding a new field that would only be used in one of these cases (1) made
things more confusing and error prone. While in theory an
AllocUpdateRequest could send allocations from different nodes, in
practice this never actually happens since only clients call this method
with their own allocations.
* scheduler: remove logic to handle exceptional case
This condition could only be hit if, somehow, the allocation status was
set to "running" while the client was "unknown". This was addressed by
enforcing an order in "Node.UpdateStatus" and "Node.UpdateAlloc" RPC
calls, so this scenario is not expected to happen.
Adding unnecessary code to the scheduler makes it harder to read and
reason about it.
* more code review
* remove another unused test
2022-11-04 20:25:11 +00:00
|
|
|
reconnect {
|
|
|
|
|
|
|
|
// Record the new ClientStatus to indicate to future evals that the
|
|
|
|
// alloc has already reconnected.
|
|
|
|
reconnecting := exist.Copy()
|
|
|
|
reconnecting.AppendState(structs.AllocStateFieldClientStatus, exist.ClientStatus)
|
2022-04-15 13:31:32 +00:00
|
|
|
result.reconnecting = append(result.reconnecting, allocTuple{
|
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
Update alloc after reconnect and enforce client heartbeat order (#15068)
* scheduler: allow updates after alloc reconnects
When an allocation reconnects to a cluster the scheduler needs to run
special logic to handle the reconnection, check if a replacement was
created and stop one of them.
If the allocation kept running while the node was disconnected, it will
be reconnected with `ClientStatus: running` and the node will have
`Status: ready`. This combination is the same as the normal steady state
of allocation, where everything is running as expected.
In order to differentiate between the two states (an allocation that is
reconnecting and one that is just running) the scheduler needs an extra
piece of state.
The current implementation uses the presence of a
`TaskClientReconnected` task event to detect when the allocation has
reconnected and thus must go through the reconnection process. But this
event remains even after the allocation is reconnected, causing all
future evals to consider the allocation as still reconnecting.
This commit changes the reconnect logic to use an `AllocState` to
register when the allocation was reconnected. This provides the
following benefits:
- Only a limited number of task states are kept, and they are used for
many other events. It's possible that, upon reconnecting, several
actions are triggered that could cause the `TaskClientReconnected`
event to be dropped.
- Task events are set by clients and so their timestamps are subject
to time skew from servers. This prevents using time to determine if
an allocation reconnected after a disconnect event.
- Disconnect events are already stored as `AllocState` and so storing
reconnects there as well makes it the only source of information
required.
With the new logic, the reconnection logic is only triggered if the
last `AllocState` is a disconnect event, meaning that the allocation has
not been reconnected yet. After the reconnection is handled, the new
`ClientStatus` is stored in `AllocState` allowing future evals to skip
the reconnection logic.
* scheduler: prevent spurious placement on reconnect
When a client reconnects it makes two independent RPC calls:
- `Node.UpdateStatus` to heartbeat and set its status as `ready`.
- `Node.UpdateAlloc` to update the status of its allocations.
These two calls can happen in any order, and in case the allocations are
updated before a heartbeat it causes the state to be the same as a node
being disconnected: the node status will still be `disconnected` while
the allocation `ClientStatus` is set to `running`.
The current implementation did not handle this order of events properly,
and the scheduler would create an unnecessary placement since it
considered the allocation was being disconnected. This extra allocation
would then be quickly stopped by the heartbeat eval.
This commit adds a new code path to handle this order of events. If the
node is `disconnected` and the allocation `ClientStatus` is `running`
the scheduler will check if the allocation is actually reconnecting
using its `AllocState` events.
* rpc: only allow alloc updates from `ready` nodes
Clients interact with servers using three main RPC methods:
- `Node.GetAllocs` reads allocation data from the server and writes it
to the client.
- `Node.UpdateAlloc` reads allocations from the client and writes
them to the server.
- `Node.UpdateStatus` writes the client status to the server and is
used as the heartbeat mechanism.
These three methods are called periodically by the clients and are done
so independently from each other, meaning that there can't be any
assumptions in their ordering.
This can generate scenarios that are hard to reason about and to code
for. For example, when a client misses too many heartbeats it will be
considered `down` or `disconnected` and the allocations it was running
are set to `lost` or `unknown`.
When connectivity is restored to the rest of the cluster, the natural
mental model is to think that the client will heartbeat first and then
update its allocations status into the servers.
But since there's no inherent order in these calls the reverse is just as
possible: the client updates the alloc status and then heartbeats. This
results in a state where allocs are, for example, `running` while the
client is still `disconnected`.
This commit adds a new verification to the `Node.UpdateAlloc` method to
reject updates from nodes that are not `ready`, forcing clients to
heartbeat first. Since this check is done server-side there is no need
to coordinate operations client-side: they can continue sending these
requests independently and alloc update will succeed after the heartbeat
is done.
* changelog: add entry for #15068
* code review
* client: skip terminal allocations on reconnect
When the client reconnects with the server it synchronizes the state of
its allocations by sending data using the `Node.UpdateAlloc` RPC and
fetching data using the `Node.GetClientAllocs` RPC.
If the data fetch happens before the data write, `unknown` allocations
will still be in this state and would trigger the
`allocRunner.Reconnect` flow.
But when the server `DesiredStatus` for the allocation is `stop` the
client should not reconnect the allocation.
* apply more code review changes
* scheduler: persist changes to reconnected allocs
Reconnected allocs have a new AllocState entry that must be persisted by
the plan applier.
* rpc: read node ID from allocs in UpdateAlloc
The AllocUpdateRequest struct is used in three disjoint use cases:
1. Stripped allocs from clients Node.UpdateAlloc RPC using the Allocs,
and WriteRequest fields
2. Raft log message using the Allocs, Evals, and WriteRequest fields
3. Plan updates using the AllocsStopped, AllocsUpdated, and Job fields
Adding a new field that would only be used in one of these cases (1) made
things more confusing and error prone. While in theory an
AllocUpdateRequest could send allocations from different nodes, in
practice this never actually happens since only clients call this method
with their own allocations.
* scheduler: remove logic to handle exceptional case
This condition could only be hit if, somehow, the allocation status was
set to "running" while the client was "unknown". This was addressed by
enforcing an order in "Node.UpdateStatus" and "Node.UpdateAlloc" RPC
calls, so this scenario is not expected to happen.
Adding unnecessary code to the scheduler makes it harder to read and
reason about it.
* more code review
* remove another unused test
2022-11-04 20:25:11 +00:00
|
|
|
Alloc: reconnecting,
|
2022-04-15 13:31:32 +00:00
|
|
|
})
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2016-05-25 00:23:18 +00:00
|
|
|
// If we are on a tainted node, we must migrate if we are a service or
|
|
|
|
// if the batch allocation did not finish
|
2022-04-15 13:31:32 +00:00
|
|
|
if nodeIsTainted {
|
2016-06-16 23:17:17 +00:00
|
|
|
// If the job is batch and finished successfully, the fact that the
|
2016-08-03 22:45:42 +00:00
|
|
|
// node is tainted does not mean it should be migrated or marked as
|
|
|
|
// lost as the work was already successfully finished. However for
|
|
|
|
// service/system jobs, tasks should never complete. The check of
|
|
|
|
// batch type, defends against client bugs.
|
2022-04-15 13:31:32 +00:00
|
|
|
if exist.Job.Type == structs.JobTypeSysBatch && exist.RanSuccessfully() {
|
2016-05-25 00:47:03 +00:00
|
|
|
goto IGNORE
|
2016-05-25 00:23:18 +00:00
|
|
|
}
|
2016-08-03 22:45:42 +00:00
|
|
|
|
2022-04-15 13:31:32 +00:00
|
|
|
// Filter running allocs on a node that is disconnected to be marked as unknown.
|
|
|
|
if node != nil &&
|
|
|
|
supportsDisconnectedClients &&
|
|
|
|
node.Status == structs.NodeStatusDisconnected &&
|
|
|
|
exist.ClientStatus == structs.AllocClientStatusRunning {
|
|
|
|
|
|
|
|
disconnect := exist.Copy()
|
|
|
|
disconnect.ClientStatus = structs.AllocClientStatusUnknown
|
|
|
|
disconnect.AppendState(structs.AllocStateFieldClientStatus, structs.AllocClientStatusUnknown)
|
|
|
|
disconnect.ClientDescription = allocUnknown
|
|
|
|
result.disconnecting = append(result.disconnecting, allocTuple{
|
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
|
|
|
Alloc: disconnect,
|
|
|
|
})
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2018-03-28 18:57:47 +00:00
|
|
|
if !exist.TerminalStatus() && (node == nil || node.TerminalStatus()) {
|
|
|
|
result.lost = append(result.lost, allocTuple{
|
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
|
|
|
Alloc: exist,
|
|
|
|
})
|
2016-08-03 22:45:42 +00:00
|
|
|
} else {
|
2018-02-21 18:58:04 +00:00
|
|
|
goto IGNORE
|
2016-08-03 22:45:42 +00:00
|
|
|
}
|
2018-02-21 18:58:04 +00:00
|
|
|
|
2015-08-14 01:20:55 +00:00
|
|
|
continue
|
|
|
|
}
|
2015-08-13 23:25:59 +00:00
|
|
|
|
2020-01-30 16:37:14 +00:00
|
|
|
// For an existing allocation, if the nodeID is no longer
|
|
|
|
// eligible, the diff should be ignored
|
2022-04-15 13:31:32 +00:00
|
|
|
if _, ineligible := notReadyNodes[nodeID]; ineligible {
|
2020-01-30 16:37:14 +00:00
|
|
|
goto IGNORE
|
|
|
|
}
|
|
|
|
|
2021-10-27 14:04:13 +00:00
|
|
|
// Existing allocations on nodes that are no longer targeted
|
|
|
|
// should be stopped
|
2022-04-15 13:31:32 +00:00
|
|
|
if _, eligible := eligibleNodes[nodeID]; !eligible {
|
2021-10-27 14:04:13 +00:00
|
|
|
result.stop = append(result.stop, allocTuple{
|
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
|
|
|
Alloc: exist,
|
|
|
|
})
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2015-08-14 01:20:55 +00:00
|
|
|
// If the definition is updated we need to update
|
2016-01-12 17:50:33 +00:00
|
|
|
if job.JobModifyIndex != exist.Job.JobModifyIndex {
|
2015-10-16 18:43:09 +00:00
|
|
|
result.update = append(result.update, allocTuple{
|
2015-08-14 01:16:32 +00:00
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
|
|
|
Alloc: exist,
|
|
|
|
})
|
2015-08-14 01:20:55 +00:00
|
|
|
continue
|
2015-08-13 23:25:59 +00:00
|
|
|
}
|
2015-08-14 01:20:55 +00:00
|
|
|
|
|
|
|
// Everything is up-to-date
|
2016-05-25 00:47:03 +00:00
|
|
|
IGNORE:
|
2015-10-16 18:43:09 +00:00
|
|
|
result.ignore = append(result.ignore, allocTuple{
|
2015-08-14 01:20:55 +00:00
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
|
|
|
Alloc: exist,
|
|
|
|
})
|
2015-08-13 23:25:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Scan the required groups
|
2015-08-14 01:16:32 +00:00
|
|
|
for name, tg := range required {
|
2020-10-09 21:31:38 +00:00
|
|
|
|
2015-08-13 23:25:59 +00:00
|
|
|
// Check for an existing allocation
|
2020-10-09 21:31:38 +00:00
|
|
|
if _, ok := existing[name]; !ok {
|
|
|
|
|
|
|
|
// Check for a terminal sysbatch allocation, which should be not placed
|
|
|
|
// again unless the job has been updated.
|
|
|
|
if job.Type == structs.JobTypeSysBatch {
|
|
|
|
if alloc, termExists := terminal.Get(nodeID, name); termExists {
|
|
|
|
// the alloc is terminal, but now the job has been updated
|
|
|
|
if job.JobModifyIndex != alloc.Job.JobModifyIndex {
|
|
|
|
result.update = append(result.update, allocTuple{
|
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
|
|
|
Alloc: alloc,
|
|
|
|
})
|
|
|
|
} else {
|
|
|
|
// alloc is terminal and job unchanged, leave it alone
|
|
|
|
result.ignore = append(result.ignore, allocTuple{
|
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
|
|
|
Alloc: alloc,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Require a placement if no existing allocation. If there
|
|
|
|
// is an existing allocation, we would have checked for a potential
|
|
|
|
// update or ignore above. Ignore placements for tainted or
|
|
|
|
// ineligible nodes
|
2015-08-13 23:25:59 +00:00
|
|
|
|
2020-01-30 18:37:59 +00:00
|
|
|
// Tainted and ineligible nodes for a non existing alloc
|
|
|
|
// should be filtered out and not count towards ignore or place
|
2020-01-30 16:37:14 +00:00
|
|
|
if _, tainted := taintedNodes[nodeID]; tainted {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if _, eligible := eligibleNodes[nodeID]; !eligible {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2020-10-09 21:31:38 +00:00
|
|
|
termOnNode, _ := terminal.Get(nodeID, name)
|
2020-01-30 16:37:14 +00:00
|
|
|
allocTuple := allocTuple{
|
2015-08-14 01:16:32 +00:00
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
2020-10-09 21:31:38 +00:00
|
|
|
Alloc: termOnNode,
|
2020-01-30 16:37:14 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// If the new allocation isn't annotated with a previous allocation
|
|
|
|
// or if the previous allocation isn't from the same node then we
|
|
|
|
// annotate the allocTuple with a new Allocation
|
|
|
|
if allocTuple.Alloc == nil || allocTuple.Alloc.NodeID != nodeID {
|
|
|
|
allocTuple.Alloc = &structs.Allocation{NodeID: nodeID}
|
|
|
|
}
|
2020-10-09 21:31:38 +00:00
|
|
|
|
2020-01-30 16:37:14 +00:00
|
|
|
result.place = append(result.place, allocTuple)
|
2015-08-13 23:25:59 +00:00
|
|
|
}
|
|
|
|
}
|
2015-08-14 01:28:09 +00:00
|
|
|
return result
|
2015-08-13 23:25:59 +00:00
|
|
|
}
|
|
|
|
|
2020-01-30 16:37:14 +00:00
|
|
|
// diffSystemAllocs is like diffSystemAllocsForNode however, the allocations in the
|
2015-10-15 20:14:44 +00:00
|
|
|
// diffResult contain the specific nodeID they should be allocated on.
|
2020-10-09 21:31:38 +00:00
|
|
|
func diffSystemAllocs(
|
|
|
|
job *structs.Job, // jobs whose allocations are going to be diff-ed
|
2021-10-27 14:04:13 +00:00
|
|
|
readyNodes []*structs.Node, // list of nodes in the ready state
|
|
|
|
notReadyNodes map[string]struct{}, // list of nodes in DC but not ready, e.g. draining
|
|
|
|
taintedNodes map[string]*structs.Node, // nodes which are down or drain mode (by node id)
|
2020-10-09 21:31:38 +00:00
|
|
|
allocs []*structs.Allocation, // non-terminal allocations
|
2021-10-27 14:04:13 +00:00
|
|
|
terminal structs.TerminalByNodeByName, // latest terminal allocations (by node id)
|
2022-04-15 13:31:32 +00:00
|
|
|
serverSupportsDisconnectedClients bool, // flag indicating whether to apply disconnected client logic
|
2020-10-09 21:31:38 +00:00
|
|
|
) *diffResult {
|
2015-10-15 20:14:44 +00:00
|
|
|
|
|
|
|
// Build a mapping of nodes to all their allocs.
|
|
|
|
nodeAllocs := make(map[string][]*structs.Allocation, len(allocs))
|
|
|
|
for _, alloc := range allocs {
|
2021-10-27 14:04:13 +00:00
|
|
|
nodeAllocs[alloc.NodeID] = append(nodeAllocs[alloc.NodeID], alloc)
|
2015-10-15 20:14:44 +00:00
|
|
|
}
|
|
|
|
|
2020-01-30 16:37:14 +00:00
|
|
|
eligibleNodes := make(map[string]*structs.Node)
|
2021-10-27 14:04:13 +00:00
|
|
|
for _, node := range readyNodes {
|
2015-10-15 20:14:44 +00:00
|
|
|
if _, ok := nodeAllocs[node.ID]; !ok {
|
|
|
|
nodeAllocs[node.ID] = nil
|
|
|
|
}
|
2020-01-30 16:37:14 +00:00
|
|
|
eligibleNodes[node.ID] = node
|
2015-10-15 20:14:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Create the required task groups.
|
|
|
|
required := materializeTaskGroups(job)
|
|
|
|
|
2020-10-09 21:31:38 +00:00
|
|
|
result := new(diffResult)
|
2015-10-15 20:14:44 +00:00
|
|
|
for nodeID, allocs := range nodeAllocs {
|
2022-04-15 13:31:32 +00:00
|
|
|
diff := diffSystemAllocsForNode(job, nodeID, eligibleNodes, notReadyNodes, taintedNodes, required, allocs, terminal, serverSupportsDisconnectedClients)
|
2015-10-15 20:14:44 +00:00
|
|
|
result.Append(diff)
|
|
|
|
}
|
|
|
|
|
|
|
|
return result
|
|
|
|
}
|
|
|
|
|
2016-01-04 20:07:33 +00:00
|
|
|
// readyNodesInDCs returns all the ready nodes in the given datacenters and a
|
|
|
|
// mapping of each data center to the count of ready nodes.
|
2021-10-27 14:04:13 +00:00
|
|
|
func readyNodesInDCs(state State, dcs []string) ([]*structs.Node, map[string]struct{}, map[string]int, error) {
|
2015-08-15 20:11:42 +00:00
|
|
|
// Index the DCs
|
2023-02-02 14:57:45 +00:00
|
|
|
dcMap := make(map[string]int)
|
2015-08-15 20:11:42 +00:00
|
|
|
|
|
|
|
// Scan the nodes
|
2017-02-08 04:31:23 +00:00
|
|
|
ws := memdb.NewWatchSet()
|
2015-08-15 20:11:42 +00:00
|
|
|
var out []*structs.Node
|
2021-10-27 14:04:13 +00:00
|
|
|
notReady := map[string]struct{}{}
|
2017-02-08 04:31:23 +00:00
|
|
|
iter, err := state.Nodes(ws)
|
2015-08-15 20:11:42 +00:00
|
|
|
if err != nil {
|
2021-10-27 14:04:13 +00:00
|
|
|
return nil, nil, nil, err
|
2015-08-15 20:11:42 +00:00
|
|
|
}
|
|
|
|
for {
|
|
|
|
raw := iter.Next()
|
|
|
|
if raw == nil {
|
|
|
|
break
|
2015-08-14 00:19:09 +00:00
|
|
|
}
|
2015-08-15 20:11:42 +00:00
|
|
|
|
|
|
|
// Filter on datacenter and status
|
|
|
|
node := raw.(*structs.Node)
|
2021-03-26 17:03:15 +00:00
|
|
|
if !node.Ready() {
|
2021-10-27 14:04:13 +00:00
|
|
|
notReady[node.ID] = struct{}{}
|
2018-01-24 00:47:00 +00:00
|
|
|
continue
|
|
|
|
}
|
2023-02-02 14:57:45 +00:00
|
|
|
for _, dc := range dcs {
|
|
|
|
if node.IsInDC(dc) {
|
|
|
|
out = append(out, node)
|
|
|
|
dcMap[node.Datacenter]++
|
|
|
|
break
|
|
|
|
}
|
2015-08-14 00:19:09 +00:00
|
|
|
}
|
|
|
|
}
|
2021-10-27 14:04:13 +00:00
|
|
|
return out, notReady, dcMap, nil
|
2015-08-14 00:19:09 +00:00
|
|
|
}
|
2015-08-14 00:40:23 +00:00
|
|
|
|
|
|
|
// retryMax is used to retry a callback until it returns success or
|
2016-02-10 05:24:47 +00:00
|
|
|
// a maximum number of attempts is reached. An optional reset function may be
|
|
|
|
// passed which is called after each failed iteration. If the reset function is
|
|
|
|
// set and returns true, the number of attempts is reset back to max.
|
|
|
|
func retryMax(max int, cb func() (bool, error), reset func() bool) error {
|
2015-08-14 00:40:23 +00:00
|
|
|
attempts := 0
|
|
|
|
for attempts < max {
|
|
|
|
done, err := cb()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if done {
|
|
|
|
return nil
|
|
|
|
}
|
2016-02-10 05:24:47 +00:00
|
|
|
|
|
|
|
// Check if we should reset the number attempts
|
|
|
|
if reset != nil && reset() {
|
|
|
|
attempts = 0
|
|
|
|
} else {
|
2017-09-26 22:26:33 +00:00
|
|
|
attempts++
|
2016-02-10 05:24:47 +00:00
|
|
|
}
|
2015-08-14 00:40:23 +00:00
|
|
|
}
|
2015-08-15 21:47:13 +00:00
|
|
|
return &SetStatusError{
|
|
|
|
Err: fmt.Errorf("maximum attempts reached (%d)", max),
|
|
|
|
EvalStatus: structs.EvalStatusFailed,
|
|
|
|
}
|
2015-08-14 00:40:23 +00:00
|
|
|
}
|
2015-08-14 00:51:31 +00:00
|
|
|
|
2016-02-10 05:24:47 +00:00
|
|
|
// progressMade checks to see if the plan result made allocations or updates.
|
|
|
|
// If the result is nil, false is returned.
|
|
|
|
func progressMade(result *structs.PlanResult) bool {
|
2016-02-22 18:38:04 +00:00
|
|
|
return result != nil && (len(result.NodeUpdate) != 0 ||
|
2017-07-06 16:55:39 +00:00
|
|
|
len(result.NodeAllocation) != 0 || result.Deployment != nil ||
|
|
|
|
len(result.DeploymentUpdates) != 0)
|
2016-02-10 05:24:47 +00:00
|
|
|
}
|
|
|
|
|
2015-08-14 00:51:31 +00:00
|
|
|
// taintedNodes is used to scan the allocations and then check if the
|
2022-02-16 18:50:20 +00:00
|
|
|
// underlying nodes are tainted, and should force a migration of the allocation,
|
|
|
|
// or if the underlying nodes are disconnected, and should be used to calculate
|
|
|
|
// the reconnect timeout of its allocations. All the nodes returned in the map are tainted.
|
2016-08-03 22:45:42 +00:00
|
|
|
func taintedNodes(state State, allocs []*structs.Allocation) (map[string]*structs.Node, error) {
|
|
|
|
out := make(map[string]*structs.Node)
|
2015-08-14 00:51:31 +00:00
|
|
|
for _, alloc := range allocs {
|
|
|
|
if _, ok := out[alloc.NodeID]; ok {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2017-02-08 04:31:23 +00:00
|
|
|
ws := memdb.NewWatchSet()
|
|
|
|
node, err := state.NodeByID(ws, alloc.NodeID)
|
2015-08-14 00:51:31 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2015-08-14 01:05:31 +00:00
|
|
|
// If the node does not exist, we should migrate
|
|
|
|
if node == nil {
|
2016-08-03 22:45:42 +00:00
|
|
|
out[alloc.NodeID] = nil
|
2015-08-14 01:05:31 +00:00
|
|
|
continue
|
|
|
|
}
|
2021-02-11 15:40:59 +00:00
|
|
|
if structs.ShouldDrainNode(node.Status) || node.DrainStrategy != nil {
|
2018-02-21 18:58:04 +00:00
|
|
|
out[alloc.NodeID] = node
|
|
|
|
}
|
2022-02-16 18:50:20 +00:00
|
|
|
|
|
|
|
// Disconnected nodes are included in the tainted set so that their
|
|
|
|
// MaxClientDisconnect configuration can be included in the
|
|
|
|
// timeout calculation.
|
|
|
|
if node.Status == structs.NodeStatusDisconnected {
|
|
|
|
out[alloc.NodeID] = node
|
|
|
|
}
|
2015-08-14 00:51:31 +00:00
|
|
|
}
|
2022-02-16 18:50:20 +00:00
|
|
|
|
2015-08-14 00:51:31 +00:00
|
|
|
return out, nil
|
|
|
|
}
|
2015-09-07 18:23:38 +00:00
|
|
|
|
2022-02-08 17:16:33 +00:00
|
|
|
// shuffleNodes randomizes the slice order with the Fisher-Yates
|
|
|
|
// algorithm. We seed the random source with the eval ID (which is
|
|
|
|
// random) to aid in postmortem debugging of specific evaluations and
|
|
|
|
// state snapshots.
|
|
|
|
func shuffleNodes(plan *structs.Plan, index uint64, nodes []*structs.Node) {
|
|
|
|
|
|
|
|
// use the last 4 bytes because those are the random bits
|
|
|
|
// if we have sortable IDs
|
|
|
|
buf := []byte(plan.EvalID)
|
|
|
|
seed := binary.BigEndian.Uint64(buf[len(buf)-8:])
|
|
|
|
|
|
|
|
// for retried plans the index is the plan result's RefreshIndex
|
|
|
|
// so that we don't retry with the exact same shuffle
|
|
|
|
seed ^= index
|
|
|
|
r := rand.New(rand.NewSource(int64(seed >> 2)))
|
|
|
|
|
2015-09-07 18:23:38 +00:00
|
|
|
n := len(nodes)
|
|
|
|
for i := n - 1; i > 0; i-- {
|
2022-02-08 17:16:33 +00:00
|
|
|
j := r.Intn(i + 1)
|
2015-09-07 18:23:38 +00:00
|
|
|
nodes[i], nodes[j] = nodes[j], nodes[i]
|
|
|
|
}
|
|
|
|
}
|
2015-09-07 19:25:23 +00:00
|
|
|
|
|
|
|
// tasksUpdated does a diff between task groups to see if the
|
2016-12-16 01:08:38 +00:00
|
|
|
// tasks, their drivers, environment variables or config have updated. The
|
|
|
|
// inputs are the task group name to diff and two jobs to diff.
|
2019-11-18 21:06:25 +00:00
|
|
|
// taskUpdated and functions called within assume that the given
|
|
|
|
// taskGroup has already been checked to not be nil
|
2016-12-16 01:08:38 +00:00
|
|
|
func tasksUpdated(jobA, jobB *structs.Job, taskGroup string) bool {
|
|
|
|
a := jobA.LookupTaskGroup(taskGroup)
|
|
|
|
b := jobB.LookupTaskGroup(taskGroup)
|
|
|
|
|
2015-09-07 19:25:23 +00:00
|
|
|
// If the number of tasks do not match, clearly there is an update
|
|
|
|
if len(a.Tasks) != len(b.Tasks) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2016-09-21 21:00:02 +00:00
|
|
|
// Check ephemeral disk
|
|
|
|
if !reflect.DeepEqual(a.EphemeralDisk, b.EphemeralDisk) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2019-04-29 17:35:15 +00:00
|
|
|
// Check that the network resources haven't changed
|
|
|
|
if networkUpdated(a.Networks, b.Networks) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2019-11-14 20:34:38 +00:00
|
|
|
// Check Affinities
|
|
|
|
if affinitiesUpdated(jobA, jobB, taskGroup) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check Spreads
|
|
|
|
if spreadsUpdated(jobA, jobB, taskGroup) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2021-03-16 18:22:21 +00:00
|
|
|
// Check consul namespace updated
|
|
|
|
if consulNamespaceUpdated(a, b) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2020-10-05 19:13:39 +00:00
|
|
|
// Check connect service(s) updated
|
|
|
|
if connectServiceUpdated(a.Services, b.Services) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2022-05-13 15:34:04 +00:00
|
|
|
// Check if volumes are updated (no task driver can support
|
|
|
|
// altering mounts in-place)
|
|
|
|
if !reflect.DeepEqual(a.Volumes, b.Volumes) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2015-09-07 19:25:23 +00:00
|
|
|
// Check each task
|
|
|
|
for _, at := range a.Tasks {
|
|
|
|
bt := b.LookupTask(at.Name)
|
|
|
|
if bt == nil {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
if at.Driver != bt.Driver {
|
|
|
|
return true
|
|
|
|
}
|
2016-04-26 00:20:25 +00:00
|
|
|
if at.User != bt.User {
|
|
|
|
return true
|
|
|
|
}
|
2015-09-07 19:25:23 +00:00
|
|
|
if !reflect.DeepEqual(at.Config, bt.Config) {
|
|
|
|
return true
|
|
|
|
}
|
2015-10-23 21:52:06 +00:00
|
|
|
if !reflect.DeepEqual(at.Env, bt.Env) {
|
|
|
|
return true
|
|
|
|
}
|
2016-04-26 00:20:25 +00:00
|
|
|
if !reflect.DeepEqual(at.Artifacts, bt.Artifacts) {
|
|
|
|
return true
|
2015-10-04 19:53:02 +00:00
|
|
|
}
|
2016-09-21 18:29:50 +00:00
|
|
|
if !reflect.DeepEqual(at.Vault, bt.Vault) {
|
|
|
|
return true
|
|
|
|
}
|
2016-10-17 18:41:22 +00:00
|
|
|
if !reflect.DeepEqual(at.Templates, bt.Templates) {
|
|
|
|
return true
|
|
|
|
}
|
2022-04-25 16:59:25 +00:00
|
|
|
if !reflect.DeepEqual(at.CSIPluginConfig, bt.CSIPluginConfig) {
|
|
|
|
return true
|
|
|
|
}
|
2022-05-13 15:34:04 +00:00
|
|
|
if !reflect.DeepEqual(at.VolumeMounts, bt.VolumeMounts) {
|
|
|
|
return true
|
|
|
|
}
|
2016-05-06 04:32:01 +00:00
|
|
|
|
2016-12-16 01:08:38 +00:00
|
|
|
// Check the metadata
|
|
|
|
if !reflect.DeepEqual(
|
|
|
|
jobA.CombinedTaskMeta(taskGroup, at.Name),
|
|
|
|
jobB.CombinedTaskMeta(taskGroup, bt.Name)) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2016-05-06 04:32:01 +00:00
|
|
|
// Inspect the network to see if the dynamic ports are different
|
2019-05-08 15:09:35 +00:00
|
|
|
if networkUpdated(at.Resources.Networks, bt.Resources.Networks) {
|
2016-05-06 04:32:01 +00:00
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
// Inspect the non-network resources
|
|
|
|
if ar, br := at.Resources, bt.Resources; ar.CPU != br.CPU {
|
|
|
|
return true
|
2021-03-20 02:25:50 +00:00
|
|
|
} else if ar.Cores != br.Cores {
|
|
|
|
return true
|
2016-05-06 04:32:01 +00:00
|
|
|
} else if ar.MemoryMB != br.MemoryMB {
|
|
|
|
return true
|
2021-03-26 20:01:27 +00:00
|
|
|
} else if ar.MemoryMaxMB != br.MemoryMaxMB {
|
|
|
|
return true
|
2022-10-10 14:28:46 +00:00
|
|
|
} else if !ar.Devices.Equal(&br.Devices) {
|
2019-11-07 17:51:15 +00:00
|
|
|
return true
|
2016-05-06 04:32:01 +00:00
|
|
|
}
|
2015-09-07 19:25:23 +00:00
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
2015-10-15 00:26:20 +00:00
|
|
|
|
2021-03-16 18:22:21 +00:00
|
|
|
// consulNamespaceUpdated returns true if the Consul namespace in the task group
|
|
|
|
// has been changed.
|
|
|
|
//
|
|
|
|
// This is treated as a destructive update unlike ordinary Consul service configuration
|
|
|
|
// because Namespaces directly impact networking validity among Consul intentions.
|
|
|
|
// Forcing the task through a reschedule is a sure way of breaking no-longer valid
|
|
|
|
// network connections.
|
|
|
|
func consulNamespaceUpdated(tgA, tgB *structs.TaskGroup) bool {
|
|
|
|
// job.ConsulNamespace is pushed down to the TGs, just check those
|
|
|
|
return tgA.Consul.GetNamespace() != tgB.Consul.GetNamespace()
|
|
|
|
}
|
|
|
|
|
2023-01-30 14:48:43 +00:00
|
|
|
// connectServiceUpdated returns true if any services with a connect block have
|
2020-10-05 19:13:39 +00:00
|
|
|
// been changed in such a way that requires a destructive update.
|
|
|
|
//
|
|
|
|
// Ordinary services can be updated in-place by updating the service definition
|
|
|
|
// in Consul. Connect service changes mostly require destroying the task.
|
|
|
|
func connectServiceUpdated(servicesA, servicesB []*structs.Service) bool {
|
|
|
|
for _, serviceA := range servicesA {
|
|
|
|
if serviceA.Connect != nil {
|
|
|
|
for _, serviceB := range servicesB {
|
|
|
|
if serviceA.Name == serviceB.Name {
|
|
|
|
if connectUpdated(serviceA.Connect, serviceB.Connect) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
// Part of the Connect plumbing is derived from port label,
|
|
|
|
// if that changes we need to destroy the task.
|
|
|
|
if serviceA.PortLabel != serviceB.PortLabel {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
// connectUpdated returns true if the connect block has been updated in a manner
|
|
|
|
// that will require a destructive update.
|
|
|
|
//
|
|
|
|
// Fields that can be updated through consul-sync do not need a destructive
|
|
|
|
// update.
|
|
|
|
func connectUpdated(connectA, connectB *structs.ConsulConnect) bool {
|
|
|
|
if connectA == nil || connectB == nil {
|
2021-02-23 21:08:39 +00:00
|
|
|
return connectA != connectB
|
2020-10-05 19:13:39 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if connectA.Native != connectB.Native {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2022-10-10 14:28:46 +00:00
|
|
|
if !connectA.Gateway.Equal(connectB.Gateway) {
|
2020-10-05 19:13:39 +00:00
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2022-10-10 14:28:46 +00:00
|
|
|
if !connectA.SidecarTask.Equal(connectB.SidecarTask) {
|
2020-10-05 19:13:39 +00:00
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
// not everything in sidecar_service needs task destruction
|
|
|
|
if connectSidecarServiceUpdated(connectA.SidecarService, connectB.SidecarService) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
func connectSidecarServiceUpdated(ssA, ssB *structs.ConsulSidecarService) bool {
|
|
|
|
if ssA == nil || ssB == nil {
|
2021-02-23 21:08:39 +00:00
|
|
|
return ssA != ssB
|
2020-10-05 19:13:39 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if ssA.Port != ssB.Port {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
// sidecar_service.tags handled in-place (registration)
|
|
|
|
|
|
|
|
// sidecar_service.proxy handled in-place (registration + xDS)
|
|
|
|
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2019-04-29 17:35:15 +00:00
|
|
|
func networkUpdated(netA, netB []*structs.NetworkResource) bool {
|
|
|
|
if len(netA) != len(netB) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
for idx := range netA {
|
|
|
|
an := netA[idx]
|
|
|
|
bn := netB[idx]
|
|
|
|
|
2020-03-21 20:51:10 +00:00
|
|
|
if an.Mode != bn.Mode {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2019-04-29 17:35:15 +00:00
|
|
|
if an.MBits != bn.MBits {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2021-09-16 06:13:09 +00:00
|
|
|
if an.Hostname != bn.Hostname {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2020-04-28 03:11:06 +00:00
|
|
|
if !reflect.DeepEqual(an.DNS, bn.DNS) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2019-04-29 17:35:15 +00:00
|
|
|
aPorts, bPorts := networkPortMap(an), networkPortMap(bn)
|
|
|
|
if !reflect.DeepEqual(aPorts, bPorts) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2021-02-01 20:56:43 +00:00
|
|
|
// networkPortMap takes a network resource and returns a AllocatedPorts.
|
|
|
|
// The value for dynamic ports is disregarded even if it is set. This
|
2016-05-06 04:32:01 +00:00
|
|
|
// makes this function suitable for comparing two network resources for changes.
|
2021-02-01 20:56:43 +00:00
|
|
|
func networkPortMap(n *structs.NetworkResource) structs.AllocatedPorts {
|
|
|
|
var m structs.AllocatedPorts
|
2016-05-06 04:32:01 +00:00
|
|
|
for _, p := range n.ReservedPorts {
|
2021-02-01 20:56:43 +00:00
|
|
|
m = append(m, structs.AllocatedPortMapping{
|
|
|
|
Label: p.Label,
|
|
|
|
Value: p.Value,
|
|
|
|
To: p.To,
|
|
|
|
HostIP: p.HostNetwork,
|
|
|
|
})
|
2016-05-06 04:32:01 +00:00
|
|
|
}
|
|
|
|
for _, p := range n.DynamicPorts {
|
2021-02-01 20:56:43 +00:00
|
|
|
m = append(m, structs.AllocatedPortMapping{
|
|
|
|
Label: p.Label,
|
|
|
|
Value: -1,
|
|
|
|
To: p.To,
|
|
|
|
HostIP: p.HostNetwork,
|
|
|
|
})
|
2016-05-06 04:32:01 +00:00
|
|
|
}
|
|
|
|
return m
|
|
|
|
}
|
|
|
|
|
2019-11-14 20:34:38 +00:00
|
|
|
func affinitiesUpdated(jobA, jobB *structs.Job, taskGroup string) bool {
|
|
|
|
var aAffinities []*structs.Affinity
|
|
|
|
var bAffinities []*structs.Affinity
|
|
|
|
|
|
|
|
tgA := jobA.LookupTaskGroup(taskGroup)
|
|
|
|
tgB := jobB.LookupTaskGroup(taskGroup)
|
|
|
|
|
|
|
|
// Append jobA job and task group level affinities
|
|
|
|
aAffinities = append(aAffinities, jobA.Affinities...)
|
|
|
|
aAffinities = append(aAffinities, tgA.Affinities...)
|
|
|
|
|
|
|
|
// Append jobB job and task group level affinities
|
|
|
|
bAffinities = append(bAffinities, jobB.Affinities...)
|
|
|
|
bAffinities = append(bAffinities, tgB.Affinities...)
|
|
|
|
|
|
|
|
// append task affinities
|
|
|
|
for _, task := range tgA.Tasks {
|
|
|
|
aAffinities = append(aAffinities, task.Affinities...)
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, task := range tgB.Tasks {
|
|
|
|
bAffinities = append(bAffinities, task.Affinities...)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check for equality
|
|
|
|
if len(aAffinities) != len(bAffinities) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
return !reflect.DeepEqual(aAffinities, bAffinities)
|
|
|
|
}
|
|
|
|
|
|
|
|
func spreadsUpdated(jobA, jobB *structs.Job, taskGroup string) bool {
|
|
|
|
var aSpreads []*structs.Spread
|
|
|
|
var bSpreads []*structs.Spread
|
|
|
|
|
|
|
|
tgA := jobA.LookupTaskGroup(taskGroup)
|
|
|
|
tgB := jobB.LookupTaskGroup(taskGroup)
|
|
|
|
|
|
|
|
// append jobA and task group level spreads
|
|
|
|
aSpreads = append(aSpreads, jobA.Spreads...)
|
|
|
|
aSpreads = append(aSpreads, tgA.Spreads...)
|
|
|
|
|
|
|
|
// append jobB and task group level spreads
|
|
|
|
bSpreads = append(bSpreads, jobB.Spreads...)
|
|
|
|
bSpreads = append(bSpreads, tgB.Spreads...)
|
|
|
|
|
|
|
|
// Check for equality
|
|
|
|
if len(aSpreads) != len(bSpreads) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
return !reflect.DeepEqual(aSpreads, bSpreads)
|
|
|
|
}
|
|
|
|
|
2015-10-15 00:26:20 +00:00
|
|
|
// setStatus is used to update the status of the evaluation
|
2018-09-15 23:23:13 +00:00
|
|
|
func setStatus(logger log.Logger, planner Planner,
|
2016-05-27 18:26:14 +00:00
|
|
|
eval, nextEval, spawnedBlocked *structs.Evaluation,
|
2016-07-18 22:04:05 +00:00
|
|
|
tgMetrics map[string]*structs.AllocMetric, status, desc string,
|
2017-07-06 00:13:45 +00:00
|
|
|
queuedAllocs map[string]int, deploymentID string) error {
|
2016-05-27 18:26:14 +00:00
|
|
|
|
2018-09-15 23:23:13 +00:00
|
|
|
logger.Debug("setting eval status", "status", status)
|
2015-10-15 00:26:20 +00:00
|
|
|
newEval := eval.Copy()
|
|
|
|
newEval.Status = status
|
|
|
|
newEval.StatusDescription = desc
|
2017-07-06 00:13:45 +00:00
|
|
|
newEval.DeploymentID = deploymentID
|
2016-05-27 18:26:14 +00:00
|
|
|
newEval.FailedTGAllocs = tgMetrics
|
2015-10-15 00:26:20 +00:00
|
|
|
if nextEval != nil {
|
|
|
|
newEval.NextEval = nextEval.ID
|
|
|
|
}
|
2016-05-19 20:09:52 +00:00
|
|
|
if spawnedBlocked != nil {
|
2016-05-25 01:12:59 +00:00
|
|
|
newEval.BlockedEval = spawnedBlocked.ID
|
2016-05-19 20:09:52 +00:00
|
|
|
}
|
2016-07-18 22:04:05 +00:00
|
|
|
if queuedAllocs != nil {
|
|
|
|
newEval.QueuedAllocations = queuedAllocs
|
|
|
|
}
|
|
|
|
|
2015-10-15 00:26:20 +00:00
|
|
|
return planner.UpdateEval(newEval)
|
|
|
|
}
|
|
|
|
|
2016-05-17 22:37:37 +00:00
|
|
|
// inplaceUpdate attempts to update allocations in-place where possible. It
|
|
|
|
// returns the allocs that couldn't be done inplace and then those that could.
|
2015-10-15 00:26:20 +00:00
|
|
|
func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job,
|
2016-05-17 22:37:37 +00:00
|
|
|
stack Stack, updates []allocTuple) (destructive, inplace []allocTuple) {
|
2015-10-15 00:26:20 +00:00
|
|
|
|
2017-03-12 01:19:22 +00:00
|
|
|
// doInplace manipulates the updates map to make the current allocation
|
|
|
|
// an inplace update.
|
|
|
|
doInplace := func(cur, last, inplaceCount *int) {
|
|
|
|
updates[*cur], updates[*last-1] = updates[*last-1], updates[*cur]
|
|
|
|
*cur--
|
|
|
|
*last--
|
|
|
|
*inplaceCount++
|
|
|
|
}
|
|
|
|
|
2017-02-08 04:31:23 +00:00
|
|
|
ws := memdb.NewWatchSet()
|
2015-10-15 00:26:20 +00:00
|
|
|
n := len(updates)
|
2016-05-17 22:37:37 +00:00
|
|
|
inplaceCount := 0
|
2015-10-15 00:26:20 +00:00
|
|
|
for i := 0; i < n; i++ {
|
|
|
|
// Get the update
|
|
|
|
update := updates[i]
|
|
|
|
|
|
|
|
// Check if the task drivers or config has changed, requires
|
|
|
|
// a rolling upgrade since that cannot be done in-place.
|
2016-12-16 01:08:38 +00:00
|
|
|
existing := update.Alloc.Job
|
|
|
|
if tasksUpdated(job, existing, update.TaskGroup.Name) {
|
2015-10-15 00:26:20 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2017-03-12 01:19:22 +00:00
|
|
|
// Terminal batch allocations are not filtered when they are completed
|
|
|
|
// successfully. We should avoid adding the allocation to the plan in
|
|
|
|
// the case that it is an in-place update to avoid both additional data
|
|
|
|
// in the plan and work for the clients.
|
|
|
|
if update.Alloc.TerminalStatus() {
|
|
|
|
doInplace(&i, &n, &inplaceCount)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2015-10-15 00:26:20 +00:00
|
|
|
// Get the existing node
|
2017-02-08 04:31:23 +00:00
|
|
|
node, err := ctx.State().NodeByID(ws, update.Alloc.NodeID)
|
2015-10-15 00:26:20 +00:00
|
|
|
if err != nil {
|
2018-09-15 23:23:13 +00:00
|
|
|
ctx.Logger().Error("failed to get node", "node_id", update.Alloc.NodeID, "error", err)
|
2015-10-15 00:26:20 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
if node == nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2021-07-07 15:14:20 +00:00
|
|
|
// The alloc is on a node that's now in an ineligible DC
|
2022-09-21 19:53:25 +00:00
|
|
|
if !slices.Contains(job.Datacenters, node.Datacenter) {
|
2021-07-07 15:14:20 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2015-10-15 00:26:20 +00:00
|
|
|
// Set the existing node as the base set
|
|
|
|
stack.SetNodes([]*structs.Node{node})
|
|
|
|
|
2015-10-16 23:35:55 +00:00
|
|
|
// Stage an eviction of the current allocation. This is done so that
|
2018-03-11 18:07:09 +00:00
|
|
|
// the current allocation is discounted when checking for feasibility.
|
2015-10-16 23:35:55 +00:00
|
|
|
// Otherwise we would be trying to fit the tasks current resources and
|
|
|
|
// updated resources. After select is called we can remove the evict.
|
2020-06-09 21:13:53 +00:00
|
|
|
ctx.Plan().AppendStoppedAlloc(update.Alloc, allocInPlace, "", "")
|
2015-10-15 00:26:20 +00:00
|
|
|
|
|
|
|
// Attempt to match the task group
|
2021-03-18 19:35:11 +00:00
|
|
|
option := stack.Select(update.TaskGroup,
|
|
|
|
&SelectOptions{AllocName: update.Alloc.Name})
|
2015-10-15 00:26:20 +00:00
|
|
|
|
|
|
|
// Pop the allocation
|
|
|
|
ctx.Plan().PopUpdate(update.Alloc)
|
|
|
|
|
|
|
|
// Skip if we could not do an in-place update
|
|
|
|
if option == nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2020-04-21 12:57:15 +00:00
|
|
|
// Restore the network and device offers from the existing allocation.
|
2015-10-15 00:26:20 +00:00
|
|
|
// We do not allow network resources (reserved/dynamic ports)
|
|
|
|
// to be updated. This is guarded in taskUpdated, so we can
|
|
|
|
// safely restore those here.
|
|
|
|
for task, resources := range option.TaskResources {
|
2018-10-03 16:47:18 +00:00
|
|
|
var networks structs.Networks
|
2020-04-21 12:57:15 +00:00
|
|
|
var devices []*structs.AllocatedDeviceResource
|
2018-10-03 16:47:18 +00:00
|
|
|
if update.Alloc.AllocatedResources != nil {
|
|
|
|
if tr, ok := update.Alloc.AllocatedResources.Tasks[task]; ok {
|
|
|
|
networks = tr.Networks
|
2020-04-21 12:57:15 +00:00
|
|
|
devices = tr.Devices
|
2018-10-03 16:47:18 +00:00
|
|
|
}
|
|
|
|
} else if tr, ok := update.Alloc.TaskResources[task]; ok {
|
|
|
|
networks = tr.Networks
|
|
|
|
}
|
|
|
|
|
2020-04-21 12:57:15 +00:00
|
|
|
// Add the networks and devices back
|
2018-10-03 16:47:18 +00:00
|
|
|
resources.Networks = networks
|
2020-04-21 12:57:15 +00:00
|
|
|
resources.Devices = devices
|
2015-10-15 00:26:20 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Create a shallow copy
|
|
|
|
newAlloc := new(structs.Allocation)
|
|
|
|
*newAlloc = *update.Alloc
|
|
|
|
|
|
|
|
// Update the allocation
|
|
|
|
newAlloc.EvalID = eval.ID
|
2016-03-01 22:09:25 +00:00
|
|
|
newAlloc.Job = nil // Use the Job in the Plan
|
|
|
|
newAlloc.Resources = nil // Computed in Plan Apply
|
2018-10-02 20:36:04 +00:00
|
|
|
newAlloc.AllocatedResources = &structs.AllocatedResources{
|
2019-12-16 20:34:58 +00:00
|
|
|
Tasks: option.TaskResources,
|
|
|
|
TaskLifecycles: option.TaskLifecycles,
|
2018-10-02 20:36:04 +00:00
|
|
|
Shared: structs.AllocatedSharedResources{
|
2021-01-08 14:00:41 +00:00
|
|
|
DiskMB: int64(update.TaskGroup.EphemeralDisk.SizeMB),
|
|
|
|
Ports: update.Alloc.AllocatedResources.Shared.Ports,
|
|
|
|
Networks: update.Alloc.AllocatedResources.Shared.Networks.Copy(),
|
2018-10-02 20:36:04 +00:00
|
|
|
},
|
|
|
|
}
|
2015-10-15 00:26:20 +00:00
|
|
|
newAlloc.Metrics = ctx.Metrics()
|
2020-08-25 21:09:21 +00:00
|
|
|
ctx.Plan().AppendAlloc(newAlloc, nil)
|
2015-10-15 00:26:20 +00:00
|
|
|
|
|
|
|
// Remove this allocation from the slice
|
2017-03-12 01:19:22 +00:00
|
|
|
doInplace(&i, &n, &inplaceCount)
|
2015-10-15 00:26:20 +00:00
|
|
|
}
|
2017-03-12 01:19:22 +00:00
|
|
|
|
2015-10-15 00:26:20 +00:00
|
|
|
if len(updates) > 0 {
|
2018-09-15 23:23:13 +00:00
|
|
|
ctx.Logger().Debug("made in-place updates", "in-place", inplaceCount, "total_updates", len(updates))
|
2015-10-15 00:26:20 +00:00
|
|
|
}
|
2016-05-17 22:37:37 +00:00
|
|
|
return updates[:n], updates[n:]
|
2015-10-15 00:26:20 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// evictAndPlace is used to mark allocations for evicts and add them to the
|
2016-10-11 19:31:40 +00:00
|
|
|
// placement queue. evictAndPlace modifies both the diffResult and the
|
2015-10-15 00:26:20 +00:00
|
|
|
// limit. It returns true if the limit has been reached.
|
2015-10-16 18:43:09 +00:00
|
|
|
func evictAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool {
|
2015-10-15 00:26:20 +00:00
|
|
|
n := len(allocs)
|
|
|
|
for i := 0; i < n && i < *limit; i++ {
|
|
|
|
a := allocs[i]
|
2020-06-09 21:13:53 +00:00
|
|
|
ctx.Plan().AppendStoppedAlloc(a.Alloc, desc, "", "")
|
2016-08-03 22:45:42 +00:00
|
|
|
diff.place = append(diff.place, a)
|
|
|
|
}
|
|
|
|
if n <= *limit {
|
|
|
|
*limit -= n
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
*limit = 0
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2015-10-16 21:00:51 +00:00
|
|
|
// tgConstrainTuple is used to store the total constraints of a task group.
|
|
|
|
type tgConstrainTuple struct {
|
|
|
|
// Holds the combined constraints of the task group and all it's sub-tasks.
|
|
|
|
constraints []*structs.Constraint
|
|
|
|
|
|
|
|
// The set of required drivers within the task group.
|
|
|
|
drivers map[string]struct{}
|
|
|
|
}
|
|
|
|
|
|
|
|
// taskGroupConstraints collects the constraints, drivers and resources required by each
|
|
|
|
// sub-task to aggregate the TaskGroup totals
|
|
|
|
func taskGroupConstraints(tg *structs.TaskGroup) tgConstrainTuple {
|
|
|
|
c := tgConstrainTuple{
|
|
|
|
constraints: make([]*structs.Constraint, 0, len(tg.Constraints)),
|
|
|
|
drivers: make(map[string]struct{}),
|
|
|
|
}
|
|
|
|
|
|
|
|
c.constraints = append(c.constraints, tg.Constraints...)
|
|
|
|
for _, task := range tg.Tasks {
|
|
|
|
c.drivers[task.Driver] = struct{}{}
|
|
|
|
c.constraints = append(c.constraints, task.Constraints...)
|
|
|
|
}
|
|
|
|
|
|
|
|
return c
|
|
|
|
}
|
2016-05-05 18:21:58 +00:00
|
|
|
|
|
|
|
// desiredUpdates takes the diffResult as well as the set of inplace and
|
|
|
|
// destructive updates and returns a map of task groups to their set of desired
|
|
|
|
// updates.
|
|
|
|
func desiredUpdates(diff *diffResult, inplaceUpdates,
|
|
|
|
destructiveUpdates []allocTuple) map[string]*structs.DesiredUpdates {
|
|
|
|
desiredTgs := make(map[string]*structs.DesiredUpdates)
|
|
|
|
|
|
|
|
for _, tuple := range diff.place {
|
|
|
|
name := tuple.TaskGroup.Name
|
|
|
|
des, ok := desiredTgs[name]
|
|
|
|
if !ok {
|
|
|
|
des = &structs.DesiredUpdates{}
|
|
|
|
desiredTgs[name] = des
|
|
|
|
}
|
|
|
|
|
|
|
|
des.Place++
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, tuple := range diff.stop {
|
2016-05-13 18:53:11 +00:00
|
|
|
name := tuple.Alloc.TaskGroup
|
2016-05-05 18:21:58 +00:00
|
|
|
des, ok := desiredTgs[name]
|
|
|
|
if !ok {
|
|
|
|
des = &structs.DesiredUpdates{}
|
|
|
|
desiredTgs[name] = des
|
|
|
|
}
|
|
|
|
|
|
|
|
des.Stop++
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, tuple := range diff.ignore {
|
|
|
|
name := tuple.TaskGroup.Name
|
|
|
|
des, ok := desiredTgs[name]
|
|
|
|
if !ok {
|
|
|
|
des = &structs.DesiredUpdates{}
|
|
|
|
desiredTgs[name] = des
|
|
|
|
}
|
|
|
|
|
|
|
|
des.Ignore++
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, tuple := range diff.migrate {
|
|
|
|
name := tuple.TaskGroup.Name
|
|
|
|
des, ok := desiredTgs[name]
|
|
|
|
if !ok {
|
|
|
|
des = &structs.DesiredUpdates{}
|
|
|
|
desiredTgs[name] = des
|
|
|
|
}
|
|
|
|
|
|
|
|
des.Migrate++
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, tuple := range inplaceUpdates {
|
|
|
|
name := tuple.TaskGroup.Name
|
|
|
|
des, ok := desiredTgs[name]
|
|
|
|
if !ok {
|
|
|
|
des = &structs.DesiredUpdates{}
|
|
|
|
desiredTgs[name] = des
|
|
|
|
}
|
|
|
|
|
|
|
|
des.InPlaceUpdate++
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, tuple := range destructiveUpdates {
|
|
|
|
name := tuple.TaskGroup.Name
|
|
|
|
des, ok := desiredTgs[name]
|
|
|
|
if !ok {
|
|
|
|
des = &structs.DesiredUpdates{}
|
|
|
|
desiredTgs[name] = des
|
|
|
|
}
|
|
|
|
|
|
|
|
des.DestructiveUpdate++
|
|
|
|
}
|
|
|
|
|
|
|
|
return desiredTgs
|
|
|
|
}
|
2016-07-22 21:53:49 +00:00
|
|
|
|
|
|
|
// adjustQueuedAllocations decrements the number of allocations pending per task
|
|
|
|
// group based on the number of allocations successfully placed
|
2018-09-15 23:23:13 +00:00
|
|
|
func adjustQueuedAllocations(logger log.Logger, result *structs.PlanResult, queuedAllocs map[string]int) {
|
2017-07-18 00:18:12 +00:00
|
|
|
if result == nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, allocations := range result.NodeAllocation {
|
|
|
|
for _, allocation := range allocations {
|
|
|
|
// Ensure that the allocation is newly created. We check that
|
|
|
|
// the CreateIndex is equal to the ModifyIndex in order to check
|
|
|
|
// that the allocation was just created. We do not check that
|
|
|
|
// the CreateIndex is equal to the results AllocIndex because
|
|
|
|
// the allocations we get back have gone through the planner's
|
|
|
|
// optimistic snapshot and thus their indexes may not be
|
|
|
|
// correct, but they will be consistent.
|
|
|
|
if allocation.CreateIndex != allocation.ModifyIndex {
|
|
|
|
continue
|
|
|
|
}
|
2016-07-22 21:53:49 +00:00
|
|
|
|
2017-07-18 00:18:12 +00:00
|
|
|
if _, ok := queuedAllocs[allocation.TaskGroup]; ok {
|
2017-09-26 22:26:33 +00:00
|
|
|
queuedAllocs[allocation.TaskGroup]--
|
2017-07-18 00:18:12 +00:00
|
|
|
} else {
|
2018-09-15 23:23:13 +00:00
|
|
|
logger.Error("allocation placed but task group is not in list of unplaced allocations", "task_group", allocation.TaskGroup)
|
2016-07-22 21:53:49 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2016-08-09 20:11:58 +00:00
|
|
|
|
2020-05-13 20:39:04 +00:00
|
|
|
// updateNonTerminalAllocsToLost updates the allocations which are in pending/running state
|
|
|
|
// on tainted node to lost, but only for allocs already DesiredStatus stop or evict
|
2016-08-09 20:11:58 +00:00
|
|
|
func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*structs.Node, allocs []*structs.Allocation) {
|
|
|
|
for _, alloc := range allocs {
|
2018-03-30 21:17:41 +00:00
|
|
|
node, ok := tainted[alloc.NodeID]
|
|
|
|
if !ok {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// Only handle down nodes or nodes that are gone (node == nil)
|
|
|
|
if node != nil && node.Status != structs.NodeStatusDown {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2020-05-13 20:39:04 +00:00
|
|
|
// If the alloc is already correctly marked lost, we're done
|
2020-01-06 20:56:31 +00:00
|
|
|
if (alloc.DesiredStatus == structs.AllocDesiredStatusStop ||
|
|
|
|
alloc.DesiredStatus == structs.AllocDesiredStatusEvict) &&
|
2016-08-09 20:11:58 +00:00
|
|
|
(alloc.ClientStatus == structs.AllocClientStatusRunning ||
|
|
|
|
alloc.ClientStatus == structs.AllocClientStatusPending) {
|
2020-06-09 21:13:53 +00:00
|
|
|
plan.AppendStoppedAlloc(alloc, allocLost, structs.AllocClientStatusLost, "")
|
2016-08-09 20:11:58 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2017-05-31 23:55:40 +00:00
|
|
|
|
2017-06-21 20:26:45 +00:00
|
|
|
// genericAllocUpdateFn is a factory for the scheduler to create an allocUpdateType
|
|
|
|
// function to be passed into the reconciler. The factory takes objects that
|
|
|
|
// exist only in the scheduler context and returns a function that can be used
|
2018-01-14 22:47:21 +00:00
|
|
|
// by the reconciler to make decisions about how to update an allocation. The
|
2017-06-21 20:26:45 +00:00
|
|
|
// factory allows the reconciler to be unaware of how to determine the type of
|
|
|
|
// update necessary and can minimize the set of objects it is exposed to.
|
|
|
|
func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateType {
|
2017-05-31 23:55:40 +00:00
|
|
|
return func(existing *structs.Allocation, newJob *structs.Job, newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation) {
|
|
|
|
// Same index, so nothing to do
|
|
|
|
if existing.Job.JobModifyIndex == newJob.JobModifyIndex {
|
|
|
|
return true, false, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check if the task drivers or config has changed, requires
|
|
|
|
// a destructive upgrade since that cannot be done in-place.
|
|
|
|
if tasksUpdated(newJob, existing.Job, newTG.Name) {
|
|
|
|
return false, true, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Terminal batch allocations are not filtered when they are completed
|
|
|
|
// successfully. We should avoid adding the allocation to the plan in
|
|
|
|
// the case that it is an in-place update to avoid both additional data
|
|
|
|
// in the plan and work for the clients.
|
|
|
|
if existing.TerminalStatus() {
|
|
|
|
return true, false, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get the existing node
|
|
|
|
ws := memdb.NewWatchSet()
|
|
|
|
node, err := ctx.State().NodeByID(ws, existing.NodeID)
|
|
|
|
if err != nil {
|
2018-09-15 23:23:13 +00:00
|
|
|
ctx.Logger().Error("failed to get node", "node_id", existing.NodeID, "error", err)
|
2017-05-31 23:55:40 +00:00
|
|
|
return true, false, nil
|
|
|
|
}
|
|
|
|
if node == nil {
|
|
|
|
return false, true, nil
|
|
|
|
}
|
|
|
|
|
2021-07-07 15:14:20 +00:00
|
|
|
// The alloc is on a node that's now in an ineligible DC
|
2022-09-21 19:53:25 +00:00
|
|
|
if !slices.Contains(newJob.Datacenters, node.Datacenter) {
|
2021-07-07 15:14:20 +00:00
|
|
|
return false, true, nil
|
|
|
|
}
|
|
|
|
|
2017-05-31 23:55:40 +00:00
|
|
|
// Set the existing node as the base set
|
|
|
|
stack.SetNodes([]*structs.Node{node})
|
|
|
|
|
|
|
|
// Stage an eviction of the current allocation. This is done so that
|
2018-03-11 18:07:09 +00:00
|
|
|
// the current allocation is discounted when checking for feasibility.
|
2017-05-31 23:55:40 +00:00
|
|
|
// Otherwise we would be trying to fit the tasks current resources and
|
|
|
|
// updated resources. After select is called we can remove the evict.
|
2020-06-09 21:13:53 +00:00
|
|
|
ctx.Plan().AppendStoppedAlloc(existing, allocInPlace, "", "")
|
2017-05-31 23:55:40 +00:00
|
|
|
|
|
|
|
// Attempt to match the task group
|
2021-03-18 19:35:11 +00:00
|
|
|
option := stack.Select(newTG, &SelectOptions{AllocName: existing.Name})
|
2017-05-31 23:55:40 +00:00
|
|
|
|
|
|
|
// Pop the allocation
|
|
|
|
ctx.Plan().PopUpdate(existing)
|
|
|
|
|
|
|
|
// Require destructive if we could not do an in-place update
|
|
|
|
if option == nil {
|
|
|
|
return false, true, nil
|
|
|
|
}
|
|
|
|
|
2020-04-21 12:57:15 +00:00
|
|
|
// Restore the network and device offers from the existing allocation.
|
2017-05-31 23:55:40 +00:00
|
|
|
// We do not allow network resources (reserved/dynamic ports)
|
|
|
|
// to be updated. This is guarded in taskUpdated, so we can
|
|
|
|
// safely restore those here.
|
|
|
|
for task, resources := range option.TaskResources {
|
2018-10-03 16:47:18 +00:00
|
|
|
var networks structs.Networks
|
2020-04-21 12:57:15 +00:00
|
|
|
var devices []*structs.AllocatedDeviceResource
|
2018-10-03 16:47:18 +00:00
|
|
|
if existing.AllocatedResources != nil {
|
|
|
|
if tr, ok := existing.AllocatedResources.Tasks[task]; ok {
|
|
|
|
networks = tr.Networks
|
2020-04-21 12:57:15 +00:00
|
|
|
devices = tr.Devices
|
2018-10-03 16:47:18 +00:00
|
|
|
}
|
|
|
|
} else if tr, ok := existing.TaskResources[task]; ok {
|
|
|
|
networks = tr.Networks
|
|
|
|
}
|
|
|
|
|
2019-09-05 23:37:24 +00:00
|
|
|
// Add the networks back
|
2018-10-03 16:47:18 +00:00
|
|
|
resources.Networks = networks
|
2020-04-21 12:57:15 +00:00
|
|
|
resources.Devices = devices
|
2017-05-31 23:55:40 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Create a shallow copy
|
|
|
|
newAlloc := new(structs.Allocation)
|
|
|
|
*newAlloc = *existing
|
|
|
|
|
|
|
|
// Update the allocation
|
|
|
|
newAlloc.EvalID = evalID
|
|
|
|
newAlloc.Job = nil // Use the Job in the Plan
|
|
|
|
newAlloc.Resources = nil // Computed in Plan Apply
|
2018-10-02 20:36:04 +00:00
|
|
|
newAlloc.AllocatedResources = &structs.AllocatedResources{
|
2019-12-16 20:34:58 +00:00
|
|
|
Tasks: option.TaskResources,
|
|
|
|
TaskLifecycles: option.TaskLifecycles,
|
2019-06-18 17:12:23 +00:00
|
|
|
Shared: structs.AllocatedSharedResources{
|
2019-09-05 23:37:24 +00:00
|
|
|
DiskMB: int64(newTG.EphemeralDisk.SizeMB),
|
2019-06-18 17:12:23 +00:00
|
|
|
},
|
2018-10-02 20:36:04 +00:00
|
|
|
}
|
2019-06-18 04:55:43 +00:00
|
|
|
|
2021-01-15 17:45:12 +00:00
|
|
|
// Since this is an inplace update, we should copy network and port
|
2019-10-23 22:23:16 +00:00
|
|
|
// information from the original alloc. This is similar to how
|
|
|
|
// we copy network info for task level networks above.
|
|
|
|
//
|
|
|
|
// existing.AllocatedResources is nil on Allocations created by
|
|
|
|
// Nomad v0.8 or earlier.
|
|
|
|
if existing.AllocatedResources != nil {
|
|
|
|
newAlloc.AllocatedResources.Shared.Networks = existing.AllocatedResources.Shared.Networks
|
2021-01-15 17:45:12 +00:00
|
|
|
newAlloc.AllocatedResources.Shared.Ports = existing.AllocatedResources.Shared.Ports
|
2019-10-23 22:23:16 +00:00
|
|
|
}
|
|
|
|
|
2019-03-13 04:36:46 +00:00
|
|
|
// Use metrics from existing alloc for in place upgrade
|
|
|
|
// This is because if the inplace upgrade succeeded, any scoring metadata from
|
|
|
|
// when it first went through the scheduler should still be preserved. Using scoring
|
|
|
|
// metadata from the context would incorrectly replace it with metadata only from a single node that the
|
|
|
|
// allocation is already on.
|
|
|
|
newAlloc.Metrics = existing.Metrics.Copy()
|
2017-05-31 23:55:40 +00:00
|
|
|
return false, false, newAlloc
|
|
|
|
}
|
|
|
|
}
|