// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package scheduler

// The structs and helpers in this file are split out of reconciler.go for code
// manageability and should not be shared with the system schedulers! If you need
// something here for system/sysbatch jobs, double-check it's safe to use for
// all scheduler types before moving it into util.go

import (
	"fmt"
	"sort"
	"strings"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

// placementResult is an allocation that must be placed. It potentially has a
// previous allocation attached to it that should be stopped only if the
// paired placement is complete. This gives an atomic place/stop behavior that
// prevents an impossible resource ask made as part of a rolling update from
// wiping the job out.
type placementResult interface {
	// TaskGroup returns the task group the placement is for
	TaskGroup() *structs.TaskGroup

	// Name returns the name of the desired allocation
	Name() string

	// Canary returns whether the placement should be a canary
	Canary() bool

	// PreviousAllocation returns the previous allocation
	PreviousAllocation() *structs.Allocation

	// IsRescheduling returns whether the placement is rescheduling a failed allocation
	IsRescheduling() bool

	// StopPreviousAlloc returns whether the previous allocation should be
	// stopped and, if so, the status description.
	StopPreviousAlloc() (bool, string)

	// PreviousLost is true if the previous allocation was lost.
	PreviousLost() bool

	// DowngradeNonCanary indicates that the placement should use the latest
	// stable job with MinJobVersion, rather than the current deployment version
	DowngradeNonCanary() bool

	MinJobVersion() uint64
}
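
// Illustrative sketch (not from the original file): a consumer of
// placementResult should only record the stop of the previous allocation
// alongside the new placement, which is what provides the atomic place/stop
// behavior described above. The helper below is hypothetical and only shows
// how StopPreviousAlloc and PreviousAllocation are intended to be read:
//
//	func collectStop(p placementResult, stops []allocStopResult) []allocStopResult {
//		// Assume the placement for p.TaskGroup()/p.Name() has been made.
//		if stop, desc := p.StopPreviousAlloc(); stop {
//			stops = append(stops, allocStopResult{
//				alloc:             p.PreviousAllocation(),
//				statusDescription: desc,
//			})
//		}
//		return stops
//	}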

// allocStopResult contains the information required to stop a single allocation
type allocStopResult struct {
	alloc             *structs.Allocation
	clientStatus      string
	statusDescription string
	followupEvalID    string
}

// allocPlaceResult contains the information required to place a single
// allocation
type allocPlaceResult struct {
	name          string
	canary        bool
	taskGroup     *structs.TaskGroup
	previousAlloc *structs.Allocation
	reschedule    bool
	lost          bool

	downgradeNonCanary bool
	minJobVersion      uint64
}

func (a allocPlaceResult) TaskGroup() *structs.TaskGroup           { return a.taskGroup }
func (a allocPlaceResult) Name() string                            { return a.name }
func (a allocPlaceResult) Canary() bool                            { return a.canary }
func (a allocPlaceResult) PreviousAllocation() *structs.Allocation { return a.previousAlloc }
func (a allocPlaceResult) IsRescheduling() bool                    { return a.reschedule }
func (a allocPlaceResult) StopPreviousAlloc() (bool, string)       { return false, "" }
func (a allocPlaceResult) DowngradeNonCanary() bool                { return a.downgradeNonCanary }
func (a allocPlaceResult) MinJobVersion() uint64                   { return a.minJobVersion }
func (a allocPlaceResult) PreviousLost() bool                      { return a.lost }

// allocDestructiveResult contains the information required to do a destructive
// update. Destructive changes should be applied atomically; that is, the old
// alloc is only stopped if the new one can be placed.
type allocDestructiveResult struct {
	placeName             string
	placeTaskGroup        *structs.TaskGroup
	stopAlloc             *structs.Allocation
	stopStatusDescription string
}

func (a allocDestructiveResult) TaskGroup() *structs.TaskGroup           { return a.placeTaskGroup }
func (a allocDestructiveResult) Name() string                            { return a.placeName }
func (a allocDestructiveResult) Canary() bool                            { return false }
func (a allocDestructiveResult) PreviousAllocation() *structs.Allocation { return a.stopAlloc }
func (a allocDestructiveResult) IsRescheduling() bool                    { return false }
func (a allocDestructiveResult) StopPreviousAlloc() (bool, string) {
	return true, a.stopStatusDescription
}
func (a allocDestructiveResult) DowngradeNonCanary() bool { return false }
func (a allocDestructiveResult) MinJobVersion() uint64    { return 0 }
func (a allocDestructiveResult) PreviousLost() bool       { return false }

// allocMatrix is a mapping of task groups to their allocation set.
type allocMatrix map[string]allocSet

// newAllocMatrix takes a job and the existing allocations for the job and
// creates an allocMatrix
func newAllocMatrix(job *structs.Job, allocs []*structs.Allocation) allocMatrix {
	m := allocMatrix(make(map[string]allocSet))
	for _, a := range allocs {
		s, ok := m[a.TaskGroup]
		if !ok {
			s = make(map[string]*structs.Allocation)
			m[a.TaskGroup] = s
		}
		s[a.ID] = a
	}

	if job != nil {
		for _, tg := range job.TaskGroups {
			if _, ok := m[tg.Name]; !ok {
				m[tg.Name] = make(map[string]*structs.Allocation)
			}
		}
	}
	return m
}
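
// Illustrative sketch (not from the original file): the reconciler builds the
// matrix once per job and then works on each task group's allocSet
// independently. The variable names below are hypothetical.
//
//	m := newAllocMatrix(job, existingAllocs)
//	for tgName, set := range m {
//		// Every task group gets an entry, even one with no current
//		// allocations, so reconciliation can still create placements for it.
//		fmt.Printf("%s: %d existing allocs\n", tgName, len(set))
//	}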

// allocSet is a set of allocations with a series of helper functions defined
// that help reconcile state.
type allocSet map[string]*structs.Allocation

// GoString provides a human readable view of the set
func (a allocSet) GoString() string {
	if len(a) == 0 {
		return "[]"
	}

	start := fmt.Sprintf("len(%d) [\n", len(a))
	var s []string
	for k, v := range a {
		s = append(s, fmt.Sprintf("%q: %v", k, v.Name))
	}
	return start + strings.Join(s, "\n") + "]"
}

// nameSet returns the set of allocation names
func (a allocSet) nameSet() map[string]struct{} {
	names := make(map[string]struct{}, len(a))
	for _, alloc := range a {
		names[alloc.Name] = struct{}{}
	}
	return names
}

// nameOrder returns the allocations in the set, sorted by name index
func (a allocSet) nameOrder() []*structs.Allocation {
	allocs := make([]*structs.Allocation, 0, len(a))
	for _, alloc := range a {
		allocs = append(allocs, alloc)
	}
	sort.Slice(allocs, func(i, j int) bool {
		return allocs[i].Index() < allocs[j].Index()
	})
	return allocs
}

// difference returns a new allocSet that has all the existing items except
// those contained within the other allocation sets
func (a allocSet) difference(others ...allocSet) allocSet {
	diff := make(map[string]*structs.Allocation)

OUTER:
	for k, v := range a {
		for _, other := range others {
			if _, ok := other[k]; ok {
				continue OUTER
			}
		}
		diff[k] = v
	}
	return diff
}

// union returns a new allocSet that is the union of the given allocSets.
// Conflicts prefer the last passed allocSet containing the value
func (a allocSet) union(others ...allocSet) allocSet {
	union := make(map[string]*structs.Allocation, len(a))
	order := []allocSet{a}
	order = append(order, others...)

	for _, set := range order {
		for k, v := range set {
			union[k] = v
		}
	}

	return union
}
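
// Illustrative sketch (not from the original file): difference and union give
// the reconciler simple set algebra over allocations keyed by ID. The sets
// below are hypothetical.
//
//	all := existing.union(newlyPlaced)          // merge; later sets win on conflicts
//	remaining := all.difference(stopped, lost)  // drop anything already stopped or lost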

// fromKeys returns an alloc set matching the passed keys
func (a allocSet) fromKeys(keys ...[]string) allocSet {
	from := make(map[string]*structs.Allocation)
	for _, set := range keys {
		for _, k := range set {
			if alloc, ok := a[k]; ok {
				from[k] = alloc
			}
		}
	}
	return from
}
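
// Illustrative sketch (not from the original file): fromKeys silently skips
// IDs that are not present, so it can select a subset of the set by
// allocation ID without pre-filtering. The ID slices below are hypothetical.
//
//	canaries := set.fromKeys(placedCanaryIDs)
//	selected := set.fromKeys(placedCanaryIDs, otherIDs) // multiple key slices are merged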

// filterByTainted takes a set of tainted nodes and filters the allocation set
// into the following groups:
// 1. Those that exist on untainted nodes
// 2. Those that exist on nodes that are draining
// 3. Those that exist on lost nodes or have expired
// 4. Those that are on nodes that are disconnected, but have not had their ClientStatus set to unknown
// 5. Those that are on a node that has reconnected.
// 6. Those that are in a state that results in a noop.
func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverSupportsDisconnectedClients bool, now time.Time) (untainted, migrate, lost, disconnecting, reconnecting, ignore allocSet) {
	untainted = make(map[string]*structs.Allocation)
	migrate = make(map[string]*structs.Allocation)
	lost = make(map[string]*structs.Allocation)
	disconnecting = make(map[string]*structs.Allocation)
	reconnecting = make(map[string]*structs.Allocation)
	ignore = make(map[string]*structs.Allocation)

	for _, alloc := range a {
		// make sure we don't apply any reconnect logic to task groups
		// without max_client_disconnect
		supportsDisconnectedClients := alloc.SupportsDisconnectedClients(serverSupportsDisconnectedClients)

		reconnect := false
		expired := false

		// Only compute reconnect for unknown, running, and failed since they
		// need to go through the reconnect logic.
		if supportsDisconnectedClients &&
			(alloc.ClientStatus == structs.AllocClientStatusUnknown ||
				alloc.ClientStatus == structs.AllocClientStatusRunning ||
				alloc.ClientStatus == structs.AllocClientStatusFailed) {
			reconnect = alloc.NeedsToReconnect()
			if reconnect {
				expired = alloc.Expired(now)
			}
		}

		// Failed allocs that need to be reconnected must be added to
		// reconnecting so that they can be handled as a failed reconnect.
		if supportsDisconnectedClients &&
			reconnect &&
			alloc.DesiredStatus == structs.AllocDesiredStatusRun &&
			alloc.ClientStatus == structs.AllocClientStatusFailed {
			reconnecting[alloc.ID] = alloc
			continue
		}
2022-04-21 14:05:58 +00:00
|
|
|
taintedNode, nodeIsTainted := taintedNodes[alloc.NodeID]
|
|
|
|
if taintedNode != nil {
|
|
|
|
// Group disconnecting/reconnecting
|
|
|
|
switch taintedNode.Status {
|
|
|
|
case structs.NodeStatusDisconnected:
|
|
|
|
if supportsDisconnectedClients {
|
|
|
|
// Filter running allocs on a node that is disconnected to be marked as unknown.
|
|
|
|
if alloc.ClientStatus == structs.AllocClientStatusRunning {
|
|
|
|
disconnecting[alloc.ID] = alloc
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
// Filter pending allocs on a node that is disconnected to be marked as lost.
|
|
|
|
if alloc.ClientStatus == structs.AllocClientStatusPending {
|
|
|
|
lost[alloc.ID] = alloc
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
lost[alloc.ID] = alloc
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
case structs.NodeStatusReady:
|
|
|
|
// Filter reconnecting allocs on a node that is now connected.
|
Update alloc after reconnect and enforece client heartbeat order (#15068)
* scheduler: allow updates after alloc reconnects
When an allocation reconnects to a cluster the scheduler needs to run
special logic to handle the reconnection, check if a replacement was
create and stop one of them.
If the allocation kept running while the node was disconnected, it will
be reconnected with `ClientStatus: running` and the node will have
`Status: ready`. This combination is the same as the normal steady state
of allocation, where everything is running as expected.
In order to differentiate between the two states (an allocation that is
reconnecting and one that is just running) the scheduler needs an extra
piece of state.
The current implementation uses the presence of a
`TaskClientReconnected` task event to detect when the allocation has
reconnected and thus must go through the reconnection process. But this
event remains even after the allocation is reconnected, causing all
future evals to consider the allocation as still reconnecting.
This commit changes the reconnect logic to use an `AllocState` to
register when the allocation was reconnected. This provides the
following benefits:
- Only a limited number of task states are kept, and they are used for
many other events. It's possible that, upon reconnecting, several
actions are triggered that could cause the `TaskClientReconnected`
event to be dropped.
- Task events are set by clients and so their timestamps are subject
to time skew from servers. This prevents using time to determine if
an allocation reconnected after a disconnect event.
- Disconnect events are already stored as `AllocState` and so storing
reconnects there as well makes it the only source of information
required.
With the new logic, the reconnection logic is only triggered if the
last `AllocState` is a disconnect event, meaning that the allocation has
not been reconnected yet. After the reconnection is handled, the new
`ClientStatus` is store in `AllocState` allowing future evals to skip
the reconnection logic.
* scheduler: prevent spurious placement on reconnect
When a client reconnects it makes two independent RPC calls:
- `Node.UpdateStatus` to heartbeat and set its status as `ready`.
- `Node.UpdateAlloc` to update the status of its allocations.
These two calls can happen in any order, and in case the allocations are
updated before a heartbeat it causes the state to be the same as a node
being disconnected: the node status will still be `disconnected` while
the allocation `ClientStatus` is set to `running`.
The current implementation did not handle this order of events properly,
and the scheduler would create an unnecessary placement since it
considered the allocation was being disconnected. This extra allocation
would then be quickly stopped by the heartbeat eval.
This commit adds a new code path to handle this order of events. If the
node is `disconnected` and the allocation `ClientStatus` is `running`
the scheduler will check if the allocation is actually reconnecting
using its `AllocState` events.
* rpc: only allow alloc updates from `ready` nodes
Clients interact with servers using three main RPC methods:
- `Node.GetAllocs` reads allocation data from the server and writes it
to the client.
- `Node.UpdateAlloc` reads allocation from from the client and writes
them to the server.
- `Node.UpdateStatus` writes the client status to the server and is
used as the heartbeat mechanism.
These three methods are called periodically by the clients and are done
so independently from each other, meaning that there can't be any
assumptions in their ordering.
This can generate scenarios that are hard to reason about and to code
for. For example, when a client misses too many heartbeats it will be
considered `down` or `disconnected` and the allocations it was running
are set to `lost` or `unknown`.
When connectivity is restored the to rest of the cluster, the natural
mental model is to think that the client will heartbeat first and then
update its allocations status into the servers.
But since there's no inherit order in these calls the reverse is just as
possible: the client updates the alloc status and then heartbeats. This
results in a state where allocs are, for example, `running` while the
client is still `disconnected`.
This commit adds a new verification to the `Node.UpdateAlloc` method to
reject updates from nodes that are not `ready`, forcing clients to
heartbeat first. Since this check is done server-side there is no need
to coordinate operations client-side: they can continue sending these
requests independently and alloc update will succeed after the heartbeat
is done.
* chagelog: add entry for #15068
* code review
* client: skip terminal allocations on reconnect
When the client reconnects with the server it synchronizes the state of
its allocations by sending data using the `Node.UpdateAlloc` RPC and
fetching data using the `Node.GetClientAllocs` RPC.
If the data fetch happens before the data write, `unknown` allocations
will still be in this state and would trigger the
`allocRunner.Reconnect` flow.
But when the server `DesiredStatus` for the allocation is `stop` the
client should not reconnect the allocation.
* apply more code review changes
* scheduler: persist changes to reconnected allocs
Reconnected allocs have a new AllocState entry that must be persisted by
the plan applier.
* rpc: read node ID from allocs in UpdateAlloc
The AllocUpdateRequest struct is used in three disjoint use cases:
1. Stripped allocs from clients Node.UpdateAlloc RPC using the Allocs,
and WriteRequest fields
2. Raft log message using the Allocs, Evals, and WriteRequest fields
3. Plan updates using the AllocsStopped, AllocsUpdated, and Job fields
Adding a new field that would only be used in one these cases (1) made
things more confusing and error prone. While in theory an
AllocUpdateRequest could send allocations from different nodes, in
practice this never actually happens since only clients call this method
with their own allocations.
* scheduler: remove logic to handle exceptional case
This condition could only be hit if, somehow, the allocation status was
set to "running" while the client was "unknown". This was addressed by
enforcing an order in "Node.UpdateStatus" and "Node.UpdateAlloc" RPC
calls, so this scenario is not expected to happen.
Adding unnecessary code to the scheduler makes it harder to read and
reason about it.
* more code review
* remove another unused test
2022-11-04 20:25:11 +00:00
|
|
|
if reconnect {
|
2022-04-21 14:05:58 +00:00
|
|
|
if expired {
|
|
|
|
lost[alloc.ID] = alloc
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
reconnecting[alloc.ID] = alloc
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Update alloc after reconnect and enforece client heartbeat order (#15068)
* scheduler: allow updates after alloc reconnects
When an allocation reconnects to a cluster the scheduler needs to run
special logic to handle the reconnection, check if a replacement was
create and stop one of them.
If the allocation kept running while the node was disconnected, it will
be reconnected with `ClientStatus: running` and the node will have
`Status: ready`. This combination is the same as the normal steady state
of allocation, where everything is running as expected.
In order to differentiate between the two states (an allocation that is
reconnecting and one that is just running) the scheduler needs an extra
		// Terminal allocs, if not reconnect, are always untainted as they
		// should never be migrated.
		if alloc.TerminalStatus() && !reconnect {
			untainted[alloc.ID] = alloc
			continue
		}

		// Non-terminal allocs that should migrate should always migrate
		if alloc.DesiredTransition.ShouldMigrate() {
			migrate[alloc.ID] = alloc
			continue
		}

		// Expired unknown allocs are lost
		if supportsDisconnectedClients && alloc.Expired(now) {
			lost[alloc.ID] = alloc
			continue
		}

		// Ignore unknown allocs that we want to reconnect eventually.
		if supportsDisconnectedClients &&
			alloc.ClientStatus == structs.AllocClientStatusUnknown &&
			alloc.DesiredStatus == structs.AllocDesiredStatusRun {
			ignore[alloc.ID] = alloc
			continue
		}

		// Ignore failed allocs that need to be reconnected and that have been
		// marked to stop by the server.
		if supportsDisconnectedClients &&
			reconnect &&
			alloc.ClientStatus == structs.AllocClientStatusFailed &&
			alloc.DesiredStatus == structs.AllocDesiredStatusStop {
			ignore[alloc.ID] = alloc
			continue
		}

		if !nodeIsTainted {
			// Filter allocs on a node that is now re-connected to be resumed.
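			// Note: a reconnecting alloc whose disconnect window has already
			// expired is treated as lost below even though the node is back;
			// otherwise it is handed to the reconnecting set so the reconciler
			// can resolve it against any replacement that was created while
			// the node was away.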
			if reconnect {
				if expired {
					lost[alloc.ID] = alloc
					continue
				}
				reconnecting[alloc.ID] = alloc
				continue
			}

			// Otherwise, Node is untainted so alloc is untainted
			untainted[alloc.ID] = alloc
			continue
		}

		// Allocs on GC'd (nil) or lost nodes are Lost
		if taintedNode == nil || taintedNode.TerminalStatus() {
			lost[alloc.ID] = alloc
			continue
		}

		// All other allocs are untainted
		untainted[alloc.ID] = alloc
	}

	return
}

// filterByRescheduleable filters the allocation set to return the set of allocations that are either
// untainted or a set of allocations that must be rescheduled now. Allocations that can be rescheduled
// at a future time are also returned so that we can create follow-up evaluations for them. Allocs are
// skipped or considered untainted according to the logic defined in the shouldFilter method.
func (a allocSet) filterByRescheduleable(isBatch, isDisconnecting bool, now time.Time, evalID string, deployment *structs.Deployment) (untainted, rescheduleNow allocSet, rescheduleLater []*delayedRescheduleInfo) {
	untainted = make(map[string]*structs.Allocation)
	rescheduleNow = make(map[string]*structs.Allocation)

	// When filtering disconnected sets, the untainted set is never populated.
	// It has no purpose in that context.
	for _, alloc := range a {
		// Ignore disconnecting allocs that are already unknown. This can happen
		// in the case of canaries that are interrupted by a disconnect.
		if isDisconnecting && alloc.ClientStatus == structs.AllocClientStatusUnknown {
			continue
		}

		var eligibleNow, eligibleLater bool
		var rescheduleTime time.Time

		// Ignore failing allocs that have already been rescheduled.
		// Only failed or disconnecting allocs should be rescheduled.
		// Protects against a bug allowing rescheduling running allocs.
		if alloc.NextAllocation != "" && alloc.TerminalStatus() {
			continue
		}

		isUntainted, ignore := shouldFilter(alloc, isBatch)
		if isUntainted && !isDisconnecting {
			untainted[alloc.ID] = alloc
		}
		if isUntainted || ignore {
			continue
		}

		// Only failed allocs with desired state run get to this point
		// If the failed alloc is not eligible for rescheduling now we
		// add it to the untainted set. Disconnecting delay evals are
		// handled by allocReconciler.createTimeoutLaterEvals
		eligibleNow, eligibleLater, rescheduleTime = updateByReschedulable(alloc, now, evalID, deployment, isDisconnecting)
		if !isDisconnecting && !eligibleNow {
			untainted[alloc.ID] = alloc
			if eligibleLater {
				rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime})
			}
		} else {
			rescheduleNow[alloc.ID] = alloc
		}
	}
	return
}

// shouldFilter returns whether the alloc should be ignored or considered untainted.
//
// Ignored allocs are filtered out.
// Untainted allocs count against the desired total.
//
// Filtering logic for batch jobs:
// If complete, and ran successfully - untainted
// If desired state is stop - ignore
//
// Filtering logic for service jobs:
// If desired state is stop/evict - ignore
// If client status is complete/lost - ignore
func shouldFilter(alloc *structs.Allocation, isBatch bool) (untainted, ignore bool) {
	// Allocs from batch jobs should be filtered when the desired status
	// is terminal and the client did not finish or when the client
	// status is failed so that they will be replaced. If they are
	// complete but not failed, they shouldn't be replaced.
	if isBatch {
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop:
			if alloc.RanSuccessfully() {
				return true, false
			}
			return false, true
		case structs.AllocDesiredStatusEvict:
			return false, true
		default:
		}

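		// A failed batch alloc falls through as neither untainted nor ignored
		// so the rescheduling logic can replace it; any other client status
		// leaves the alloc untainted, counting it against the desired total.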
		switch alloc.ClientStatus {
		case structs.AllocClientStatusFailed:
		default:
			return true, false
		}
		return false, false
	}

	// Handle service jobs
	switch alloc.DesiredStatus {
	case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
		return false, true
	default:
	}

	switch alloc.ClientStatus {
	case structs.AllocClientStatusComplete, structs.AllocClientStatusLost:
		return false, true
	default:
	}

	return false, false
}

// updateByReschedulable is a helper method that encapsulates the logic for whether a failed allocation
// should be rescheduled now, later, or left in the untainted set
func updateByReschedulable(alloc *structs.Allocation, now time.Time, evalID string, d *structs.Deployment, isDisconnecting bool) (rescheduleNow, rescheduleLater bool, rescheduleTime time.Time) {
	// If the allocation is part of an ongoing active deployment, we only allow it to reschedule
	// if it has been marked eligible
	if d != nil && alloc.DeploymentID == d.ID && d.Active() && !alloc.DesiredTransition.ShouldReschedule() {
		return
	}

	// Check if the allocation is marked to be force rescheduled
	if alloc.DesiredTransition.ShouldForceReschedule() {
		rescheduleNow = true
	}

	// Reschedule if the eval ID matches the alloc's followup evalID or if it's close to its reschedule time
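	// For a disconnecting alloc there is no failure recorded in its reschedule
	// tracker yet, so the next reschedule time is computed from "now" (the
	// moment of the disconnect) rather than from the tracker.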
	var eligible bool
	if isDisconnecting {
		rescheduleTime, eligible = alloc.NextRescheduleTimeByFailTime(now)
	} else {
		rescheduleTime, eligible = alloc.NextRescheduleTime()
	}

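	// rescheduleWindowSize acts as a small buffer: an alloc whose reschedule
	// time falls just past this eval's view of "now" is still rescheduled here
	// rather than being deferred to another follow-up evaluation.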
	if eligible && (alloc.FollowupEvalID == evalID || rescheduleTime.Sub(now) <= rescheduleWindowSize) {
		rescheduleNow = true
		return
	}
	if eligible && alloc.FollowupEvalID == "" {
		rescheduleLater = true
	}
	return
}

// filterByTerminal filters out terminal allocs
func filterByTerminal(untainted allocSet) (nonTerminal allocSet) {
	nonTerminal = make(map[string]*structs.Allocation)
	for id, alloc := range untainted {
		if !alloc.TerminalStatus() {
			nonTerminal[id] = alloc
		}
	}
	return
}

// filterByDeployment filters allocations into two sets, those that match the
// given deployment ID and those that don't
func (a allocSet) filterByDeployment(id string) (match, nonmatch allocSet) {
	match = make(map[string]*structs.Allocation)
	nonmatch = make(map[string]*structs.Allocation)
	for _, alloc := range a {
		if alloc.DeploymentID == id {
			match[alloc.ID] = alloc
		} else {
			nonmatch[alloc.ID] = alloc
		}
	}
	return
}

// delayByStopAfterClientDisconnect returns a delay for any lost allocation that's got a
// stop_after_client_disconnect configured
func (a allocSet) delayByStopAfterClientDisconnect() (later []*delayedRescheduleInfo) {
	now := time.Now().UTC()
	for _, a := range a {
		if !a.ShouldClientStop() {
			continue
		}

		t := a.WaitClientStop()

		if t.After(now) {
			later = append(later, &delayedRescheduleInfo{
				allocID:        a.ID,
				alloc:          a,
				rescheduleTime: t,
			})
		}
	}
	return later
}

// delayByMaxClientDisconnect returns a delay for any unknown allocation
// that's got a max_client_disconnect configured
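// The resulting delays are used when creating follow-up evaluations (see
// allocReconciler.createTimeoutLaterEvals) for allocs that have not
// reconnected before their window expires.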
func (a allocSet) delayByMaxClientDisconnect(now time.Time) (later []*delayedRescheduleInfo, err error) {
	for _, alloc := range a {
		timeout := alloc.DisconnectTimeout(now)

		if !timeout.After(now) {
			continue
		}

		later = append(later, &delayedRescheduleInfo{
			allocID:        alloc.ID,
			alloc:          alloc,
			rescheduleTime: timeout,
		})
	}

	return
}

// filterByClientStatus returns allocs from the set with the specified client status.
func (a allocSet) filterByClientStatus(clientStatus string) allocSet {
	allocs := make(allocSet)
	for _, alloc := range a {
		if alloc.ClientStatus == clientStatus {
			allocs[alloc.ID] = alloc
		}
	}

	return allocs
}

// allocNameIndex is used to select allocation names for placement or removal
// given an existing set of placed allocations.
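// Names are generated with structs.AllocName and follow the
// "<job>.<taskGroup>[<index>]" pattern, e.g. "example.cache[3]".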
type allocNameIndex struct {
	job, taskGroup string
	count          int
	b              structs.Bitmap
}

// newAllocNameIndex returns an allocNameIndex for use in selecting names of
// allocations to create or stop. It takes the job and task group name, desired
// count and any existing allocations as input.
func newAllocNameIndex(job, taskGroup string, count int, in allocSet) *allocNameIndex {
	return &allocNameIndex{
		count:     count,
		b:         bitmapFrom(in, uint(count)),
		job:       job,
		taskGroup: taskGroup,
	}
}

// bitmapFrom creates a bitmap from the given allocation set and a minimum
// size. The size of the bitmap is the larger of the passed minimum and the
// maximum alloc index of the passed input (byte aligned).
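// For example (illustrative only): an input whose highest alloc index is 5,
// with a smaller minimum, yields a bitmap of size 8; a highest index of 8 is
// first bumped to 9 (so the max is not byte-aligned) and then rounded up to a
// size of 16 so that index 8 still fits.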
func bitmapFrom(input allocSet, minSize uint) structs.Bitmap {
	var max uint
	for _, a := range input {
		if num := a.Index(); num > max {
			max = num
		}
	}

	if l := uint(len(input)); minSize < l {
		minSize = l
	}

	if max < minSize {
		max = minSize
	} else if max%8 == 0 {
		// This may be possible if the job was scaled down. We want to make sure
		// that the max index is not byte-aligned otherwise we will overflow
		// the bitmap.
		max++
	}

	if max == 0 {
		max = 8
	}

	// byteAlign the count
	if remainder := max % 8; remainder != 0 {
		max = max + 8 - remainder
	}

	bitmap, err := structs.NewBitmap(max)
	if err != nil {
		panic(err)
	}

	for _, a := range input {
		bitmap.Set(a.Index())
	}

	return bitmap
}

// Highest removes and returns the highest n used names. The returned set
// can contain fewer than n names if there aren't n names set in the index.
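// For example (illustrative only): with indexes {0, 1, 3} currently set,
// Highest(2) unsets indexes 3 and 1 and returns their generated names.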
func (a *allocNameIndex) Highest(n uint) map[string]struct{} {
	h := make(map[string]struct{}, n)
	for i := a.b.Size(); i > uint(0) && uint(len(h)) < n; i-- {
		// Use this to avoid wrapping around b/c of the unsigned int
		idx := i - 1
		if a.b.Check(idx) {
			a.b.Unset(idx)
			h[structs.AllocName(a.job, a.taskGroup, idx)] = struct{}{}
		}
	}

	return h
}

// Set sets the indexes from the passed alloc set as used
func (a *allocNameIndex) Set(set allocSet) {
	for _, alloc := range set {
		a.b.Set(alloc.Index())
	}
}

// Unset unsets all indexes of the passed alloc set as being used
func (a *allocNameIndex) Unset(as allocSet) {
	for _, alloc := range as {
		a.b.Unset(alloc.Index())
	}
}

// UnsetIndex unsets the index as having its name used
func (a *allocNameIndex) UnsetIndex(idx uint) {
	a.b.Unset(idx)
}

// NextCanaries returns the next n names for use as canaries and sets them as
// used. The existing canaries and destructive updates are also passed in.
func (a *allocNameIndex) NextCanaries(n uint, existing, destructive allocSet) []string {
	next := make([]string, 0, n)

	// Create a name index
	existingNames := existing.nameSet()

	// First select indexes from the allocations that are undergoing destructive
	// updates. This way we avoid duplicate names as they will get replaced.
	dmap := bitmapFrom(destructive, uint(a.count))
	remainder := n
	for _, idx := range dmap.IndexesInRange(true, uint(0), uint(a.count)-1) {
		name := structs.AllocName(a.job, a.taskGroup, uint(idx))
		if _, used := existingNames[name]; !used {
			next = append(next, name)
			a.b.Set(uint(idx))

			// If we have enough, return
			remainder = n - uint(len(next))
			if remainder == 0 {
				return next
			}
		}
	}

	// Get the set of unset names that can be used
	for _, idx := range a.b.IndexesInRange(false, uint(0), uint(a.count)-1) {
		name := structs.AllocName(a.job, a.taskGroup, uint(idx))
		if _, used := existingNames[name]; !used {
			next = append(next, name)
			a.b.Set(uint(idx))

			// If we have enough, return
			remainder = n - uint(len(next))
			if remainder == 0 {
				return next
			}
		}
	}

	// We have exhausted the preferred and free set. Pick starting from the
	// desired count up to count+remainder, to avoid overlapping where
	// possible. An example is the desired count is 3 and we want 5 canaries.
	// The first 3 canaries can use index [0, 1, 2] but after that we prefer
	// picking indexes [3, 4] so that we do not overlap. Once the canaries are
	// promoted, these would be the allocations that would be shut down as
	// well.
	for i := uint(a.count); i < uint(a.count)+remainder; i++ {
		name := structs.AllocName(a.job, a.taskGroup, i)
		next = append(next, name)
	}

	return next
}

// Next returns the next n names for use as new placements and sets them as
// used.
func (a *allocNameIndex) Next(n uint) []string {
	next := make([]string, 0, n)

	// Get the set of unset names that can be used
	remainder := n
	for _, idx := range a.b.IndexesInRange(false, uint(0), uint(a.count)-1) {
		next = append(next, structs.AllocName(a.job, a.taskGroup, uint(idx)))
		a.b.Set(uint(idx))

		// If we have enough, return
		remainder = n - uint(len(next))
		if remainder == 0 {
			return next
		}
	}

	// We have exhausted the free set, now just pick overlapping indexes
	var i uint
	for i = 0; i < remainder; i++ {
		next = append(next, structs.AllocName(a.job, a.taskGroup, i))
		a.b.Set(i)
	}

	return next
}