// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package scheduler

import (
	"fmt"
	"runtime/debug"

	log "github.com/hashicorp/go-hclog"
	"github.com/hashicorp/go-memdb"

	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxSystemScheduleAttempts is used to limit the number of times we will
	// attempt to schedule if we continue to hit conflicts for system jobs.
	maxSystemScheduleAttempts = 5

	// maxSysBatchScheduleAttempts is used to limit the number of times we will
	// attempt to schedule if we continue to hit conflicts for sysbatch jobs.
	maxSysBatchScheduleAttempts = 2
)

// SystemScheduler is used for 'system' and 'sysbatch' jobs. This scheduler is
// designed for jobs that should be run on every client. The 'system' mode
// will ensure those jobs continuously run regardless of successful task exits,
// whereas 'sysbatch' considers the task complete on success.
type SystemScheduler struct {
	logger   log.Logger
	eventsCh chan<- interface{}
	state    State
	planner  Planner
	sysbatch bool

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *SystemStack

	nodes         []*structs.Node
	notReadyNodes map[string]struct{}
	nodesByDC     map[string]int

	limitReached bool
	nextEval     *structs.Evaluation

	failedTGAllocs map[string]*structs.AllocMetric
	queuedAllocs   map[string]int
}

// NewSystemScheduler is a factory function to instantiate a new system
// scheduler.
func NewSystemScheduler(logger log.Logger, eventsCh chan<- interface{}, state State, planner Planner) Scheduler {
	return &SystemScheduler{
		logger:   logger.Named("system_sched"),
		eventsCh: eventsCh,
		state:    state,
		planner:  planner,
		sysbatch: false,
	}
}

// NewSysBatchScheduler is a factory function to instantiate a new sysbatch
// scheduler.
func NewSysBatchScheduler(logger log.Logger, eventsCh chan<- interface{}, state State, planner Planner) Scheduler {
	return &SystemScheduler{
		logger:   logger.Named("sysbatch_sched"),
		eventsCh: eventsCh,
		state:    state,
		planner:  planner,
		sysbatch: true,
	}
}
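
// The sketch below is illustrative only and not part of the upstream file: it
// shows how a caller (for example a scheduling worker) might construct the
// scheduler and hand it an evaluation. The state, planner, and job ID values
// are assumptions supplied by the caller.
func exampleRunSystemEval(logger log.Logger, state State, planner Planner) error {
	// Events published by the scheduler are sent on this channel; a real
	// caller would drain it concurrently.
	eventsCh := make(chan interface{}, 8)

	sched := NewSystemScheduler(logger, eventsCh, state, planner)

	eval := &structs.Evaluation{
		ID:          uuid.Generate(),
		Namespace:   structs.DefaultNamespace,
		JobID:       "example-system-job",
		TriggeredBy: structs.EvalTriggerJobRegister,
	}
	return sched.Process(eval)
}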

// Process is used to handle a single evaluation.
func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) {

	defer func() {
		if r := recover(); r != nil {
			s.logger.Error("processing eval panicked scheduler - please report this as a bug!", "eval_id", eval.ID, "error", r, "stack_trace", string(debug.Stack()))
			err = fmt.Errorf("failed to process eval: %v", r)
		}
	}()

	// Store the evaluation
	s.eval = eval

	// Update our logger with the eval's information
	s.logger = s.logger.With("eval_id", eval.ID, "job_id", eval.JobID, "namespace", eval.Namespace)

	// Verify the evaluation trigger reason is understood
	if !s.canHandle(eval.TriggeredBy) {
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason", eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusFailed, desc,
			s.queuedAllocs, "")
	}

	limit := maxSystemScheduleAttempts
	if s.sysbatch {
		limit = maxSysBatchScheduleAttempts
	}

	// Retry up to the limit and reset if progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	if err := retryMax(limit, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, statusErr.EvalStatus, err.Error(),
				s.queuedAllocs, "")
		}
		return err
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusComplete, "",
		s.queuedAllocs, "")
}
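
// Note on the retry above: retryMax (defined elsewhere in this package) runs
// s.process up to limit times, and the progress closure lets it reset its
// attempt count whenever the previous plan result shows forward progress
// (for example allocations placed or stopped), so conflict-heavy evaluations
// are not failed prematurely.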

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
func (s *SystemScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	ws := memdb.NewWatchSet()
	s.job, err = s.state.JobByID(ws, s.eval.Namespace, s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v", s.eval.JobID, err)
	}

	numTaskGroups := 0
	if !s.job.Stopped() {
		numTaskGroups = len(s.job.TaskGroups)
	}
	s.queuedAllocs = make(map[string]int, numTaskGroups)

	// Get the ready nodes in the required datacenters and node pool
	if !s.job.Stopped() {
		s.nodes, s.notReadyNodes, s.nodesByDC, err = readyNodesInDCsAndPool(
			s.state, s.job.Datacenters, s.job.NodePool)
		if err != nil {
			return false, fmt.Errorf("failed to get ready nodes: %v", err)
		}
	}

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Reset the failed allocations
	s.failedTGAllocs = nil

	// Create an evaluation context
	s.ctx = NewEvalContext(s.eventsCh, s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewSystemStack(s.sysbatch, s.ctx)
	if !s.job.Stopped() {
		s.setJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Error("failed to compute job allocations", "error", err)
		return false, err
	}

	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the
	// plan anyway to get the annotations.
	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
		return true, nil
	}

	// If the limit of placements was reached we need to create an evaluation
	// to pick up from here after the stagger period.
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Error("failed to make next eval for rolling update", "error", err)
			return false, err
		}
		s.logger.Debug("rolling update limit reached, next eval created", "next_eval_id", s.nextEval.ID)
	}

	// Submit the plan
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// Decrement the number of allocations pending per task group based on the
	// number of allocations successfully placed
	adjustQueuedAllocations(s.logger, result, s.queuedAllocs)

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Debug("refresh forced")
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed, potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Debug("plan didn't fully commit", "attempted", expected, "placed", actual)
		return false, nil
	}

	// Success!
	return true, nil
}
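
// process reports done=true when no further attempts are needed: the plan was
// a no-op or was fully committed. It reports done=false with a nil error when
// another attempt should be made, for example after a forced state refresh or
// a partially committed plan.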

// setJob updates the stack with the given job and job's node pool scheduler
// configuration.
func (s *SystemScheduler) setJob(job *structs.Job) error {
	// Fetch node pool and global scheduler configuration to determine how to
	// configure the scheduler.
	pool, err := s.state.NodePoolByName(nil, job.NodePool)
	if err != nil {
		return fmt.Errorf("failed to get job node pool %q: %v", job.NodePool, err)
	}

	_, schedConfig, err := s.state.SchedulerConfig()
	if err != nil {
		return fmt.Errorf("failed to get scheduler configuration: %v", err)
	}

	s.stack.SetJob(job)
	s.stack.SetSchedulerConfiguration(schedConfig.WithNodePool(pool))
	return nil
}
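
// The configuration applied above comes from schedConfig.WithNodePool(pool);
// the intent, as the name suggests, is that scheduler settings defined on the
// job's node pool take precedence over the global scheduler configuration when
// building placements for this job.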

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *SystemScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	ws := memdb.NewWatchSet()
	allocs, err := s.state.AllocsByJob(ws, s.eval.Namespace, s.eval.JobID, true)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v", s.eval.JobID, err)
	}

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v", s.eval.JobID, err)
	}

	// Update the allocations which are in pending/running state on tainted
	// nodes to lost.
	updateNonTerminalAllocsToLost(s.plan, tainted, allocs)

	// Split out terminal allocations
	live, term := structs.SplitTerminalAllocs(allocs)

	// Diff the required and existing allocations
	diff := diffSystemAllocs(s.job, s.nodes, s.notReadyNodes, tainted, live, term,
		s.planner.ServersMeetMinimumVersion(minVersionMaxClientDisconnect, true))
	s.logger.Debug("reconciled current state with desired state", "results", log.Fmt("%#v", diff))

	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendStoppedAlloc(e.Alloc, allocNotNeeded, "", "")
	}

	// Add all the allocs to migrate
	for _, e := range diff.migrate {
		s.plan.AppendStoppedAlloc(e.Alloc, allocNodeTainted, "", "")
	}

	// Lost allocations should be transitioned to desired status stop and client
	// status lost.
	for _, e := range diff.lost {
		s.plan.AppendStoppedAlloc(e.Alloc, allocLost, structs.AllocClientStatusLost, "")
	}

	// Mark disconnecting allocations as unknown.
	for _, e := range diff.disconnecting {
		s.plan.AppendUnknownAlloc(e.Alloc)
	}

	// Attempt to do the upgrades in place.
	// Reconnecting allocations need to be updated to persist alloc state
	// changes.
	updates := make([]allocTuple, 0, len(diff.update)+len(diff.reconnecting))
	updates = append(updates, diff.update...)
	updates = append(updates, diff.reconnecting...)
	destructiveUpdates, inplaceUpdates := inplaceUpdate(s.ctx, s.eval, s.job, s.stack, updates)
	diff.update = destructiveUpdates

	if s.eval.AnnotatePlan {
		s.plan.Annotations = &structs.PlanAnnotations{
			DesiredTGUpdates: desiredUpdates(diff, inplaceUpdates, destructiveUpdates),
		}
	}

	// Check if a rolling upgrade strategy is being used
	limit := len(diff.update)
	if !s.job.Stopped() && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}

	// Treat non in-place updates as an eviction and new placement.
	s.limitReached = evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		if !s.job.Stopped() {
			for _, tg := range s.job.TaskGroups {
				s.queuedAllocs[tg.Name] = 0
			}
		}
		return nil
	}

	// Record the number of allocations that need to be placed per task group
	for _, allocTuple := range diff.place {
		s.queuedAllocs[allocTuple.TaskGroup.Name] += 1
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}
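
// The reconciliation above drives the rest of the plan: diff.stop, diff.migrate,
// diff.lost, and diff.disconnecting are written straight into the plan, while
// diff.update and diff.reconnecting are first attempted in place; whatever
// remains destructive is evicted and re-placed, leaving diff.place for
// computePlacements below.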

// mergeNodeFiltered accumulates node-filtering metrics for a task group across
// placement attempts so a single AllocMetric can summarize every node that was
// filtered out.
func mergeNodeFiltered(acc, curr *structs.AllocMetric) *structs.AllocMetric {
	if acc == nil {
		return curr.Copy()
	}

	acc.NodesEvaluated += curr.NodesEvaluated
	acc.NodesFiltered += curr.NodesFiltered

	if acc.ClassFiltered == nil {
		acc.ClassFiltered = make(map[string]int)
	}
	for k, v := range curr.ClassFiltered {
		acc.ClassFiltered[k] += v
	}

	if acc.ConstraintFiltered == nil {
		acc.ConstraintFiltered = make(map[string]int)
	}
	for k, v := range curr.ConstraintFiltered {
		acc.ConstraintFiltered[k] += v
	}

	acc.AllocationTime += curr.AllocationTime
	return acc
}
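
// The sketch below is illustrative only (not part of the upstream file): it
// shows how mergeNodeFiltered is meant to be used, folding the metrics from
// several failed placement attempts into one accumulated AllocMetric.
func exampleMergeFiltered(attempts []*structs.AllocMetric) *structs.AllocMetric {
	var acc *structs.AllocMetric
	for _, m := range attempts {
		// The first call copies the metric; later calls add the node counts
		// and merge the class/constraint filter histograms.
		acc = mergeNodeFiltered(acc, m)
	}
	return acc
}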

// computePlacements computes placements for allocations
func (s *SystemScheduler) computePlacements(place []allocTuple) error {
	nodeByID := make(map[string]*structs.Node, len(s.nodes))
	for _, node := range s.nodes {
		nodeByID[node.ID] = node
	}

	// Track node filtering, to only report an error if all nodes have been filtered.
	var filteredMetrics map[string]*structs.AllocMetric

	nodes := make([]*structs.Node, 1)
	for _, missing := range place {
		tgName := missing.TaskGroup.Name

		node, ok := nodeByID[missing.Alloc.NodeID]
		if !ok {
			s.logger.Debug("could not find node", "node_id", missing.Alloc.NodeID)
			continue
		}

		// Update the set of placement nodes
		nodes[0] = node
		s.stack.SetNodes(nodes)

		// Attempt to match the task group
		option := s.stack.Select(missing.TaskGroup, &SelectOptions{AllocName: missing.Name})

		if option == nil {
			// If the task can't be placed on this node, update reporting data
			// and continue to short circuit the loop.

			// If this node was filtered because of constraint
			// mismatches and we couldn't create an allocation then
			// decrement queuedAllocs for that task group.
			if s.ctx.metrics.NodesFiltered > 0 {
				queued := s.queuedAllocs[tgName] - 1
				s.queuedAllocs[tgName] = queued

				if filteredMetrics == nil {
					filteredMetrics = map[string]*structs.AllocMetric{}
				}
				filteredMetrics[tgName] = mergeNodeFiltered(filteredMetrics[tgName], s.ctx.Metrics())

				if queued <= 0 {
					if s.failedTGAllocs == nil {
						s.failedTGAllocs = make(map[string]*structs.AllocMetric)
					}
					s.failedTGAllocs[tgName] = filteredMetrics[tgName]
				}

				// If we are annotating the plan, then decrement the desired
				// placements based on whether the node meets the constraints
				if s.eval.AnnotatePlan && s.plan.Annotations != nil &&
					s.plan.Annotations.DesiredTGUpdates != nil {
					desired := s.plan.Annotations.DesiredTGUpdates[tgName]
					desired.Place -= 1
				}

				// Filtered nodes are not reported to users, just omitted from the job status
				continue
			}

			// Check if this task group has already failed, reported to the user as a count
			if metric, ok := s.failedTGAllocs[tgName]; ok {
				metric.CoalescedFailures += 1
				metric.ExhaustResources(missing.TaskGroup)
				continue
			}

			// Store the available nodes by datacenter
			s.ctx.Metrics().NodesAvailable = s.nodesByDC
			s.ctx.Metrics().NodesInPool = len(s.nodes)

			// Compute top K scoring node metadata
			s.ctx.Metrics().PopulateScoreMetaData()

			// Lazy initialize the failed map
			if s.failedTGAllocs == nil {
				s.failedTGAllocs = make(map[string]*structs.AllocMetric)
			}

			// Update metrics with the resources requested by the task group.
			s.ctx.Metrics().ExhaustResources(missing.TaskGroup)

			// Actual failure to start this task on this candidate node, report it individually
			s.failedTGAllocs[tgName] = s.ctx.Metrics()
			s.addBlocked(node)

			continue
		}

		// Store the available nodes by datacenter
		s.ctx.Metrics().NodesAvailable = s.nodesByDC
		s.ctx.Metrics().NodesInPool = len(s.nodes)

		// Compute top K scoring node metadata
		s.ctx.Metrics().PopulateScoreMetaData()

		// Set fields based on if we found an allocation option
		resources := &structs.AllocatedResources{
			Tasks:          option.TaskResources,
			TaskLifecycles: option.TaskLifecycles,
			Shared: structs.AllocatedSharedResources{
				DiskMB: int64(missing.TaskGroup.EphemeralDisk.SizeMB),
			},
		}

		if option.AllocResources != nil {
			resources.Shared.Networks = option.AllocResources.Networks
			resources.Shared.Ports = option.AllocResources.Ports
		}

		// Create an allocation for this
		alloc := &structs.Allocation{
			ID:                 uuid.Generate(),
			Namespace:          s.job.Namespace,
			EvalID:             s.eval.ID,
			Name:               missing.Name,
			JobID:              s.job.ID,
			TaskGroup:          tgName,
			Metrics:            s.ctx.Metrics(),
			NodeID:             option.Node.ID,
			NodeName:           option.Node.Name,
			TaskResources:      resources.OldTaskResources(),
			AllocatedResources: resources,
			DesiredStatus:      structs.AllocDesiredStatusRun,
			ClientStatus:       structs.AllocClientStatusPending,
			// SharedResources is considered deprecated, will be removed in 0.11.
			// It is only set for compat reasons.
			SharedResources: &structs.Resources{
				DiskMB:   missing.TaskGroup.EphemeralDisk.SizeMB,
				Networks: resources.Shared.Networks,
			},
		}

		// If the new allocation is replacing an older allocation then we record the
		// older allocation id so that they are chained
		if missing.Alloc != nil {
			alloc.PreviousAllocation = missing.Alloc.ID
		}

		// If this placement involves preemption, set DesiredState to evict for those allocations
		if option.PreemptedAllocs != nil {
			var preemptedAllocIDs []string
			for _, stop := range option.PreemptedAllocs {
				s.plan.AppendPreemptedAlloc(stop, alloc.ID)

				preemptedAllocIDs = append(preemptedAllocIDs, stop.ID)
				if s.eval.AnnotatePlan && s.plan.Annotations != nil {
					s.plan.Annotations.PreemptedAllocs = append(s.plan.Annotations.PreemptedAllocs, stop.Stub(nil))
					if s.plan.Annotations.DesiredTGUpdates != nil {
						desired := s.plan.Annotations.DesiredTGUpdates[tgName]
						desired.Preemptions += 1
					}
				}
			}
			alloc.PreemptedAllocations = preemptedAllocIDs
		}

		s.plan.AppendAlloc(alloc, nil)
	}

	return nil
}
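
// Unlike the generic service/batch schedulers, each required allocation here
// already names its target node, so the stack is narrowed to that single node
// via SetNodes before Select runs; a nil option means that particular node was
// either filtered out or could not fit the task group.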

// addBlocked creates a new blocked eval for this job on this node and submits
// it to the planner (worker.go), which keeps the eval for later execution.
func (s *SystemScheduler) addBlocked(node *structs.Node) error {
	e := s.ctx.Eligibility()
	escaped := e.HasEscaped()

	// Only store the eligible classes if the eval hasn't escaped.
	var classEligibility map[string]bool
	if !escaped {
		classEligibility = e.GetClasses()
	}

	blocked := s.eval.CreateBlockedEval(classEligibility, escaped, e.QuotaLimitReached(), s.failedTGAllocs)
	blocked.StatusDescription = blockedEvalFailedPlacements
	blocked.NodeID = node.ID

	return s.planner.CreateEval(blocked)
}

// canHandle returns true if the scheduler understands the given evaluation
// trigger. The empty case bodies intentionally fall out of the switch, so any
// listed trigger is accepted; only unknown triggers reach the default branch,
// where periodic triggers are accepted for sysbatch jobs alone.
func (s *SystemScheduler) canHandle(trigger string) bool {
	switch trigger {
	case structs.EvalTriggerJobRegister:
	case structs.EvalTriggerNodeUpdate:
	case structs.EvalTriggerFailedFollowUp:
	case structs.EvalTriggerJobDeregister:
	case structs.EvalTriggerRollingUpdate:
	case structs.EvalTriggerPreemption:
	case structs.EvalTriggerDeploymentWatcher:
	case structs.EvalTriggerNodeDrain:
	case structs.EvalTriggerAllocStop:
	case structs.EvalTriggerQueuedAllocs:
	case structs.EvalTriggerScaling:
	case structs.EvalTriggerReconnect:
	default:
		switch s.sysbatch {
		case true:
			return trigger == structs.EvalTriggerPeriodicJob
		case false:
			return false
		}
	}
	return true
}
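
// The sketch below is illustrative only (not part of the upstream file) and
// shows how the two modes differ in the triggers they accept.
func exampleCanHandle() {
	system := &SystemScheduler{sysbatch: false}
	sysbatch := &SystemScheduler{sysbatch: true}

	_ = system.canHandle(structs.EvalTriggerNodeUpdate)    // true: handled by both modes
	_ = system.canHandle(structs.EvalTriggerPeriodicJob)   // false: system jobs cannot be periodic
	_ = sysbatch.canHandle(structs.EvalTriggerPeriodicJob) // true: sysbatch jobs may be periodic
}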