811fe333da
Many of the functions in the `utils.go` file are specific to a particular scheduler, and very few of them have guards (or even names) that help avoid misuse by a different scheduler type. Move these functions (and their tests) into files specific to their scheduler type, without any functionality changes, to make it clear which helpers belong to which scheduler.

package scheduler

// The structs and helpers in this file are split out of scheduler_system.go and
// shared by the system and sysbatch schedulers. No code in the generic scheduler
// or reconciler should use anything here! If you need something here for
// service/batch jobs, double-check it's safe to use for all scheduler types
// before moving it into util.go.

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

// materializeSystemTaskGroups is used to materialize all the task groups
// a system or sysbatch job requires.
func materializeSystemTaskGroups(job *structs.Job) map[string]*structs.TaskGroup {
	out := make(map[string]*structs.TaskGroup)
	if job.Stopped() {
		return out
	}

	for _, tg := range job.TaskGroups {
		for i := 0; i < tg.Count; i++ {
			name := fmt.Sprintf("%s.%s[%d]", job.Name, tg.Name, i)
			out[name] = tg
		}
	}
	return out
}

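// The sketch below is illustrative only (it is not part of the original file)
// and shows the naming scheme produced by materializeSystemTaskGroups: each
// required allocation is keyed "<job>.<group>[<index>]". The job and group
// names are made up for the example.
func exampleMaterializeSystemTaskGroups() {
	job := &structs.Job{
		Name: "cache",
		TaskGroups: []*structs.TaskGroup{
			{Name: "redis", Count: 1},
		},
	}

	// For a running (non-stopped) job, the map has one key per group instance.
	for name := range materializeSystemTaskGroups(job) {
		fmt.Println(name) // prints "cache.redis[0]"
	}
}
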
// diffSystemAllocsForNode is used to do a set difference between the target allocations
// and the existing allocations for a particular node. This returns 8 sets of results,
// the list of named task groups that need to be placed (no existing allocation), the
// allocations that need to be updated (job definition is newer), allocs that
// need to be migrated (node is draining), the allocs that need to be evicted
// (no longer required), those that should be ignored, those that are lost
// that need to be replaced (running on a lost node), those that are running on
// a disconnected node but may resume, and those that may still be running on
// a node that has resumed reconnection.
func diffSystemAllocsForNode(
	job *structs.Job, // job whose allocs are going to be diff-ed
	nodeID string,
	eligibleNodes map[string]*structs.Node,
	notReadyNodes map[string]struct{}, // nodes that are not ready, e.g. draining
	taintedNodes map[string]*structs.Node, // nodes which are down (by node id)
	required map[string]*structs.TaskGroup, // set of allocations that must exist
	allocs []*structs.Allocation, // non-terminal allocations that exist
	terminal structs.TerminalByNodeByName, // latest terminal allocations (by node, id)
	serverSupportsDisconnectedClients bool, // flag indicating whether to apply disconnected client logic
) *diffResult {
	result := new(diffResult)

	// Scan the existing updates
	existing := make(map[string]struct{}) // set of alloc names
	for _, exist := range allocs {
		// Index the existing node
		name := exist.Name
		existing[name] = struct{}{}

		// Check for the definition in the required set
		tg, ok := required[name]

		// If not required, we stop the alloc
		if !ok {
			result.stop = append(result.stop, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		supportsDisconnectedClients := exist.SupportsDisconnectedClients(serverSupportsDisconnectedClients)

		reconnect := false
		expired := false

		// Only compute reconnect for unknown and running since they need to go
		// through the reconnect process.
		if supportsDisconnectedClients &&
			(exist.ClientStatus == structs.AllocClientStatusUnknown ||
				exist.ClientStatus == structs.AllocClientStatusRunning) {
			reconnect = exist.NeedsToReconnect()
			if reconnect {
				expired = exist.Expired(time.Now())
			}
		}

		// If we have been marked for migration and aren't terminal, migrate
		if !exist.TerminalStatus() && exist.DesiredTransition.ShouldMigrate() {
			result.migrate = append(result.migrate, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// If we are a sysbatch job and terminal, ignore (or stop?) the alloc
		if job.Type == structs.JobTypeSysBatch && exist.TerminalStatus() {
			result.ignore = append(result.ignore, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// Expired unknown allocs are lost. Expired checks that status is unknown.
		if supportsDisconnectedClients && expired {
			result.lost = append(result.lost, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// Ignore unknown allocs that we want to reconnect eventually.
		if supportsDisconnectedClients &&
			exist.ClientStatus == structs.AllocClientStatusUnknown &&
			exist.DesiredStatus == structs.AllocDesiredStatusRun {
			result.ignore = append(result.ignore, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		node, nodeIsTainted := taintedNodes[exist.NodeID]

		// Filter allocs on a node that is now re-connected to reconnecting.
		if supportsDisconnectedClients &&
			!nodeIsTainted &&
			reconnect {

			// Record the new ClientStatus to indicate to future evals that the
			// alloc has already reconnected.
			reconnecting := exist.Copy()
			reconnecting.AppendState(structs.AllocStateFieldClientStatus, exist.ClientStatus)
			result.reconnecting = append(result.reconnecting, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     reconnecting,
			})
			continue
		}

		// If we are on a tainted node, we must migrate if we are a service or
		// if the batch allocation did not finish
		if nodeIsTainted {
			// If the job is batch and finished successfully, the fact that the
			// node is tainted does not mean it should be migrated or marked as
			// lost as the work was already successfully finished. However for
			// service/system jobs, tasks should never complete. The check of
			// batch type defends against client bugs.
			if exist.Job.Type == structs.JobTypeSysBatch && exist.RanSuccessfully() {
				goto IGNORE
			}

			// Filter running allocs on a node that is disconnected to be marked as unknown.
			if node != nil &&
				supportsDisconnectedClients &&
				node.Status == structs.NodeStatusDisconnected &&
				exist.ClientStatus == structs.AllocClientStatusRunning {

				disconnect := exist.Copy()
				disconnect.ClientStatus = structs.AllocClientStatusUnknown
				disconnect.AppendState(structs.AllocStateFieldClientStatus, structs.AllocClientStatusUnknown)
				disconnect.ClientDescription = allocUnknown
				result.disconnecting = append(result.disconnecting, allocTuple{
					Name:      name,
					TaskGroup: tg,
					Alloc:     disconnect,
				})
				continue
			}

			if !exist.TerminalStatus() && (node == nil || node.TerminalStatus()) {
				result.lost = append(result.lost, allocTuple{
					Name:      name,
					TaskGroup: tg,
					Alloc:     exist,
				})
			} else {
				goto IGNORE
			}

			continue
		}

		// For an existing allocation, if the nodeID is no longer
		// eligible, the diff should be ignored
		if _, ineligible := notReadyNodes[nodeID]; ineligible {
			goto IGNORE
		}

		// Existing allocations on nodes that are no longer targeted
		// should be stopped
		if _, eligible := eligibleNodes[nodeID]; !eligible {
			result.stop = append(result.stop, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// If the definition is updated we need to update
		if job.JobModifyIndex != exist.Job.JobModifyIndex {
			result.update = append(result.update, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// Everything is up-to-date
	IGNORE:
		result.ignore = append(result.ignore, allocTuple{
			Name:      name,
			TaskGroup: tg,
			Alloc:     exist,
		})
	}

	// Scan the required groups
	for name, tg := range required {

		// Check for an existing allocation
		if _, ok := existing[name]; !ok {

			// Check for a terminal sysbatch allocation, which should not be placed
			// again unless the job has been updated.
			if job.Type == structs.JobTypeSysBatch {
				if alloc, termExists := terminal.Get(nodeID, name); termExists {
					// the alloc is terminal, but now the job has been updated
					if job.JobModifyIndex != alloc.Job.JobModifyIndex {
						result.update = append(result.update, allocTuple{
							Name:      name,
							TaskGroup: tg,
							Alloc:     alloc,
						})
					} else {
						// alloc is terminal and job unchanged, leave it alone
						result.ignore = append(result.ignore, allocTuple{
							Name:      name,
							TaskGroup: tg,
							Alloc:     alloc,
						})
					}
					continue
				}
			}

			// Require a placement if no existing allocation. If there
			// is an existing allocation, we would have checked for a potential
			// update or ignore above. Ignore placements for tainted or
			// ineligible nodes

			// Tainted and ineligible nodes for a non existing alloc
			// should be filtered out and not count towards ignore or place
			if _, tainted := taintedNodes[nodeID]; tainted {
				continue
			}
			if _, eligible := eligibleNodes[nodeID]; !eligible {
				continue
			}

			termOnNode, _ := terminal.Get(nodeID, name)
			allocTuple := allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     termOnNode,
			}

			// If the new allocation isn't annotated with a previous allocation
			// or if the previous allocation isn't from the same node then we
			// annotate the allocTuple with a new Allocation
			if allocTuple.Alloc == nil || allocTuple.Alloc.NodeID != nodeID {
				allocTuple.Alloc = &structs.Allocation{NodeID: nodeID}
			}

			result.place = append(result.place, allocTuple)
		}
	}
	return result
}

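// Another illustrative sketch (not part of the original file): for a fresh,
// eligible node with no existing or terminal allocations, diffSystemAllocsForNode
// reduces to the placement branch at the bottom of the required-groups loop and
// yields exactly one placement per required group. All values are made up.
func exampleDiffSystemAllocsForNode() {
	job := &structs.Job{
		Type:       structs.JobTypeSystem,
		Name:       "cache",
		TaskGroups: []*structs.TaskGroup{{Name: "redis", Count: 1}},
	}
	node := &structs.Node{ID: "node-1"}

	diff := diffSystemAllocsForNode(
		job,
		node.ID,
		map[string]*structs.Node{node.ID: node}, // the node is eligible
		nil,  // no not-ready nodes
		nil,  // no tainted nodes
		materializeSystemTaskGroups(job),
		nil,  // no existing allocations
		nil,  // no terminal allocations
		true, // server supports disconnected clients
	)

	// One placement for "cache.redis[0]", annotated with a new Allocation
	// that carries only the target NodeID.
	fmt.Println(len(diff.place)) // prints "1"
}
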
// diffSystemAllocs is like diffSystemAllocsForNode; however, the allocations in the
// diffResult contain the specific nodeID they should be allocated on.
func diffSystemAllocs(
	job *structs.Job, // job whose allocations are going to be diff-ed
	readyNodes []*structs.Node, // list of nodes in the ready state
	notReadyNodes map[string]struct{}, // list of nodes in DC but not ready, e.g. draining
	taintedNodes map[string]*structs.Node, // nodes which are down or drain mode (by node id)
	allocs []*structs.Allocation, // non-terminal allocations
	terminal structs.TerminalByNodeByName, // latest terminal allocations (by node id)
	serverSupportsDisconnectedClients bool, // flag indicating whether to apply disconnected client logic
) *diffResult {

	// Build a mapping of nodes to all their allocs.
	nodeAllocs := make(map[string][]*structs.Allocation, len(allocs))
	for _, alloc := range allocs {
		nodeAllocs[alloc.NodeID] = append(nodeAllocs[alloc.NodeID], alloc)
	}

	eligibleNodes := make(map[string]*structs.Node)
	for _, node := range readyNodes {
		if _, ok := nodeAllocs[node.ID]; !ok {
			nodeAllocs[node.ID] = nil
		}
		eligibleNodes[node.ID] = node
	}

	// Create the required task groups.
	required := materializeSystemTaskGroups(job)

	result := new(diffResult)
	for nodeID, allocs := range nodeAllocs {
		diff := diffSystemAllocsForNode(job, nodeID, eligibleNodes, notReadyNodes, taintedNodes, required, allocs, terminal, serverSupportsDisconnectedClients)
		result.Append(diff)
	}

	return result
}

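// A further illustrative sketch (not part of the original file): diffSystemAllocs
// simply fans diffSystemAllocsForNode out across every node that is ready or
// already has allocations, so a one-group system job over two empty ready nodes
// produces two placements. Node IDs and names are made up.
func exampleDiffSystemAllocs() {
	job := &structs.Job{
		Type:       structs.JobTypeSystem,
		Name:       "cache",
		TaskGroups: []*structs.TaskGroup{{Name: "redis", Count: 1}},
	}
	readyNodes := []*structs.Node{{ID: "node-1"}, {ID: "node-2"}}

	diff := diffSystemAllocs(job, readyNodes, nil, nil, nil, nil, true)
	fmt.Println(len(diff.place)) // prints "2": one "cache.redis[0]" per node
}
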
// evictAndPlace is used to mark allocations for eviction and add them to the
// placement queue. evictAndPlace modifies both the diffResult and the
// limit. It returns true if the limit has been reached.
func evictAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool {
	n := len(allocs)
	for i := 0; i < n && i < *limit; i++ {
		a := allocs[i]
		ctx.Plan().AppendStoppedAlloc(a.Alloc, desc, "", "")
		diff.place = append(diff.place, a)
	}
	if n <= *limit {
		*limit -= n
		return false
	}
	*limit = 0
	return true
}

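// Finally, an illustrative note (not part of the original file) on the limit
// accounting in evictAndPlace, assuming a hypothetical drain with three allocs
// to replace and a remaining limit of 2:
//
//	limit := 2
//	limited := evictAndPlace(ctx, diff, allocs, "node drain", &limit)
//	// The first two allocs are appended to the plan as stopped and queued for
//	// placement, limit is set to 0, and limited is true because the third
//	// alloc could not be handled within the limit.
//
// When len(allocs) <= *limit, every alloc is evicted and re-queued for
// placement, the limit is reduced by len(allocs), and the function returns
// false. The ctx, diff, allocs, and description values above are assumed to
// come from the calling scheduler and are not defined here.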