2015-08-13 23:25:59 +00:00
|
|
|
package scheduler
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
2015-10-15 00:26:20 +00:00
|
|
|
"log"
|
2015-09-07 18:23:38 +00:00
|
|
|
"math/rand"
|
2015-09-07 19:25:23 +00:00
|
|
|
"reflect"
|
2015-08-13 23:25:59 +00:00
|
|
|
|
2017-02-08 04:31:23 +00:00
|
|
|
memdb "github.com/hashicorp/go-memdb"
|
2015-08-13 23:25:59 +00:00
|
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
|
|
)
|
|
|
|
|
2015-08-14 01:16:32 +00:00
|
|
|
// allocTuple is a tuple of the allocation name and potential alloc ID
|
|
|
|
type allocTuple struct {
|
|
|
|
Name string
|
|
|
|
TaskGroup *structs.TaskGroup
|
|
|
|
Alloc *structs.Allocation
|
2015-08-13 23:25:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// materializeTaskGroups is used to materialize all the task groups
|
|
|
|
// a job requires. This is used to do the count expansion.
|
|
|
|
func materializeTaskGroups(job *structs.Job) map[string]*structs.TaskGroup {
|
|
|
|
out := make(map[string]*structs.TaskGroup)
|
2017-04-15 03:54:30 +00:00
|
|
|
if job == nil || job.Stop {
|
2015-10-15 20:14:44 +00:00
|
|
|
return out
|
2015-08-13 23:25:59 +00:00
|
|
|
}
|
|
|
|
|
2015-10-14 23:43:06 +00:00
|
|
|
for _, tg := range job.TaskGroups {
|
2015-10-15 20:14:44 +00:00
|
|
|
for i := 0; i < tg.Count; i++ {
|
|
|
|
name := fmt.Sprintf("%s.%s[%d]", job.Name, tg.Name, i)
|
2015-10-14 23:43:06 +00:00
|
|
|
out[name] = tg
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return out
|
|
|
|
}
|
|
|
|
|
2015-08-14 01:28:09 +00:00
|
|
|
// diffResult is used to return the sets that result from the diff
|
|
|
|
type diffResult struct {
|
2016-08-03 22:45:42 +00:00
|
|
|
place, update, migrate, stop, ignore, lost []allocTuple
|
2015-08-14 01:28:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (d *diffResult) GoString() string {
|
2016-08-03 22:45:42 +00:00
|
|
|
return fmt.Sprintf("allocs: (place %d) (update %d) (migrate %d) (stop %d) (ignore %d) (lost %d)",
|
|
|
|
len(d.place), len(d.update), len(d.migrate), len(d.stop), len(d.ignore), len(d.lost))
|
2015-08-14 01:28:09 +00:00
|
|
|
}
|
|
|
|
|
2015-10-15 20:14:44 +00:00
|
|
|
func (d *diffResult) Append(other *diffResult) {
|
|
|
|
d.place = append(d.place, other.place...)
|
|
|
|
d.update = append(d.update, other.update...)
|
|
|
|
d.migrate = append(d.migrate, other.migrate...)
|
|
|
|
d.stop = append(d.stop, other.stop...)
|
|
|
|
d.ignore = append(d.ignore, other.ignore...)
|
2016-08-03 22:45:42 +00:00
|
|
|
d.lost = append(d.lost, other.lost...)
|
2015-10-15 20:14:44 +00:00
|
|
|
}
|
|
|
|
|
2015-08-13 23:25:59 +00:00
|
|
|
// diffAllocs is used to do a set difference between the target allocations
|
2016-08-03 22:45:42 +00:00
|
|
|
// and the existing allocations. This returns 6 sets of results, the list of
|
2015-08-13 23:25:59 +00:00
|
|
|
// named task groups that need to be placed (no existing allocation), the
|
2015-08-13 23:47:39 +00:00
|
|
|
// allocations that need to be updated (job definition is newer), allocs that
|
|
|
|
// need to be migrated (node is draining), the allocs that need to be evicted
|
2016-08-03 22:45:42 +00:00
|
|
|
// (no longer required), those that should be ignored and those that are lost
|
|
|
|
// that need to be replaced (running on a lost node).
|
2016-08-31 21:06:31 +00:00
|
|
|
//
|
|
|
|
// job is the job whose allocs is going to be diff-ed.
|
|
|
|
// taintedNodes is an index of the nodes which are either down or in drain mode
|
|
|
|
// by name.
|
|
|
|
// required is a set of allocations that must exist.
|
|
|
|
// allocs is a list of non terminal allocations.
|
|
|
|
// terminalAllocs is an index of the latest terminal allocations by name.
|
2016-08-03 22:45:42 +00:00
|
|
|
func diffAllocs(job *structs.Job, taintedNodes map[string]*structs.Node,
|
2016-08-30 22:36:30 +00:00
|
|
|
required map[string]*structs.TaskGroup, allocs []*structs.Allocation,
|
|
|
|
terminalAllocs map[string]*structs.Allocation) *diffResult {
|
2015-08-14 01:28:09 +00:00
|
|
|
result := &diffResult{}
|
2015-08-14 01:18:32 +00:00
|
|
|
|
2015-08-13 23:25:59 +00:00
|
|
|
// Scan the existing updates
|
2015-08-14 01:20:55 +00:00
|
|
|
existing := make(map[string]struct{})
|
|
|
|
for _, exist := range allocs {
|
|
|
|
// Index the existing node
|
|
|
|
name := exist.Name
|
|
|
|
existing[name] = struct{}{}
|
2015-08-13 23:25:59 +00:00
|
|
|
|
2015-08-14 01:20:55 +00:00
|
|
|
// Check for the definition in the required set
|
|
|
|
tg, ok := required[name]
|
2015-08-13 23:25:59 +00:00
|
|
|
|
2015-08-26 00:06:06 +00:00
|
|
|
// If not required, we stop the alloc
|
2015-08-14 01:20:55 +00:00
|
|
|
if !ok {
|
2015-10-16 18:43:09 +00:00
|
|
|
result.stop = append(result.stop, allocTuple{
|
2015-08-14 01:20:55 +00:00
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
|
|
|
Alloc: exist,
|
|
|
|
})
|
|
|
|
continue
|
|
|
|
}
|
2015-08-13 23:47:39 +00:00
|
|
|
|
2016-05-25 00:23:18 +00:00
|
|
|
// If we are on a tainted node, we must migrate if we are a service or
|
|
|
|
// if the batch allocation did not finish
|
2016-08-03 22:45:42 +00:00
|
|
|
if node, ok := taintedNodes[exist.NodeID]; ok {
|
2016-06-16 23:17:17 +00:00
|
|
|
// If the job is batch and finished successfully, the fact that the
|
2016-08-03 22:45:42 +00:00
|
|
|
// node is tainted does not mean it should be migrated or marked as
|
|
|
|
// lost as the work was already successfully finished. However for
|
|
|
|
// service/system jobs, tasks should never complete. The check of
|
|
|
|
// batch type, defends against client bugs.
|
2016-05-25 00:47:03 +00:00
|
|
|
if exist.Job.Type == structs.JobTypeBatch && exist.RanSuccessfully() {
|
|
|
|
goto IGNORE
|
2016-05-25 00:23:18 +00:00
|
|
|
}
|
2016-08-03 22:45:42 +00:00
|
|
|
|
|
|
|
if node == nil || node.TerminalStatus() {
|
|
|
|
result.lost = append(result.lost, allocTuple{
|
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
|
|
|
Alloc: exist,
|
|
|
|
})
|
|
|
|
} else {
|
|
|
|
// This is the drain case
|
|
|
|
result.migrate = append(result.migrate, allocTuple{
|
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
|
|
|
Alloc: exist,
|
|
|
|
})
|
|
|
|
}
|
2015-08-14 01:20:55 +00:00
|
|
|
continue
|
|
|
|
}
|
2015-08-13 23:25:59 +00:00
|
|
|
|
2015-08-14 01:20:55 +00:00
|
|
|
// If the definition is updated we need to update
|
2016-01-12 17:50:33 +00:00
|
|
|
if job.JobModifyIndex != exist.Job.JobModifyIndex {
|
2015-10-16 18:43:09 +00:00
|
|
|
result.update = append(result.update, allocTuple{
|
2015-08-14 01:16:32 +00:00
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
|
|
|
Alloc: exist,
|
|
|
|
})
|
2015-08-14 01:20:55 +00:00
|
|
|
continue
|
2015-08-13 23:25:59 +00:00
|
|
|
}
|
2015-08-14 01:20:55 +00:00
|
|
|
|
|
|
|
// Everything is up-to-date
|
2016-05-25 00:47:03 +00:00
|
|
|
IGNORE:
|
2015-10-16 18:43:09 +00:00
|
|
|
result.ignore = append(result.ignore, allocTuple{
|
2015-08-14 01:20:55 +00:00
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
|
|
|
Alloc: exist,
|
|
|
|
})
|
2015-08-13 23:25:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Scan the required groups
|
2015-08-14 01:16:32 +00:00
|
|
|
for name, tg := range required {
|
2015-08-13 23:25:59 +00:00
|
|
|
// Check for an existing allocation
|
|
|
|
_, ok := existing[name]
|
|
|
|
|
|
|
|
// Require a placement if no existing allocation. If there
|
|
|
|
// is an existing allocation, we would have checked for a potential
|
|
|
|
// update or ignore above.
|
|
|
|
if !ok {
|
2015-10-16 18:43:09 +00:00
|
|
|
result.place = append(result.place, allocTuple{
|
2015-08-14 01:16:32 +00:00
|
|
|
Name: name,
|
|
|
|
TaskGroup: tg,
|
2016-08-30 22:36:30 +00:00
|
|
|
Alloc: terminalAllocs[name],
|
2015-08-14 01:16:32 +00:00
|
|
|
})
|
2015-08-13 23:25:59 +00:00
|
|
|
}
|
|
|
|
}
|
2015-08-14 01:28:09 +00:00
|
|
|
return result
|
2015-08-13 23:25:59 +00:00
|
|
|
}
|
|
|
|
|
2015-10-15 20:14:44 +00:00
|
|
|
// diffSystemAllocs is like diffAllocs however, the allocations in the
|
|
|
|
// diffResult contain the specific nodeID they should be allocated on.
|
2016-08-31 21:06:31 +00:00
|
|
|
//
|
|
|
|
// job is the job whose allocs is going to be diff-ed.
|
|
|
|
// nodes is a list of nodes in ready state.
|
|
|
|
// taintedNodes is an index of the nodes which are either down or in drain mode
|
|
|
|
// by name.
|
|
|
|
// allocs is a list of non terminal allocations.
|
|
|
|
// terminalAllocs is an index of the latest terminal allocations by name.
|
2016-08-03 22:45:42 +00:00
|
|
|
func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[string]*structs.Node,
|
2016-08-30 22:36:30 +00:00
|
|
|
allocs []*structs.Allocation, terminalAllocs map[string]*structs.Allocation) *diffResult {
|
2015-10-15 20:14:44 +00:00
|
|
|
|
|
|
|
// Build a mapping of nodes to all their allocs.
|
|
|
|
nodeAllocs := make(map[string][]*structs.Allocation, len(allocs))
|
|
|
|
for _, alloc := range allocs {
|
|
|
|
nallocs := append(nodeAllocs[alloc.NodeID], alloc)
|
|
|
|
nodeAllocs[alloc.NodeID] = nallocs
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, node := range nodes {
|
|
|
|
if _, ok := nodeAllocs[node.ID]; !ok {
|
|
|
|
nodeAllocs[node.ID] = nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create the required task groups.
|
|
|
|
required := materializeTaskGroups(job)
|
|
|
|
|
|
|
|
result := &diffResult{}
|
|
|
|
for nodeID, allocs := range nodeAllocs {
|
2016-08-30 22:36:30 +00:00
|
|
|
diff := diffAllocs(job, taintedNodes, required, allocs, terminalAllocs)
|
2015-10-15 20:14:44 +00:00
|
|
|
|
2016-09-17 18:28:02 +00:00
|
|
|
// If the node is tainted there should be no placements made
|
|
|
|
if _, ok := taintedNodes[nodeID]; ok {
|
|
|
|
diff.place = nil
|
|
|
|
} else {
|
|
|
|
// Mark the alloc as being for a specific node.
|
|
|
|
for i := range diff.place {
|
|
|
|
alloc := &diff.place[i]
|
|
|
|
|
|
|
|
// If the new allocation isn't annotated with a previous allocation
|
|
|
|
// or if the previous allocation isn't from the same node then we
|
|
|
|
// annotate the allocTuple with a new Allocation
|
|
|
|
if alloc.Alloc == nil || alloc.Alloc.NodeID != nodeID {
|
|
|
|
alloc.Alloc = &structs.Allocation{NodeID: nodeID}
|
|
|
|
}
|
2016-08-30 22:36:30 +00:00
|
|
|
}
|
2015-10-15 20:14:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Migrate does not apply to system jobs and instead should be marked as
|
|
|
|
// stop because if a node is tainted, the job is invalid on that node.
|
|
|
|
diff.stop = append(diff.stop, diff.migrate...)
|
|
|
|
diff.migrate = nil
|
|
|
|
|
|
|
|
result.Append(diff)
|
|
|
|
}
|
|
|
|
|
|
|
|
return result
|
|
|
|
}
|
|
|
|
|
2016-01-04 20:07:33 +00:00
|
|
|
// readyNodesInDCs returns all the ready nodes in the given datacenters and a
|
|
|
|
// mapping of each data center to the count of ready nodes.
|
|
|
|
func readyNodesInDCs(state State, dcs []string) ([]*structs.Node, map[string]int, error) {
|
2015-08-15 20:11:42 +00:00
|
|
|
// Index the DCs
|
2016-01-04 20:07:33 +00:00
|
|
|
dcMap := make(map[string]int, len(dcs))
|
2015-08-14 00:19:09 +00:00
|
|
|
for _, dc := range dcs {
|
2016-01-04 20:07:33 +00:00
|
|
|
dcMap[dc] = 0
|
2015-08-15 20:11:42 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Scan the nodes
|
2017-02-08 04:31:23 +00:00
|
|
|
ws := memdb.NewWatchSet()
|
2015-08-15 20:11:42 +00:00
|
|
|
var out []*structs.Node
|
2017-02-08 04:31:23 +00:00
|
|
|
iter, err := state.Nodes(ws)
|
2015-08-15 20:11:42 +00:00
|
|
|
if err != nil {
|
2016-01-04 20:07:33 +00:00
|
|
|
return nil, nil, err
|
2015-08-15 20:11:42 +00:00
|
|
|
}
|
|
|
|
for {
|
|
|
|
raw := iter.Next()
|
|
|
|
if raw == nil {
|
|
|
|
break
|
2015-08-14 00:19:09 +00:00
|
|
|
}
|
2015-08-15 20:11:42 +00:00
|
|
|
|
|
|
|
// Filter on datacenter and status
|
|
|
|
node := raw.(*structs.Node)
|
|
|
|
if node.Status != structs.NodeStatusReady {
|
|
|
|
continue
|
|
|
|
}
|
2015-09-07 02:47:02 +00:00
|
|
|
if node.Drain {
|
|
|
|
continue
|
|
|
|
}
|
2015-08-15 20:11:42 +00:00
|
|
|
if _, ok := dcMap[node.Datacenter]; !ok {
|
|
|
|
continue
|
2015-08-14 00:19:09 +00:00
|
|
|
}
|
2015-08-15 20:11:42 +00:00
|
|
|
out = append(out, node)
|
2016-01-04 20:07:33 +00:00
|
|
|
dcMap[node.Datacenter] += 1
|
2015-08-14 00:19:09 +00:00
|
|
|
}
|
2016-01-04 20:07:33 +00:00
|
|
|
return out, dcMap, nil
|
2015-08-14 00:19:09 +00:00
|
|
|
}
|
2015-08-14 00:40:23 +00:00
|
|
|
|
|
|
|
// retryMax is used to retry a callback until it returns success or
|
2016-02-10 05:24:47 +00:00
|
|
|
// a maximum number of attempts is reached. An optional reset function may be
|
|
|
|
// passed which is called after each failed iteration. If the reset function is
|
|
|
|
// set and returns true, the number of attempts is reset back to max.
|
|
|
|
func retryMax(max int, cb func() (bool, error), reset func() bool) error {
|
2015-08-14 00:40:23 +00:00
|
|
|
attempts := 0
|
|
|
|
for attempts < max {
|
|
|
|
done, err := cb()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if done {
|
|
|
|
return nil
|
|
|
|
}
|
2016-02-10 05:24:47 +00:00
|
|
|
|
|
|
|
// Check if we should reset the number attempts
|
|
|
|
if reset != nil && reset() {
|
|
|
|
attempts = 0
|
|
|
|
} else {
|
|
|
|
attempts += 1
|
|
|
|
}
|
2015-08-14 00:40:23 +00:00
|
|
|
}
|
2015-08-15 21:47:13 +00:00
|
|
|
return &SetStatusError{
|
|
|
|
Err: fmt.Errorf("maximum attempts reached (%d)", max),
|
|
|
|
EvalStatus: structs.EvalStatusFailed,
|
|
|
|
}
|
2015-08-14 00:40:23 +00:00
|
|
|
}
|
2015-08-14 00:51:31 +00:00
|
|
|
|
2016-02-10 05:24:47 +00:00
|
|
|
// progressMade checks to see if the plan result made allocations or updates.
|
|
|
|
// If the result is nil, false is returned.
|
|
|
|
func progressMade(result *structs.PlanResult) bool {
|
2016-02-22 18:38:04 +00:00
|
|
|
return result != nil && (len(result.NodeUpdate) != 0 ||
|
|
|
|
len(result.NodeAllocation) != 0)
|
2016-02-10 05:24:47 +00:00
|
|
|
}
|
|
|
|
|
2015-08-14 00:51:31 +00:00
|
|
|
// taintedNodes is used to scan the allocations and then check if the
|
|
|
|
// underlying nodes are tainted, and should force a migration of the allocation.
|
2016-08-03 22:45:42 +00:00
|
|
|
// All the nodes returned in the map are tainted.
|
|
|
|
func taintedNodes(state State, allocs []*structs.Allocation) (map[string]*structs.Node, error) {
|
|
|
|
out := make(map[string]*structs.Node)
|
2015-08-14 00:51:31 +00:00
|
|
|
for _, alloc := range allocs {
|
|
|
|
if _, ok := out[alloc.NodeID]; ok {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2017-02-08 04:31:23 +00:00
|
|
|
ws := memdb.NewWatchSet()
|
|
|
|
node, err := state.NodeByID(ws, alloc.NodeID)
|
2015-08-14 00:51:31 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2015-08-14 01:05:31 +00:00
|
|
|
// If the node does not exist, we should migrate
|
|
|
|
if node == nil {
|
2016-08-03 22:45:42 +00:00
|
|
|
out[alloc.NodeID] = nil
|
2015-08-14 01:05:31 +00:00
|
|
|
continue
|
|
|
|
}
|
2016-08-03 22:45:42 +00:00
|
|
|
if structs.ShouldDrainNode(node.Status) || node.Drain {
|
|
|
|
out[alloc.NodeID] = node
|
|
|
|
}
|
2015-08-14 00:51:31 +00:00
|
|
|
}
|
|
|
|
return out, nil
|
|
|
|
}
|
2015-09-07 18:23:38 +00:00
|
|
|
|
|
|
|
// shuffleNodes randomizes the slice order with the Fisher-Yates algorithm
|
|
|
|
func shuffleNodes(nodes []*structs.Node) {
|
|
|
|
n := len(nodes)
|
|
|
|
for i := n - 1; i > 0; i-- {
|
|
|
|
j := rand.Intn(i + 1)
|
|
|
|
nodes[i], nodes[j] = nodes[j], nodes[i]
|
|
|
|
}
|
|
|
|
}
|
2015-09-07 19:25:23 +00:00
|
|
|
|
|
|
|
// tasksUpdated does a diff between task groups to see if the
|
2016-12-16 01:08:38 +00:00
|
|
|
// tasks, their drivers, environment variables or config have updated. The
|
|
|
|
// inputs are the task group name to diff and two jobs to diff.
|
|
|
|
func tasksUpdated(jobA, jobB *structs.Job, taskGroup string) bool {
|
|
|
|
a := jobA.LookupTaskGroup(taskGroup)
|
|
|
|
b := jobB.LookupTaskGroup(taskGroup)
|
|
|
|
|
2015-09-07 19:25:23 +00:00
|
|
|
// If the number of tasks do not match, clearly there is an update
|
|
|
|
if len(a.Tasks) != len(b.Tasks) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2016-09-21 21:00:02 +00:00
|
|
|
// Check ephemeral disk
|
|
|
|
if !reflect.DeepEqual(a.EphemeralDisk, b.EphemeralDisk) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2015-09-07 19:25:23 +00:00
|
|
|
// Check each task
|
|
|
|
for _, at := range a.Tasks {
|
|
|
|
bt := b.LookupTask(at.Name)
|
|
|
|
if bt == nil {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
if at.Driver != bt.Driver {
|
|
|
|
return true
|
|
|
|
}
|
2016-04-26 00:20:25 +00:00
|
|
|
if at.User != bt.User {
|
|
|
|
return true
|
|
|
|
}
|
2015-09-07 19:25:23 +00:00
|
|
|
if !reflect.DeepEqual(at.Config, bt.Config) {
|
|
|
|
return true
|
|
|
|
}
|
2015-10-23 21:52:06 +00:00
|
|
|
if !reflect.DeepEqual(at.Env, bt.Env) {
|
|
|
|
return true
|
|
|
|
}
|
2016-04-26 00:20:25 +00:00
|
|
|
if !reflect.DeepEqual(at.Artifacts, bt.Artifacts) {
|
|
|
|
return true
|
2015-10-04 19:53:02 +00:00
|
|
|
}
|
2016-09-21 18:29:50 +00:00
|
|
|
if !reflect.DeepEqual(at.Vault, bt.Vault) {
|
|
|
|
return true
|
|
|
|
}
|
2016-10-17 18:41:22 +00:00
|
|
|
if !reflect.DeepEqual(at.Templates, bt.Templates) {
|
|
|
|
return true
|
|
|
|
}
|
2016-05-06 04:32:01 +00:00
|
|
|
|
2016-12-16 01:08:38 +00:00
|
|
|
// Check the metadata
|
|
|
|
if !reflect.DeepEqual(
|
|
|
|
jobA.CombinedTaskMeta(taskGroup, at.Name),
|
|
|
|
jobB.CombinedTaskMeta(taskGroup, bt.Name)) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2016-05-06 04:32:01 +00:00
|
|
|
// Inspect the network to see if the dynamic ports are different
|
|
|
|
if len(at.Resources.Networks) != len(bt.Resources.Networks) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
for idx := range at.Resources.Networks {
|
|
|
|
an := at.Resources.Networks[idx]
|
|
|
|
bn := bt.Resources.Networks[idx]
|
|
|
|
|
|
|
|
if an.MBits != bn.MBits {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
aPorts, bPorts := networkPortMap(an), networkPortMap(bn)
|
|
|
|
if !reflect.DeepEqual(aPorts, bPorts) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Inspect the non-network resources
|
|
|
|
if ar, br := at.Resources, bt.Resources; ar.CPU != br.CPU {
|
|
|
|
return true
|
|
|
|
} else if ar.MemoryMB != br.MemoryMB {
|
|
|
|
return true
|
|
|
|
} else if ar.IOPS != br.IOPS {
|
|
|
|
return true
|
|
|
|
}
|
2015-09-07 19:25:23 +00:00
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
2015-10-15 00:26:20 +00:00
|
|
|
|
2016-05-06 04:32:01 +00:00
|
|
|
// networkPortMap takes a network resource and returns a map of port labels to
|
|
|
|
// values. The value for dynamic ports is disregarded even if it is set. This
|
|
|
|
// makes this function suitable for comparing two network resources for changes.
|
|
|
|
func networkPortMap(n *structs.NetworkResource) map[string]int {
|
|
|
|
m := make(map[string]int, len(n.DynamicPorts)+len(n.ReservedPorts))
|
|
|
|
for _, p := range n.ReservedPorts {
|
|
|
|
m[p.Label] = p.Value
|
|
|
|
}
|
|
|
|
for _, p := range n.DynamicPorts {
|
|
|
|
m[p.Label] = -1
|
|
|
|
}
|
|
|
|
return m
|
|
|
|
}
|
|
|
|
|
2015-10-15 00:26:20 +00:00
|
|
|
// setStatus is used to update the status of the evaluation
|
2016-05-27 18:26:14 +00:00
|
|
|
func setStatus(logger *log.Logger, planner Planner,
|
|
|
|
eval, nextEval, spawnedBlocked *structs.Evaluation,
|
2016-07-18 22:04:05 +00:00
|
|
|
tgMetrics map[string]*structs.AllocMetric, status, desc string,
|
|
|
|
queuedAllocs map[string]int) error {
|
2016-05-27 18:26:14 +00:00
|
|
|
|
2015-10-15 00:26:20 +00:00
|
|
|
logger.Printf("[DEBUG] sched: %#v: setting status to %s", eval, status)
|
|
|
|
newEval := eval.Copy()
|
|
|
|
newEval.Status = status
|
|
|
|
newEval.StatusDescription = desc
|
2016-05-27 18:26:14 +00:00
|
|
|
newEval.FailedTGAllocs = tgMetrics
|
2015-10-15 00:26:20 +00:00
|
|
|
if nextEval != nil {
|
|
|
|
newEval.NextEval = nextEval.ID
|
|
|
|
}
|
2016-05-19 20:09:52 +00:00
|
|
|
if spawnedBlocked != nil {
|
2016-05-25 01:12:59 +00:00
|
|
|
newEval.BlockedEval = spawnedBlocked.ID
|
2016-05-19 20:09:52 +00:00
|
|
|
}
|
2016-07-18 22:04:05 +00:00
|
|
|
if queuedAllocs != nil {
|
|
|
|
newEval.QueuedAllocations = queuedAllocs
|
|
|
|
}
|
|
|
|
|
2015-10-15 00:26:20 +00:00
|
|
|
return planner.UpdateEval(newEval)
|
|
|
|
}
|
|
|
|
|
2016-05-17 22:37:37 +00:00
|
|
|
// inplaceUpdate attempts to update allocations in-place where possible. It
|
|
|
|
// returns the allocs that couldn't be done inplace and then those that could.
|
2015-10-15 00:26:20 +00:00
|
|
|
func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job,
|
2016-05-17 22:37:37 +00:00
|
|
|
stack Stack, updates []allocTuple) (destructive, inplace []allocTuple) {
|
2015-10-15 00:26:20 +00:00
|
|
|
|
2017-03-12 01:19:22 +00:00
|
|
|
// doInplace manipulates the updates map to make the current allocation
|
|
|
|
// an inplace update.
|
|
|
|
doInplace := func(cur, last, inplaceCount *int) {
|
|
|
|
updates[*cur], updates[*last-1] = updates[*last-1], updates[*cur]
|
|
|
|
*cur--
|
|
|
|
*last--
|
|
|
|
*inplaceCount++
|
|
|
|
}
|
|
|
|
|
2017-02-08 04:31:23 +00:00
|
|
|
ws := memdb.NewWatchSet()
|
2015-10-15 00:26:20 +00:00
|
|
|
n := len(updates)
|
2016-05-17 22:37:37 +00:00
|
|
|
inplaceCount := 0
|
2015-10-15 00:26:20 +00:00
|
|
|
for i := 0; i < n; i++ {
|
|
|
|
// Get the update
|
|
|
|
update := updates[i]
|
|
|
|
|
|
|
|
// Check if the task drivers or config has changed, requires
|
|
|
|
// a rolling upgrade since that cannot be done in-place.
|
2016-12-16 01:08:38 +00:00
|
|
|
existing := update.Alloc.Job
|
|
|
|
if tasksUpdated(job, existing, update.TaskGroup.Name) {
|
2015-10-15 00:26:20 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2017-03-12 01:19:22 +00:00
|
|
|
// Terminal batch allocations are not filtered when they are completed
|
|
|
|
// successfully. We should avoid adding the allocation to the plan in
|
|
|
|
// the case that it is an in-place update to avoid both additional data
|
|
|
|
// in the plan and work for the clients.
|
|
|
|
if update.Alloc.TerminalStatus() {
|
|
|
|
doInplace(&i, &n, &inplaceCount)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2015-10-15 00:26:20 +00:00
|
|
|
// Get the existing node
|
2017-02-08 04:31:23 +00:00
|
|
|
node, err := ctx.State().NodeByID(ws, update.Alloc.NodeID)
|
2015-10-15 00:26:20 +00:00
|
|
|
if err != nil {
|
|
|
|
ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v",
|
|
|
|
eval, update.Alloc.NodeID, err)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if node == nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set the existing node as the base set
|
|
|
|
stack.SetNodes([]*structs.Node{node})
|
|
|
|
|
2015-10-16 23:35:55 +00:00
|
|
|
// Stage an eviction of the current allocation. This is done so that
|
|
|
|
// the current allocation is discounted when checking for feasability.
|
|
|
|
// Otherwise we would be trying to fit the tasks current resources and
|
|
|
|
// updated resources. After select is called we can remove the evict.
|
2015-10-15 00:26:20 +00:00
|
|
|
ctx.Plan().AppendUpdate(update.Alloc, structs.AllocDesiredStatusStop,
|
2016-08-03 22:45:42 +00:00
|
|
|
allocInPlace, "")
|
2015-10-15 00:26:20 +00:00
|
|
|
|
|
|
|
// Attempt to match the task group
|
2016-03-01 22:09:25 +00:00
|
|
|
option, _ := stack.Select(update.TaskGroup)
|
2015-10-15 00:26:20 +00:00
|
|
|
|
|
|
|
// Pop the allocation
|
|
|
|
ctx.Plan().PopUpdate(update.Alloc)
|
|
|
|
|
|
|
|
// Skip if we could not do an in-place update
|
|
|
|
if option == nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// Restore the network offers from the existing allocation.
|
|
|
|
// We do not allow network resources (reserved/dynamic ports)
|
|
|
|
// to be updated. This is guarded in taskUpdated, so we can
|
|
|
|
// safely restore those here.
|
|
|
|
for task, resources := range option.TaskResources {
|
|
|
|
existing := update.Alloc.TaskResources[task]
|
|
|
|
resources.Networks = existing.Networks
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create a shallow copy
|
|
|
|
newAlloc := new(structs.Allocation)
|
|
|
|
*newAlloc = *update.Alloc
|
|
|
|
|
|
|
|
// Update the allocation
|
|
|
|
newAlloc.EvalID = eval.ID
|
2016-03-01 22:09:25 +00:00
|
|
|
newAlloc.Job = nil // Use the Job in the Plan
|
|
|
|
newAlloc.Resources = nil // Computed in Plan Apply
|
2015-10-15 00:26:20 +00:00
|
|
|
newAlloc.TaskResources = option.TaskResources
|
|
|
|
newAlloc.Metrics = ctx.Metrics()
|
|
|
|
ctx.Plan().AppendAlloc(newAlloc)
|
|
|
|
|
|
|
|
// Remove this allocation from the slice
|
2017-03-12 01:19:22 +00:00
|
|
|
doInplace(&i, &n, &inplaceCount)
|
2015-10-15 00:26:20 +00:00
|
|
|
}
|
2017-03-12 01:19:22 +00:00
|
|
|
|
2015-10-15 00:26:20 +00:00
|
|
|
if len(updates) > 0 {
|
2016-05-17 22:37:37 +00:00
|
|
|
ctx.Logger().Printf("[DEBUG] sched: %#v: %d in-place updates of %d", eval, inplaceCount, len(updates))
|
2015-10-15 00:26:20 +00:00
|
|
|
}
|
2016-05-17 22:37:37 +00:00
|
|
|
return updates[:n], updates[n:]
|
2015-10-15 00:26:20 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// evictAndPlace is used to mark allocations for evicts and add them to the
|
2016-10-11 19:31:40 +00:00
|
|
|
// placement queue. evictAndPlace modifies both the diffResult and the
|
2015-10-15 00:26:20 +00:00
|
|
|
// limit. It returns true if the limit has been reached.
|
2015-10-16 18:43:09 +00:00
|
|
|
func evictAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool {
|
2015-10-15 00:26:20 +00:00
|
|
|
n := len(allocs)
|
|
|
|
for i := 0; i < n && i < *limit; i++ {
|
|
|
|
a := allocs[i]
|
2016-08-03 22:45:42 +00:00
|
|
|
ctx.Plan().AppendUpdate(a.Alloc, structs.AllocDesiredStatusStop, desc, "")
|
|
|
|
diff.place = append(diff.place, a)
|
|
|
|
}
|
|
|
|
if n <= *limit {
|
|
|
|
*limit -= n
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
*limit = 0
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
// markLostAndPlace is used to mark allocations as lost and add them to the
|
2016-10-11 19:31:40 +00:00
|
|
|
// placement queue. evictAndPlace modifies both the diffResult and the
|
2016-08-03 22:45:42 +00:00
|
|
|
// limit. It returns true if the limit has been reached.
|
|
|
|
func markLostAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool {
|
|
|
|
n := len(allocs)
|
|
|
|
for i := 0; i < n && i < *limit; i++ {
|
|
|
|
a := allocs[i]
|
|
|
|
ctx.Plan().AppendUpdate(a.Alloc, structs.AllocDesiredStatusStop, desc, structs.AllocClientStatusLost)
|
2015-10-15 00:26:20 +00:00
|
|
|
diff.place = append(diff.place, a)
|
|
|
|
}
|
|
|
|
if n <= *limit {
|
|
|
|
*limit -= n
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
*limit = 0
|
|
|
|
return true
|
|
|
|
}
|
2015-10-16 21:00:51 +00:00
|
|
|
|
|
|
|
// tgConstrainTuple is used to store the total constraints of a task group.
|
|
|
|
type tgConstrainTuple struct {
|
|
|
|
// Holds the combined constraints of the task group and all it's sub-tasks.
|
|
|
|
constraints []*structs.Constraint
|
|
|
|
|
|
|
|
// The set of required drivers within the task group.
|
|
|
|
drivers map[string]struct{}
|
|
|
|
|
|
|
|
// The combined resources of all tasks within the task group.
|
|
|
|
size *structs.Resources
|
|
|
|
}
|
|
|
|
|
|
|
|
// taskGroupConstraints collects the constraints, drivers and resources required by each
|
|
|
|
// sub-task to aggregate the TaskGroup totals
|
|
|
|
func taskGroupConstraints(tg *structs.TaskGroup) tgConstrainTuple {
|
|
|
|
c := tgConstrainTuple{
|
|
|
|
constraints: make([]*structs.Constraint, 0, len(tg.Constraints)),
|
|
|
|
drivers: make(map[string]struct{}),
|
2016-09-14 22:43:42 +00:00
|
|
|
size: &structs.Resources{DiskMB: tg.EphemeralDisk.SizeMB},
|
2015-10-16 21:00:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
c.constraints = append(c.constraints, tg.Constraints...)
|
|
|
|
for _, task := range tg.Tasks {
|
|
|
|
c.drivers[task.Driver] = struct{}{}
|
|
|
|
c.constraints = append(c.constraints, task.Constraints...)
|
|
|
|
c.size.Add(task.Resources)
|
|
|
|
}
|
|
|
|
|
|
|
|
return c
|
|
|
|
}
|
2016-05-05 18:21:58 +00:00
|
|
|
|
|
|
|
// desiredUpdates takes the diffResult as well as the set of inplace and
|
|
|
|
// destructive updates and returns a map of task groups to their set of desired
|
|
|
|
// updates.
|
|
|
|
func desiredUpdates(diff *diffResult, inplaceUpdates,
|
|
|
|
destructiveUpdates []allocTuple) map[string]*structs.DesiredUpdates {
|
|
|
|
desiredTgs := make(map[string]*structs.DesiredUpdates)
|
|
|
|
|
|
|
|
for _, tuple := range diff.place {
|
|
|
|
name := tuple.TaskGroup.Name
|
|
|
|
des, ok := desiredTgs[name]
|
|
|
|
if !ok {
|
|
|
|
des = &structs.DesiredUpdates{}
|
|
|
|
desiredTgs[name] = des
|
|
|
|
}
|
|
|
|
|
|
|
|
des.Place++
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, tuple := range diff.stop {
|
2016-05-13 18:53:11 +00:00
|
|
|
name := tuple.Alloc.TaskGroup
|
2016-05-05 18:21:58 +00:00
|
|
|
des, ok := desiredTgs[name]
|
|
|
|
if !ok {
|
|
|
|
des = &structs.DesiredUpdates{}
|
|
|
|
desiredTgs[name] = des
|
|
|
|
}
|
|
|
|
|
|
|
|
des.Stop++
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, tuple := range diff.ignore {
|
|
|
|
name := tuple.TaskGroup.Name
|
|
|
|
des, ok := desiredTgs[name]
|
|
|
|
if !ok {
|
|
|
|
des = &structs.DesiredUpdates{}
|
|
|
|
desiredTgs[name] = des
|
|
|
|
}
|
|
|
|
|
|
|
|
des.Ignore++
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, tuple := range diff.migrate {
|
|
|
|
name := tuple.TaskGroup.Name
|
|
|
|
des, ok := desiredTgs[name]
|
|
|
|
if !ok {
|
|
|
|
des = &structs.DesiredUpdates{}
|
|
|
|
desiredTgs[name] = des
|
|
|
|
}
|
|
|
|
|
|
|
|
des.Migrate++
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, tuple := range inplaceUpdates {
|
|
|
|
name := tuple.TaskGroup.Name
|
|
|
|
des, ok := desiredTgs[name]
|
|
|
|
if !ok {
|
|
|
|
des = &structs.DesiredUpdates{}
|
|
|
|
desiredTgs[name] = des
|
|
|
|
}
|
|
|
|
|
|
|
|
des.InPlaceUpdate++
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, tuple := range destructiveUpdates {
|
|
|
|
name := tuple.TaskGroup.Name
|
|
|
|
des, ok := desiredTgs[name]
|
|
|
|
if !ok {
|
|
|
|
des = &structs.DesiredUpdates{}
|
|
|
|
desiredTgs[name] = des
|
|
|
|
}
|
|
|
|
|
|
|
|
des.DestructiveUpdate++
|
|
|
|
}
|
|
|
|
|
|
|
|
return desiredTgs
|
|
|
|
}
|
2016-07-22 21:53:49 +00:00
|
|
|
|
|
|
|
// adjustQueuedAllocations decrements the number of allocations pending per task
|
|
|
|
// group based on the number of allocations successfully placed
|
|
|
|
func adjustQueuedAllocations(logger *log.Logger, result *structs.PlanResult, queuedAllocs map[string]int) {
|
|
|
|
if result != nil {
|
|
|
|
for _, allocations := range result.NodeAllocation {
|
|
|
|
for _, allocation := range allocations {
|
2017-01-07 21:41:09 +00:00
|
|
|
// Ensure that the allocation is newly created. We check that
|
|
|
|
// the CreateIndex is equal to the ModifyIndex in order to check
|
|
|
|
// that the allocation was just created. We do not check that
|
|
|
|
// the CreateIndex is equal to the results AllocIndex because
|
|
|
|
// the allocations we get back have gone through the planner's
|
|
|
|
// optimistic snapshot and thus their indexes may not be
|
|
|
|
// correct, but they will be consistent.
|
|
|
|
if allocation.CreateIndex != allocation.ModifyIndex {
|
2016-07-22 21:53:49 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
if _, ok := queuedAllocs[allocation.TaskGroup]; ok {
|
|
|
|
queuedAllocs[allocation.TaskGroup] -= 1
|
|
|
|
} else {
|
|
|
|
logger.Printf("[ERR] sched: allocation %q placed but not in list of unplaced allocations", allocation.TaskGroup)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2016-08-09 20:11:58 +00:00
|
|
|
|
|
|
|
// updateNonTerminalAllocsToLost updates the allocations which are in pending/running state on tainted node
|
|
|
|
// to lost
|
|
|
|
func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*structs.Node, allocs []*structs.Allocation) {
|
|
|
|
for _, alloc := range allocs {
|
|
|
|
if _, ok := tainted[alloc.NodeID]; ok &&
|
|
|
|
alloc.DesiredStatus == structs.AllocDesiredStatusStop &&
|
|
|
|
(alloc.ClientStatus == structs.AllocClientStatusRunning ||
|
|
|
|
alloc.ClientStatus == structs.AllocClientStatusPending) {
|
|
|
|
plan.AppendUpdate(alloc, structs.AllocDesiredStatusStop, allocLost, structs.AllocClientStatusLost)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|