open-nomad/scheduler/stack.go

package scheduler

import (
	"math"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// serviceJobAntiAffinityPenalty is the penalty applied
	// to the score for placing an alloc on a node that
	// already has an alloc for this job.
	serviceJobAntiAffinityPenalty = 10.0

	// batchJobAntiAffinityPenalty is the same as the
	// serviceJobAntiAffinityPenalty but for batch type jobs.
	batchJobAntiAffinityPenalty = 5.0
)

// Stack is a chained collection of iterators. The stack is used to
// make placement decisions. Different schedulers may customize the
// stack they use to vary the way placements are made.
type Stack interface {
	// SetNodes is used to set the base set of potential nodes
	SetNodes([]*structs.Node)

	// SetJob is used to set the job for selection
	SetJob(job *structs.Job)

	// Select is used to select a node for the task group
	Select(tg *structs.TaskGroup) (*RankedNode, *structs.Resources)
}
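
// A typical caller drives a Stack roughly as follows; this is a minimal
// sketch, assuming the Context, node list, and job come from the scheduler's
// evaluation state rather than from this package:
//
//	stack := NewGenericStack(false, ctx, nodes) // batch=false: service scheduler
//	stack.SetJob(job)
//	for _, tg := range job.TaskGroups {
//		option, size := stack.Select(tg)
//		// option is nil when no feasible node remains; size is the
//		// aggregate resource request used for the placement.
//	}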

// GenericStack is the Stack used for the Generic scheduler. It is
// designed to make better placement decisions at the cost of performance.
type GenericStack struct {
	batch  bool
	ctx    Context
	source *StaticIterator

	jobConstraint       *ConstraintIterator
	taskGroupDrivers    *DriverIterator
	taskGroupConstraint *ConstraintIterator
	binPack             *BinPackIterator
	jobAntiAff          *JobAntiAffinityIterator
	maxScore            *MaxScoreIterator
}

// NewGenericStack constructs a stack used for selecting service placements
func NewGenericStack(batch bool, ctx Context, baseNodes []*structs.Node) *GenericStack {
	// Create a new stack
	stack := &GenericStack{
		batch: batch,
		ctx:   ctx,
	}

	// Create the source iterator. We randomize the order we visit nodes
	// to reduce collisions between schedulers and to do a basic load
	// balancing across eligible nodes.
	stack.source = NewRandomIterator(ctx, baseNodes)

	// Attach the job constraints. The job is filled in later.
	stack.jobConstraint = NewConstraintIterator(ctx, stack.source, nil)

	// Filter on task group drivers first as they are faster
	stack.taskGroupDrivers = NewDriverIterator(ctx, stack.jobConstraint, nil)

	// Filter on task group constraints second
	stack.taskGroupConstraint = NewConstraintIterator(ctx, stack.taskGroupDrivers, nil)

	// Upgrade from feasible to rank iterator
	rankSource := NewFeasibleRankIterator(ctx, stack.taskGroupConstraint)

	// Apply the bin packing; this depends on the resources needed
	// by a particular task group. Only enable eviction for the service
	// scheduler as that logic is expensive.
	evict := !batch
	stack.binPack = NewBinPackIterator(ctx, rankSource, nil, evict, 0)

	// Apply the job anti-affinity iterator. This is to avoid placing
	// multiple allocations on the same node for this job. The penalty
	// is less for batch jobs as it matters less.
	penalty := serviceJobAntiAffinityPenalty
	if batch {
		penalty = batchJobAntiAffinityPenalty
	}
	stack.jobAntiAff = NewJobAntiAffinityIterator(ctx, stack.binPack, penalty, "")

	// Apply a limit function. This is to avoid scanning *every* possible node.
	// For batch jobs we only need to evaluate 2 options and depend on the
	// power of two choices. For service jobs we need to visit "enough".
	// Using a log of the total number of nodes is a good restriction, with
	// at least 2 as the floor.
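	// For example, with 1000 eligible nodes the service limit works out to
	// ceil(log2(1000)) = 10 candidate nodes per selection.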
	limit := 2
	if n := len(baseNodes); !batch && n > 0 {
		logLimit := int(math.Ceil(math.Log2(float64(n))))
		if logLimit > limit {
			limit = logLimit
		}
	}
	limitIter := NewLimitIterator(ctx, stack.jobAntiAff, limit)

	// Select the node with the maximum score for placement
	stack.maxScore = NewMaxScoreIterator(ctx, limitIter)
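
	// The resulting chain, from source to sink, is:
	//   random source -> job constraints -> task group drivers ->
	//   task group constraints -> feasible-to-rank -> bin packing ->
	//   job anti-affinity -> limit -> max score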

	return stack
}

func (s *GenericStack) SetNodes(baseNodes []*structs.Node) {
	// Shuffle base nodes
	shuffleNodes(baseNodes)

	// Update the set of base nodes
	s.source.SetNodes(baseNodes)
}

func (s *GenericStack) SetJob(job *structs.Job) {
	s.jobConstraint.SetConstraints(job.Constraints)
	s.binPack.SetPriority(job.Priority)
	s.jobAntiAff.SetJob(job.ID)
}

func (s *GenericStack) Select(tg *structs.TaskGroup) (*RankedNode, *structs.Resources) {
	// Reset the max selector and context
	s.maxScore.Reset()
	s.ctx.Reset()
	start := time.Now()

	// Collect the constraints, drivers and resources required by each
	// sub-task to aggregate the TaskGroup totals
	constr := make([]*structs.Constraint, 0, len(tg.Constraints))
	drivers := make(map[string]struct{})
	size := new(structs.Resources)
	constr = append(constr, tg.Constraints...)
	for _, task := range tg.Tasks {
		drivers[task.Driver] = struct{}{}
		constr = append(constr, task.Constraints...)
		size.Add(task.Resources)
	}
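
	// As a rough example, a group with one task asking for 500 MHz / 256 MB
	// and another asking for 250 MHz / 128 MB is bin packed as a single
	// 750 MHz / 384 MB request (assuming Resources.Add sums each field).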

	// Update the parameters of iterators
	s.taskGroupDrivers.SetDrivers(drivers)
	s.taskGroupConstraint.SetConstraints(constr)
	s.binPack.SetResources(size)

	// Find the node with the max score
	option := s.maxScore.Next()

	// Store the compute time
	s.ctx.Metrics().AllocationTime = time.Since(start)
	return option, size
}