open-nomad/scheduler/stack.go

package scheduler

import (
	"math"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// serviceJobAntiAffinityPenalty is the penalty applied
	// to the score for placing an alloc on a node that
	// already has an alloc for this job.
	serviceJobAntiAffinityPenalty = 10.0

	// batchJobAntiAffinityPenalty is the same as the
	// serviceJobAntiAffinityPenalty but for batch type jobs.
	batchJobAntiAffinityPenalty = 5.0
)

// Stack is a chained collection of iterators. The stack is used to
// make placement decisions. Different schedulers may customize the
// stack they use to vary the way placements are made.
type Stack interface {
	// SetNodes is used to set the base set of potential nodes
	SetNodes([]*structs.Node)

	// SetJob is used to set the job for selection
	SetJob(job *structs.Job)

	// Select is used to select a node for the task group
	Select(tg *structs.TaskGroup) (*RankedNode, *structs.Resources)
}
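
// A typical caller drives a Stack roughly as follows; this is a minimal
// sketch, assuming the Context, node list, and job come from the scheduler's
// evaluation state rather than from this package:
//
//	stack := NewGenericStack(false, ctx, nodes) // batch=false: service scheduler
//	stack.SetJob(job)
//	for _, tg := range job.TaskGroups {
//		option, size := stack.Select(tg)
//		// option is nil when no feasible node remains; size is the
//		// aggregate resource request used for the placement.
//	}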

// GenericStack is the Stack used for the Generic scheduler. It is
// designed to make better placement decisions at the cost of performance.
type GenericStack struct {
	batch  bool
	ctx    Context
	source *StaticIterator

	jobConstraint       *ConstraintIterator
	taskGroupDrivers    *DriverIterator
	taskGroupConstraint *ConstraintIterator
	binPack             *BinPackIterator
	jobAntiAff          *JobAntiAffinityIterator
	maxScore            *MaxScoreIterator
}

// NewGenericStack constructs a stack used for selecting service placements
func NewGenericStack(batch bool, ctx Context, baseNodes []*structs.Node) *GenericStack {
	// Create a new stack
	stack := &GenericStack{
		batch: batch,
		ctx:   ctx,
	}

	// Create the source iterator. We randomize the order we visit nodes
	// to reduce collisions between schedulers and to do a basic load
	// balancing across eligible nodes.
	stack.source = NewRandomIterator(ctx, baseNodes)

	// Attach the job constraints. The job is filled in later.
	stack.jobConstraint = NewConstraintIterator(ctx, stack.source, nil)

	// Filter on task group drivers first as they are faster
	stack.taskGroupDrivers = NewDriverIterator(ctx, stack.jobConstraint, nil)

	// Filter on task group constraints second
	stack.taskGroupConstraint = NewConstraintIterator(ctx, stack.taskGroupDrivers, nil)

	// Upgrade from feasible to rank iterator
	rankSource := NewFeasibleRankIterator(ctx, stack.taskGroupConstraint)

	// Apply the bin packing; this depends on the resources needed
	// by a particular task group. Only enable eviction for the service
	// scheduler as that logic is expensive.
	evict := !batch
	stack.binPack = NewBinPackIterator(ctx, rankSource, nil, evict, 0)

	// Apply the job anti-affinity iterator. This is to avoid placing
	// multiple allocations on the same node for this job. The penalty
	// is less for batch jobs as it matters less.
	penalty := serviceJobAntiAffinityPenalty
	if batch {
		penalty = batchJobAntiAffinityPenalty
	}
	stack.jobAntiAff = NewJobAntiAffinityIterator(ctx, stack.binPack, penalty, "")

	// Apply a limit function. This is to avoid scanning *every* possible node.
	// For batch jobs we only need to evaluate 2 options and depend on the
	// power of two choices. For service jobs we need to visit "enough".
	// Using a log of the total number of nodes is a good restriction, with
	// at least 2 as the floor.
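	// For example, with 1000 eligible nodes the service limit works out to
	// ceil(log2(1000)) = 10 candidate nodes per selection.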
	limit := 2
	if n := len(baseNodes); !batch && n > 0 {
		logLimit := int(math.Ceil(math.Log2(float64(n))))
		if logLimit > limit {
			limit = logLimit
		}
	}
	limitIter := NewLimitIterator(ctx, stack.jobAntiAff, limit)

	// Select the node with the maximum score for placement
	stack.maxScore = NewMaxScoreIterator(ctx, limitIter)
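
	// The resulting chain, from source to sink, is:
	//   random source -> job constraints -> task group drivers ->
	//   task group constraints -> feasible-to-rank -> bin packing ->
	//   job anti-affinity -> limit -> max score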

	return stack
}

func (s *GenericStack) SetNodes(baseNodes []*structs.Node) {
	// Shuffle base nodes
	shuffleNodes(baseNodes)

	// Update the set of base nodes
	s.source.SetNodes(baseNodes)
}

func (s *GenericStack) SetJob(job *structs.Job) {
	s.jobConstraint.SetConstraints(job.Constraints)
	s.binPack.SetPriority(job.Priority)
	s.jobAntiAff.SetJob(job.ID)
}

func (s *GenericStack) Select(tg *structs.TaskGroup) (*RankedNode, *structs.Resources) {
	// Reset the max selector and context
	s.maxScore.Reset()
	s.ctx.Reset()
	start := time.Now()

	// Collect the constraints, drivers and resources required by each
	// sub-task to aggregate the TaskGroup totals
	constr := make([]*structs.Constraint, 0, len(tg.Constraints))
	drivers := make(map[string]struct{})
	size := new(structs.Resources)
	constr = append(constr, tg.Constraints...)
	for _, task := range tg.Tasks {
		drivers[task.Driver] = struct{}{}
		constr = append(constr, task.Constraints...)
		size.Add(task.Resources)
	}
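
	// As a rough example, a group with one task asking for 500 MHz / 256 MB
	// and another asking for 250 MHz / 128 MB is bin packed as a single
	// 750 MHz / 384 MB request (assuming Resources.Add sums each field).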

	// Update the parameters of iterators
	s.taskGroupDrivers.SetDrivers(drivers)
	s.taskGroupConstraint.SetConstraints(constr)
	s.binPack.SetResources(size)

	// Find the node with the max score
	option := s.maxScore.Next()

	// Store the compute time
	s.ctx.Metrics().AllocationTime = time.Since(start)
	return option, size
}