open-nomad/scheduler/rank.go

package scheduler

import (
	"fmt"

	"github.com/hashicorp/nomad/nomad/structs"
)

// RankedNode is used to provide a score and various ranking metadata
// along with a node when iterating. This state can be modified as
// various rank iterators are applied.
type RankedNode struct {
	// Node is the candidate being ranked. Score is the running total that
	// rank iterators add their contributions to. TaskResources holds the
	// resources recorded per task, keyed by task name (see SetTaskResources).
	Node          *structs.Node
	Score         float64
	TaskResources map[string]*structs.Resources

	// Proposed is used to cache the proposed allocations on the
	// node. This can be shared between iterators that require it.
	// It is filled in on first use by ProposedAllocs.
	Proposed []*structs.Allocation
}

// GoString renders the ranked node as its node ID together with its
// current score, printed to three decimal places.
func (r *RankedNode) GoString() string {
	const layout = "<Node: %s Score: %0.3f>"
	return fmt.Sprintf(layout, r.Node.ID, r.Score)
}

// ProposedAllocs returns the proposed allocations for this node. The
// result is looked up through the context by node ID the first time and
// cached in r.Proposed; while the cache is non-nil it is returned without
// consulting the context again. A lookup error is returned as-is and
// nothing is cached.
func (r *RankedNode) ProposedAllocs(ctx Context) ([]*structs.Allocation, error) {
	if r.Proposed == nil {
		allocs, err := ctx.ProposedAllocs(r.Node.ID)
		if err != nil {
			return nil, err
		}
		r.Proposed = allocs
	}
	return r.Proposed, nil
}

// SetTaskResources records the resources for the given task on the
// ranked node, keyed by the task's name. The backing map is created on
// first use, and an existing entry for the same task name is replaced.
func (r *RankedNode) SetTaskResources(task *structs.Task,
	resource *structs.Resources) {
	byTask := r.TaskResources
	if byTask == nil {
		byTask = make(map[string]*structs.Resources)
		r.TaskResources = byTask
	}
	byTask[task.Name] = resource
}

// RankIterator is used to iteratively yield nodes along
// with ranking metadata. The iterators may manage some state for
// performance optimizations.
type RankIterator interface {
	// Next yields a ranked option or nil if exhausted
	Next() *RankedNode

	// Reset is invoked when an allocation has been placed
	// to reset any stale state.
	Reset()
}

// FeasibleRankIterator is used to consume from a FeasibleIterator
// and return each node wrapped in a RankedNode that carries no score
// or other ranking metadata yet.
type FeasibleRankIterator struct {
	// ctx is the scheduling context supplied at construction.
	ctx    Context
	// source is the feasible iterator that nodes are pulled from.
	source FeasibleIterator
}

// NewFeasibleRankIterator builds a FeasibleRankIterator that draws its
// nodes from the given FeasibleIterator source.
func NewFeasibleRankIterator(ctx Context, source FeasibleIterator) *FeasibleRankIterator {
	return &FeasibleRankIterator{
		ctx:    ctx,
		source: source,
	}
}

// Next pulls the next node from the source and wraps it in a fresh
// RankedNode with only the Node field set. It returns nil once the
// source yields nil.
func (iter *FeasibleRankIterator) Next() *RankedNode {
	if node := iter.source.Next(); node != nil {
		return &RankedNode{Node: node}
	}
	return nil
}

// Reset resets the underlying source iterator; this iterator keeps no
// state of its own to clear.
func (iter *FeasibleRankIterator) Reset() {
	iter.source.Reset()
}

// StaticRankIterator is a RankIterator that returns a static set of results.
// This is largely only useful for testing.
type StaticRankIterator struct {
	// ctx is the scheduling context supplied at construction.
	ctx    Context
	// nodes is the fixed set of results to yield.
	nodes  []*RankedNode
	// offset is the index into nodes of the next result; Next wraps it
	// back to zero, and Reset leaves it unchanged.
	offset int
	// seen counts the results yielded since the last Reset; Next returns
	// nil once it reaches len(nodes).
	seen   int
}

// NewStaticRankIterator creates a StaticRankIterator that yields the given
// nodes, starting from the first one.
func NewStaticRankIterator(ctx Context, nodes []*RankedNode) *StaticRankIterator {
	return &StaticRankIterator{
		ctx:   ctx,
		nodes: nodes,
	}
}

// Next yields the node at the current offset and advances. At most
// len(nodes) results are produced between calls to Reset; after that Next
// returns nil. Because Reset leaves the offset alone, the offset wraps back
// to the first node when it runs off the end while results are still owed.
func (iter *StaticRankIterator) Next() *RankedNode {
	total := len(iter.nodes)

	// Every node has been handed out since the last Reset.
	if iter.seen == total {
		return nil
	}

	// Ran off the end but results are still owed: wrap around.
	if iter.offset == total {
		iter.offset = 0
	}

	node := iter.nodes[iter.offset]
	iter.offset++
	iter.seen++
	return node
}

// Reset clears the count of results yielded so that Next will again
// produce up to len(nodes) results. The offset is left unchanged, so
// iteration resumes from where it stopped.
func (iter *StaticRankIterator) Reset() {
	iter.seen = 0
}

// BinPackIterator is a RankIterator that scores potential options
// based on a bin-packing algorithm.
type BinPackIterator struct {
	// ctx supplies proposed allocations, logging and metrics.
	ctx       Context
	// source is the upstream iterator that options are pulled from.
	source    RankIterator
	// evict and priority are stored for eviction support; Next does not
	// consult them at present (see the XXX note in Next).
	evict     bool
	priority  int
	// taskGroup is the task group being placed. It is not set by the
	// constructor and must be provided through SetTaskGroup.
	taskGroup *structs.TaskGroup
}

// NewBinPackIterator creates a BinPackIterator over the given source that
// tries to fit tasks, potentially evicting other tasks based on the given
// priority. The task group to place is left unset here and is provided
// separately through SetTaskGroup.
func NewBinPackIterator(ctx Context, source RankIterator, evict bool, priority int) *BinPackIterator {
	return &BinPackIterator{
		ctx:      ctx,
		source:   source,
		evict:    evict,
		priority: priority,
	}
}

// SetPriority replaces the priority stored on the iterator.
func (iter *BinPackIterator) SetPriority(p int) {
	iter.priority = p
}

// SetTaskGroup sets the task group whose tasks and ephemeral disk size
// Next tries to fit onto each option.
func (iter *BinPackIterator) SetTaskGroup(taskGroup *structs.TaskGroup) {
	iter.taskGroup = taskGroup
}

// Next returns the next option from the source onto which the task group
// fits, with its bin-packing fitness added to the option's Score, or nil
// once the source is exhausted. Options that cannot satisfy a task's
// network ask, or that fail the overall fit check, are recorded as
// exhausted in the context metrics and skipped.
//
// The task group must have been provided through SetTaskGroup before Next
// is called: iter.taskGroup is dereferenced unconditionally below.
func (iter *BinPackIterator) Next() *RankedNode {
OUTER:
	for {
		// Get the next potential option
		option := iter.source.Next()
		if option == nil {
			return nil
		}

		// Get the proposed allocations; on failure log and move on to the
		// next option rather than aborting the iteration.
		proposed, err := option.ProposedAllocs(iter.ctx)
		if err != nil {
			iter.ctx.Logger().Printf(
				"[ERR] sched.binpack: failed to get proposed allocations: %v",
				err)
			continue
		}

		// Index the existing network usage. The index is released on every
		// path below that is reached after this point.
		netIdx := structs.NewNetworkIndex()
		netIdx.SetNode(option.Node)
		netIdx.AddAllocs(proposed)

		// Assign the resources for each task. The running total starts with
		// the task group's ephemeral disk size as its disk requirement.
		total := &structs.Resources{
			DiskMB: iter.taskGroup.EphemeralDisk.SizeMB,
		}
		for _, task := range iter.taskGroup.Tasks {
			// Work on a copy so the task's own resources are not modified.
			taskResources := task.Resources.Copy()

			// Check if we need a network resource. Only the first network
			// entry of the task is used as the ask.
			if len(taskResources.Networks) > 0 {
				ask := taskResources.Networks[0]
				offer, err := netIdx.AssignNetwork(ask)
				if offer == nil {
					// No offer could be made: record why, release the
					// index and try the next option.
					iter.ctx.Metrics().ExhaustedNode(option.Node,
						fmt.Sprintf("network: %s", err))
					netIdx.Release()
					continue OUTER
				}

				// Reserve this to prevent another task from colliding
				netIdx.AddReserved(offer)

				// Update the network ask to the offer
				taskResources.Networks = []*structs.NetworkResource{offer}
			}

			// Store the task resource
			option.SetTaskResources(task, taskResources)

			// Accumulate the total resource requirement
			total.Add(taskResources)
		}

		// Add the resources we are trying to fit, as a placeholder
		// allocation carrying only the accumulated total.
		proposed = append(proposed, &structs.Allocation{Resources: total})

		// Check if these allocations fit, if they do not, simply skip this node.
		// NOTE(review): the fourth result of AllocsFit is discarded here.
		fit, dim, util, _ := structs.AllocsFit(option.Node, proposed, netIdx)
		netIdx.Release()
		if !fit {
			iter.ctx.Metrics().ExhaustedNode(option.Node, dim)
			continue
		}

		// XXX: For now we completely ignore evictions. We should use that flag
		// to determine if its possible to evict other lower priority allocations
		// to make room. This explodes the search space, so it must be done
		// carefully.

		// Score the fit normally otherwise; the fitness is added to any
		// score the option already carries.
		fitness := structs.ScoreFit(option.Node, util)
		option.Score += fitness
		iter.ctx.Metrics().ScoreNode(option.Node, "binpack", fitness)
		return option
	}
}

// Reset resets the underlying source iterator; the stored priority and
// task group are left as they are.
func (iter *BinPackIterator) Reset() {
	iter.source.Reset()
}

// JobAntiAffinityIterator is used to apply an anti-affinity to allocating
// alongside other allocations from this job. This is used to help distribute
// load across the cluster.
type JobAntiAffinityIterator struct {
	// ctx supplies proposed allocations, logging and metrics.
	ctx     Context
	// source is the upstream iterator that options are pulled from.
	source  RankIterator
	// penalty is subtracted from an option's score once for every
	// proposed allocation on the node that belongs to jobID.
	penalty float64
	// jobID identifies the job whose co-located allocations are penalized.
	jobID   string
}

// NewJobAntiAffinityIterator creates a JobAntiAffinityIterator over the
// given source that applies the given penalty for co-placement with
// allocations from the job identified by jobID.
func NewJobAntiAffinityIterator(ctx Context, source RankIterator, penalty float64, jobID string) *JobAntiAffinityIterator {
	return &JobAntiAffinityIterator{
		ctx:     ctx,
		source:  source,
		penalty: penalty,
		jobID:   jobID,
	}
}

// SetJob replaces the ID of the job whose allocations are penalized.
func (iter *JobAntiAffinityIterator) SetJob(jobID string) {
	iter.jobID = jobID
}

// Next yields the next option from the source, lowering its score by the
// configured penalty once for every proposed allocation on the node that
// belongs to this iterator's job. Options whose proposed allocations
// cannot be determined are logged and skipped. It returns nil when the
// source is exhausted.
func (iter *JobAntiAffinityIterator) Next() *RankedNode {
	for option := iter.source.Next(); option != nil; option = iter.source.Next() {
		// Fetch the allocations proposed for this node.
		allocs, err := option.ProposedAllocs(iter.ctx)
		if err != nil {
			iter.ctx.Logger().Printf(
				"[ERR] sched.job-anti-aff: failed to get proposed allocations: %v",
				err)
			continue
		}

		// Count how many of them come from the same job.
		var collisions int
		for _, existing := range allocs {
			if existing.JobID == iter.jobID {
				collisions++
			}
		}

		// No co-located allocations: the score is left untouched and no
		// metric is recorded.
		if collisions == 0 {
			return option
		}

		// The penalty is recorded as a negative score contribution.
		scorePenalty := -1 * float64(collisions) * iter.penalty
		option.Score += scorePenalty
		iter.ctx.Metrics().ScoreNode(option.Node, "job-anti-affinity", scorePenalty)
		return option
	}
	return nil
}

// Reset resets the underlying source iterator; the penalty and job ID
// are left as they are.
func (iter *JobAntiAffinityIterator) Reset() {
	iter.source.Reset()
}
scheduler: adding various iterators 2015-08-12 01:27:54 +00:00			`package scheduler`

scheduler: testing bin pack scoring 2015-08-13 20:08:15 +00:00			`import (`
			`"fmt"`

			`"github.com/hashicorp/nomad/nomad/structs"`
			`)`
scheduler: adding various iterators 2015-08-12 01:27:54 +00:00
			`// Rank is used to provide a score and various ranking metadata`
			`// along with a node when iterating. This state can be modified as`
			`// various rank methods are applied.`
			`type RankedNode struct {`
Ensuring resources are re-calculated properly in fsm 2016-08-27 03:08:03 +00:00			`Node *structs.Node`
			`Score float64`
			`TaskResources map[string]*structs.Resources`
scheduler: move proposed alloc logic to Context 2015-08-16 17:28:58 +00:00
			`// Allocs is used to cache the proposed allocations on the`
			`// node. This can be shared between iterators that require it.`
			`Proposed []*structs.Allocation`
scheduler: adding various iterators 2015-08-12 01:27:54 +00:00			`}`

scheduler: testing bin pack scoring 2015-08-13 20:08:15 +00:00			`func (r *RankedNode) GoString() string {`
			`return fmt.Sprintf("<Node: %s Score: %0.3f>", r.Node.ID, r.Score)`
			`}`

scheduler: refactor shared logic 2015-09-07 23:19:21 +00:00			`func (r RankedNode) ProposedAllocs(ctx Context) ([]structs.Allocation, error) {`
			`if r.Proposed != nil {`
			`return r.Proposed, nil`
			`}`

			`p, err := ctx.ProposedAllocs(r.Node.ID)`
			`if err != nil {`
			`return nil, err`
			`}`
			`r.Proposed = p`
			`return p, nil`
			`}`

scheduler: binpacker makes network offers 2015-09-13 21:31:32 +00:00			`func (r RankedNode) SetTaskResources(task structs.Task,`
			`resource *structs.Resources) {`
			`if r.TaskResources == nil {`
			`r.TaskResources = make(map[string]*structs.Resources)`
			`}`
			`r.TaskResources[task.Name] = resource`
			`}`

scheduler: adding various iterators 2015-08-12 01:27:54 +00:00			`// RankFeasibleIterator is used to iteratively yield nodes along`
			`// with ranking metadata. The iterators may manage some state for`
			`// performance optimizations.`
			`type RankIterator interface {`
scheduler: support iterator reset 2015-08-13 22:01:02 +00:00			`// Next yields a ranked option or nil if exhausted`
scheduler: adding various iterators 2015-08-12 01:27:54 +00:00			`Next() *RankedNode`
scheduler: support iterator reset 2015-08-13 22:01:02 +00:00
			`// Reset is invoked when an allocation has been placed`
			`// to reset any stale state.`
			`Reset()`
scheduler: adding various iterators 2015-08-12 01:27:54 +00:00			`}`

			`// FeasibleRankIterator is used to consume from a FeasibleIterator`
			`// and return an unranked node with base ranking.`
			`type FeasibleRankIterator struct {`
scheduler: adding static rank iterator 2015-08-12 01:30:45 +00:00			`ctx Context`
scheduler: adding various iterators 2015-08-12 01:27:54 +00:00			`source FeasibleIterator`
			`}`

			`// NewFeasibleRankIterator is used to return a new FeasibleRankIterator`
			`// from a FeasibleIterator source.`
			`func NewFeasibleRankIterator(ctx Context, source FeasibleIterator) *FeasibleRankIterator {`
			`iter := &FeasibleRankIterator{`
scheduler: adding static rank iterator 2015-08-12 01:30:45 +00:00			`ctx: ctx,`
scheduler: adding various iterators 2015-08-12 01:27:54 +00:00			`source: source,`
			`}`
			`return iter`
			`}`

			`func (iter FeasibleRankIterator) Next() RankedNode {`
			`option := iter.source.Next()`
scheduler: testing more iterators 2015-08-13 17:13:11 +00:00			`if option == nil {`
			`return nil`
			`}`
scheduler: adding various iterators 2015-08-12 01:27:54 +00:00			`ranked := &RankedNode{`
			`Node: option,`
			`}`
			`return ranked`
			`}`

scheduler: support iterator reset 2015-08-13 22:01:02 +00:00			`func (iter *FeasibleRankIterator) Reset() {`
			`iter.source.Reset()`
			`}`

scheduler: adding static rank iterator 2015-08-12 01:30:45 +00:00			`// StaticRankIterator is a RankIterator that returns a static set of results.`
			`// This is largely only useful for testing.`
			`type StaticRankIterator struct {`
			`ctx Context`
			`nodes []*RankedNode`
			`offset int`
scheduler: support iterator reset 2015-08-13 22:01:02 +00:00			`seen int`
scheduler: adding static rank iterator 2015-08-12 01:30:45 +00:00			`}`

scheduler: testing select iterators 2015-08-13 17:05:54 +00:00			`// NewStaticRankIterator returns a new static rank iterator over the given nodes`
			`func NewStaticRankIterator(ctx Context, nodes []RankedNode) StaticRankIterator {`
			`iter := &StaticRankIterator{`
			`ctx: ctx,`
			`nodes: nodes,`
			`}`
			`return iter`
			`}`

scheduler: adding static rank iterator 2015-08-12 01:30:45 +00:00			`func (iter StaticRankIterator) Next() RankedNode {`
			`// Check if exhausted`
scheduler: support iterator reset 2015-08-13 22:01:02 +00:00			`n := len(iter.nodes)`
			`if iter.offset == n \|\| iter.seen == n {`
			`if iter.seen != n {`
			`iter.offset = 0`
			`} else {`
			`return nil`
			`}`
scheduler: adding static rank iterator 2015-08-12 01:30:45 +00:00			`}`

			`// Return the next offset`
			`offset := iter.offset`
			`iter.offset += 1`
scheduler: support iterator reset 2015-08-13 22:01:02 +00:00			`iter.seen += 1`
scheduler: adding static rank iterator 2015-08-12 01:30:45 +00:00			`return iter.nodes[offset]`
			`}`

scheduler: support iterator reset 2015-08-13 22:01:02 +00:00			`func (iter *StaticRankIterator) Reset() {`
			`iter.seen = 0`
			`}`

scheduler: adding various iterators 2015-08-12 01:27:54 +00:00			`// BinPackIterator is a RankIterator that scores potential options`
			`// based on a bin-packing algorithm.`
			`type BinPackIterator struct {`
Making the scheduler use LocalDisk instead of Resources.DiskMB 2016-08-25 17:27:19 +00:00			`ctx Context`
			`source RankIterator`
			`evict bool`
			`priority int`
			`taskGroup *structs.TaskGroup`
scheduler: adding various iterators 2015-08-12 01:27:54 +00:00			`}`

scheduler: binpacker makes network offers 2015-09-13 21:31:32 +00:00			`// NewBinPackIterator returns a BinPackIterator which tries to fit tasks`
			`// potentially evicting other tasks based on a given priority.`
			`func NewBinPackIterator(ctx Context, source RankIterator, evict bool, priority int) *BinPackIterator {`
scheduler: adding various iterators 2015-08-12 01:27:54 +00:00			`iter := &BinPackIterator{`
scheduler: binpacker makes network offers 2015-09-13 21:31:32 +00:00			`ctx: ctx,`
			`source: source,`
			`evict: evict,`
			`priority: priority,`
scheduler: adding various iterators 2015-08-12 01:27:54 +00:00			`}`
			`return iter`
			`}`

scheduler: refactor stack out 2015-08-14 00:48:26 +00:00			`func (iter *BinPackIterator) SetPriority(p int) {`
			`iter.priority = p`
			`}`

Making the scheduler use LocalDisk instead of Resources.DiskMB 2016-08-25 17:27:19 +00:00			`func (iter BinPackIterator) SetTaskGroup(taskGroup structs.TaskGroup) {`
			`iter.taskGroup = taskGroup`
scheduler: binpacker makes network offers 2015-09-13 21:31:32 +00:00			`}`

scheduler: adding various iterators 2015-08-12 01:27:54 +00:00			`func (iter BinPackIterator) Next() RankedNode {`
scheduler: binpacker makes network offers 2015-09-13 21:31:32 +00:00			`OUTER:`
scheduler: adding various iterators 2015-08-12 01:27:54 +00:00			`for {`
scheduler: working on bin pack 2015-08-13 18:54:59 +00:00			`// Get the next potential option`
scheduler: adding various iterators 2015-08-12 01:27:54 +00:00			`option := iter.source.Next()`
			`if option == nil {`
			`return nil`
			`}`
scheduler: working on bin pack 2015-08-13 18:54:59 +00:00
scheduler: move proposed alloc logic to Context 2015-08-16 17:28:58 +00:00			`// Get the proposed allocations`
scheduler: refactor shared logic 2015-09-07 23:19:21 +00:00			`proposed, err := option.ProposedAllocs(iter.ctx)`
			`if err != nil {`
			`iter.ctx.Logger().Printf(`
			`"[ERR] sched.binpack: failed to get proposed allocations: %v",`
			`err)`
			`continue`
scheduler: working on bin pack 2015-08-13 18:54:59 +00:00			`}`
schedueler: adding best fit scoring 2015-08-13 18:28:02 +00:00
scheduler: binpacker makes network offers 2015-09-13 21:31:32 +00:00			`// Index the existing network usage`
scheduler: use the new network index 2015-09-13 21:37:09 +00:00			`netIdx := structs.NewNetworkIndex()`
scheduler: binpacker makes network offers 2015-09-13 21:31:32 +00:00			`netIdx.SetNode(option.Node)`
			`netIdx.AddAllocs(proposed)`

			`// Assign the resources for each task`
Making the scheduler use LocalDisk instead of Resources.DiskMB 2016-08-25 17:27:19 +00:00			`total := &structs.Resources{`
Renaming LocalDisk to EphemeralDisk (#1710) Renaming LocalDisk to EphemeralDisk 2016-09-14 22:43:42 +00:00			`DiskMB: iter.taskGroup.EphemeralDisk.SizeMB,`
Making the scheduler use LocalDisk instead of Resources.DiskMB 2016-08-25 17:27:19 +00:00			`}`
			`for _, task := range iter.taskGroup.Tasks {`
scheduler: binpacker makes network offers 2015-09-13 21:31:32 +00:00			`taskResources := task.Resources.Copy()`

			`// Check if we need a network resource`
			`if len(taskResources.Networks) > 0 {`
			`ask := taskResources.Networks[0]`
scheduler: expose reason network offer failed 2015-09-13 23:41:32 +00:00			`offer, err := netIdx.AssignNetwork(ask)`
scheduler: binpacker makes network offers 2015-09-13 21:31:32 +00:00			`if offer == nil {`
scheduler: track dimension of exhaustion 2015-09-13 23:48:01 +00:00			`iter.ctx.Metrics().ExhaustedNode(option.Node,`
			`fmt.Sprintf("network: %s", err))`
nomad: cache bitmaps to avoid GC pressure 2016-02-20 20:18:22 +00:00			`netIdx.Release()`
scheduler: binpacker makes network offers 2015-09-13 21:31:32 +00:00			`continue OUTER`
			`}`

			`// Reserve this to prevent another task from colliding`
			`netIdx.AddReserved(offer)`

			`// Update the network ask to the offer`
			`taskResources.Networks = []*structs.NetworkResource{offer}`
			`}`

			`// Store the task resource`
			`option.SetTaskResources(task, taskResources)`

			`// Accumulate the total resource requirement`
			`total.Add(taskResources)`
			`}`

scheduler: working on bin pack 2015-08-13 18:54:59 +00:00			`// Add the resources we are trying to fit`
scheduler: binpacker makes network offers 2015-09-13 21:31:32 +00:00			`proposed = append(proposed, &structs.Allocation{Resources: total})`
schedueler: adding best fit scoring 2015-08-13 18:28:02 +00:00
scheduler: refactor tests 2015-08-13 19:02:42 +00:00			`// Check if these allocations fit, if they do not, simply skip this node`
scheduler: pass failure reason to ExhaustedNode 2015-09-14 01:38:26 +00:00			`fit, dim, util, _ := structs.AllocsFit(option.Node, proposed, netIdx)`
nomad: cache bitmaps to avoid GC pressure 2016-02-20 20:18:22 +00:00			`netIdx.Release()`
scheduler: working on bin pack 2015-08-13 18:54:59 +00:00			`if !fit {`
scheduler: pass failure reason to ExhaustedNode 2015-09-14 01:38:26 +00:00			`iter.ctx.Metrics().ExhaustedNode(option.Node, dim)`
scheduler: refactor tests 2015-08-13 19:02:42 +00:00			`continue`
scheduler: working on bin pack 2015-08-13 18:54:59 +00:00			`}`

scheduler: refactor tests 2015-08-13 19:02:42 +00:00			`// XXX: For now we completely ignore evictions. We should use that flag`
			`// to determine if its possible to evict other lower priority allocations`
			`// to make room. This explodes the search space, so it must be done`
			`// carefully.`

scheduler: working on bin pack 2015-08-13 18:54:59 +00:00			`// Score the fit normally otherwise`
scheduler: move proposed alloc logic to Context 2015-08-16 17:28:58 +00:00			`fitness := structs.ScoreFit(option.Node, util)`
			`option.Score += fitness`
			`iter.ctx.Metrics().ScoreNode(option.Node, "binpack", fitness)`
scheduler: working on bin pack 2015-08-13 18:54:59 +00:00			`return option`
schedueler: adding best fit scoring 2015-08-13 18:28:02 +00:00			`}`
			`}`
scheduler: support iterator reset 2015-08-13 22:01:02 +00:00
			`func (iter *BinPackIterator) Reset() {`
			`iter.source.Reset()`
			`}`
scheduler: adding JobAntiAffinityIterator 2015-08-16 17:32:25 +00:00
			`// JobAntiAffinityIterator is used to apply an anti-affinity to allocating`
			`// along side other allocations from this job. This is used to help distribute`
			`// load across the cluster.`
			`type JobAntiAffinityIterator struct {`
			`ctx Context`
			`source RankIterator`
			`penalty float64`
			`jobID string`
			`}`

			`// NewJobAntiAffinityIterator is used to create a JobAntiAffinityIterator that`
			`// applies the given penalty for co-placement with allocs from this job.`
			`func NewJobAntiAffinityIterator(ctx Context, source RankIterator, penalty float64, jobID string) *JobAntiAffinityIterator {`
			`iter := &JobAntiAffinityIterator{`
			`ctx: ctx,`
			`source: source,`
			`penalty: penalty,`
			`jobID: jobID,`
			`}`
			`return iter`
			`}`

			`func (iter *JobAntiAffinityIterator) SetJob(jobID string) {`
			`iter.jobID = jobID`
			`}`

			`func (iter JobAntiAffinityIterator) Next() RankedNode {`
			`for {`
			`option := iter.source.Next()`
			`if option == nil {`
			`return nil`
			`}`

			`// Get the proposed allocations`
scheduler: refactor shared logic 2015-09-07 23:19:21 +00:00			`proposed, err := option.ProposedAllocs(iter.ctx)`
			`if err != nil {`
			`iter.ctx.Logger().Printf(`
			`"[ERR] sched.job-anti-aff: failed to get proposed allocations: %v",`
			`err)`
			`continue`
scheduler: adding JobAntiAffinityIterator 2015-08-16 17:32:25 +00:00			`}`

			`// Determine the number of collisions`
			`collisions := 0`
			`for _, alloc := range proposed {`
			`if alloc.JobID == iter.jobID {`
			`collisions += 1`
			`}`
			`}`

			`// Apply a penalty if there are collisions`
			`if collisions > 0 {`
scheduler: job anti-affinity score should record as negative 2015-09-23 05:24:07 +00:00			`scorePenalty := -1 * float64(collisions) * iter.penalty`
			`option.Score += scorePenalty`
scheduler: adding JobAntiAffinityIterator 2015-08-16 17:32:25 +00:00			`iter.ctx.Metrics().ScoreNode(option.Node, "job-anti-affinity", scorePenalty)`
			`}`
			`return option`
			`}`
			`}`

			`func (iter *JobAntiAffinityIterator) Reset() {`
			`iter.source.Reset()`
			`}`