scheduler: prevent -Inf in spread scoring (#17198)
When spread targets have a percent value of zero it's possible for them to return -Inf scoring because of a float divide by zero. This is very hard for operators to debug because the string "-Inf" is returned in the API and that breaks the presentation of debugging data. Most scoring iterators are bracketed to -1/+1, but spread iterators do not so that they can handle greatly unbalanced scoring so we can't simply return a -1 score without generating a score that might be greater than the negative scores set by other spread targets. Instead, track the lowest-seen spread boost and use that as the spread boost for any cases where we'd divide by zero. Fixes: #8863
This commit is contained in:
parent
7a92c7b5ac
commit
2426aae832
|
@ -0,0 +1,3 @@
|
||||||
|
```release-note:bug
|
||||||
|
scheduler: Fixed a bug where scores for spread scheduling could be -Inf
|
||||||
|
```
|
|
@ -33,6 +33,9 @@ type SpreadIterator struct {
|
||||||
// blocks
|
// blocks
|
||||||
sumSpreadWeights int32
|
sumSpreadWeights int32
|
||||||
|
|
||||||
|
// lowestSpreadBoost tracks the lowest spread boost across all spread blocks
|
||||||
|
lowestSpreadBoost float64
|
||||||
|
|
||||||
// hasSpread is used to early return when the job/task group
|
// hasSpread is used to early return when the job/task group
|
||||||
// does not have spread configured
|
// does not have spread configured
|
||||||
hasSpread bool
|
hasSpread bool
|
||||||
|
@ -56,6 +59,7 @@ func NewSpreadIterator(ctx Context, source RankIterator) *SpreadIterator {
|
||||||
source: source,
|
source: source,
|
||||||
groupPropertySets: make(map[string][]*propertySet),
|
groupPropertySets: make(map[string][]*propertySet),
|
||||||
tgSpreadInfo: make(map[string]spreadAttributeMap),
|
tgSpreadInfo: make(map[string]spreadAttributeMap),
|
||||||
|
lowestSpreadBoost: -1.0,
|
||||||
}
|
}
|
||||||
return iter
|
return iter
|
||||||
}
|
}
|
||||||
|
@ -117,6 +121,7 @@ func (iter *SpreadIterator) hasSpreads() bool {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (iter *SpreadIterator) Next() *RankedNode {
|
func (iter *SpreadIterator) Next() *RankedNode {
|
||||||
|
|
||||||
for {
|
for {
|
||||||
option := iter.source.Next()
|
option := iter.source.Next()
|
||||||
|
|
||||||
|
@ -165,7 +170,7 @@ func (iter *SpreadIterator) Next() *RankedNode {
|
||||||
desiredCount, ok = spreadDetails.desiredCounts[implicitTarget]
|
desiredCount, ok = spreadDetails.desiredCounts[implicitTarget]
|
||||||
if !ok {
|
if !ok {
|
||||||
// The desired count for this attribute is zero if it gets here
|
// The desired count for this attribute is zero if it gets here
|
||||||
// so use the maximum possible penalty for this node
|
// so use the default negative penalty for this node
|
||||||
totalSpreadScore -= 1.0
|
totalSpreadScore -= 1.0
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
@ -174,12 +179,20 @@ func (iter *SpreadIterator) Next() *RankedNode {
|
||||||
// Calculate the relative weight of this specific spread attribute
|
// Calculate the relative weight of this specific spread attribute
|
||||||
spreadWeight := float64(spreadDetails.weight) / float64(iter.sumSpreadWeights)
|
spreadWeight := float64(spreadDetails.weight) / float64(iter.sumSpreadWeights)
|
||||||
|
|
||||||
|
if desiredCount == 0 {
|
||||||
|
totalSpreadScore += iter.lowestSpreadBoost
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
// Score Boost is proportional the difference between current and desired count
|
// Score Boost is proportional the difference between current and desired count
|
||||||
// It is negative when the used count is greater than the desired count
|
// It is negative when the used count is greater than the desired count
|
||||||
// It is multiplied with the spread weight to account for cases where the job has
|
// It is multiplied with the spread weight to account for cases where the job has
|
||||||
// more than one spread attribute
|
// more than one spread attribute
|
||||||
scoreBoost := ((desiredCount - float64(usedCount)) / desiredCount) * spreadWeight
|
scoreBoost := ((desiredCount - float64(usedCount)) / desiredCount) * spreadWeight
|
||||||
totalSpreadScore += scoreBoost
|
totalSpreadScore += scoreBoost
|
||||||
|
if scoreBoost < iter.lowestSpreadBoost {
|
||||||
|
iter.lowestSpreadBoost = scoreBoost
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -16,6 +16,8 @@ import (
|
||||||
"github.com/hashicorp/nomad/helper/uuid"
|
"github.com/hashicorp/nomad/helper/uuid"
|
||||||
"github.com/hashicorp/nomad/nomad/mock"
|
"github.com/hashicorp/nomad/nomad/mock"
|
||||||
"github.com/hashicorp/nomad/nomad/structs"
|
"github.com/hashicorp/nomad/nomad/structs"
|
||||||
|
"github.com/shoenig/test"
|
||||||
|
"github.com/shoenig/test/must"
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -561,6 +563,104 @@ func TestSpreadIterator_MaxPenalty(t *testing.T) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestSpreadIterator_NoInfinity(t *testing.T) {
|
||||||
|
ci.Parallel(t)
|
||||||
|
|
||||||
|
store, ctx := testContext(t)
|
||||||
|
var nodes []*RankedNode
|
||||||
|
|
||||||
|
// Add 3 nodes in different DCs to the state store
|
||||||
|
for i := 1; i < 4; i++ {
|
||||||
|
node := mock.Node()
|
||||||
|
node.Datacenter = fmt.Sprintf("dc%d", i)
|
||||||
|
must.NoError(t, store.UpsertNode(structs.MsgTypeTestSetup, uint64(100+i), node))
|
||||||
|
nodes = append(nodes, &RankedNode{Node: node})
|
||||||
|
}
|
||||||
|
|
||||||
|
static := NewStaticRankIterator(ctx, nodes)
|
||||||
|
|
||||||
|
job := mock.Job()
|
||||||
|
tg := job.TaskGroups[0]
|
||||||
|
job.TaskGroups[0].Count = 8
|
||||||
|
|
||||||
|
// Create spread target of 50% in dc1, 50% in dc2, and 0% in the implicit target
|
||||||
|
spread := &structs.Spread{
|
||||||
|
Weight: 100,
|
||||||
|
Attribute: "${node.datacenter}",
|
||||||
|
SpreadTarget: []*structs.SpreadTarget{
|
||||||
|
{
|
||||||
|
Value: "dc1",
|
||||||
|
Percent: 50,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Value: "dc2",
|
||||||
|
Percent: 50,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Value: "*",
|
||||||
|
Percent: 0,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
tg.Spreads = []*structs.Spread{spread}
|
||||||
|
spreadIter := NewSpreadIterator(ctx, static)
|
||||||
|
spreadIter.SetJob(job)
|
||||||
|
spreadIter.SetTaskGroup(tg)
|
||||||
|
|
||||||
|
scoreNorm := NewScoreNormalizationIterator(ctx, spreadIter)
|
||||||
|
|
||||||
|
out := collectRanked(scoreNorm)
|
||||||
|
|
||||||
|
// Scores should be even between dc1 and dc2 nodes, without an -Inf on dc3
|
||||||
|
must.Len(t, 3, out)
|
||||||
|
test.Eq(t, 0.75, out[0].FinalScore)
|
||||||
|
test.Eq(t, 0.75, out[1].FinalScore)
|
||||||
|
test.Eq(t, -1, out[2].FinalScore)
|
||||||
|
|
||||||
|
// Reset scores
|
||||||
|
for _, node := range nodes {
|
||||||
|
node.Scores = nil
|
||||||
|
node.FinalScore = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create very unbalanced spread target to force large negative scores
|
||||||
|
spread = &structs.Spread{
|
||||||
|
Weight: 100,
|
||||||
|
Attribute: "${node.datacenter}",
|
||||||
|
SpreadTarget: []*structs.SpreadTarget{
|
||||||
|
{
|
||||||
|
Value: "dc1",
|
||||||
|
Percent: 99,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Value: "dc2",
|
||||||
|
Percent: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Value: "*",
|
||||||
|
Percent: 0,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
tg.Spreads = []*structs.Spread{spread}
|
||||||
|
static = NewStaticRankIterator(ctx, nodes)
|
||||||
|
spreadIter = NewSpreadIterator(ctx, static)
|
||||||
|
spreadIter.SetJob(job)
|
||||||
|
spreadIter.SetTaskGroup(tg)
|
||||||
|
|
||||||
|
scoreNorm = NewScoreNormalizationIterator(ctx, spreadIter)
|
||||||
|
|
||||||
|
out = collectRanked(scoreNorm)
|
||||||
|
|
||||||
|
// Scores should heavily favor dc1, with an -Inf on dc3
|
||||||
|
must.Len(t, 3, out)
|
||||||
|
desired := 8 * 0.99 // 8 allocs * 99%
|
||||||
|
test.Eq(t, (desired-1)/desired, out[0].FinalScore)
|
||||||
|
test.Eq(t, -11.5, out[1].FinalScore)
|
||||||
|
test.LessEq(t, out[1].FinalScore, out[2].FinalScore,
|
||||||
|
test.Sprintf("expected implicit dc3 to be <= dc2"))
|
||||||
|
}
|
||||||
|
|
||||||
func Test_evenSpreadScoreBoost(t *testing.T) {
|
func Test_evenSpreadScoreBoost(t *testing.T) {
|
||||||
ci.Parallel(t)
|
ci.Parallel(t)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue