scheduler: prevent panic in spread iterator during alloc stop

The spread iterator can panic when processing an evaluation, resulting in an unrecoverable state in the cluster. Whenever a panicked server restarts and quorum is restored, the next server to dequeue the evaluation will panic. To trigger this state: * The job must have `max_parallel = 0` and `canary >= 1`. * The job must not have a `spread` block. * The job must have a previous version. * The previous version must have a `spread` block and at least one failed allocation. In this scenario, the desired changes include `(place 1+) (stop 1+), (ignore n) (canary 1)`. Before the scheduler can place the canary allocation, it tries to find out which allocations can be stopped. This passes back through the stack so that we can determine previous-node penalties, etc. We call `SetJob` on the stack with the previous version of the job, which will include assessing the `spread` block (even though the results are unused). The task group spread info state from that pass through the spread iterator is not reset when we call `SetJob` again. When the new job version iterates over the `groupPropertySets`, it will get an empty `spreadAttributeMap`, resulting in an unexpected nil pointer dereference. This changeset resets the spread iterator's internal state when setting the job, adds error logging with a bypass around the bug in case we hit similar cases, and adds a test that panics the scheduler without the patch.
2022-02-02 13:26:05 -05:00 · 2022-02-02 13:26:05 -05:00 · 74486d86fb
parent 15f9d54dea
commit 74486d86fb
3 changed files with 113 additions and 0 deletions
--- a/.changelog/12039.txt
+++ b/.changelog/12039.txt
@ -0,0 +1,3 @@
+```release-note:security
+Prevent panic in spread iterator during allocation stop. [CVE-2022-24684](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-24684)
+```
--- a/scheduler/spread.go
+++ b/scheduler/spread.go
@ -71,6 +71,12 @@ func (iter *SpreadIterator) SetJob(job *structs.Job) {
 	if job.Spreads != nil {
 		iter.jobSpreads = job.Spreads
 	}
+
+	// reset group spread/property so that when we temporarily SetJob
+	// to an older version to calculate stops we don't leak old
+	// versions of spread/properties to the new job version
+	iter.tgSpreadInfo = make(map[string]spreadAttributeMap)
+	iter.groupPropertySets = make(map[string][]*propertySet)
 }

 func (iter *SpreadIterator) SetTaskGroup(tg *structs.TaskGroup) {
@ -134,6 +140,15 @@ func (iter *SpreadIterator) Next() *RankedNode {
 			spreadAttributeMap := iter.tgSpreadInfo[tgName]
 			spreadDetails := spreadAttributeMap[pset.targetAttribute]

+			if spreadDetails == nil {
+				iter.ctx.Logger().Named("spread").Error(
+					"error reading spread attribute map for task group",
+					"task_group", tgName,
+					"target", pset.targetAttribute,
+				)
+				continue
+			}
+
 			if len(spreadDetails.desiredCounts) == 0 {
 				// When desired counts map is empty the user didn't specify any targets
 				// Use even spreading scoring algorithm for this scenario
--- a/scheduler/spread_test.go
+++ b/scheduler/spread_test.go
@ -9,6 +9,7 @@ import (

 	"fmt"

+	"github.com/hashicorp/nomad/helper"
 	"github.com/hashicorp/nomad/helper/uuid"
 	"github.com/hashicorp/nomad/nomad/mock"
 	"github.com/hashicorp/nomad/nomad/structs"
@ -811,3 +812,97 @@ func validateEqualSpread(h *Harness) error {
 	}
 	return fmt.Errorf("expected even distributon of allocs to racks, but got:\n%+v", countSet)
 }
+
+// TestSpreadPanicDowngrade registers a job version without a spread block
+// whose previous version had a spread block and a failed allocation, then
+// asserts that the service scheduler processes the eval and submits one plan.
+func TestSpreadPanicDowngrade(t *testing.T) {
+	h := NewHarness(t)
+
+	nodes := []*structs.Node{}
+	for i := 0; i < 5; i++ {
+		node := mock.Node()
+		nodes = append(nodes, node)
+		err := h.State.UpsertNode(structs.MsgTypeTestSetup,
+			h.NextIndex(), node)
+		require.NoError(t, err)
+	}
+
+	// job version 1
+	// max_parallel = 0, canary = 1, spread != nil, 1 failed alloc
+	job1 := mock.Job()
+	job1.Spreads = []*structs.Spread{
+		{
+			Attribute:    "${node.unique.name}",
+			Weight:       50,
+			SpreadTarget: []*structs.SpreadTarget{},
+		},
+	}
+	job1.Update = structs.UpdateStrategy{
+		Stagger:     30 * time.Second,
+		MaxParallel: 0,
+	}
+	job1.Status = structs.JobStatusRunning
+	job1.TaskGroups[0].Update = &structs.UpdateStrategy{
+		Stagger:          30 * time.Second,
+		MaxParallel:      1,
+		HealthCheck:      "checks",
+		MinHealthyTime:   30 * time.Second,
+		HealthyDeadline:  9 * time.Minute,
+		ProgressDeadline: 10 * time.Minute,
+		AutoRevert:       true,
+		Canary:           1,
+	}
+
+	job1.Version = 1
+	job1.TaskGroups[0].Count = 5 // one more than the 4 allocs created below
+	err := h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job1)
+	require.NoError(t, err)
+
+	allocs := []*structs.Allocation{}
+	for i := 0; i < 4; i++ {
+		alloc := mock.Alloc()
+		alloc.Job = job1
+		alloc.JobID = job1.ID
+		alloc.NodeID = nodes[i].ID
+		alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
+			Healthy:     helper.BoolToPtr(true),
+			Timestamp:   time.Now(),
+			Canary:      false,
+			ModifyIndex: h.NextIndex(),
+		}
+		if i == 0 {
+			alloc.DeploymentStatus.Canary = true
+		}
+		if i == 1 {
+			alloc.ClientStatus = structs.AllocClientStatusFailed
+		}
+		allocs = append(allocs, alloc)
+	}
+	err = h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs)
+	require.NoError(t, err)
+
+	// job version 2
+	// max_parallel = 0, canary = 1, spread == nil
+	job2 := job1.Copy()
+	job2.Version = 2
+	job2.Spreads = nil
+	err = h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job2)
+	require.NoError(t, err)
+
+	eval := &structs.Evaluation{
+		Namespace:   job2.Namespace,
+		ID:          uuid.Generate(),
+		Priority:    job2.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job2.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	err = h.State.UpsertEvals(structs.MsgTypeTestSetup,
+		h.NextIndex(), []*structs.Evaluation{eval})
+	require.NoError(t, err)
+
+	processErr := h.Process(NewServiceScheduler, eval)
+	require.NoError(t, processErr, "failed to process eval")
+	require.Len(t, h.Plans, 1)
+}