system: re-evaluate node on feasibility changes (#11007)
Fix a bug where system jobs may fail to be placed on a node that was initially ineligible for system job placement. This change causes the node to be re-evaluated if any attribute used in feasibility checks changes. Fixes https://github.com/hashicorp/nomad/issues/8448
parent bfc766357e
commit ea003188fa
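The gist of the fix: on every node re-registration, compare the fields that feed the scheduler's feasibility checks (datacenter, node class, attributes, drivers, host volumes, devices) against the previously known node, and create a node evaluation when any of them differ. The sketch below illustrates that idea in isolation; `nodeFingerprint` and `feasibilityChanged` are hypothetical stand-ins for this example only, not Nomad types — the real comparison is `shouldCreateNodeEval` in the diff below.

```go
package main

import (
	"fmt"
	"reflect"
)

// nodeFingerprint is a hypothetical, trimmed-down stand-in for the node
// fields that feed the scheduler's feasibility checks.
type nodeFingerprint struct {
	Datacenter string
	NodeClass  string
	Attributes map[string]string
	Drivers    map[string]bool // driver name -> healthy
}

// feasibilityChanged mirrors the idea behind shouldCreateNodeEval: any change
// to a field used by feasibility checks warrants re-evaluating the node.
func feasibilityChanged(original, updated nodeFingerprint) bool {
	return original.Datacenter != updated.Datacenter ||
		original.NodeClass != updated.NodeClass ||
		!reflect.DeepEqual(original.Attributes, updated.Attributes) ||
		!reflect.DeepEqual(original.Drivers, updated.Drivers)
}

func main() {
	// A node registers before the docker driver has been fingerprinted...
	before := nodeFingerprint{Datacenter: "dc1", NodeClass: "batch"}
	// ...and re-registers once the driver is detected and healthy.
	after := before
	after.Drivers = map[string]bool{"docker": true}

	// Previously nothing re-evaluated the node, so a system job constrained to
	// the docker driver stayed unplaced; with this change a new eval is created.
	fmt.Println(feasibilityChanged(before, after)) // true
}
```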
@@ -0,0 +1,3 @@
+```release-note:improvement
+scheduler: Re-evaluate nodes for system jobs after attributes changes
+```
@@ -3,6 +3,7 @@ package nomad
 import (
 	"context"
 	"fmt"
+	"reflect"
 	"strings"
 	"sync"
 	"time"
@@ -169,12 +170,7 @@ func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUp
 	reply.NodeModifyIndex = index
 
 	// Check if we should trigger evaluations
-	originalStatus := structs.NodeStatusInit
-	if originalNode != nil {
-		originalStatus = originalNode.Status
-	}
-	transitionToReady := transitionedToReady(args.Node.Status, originalStatus)
-	if structs.ShouldDrainNode(args.Node.Status) || transitionToReady {
+	if shouldCreateNodeEval(originalNode, args.Node) {
 		evalIDs, evalIndex, err := n.createNodeEvals(args.Node.ID, index)
 		if err != nil {
 			n.logger.Error("eval creation failed", "error", err)
@@ -211,6 +207,56 @@ func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUp
 	return nil
 }
+
+// shouldCreateNodeEval returns true if the node update may result in
+// allocation updates, so the node should be re-evaluated.
+//
+// Such cases might be:
+// * node health/drain status changes that may result in alloc rescheduling
+// * node drivers or attributes changing that may cause system job placement changes
+func shouldCreateNodeEval(original, updated *structs.Node) bool {
+	if structs.ShouldDrainNode(updated.Status) {
+		return true
+	}
+
+	if original == nil {
+		return transitionedToReady(updated.Status, structs.NodeStatusInit)
+	}
+
+	if transitionedToReady(updated.Status, original.Status) {
+		return true
+	}
+
+	// check fields used by the feasibility checks in ../scheduler/feasible.go,
+	// whether through a Constraint explicitly added by the user or an implicit constraint
+	// added through a driver/volume check.
+	//
+	// Node Resources (e.g. CPU/Memory) are handled differently, using blocked evals,
+	// and are not relevant to this check.
+	return !(original.ID == updated.ID &&
+		original.Datacenter == updated.Datacenter &&
+		original.Name == updated.Name &&
+		original.NodeClass == updated.NodeClass &&
+		reflect.DeepEqual(original.Attributes, updated.Attributes) &&
+		reflect.DeepEqual(original.Meta, updated.Meta) &&
+		reflect.DeepEqual(original.Drivers, updated.Drivers) &&
+		reflect.DeepEqual(original.HostVolumes, updated.HostVolumes) &&
+		equalDevices(original, updated))
+}
+
+func equalDevices(n1, n2 *structs.Node) bool {
+	// ignore super old nodes, mostly to avoid nil dereferencing
+	if n1.NodeResources == nil || n2.NodeResources == nil {
+		return n1.NodeResources == n2.NodeResources
+	}
+
+	// treat nil and empty values as equal
+	if len(n1.NodeResources.Devices) == 0 {
+		return len(n1.NodeResources.Devices) == len(n2.NodeResources.Devices)
+	}
+
+	return reflect.DeepEqual(n1.NodeResources.Devices, n2.NodeResources.Devices)
+}
 
 // updateNodeUpdateResponse assumes the n.srv.peerLock is held for reading.
 func (n *Node) constructNodeServerInfoResponse(snap *state.StateSnapshot, reply *structs.NodeUpdateResponse) error {
 	reply.LeaderRPCAddr = string(n.srv.raft.Leader())
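A side note on `equalDevices` above: `reflect.DeepEqual` treats a nil slice and an empty slice as unequal, so a node that reports no devices as nil on one registration and as an empty list on the next would otherwise look like a feasibility change and trigger a spurious evaluation; the explicit length check avoids that. A self-contained illustration (the `Device` type here is a placeholder, not Nomad's):

```go
package main

import (
	"fmt"
	"reflect"
)

// Device is a placeholder type; only the nil-vs-empty slice behavior matters.
type Device struct{ Name string }

func main() {
	var nilDevices []Device    // node that never populated its device list
	emptyDevices := []Device{} // node that reported an empty device list

	// DeepEqual distinguishes a nil slice from an empty one...
	fmt.Println(reflect.DeepEqual(nilDevices, emptyDevices)) // false

	// ...which is why equalDevices compares lengths when the original node
	// reports no devices, treating nil and empty as equivalent.
	fmt.Println(len(nilDevices) == len(emptyDevices)) // true
}
```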
@@ -3602,3 +3602,56 @@ func TestClientEndpoint_EmitEvents(t *testing.T) {
 	require.Nil(err)
 	require.False(len(out.Events) < 2)
 }
+
+func TestClientEndpoint_ShouldCreateNodeEval(t *testing.T) {
+	t.Run("spurious changes don't require eval", func(t *testing.T) {
+		n1 := mock.Node()
+		n2 := n1.Copy()
+		n2.SecretID = uuid.Generate()
+		n2.Links["vault"] = "links don't get interpolated"
+		n2.ModifyIndex++
+
+		require.False(t, shouldCreateNodeEval(n1, n2))
+	})
+
+	positiveCases := []struct {
+		name     string
+		updateFn func(n *structs.Node)
+	}{
+		{
+			"data center changes",
+			func(n *structs.Node) { n.Datacenter += "u" },
+		},
+		{
+			"attribute change",
+			func(n *structs.Node) { n.Attributes["test.attribute"] = "something" },
+		},
+		{
+			"meta change",
+			func(n *structs.Node) { n.Meta["test.meta"] = "something" },
+		},
+		{
+			"drivers health changed",
+			func(n *structs.Node) { n.Drivers["exec"].Detected = false },
+		},
+		{
+			"new drivers",
+			func(n *structs.Node) {
+				n.Drivers["newdriver"] = &structs.DriverInfo{
+					Detected: true,
+					Healthy:  true,
+				}
+			},
+		},
+	}
+
+	for _, c := range positiveCases {
+		t.Run(c.name, func(t *testing.T) {
+			n1 := mock.Node()
+			n2 := n1.Copy()
+			c.updateFn(n2)
+
+			require.Truef(t, shouldCreateNodeEval(n1, n2), "node changed but without node eval: %v", pretty.Diff(n1, n2))
+		})
+	}
+}