scheduler: filter device instance IDs by constraints (#18141)

When the scheduler assigns a device instance, it iterates over the feasible devices and then picks the first instance with availability. If the jobspec uses a constraint on device ID, this can lead to buggy/surprising behavior where the node's device matches the constraint but then the individual device instance does not. Add a second filter based on the `${device.ids}` constraint after selecting a node's device to ensure the device instance ID falls within the constraint as well. Fixes: #18112
2023-08-03 14:58:30 -04:00 · 2023-08-03 14:58:30 -04:00 · 87101b131a
parent c4f223249d
commit 87101b131a
3 changed files with 77 additions and 18 deletions
--- a/.changelog/18141.txt
+++ b/.changelog/18141.txt
@ -0,0 +1,3 @@
+```release-note:bug
+scheduler: Fixed a bug where device IDs were not correctly filtered in constraints
+```
--- a/scheduler/device.go
+++ b/scheduler/device.go
@ -9,6 +9,7 @@ import (
 	"math"

 	"github.com/hashicorp/nomad/nomad/structs"
+	psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
 )

 // deviceAllocator is used to allocate devices to allocations. The allocator
@ -115,7 +116,8 @@ func (d *deviceAllocator) AssignDevice(ask *structs.RequestedDevice) (out *struc

 		assigned := uint64(0)
 		for id, v := range devInst.Instances {
-			if v == 0 && assigned < ask.Count {
+			if v == 0 && assigned < ask.Count &&
+				d.deviceIDMatchesConstraint(id, ask.Constraints, devInst.Device) {
 				assigned++
 				offer.DeviceIDs = append(offer.DeviceIDs, id)
 				if assigned == ask.Count {
@ -132,3 +134,34 @@ func (d *deviceAllocator) AssignDevice(ask *structs.RequestedDevice) (out *struc

 	return offer, matchedWeights, nil
 }
+
+// deviceIDMatchesConstraint checks a device instance ID against the constraints
+// to ensure we're only assigning instance IDs that match. This is a narrower
+// check than nodeDeviceMatches because we've already asserted that the device
+// matches and now need to filter by instance ID.
+func (d *deviceAllocator) deviceIDMatchesConstraint(id string, constraints structs.Constraints, device *structs.NodeDeviceResource) bool {
+
+	// There are no constraints to consider
+	if len(constraints) == 0 {
+		return true
+	}
+
+	deviceID := psstructs.NewStringAttribute(id)
+
+	for _, c := range constraints {
+		var target *psstructs.Attribute
+		if c.LTarget == "${device.ids}" {
+			target, _ = resolveDeviceTarget(c.RTarget, device)
+		} else if c.RTarget == "${device.ids}" {
+			target, _ = resolveDeviceTarget(c.LTarget, device)
+		} else {
+			continue
+		}
+
+		if !checkAttributeConstraint(d.ctx, c.Operand, target, deviceID, true, true) {
+			return false
+		}
+	}
+
+	return true
+}
--- a/scheduler/device_test.go
+++ b/scheduler/device_test.go
@ -11,6 +11,7 @@ import (
 	"github.com/hashicorp/nomad/nomad/mock"
 	"github.com/hashicorp/nomad/nomad/structs"
 	psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
+	"github.com/shoenig/test/must"
 	"github.com/stretchr/testify/require"
 )

@ -165,12 +166,15 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {

 	cases := []struct {
 		Name              string
+		Note              string
 		Constraints       []*structs.Constraint
 		ExpectedDevice    *structs.NodeDeviceResource
+		ExpectedDeviceIDs []string
 		NoPlacement       bool
 	}{
 		{
 			Name: "gpu",
+			Note: "-gt",
 			Constraints: []*structs.Constraint{
 				{
 					LTarget: "${device.attr.cuda_cores}",
@ -179,9 +183,11 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
 				},
 			},
 			ExpectedDevice:    nvidia1,
+			ExpectedDeviceIDs: collectInstanceIDs(nvidia1),
 		},
 		{
 			Name: "gpu",
+			Note: "-lt",
 			Constraints: []*structs.Constraint{
 				{
 					LTarget: "${device.attr.cuda_cores}",
@ -190,6 +196,7 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
 				},
 			},
 			ExpectedDevice:    nvidia0,
+			ExpectedDeviceIDs: collectInstanceIDs(nvidia0),
 		},
 		{
 			Name: "nvidia/gpu",
@ -212,6 +219,7 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
 				},
 			},
 			ExpectedDevice:    nvidia0,
+			ExpectedDeviceIDs: collectInstanceIDs(nvidia0),
 		},
 		{
 			Name:        "intel/gpu",
@ -219,6 +227,7 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
 		},
 		{
 			Name: "nvidia/gpu",
+			Note: "-no-placement",
 			Constraints: []*structs.Constraint{
 				{
 					LTarget: "${device.attr.memory_bandwidth}",
@ -239,29 +248,43 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
 			},
 			NoPlacement: true,
 		},
+		{
+			Name: "nvidia/gpu",
+			Note: "-contains-id",
+			Constraints: []*structs.Constraint{
+				{
+					LTarget: "${device.ids}",
+					Operand: "set_contains",
+					RTarget: nvidia0.Instances[1].ID,
+				},
+			},
+			ExpectedDevice:    nvidia0,
+			ExpectedDeviceIDs: []string{nvidia0.Instances[1].ID},
+		},
 	}

 	for _, c := range cases {
-		t.Run(c.Name, func(t *testing.T) {
-			require := require.New(t)
+		t.Run(c.Name+c.Note, func(t *testing.T) {
 			_, ctx := testContext(t)
 			d := newDeviceAllocator(ctx, n)
-			require.NotNil(d)
+			must.NotNil(t, d)

 			// Build the request
 			ask := deviceRequest(c.Name, 1, c.Constraints, nil)

 			out, score, err := d.AssignDevice(ask)
 			if c.NoPlacement {
-				require.Nil(out)
+				require.Nil(t, out)
 			} else {
-				require.NotNil(out)
-				require.Zero(score)
-				require.NoError(err)
+				must.NotNil(t, out)
+				must.Zero(t, score)
+				must.NoError(t, err)

-				// Check that we got the nvidia device
-				require.Len(out.DeviceIDs, 1)
-				require.Contains(collectInstanceIDs(c.ExpectedDevice), out.DeviceIDs[0])
+				// Check that we got the right nvidia device instance, and
+				// specific device instance IDs if required
+				must.Len(t, 1, out.DeviceIDs)
+				must.SliceContains(t, collectInstanceIDs(c.ExpectedDevice), out.DeviceIDs[0])
+				must.SliceContainsSubset(t, c.ExpectedDeviceIDs, out.DeviceIDs)
 			}
 		})
 	}