scheduler: filter device instance IDs by constraints (#18141)

When the scheduler assigns a device instance, it iterates over the feasible
devices and then picks the first instance with availability. If the jobspec uses
a constraint on device ID, this can lead to buggy/surprising behavior where the
node's device matches the constraint but then the individual device instance
does not.

Add a second filter based on the `${device.ids}` constraint after selecting a
node's device to ensure the device instance ID falls within the constraint as
well.

Fixes: #18112
This commit is contained in:
Tim Gross 2023-08-03 14:58:30 -04:00
parent c4f223249d
commit 87101b131a
3 changed files with 77 additions and 18 deletions

3
.changelog/18141.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
scheduler: Fixed a bug where device IDs were not correctly filtered in constraints
```

View File

@ -9,6 +9,7 @@ import (
"math" "math"
"github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs"
psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
) )
// deviceAllocator is used to allocate devices to allocations. The allocator // deviceAllocator is used to allocate devices to allocations. The allocator
@ -115,7 +116,8 @@ func (d *deviceAllocator) AssignDevice(ask *structs.RequestedDevice) (out *struc
assigned := uint64(0) assigned := uint64(0)
for id, v := range devInst.Instances { for id, v := range devInst.Instances {
if v == 0 && assigned < ask.Count { if v == 0 && assigned < ask.Count &&
d.deviceIDMatchesConstraint(id, ask.Constraints, devInst.Device) {
assigned++ assigned++
offer.DeviceIDs = append(offer.DeviceIDs, id) offer.DeviceIDs = append(offer.DeviceIDs, id)
if assigned == ask.Count { if assigned == ask.Count {
@ -132,3 +134,34 @@ func (d *deviceAllocator) AssignDevice(ask *structs.RequestedDevice) (out *struc
return offer, matchedWeights, nil return offer, matchedWeights, nil
} }
// deviceIDMatchesConstraint checks a device instance ID against the constraints
// to ensure we're only assigning instance IDs that match. This is a narrower
// check than nodeDeviceMatches because we've already asserted that the device
// matches and now need to filter by instance ID.
func (d *deviceAllocator) deviceIDMatchesConstraint(id string, constraints structs.Constraints, device *structs.NodeDeviceResource) bool {
	// With no constraints, every available instance ID is acceptable.
	if len(constraints) == 0 {
		return true
	}

	idAttr := psstructs.NewStringAttribute(id)

	for _, constraint := range constraints {
		var other *psstructs.Attribute
		switch "${device.ids}" {
		case constraint.LTarget:
			other, _ = resolveDeviceTarget(constraint.RTarget, device)
		case constraint.RTarget:
			other, _ = resolveDeviceTarget(constraint.LTarget, device)
		default:
			// Constraint doesn't reference instance IDs; it was already
			// evaluated when the node's device was matched.
			continue
		}
		if !checkAttributeConstraint(d.ctx, constraint.Operand, other, idAttr, true, true) {
			return false
		}
	}
	return true
}

View File

@ -11,6 +11,7 @@ import (
"github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs"
psstructs "github.com/hashicorp/nomad/plugins/shared/structs" psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
"github.com/shoenig/test/must"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
) )
@ -164,13 +165,16 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
nvidia1 := n.NodeResources.Devices[1] nvidia1 := n.NodeResources.Devices[1]
cases := []struct { cases := []struct {
Name string Name string
Constraints []*structs.Constraint Note string
ExpectedDevice *structs.NodeDeviceResource Constraints []*structs.Constraint
NoPlacement bool ExpectedDevice *structs.NodeDeviceResource
ExpectedDeviceIDs []string
NoPlacement bool
}{ }{
{ {
Name: "gpu", Name: "gpu",
Note: "-gt",
Constraints: []*structs.Constraint{ Constraints: []*structs.Constraint{
{ {
LTarget: "${device.attr.cuda_cores}", LTarget: "${device.attr.cuda_cores}",
@ -178,10 +182,12 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
RTarget: "4000", RTarget: "4000",
}, },
}, },
ExpectedDevice: nvidia1, ExpectedDevice: nvidia1,
ExpectedDeviceIDs: collectInstanceIDs(nvidia1),
}, },
{ {
Name: "gpu", Name: "gpu",
Note: "-lt",
Constraints: []*structs.Constraint{ Constraints: []*structs.Constraint{
{ {
LTarget: "${device.attr.cuda_cores}", LTarget: "${device.attr.cuda_cores}",
@ -189,7 +195,8 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
RTarget: "4000", RTarget: "4000",
}, },
}, },
ExpectedDevice: nvidia0, ExpectedDevice: nvidia0,
ExpectedDeviceIDs: collectInstanceIDs(nvidia0),
}, },
{ {
Name: "nvidia/gpu", Name: "nvidia/gpu",
@ -211,7 +218,8 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
RTarget: "1.4 GHz", RTarget: "1.4 GHz",
}, },
}, },
ExpectedDevice: nvidia0, ExpectedDevice: nvidia0,
ExpectedDeviceIDs: collectInstanceIDs(nvidia0),
}, },
{ {
Name: "intel/gpu", Name: "intel/gpu",
@ -219,6 +227,7 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
}, },
{ {
Name: "nvidia/gpu", Name: "nvidia/gpu",
Note: "-no-placement",
Constraints: []*structs.Constraint{ Constraints: []*structs.Constraint{
{ {
LTarget: "${device.attr.memory_bandwidth}", LTarget: "${device.attr.memory_bandwidth}",
@ -239,29 +248,43 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
}, },
NoPlacement: true, NoPlacement: true,
}, },
{
Name: "nvidia/gpu",
Note: "-contains-id",
Constraints: []*structs.Constraint{
{
LTarget: "${device.ids}",
Operand: "set_contains",
RTarget: nvidia0.Instances[1].ID,
},
},
ExpectedDevice: nvidia0,
ExpectedDeviceIDs: []string{nvidia0.Instances[1].ID},
},
} }
for _, c := range cases { for _, c := range cases {
t.Run(c.Name, func(t *testing.T) { t.Run(c.Name+c.Note, func(t *testing.T) {
require := require.New(t)
_, ctx := testContext(t) _, ctx := testContext(t)
d := newDeviceAllocator(ctx, n) d := newDeviceAllocator(ctx, n)
require.NotNil(d) must.NotNil(t, d)
// Build the request // Build the request
ask := deviceRequest(c.Name, 1, c.Constraints, nil) ask := deviceRequest(c.Name, 1, c.Constraints, nil)
out, score, err := d.AssignDevice(ask) out, score, err := d.AssignDevice(ask)
if c.NoPlacement { if c.NoPlacement {
require.Nil(out) require.Nil(t, out)
} else { } else {
require.NotNil(out) must.NotNil(t, out)
require.Zero(score) must.Zero(t, score)
require.NoError(err) must.NoError(t, err)
// Check that we got the nvidia device // Check that we got the right nvidia device instance, and
require.Len(out.DeviceIDs, 1) // specific device instance IDs if required
require.Contains(collectInstanceIDs(c.ExpectedDevice), out.DeviceIDs[0]) must.Len(t, 1, out.DeviceIDs)
must.SliceContains(t, collectInstanceIDs(c.ExpectedDevice), out.DeviceIDs[0])
must.SliceContainsSubset(t, c.ExpectedDeviceIDs, out.DeviceIDs)
} }
}) })
} }