scheduler: filter device instance IDs by constraints (#18141)
When the scheduler assigns a device instance, it iterates over the feasible devices and then picks the first instance with availability. If the jobspec uses a constraint on device ID, this can lead to buggy/surprising behavior where the node's device matches the constraint but then the individual device instance does not. Add a second filter based on the `${device.ids}` constraint after selecting a node's device to ensure the device instance ID falls within the constraint as well. Fixes: #18112
This commit is contained in:
parent
c4f223249d
commit
87101b131a
|
@ -0,0 +1,3 @@
|
|||
```release-note:bug
|
||||
scheduler: Fixed a bug where device IDs were not correctly filtered in constraints
|
||||
```
|
|
@ -9,6 +9,7 @@ import (
|
|||
"math"
|
||||
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
|
||||
)
|
||||
|
||||
// deviceAllocator is used to allocate devices to allocations. The allocator
|
||||
|
@ -115,7 +116,8 @@ func (d *deviceAllocator) AssignDevice(ask *structs.RequestedDevice) (out *struc
|
|||
|
||||
assigned := uint64(0)
|
||||
for id, v := range devInst.Instances {
|
||||
if v == 0 && assigned < ask.Count {
|
||||
if v == 0 && assigned < ask.Count &&
|
||||
d.deviceIDMatchesConstraint(id, ask.Constraints, devInst.Device) {
|
||||
assigned++
|
||||
offer.DeviceIDs = append(offer.DeviceIDs, id)
|
||||
if assigned == ask.Count {
|
||||
|
@ -132,3 +134,34 @@ func (d *deviceAllocator) AssignDevice(ask *structs.RequestedDevice) (out *struc
|
|||
|
||||
return offer, matchedWeights, nil
|
||||
}
|
||||
|
||||
// deviceIDMatchesConstraint checks a device instance ID against the constraints
|
||||
// to ensure we're only assigning instance IDs that match. This is a narrower
|
||||
// check than nodeDeviceMatches because we've already asserted that the device
|
||||
// matches and now need to filter by instance ID.
|
||||
func (d *deviceAllocator) deviceIDMatchesConstraint(id string, constraints structs.Constraints, device *structs.NodeDeviceResource) bool {
|
||||
|
||||
// There are no constraints to consider
|
||||
if len(constraints) == 0 {
|
||||
return true
|
||||
}
|
||||
|
||||
deviceID := psstructs.NewStringAttribute(id)
|
||||
|
||||
for _, c := range constraints {
|
||||
var target *psstructs.Attribute
|
||||
if c.LTarget == "${device.ids}" {
|
||||
target, _ = resolveDeviceTarget(c.RTarget, device)
|
||||
} else if c.RTarget == "${device.ids}" {
|
||||
target, _ = resolveDeviceTarget(c.LTarget, device)
|
||||
} else {
|
||||
continue
|
||||
}
|
||||
|
||||
if !checkAttributeConstraint(d.ctx, c.Operand, target, deviceID, true, true) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
|
|
@ -11,6 +11,7 @@ import (
|
|||
"github.com/hashicorp/nomad/nomad/mock"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
|
||||
"github.com/shoenig/test/must"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
|
@ -164,13 +165,16 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
|
|||
nvidia1 := n.NodeResources.Devices[1]
|
||||
|
||||
cases := []struct {
|
||||
Name string
|
||||
Constraints []*structs.Constraint
|
||||
ExpectedDevice *structs.NodeDeviceResource
|
||||
NoPlacement bool
|
||||
Name string
|
||||
Note string
|
||||
Constraints []*structs.Constraint
|
||||
ExpectedDevice *structs.NodeDeviceResource
|
||||
ExpectedDeviceIDs []string
|
||||
NoPlacement bool
|
||||
}{
|
||||
{
|
||||
Name: "gpu",
|
||||
Note: "-gt",
|
||||
Constraints: []*structs.Constraint{
|
||||
{
|
||||
LTarget: "${device.attr.cuda_cores}",
|
||||
|
@ -178,10 +182,12 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
|
|||
RTarget: "4000",
|
||||
},
|
||||
},
|
||||
ExpectedDevice: nvidia1,
|
||||
ExpectedDevice: nvidia1,
|
||||
ExpectedDeviceIDs: collectInstanceIDs(nvidia1),
|
||||
},
|
||||
{
|
||||
Name: "gpu",
|
||||
Note: "-lt",
|
||||
Constraints: []*structs.Constraint{
|
||||
{
|
||||
LTarget: "${device.attr.cuda_cores}",
|
||||
|
@ -189,7 +195,8 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
|
|||
RTarget: "4000",
|
||||
},
|
||||
},
|
||||
ExpectedDevice: nvidia0,
|
||||
ExpectedDevice: nvidia0,
|
||||
ExpectedDeviceIDs: collectInstanceIDs(nvidia0),
|
||||
},
|
||||
{
|
||||
Name: "nvidia/gpu",
|
||||
|
@ -211,7 +218,8 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
|
|||
RTarget: "1.4 GHz",
|
||||
},
|
||||
},
|
||||
ExpectedDevice: nvidia0,
|
||||
ExpectedDevice: nvidia0,
|
||||
ExpectedDeviceIDs: collectInstanceIDs(nvidia0),
|
||||
},
|
||||
{
|
||||
Name: "intel/gpu",
|
||||
|
@ -219,6 +227,7 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
|
|||
},
|
||||
{
|
||||
Name: "nvidia/gpu",
|
||||
Note: "-no-placement",
|
||||
Constraints: []*structs.Constraint{
|
||||
{
|
||||
LTarget: "${device.attr.memory_bandwidth}",
|
||||
|
@ -239,29 +248,43 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
|
|||
},
|
||||
NoPlacement: true,
|
||||
},
|
||||
{
|
||||
Name: "nvidia/gpu",
|
||||
Note: "-contains-id",
|
||||
Constraints: []*structs.Constraint{
|
||||
{
|
||||
LTarget: "${device.ids}",
|
||||
Operand: "set_contains",
|
||||
RTarget: nvidia0.Instances[1].ID,
|
||||
},
|
||||
},
|
||||
ExpectedDevice: nvidia0,
|
||||
ExpectedDeviceIDs: []string{nvidia0.Instances[1].ID},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
t.Run(c.Name, func(t *testing.T) {
|
||||
require := require.New(t)
|
||||
t.Run(c.Name+c.Note, func(t *testing.T) {
|
||||
_, ctx := testContext(t)
|
||||
d := newDeviceAllocator(ctx, n)
|
||||
require.NotNil(d)
|
||||
must.NotNil(t, d)
|
||||
|
||||
// Build the request
|
||||
ask := deviceRequest(c.Name, 1, c.Constraints, nil)
|
||||
|
||||
out, score, err := d.AssignDevice(ask)
|
||||
if c.NoPlacement {
|
||||
require.Nil(out)
|
||||
require.Nil(t, out)
|
||||
} else {
|
||||
require.NotNil(out)
|
||||
require.Zero(score)
|
||||
require.NoError(err)
|
||||
must.NotNil(t, out)
|
||||
must.Zero(t, score)
|
||||
must.NoError(t, err)
|
||||
|
||||
// Check that we got the nvidia device
|
||||
require.Len(out.DeviceIDs, 1)
|
||||
require.Contains(collectInstanceIDs(c.ExpectedDevice), out.DeviceIDs[0])
|
||||
// Check that we got the right nvidia device instance, and
|
||||
// specific device instance IDs if required
|
||||
must.Len(t, 1, out.DeviceIDs)
|
||||
must.SliceContains(t, collectInstanceIDs(c.ExpectedDevice), out.DeviceIDs[0])
|
||||
must.SliceContainsSubset(t, c.ExpectedDeviceIDs, out.DeviceIDs)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue