Fix DevicesSets being removed when cpusets are reloaded with cgroup v2 (#17535)

* Fix DevicesSets being removed when cpusets are reloaded with cgroup v2

This meant that if any allocation was created or removed, all
active DevicesSets were removed from all cgroups of all tasks.

This was most noticeable with "exec" and "raw_exec", as it meant
they no longer had access to /dev files.

* e2e: add test for verifying cgroups do not interfere with access to devices

---------

Co-authored-by: Seth Hoenig <shoenig@duck.com>
This commit is contained in:
Patric Stout 2023-06-15 16:39:36 +02:00 committed by GitHub
parent 2856967dda
commit 4767d44b94
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 100 additions and 1 deletions

3
.changelog/17535.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
cgroups: Fixed a bug where all DevicesSets were removed from the cgroups of every task whenever an allocation was created or removed
```

View File

@ -330,7 +330,8 @@ func (c *cpusetManagerV2) write(id identity, set cpuset.CPUSet) {
// set the cpuset value for the cgroup
if err = m.Set(&configs.Resources{
CpusetCpus: set.String(),
CpusetCpus: set.String(),
SkipDevices: true,
}); err != nil {
c.logger.Error("failed to set cgroup", "path", path, "error", err)
return

View File

@ -0,0 +1,54 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package isolation
import (
"testing"
"github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/shoenig/test/must"
)
// TestCgroupDevices is the entry point for the cgroup device e2e tests. It
// blocks until the cluster has a leader and the node-ready wait (count 1)
// returns, then runs the sub-tests.
func TestCgroupDevices(t *testing.T) {
	client := e2eutil.NomadClient(t)

	// do not start sub-tests until the cluster is able to schedule work
	e2eutil.WaitForLeader(t, client)
	e2eutil.WaitForNodesReady(t, client, 1)

	t.Run("testDevicesUsable", testDevicesUsable)
}
// testDevicesUsable registers the cgroup_devices job, checks that both of its
// allocations can use /dev files, stops one allocation, and then checks that
// the allocation left running can still use /dev files.
func testDevicesUsable(t *testing.T) {
	client := e2eutil.NomadClient(t)

	// the job ID carries a short uuid suffix; cleanup is registered before
	// the job is submitted
	id := "cgroup-devices-" + uuid.Short()
	cleanup := []string{id}
	t.Cleanup(e2eutil.CleanupJobsAndGC(t, &cleanup))

	// submit the job and require exactly two allocations
	running := e2eutil.RegisterAndWaitForAllocs(t, client, "./input/cgroup_devices.hcl", id, "")
	must.Len(t, 2, running)

	// the first allocation gets stopped, the second is the one left to verify
	stopped, survivor := running[0].ID, running[1].ID

	// devices must be usable in both allocations to begin with
	checkDev(t, stopped)
	checkDev(t, survivor)

	// stop the first allocation and wait until it is reported as stopped
	_, err := e2eutil.Command("nomad", "alloc", "stop", "-detach", stopped)
	must.NoError(t, err)
	e2eutil.WaitForAllocStopped(t, client, stopped)

	// devices must still be usable in the allocation that was left running
	checkDev(t, survivor)
}
// checkDev verifies that the given allocation can use device files by running
// dd inside it via "nomad alloc exec", copying one block from /dev/zero to
// /dev/null. The test fails if the command returns an error.
func checkDev(t *testing.T, allocID string) {
	// Mark as a helper so a failure is attributed to the caller's line; this
	// function is invoked several times per test and the call site is what
	// identifies which allocation (and which phase) failed.
	t.Helper()

	out, err := e2eutil.Command(
		"nomad", "alloc", "exec", allocID,
		"dd", "if=/dev/zero", "of=/dev/null", "count=1",
	)
	if err != nil {
		// surface whatever the command returned to aid diagnosis
		t.Logf("alloc exec in %s failed, output: %s", allocID, out)
	}
	must.NoError(t, err)
}

View File

@ -0,0 +1,41 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: MPL-2.0
# Service job with two task groups, each holding a single raw_exec task that
# runs "/bin/sleep infinity". The tasks do no work of their own; they only
# need to stay running.
job "cgroup_devices" {
type = "service"
# Only place this job on nodes whose kernel name is "linux".
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}
# First task group: one sleeping raw_exec task.
group "group1" {
task "task1" {
driver = "raw_exec"
config {
command = "/bin/sleep"
args = ["infinity"]
}
# Small resource request; the task only sleeps.
resources {
cpu = 50
memory = 50
}
}
}
# Second task group: identical to group1 apart from the group and task names.
group "group2" {
task "task2" {
driver = "raw_exec"
config {
command = "/bin/sleep"
args = ["infinity"]
}
# Small resource request; the task only sleeps.
resources {
cpu = 50
memory = 50
}
}
}
}