Merge pull request #12875 from hashicorp/b-cgroupsv2-task-restarts

cgroups: make sure cgroup still exists after task restart
This commit is contained in:
Seth Hoenig 2022-05-05 10:54:29 -05:00 committed by GitHub
commit 90ff784dcf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 52 additions and 5 deletions

View File

@ -783,6 +783,7 @@ func (tr *TaskRunner) runDriver() error {
taskConfig := tr.buildTaskConfig()
if tr.cpusetCgroupPathGetter != nil {
tr.logger.Trace("waiting for cgroup to exist for", "allocID", tr.allocID, "task", tr.task)
cpusetCgroupPath, err := tr.cpusetCgroupPathGetter(tr.killCtx)
if err != nil {
return err

View File

@ -253,6 +253,11 @@ func TestTaskRunner_Stop_ExitCode(t *testing.T) {
"command": "/bin/sleep",
"args": []string{"1000"},
}
task.Env = map[string]string{
"NOMAD_PARENT_CGROUP": "nomad.slice",
"NOMAD_ALLOC_ID": alloc.ID,
"NOMAD_TASK_NAME": task.Name,
}
conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
defer cleanup()
@ -347,14 +352,17 @@ func TestTaskRunner_Restore_Running(t *testing.T) {
// returned once it is running and waiting in pending along with a cleanup
// func.
func setupRestoreFailureTest(t *testing.T, alloc *structs.Allocation) (*TaskRunner, *Config, func()) {
ci.Parallel(t)
task := alloc.Job.TaskGroups[0].Tasks[0]
task.Driver = "raw_exec"
task.Config = map[string]interface{}{
"command": "sleep",
"args": []string{"30"},
}
task.Env = map[string]string{
"NOMAD_PARENT_CGROUP": "nomad.slice",
"NOMAD_ALLOC_ID": alloc.ID,
"NOMAD_TASK_NAME": task.Name,
}
conf, cleanup1 := testTaskRunnerConfig(t, alloc, task.Name)
conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between runs
@ -503,6 +511,11 @@ func TestTaskRunner_Restore_System(t *testing.T) {
"command": "sleep",
"args": []string{"30"},
}
task.Env = map[string]string{
"NOMAD_PARENT_CGROUP": "nomad.slice",
"NOMAD_ALLOC_ID": alloc.ID,
"NOMAD_TASK_NAME": task.Name,
}
conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
defer cleanup()
conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between runs
@ -718,7 +731,11 @@ func TestTaskRunner_TaskEnv_None(t *testing.T) {
"echo $PATH",
},
}
task.Env = map[string]string{
"NOMAD_PARENT_CGROUP": "nomad.slice",
"NOMAD_ALLOC_ID": alloc.ID,
"NOMAD_TASK_NAME": task.Name,
}
tr, conf, cleanup := runTestTaskRunner(t, alloc, task.Name)
defer cleanup()
@ -1780,6 +1797,11 @@ func TestTaskRunner_Download_RawExec(t *testing.T) {
task.Config = map[string]interface{}{
"command": "noop.sh",
}
task.Env = map[string]string{
"NOMAD_PARENT_CGROUP": "nomad.slice",
"NOMAD_ALLOC_ID": alloc.ID,
"NOMAD_TASK_NAME": task.Name,
}
task.Artifacts = []*structs.TaskArtifact{
{
GetterSource: fmt.Sprintf("%s/testdata/noop.sh", ts.URL),

View File

@ -143,8 +143,10 @@ func (d *killer) v2(cgroup *configs.Cgroup) error {
return err
}
// remove the cgroup from disk
return mgr.Destroy()
// note: do NOT remove the cgroup from disk; leave that to the alloc-level
// cpuset manager.
return nil
}
// kill is used to SIGKILL all processes in cgroup

View File

@ -581,6 +581,25 @@ func (d *Driver) StopTask(taskID string, timeout time.Duration, signal string) e
return nil
}
// resetCgroup re-creates the cgroups v2 directory for the task after the task
// has been destroyed by libcontainer. In the case of a task restart we call
// DestroyTask, which removes the cgroup - but we still need it!
//
// This is best-effort: it does nothing unless cgutil.UseV2 is set and the task
// config carries a non-empty CpusetCgroupPath, and a failure to create the
// directory is only logged at trace level (nothing is returned to the caller).
//
// Ideally the cgroup management would be more unified - and we could do the
// creation in a task runner pre-start hook, eliminating the need for this hack.
func (d *Driver) resetCgroup(handle *taskHandle) {
	if !cgutil.UseV2 {
		return
	}

	resources := handle.taskConfig.Resources
	if resources == nil || resources.LinuxResources == nil {
		return
	}

	path := resources.LinuxResources.CpusetCgroupPath
	if path == "" {
		return
	}

	// A directory that already exists means there is nothing to reset, so it
	// is not reported as a failure. Any other error is logged together with
	// its cause so the failure can actually be diagnosed.
	if err := os.Mkdir(path, 0755); err != nil && !os.IsExist(err) {
		d.logger.Trace("failed to reset cgroup", "path", path, "error", err)
	}
}
func (d *Driver) DestroyTask(taskID string, force bool) error {
handle, ok := d.tasks.Get(taskID)
if !ok {
@ -599,6 +618,9 @@ func (d *Driver) DestroyTask(taskID string, force bool) error {
handle.pluginClient.Kill()
}
// workaround for the case where DestroyTask was issued on task restart
d.resetCgroup(handle)
d.tasks.Delete(taskID)
return nil
}