Merge pull request #12875 from hashicorp/b-cgroupsv2-task-restarts
cgroups: make sure cgroup still exists after task restart
This commit is contained in:
commit
90ff784dcf
|
@ -783,6 +783,7 @@ func (tr *TaskRunner) runDriver() error {
|
|||
|
||||
taskConfig := tr.buildTaskConfig()
|
||||
if tr.cpusetCgroupPathGetter != nil {
|
||||
tr.logger.Trace("waiting for cgroup to exist for", "allocID", tr.allocID, "task", tr.task)
|
||||
cpusetCgroupPath, err := tr.cpusetCgroupPathGetter(tr.killCtx)
|
||||
if err != nil {
|
||||
return err
|
||||
|
|
|
@ -253,6 +253,11 @@ func TestTaskRunner_Stop_ExitCode(t *testing.T) {
|
|||
"command": "/bin/sleep",
|
||||
"args": []string{"1000"},
|
||||
}
|
||||
task.Env = map[string]string{
|
||||
"NOMAD_PARENT_CGROUP": "nomad.slice",
|
||||
"NOMAD_ALLOC_ID": alloc.ID,
|
||||
"NOMAD_TASK_NAME": task.Name,
|
||||
}
|
||||
|
||||
conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
|
||||
defer cleanup()
|
||||
|
@ -347,14 +352,17 @@ func TestTaskRunner_Restore_Running(t *testing.T) {
|
|||
// returned once it is running and waiting in pending along with a cleanup
|
||||
// func.
|
||||
func setupRestoreFailureTest(t *testing.T, alloc *structs.Allocation) (*TaskRunner, *Config, func()) {
|
||||
ci.Parallel(t)
|
||||
|
||||
task := alloc.Job.TaskGroups[0].Tasks[0]
|
||||
task.Driver = "raw_exec"
|
||||
task.Config = map[string]interface{}{
|
||||
"command": "sleep",
|
||||
"args": []string{"30"},
|
||||
}
|
||||
task.Env = map[string]string{
|
||||
"NOMAD_PARENT_CGROUP": "nomad.slice",
|
||||
"NOMAD_ALLOC_ID": alloc.ID,
|
||||
"NOMAD_TASK_NAME": task.Name,
|
||||
}
|
||||
conf, cleanup1 := testTaskRunnerConfig(t, alloc, task.Name)
|
||||
conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between runs
|
||||
|
||||
|
@ -503,6 +511,11 @@ func TestTaskRunner_Restore_System(t *testing.T) {
|
|||
"command": "sleep",
|
||||
"args": []string{"30"},
|
||||
}
|
||||
task.Env = map[string]string{
|
||||
"NOMAD_PARENT_CGROUP": "nomad.slice",
|
||||
"NOMAD_ALLOC_ID": alloc.ID,
|
||||
"NOMAD_TASK_NAME": task.Name,
|
||||
}
|
||||
conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
|
||||
defer cleanup()
|
||||
conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between runs
|
||||
|
@ -718,7 +731,11 @@ func TestTaskRunner_TaskEnv_None(t *testing.T) {
|
|||
"echo $PATH",
|
||||
},
|
||||
}
|
||||
|
||||
task.Env = map[string]string{
|
||||
"NOMAD_PARENT_CGROUP": "nomad.slice",
|
||||
"NOMAD_ALLOC_ID": alloc.ID,
|
||||
"NOMAD_TASK_NAME": task.Name,
|
||||
}
|
||||
tr, conf, cleanup := runTestTaskRunner(t, alloc, task.Name)
|
||||
defer cleanup()
|
||||
|
||||
|
@ -1780,6 +1797,11 @@ func TestTaskRunner_Download_RawExec(t *testing.T) {
|
|||
task.Config = map[string]interface{}{
|
||||
"command": "noop.sh",
|
||||
}
|
||||
task.Env = map[string]string{
|
||||
"NOMAD_PARENT_CGROUP": "nomad.slice",
|
||||
"NOMAD_ALLOC_ID": alloc.ID,
|
||||
"NOMAD_TASK_NAME": task.Name,
|
||||
}
|
||||
task.Artifacts = []*structs.TaskArtifact{
|
||||
{
|
||||
GetterSource: fmt.Sprintf("%s/testdata/noop.sh", ts.URL),
|
||||
|
|
|
@ -143,8 +143,10 @@ func (d *killer) v2(cgroup *configs.Cgroup) error {
|
|||
return err
|
||||
}
|
||||
|
||||
// remove the cgroup from disk
|
||||
return mgr.Destroy()
|
||||
// note: do NOT remove the cgroup from disk; leave that to the alloc-level
|
||||
// cpuset mananager.
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// kill is used to SIGKILL all processes in cgroup
|
||||
|
|
|
@ -581,6 +581,25 @@ func (d *Driver) StopTask(taskID string, timeout time.Duration, signal string) e
|
|||
return nil
|
||||
}
|
||||
|
||||
// resetCgroup will re-create the v2 cgroup for the task after the task has been
|
||||
// destroyed by libcontainer. In the case of a task restart we call DestroyTask
|
||||
// which removes the cgroup - but we still need it!
|
||||
//
|
||||
// Ideally the cgroup management would be more unified - and we could do the creation
|
||||
// on a task runner pre-start hook, eliminating the need for this hack.
|
||||
func (d *Driver) resetCgroup(handle *taskHandle) {
|
||||
if cgutil.UseV2 {
|
||||
if handle.taskConfig.Resources != nil &&
|
||||
handle.taskConfig.Resources.LinuxResources != nil &&
|
||||
handle.taskConfig.Resources.LinuxResources.CpusetCgroupPath != "" {
|
||||
err := os.Mkdir(handle.taskConfig.Resources.LinuxResources.CpusetCgroupPath, 0755)
|
||||
if err != nil {
|
||||
d.logger.Trace("failed to reset cgroup", "path", handle.taskConfig.Resources.LinuxResources.CpusetCgroupPath)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (d *Driver) DestroyTask(taskID string, force bool) error {
|
||||
handle, ok := d.tasks.Get(taskID)
|
||||
if !ok {
|
||||
|
@ -599,6 +618,9 @@ func (d *Driver) DestroyTask(taskID string, force bool) error {
|
|||
handle.pluginClient.Kill()
|
||||
}
|
||||
|
||||
// workaround for the case where DestroyTask was issued on task restart
|
||||
d.resetCgroup(handle)
|
||||
|
||||
d.tasks.Delete(taskID)
|
||||
return nil
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue