open-nomad/client/lib/cgutil/group_killer.go

//go:build linux

package cgutil

import (
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"time"

	"github.com/hashicorp/go-hclog"
	"github.com/opencontainers/runc/libcontainer/cgroups"
	"github.com/opencontainers/runc/libcontainer/cgroups/fs"
	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
	"github.com/opencontainers/runc/libcontainer/configs"
)

// freezer is the name of the cgroup subsystem used for stopping / starting
// a group of processes. In this file it is used on the cgroups v1 path, both
// to look up the init cgroup path and as the subsystem key when moving the
// executor pid into that cgroup.
const freezer = "freezer"

// thawed and frozen are the two states we put a cgroup in when trying to
// remove it. They are the resource settings handed to fs.FreezerGroup.Set on
// the cgroups v1 path: frozen before the pids are collected and signalled,
// thawed afterwards.
var (
	thawed = &configs.Resources{Freezer: configs.Thawed}
	frozen = &configs.Resources{Freezer: configs.Frozen}
)

// GroupKiller is used for SIGKILL-ing the process tree[s] of a cgroup by leveraging
// the freezer cgroup subsystem.
//
// Use NewGroupKiller to obtain an implementation.
type GroupKiller interface {
	// KillGroup issues SIGKILL to the processes in cgroup. The implementation
	// in this file returns an error when cgroup is nil.
	KillGroup(cgroup *configs.Cgroup) error
}

// NewGroupKiller returns a GroupKiller for the executor whose PID is pid.
// Messages are emitted through logger under the sub-name "group_killer".
func NewGroupKiller(logger hclog.Logger, pid int) GroupKiller {
	k := new(killer)
	k.pid = pid
	k.logger = logger.Named("group_killer")
	return k
}

// killer is the GroupKiller implementation returned by NewGroupKiller.
type killer struct {
	// logger receives the trace output of the kill procedure; NewGroupKiller
	// sets it to the caller's logger named "group_killer".
	logger hclog.Logger
	// pid is the executor PID. It is moved out of the task cgroup before the
	// cgroup's processes are killed so the executor itself survives.
	pid int
}

// KillGroup sends SIGKILL to the process tree living in cgroup, relying on the
// freezer to keep the tree from forking while it is being signalled. The work
// is delegated to the cgroups v2 implementation when UseV2 is set and to the
// cgroups v1 implementation otherwise.
func (d *killer) KillGroup(cgroup *configs.Cgroup) error {
	switch {
	case UseV2:
		return d.v2(cgroup)
	default:
		return d.v1(cgroup)
	}
}

// v1 kills the processes of a task on a cgroups v1 system and then removes the
// task's freezer cgroup from disk.
//
// cgroup.Path is used directly as the path of the task's freezer cgroup. The
// executor pid is first moved into the init freezer cgroup so that it is not
// among the processes being killed.
//
// It returns an error when cgroup is nil, when the init cgroup cannot be found,
// when the executor pid cannot be moved, when the pids of the cgroup cannot be
// listed, or when the cgroup cannot be removed. Failures to freeze or thaw the
// cgroup are best-effort: they are logged as warnings and do not abort the kill.
func (d *killer) v1(cgroup *configs.Cgroup) error {
	if cgroup == nil {
		return errors.New("missing cgroup")
	}

	// the actual path to our tasks freezer cgroup
	path := cgroup.Path

	d.logger.Trace("killing processes", "cgroup_path", path, "cgroup_version", "v1", "executor_pid", d.pid)

	// move executor PID into the init freezer cgroup so we can kill the task
	// pids without killing the executor (which is the process running this code,
	// doing the killing)
	initPath, err := cgroups.GetInitCgroupPath(freezer)
	if err != nil {
		return fmt.Errorf("failed to find init cgroup: %w", err)
	}
	m := map[string]string{freezer: initPath}
	if err = cgroups.EnterPid(m, d.pid); err != nil {
		return fmt.Errorf("failed to add executor pid to init cgroup: %w", err)
	}

	// ability to freeze the cgroup; a failure is surfaced in the log rather
	// than dropped, since an unfrozen cgroup can keep forking while we kill
	freeze := func() {
		if freezeErr := new(fs.FreezerGroup).Set(path, frozen); freezeErr != nil {
			d.logger.Warn("failed to freeze cgroup", "cgroup_path", path, "error", freezeErr)
		}
	}

	// ability to thaw the cgroup; a failure is surfaced in the log rather than
	// dropped, since processes left frozen cannot act on the pending SIGKILL
	thaw := func() {
		if thawErr := new(fs.FreezerGroup).Set(path, thawed); thawErr != nil {
			d.logger.Warn("failed to thaw cgroup", "cgroup_path", path, "error", thawErr)
		}
	}

	// do the common kill logic
	if err = d.kill(path, freeze, thaw); err != nil {
		return err
	}

	// remove the cgroup from disk
	return cgroups.RemovePath(path)
}

// v2 kills the processes of a task on a cgroups v2 system. Unlike the v1 path
// it leaves the cgroup itself on disk.
//
// The task cgroup is located at CgroupRoot joined with cgroup.Path. The
// executor pid is first moved into the init.scope cgroup under CgroupRoot so
// that it is not among the processes being killed.
//
// It returns an error when cgroup is nil, when the pids of the cgroup cannot be
// listed, when either cgroup manager cannot be created, or when the executor
// pid cannot be moved. Failures to freeze or thaw the cgroup are best-effort:
// they are logged as warnings and do not abort the kill.
func (d *killer) v2(cgroup *configs.Cgroup) error {
	if cgroup == nil {
		return errors.New("missing cgroup")
	}

	path := filepath.Join(CgroupRoot, cgroup.Path)

	// the pids found here are only used for the trace message below; the
	// authoritative scan happens in kill once the cgroup is frozen
	existingPIDs, err := cgroups.GetPids(path)
	if err != nil {
		return fmt.Errorf("failed to determine pids in cgroup: %w", err)
	}

	d.logger.Trace("killing processes", "cgroup_path", path, "cgroup_version", "v2", "executor_pid", d.pid, "existing_pids", existingPIDs)

	mgr, err := fs2.NewManager(cgroup, "")
	if err != nil {
		return fmt.Errorf("failed to create v2 cgroup manager: %w", err)
	}

	// move executor PID into the root init.scope so we can kill the task pids
	// without killing the executor (which is the process running this code, doing
	// the killing)
	initMgr, err := fs2.NewManager(nil, filepath.Join(CgroupRoot, "init.scope"))
	if err != nil {
		return fmt.Errorf("failed to create v2 init cgroup manager: %w", err)
	}
	if err = initMgr.Apply(d.pid); err != nil {
		return fmt.Errorf("failed to move executor pid into init.scope cgroup: %w", err)
	}

	d.logger.Trace("move of executor pid into init.scope complete", "pid", d.pid)

	// ability to freeze the cgroup; a failure is surfaced in the log rather
	// than dropped, since an unfrozen cgroup can keep forking while we kill
	freeze := func() {
		if freezeErr := mgr.Freeze(configs.Frozen); freezeErr != nil {
			d.logger.Warn("failed to freeze cgroup", "cgroup_path", path, "error", freezeErr)
		}
	}

	// ability to thaw the cgroup; a failure is surfaced in the log rather than
	// dropped, since processes left frozen cannot act on the pending SIGKILL
	thaw := func() {
		if thawErr := mgr.Freeze(configs.Thawed); thawErr != nil {
			d.logger.Warn("failed to thaw cgroup", "cgroup_path", path, "error", thawErr)
		}
	}

	// do the common kill logic
	if err = d.kill(path, freeze, thaw); err != nil {
		return err
	}

	// note: do NOT remove the cgroup from disk; leave that to the alloc-level
	// cpuset manager.

	return nil
}

// kill delivers SIGKILL to every process listed in the cgroup at path cgroup.
//
// Callers are expected to have moved the executor pid out of the cgroup
// beforehand. The sequence is then:
//  1. call freeze (supplied by the caller to freeze the cgroup, so its
//     processes cannot fork further)
//  2. list the pids currently in the cgroup
//  3. send SIGKILL to each listed pid
//  4. call thaw (supplied by the caller) so the signalled processes can go die
//  5. wait on every process that was found
//
// When the pids cannot be listed, thaw is still called and the wrapped error
// is returned. Failing to find, kill, or wait on an individual process is only
// trace-logged; in that case kill still returns nil.
func (d *killer) kill(cgroup string, freeze func(), thaw func()) error {
	// stop the cgroup before looking inside it
	freeze()

	d.logger.Trace("search for pids in", "cgroup", cgroup)

	// collect the pids we are about to signal
	targets, listErr := cgroups.GetPids(cgroup)
	if listErr != nil {
		// re-thaw before bailing so there is at least a chance the
		// processes can go die out of band
		thaw()
		return fmt.Errorf("failed to find pids: %w", listErr)
	}

	d.logger.Trace("send sigkill to frozen processes", "cgroup", cgroup, "pids", targets)

	// signal every pid; each process we could look up is remembered so it
	// can be waited on later, whether or not the signal was delivered
	var found []*os.Process
	for _, target := range targets {
		proc, err := os.FindProcess(target)
		if err != nil {
			d.logger.Trace("failed to find process of pid to kill", "pid", target, "error", err)
			continue
		}
		found = append(found, proc)
		if err = proc.Kill(); err != nil {
			d.logger.Trace("failed to kill process", "pid", target, "error", err)
		}
	}

	// the cgroup must be thawed before the processes can act on the signal
	thaw()

	for _, proc := range found {
		// the wait error is dropped on purpose; errors are normal here
		state, _ := proc.Wait()
		d.logger.Trace("return from wait on process", "pid", proc.Pid, "state", state)
	}

	// cgroups are not atomic: the OS takes a moment to un-mark the cgroup as
	// in-use. A short pause avoids noisy (but functionally benign) errors
	// about removing a busy cgroup. Retrying the removal in a loop and
	// silencing the interim errors would be the alternative.
	time.Sleep(50 * time.Millisecond)

	return nil
}
raw_exec: make raw exec driver work with cgroups v2 This PR adds support for the raw_exec driver on systems with only cgroups v2. The raw exec driver is able to use cgroups to manage processes. This happens only on Linux, when exec_driver is enabled, and the no_cgroups option is not set. The driver uses the freezer controller to freeze processes of a task, issue a sigkill, then unfreeze. Previously the implementation assumed cgroups v1, and now it also supports cgroups v2. There is a bit of refactoring in this PR, but the fundamental design remains the same. Closes #12351 #12348 2022-03-29 00:33:01 +00:00			`//go:build linux`

			`package cgutil`

			`import (`
			`"errors"`
			`"fmt"`
			`"os"`
			`"path/filepath"`
			`"time"`

			`"github.com/hashicorp/go-hclog"`
			`"github.com/opencontainers/runc/libcontainer/cgroups"`
			`"github.com/opencontainers/runc/libcontainer/cgroups/fs"`
			`"github.com/opencontainers/runc/libcontainer/cgroups/fs2"`
			`"github.com/opencontainers/runc/libcontainer/configs"`
			`)`

			`// freezer is the name of the cgroup subsystem used for stopping / starting`
			`// a group of processes`
			`const freezer = "freezer"`

			`// thawed and frozen are the two states we put a cgroup in when trying to remove it`
			`var (`
			`thawed = &configs.Resources{Freezer: configs.Thawed}`
			`frozen = &configs.Resources{Freezer: configs.Frozen}`
			`)`

			`// GroupKiller is used for SIGKILL-ing the process tree[s] of a cgroup by leveraging`
			`// the freezer cgroup subsystem.`
			`type GroupKiller interface {`
			`KillGroup(cgroup *configs.Cgroup) error`
			`}`

			`// NewGroupKiller creates a GroupKiller with executor PID pid.`
			`func NewGroupKiller(logger hclog.Logger, pid int) GroupKiller {`
			`return &killer{`
			`logger: logger.Named("group_killer"),`
			`pid: pid,`
			`}`
			`}`

			`type killer struct {`
			`logger hclog.Logger`
			`pid int`
			`}`

			`// KillGroup will SIGKILL the process tree present in cgroup, using the freezer`
			`// subsystem to prevent further forking, etc.`
			`func (d killer) KillGroup(cgroup configs.Cgroup) error {`
			`if UseV2 {`
			`return d.v2(cgroup)`
			`}`
			`return d.v1(cgroup)`
			`}`

			`func (d killer) v1(cgroup configs.Cgroup) error {`
			`if cgroup == nil {`
			`return errors.New("missing cgroup")`
			`}`

			`// the actual path to our tasks freezer cgroup`
deps: update opencontainers/runc to v1.1.3 2022-08-04 13:56:40 +00:00			`path := cgroup.Path`
raw_exec: make raw exec driver work with cgroups v2 This PR adds support for the raw_exec driver on systems with only cgroups v2. The raw exec driver is able to use cgroups to manage processes. This happens only on Linux, when exec_driver is enabled, and the no_cgroups option is not set. The driver uses the freezer controller to freeze processes of a task, issue a sigkill, then unfreeze. Previously the implementation assumed cgroups v1, and now it also supports cgroups v2. There is a bit of refactoring in this PR, but the fundamental design remains the same. Closes #12351 #12348 2022-03-29 00:33:01 +00:00
			`d.logger.Trace("killing processes", "cgroup_path", path, "cgroup_version", "v1", "executor_pid", d.pid)`

			`// move executor PID into the init freezer cgroup so we can kill the task`
			`// pids without killing the executor (which is the process running this code,`
			`// doing the killing)`
			`initPath, err := cgroups.GetInitCgroupPath(freezer)`
			`if err != nil {`
			`return fmt.Errorf("failed to find init cgroup: %w", err)`
			`}`
			`m := map[string]string{freezer: initPath}`
			`if err = cgroups.EnterPid(m, d.pid); err != nil {`
			`return fmt.Errorf("failed to add executor pid to init cgroup: %w", err)`
			`}`

			`// ability to freeze the cgroup`
			`freeze := func() {`
			`_ = new(fs.FreezerGroup).Set(path, frozen)`
			`}`

			`// ability to thaw the cgroup`
			`thaw := func() {`
			`_ = new(fs.FreezerGroup).Set(path, thawed)`
			`}`

			`// do the common kill logic`
			`if err = d.kill(path, freeze, thaw); err != nil {`
			`return err`
			`}`

			`// remove the cgroup from disk`
			`return cgroups.RemovePath(path)`
			`}`

			`func (d killer) v2(cgroup configs.Cgroup) error {`
			`if cgroup == nil {`
			`return errors.New("missing cgroup")`
			`}`

			`path := filepath.Join(CgroupRoot, cgroup.Path)`

			`existingPIDs, err := cgroups.GetPids(path)`
			`if err != nil {`
			`return fmt.Errorf("failed to determine pids in cgroup: %w", err)`
			`}`

			`d.logger.Trace("killing processes", "cgroup_path", path, "cgroup_version", "v2", "executor_pid", d.pid, "existing_pids", existingPIDs)`

deps: update opencontainers/runc to v1.1.3 2022-08-04 13:56:40 +00:00			`mgr, err := fs2.NewManager(cgroup, "")`
raw_exec: make raw exec driver work with cgroups v2 This PR adds support for the raw_exec driver on systems with only cgroups v2. The raw exec driver is able to use cgroups to manage processes. This happens only on Linux, when exec_driver is enabled, and the no_cgroups option is not set. The driver uses the freezer controller to freeze processes of a task, issue a sigkill, then unfreeze. Previously the implementation assumed cgroups v1, and now it also supports cgroups v2. There is a bit of refactoring in this PR, but the fundamental design remains the same. Closes #12351 #12348 2022-03-29 00:33:01 +00:00			`if err != nil {`
			`return fmt.Errorf("failed to create v2 cgroup manager: %w", err)`
			`}`

			`// move executor PID into the root init.scope so we can kill the task pids`
			`// without killing the executor (which is the process running this code, doing`
			`// the killing)`
deps: update opencontainers/runc to v1.1.3 2022-08-04 13:56:40 +00:00			`init, err := fs2.NewManager(nil, filepath.Join(CgroupRoot, "init.scope"))`
raw_exec: make raw exec driver work with cgroups v2 This PR adds support for the raw_exec driver on systems with only cgroups v2. The raw exec driver is able to use cgroups to manage processes. This happens only on Linux, when exec_driver is enabled, and the no_cgroups option is not set. The driver uses the freezer controller to freeze processes of a task, issue a sigkill, then unfreeze. Previously the implementation assumed cgroups v1, and now it also supports cgroups v2. There is a bit of refactoring in this PR, but the fundamental design remains the same. Closes #12351 #12348 2022-03-29 00:33:01 +00:00			`if err != nil {`
			`return fmt.Errorf("failed to create v2 init cgroup manager: %w", err)`
			`}`
			`if err = init.Apply(d.pid); err != nil {`
			`return fmt.Errorf("failed to move executor pid into init.scope cgroup: %w", err)`
			`}`

			`d.logger.Trace("move of executor pid into init.scope complete", "pid", d.pid)`

			`// ability to freeze the cgroup`
			`freeze := func() {`
			`_ = mgr.Freeze(configs.Frozen)`
			`}`

			`// ability to thaw the cgroup`
			`thaw := func() {`
			`_ = mgr.Freeze(configs.Thawed)`
			`}`

			`// do the common kill logic`

			`if err = d.kill(path, freeze, thaw); err != nil {`
			`return err`
			`}`

cgroups: make sure cgroup still exists after task restart This PR modifies raw_exec and exec to ensure the cgroup for a task they are driving still exists during a task restart. These drivers have the same bug but with different root cause. For raw_exec, we were removing the cgroup in 2 places - the cpuset manager, and in the unix containment implementation (the thing that uses freezer cgroup to clean house). During a task restart, the containment would remove the cgroup, and when the task runner hooks went to start again would block on waiting for the cgroup to exist, which will never happen, because it gets created by the cpuset manager which only runs as an alloc pre-start hook. The fix here is to simply not delete the cgroup in the containment implementation; killing the PIDs is enough. The removal happens in the cpuset manager later anyway. For exec, it's the same idea, except DestroyTask is called on task failure, which in turn calls into libcontainer, which in turn deletes the cgroup. In this case we do not have control over the deletion of the cgroup, so instead we hack the cgroup back into life after the call to DestroyTask. All of this only applies to cgroups v2. 2022-05-04 18:51:53 +00:00			`// note: do NOT remove the cgroup from disk; leave that to the alloc-level`
			`// cpuset manager.`

			`return nil`
raw_exec: make raw exec driver work with cgroups v2 This PR adds support for the raw_exec driver on systems with only cgroups v2. The raw exec driver is able to use cgroups to manage processes. This happens only on Linux, when exec_driver is enabled, and the no_cgroups option is not set. The driver uses the freezer controller to freeze processes of a task, issue a sigkill, then unfreeze. Previously the implementation assumed cgroups v1, and now it also supports cgroups v2. There is a bit of refactoring in this PR, but the fundamental design remains the same. Closes #12351 #12348 2022-03-29 00:33:01 +00:00			`}`

			`// kill is used to SIGKILL all processes in cgroup`
			`//`
			`// The order of operations is`
			`// 0. before calling this method, the executor pid has been moved outside of cgroup`
			`// 1. freeze cgroup (so processes cannot fork further)`
			`// 2. scan the cgroup to collect all pids`
			`// 3. issue SIGKILL to each pid found`
			`// 4. thaw the cgroup so processes can go die`
			`// 5. wait on each processes until it is confirmed dead`
			`func (d *killer) kill(cgroup string, freeze func(), thaw func()) error {`
			`// freeze the cgroup stopping further forking`
			`freeze()`

			`d.logger.Trace("search for pids in", "cgroup", cgroup)`

			`// find all the pids we intend to kill`
			`pids, err := cgroups.GetPids(cgroup)`
			`if err != nil {`
			`// if we fail to get pids, re-thaw before bailing so there is at least`
			`// a chance the processes can go die out of band`
			`thaw()`
			`return fmt.Errorf("failed to find pids: %w", err)`
			`}`

			`d.logger.Trace("send sigkill to frozen processes", "cgroup", cgroup, "pids", pids)`

			`var processes []*os.Process`

			`// kill the processes in cgroup`
			`for _, pid := range pids {`
			`p, findErr := os.FindProcess(pid)`
			`if findErr != nil {`
			`d.logger.Trace("failed to find process of pid to kill", "pid", pid, "error", findErr)`
			`continue`
			`}`
			`processes = append(processes, p)`
			`if killErr := p.Kill(); killErr != nil {`
			`d.logger.Trace("failed to kill process", "pid", pid, "error", killErr)`
			`continue`
			`}`
			`}`

			`// thawed the cgroup so we can wait on each process`
			`thaw()`

			`// wait on each process`
			`for _, p := range processes {`
			`// do not capture error; errors are normal here`
			`pState, _ := p.Wait()`
			`d.logger.Trace("return from wait on process", "pid", p.Pid, "state", pState)`
			`}`

			`// cgroups are not atomic, the OS takes a moment to un-mark the cgroup as in-use;`
			`// a tiny sleep here goes a long way for not creating noisy (but functionally benign)`
			`// errors about removing busy cgroup`
			`//`
			`// alternatively we could do the removal in a loop and silence the interim errors, but meh`
			`time.Sleep(50 * time.Millisecond)`

			`return nil`
			`}`