open-nomad/client/lib/cgutil/cpuset_manager_v2.go

//go:build linux

package cgutil

import (
	"context"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/go-hclog"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/lib/cpuset"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/opencontainers/runc/libcontainer/cgroups"
	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
	"github.com/opencontainers/runc/libcontainer/configs"
)

const (
	// CreationPID is a special PID in libcontainer used to denote a cgroup
	// should be created, but with no process added.
	//
	// https://github.com/opencontainers/runc/blob/v1.0.3/libcontainer/cgroups/utils.go#L372
	CreationPID = -1

	// DefaultCgroupParentV2 is the name of Nomad's default parent cgroup, under
	// which all other cgroups are managed. This can be changed with client
	// configuration, e.g. in case Nomad tasks should be further constrained by
	// an externally configured systemd cgroup.
	DefaultCgroupParentV2 = "nomad.slice"

	// rootless is (for now) always false; Nomad clients require root, so we
	// assume we do not need the extra plumbing for rootless cgroups.
	rootless = false
)

// nothing is used for treating a map like a set with no values.
type nothing struct{}

// present indicates something exists.
var present = nothing{}

// cpusetManagerV2 manages the Nomad task cpuset cgroups on cgroups v2 hosts.
type cpusetManagerV2 struct {
	logger hclog.Logger

	parent    string        // relative to cgroup root (e.g. "nomad.slice")
	parentAbs string        // absolute path (e.g. "/sys/fs/cgroup/nomad.slice")
	initial   cpuset.CPUSet // set of initial cores (never changes)

	lock      sync.Mutex                 // hold this when managing pool / sharing / isolating
	pool      cpuset.CPUSet              // pool of cores being shared among all tasks
	sharing   map[identity]nothing       // sharing tasks using cores only from the pool
	isolating map[identity]cpuset.CPUSet // isolating tasks using cores from the pool + reserved cores
}

func NewCpusetManagerV2(parent string, logger hclog.Logger) CpusetManager {
	return &cpusetManagerV2{
		parent:    parent,
		parentAbs: filepath.Join(CgroupRoot, parent),
		logger:    logger,
		sharing:   make(map[identity]nothing),
		isolating: make(map[identity]cpuset.CPUSet),
	}
}
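
// For illustration only, a sketch of how a client might wire the manager up;
// the logger and alloc variables here are assumptions, not part of this file:
//
//	mgr := NewCpusetManagerV2(DefaultCgroupParentV2, logger)
//	if err := mgr.Init([]uint16{0, 1, 2, 3}); err != nil {
//		// handle initialization failure
//	}
//	mgr.AddAlloc(alloc) // begin managing an allocation's cpusets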

func (c *cpusetManagerV2) Init(cores []uint16) error {
	c.logger.Debug("initializing with", "cores", cores)
	if err := c.ensureParent(); err != nil {
		c.logger.Error("failed to init cpuset manager", "err", err)
		return err
	}
	c.initial = cpuset.New(cores...)
	return nil
}

func (c *cpusetManagerV2) AddAlloc(alloc *structs.Allocation) {
	if alloc == nil || alloc.AllocatedResources == nil {
		return
	}
	c.logger.Trace("add allocation", "name", alloc.Name, "id", alloc.ID)

	// grab write lock while we recompute and apply changes
	c.lock.Lock()
	defer c.lock.Unlock()

	// first update our tracking of isolating and sharing tasks
	for task, resources := range alloc.AllocatedResources.Tasks {
		id := makeID(alloc.ID, task)
		if len(resources.Cpu.ReservedCores) > 0 {
			c.isolating[id] = cpuset.New(resources.Cpu.ReservedCores...)
		} else {
			c.sharing[id] = present
		}
	}

	// recompute the available sharable cpu cores
	c.recalculate()

	// now write out the entire cgroups space
	c.reconcile()

	// no need to cleanup on adds, we did not remove a task
}
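
// A worked example of the bookkeeping above, with hypothetical IDs: given
// initial cores 0-7, a task "web" of alloc "abc123" reserving cores {2,3} is
// recorded in isolating under an identity like "abc123.web" (the exact format
// comes from makeID, defined elsewhere), while a task with no reserved cores
// lands in sharing. recalculate() then shrinks the shared pool to
// {0,1,4,5,6,7}, and reconcile() writes the pool to sharing tasks and the
// pool plus {2,3} to the isolating task.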

func (c *cpusetManagerV2) RemoveAlloc(allocID string) {
	c.logger.Trace("remove allocation", "id", allocID)

	// grab write lock while we recompute and apply changes
	c.lock.Lock()
	defer c.lock.Unlock()

	// remove tasks of allocID from the sharing set
	for id := range c.sharing {
		if strings.HasPrefix(string(id), allocID) {
			delete(c.sharing, id)
		}
	}

	// remove tasks of allocID from the isolating set
	for id := range c.isolating {
		if strings.HasPrefix(string(id), allocID) {
			delete(c.isolating, id)
		}
	}

	// recompute available sharable cpu cores
	c.recalculate()

	// now write out the entire cgroups space
	c.reconcile()

	// now remove the cgroups of any tasks no longer being tracked
	c.cleanup()
}

func (c *cpusetManagerV2) CgroupPathFor(allocID, task string) CgroupPathGetter {
	// The CgroupPathFor implementation must block until the cgroup for
	// allocID.task exists (and can accept a PID).
	return func(ctx context.Context) (string, error) {
		ticks, cancel := helper.NewSafeTimer(100 * time.Millisecond)
		defer cancel()

		for {
			path := c.pathOf(makeID(allocID, task))
			mgr, err := fs2.NewManager(nil, path, rootless)
			if err != nil {
				return "", err
			}

			if mgr.Exists() {
				return path, nil
			}

			select {
			case <-ctx.Done():
				return "", ctx.Err()
			case <-ticks.C:
				// re-arm the one-shot timer before the next wait; otherwise a
				// second pass through the loop would block on ticks.C forever
				ticks.Reset(100 * time.Millisecond)
				continue
			}
		}
	}
}
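
// For illustration, a caller (e.g. a task driver hook) might use the getter
// like this; the manager, allocID, taskName, and ctx names are assumptions:
//
//	getter := manager.CgroupPathFor(allocID, taskName)
//	path, err := getter(ctx) // blocks until the scope exists, or ctx expires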

// recalculate the pool of cores sharable by non-isolating tasks: the initial
// set minus every core reserved by an isolating task. (Isolating tasks are
// also given access to this pool; see reconcile.)
//
// must be called while holding c.lock
func (c *cpusetManagerV2) recalculate() {
	remaining := c.initial.Copy()
	for _, set := range c.isolating {
		remaining = remaining.Difference(set)
	}
	c.pool = remaining
}
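
// For example: with initial = {0..7} and two isolating tasks reserving {2,3}
// and {5}, the pool becomes {0,1,4,6,7}.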

// reconcile will actually write the cpuset values for all tracked tasks.
//
// must be called while holding c.lock
func (c *cpusetManagerV2) reconcile() {
	for id := range c.sharing {
		c.write(id, c.pool)
	}
	for id, set := range c.isolating {
		c.write(id, c.pool.Union(set))
	}
}

// cleanup will remove any cgroups for allocations no longer being tracked
//
// must be called while holding c.lock
func (c *cpusetManagerV2) cleanup() {
	// create a map to lookup ids we know about
	size := len(c.sharing) + len(c.isolating)
	ids := make(map[identity]nothing, size)
	for id := range c.sharing {
		ids[id] = present
	}
	for id := range c.isolating {
		ids[id] = present
	}

	if err := filepath.WalkDir(c.parentAbs, func(path string, entry os.DirEntry, err error) error {
		// propagate walk errors; entry may be nil when err is non-nil
		if err != nil {
			return err
		}

		// a cgroup is a directory
		if !entry.IsDir() {
			return nil
		}

		dir := filepath.Dir(path)
		base := filepath.Base(path)

		// only manage scopes directly under nomad.slice
		if dir != c.parentAbs || !strings.HasSuffix(base, ".scope") {
			return nil
		}

		// only remove the scope if we do not track it
		id := identity(strings.TrimSuffix(base, ".scope"))
		if _, exists := ids[id]; !exists {
			c.remove(path)
		}

		return nil
	}); err != nil {
		c.logger.Error("failed to cleanup cgroup", "err", err)
	}
}
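
// For example (hypothetical names): if /sys/fs/cgroup/nomad.slice contains a
// directory abc123.web.scope but "abc123.web" is in neither sharing nor
// isolating, cleanup destroys that scope, provided it holds no PID (see
// remove below).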

// pathOf returns the absolute path to the cgroup of the task with identity id.
func (c *cpusetManagerV2) pathOf(id identity) string {
	return filepath.Join(c.parentAbs, makeScope(id))
}

// remove does the actual fs delete of the cgroup
//
// We avoid removing a cgroup if it still contains a PID, as the cpuset manager
// may be initially empty on a Nomad client restart.
func (c *cpusetManagerV2) remove(path string) {
	mgr, err := fs2.NewManager(nil, path, rootless)
	if err != nil {
		c.logger.Warn("failed to create manager", "path", path, "err", err)
		return
	}

	// get the list of pids managed by this scope (should be 0 or 1)
	pids, _ := mgr.GetPids()

	// do not destroy the scope if a PID is still present; this is a normal
	// condition when an agent restarts with running tasks and the v2 manager
	// is still rebuilding its tracked tasks
	if len(pids) > 0 {
		return
	}

	// remove the cgroup, logging the error from Destroy itself (the original
	// logged the stale outer err instead)
	if err = mgr.Destroy(); err != nil {
		c.logger.Warn("failed to cleanup cgroup", "path", path, "err", err)
	}
}

// write does the actual write of cpuset set for cgroup id
func (c *cpusetManagerV2) write(id identity, set cpuset.CPUSet) {
	path := c.pathOf(id)

	// make a manager for the cgroup
	m, err := fs2.NewManager(nil, path, rootless)
	if err != nil {
		// m is nil on error; bail out rather than dereference it below
		c.logger.Error("failed to manage cgroup", "path", path, "err", err)
		return
	}

	// create the cgroup
	if err = m.Apply(CreationPID); err != nil {
		c.logger.Error("failed to apply cgroup", "path", path, "err", err)
		return
	}

	// set the cpuset value for the cgroup
	if err = m.Set(&configs.Resources{
		CpusetCpus: set.String(),
	}); err != nil {
		c.logger.Error("failed to set cgroup", "path", path, "err", err)
	}
}
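
// For example (hypothetical id): writing the set {0,1,4,5,6,7} for identity
// "abc123.web" creates /sys/fs/cgroup/nomad.slice/abc123.web.scope if needed
// and sets its cpuset.cpus to the set's string form, likely "0-1,4-7".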

// ensureParent will create the parent cgroup for the manager if it does not
// exist yet. No PIDs are added to any cgroup yet.
func (c *cpusetManagerV2) ensureParent() error {
	mgr, err := fs2.NewManager(nil, c.parentAbs, rootless)
	if err != nil {
		return err
	}

	if err = mgr.Apply(CreationPID); err != nil {
		return err
	}

	c.logger.Trace("establish cgroup hierarchy", "parent", c.parent)
	return nil
}

// fromRoot returns the joined filepath of group on the CgroupRoot.
func fromRoot(group string) string {
	return filepath.Join(CgroupRoot, group)
}

// getCPUsFromCgroupV2 retrieves the effective cpuset for the group, which must
// be directly under the cgroup root (i.e. the parent, like nomad.slice).
func getCPUsFromCgroupV2(group string) ([]uint16, error) {
	path := fromRoot(group)
	effective, err := cgroups.ReadFile(path, "cpuset.cpus.effective")
	if err != nil {
		return nil, err
	}
	set, err := cpuset.Parse(effective)
	if err != nil {
		return nil, err
	}
	return set.ToSlice(), nil
}
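
// For example, if nomad.slice's cpuset.cpus.effective contains "0-3,6", the
// returned slice is []uint16{0, 1, 2, 3, 6}.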

// getParentV2 returns parent if set, otherwise the default name of Nomad's
// parent cgroup (i.e. nomad.slice).
func getParentV2(parent string) string {
	if parent == "" {
		return DefaultCgroupParentV2
	}
	return parent
}
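
// For example: getParentV2("") returns "nomad.slice", while
// getParentV2("custom.slice") returns "custom.slice" unchanged.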