open-nomad/drivers/shared/executor/executor_linux.go

// +build linux

package executor

import (
"context"
"fmt"
"os"
"os/exec"
"path"
"path/filepath"
"strings"
"syscall"
"time"

"github.com/armon/circbuf"
"github.com/hashicorp/consul-template/signals"
hclog "github.com/hashicorp/go-hclog"
multierror "github.com/hashicorp/go-multierror"
"github.com/hashicorp/nomad/client/stats"
cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/drivers/shared/executor/structs"
"github.com/hashicorp/nomad/helper/discover"
shelpers "github.com/hashicorp/nomad/helper/stats"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/opencontainers/runc/libcontainer"
"github.com/opencontainers/runc/libcontainer/cgroups"
cgroupFs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
lconfigs "github.com/opencontainers/runc/libcontainer/configs"
"github.com/syndtr/gocapability/capability"
)
const (
defaultCgroupParent = "nomad"
)
var (
// The statistics the executor exposes when using cgroups
ExecutorCgroupMeasuredMemStats = []string{"RSS", "Cache", "Swap", "Max Usage", "Kernel Usage", "Kernel Max Usage"}
ExecutorCgroupMeasuredCpuStats = []string{"System Mode", "User Mode", "Throttled Periods", "Throttled Time", "Percent"}

// allCaps is the list of all Linux capabilities, used to configure libcontainer
allCaps []string
)
// initialize the allCaps var with all capabilities available on the system
func init() {
last := capability.CAP_LAST_CAP
// workaround for RHEL6 which has no /proc/sys/kernel/cap_last_cap
if last == capability.Cap(63) {
last = capability.CAP_BLOCK_SUSPEND
}
for _, cap := range capability.List() {
if cap > last {
continue
}
allCaps = append(allCaps, fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String())))
}
}
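// A minimal sketch of the mapping built above (illustrative only, not called
// anywhere in this file): capability.CAP_SYS_ADMIN.String() returns
// "sys_admin", so the loop yields entries such as "CAP_SYS_ADMIN":
//
//	name := fmt.Sprintf("CAP_%s", strings.ToUpper(capability.CAP_SYS_ADMIN.String())) // "CAP_SYS_ADMIN"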
// LibcontainerExecutor implements an Executor with the runc/libcontainer api
type LibcontainerExecutor struct {
id string
command *structs.ExecCommand
logger hclog.Logger
totalCpuStats *stats.CpuStats
userCpuStats *stats.CpuStats
systemCpuStats *stats.CpuStats
pidCollector *pidCollector
container libcontainer.Container
userProc *libcontainer.Process
userProcExited chan interface{}
exitState *structs.ProcessState
}
func NewExecutorWithIsolation(logger hclog.Logger) structs.Executor {
logger = logger.Named("isolated_executor")
if err := shelpers.Init(); err != nil {
logger.Error("unable to initialize stats", "error", err)
}
return &LibcontainerExecutor{
id: strings.Replace(uuid.Generate(), "-", "_", -1),
logger: logger,
totalCpuStats: stats.NewCpuStats(),
userCpuStats: stats.NewCpuStats(),
systemCpuStats: stats.NewCpuStats(),
pidCollector: newPidCollector(logger),
}
}
// Launch creates a new container in libcontainer and starts a new process with it
func (l *LibcontainerExecutor) Launch(command *structs.ExecCommand) (*structs.ProcessState, error) {
l.logger.Info("launching command", "command", command.Cmd, "args", strings.Join(command.Args, " "))
// Find the nomad executable to launch the executor process with
bin, err := discover.NomadExecutable()
if err != nil {
return nil, fmt.Errorf("unable to find the nomad binary: %v", err)
}
if command.Resources == nil {
command.Resources = &structs.Resources{}
}
l.command = command
// Move to the root cgroup until process is started
subsystems, err := cgroups.GetAllSubsystems()
if err != nil {
return nil, err
}
if err := JoinRootCgroup(subsystems); err != nil {
return nil, err
}
// create a new factory which will store the container state in the allocDir
factory, err := libcontainer.New(
path.Join(command.TaskDir, "../alloc/container"),
libcontainer.Cgroupfs,
libcontainer.InitArgs(bin, "libcontainer-shim"),
)
if err != nil {
return nil, fmt.Errorf("failed to create factory: %v", err)
}
// A container groups processes under the same isolation enforcement
container, err := factory.Create(l.id, newLibcontainerConfig(command))
if err != nil {
return nil, fmt.Errorf("failed to create container(%s): %v", l.id, err)
}
l.container = container
// Look up the binary path and make it executable
absPath, err := lookupBin(command.TaskDir, command.Cmd)
if err != nil {
return nil, err
}
if err := makeExecutable(absPath); err != nil {
return nil, err
}
path := absPath
// Determine the path to run as it may have to be relative to the chroot.
rel, err := filepath.Rel(command.TaskDir, path)
if err != nil {
return nil, fmt.Errorf("failed to determine relative path base=%q target=%q: %v", command.TaskDir, path, err)
}
path = rel
combined := append([]string{path}, command.Args...)
stdout, err := command.Stdout()
if err != nil {
return nil, err
}
stderr, err := command.Stderr()
if err != nil {
return nil, err
}
// the task process will be started by the container
process := &libcontainer.Process{
Args: combined,
Env: command.Env,
Stdout: stdout,
Stderr: stderr,
Init: true,
}
if command.User != "" {
process.User = command.User
}
l.userProc = process
l.totalCpuStats = stats.NewCpuStats()
l.userCpuStats = stats.NewCpuStats()
l.systemCpuStats = stats.NewCpuStats()
// Starts the task
if err := container.Run(process); err != nil {
container.Destroy()
return nil, err
}
pid, err := process.Pid()
if err != nil {
container.Destroy()
return nil, err
}
// Join process cgroups
containerState, err := container.State()
if err != nil {
l.logger.Error("error entering user process cgroups", "executor_pid", os.Getpid(), "error", err)
}
if err := cgroups.EnterPid(containerState.CgroupPaths, os.Getpid()); err != nil {
l.logger.Error("error entering user process cgroups", "executor_pid", os.Getpid(), "error", err)
}
// start a goroutine to wait on the process to complete, so Wait calls can
// be multiplexed
l.userProcExited = make(chan interface{})
go l.pidCollector.collectPids(l.userProcExited, l.getAllPids)
go l.wait()
return &structs.ProcessState{
Pid: pid,
ExitCode: -1,
Time: time.Now(),
}, nil
}
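// A hedged usage sketch for Launch (the caller, task dir, and stdout/stderr
// wiring shown here are assumptions, not part of this file):
//
//	executor := NewExecutorWithIsolation(logger)
//	ps, err := executor.Launch(&structs.ExecCommand{
//		Cmd:     "/bin/sleep",
//		Args:    []string{"30"},
//		TaskDir: taskDir, // hypothetical chroot/task directory
//	})
//	if err == nil {
//		logger.Info("task started", "pid", ps.Pid)
//	}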
func (l *LibcontainerExecutor) getAllPids() (map[int]*nomadPid, error) {
pids, err := l.container.Processes()
if err != nil {
return nil, err
}
nPids := make(map[int]*nomadPid)
for _, pid := range pids {
nPids[pid] = &nomadPid{
pid: pid,
cpuStatsTotal: stats.NewCpuStats(),
cpuStatsUser: stats.NewCpuStats(),
cpuStatsSys: stats.NewCpuStats(),
}
}
return nPids, nil
}
// Wait waits until the user process has exited and returns its exit code and errors
func (l *LibcontainerExecutor) Wait(ctx context.Context) (*structs.ProcessState, error) {
select {
case <-ctx.Done():
return nil, ctx.Err()
case <-l.userProcExited:
return l.exitState, nil
}
}
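// Usage sketch (assumed caller): Wait blocks until the user process exits or
// the supplied context is done:
//
//	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
//	defer cancel()
//	if ps, err := executor.Wait(ctx); err == nil {
//		logger.Info("task exited", "exit_code", ps.ExitCode)
//	}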
func (l *LibcontainerExecutor) wait() {
defer close(l.userProcExited)
ps, err := l.userProc.Wait()
if err != nil {
// If the process has exited before we called wait, an error is returned;
// the process state is embedded in the error
if exitErr, ok := err.(*exec.ExitError); ok {
ps = exitErr.ProcessState
} else {
l.logger.Error("failed to call wait on user process", "error", err)
l.exitState = &structs.ProcessState{Pid: 0, ExitCode: 1, Time: time.Now()}
return
}
}
l.command.Close()
exitCode := 1
var signal int
if status, ok := ps.Sys().(syscall.WaitStatus); ok {
exitCode = status.ExitStatus()
if status.Signaled() {
const exitSignalBase = 128
signal = int(status.Signal())
exitCode = exitSignalBase + signal
}
}
l.exitState = &structs.ProcessState{
Pid: ps.Pid(),
ExitCode: exitCode,
Signal: signal,
Time: time.Now(),
}
}
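// The exitSignalBase arithmetic above follows the common shell convention that
// a process terminated by signal N reports exit code 128+N; for example a
// SIGKILL (signal 9) surfaces as exit code 128+9 = 137.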
// Shutdown stops all processes started and cleans up any resources
// created (such as mountpoints, devices, etc).
func (l *LibcontainerExecutor) Shutdown(signal string, grace time.Duration) error {
if l.container == nil {
return nil
}
// move executor to root cgroup
subsystems, err := cgroups.GetAllSubsystems()
if err != nil {
return err
}
if err := JoinRootCgroup(subsystems); err != nil {
return err
}
status, err := l.container.Status()
if err != nil {
return err
}
defer l.container.Destroy()
if status == libcontainer.Stopped {
return nil
}
if grace > 0 {
if signal == "" {
signal = "SIGINT"
}
sig, ok := signals.SignalLookup[signal]
if !ok {
return fmt.Errorf("error unknown signal given for shutdown: %s", signal)
}
err = l.container.Signal(sig, false)
if err != nil {
return err
}
select {
case <-l.userProcExited:
return nil
case <-time.After(grace):
return l.container.Signal(os.Kill, false)
}
} else {
return l.container.Signal(os.Kill, false)
}
}
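// Illustrative shutdown sequence (assumed caller): deliver SIGTERM, then rely
// on the SIGKILL fallback above once the grace period lapses:
//
//	if err := executor.Shutdown("SIGTERM", 5*time.Second); err != nil {
//		logger.Error("failed to shutdown task", "error", err)
//	}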
// UpdateResources updates the resource isolation with new values to be
// enforced. It is currently a no-op for the libcontainer executor.
func (l *LibcontainerExecutor) UpdateResources(resources *structs.Resources) error {
return nil
}
// Version returns the api version of the executor
func (l *LibcontainerExecutor) Version() (*structs.ExecutorVersion, error) {
return &structs.ExecutorVersion{Version: ExecutorVersionLatest}, nil
}
// Stats returns the resource statistics for processes managed by the executor
func (l *LibcontainerExecutor) Stats() (*cstructs.TaskResourceUsage, error) {
lstats, err := l.container.Stats()
if err != nil {
return nil, err
}
pidStats, err := l.pidCollector.pidStats()
if err != nil {
return nil, err
}
ts := time.Now()
stats := lstats.CgroupStats
// Memory Related Stats
swap := stats.MemoryStats.SwapUsage
maxUsage := stats.MemoryStats.Usage.MaxUsage
rss := stats.MemoryStats.Stats["rss"]
cache := stats.MemoryStats.Stats["cache"]
ms := &cstructs.MemoryStats{
RSS: rss,
Cache: cache,
Swap: swap.Usage,
MaxUsage: maxUsage,
KernelUsage: stats.MemoryStats.KernelUsage.Usage,
KernelMaxUsage: stats.MemoryStats.KernelUsage.MaxUsage,
Measured: ExecutorCgroupMeasuredMemStats,
}
// CPU Related Stats
totalProcessCPUUsage := float64(stats.CpuStats.CpuUsage.TotalUsage)
userModeTime := float64(stats.CpuStats.CpuUsage.UsageInUsermode)
kernelModeTime := float64(stats.CpuStats.CpuUsage.UsageInKernelmode)
totalPercent := l.totalCpuStats.Percent(totalProcessCPUUsage)
cs := &cstructs.CpuStats{
SystemMode: l.systemCpuStats.Percent(kernelModeTime),
UserMode: l.userCpuStats.Percent(userModeTime),
Percent: totalPercent,
ThrottledPeriods: stats.CpuStats.ThrottlingData.ThrottledPeriods,
ThrottledTime: stats.CpuStats.ThrottlingData.ThrottledTime,
TotalTicks: l.systemCpuStats.TicksConsumed(totalPercent),
Measured: ExecutorCgroupMeasuredCpuStats,
}
taskResUsage := cstructs.TaskResourceUsage{
ResourceUsage: &cstructs.ResourceUsage{
MemoryStats: ms,
CpuStats: cs,
},
Timestamp: ts.UTC().UnixNano(),
Pids: pidStats,
}
return &taskResUsage, nil
}
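// A minimal consumer sketch (assumed caller), reading the fields declared in
// ExecutorCgroupMeasuredMemStats/ExecutorCgroupMeasuredCpuStats:
//
//	if usage, err := executor.Stats(); err == nil {
//		logger.Debug("task usage",
//			"rss_bytes", usage.ResourceUsage.MemoryStats.RSS,
//			"cpu_percent", usage.ResourceUsage.CpuStats.Percent)
//	}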
// Signal sends a signal to the process managed by the executor
func (l *LibcontainerExecutor) Signal(s os.Signal) error {
return l.userProc.Signal(s)
}
// Exec starts an additional process inside the container
func (l *LibcontainerExecutor) Exec(deadline time.Time, cmd string, args []string) ([]byte, int, error) {
combined := append([]string{cmd}, args...)
// Capture output
buf, _ := circbuf.NewBuffer(int64(cstructs.CheckBufSize))
process := &libcontainer.Process{
Args: combined,
Env: l.command.Env,
Stdout: buf,
Stderr: buf,
}
err := l.container.Run(process)
if err != nil {
return nil, 0, err
}
// Buffer the result channel so the wait goroutine can always deliver its
// result; closing the channel on the timeout path could panic the sender.
waitCh := make(chan *waitResult, 1)
go l.handleExecWait(waitCh, process)
select {
case result := <-waitCh:
ps := result.ps
if result.err != nil {
if exitErr, ok := result.err.(*exec.ExitError); ok {
ps = exitErr.ProcessState
} else {
return nil, 0, result.err
}
}
var exitCode int
if status, ok := ps.Sys().(syscall.WaitStatus); ok {
exitCode = status.ExitStatus()
}
return buf.Bytes(), exitCode, nil
case <-time.After(time.Until(deadline)):
process.Signal(os.Kill)
return nil, 0, context.DeadlineExceeded
}
}
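// Example invocation sketch: run a health-check style command with a one
// second deadline; out holds combined stdout/stderr, truncated to
// cstructs.CheckBufSize bytes by the circular buffer above:
//
//	out, code, err := executor.Exec(time.Now().Add(time.Second), "/bin/echo", []string{"ok"})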
type waitResult struct {
ps *os.ProcessState
err error
}
func (l *LibcontainerExecutor) handleExecWait(ch chan *waitResult, process *libcontainer.Process) {
ps, err := process.Wait()
ch <- &waitResult{ps, err}
}
func configureCapabilities(cfg *lconfigs.Config, command *structs.ExecCommand) {
// TODO: allow better control of these
cfg.Capabilities = &lconfigs.Capabilities{
Bounding: allCaps,
Permitted: allCaps,
Inheritable: allCaps,
Ambient: allCaps,
Effective: allCaps,
}
}
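// The TODO above suggests narrowing this; a restricted profile would be a
// hypothetical sketch along these lines (capability names chosen for
// illustration only):
//
//	cfg.Capabilities = &lconfigs.Capabilities{
//		Bounding:  []string{"CAP_CHOWN", "CAP_NET_BIND_SERVICE"},
//		Permitted: []string{"CAP_CHOWN", "CAP_NET_BIND_SERVICE"},
//		Effective: []string{"CAP_CHOWN", "CAP_NET_BIND_SERVICE"},
//	}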
func configureIsolation(cfg *lconfigs.Config, command *structs.ExecCommand) {
defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
// set the new root directory for the container
cfg.Rootfs = command.TaskDir
// launch with mount namespace
cfg.Namespaces = lconfigs.Namespaces{
{Type: lconfigs.NEWNS},
}
// paths to mask using a bind mount to /dev/null to prevent reading
cfg.MaskPaths = []string{
"/proc/kcore",
"/sys/firmware",
}
// paths that should be remounted as readonly inside the container
cfg.ReadonlyPaths = []string{
"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
}
cfg.Devices = lconfigs.DefaultAutoCreatedDevices
cfg.Mounts = []*lconfigs.Mount{
{
Source: "tmpfs",
Destination: "/dev",
Device: "tmpfs",
Flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME,
Data: "mode=755",
},
{
Source: "proc",
Destination: "/proc",
Device: "proc",
Flags: defaultMountFlags,
},
{
Source: "devpts",
Destination: "/dev/pts",
Device: "devpts",
Flags: syscall.MS_NOSUID | syscall.MS_NOEXEC,
Data: "newinstance,ptmxmode=0666,mode=0620,gid=5",
},
{
Device: "tmpfs",
Source: "shm",
Destination: "/dev/shm",
Data: "mode=1777,size=65536k",
Flags: defaultMountFlags,
},
{
Source: "mqueue",
Destination: "/dev/mqueue",
Device: "mqueue",
Flags: defaultMountFlags,
},
{
Source: "sysfs",
Destination: "/sys",
Device: "sysfs",
Flags: defaultMountFlags | syscall.MS_RDONLY,
},
}
}
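// For reference, defaultMountFlags expands to MS_NOEXEC|MS_NOSUID|MS_NODEV, so
// e.g. /proc inside the container cannot host executables, setuid binaries, or
// device nodes. An additional read-only bind mount would follow the same shape
// (hypothetical, not configured by this file):
//
//	cfg.Mounts = append(cfg.Mounts, &lconfigs.Mount{
//		Source:      "/etc/resolv.conf",
//		Destination: "/etc/resolv.conf",
//		Device:      "bind",
//		Flags:       syscall.MS_BIND | syscall.MS_RDONLY,
//	})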
func configureCgroups(cfg *lconfigs.Config, command *structs.ExecCommand) error {
// If resources are not limited then manually create the needed cgroups
if !command.ResourceLimits {
return configureBasicCgroups(cfg)
}
id := uuid.Generate()
cfg.Cgroups.Path = filepath.Join(defaultCgroupParent, id)
if command.Resources.MemoryMB > 0 {
// Total amount of memory allowed to consume
cfg.Cgroups.Resources.Memory = int64(command.Resources.MemoryMB * 1024 * 1024)
// Disable swap to avoid issues on the machine
var memSwappiness uint64 = 0
cfg.Cgroups.Resources.MemorySwappiness = &memSwappiness
}
if command.Resources.CPU < 2 {
return fmt.Errorf("resources.CPU must be equal to or greater than 2: %v", command.Resources.CPU)
}
// Set the relative CPU shares for this cgroup.
cfg.Cgroups.Resources.CpuShares = uint64(command.Resources.CPU)
if command.Resources.IOPS != 0 {
// Validate it is in an acceptable range.
if command.Resources.IOPS < 10 || command.Resources.IOPS > 1000 {
return fmt.Errorf("resources.IOPS must be between 10 and 1000: %d", command.Resources.IOPS)
}
cfg.Cgroups.Resources.BlkioWeight = uint16(command.Resources.IOPS)
}
return nil
}
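// Worked example of the limits above: Resources{MemoryMB: 256, CPU: 512,
// IOPS: 500} yields a hard memory limit of 256*1024*1024 = 268435456 bytes
// with swappiness 0, 512 relative cpu.shares, and a blkio weight of 500.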
func configureBasicCgroups(cfg *lconfigs.Config) error {
id := uuid.Generate()
// Manually create freezer cgroup
cfg.Cgroups.Paths = map[string]string{}
root, err := cgroups.FindCgroupMountpointDir()
if err != nil {
return err
}
if _, err := os.Stat(root); err != nil {
return err
}
freezer := cgroupFs.FreezerGroup{}
subsystem := freezer.Name()
path, err := cgroups.FindCgroupMountpoint(subsystem)
if err != nil {
return fmt.Errorf("failed to find %s cgroup mountpoint: %v", subsystem, err)
}
// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
path = filepath.Join(root, filepath.Base(path), defaultCgroupParent, id)
if err = os.MkdirAll(path, 0755); err != nil {
return err
}
cfg.Cgroups.Paths[subsystem] = path
return nil
}
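// On a typical host the resulting freezer path looks like (sketch; the actual
// mountpoint comes from FindCgroupMountpointDir):
//
//	/sys/fs/cgroup/freezer/nomad/<uuid>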
func newLibcontainerConfig(command *structs.ExecCommand) *lconfigs.Config {
cfg := &lconfigs.Config{
Cgroups: &lconfigs.Cgroup{
Resources: &lconfigs.Resources{
AllowAllDevices: nil,
MemorySwappiness: nil,
AllowedDevices: lconfigs.DefaultAllowedDevices,
},
},
Version: "1.0.0",
}
configureCapabilities(cfg, command)
configureIsolation(cfg, command)
configureCgroups(cfg, command)
return cfg
}
// JoinRootCgroup moves the current process to the cgroups of the init process
func JoinRootCgroup(subsystems []string) error {
mErrs := new(multierror.Error)
paths := map[string]string{}
for _, s := range subsystems {
mnt, _, err := cgroups.FindCgroupMountpointAndRoot(s)
if err != nil {
mErrs = multierror.Append(mErrs, fmt.Errorf("error getting cgroup path for subsystem: %s", s))
continue
}
paths[s] = mnt
}
err := cgroups.EnterPid(paths, os.Getpid())
if err != nil {
mErrs = multierror.Append(mErrs, err)
}
return mErrs.ErrorOrNil()
}
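// Hedged usage sketch, mirroring the calls in Launch and Shutdown above:
//
//	subsystems, err := cgroups.GetAllSubsystems()
//	if err == nil {
//		err = JoinRootCgroup(subsystems) // detach this process from the task cgroups
//	}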