2018-09-24 18:37:45 +00:00
// +build linux
2016-02-05 00:03:17 +00:00
package executor
2016-02-02 21:38:38 +00:00
import (
2018-09-24 18:37:45 +00:00
"context"
2016-02-03 02:54:04 +00:00
"fmt"
2019-04-28 21:30:10 +00:00
"io"
2016-02-03 18:23:00 +00:00
"os"
2018-09-24 18:37:45 +00:00
"os/exec"
"path"
2016-02-03 02:54:04 +00:00
"path/filepath"
2016-04-19 20:48:02 +00:00
"strings"
2016-02-03 18:23:00 +00:00
"syscall"
2016-05-09 14:57:26 +00:00
"time"
2016-02-03 18:23:00 +00:00
2018-09-24 18:37:45 +00:00
"github.com/armon/circbuf"
"github.com/hashicorp/consul-template/signals"
hclog "github.com/hashicorp/go-hclog"
2019-05-07 20:58:27 +00:00
"github.com/hashicorp/nomad/client/allocdir"
2016-05-20 09:05:48 +00:00
"github.com/hashicorp/nomad/client/stats"
2018-12-11 20:27:50 +00:00
cstructs "github.com/hashicorp/nomad/client/structs"
2018-09-24 18:37:45 +00:00
shelpers "github.com/hashicorp/nomad/helper/stats"
2017-09-29 16:58:48 +00:00
"github.com/hashicorp/nomad/helper/uuid"
2018-12-07 19:03:13 +00:00
"github.com/hashicorp/nomad/nomad/structs"
2018-12-07 02:22:02 +00:00
"github.com/hashicorp/nomad/plugins/drivers"
2018-09-24 18:37:45 +00:00
"github.com/opencontainers/runc/libcontainer"
"github.com/opencontainers/runc/libcontainer/cgroups"
lconfigs "github.com/opencontainers/runc/libcontainer/configs"
2018-12-10 03:30:23 +00:00
ldevices "github.com/opencontainers/runc/libcontainer/devices"
2020-08-19 15:57:26 +00:00
"github.com/opencontainers/runc/libcontainer/specconv"
2019-04-28 21:30:10 +00:00
lutils "github.com/opencontainers/runc/libcontainer/utils"
2018-09-24 18:37:45 +00:00
"github.com/syndtr/gocapability/capability"
2018-12-10 03:30:23 +00:00
"golang.org/x/sys/unix"
2018-09-24 18:37:45 +00:00
)
// defaultCgroupParent is the parent cgroup path under which the executor
// places the per-task cgroups it creates.
const defaultCgroupParent = "/nomad"
var (
	// ExecutorCgroupV1MeasuredMemStats is the list of memory stats captured by the executor with cgroup-v1
	ExecutorCgroupV1MeasuredMemStats = []string{"RSS", "Cache", "Swap", "Usage", "Max Usage", "Kernel Usage", "Kernel Max Usage"}

	// ExecutorCgroupV2MeasuredMemStats is the list of memory stats captured by the executor with cgroup-v2. cgroup-v2 exposes different memory stats and no longer reports rss or max usage.
	ExecutorCgroupV2MeasuredMemStats = []string{"Cache", "Swap", "Usage"}

	// ExecutorCgroupMeasuredCpuStats is the list of CPU stats captured by the executor
	ExecutorCgroupMeasuredCpuStats = []string{"System Mode", "User Mode", "Throttled Periods", "Throttled Time", "Percent"}
)
2018-09-24 18:37:45 +00:00
// LibcontainerExecutor implements an Executor with the runc/libcontainer api
type LibcontainerExecutor struct {
id string
2018-12-07 01:54:14 +00:00
command * ExecCommand
2018-09-24 18:37:45 +00:00
logger hclog . Logger
totalCpuStats * stats . CpuStats
userCpuStats * stats . CpuStats
systemCpuStats * stats . CpuStats
pidCollector * pidCollector
container libcontainer . Container
userProc * libcontainer . Process
userProcExited chan interface { }
2019-03-20 11:33:05 +00:00
exitState * ProcessState
2018-09-24 18:37:45 +00:00
}
2018-12-07 01:54:14 +00:00
func NewExecutorWithIsolation ( logger hclog . Logger ) Executor {
2018-09-24 18:37:45 +00:00
logger = logger . Named ( "isolated_executor" )
if err := shelpers . Init ( ) ; err != nil {
logger . Error ( "unable to initialize stats" , "error" , err )
}
return & LibcontainerExecutor {
2019-03-01 21:33:17 +00:00
id : strings . Replace ( uuid . Generate ( ) , "-" , "_" , - 1 ) ,
2018-09-24 18:37:45 +00:00
logger : logger ,
totalCpuStats : stats . NewCpuStats ( ) ,
userCpuStats : stats . NewCpuStats ( ) ,
systemCpuStats : stats . NewCpuStats ( ) ,
pidCollector : newPidCollector ( logger ) ,
2016-02-03 19:41:49 +00:00
}
2016-02-04 00:03:43 +00:00
}
2016-02-03 19:41:49 +00:00
2018-09-24 18:37:45 +00:00
// Launch creates a new container in libcontainer and starts a new process with it
2018-12-07 01:54:14 +00:00
func ( l * LibcontainerExecutor ) Launch ( command * ExecCommand ) ( * ProcessState , error ) {
2019-05-07 21:01:05 +00:00
l . logger . Trace ( "preparing to launch command" , "command" , command . Cmd , "args" , strings . Join ( command . Args , " " ) )
2016-02-05 08:11:09 +00:00
2018-10-16 02:37:58 +00:00
if command . Resources == nil {
2018-12-07 19:03:13 +00:00
command . Resources = & drivers . Resources {
2018-12-14 00:21:41 +00:00
NomadResources : & structs . AllocatedTaskResources { } ,
2018-12-07 19:03:13 +00:00
}
2018-10-16 02:37:58 +00:00
}
2018-09-24 18:37:45 +00:00
l . command = command
// create a new factory which will store the container state in the allocDir
factory , err := libcontainer . New (
path . Join ( command . TaskDir , "../alloc/container" ) ,
libcontainer . Cgroupfs ,
2019-04-19 13:40:30 +00:00
// note that os.Args[0] refers to the executor shim typically
// and first args arguments is ignored now due
// until https://github.com/opencontainers/runc/pull/1888 is merged
2019-03-18 21:36:31 +00:00
libcontainer . InitArgs ( os . Args [ 0 ] , "libcontainer-shim" ) ,
2018-09-24 18:37:45 +00:00
)
if err != nil {
return nil , fmt . Errorf ( "failed to create factory: %v" , err )
}
2018-05-26 01:49:20 +00:00
2018-09-24 18:37:45 +00:00
// A container groups processes under the same isolation enforcement
2018-12-07 19:03:13 +00:00
containerCfg , err := newLibcontainerConfig ( command )
if err != nil {
return nil , fmt . Errorf ( "failed to configure container(%s): %v" , l . id , err )
}
container , err := factory . Create ( l . id , containerCfg )
2018-09-24 18:37:45 +00:00
if err != nil {
return nil , fmt . Errorf ( "failed to create container(%s): %v" , l . id , err )
}
l . container = container
2018-05-26 01:49:20 +00:00
2018-10-16 02:37:58 +00:00
// Look up the binary path and make it executable
2019-05-08 14:01:20 +00:00
absPath , err := lookupTaskBin ( command )
2019-05-02 17:35:24 +00:00
2018-10-16 02:37:58 +00:00
if err != nil {
return nil , err
}
if err := makeExecutable ( absPath ) ; err != nil {
return nil , err
}
path := absPath
2019-05-02 17:35:24 +00:00
// Ensure that the path is contained in the chroot, and find it relative to the container
2018-10-16 02:37:58 +00:00
rel , err := filepath . Rel ( command . TaskDir , path )
if err != nil {
return nil , fmt . Errorf ( "failed to determine relative path base=%q target=%q: %v" , command . TaskDir , path , err )
}
2019-04-01 21:17:42 +00:00
// Turn relative-to-chroot path into absolute path to avoid
2019-04-02 18:25:45 +00:00
// libcontainer trying to resolve the binary using $PATH.
// Do *not* use filepath.Join as it will translate ".."s returned by
// filepath.Rel. Prepending "/" will cause the path to be rooted in the
// chroot which is the desired behavior.
2019-04-02 18:17:12 +00:00
path = "/" + rel
2018-10-16 02:37:58 +00:00
combined := append ( [ ] string { path } , command . Args ... )
2018-09-24 18:37:45 +00:00
stdout , err := command . Stdout ( )
if err != nil {
return nil , err
}
stderr , err := command . Stderr ( )
if err != nil {
return nil , err
}
2018-05-26 01:49:20 +00:00
2019-05-03 18:42:57 +00:00
l . logger . Debug ( "launching" , "command" , command . Cmd , "args" , strings . Join ( command . Args , " " ) )
2018-09-24 18:37:45 +00:00
// the task process will be started by the container
process := & libcontainer . Process {
Args : combined ,
Env : command . Env ,
Stdout : stdout ,
Stderr : stderr ,
Init : true ,
}
2018-05-26 01:49:20 +00:00
2018-09-24 18:37:45 +00:00
if command . User != "" {
process . User = command . User
}
l . userProc = process
l . totalCpuStats = stats . NewCpuStats ( )
l . userCpuStats = stats . NewCpuStats ( )
l . systemCpuStats = stats . NewCpuStats ( )
// Starts the task
2019-09-30 15:50:22 +00:00
if err := container . Run ( process ) ; err != nil {
container . Destroy ( )
return nil , err
2018-09-24 18:37:45 +00:00
}
pid , err := process . Pid ( )
if err != nil {
container . Destroy ( )
return nil , err
}
// start a goroutine to wait on the process to complete, so Wait calls can
// be multiplexed
l . userProcExited = make ( chan interface { } )
go l . pidCollector . collectPids ( l . userProcExited , l . getAllPids )
go l . wait ( )
2018-12-07 01:54:14 +00:00
return & ProcessState {
2018-09-24 18:37:45 +00:00
Pid : pid ,
ExitCode : - 1 ,
Time : time . Now ( ) ,
} , nil
}
func ( l * LibcontainerExecutor ) getAllPids ( ) ( map [ int ] * nomadPid , error ) {
pids , err := l . container . Processes ( )
if err != nil {
return nil , err
}
nPids := make ( map [ int ] * nomadPid )
for _ , pid := range pids {
nPids [ pid ] = & nomadPid {
pid : pid ,
cpuStatsTotal : stats . NewCpuStats ( ) ,
cpuStatsUser : stats . NewCpuStats ( ) ,
cpuStatsSys : stats . NewCpuStats ( ) ,
2016-04-02 08:36:31 +00:00
}
}
2018-09-24 18:37:45 +00:00
return nPids , nil
2016-02-03 18:23:00 +00:00
}
2018-09-24 18:37:45 +00:00
// Wait waits until a process has exited and returns it's exitcode and errors
2018-12-07 01:54:14 +00:00
func ( l * LibcontainerExecutor ) Wait ( ctx context . Context ) ( * ProcessState , error ) {
2018-12-05 16:03:56 +00:00
select {
case <- ctx . Done ( ) :
return nil , ctx . Err ( )
case <- l . userProcExited :
return l . exitState , nil
}
2018-09-24 18:37:45 +00:00
}
2016-02-03 18:23:00 +00:00
2018-09-24 18:37:45 +00:00
func ( l * LibcontainerExecutor ) wait ( ) {
defer close ( l . userProcExited )
2016-02-03 18:23:00 +00:00
2018-09-24 18:37:45 +00:00
ps , err := l . userProc . Wait ( )
if err != nil {
// If the process has exited before we called wait an error is returned
// the process state is embedded in the error
if exitErr , ok := err . ( * exec . ExitError ) ; ok {
ps = exitErr . ProcessState
} else {
l . logger . Error ( "failed to call wait on user process" , "error" , err )
2018-12-04 23:57:14 +00:00
l . exitState = & ProcessState { Pid : 0 , ExitCode : 1 , Time : time . Now ( ) }
2018-09-24 18:37:45 +00:00
return
}
}
l . command . Close ( )
exitCode := 1
var signal int
if status , ok := ps . Sys ( ) . ( syscall . WaitStatus ) ; ok {
exitCode = status . ExitStatus ( )
if status . Signaled ( ) {
const exitSignalBase = 128
signal = int ( status . Signal ( ) )
exitCode = exitSignalBase + signal
}
}
2018-12-07 01:54:14 +00:00
l . exitState = & ProcessState {
2018-09-24 18:37:45 +00:00
Pid : ps . Pid ( ) ,
ExitCode : exitCode ,
Signal : signal ,
Time : time . Now ( ) ,
}
}
// Shutdown stops all processes started and cleans up any resources
// created (such as mountpoints, devices, etc).
func ( l * LibcontainerExecutor ) Shutdown ( signal string , grace time . Duration ) error {
if l . container == nil {
2018-05-25 22:25:23 +00:00
return nil
}
2018-09-24 18:37:45 +00:00
status , err := l . container . Status ( )
if err != nil {
return err
2016-02-03 18:23:00 +00:00
}
2018-09-24 18:37:45 +00:00
defer l . container . Destroy ( )
2016-02-03 18:23:00 +00:00
2018-09-24 18:37:45 +00:00
if status == libcontainer . Stopped {
return nil
}
if grace > 0 {
if signal == "" {
signal = "SIGINT"
2016-02-03 18:23:00 +00:00
}
2016-02-04 00:03:43 +00:00
2018-09-24 18:37:45 +00:00
sig , ok := signals . SignalLookup [ signal ]
if ! ok {
return fmt . Errorf ( "error unknown signal given for shutdown: %s" , signal )
}
2018-12-04 23:57:14 +00:00
// Signal initial container processes only during graceful
// shutdown; hence `false` arg.
2018-09-24 18:37:45 +00:00
err = l . container . Signal ( sig , false )
if err != nil {
return err
}
select {
case <- l . userProcExited :
return nil
case <- time . After ( grace ) :
2018-12-04 23:57:14 +00:00
// Force kill all container processes after grace period,
// hence `true` argument.
2019-03-16 03:50:17 +00:00
if err := l . container . Signal ( os . Kill , true ) ; err != nil {
return err
}
2018-09-24 18:37:45 +00:00
}
} else {
2020-12-08 20:47:04 +00:00
err := l . container . Signal ( os . Kill , true )
if err != nil {
2019-03-16 03:50:17 +00:00
return err
}
}
select {
case <- l . userProcExited :
return nil
case <- time . After ( time . Second * 15 ) :
return fmt . Errorf ( "process failed to exit after 15 seconds" )
2016-02-03 18:23:00 +00:00
}
2018-09-24 18:37:45 +00:00
}
2016-02-03 18:23:00 +00:00
2018-09-24 18:37:45 +00:00
// UpdateResources updates the resource isolation with new values to be enforced
2018-12-07 02:22:02 +00:00
func ( l * LibcontainerExecutor ) UpdateResources ( resources * drivers . Resources ) error {
2016-02-03 18:23:00 +00:00
return nil
}
2018-09-24 18:37:45 +00:00
// Version returns the api version of the executor
2018-12-07 01:54:14 +00:00
func ( l * LibcontainerExecutor ) Version ( ) ( * ExecutorVersion , error ) {
return & ExecutorVersion { Version : ExecutorVersionLatest } , nil
2018-09-24 18:37:45 +00:00
}
// Stats returns the resource statistics for processes managed by the executor
2018-12-11 20:27:50 +00:00
func ( l * LibcontainerExecutor ) Stats ( ctx context . Context , interval time . Duration ) ( <- chan * cstructs . TaskResourceUsage , error ) {
ch := make ( chan * cstructs . TaskResourceUsage )
go l . handleStats ( ch , ctx , interval )
return ch , nil
2018-09-24 18:37:45 +00:00
2018-12-11 20:27:50 +00:00
}
2016-04-29 18:40:37 +00:00
2018-12-11 20:27:50 +00:00
func ( l * LibcontainerExecutor ) handleStats ( ch chan * cstructs . TaskResourceUsage , ctx context . Context , interval time . Duration ) {
defer close ( ch )
timer := time . NewTimer ( 0 )
2021-04-01 15:50:17 +00:00
measuredMemStats := ExecutorCgroupV1MeasuredMemStats
if cgroups . IsCgroup2UnifiedMode ( ) {
measuredMemStats = ExecutorCgroupV2MeasuredMemStats
}
2018-12-11 20:27:50 +00:00
for {
select {
case <- ctx . Done ( ) :
return
case <- timer . C :
timer . Reset ( interval )
}
lstats , err := l . container . Stats ( )
if err != nil {
l . logger . Warn ( "error collecting stats" , "error" , err )
return
}
pidStats , err := l . pidCollector . pidStats ( )
if err != nil {
l . logger . Warn ( "error collecting stats" , "error" , err )
return
}
ts := time . Now ( )
stats := lstats . CgroupStats
// Memory Related Stats
swap := stats . MemoryStats . SwapUsage
maxUsage := stats . MemoryStats . Usage . MaxUsage
rss := stats . MemoryStats . Stats [ "rss" ]
cache := stats . MemoryStats . Stats [ "cache" ]
ms := & cstructs . MemoryStats {
RSS : rss ,
Cache : cache ,
Swap : swap . Usage ,
2019-01-14 23:47:52 +00:00
Usage : stats . MemoryStats . Usage . Usage ,
2018-12-11 20:27:50 +00:00
MaxUsage : maxUsage ,
KernelUsage : stats . MemoryStats . KernelUsage . Usage ,
KernelMaxUsage : stats . MemoryStats . KernelUsage . MaxUsage ,
2021-04-01 15:50:17 +00:00
Measured : measuredMemStats ,
2018-12-11 20:27:50 +00:00
}
// CPU Related Stats
totalProcessCPUUsage := float64 ( stats . CpuStats . CpuUsage . TotalUsage )
userModeTime := float64 ( stats . CpuStats . CpuUsage . UsageInUsermode )
kernelModeTime := float64 ( stats . CpuStats . CpuUsage . UsageInKernelmode )
totalPercent := l . totalCpuStats . Percent ( totalProcessCPUUsage )
cs := & cstructs . CpuStats {
SystemMode : l . systemCpuStats . Percent ( kernelModeTime ) ,
UserMode : l . userCpuStats . Percent ( userModeTime ) ,
Percent : totalPercent ,
ThrottledPeriods : stats . CpuStats . ThrottlingData . ThrottledPeriods ,
ThrottledTime : stats . CpuStats . ThrottlingData . ThrottledTime ,
TotalTicks : l . systemCpuStats . TicksConsumed ( totalPercent ) ,
Measured : ExecutorCgroupMeasuredCpuStats ,
}
taskResUsage := cstructs . TaskResourceUsage {
ResourceUsage : & cstructs . ResourceUsage {
MemoryStats : ms ,
CpuStats : cs ,
} ,
Timestamp : ts . UTC ( ) . UnixNano ( ) ,
Pids : pidStats ,
}
select {
case <- ctx . Done ( ) :
return
case ch <- & taskResUsage :
}
}
2016-04-28 23:06:01 +00:00
}
2018-09-24 18:37:45 +00:00
// Signal forwards s to the user process started by Launch and returns the
// error, if any, reported by that process handle.
func (l *LibcontainerExecutor) Signal(s os.Signal) error {
	proc := l.userProc
	return proc.Signal(s)
}
// Exec starts an additional process inside the container
func ( l * LibcontainerExecutor ) Exec ( deadline time . Time , cmd string , args [ ] string ) ( [ ] byte , int , error ) {
combined := append ( [ ] string { cmd } , args ... )
// Capture output
2019-01-07 15:01:46 +00:00
buf , _ := circbuf . NewBuffer ( int64 ( drivers . CheckBufSize ) )
2018-09-24 18:37:45 +00:00
process := & libcontainer . Process {
Args : combined ,
Env : l . command . Env ,
Stdout : buf ,
Stderr : buf ,
2016-02-03 18:23:00 +00:00
}
2018-09-24 18:37:45 +00:00
err := l . container . Run ( process )
2017-03-20 21:21:13 +00:00
if err != nil {
2018-09-24 18:37:45 +00:00
return nil , 0 , err
2017-03-20 21:21:13 +00:00
}
2018-09-24 18:37:45 +00:00
waitCh := make ( chan * waitResult )
defer close ( waitCh )
go l . handleExecWait ( waitCh , process )
select {
case result := <- waitCh :
ps := result . ps
if result . err != nil {
if exitErr , ok := result . err . ( * exec . ExitError ) ; ok {
ps = exitErr . ProcessState
} else {
return nil , 0 , result . err
}
}
var exitCode int
if status , ok := ps . Sys ( ) . ( syscall . WaitStatus ) ; ok {
exitCode = status . ExitStatus ( )
2017-03-20 21:21:13 +00:00
}
2018-09-24 18:37:45 +00:00
return buf . Bytes ( ) , exitCode , nil
2017-03-20 21:21:13 +00:00
2018-09-24 18:37:45 +00:00
case <- time . After ( time . Until ( deadline ) ) :
process . Signal ( os . Kill )
return nil , 0 , context . DeadlineExceeded
2017-03-20 21:21:13 +00:00
}
2018-09-24 18:37:45 +00:00
}
2019-05-10 22:44:19 +00:00
func ( l * LibcontainerExecutor ) newTerminalSocket ( ) ( pty func ( ) ( * os . File , error ) , tty * os . File , err error ) {
2019-04-28 21:30:10 +00:00
parent , child , err := lutils . NewSockPair ( "socket" )
if err != nil {
return nil , nil , fmt . Errorf ( "failed to create terminal: %v" , err )
}
return func ( ) ( * os . File , error ) { return lutils . RecvFd ( parent ) } , child , err
}
// ExecStreaming runs cmd as an additional (non-init) process inside the
// container, reusing the environment and user of the launched task and "/" as
// working directory. I/O and terminal wiring are delegated to an execHelper,
// whose run result is returned.
func (l *LibcontainerExecutor) ExecStreaming(ctx context.Context, cmd []string, tty bool,
	stream drivers.ExecTaskStream) error {

	// the task process will be started by the container
	proc := &libcontainer.Process{
		Args: cmd,
		Env:  l.userProc.Env,
		User: l.userProc.User,
		Init: false,
		Cwd:  "/",
	}

	// Callbacks through which the helper configures and drives proc.
	attachTTY := func(tty *os.File) error {
		proc.ConsoleSocket = tty
		return nil
	}
	attachIO := func(stdin io.Reader, stdout, stderr io.Writer) error {
		proc.Stdin = stdin
		proc.Stdout = stdout
		proc.Stderr = stderr
		return nil
	}
	start := func() error {
		return l.container.Run(proc)
	}
	wait := func() (*os.ProcessState, error) {
		return proc.Wait()
	}

	helper := &execHelper{
		logger:       l.logger,
		newTerminal:  l.newTerminalSocket,
		setTTY:       attachTTY,
		setIO:        attachIO,
		processStart: start,
		processWait:  wait,
	}

	return helper.run(ctx, tty, stream)
}
2018-09-24 18:37:45 +00:00
// waitResult carries the outcome of waiting on a process started through
// Exec from the waiter goroutine back to the caller.
// NOTE: handleExecWait builds this struct with a positional literal, so the
// field order must not change.
type waitResult struct {
// ps is the process state returned by Wait.
ps * os . ProcessState
// err is the error returned by Wait, if any.
err error
}
// handleExecWait waits for process to finish and delivers the resulting
// process state and error on ch as a single waitResult.
func (l *LibcontainerExecutor) handleExecWait(ch chan *waitResult, process *libcontainer.Process) {
	state, waitErr := process.Wait()
	result := &waitResult{ps: state, err: waitErr}
	ch <- result
}
2018-12-07 19:03:13 +00:00
func configureCapabilities ( cfg * lconfigs . Config , command * ExecCommand ) error {
2018-09-24 18:37:45 +00:00
// TODO: allow better control of these
2019-05-20 17:00:33 +00:00
// use capabilities list as prior to adopting libcontainer in 0.9
allCaps := supportedCaps ( )
2019-05-24 18:03:26 +00:00
// match capabilities used in Nomad 0.8
if command . User == "root" {
cfg . Capabilities = & lconfigs . Capabilities {
Bounding : allCaps ,
Permitted : allCaps ,
Effective : allCaps ,
Ambient : nil ,
Inheritable : nil ,
}
} else {
cfg . Capabilities = & lconfigs . Capabilities {
Bounding : allCaps ,
}
2016-02-03 18:23:00 +00:00
}
2018-12-07 19:03:13 +00:00
return nil
2018-09-24 18:37:45 +00:00
}
2019-05-20 17:00:33 +00:00
// supportedCaps lists the capabilities known to the capability package that
// do not exceed the kernel's last capability, formatted as upper-case names
// with a "CAP_" prefix.
func supportedCaps() []string {
	highest := capability.CAP_LAST_CAP
	// workaround for RHEL6 which has no /proc/sys/kernel/cap_last_cap
	if highest == capability.Cap(63) {
		highest = capability.CAP_BLOCK_SUSPEND
	}

	names := []string{}
	for _, c := range capability.List() {
		if c <= highest {
			names = append(names, "CAP_"+strings.ToUpper(c.String()))
		}
	}
	return names
}
2021-02-04 19:01:51 +00:00
func configureNamespaces ( pidMode , ipcMode string ) lconfigs . Namespaces {
namespaces := lconfigs . Namespaces { { Type : lconfigs . NEWNS } }
2021-02-08 16:58:44 +00:00
if pidMode == IsolationModePrivate {
2021-02-04 19:01:51 +00:00
namespaces = append ( namespaces , lconfigs . Namespace { Type : lconfigs . NEWPID } )
}
2021-02-08 16:58:44 +00:00
if ipcMode == IsolationModePrivate {
2021-02-04 19:01:51 +00:00
namespaces = append ( namespaces , lconfigs . Namespace { Type : lconfigs . NEWIPC } )
}
return namespaces
}
2019-01-08 17:10:26 +00:00
// configureIsolation prepares the isolation primitives of the container.
// The process runs in a container configured with the following:
//
// * the task directory as the chroot
// * dedicated mount points namespace, but shares the PID, User, domain, network namespaces with host
// * small subset of devices (e.g. stdout/stderr/stdin, tty, shm, pts); default to using the same set of devices as Docker
// * some special filesystems: `/proc`, `/sys`. Some case is given to avoid exec escaping or setting malicious values through them.
2018-12-07 19:03:13 +00:00
func configureIsolation ( cfg * lconfigs . Config , command * ExecCommand ) error {
2018-09-24 18:37:45 +00:00
defaultMountFlags := syscall . MS_NOEXEC | syscall . MS_NOSUID | syscall . MS_NODEV
// set the new root directory for the container
cfg . Rootfs = command . TaskDir
2020-02-13 19:12:46 +00:00
// disable pivot_root if set in the driver's configuration
cfg . NoPivotRoot = command . NoPivotRoot
2021-02-08 16:58:44 +00:00
// set up default namespaces as configured
2021-02-08 16:36:11 +00:00
cfg . Namespaces = configureNamespaces ( command . ModePID , command . ModeIPC )
2018-09-24 18:37:45 +00:00
2019-09-30 15:50:22 +00:00
if command . NetworkIsolation != nil {
cfg . Namespaces = append ( cfg . Namespaces , lconfigs . Namespace {
Type : lconfigs . NEWNET ,
Path : command . NetworkIsolation . Path ,
} )
}
2018-09-24 18:37:45 +00:00
// paths to mask using a bind mount to /dev/null to prevent reading
cfg . MaskPaths = [ ] string {
"/proc/kcore" ,
"/sys/firmware" ,
2016-02-03 18:23:00 +00:00
}
2017-03-20 21:21:13 +00:00
2018-09-24 18:37:45 +00:00
// paths that should be remounted as readonly inside the container
cfg . ReadonlyPaths = [ ] string {
"/proc/sys" , "/proc/sysrq-trigger" , "/proc/irq" , "/proc/bus" ,
}
2016-02-03 18:23:00 +00:00
2020-08-19 15:57:26 +00:00
cfg . Devices = specconv . AllowedDevices
2018-12-10 03:30:23 +00:00
if len ( command . Devices ) > 0 {
devs , err := cmdDevices ( command . Devices )
if err != nil {
return err
}
cfg . Devices = append ( cfg . Devices , devs ... )
}
2018-09-24 18:37:45 +00:00
cfg . Mounts = [ ] * lconfigs . Mount {
{
2019-01-06 22:02:43 +00:00
Source : "tmpfs" ,
2018-09-24 18:37:45 +00:00
Destination : "/dev" ,
2019-01-06 22:02:43 +00:00
Device : "tmpfs" ,
Flags : syscall . MS_NOSUID | syscall . MS_STRICTATIME ,
Data : "mode=755" ,
2018-09-24 18:37:45 +00:00
} ,
2018-12-03 15:41:01 +00:00
{
Source : "proc" ,
Destination : "/proc" ,
Device : "proc" ,
Flags : defaultMountFlags ,
} ,
2019-01-06 22:02:43 +00:00
{
Source : "devpts" ,
Destination : "/dev/pts" ,
Device : "devpts" ,
Flags : syscall . MS_NOSUID | syscall . MS_NOEXEC ,
Data : "newinstance,ptmxmode=0666,mode=0620,gid=5" ,
} ,
{
Device : "tmpfs" ,
Source : "shm" ,
Destination : "/dev/shm" ,
Data : "mode=1777,size=65536k" ,
Flags : defaultMountFlags ,
} ,
{
Source : "mqueue" ,
Destination : "/dev/mqueue" ,
Device : "mqueue" ,
Flags : defaultMountFlags ,
} ,
2018-09-24 18:37:45 +00:00
{
Source : "sysfs" ,
Destination : "/sys" ,
Device : "sysfs" ,
Flags : defaultMountFlags | syscall . MS_RDONLY ,
} ,
}
2018-12-07 19:03:13 +00:00
2018-12-10 03:30:23 +00:00
if len ( command . Mounts ) > 0 {
cfg . Mounts = append ( cfg . Mounts , cmdMounts ( command . Mounts ) ... )
}
2018-12-07 19:03:13 +00:00
return nil
2016-02-03 18:23:00 +00:00
}
2018-12-07 01:54:14 +00:00
func configureCgroups ( cfg * lconfigs . Config , command * ExecCommand ) error {
2018-09-24 18:37:45 +00:00
// If resources are not limited then manually create cgroups needed
if ! command . ResourceLimits {
return configureBasicCgroups ( cfg )
2016-02-03 18:23:00 +00:00
}
2016-02-04 00:03:43 +00:00
2018-09-24 18:37:45 +00:00
id := uuid . Generate ( )
2019-06-11 01:20:45 +00:00
cfg . Cgroups . Path = filepath . Join ( "/" , defaultCgroupParent , id )
2018-12-07 19:03:13 +00:00
if command . Resources == nil || command . Resources . NomadResources == nil {
return nil
}
2021-03-26 20:17:14 +00:00
// Total amount of memory allowed to consume
2021-03-30 20:33:55 +00:00
res := command . Resources . NomadResources
memHard , memSoft := res . Memory . MemoryMaxMB , res . Memory . MemoryMB
if memHard <= 0 {
memHard = res . Memory . MemoryMB
memSoft = 0
2021-03-26 20:17:14 +00:00
}
2021-03-30 20:33:55 +00:00
if memHard > 0 {
cfg . Cgroups . Resources . Memory = memHard * 1024 * 1024
cfg . Cgroups . Resources . MemoryReservation = memSoft * 1024 * 1024
2021-03-26 20:17:14 +00:00
2018-09-24 18:37:45 +00:00
// Disable swap to avoid issues on the machine
2018-12-07 02:22:02 +00:00
var memSwappiness uint64
2018-09-24 18:37:45 +00:00
cfg . Cgroups . Resources . MemorySwappiness = & memSwappiness
}
2016-02-03 18:23:00 +00:00
2021-03-30 20:33:55 +00:00
cpuShares := res . Cpu . CpuShares
2018-12-14 00:21:41 +00:00
if cpuShares < 2 {
return fmt . Errorf ( "resources.Cpu.CpuShares must be equal to or greater than 2: %v" , cpuShares )
2018-09-24 18:37:45 +00:00
}
// Set the relative CPU shares for this cgroup.
2018-12-14 00:21:41 +00:00
cfg . Cgroups . Resources . CpuShares = uint64 ( cpuShares )
2018-09-24 18:37:45 +00:00
return nil
}
func configureBasicCgroups ( cfg * lconfigs . Config ) error {
id := uuid . Generate ( )
// Manually create freezer cgroup
2016-05-11 19:56:47 +00:00
2019-12-11 16:39:16 +00:00
subsystem := "freezer"
2016-02-03 19:41:49 +00:00
2019-12-11 16:39:16 +00:00
path , err := getCgroupPathHelper ( subsystem , filepath . Join ( defaultCgroupParent , id ) )
2018-09-24 18:37:45 +00:00
if err != nil {
return fmt . Errorf ( "failed to find %s cgroup mountpoint: %v" , subsystem , err )
2016-04-19 20:48:02 +00:00
}
2018-09-24 18:37:45 +00:00
if err = os . MkdirAll ( path , 0755 ) ; err != nil {
return err
2016-04-19 20:48:02 +00:00
}
2019-12-11 16:39:16 +00:00
cfg . Cgroups . Paths = map [ string ] string {
subsystem : path ,
}
2018-09-24 18:37:45 +00:00
return nil
}
2016-04-19 20:48:02 +00:00
2019-12-11 15:28:41 +00:00
// getCgroupPathHelper resolves cgroup to a filesystem path beneath the mount
// point of the given subsystem. The cgroup is first made relative to the
// subsystem's root and then joined onto the mount point.
func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
	mountpoint, cgRoot, err := cgroups.FindCgroupMountpointAndRoot("", subsystem)
	if err != nil {
		return "", err
	}

	// This is needed for nested containers, because in /proc/self/cgroup we
	// see paths from host, which don't exist in container.
	rel, relErr := filepath.Rel(cgRoot, cgroup)
	if relErr != nil {
		return "", relErr
	}

	resolved := filepath.Join(mountpoint, rel)
	return resolved, nil
}
2018-12-07 19:03:13 +00:00
func newLibcontainerConfig ( command * ExecCommand ) ( * lconfigs . Config , error ) {
2018-09-24 18:37:45 +00:00
cfg := & lconfigs . Config {
Cgroups : & lconfigs . Cgroup {
Resources : & lconfigs . Resources {
MemorySwappiness : nil ,
} ,
} ,
Version : "1.0.0" ,
2016-04-19 20:48:02 +00:00
}
2020-08-19 15:57:26 +00:00
for _ , device := range specconv . AllowedDevices {
2021-03-31 14:57:02 +00:00
cfg . Cgroups . Resources . Devices = append ( cfg . Cgroups . Resources . Devices , & device . Rule )
2020-08-19 15:57:26 +00:00
}
2016-04-19 20:48:02 +00:00
2018-12-07 19:03:13 +00:00
if err := configureCapabilities ( cfg , command ) ; err != nil {
return nil , err
}
if err := configureIsolation ( cfg , command ) ; err != nil {
return nil , err
}
if err := configureCgroups ( cfg , command ) ; err != nil {
return nil , err
}
return cfg , nil
2018-09-24 18:37:45 +00:00
}
2018-12-10 03:30:23 +00:00
// cmdDevices converts a list of driver.DeviceConfigs into executor.Devices.
// Each device is built from its host path and permissions and then exposed at
// the configured task path. A nil slice is returned for empty input; the
// first conversion failure aborts with an error.
func cmdDevices(devices []*drivers.DeviceConfig) ([]*lconfigs.Device, error) {
	if len(devices) == 0 {
		return nil, nil
	}

	out := make([]*lconfigs.Device, 0, len(devices))
	for _, devCfg := range devices {
		dev, err := ldevices.DeviceFromPath(devCfg.HostPath, devCfg.Permissions)
		if err != nil {
			return nil, fmt.Errorf("failed to make device out for %s: %v", devCfg.HostPath, err)
		}
		dev.Path = devCfg.TaskPath
		out = append(out, dev)
	}

	return out, nil
}
volumes: Add support for mount propagation
This commit introduces support for configuring mount propagation when
mounting volumes with the `volume_mount` stanza on Linux targets.
Similar to Kubernetes, we expose 3 options for configuring mount
propagation:
- private, which is equivalent to `rprivate` on Linux, which does not allow the
container to see any new nested mounts after the chroot was created.
- host-to-task, which is equivalent to `rslave` on Linux, which allows new mounts
that have been created _outside of the container_ to be visible
inside the container after the chroot is created.
- bidirectional, which is equivalent to `rshared` on Linux, which allows
the container to see new mounts created on the host, and
importantly _allows the container to create mounts that are
visible in other containers and on the host_
private and host-to-task are safe, but bidirectional mounts can be
dangerous, as if the code inside a container creates a mount, and does
not clean it up before tearing down the container, it can cause bad
things to happen inside the kernel.
To add a layer of safety here, we require that the user has ReadWrite
permissions on the volume before allowing bidirectional mounts, as a
defense in depth / validation case, although creating mounts should also require
a privileged execution environment inside the container.
2019-09-13 21:13:20 +00:00
// userMountToUnixMount maps a volume mount propagation mode to the mount
// propagation flags used for it. cmdMounts looks each mount's
// PropagationMode up in this table.
var userMountToUnixMount = map [ string ] int {
// Empty string maps to `rprivate` for backwards compatibility in restored
// older tasks, where mount propagation will not be present.
"" : unix . MS_PRIVATE | unix . MS_REC , // rprivate
structs . VolumeMountPropagationPrivate : unix . MS_PRIVATE | unix . MS_REC , // rprivate
structs . VolumeMountPropagationHostToTask : unix . MS_SLAVE | unix . MS_REC , // rslave
structs . VolumeMountPropagationBidirectional : unix . MS_SHARED | unix . MS_REC , // rshared
}
2018-12-10 03:30:23 +00:00
// cmdMounts converts a list of driver.MountConfigs into excutor.Mounts.
func cmdMounts ( mounts [ ] * drivers . MountConfig ) [ ] * lconfigs . Mount {
if len ( mounts ) == 0 {
return nil
}
r := make ( [ ] * lconfigs . Mount , len ( mounts ) )
for i , m := range mounts {
flags := unix . MS_BIND
if m . Readonly {
flags |= unix . MS_RDONLY
}
volumes: Add support for mount propagation
This commit introduces support for configuring mount propagation when
mounting volumes with the `volume_mount` stanza on Linux targets.
Similar to Kubernetes, we expose 3 options for configuring mount
propagation:
- private, which is equivalent to `rprivate` on Linux, which does not allow the
container to see any new nested mounts after the chroot was created.
- host-to-task, which is equivalent to `rslave` on Linux, which allows new mounts
that have been created _outside of the container_ to be visible
inside the container after the chroot is created.
- bidirectional, which is equivalent to `rshared` on Linux, which allows both
the container to see new mounts created on the host, but
importantly _allows the container to create mounts that are
visible in other containers an don the host_
private and host-to-task are safe, but bidirectional mounts can be
dangerous, as if the code inside a container creates a mount, and does
not clean it up before tearing down the container, it can cause bad
things to happen inside the kernel.
To add a layer of safety here, we require that the user has ReadWrite
permissions on the volume before allowing bidirectional mounts, as a
defense in depth / validation case, although creating mounts should also require
a priviliged execution environment inside the container.
2019-09-13 21:13:20 +00:00
2018-12-10 03:30:23 +00:00
r [ i ] = & lconfigs . Mount {
volumes: Add support for mount propagation
This commit introduces support for configuring mount propagation when
mounting volumes with the `volume_mount` stanza on Linux targets.
Similar to Kubernetes, we expose 3 options for configuring mount
propagation:
- private, which is equivalent to `rprivate` on Linux, which does not allow the
container to see any new nested mounts after the chroot was created.
- host-to-task, which is equivalent to `rslave` on Linux, which allows new mounts
that have been created _outside of the container_ to be visible
inside the container after the chroot is created.
- bidirectional, which is equivalent to `rshared` on Linux, which allows both
the container to see new mounts created on the host, but
importantly _allows the container to create mounts that are
visible in other containers an don the host_
private and host-to-task are safe, but bidirectional mounts can be
dangerous, as if the code inside a container creates a mount, and does
not clean it up before tearing down the container, it can cause bad
things to happen inside the kernel.
To add a layer of safety here, we require that the user has ReadWrite
permissions on the volume before allowing bidirectional mounts, as a
defense in depth / validation case, although creating mounts should also require
a priviliged execution environment inside the container.
2019-09-13 21:13:20 +00:00
Source : m . HostPath ,
Destination : m . TaskPath ,
Device : "bind" ,
Flags : flags ,
PropagationFlags : [ ] int { userMountToUnixMount [ m . PropagationMode ] } ,
2018-12-10 03:30:23 +00:00
}
}
return r
}
2019-05-07 20:58:27 +00:00
// lookupTaskBin finds the file `bin` in taskDir/local, taskDir in that order, then performs
// a PATH search inside taskDir. It returns an absolute path. See also executor.lookupBin
2019-05-08 14:01:20 +00:00
func lookupTaskBin ( command * ExecCommand ) ( string , error ) {
taskDir := command . TaskDir
bin := command . Cmd
2019-05-07 20:58:27 +00:00
// Check in the local directory
localDir := filepath . Join ( taskDir , allocdir . TaskLocal )
local := filepath . Join ( localDir , bin )
if _ , err := os . Stat ( local ) ; err == nil {
return local , nil
}
// Check at the root of the task's directory
root := filepath . Join ( taskDir , bin )
if _ , err := os . Stat ( root ) ; err == nil {
return root , nil
}
2019-05-10 15:33:35 +00:00
if strings . Contains ( bin , "/" ) {
return "" , fmt . Errorf ( "file %s not found under path %s" , bin , taskDir )
}
2019-05-08 14:01:20 +00:00
path := "/usr/local/bin:/usr/bin:/bin"
2019-05-10 15:33:35 +00:00
return lookPathIn ( path , taskDir , bin )
}
// lookPathIn looks for a file with PATH inside the directory root. Like exec.LookPath
//
// path is a PATH-style list of directories; each one is resolved beneath root
// and searched in order for bin. The first entry that exists and is not a
// directory is returned as root/dir/bin. Unlike exec.LookPath, the executable
// bit is not checked. An error is returned when nothing matches.
func lookPathIn(path string, root string, bin string) (string, error) {
	for _, dir := range filepath.SplitList(path) {
		if dir == "" {
			// match unix shell behavior, empty path element == .
			dir = "."
		}

		candidate := filepath.Join(root, dir, bin)
		info, err := os.Stat(candidate)
		if err != nil {
			continue
		}
		if !info.IsDir() {
			return candidate, nil
		}
	}
	return "", fmt.Errorf("file %s not found under path %s", bin, root)
}