package docker

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
	"sync"
	"time"

	docker "github.com/fsouza/go-dockerclient"
	"github.com/hashicorp/consul-template/signals"
	hclog "github.com/hashicorp/go-hclog"
	multierror "github.com/hashicorp/go-multierror"
	plugin "github.com/hashicorp/go-plugin"

	"github.com/hashicorp/nomad/client/lib/cgutil"
	"github.com/hashicorp/nomad/client/taskenv"
	"github.com/hashicorp/nomad/drivers/docker/docklog"
	"github.com/hashicorp/nomad/drivers/shared/capabilities"
	"github.com/hashicorp/nomad/drivers/shared/eventer"
	"github.com/hashicorp/nomad/drivers/shared/hostnames"
	"github.com/hashicorp/nomad/drivers/shared/resolvconf"
	nstructs "github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/plugins/base"
	"github.com/hashicorp/nomad/plugins/drivers"
	pstructs "github.com/hashicorp/nomad/plugins/shared/structs"
	"github.com/ryanuber/go-glob"
)

var (
	// createClientsLock is a lock that protects reading/writing global client
	// variables
	createClientsLock sync.Mutex

	// client is a docker client with a timeout of 5 minutes. This is used for
	// all operations against the docker daemon that are not long running, such
	// as creating and killing containers.
	client *docker.Client

	// waitClient is a docker client with no timeouts. This is used for long
	// running operations such as waiting on containers and collecting stats.
	waitClient *docker.Client

	dockerTransientErrs = []string{
		"Client.Timeout exceeded while awaiting headers",
		"EOF",
		"API error (500)",
	}

	// recoverableErrTimeouts returns a recoverable error if the error was due
	// to timeouts
	recoverableErrTimeouts = func(err error) error {
		r := false
		if strings.Contains(err.Error(), "Client.Timeout exceeded while awaiting headers") ||
			strings.Contains(err.Error(), "EOF") {
			r = true
		}
		return nstructs.NewRecoverableError(err, r)
	}

	// taskHandleVersion is the version of task handle which this driver sets
	// and understands how to decode driver state
	taskHandleVersion = 1

	// Nvidia-container-runtime environment variable names
	nvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
)

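// Label keys the driver can attach to containers it creates so a container can
// be traced back to the Nomad allocation, job, task group, task, namespace,
// and node it belongs to.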
const (
	dockerLabelAllocID       = "com.hashicorp.nomad.alloc_id"
	dockerLabelJobName       = "com.hashicorp.nomad.job_name"
	dockerLabelJobID         = "com.hashicorp.nomad.job_id"
	dockerLabelTaskGroupName = "com.hashicorp.nomad.task_group_name"
	dockerLabelTaskName      = "com.hashicorp.nomad.task_name"
	dockerLabelNamespace     = "com.hashicorp.nomad.namespace"
	dockerLabelNodeName      = "com.hashicorp.nomad.node_name"
	dockerLabelNodeID        = "com.hashicorp.nomad.node_id"
)

type Driver struct {
	// eventer is used to handle multiplexing of TaskEvents calls such that an
	// event can be broadcast to all callers
	eventer *eventer.Eventer

	// config contains the runtime configuration for the driver set by the
	// SetConfig RPC
	config *DriverConfig

	// clientConfig contains a driver specific subset of the Nomad client
	// configuration
	clientConfig *base.ClientDriverConfig

	// ctx is the context for the driver. It is passed to other subsystems to
	// coordinate shutdown
	ctx context.Context

	// tasks is the in memory datastore mapping taskIDs to taskHandles
	tasks *taskStore

	// coordinator is what tracks multiple image pulls against the same docker image
	coordinator *dockerCoordinator

	// logger will log to the Nomad agent
	logger hclog.Logger

	// gpuRuntime indicates nvidia-docker runtime availability
	gpuRuntime bool

	// A tri-state boolean to know if the fingerprinting has happened and
	// whether it has been successful
	fingerprintSuccess *bool
	fingerprintLock    sync.RWMutex

	// A boolean to know if the docker driver has ever been correctly detected
	// for use during fingerprinting.
	detected     bool
	detectedLock sync.RWMutex

	danglingReconciler *containerReconciler
	cpusetFixer        CpusetFixer
}

// NewDockerDriver returns a docker implementation of a driver plugin
func NewDockerDriver(ctx context.Context, logger hclog.Logger) drivers.DriverPlugin {
	logger = logger.Named(pluginName)
	return &Driver{
		eventer: eventer.NewEventer(ctx, logger),
		config:  &DriverConfig{},
		tasks:   newTaskStore(),
		ctx:     ctx,
		logger:  logger,
	}
}

func (d *Driver) reattachToDockerLogger(reattachConfig *pstructs.ReattachConfig) (docklog.DockerLogger, *plugin.Client, error) {
	reattach, err := pstructs.ReattachConfigToGoPlugin(reattachConfig)
	if err != nil {
		return nil, nil, err
	}

	dlogger, dloggerPluginClient, err := docklog.ReattachDockerLogger(reattach)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to reattach to docker logger process: %v", err)
	}

	return dlogger, dloggerPluginClient, nil
}

func (d *Driver) setupNewDockerLogger(container *docker.Container, cfg *drivers.TaskConfig, startTime time.Time) (docklog.DockerLogger, *plugin.Client, error) {
	dlogger, pluginClient, err := docklog.LaunchDockerLogger(d.logger)
	if err != nil {
		if pluginClient != nil {
			pluginClient.Kill()
		}
		return nil, nil, fmt.Errorf("failed to launch docker logger plugin: %v", err)
	}

	if err := dlogger.Start(&docklog.StartOpts{
		Endpoint:    d.config.Endpoint,
		ContainerID: container.ID,
		TTY:         container.Config.Tty,
		Stdout:      cfg.StdoutPath,
		Stderr:      cfg.StderrPath,
		TLSCert:     d.config.TLS.Cert,
		TLSKey:      d.config.TLS.Key,
		TLSCA:       d.config.TLS.CA,
		StartTime:   startTime.Unix(),
	}); err != nil {
		pluginClient.Kill()
		return nil, nil, fmt.Errorf("failed to launch docker logger process %s: %v", container.ID, err)
	}

	return dlogger, pluginClient, nil
}

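// RecoverTask rebuilds the in-memory handle for a task from previously stored
// driver state: it re-inspects the existing container and, when log collection
// is enabled, reattaches to (or relaunches) the docker logger plugin.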
func (d *Driver) RecoverTask(handle *drivers.TaskHandle) error {
	if _, ok := d.tasks.Get(handle.Config.ID); ok {
		return nil
	}

	var handleState taskHandleState
	if err := handle.GetDriverState(&handleState); err != nil {
		return fmt.Errorf("failed to decode driver task state: %v", err)
	}

	client, _, err := d.dockerClients()
	if err != nil {
		return fmt.Errorf("failed to get docker client: %v", err)
	}

	container, err := client.InspectContainerWithOptions(docker.InspectContainerOptions{
		ID: handleState.ContainerID,
	})
	if err != nil {
		return fmt.Errorf("failed to inspect container for id %q: %v", handleState.ContainerID, err)
	}

	h := &taskHandle{
		client:                client,
		waitClient:            waitClient,
		logger:                d.logger.With("container_id", container.ID),
		task:                  handle.Config,
		containerID:           container.ID,
		containerImage:        container.Image,
		doneCh:                make(chan bool),
		waitCh:                make(chan struct{}),
		removeContainerOnExit: d.config.GC.Container,
		net:                   handleState.DriverNetwork,
	}

	if !d.config.DisableLogCollection {
		h.dlogger, h.dloggerPluginClient, err = d.reattachToDockerLogger(handleState.ReattachConfig)
		if err != nil {
			d.logger.Warn("failed to reattach to docker logger process", "error", err)

			h.dlogger, h.dloggerPluginClient, err = d.setupNewDockerLogger(container, handle.Config, time.Now())
			if err != nil {
				if err := client.StopContainer(handleState.ContainerID, 0); err != nil {
					d.logger.Warn("failed to stop container during cleanup", "container_id", handleState.ContainerID, "error", err)
				}
				return fmt.Errorf("failed to setup replacement docker logger: %v", err)
			}

			if err := handle.SetDriverState(h.buildState()); err != nil {
				if err := client.StopContainer(handleState.ContainerID, 0); err != nil {
					d.logger.Warn("failed to stop container during cleanup", "container_id", handleState.ContainerID, "error", err)
				}
				return fmt.Errorf("failed to store driver state: %v", err)
			}
		}
	}

	d.tasks.Set(handle.Config.ID, h)
	go h.run()

	return nil
}

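// StartTask creates and starts a container for the given task: it resolves the
// image, builds the container configuration, starts log collection, and returns
// a task handle along with the detected driver network.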
func (d *Driver) StartTask(cfg *drivers.TaskConfig) (*drivers.TaskHandle, *drivers.DriverNetwork, error) {
	if _, ok := d.tasks.Get(cfg.ID); ok {
		return nil, nil, fmt.Errorf("task with ID %q already started", cfg.ID)
	}

	var driverConfig TaskConfig

	if err := cfg.DecodeDriverConfig(&driverConfig); err != nil {
		return nil, nil, fmt.Errorf("failed to decode driver config: %v", err)
	}

	if driverConfig.Image == "" {
		return nil, nil, fmt.Errorf("image name required for docker driver")
	}

	driverConfig.Image = strings.TrimPrefix(driverConfig.Image, "https://")

	handle := drivers.NewTaskHandle(taskHandleVersion)
	handle.Config = cfg

	// Initialize docker API clients
	client, _, err := d.dockerClients()
	if err != nil {
		return nil, nil, fmt.Errorf("Failed to connect to docker daemon: %s", err)
	}

	id, err := d.createImage(cfg, &driverConfig, client)
	if err != nil {
		return nil, nil, err
	}

	if runtime.GOOS == "windows" {
		err = d.convertAllocPathsForWindowsLCOW(cfg, driverConfig.Image)
		if err != nil {
			return nil, nil, err
		}
	}

	containerCfg, err := d.createContainerConfig(cfg, &driverConfig, driverConfig.Image)
	if err != nil {
		d.logger.Error("failed to create container configuration", "image_name", driverConfig.Image,
			"image_id", id, "error", err)
		return nil, nil, fmt.Errorf("Failed to create container configuration for image %q (%q): %v", driverConfig.Image, id, err)
	}

	startAttempts := 0
CREATE:
	container, err := d.createContainer(client, containerCfg, driverConfig.Image)
	if err != nil {
		d.logger.Error("failed to create container", "error", err)
		client.RemoveContainer(docker.RemoveContainerOptions{
			ID:    containerCfg.Name,
			Force: true,
		})
		return nil, nil, nstructs.WrapRecoverable(fmt.Sprintf("failed to create container: %v", err), err)
	}

	d.logger.Info("created container", "container_id", container.ID)

	// We don't need to start the container if the container is already running
	// since we don't create containers which are already present on the host
	// and are running
	if !container.State.Running {
		// Start the container
		if err := d.startContainer(container); err != nil {
			d.logger.Error("failed to start container", "container_id", container.ID, "error", err)
			client.RemoveContainer(docker.RemoveContainerOptions{
				ID:    container.ID,
				Force: true,
			})
			// Some sort of docker race bug, recreating the container usually works
			if strings.Contains(err.Error(), "OCI runtime create failed: container with id exists:") && startAttempts < 5 {
				startAttempts++
				d.logger.Debug("reattempting container create/start sequence", "attempt", startAttempts, "container_id", id)
				goto CREATE
			}
			return nil, nil, nstructs.WrapRecoverable(fmt.Sprintf("Failed to start container %s: %s", container.ID, err), err)
		}

		// Inspect container to get all of the container metadata as much of the
		// metadata (e.g. networking) isn't populated until the container is started
		runningContainer, err := client.InspectContainerWithOptions(docker.InspectContainerOptions{
			ID: container.ID,
		})
		if err != nil {
			client.RemoveContainer(docker.RemoveContainerOptions{
				ID:    container.ID,
				Force: true,
			})
			msg := "failed to inspect started container"
			d.logger.Error(msg, "error", err)
			client.RemoveContainer(docker.RemoveContainerOptions{
				ID:    container.ID,
				Force: true,
			})
			return nil, nil, nstructs.NewRecoverableError(fmt.Errorf("%s %s: %s", msg, container.ID, err), true)
		}
		container = runningContainer
		d.logger.Info("started container", "container_id", container.ID)
	} else {
		d.logger.Debug("re-attaching to container", "container_id",
			container.ID, "container_state", container.State.String())
	}

	if !cgutil.UseV2 {
		// This does not apply to cgroups.v2, which only allows setting the PID
		// into exactly 1 group. For cgroups.v2, we use the cpuset fixer to reconcile
		// the cpuset value into the cgroups created by docker in the background.
		if containerCfg.HostConfig.CPUSet == "" && cfg.Resources.LinuxResources.CpusetCgroupPath != "" {
			if err := setCPUSetCgroup(cfg.Resources.LinuxResources.CpusetCgroupPath, container.State.Pid); err != nil {
				return nil, nil, fmt.Errorf("failed to set the cpuset cgroup for container: %v", err)
			}
		}
	}

	collectingLogs := !d.config.DisableLogCollection

	var dlogger docklog.DockerLogger
	var pluginClient *plugin.Client

	if collectingLogs {
		dlogger, pluginClient, err = d.setupNewDockerLogger(container, cfg, time.Unix(0, 0))
		if err != nil {
			d.logger.Error("an error occurred after container startup, terminating container", "container_id", container.ID)
			client.RemoveContainer(docker.RemoveContainerOptions{ID: container.ID, Force: true})
			return nil, nil, err
		}
	}

	// Detect container address
	ip, autoUse := d.detectIP(container, &driverConfig)

	net := &drivers.DriverNetwork{
		PortMap:       driverConfig.PortMap,
		IP:            ip,
		AutoAdvertise: autoUse,
	}

	// Return a driver handle
	h := &taskHandle{
		client:                client,
		waitClient:            waitClient,
		dlogger:               dlogger,
		dloggerPluginClient:   pluginClient,
		logger:                d.logger.With("container_id", container.ID),
		task:                  cfg,
		containerID:           container.ID,
		containerImage:        container.Image,
		doneCh:                make(chan bool),
		waitCh:                make(chan struct{}),
		removeContainerOnExit: d.config.GC.Container,
		net:                   net,
	}

	if err := handle.SetDriverState(h.buildState()); err != nil {
		d.logger.Error("error encoding container occurred after startup, terminating container", "container_id", container.ID, "error", err)
		if collectingLogs {
			dlogger.Stop()
			pluginClient.Kill()
		}
		client.RemoveContainer(docker.RemoveContainerOptions{ID: container.ID, Force: true})
		return nil, nil, err
	}

	d.tasks.Set(cfg.ID, h)
	go h.run()

	return handle, net, nil
}

// createContainerClient is the subset of Docker Client methods used by the
// createContainer method to ease testing subtle error conditions.
type createContainerClient interface {
	CreateContainer(docker.CreateContainerOptions) (*docker.Container, error)
	InspectContainer(id string) (*docker.Container, error)
	ListContainers(docker.ListContainersOptions) ([]docker.APIContainers, error)
	RemoveContainer(opts docker.RemoveContainerOptions) error
}

// createContainer creates the container given the passed configuration. It
// attempts to handle any transient Docker errors.
func (d *Driver) createContainer(client createContainerClient, config docker.CreateContainerOptions,
	image string) (*docker.Container, error) {
	// Create a container
	attempted := 0
CREATE:
	container, createErr := client.CreateContainer(config)
	if createErr == nil {
		return container, nil
	}

	d.logger.Debug("failed to create container", "container_name",
		config.Name, "image_name", image, "image_id", config.Config.Image,
		"attempt", attempted+1, "error", createErr)

	// Volume management tools like Portworx may not have detached a volume
	// from a previous node before Nomad started a replacement task.
	// Treat these errors as recoverable so we retry.
	if strings.Contains(strings.ToLower(createErr.Error()), "volume is attached on another node") {
		return nil, nstructs.NewRecoverableError(createErr, true)
	}

	// If the container already exists determine whether it's already
	// running or if it's dead and needs to be recreated.
	if strings.Contains(strings.ToLower(createErr.Error()), "container already exists") {

		container, err := d.containerByName(config.Name)
		if err != nil {
			return nil, err
		}

		if container != nil && container.State.Running {
			return container, nil
		}

		// Purge the conflicting container if one was found. If container is
		// nil here, the conflicting container was already deleted, so just
		// retry.
		if container != nil {
			// Delete matching containers
			err = client.RemoveContainer(docker.RemoveContainerOptions{
				ID:    container.ID,
				Force: true,
			})
			if err != nil {
				d.logger.Error("failed to purge container", "container_id", container.ID)
				return nil, recoverableErrTimeouts(fmt.Errorf("Failed to purge container %s: %s", container.ID, err))
			} else {
				d.logger.Info("purged container", "container_id", container.ID)
			}
		}

		if attempted < 5 {
			attempted++
			time.Sleep(nextBackoff(attempted))
			goto CREATE
		}
	} else if strings.Contains(strings.ToLower(createErr.Error()), "no such image") {
		// There is still a very small chance this is possible even with the
		// coordinator so retry.
		return nil, nstructs.NewRecoverableError(createErr, true)
	} else if isDockerTransientError(createErr) && attempted < 5 {
		attempted++
		time.Sleep(nextBackoff(attempted))
		goto CREATE
	}

	return nil, recoverableErrTimeouts(createErr)
}

// startContainer starts the passed container. It attempts to handle any
// transient Docker errors.
func (d *Driver) startContainer(c *docker.Container) error {
	// Start a container
	attempted := 0
START:
	startErr := client.StartContainer(c.ID, c.HostConfig)
	if startErr == nil || strings.Contains(startErr.Error(), "Container already running") {
		return nil
	}

	d.logger.Debug("failed to start container", "container_id", c.ID, "attempt", attempted+1, "error", startErr)

	if isDockerTransientError(startErr) {
		if attempted < 5 {
			attempted++
			time.Sleep(nextBackoff(attempted))
			goto START
		}
		return nstructs.NewRecoverableError(startErr, true)
	}

	return recoverableErrTimeouts(startErr)
}

// nextBackoff returns an appropriate docker backoff duration after attempted attempts.
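// For attempted = 1..5 the formula below yields (1<<2)*50ms = 200ms,
// (1<<4)*50ms = 800ms, (1<<6)*50ms = 3.2s, (1<<8)*50ms = 12.8s, and
// (1<<10)*50ms = 51.2s.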
func nextBackoff(attempted int) time.Duration {
	// attempts in 200ms, 800ms, 3.2s, 12.8s, 51.2s
	// TODO: add randomization factor and extract to a helper
	return 1 << (2 * uint64(attempted)) * 50 * time.Millisecond
}

// createImage creates a docker image either by pulling it from a registry or by
// loading it from the file system
func (d *Driver) createImage(task *drivers.TaskConfig, driverConfig *TaskConfig, client *docker.Client) (string, error) {
	image := driverConfig.Image
	repo, tag := parseDockerImage(image)

	// We're going to check whether the image is already downloaded. If the tag
	// is "latest", or ForcePull is set, we have to check for a new version every time so we don't
	// bother to check and cache the id here. We'll download first, then cache.
	if driverConfig.ForcePull {
		d.logger.Debug("force pulling image instead of inspecting local", "image_ref", dockerImageRef(repo, tag))
	} else if tag != "latest" {
		if dockerImage, _ := client.InspectImage(image); dockerImage != nil {
			// Image exists so just increment its reference count
			d.coordinator.IncrementImageReference(dockerImage.ID, image, task.ID)
			return dockerImage.ID, nil
		}
	}

	// Load the image if specified
	if driverConfig.LoadImage != "" {
		return d.loadImage(task, driverConfig, client)
	}

	// Download the image
	return d.pullImage(task, driverConfig, client, repo, tag)
}

// pullImage creates an image by pulling it from a docker registry
func (d *Driver) pullImage(task *drivers.TaskConfig, driverConfig *TaskConfig, client *docker.Client, repo, tag string) (id string, err error) {
	authOptions, err := d.resolveRegistryAuthentication(driverConfig, repo)
	if err != nil {
		if driverConfig.AuthSoftFail {
			d.logger.Warn("Failed to find docker repo auth", "repo", repo, "error", err)
		} else {
			return "", fmt.Errorf("Failed to find docker auth for repo %q: %v", repo, err)
		}
	}

	if authIsEmpty(authOptions) {
		d.logger.Debug("did not find docker auth for repo", "repo", repo)
	}

	d.eventer.EmitEvent(&drivers.TaskEvent{
		TaskID:    task.ID,
		AllocID:   task.AllocID,
		TaskName:  task.Name,
		Timestamp: time.Now(),
		Message:   "Downloading image",
		Annotations: map[string]string{
			"image": dockerImageRef(repo, tag),
		},
	})

	pullDur, err := time.ParseDuration(driverConfig.ImagePullTimeout)
	if err != nil {
		return "", fmt.Errorf("Failed to parse image_pull_timeout: %v", err)
	}

	return d.coordinator.PullImage(driverConfig.Image, authOptions, task.ID, d.emitEventFunc(task), pullDur, d.config.pullActivityTimeoutDuration)
}

func (d *Driver) emitEventFunc(task *drivers.TaskConfig) LogEventFn {
	return func(msg string, annotations map[string]string) {
		d.eventer.EmitEvent(&drivers.TaskEvent{
			TaskID:      task.ID,
			AllocID:     task.AllocID,
			TaskName:    task.Name,
			Timestamp:   time.Now(),
			Message:     msg,
			Annotations: annotations,
		})
	}
}

// authBackend encapsulates a function that resolves registry credentials.
type authBackend func(string) (*docker.AuthConfiguration, error)

// resolveRegistryAuthentication attempts to retrieve auth credentials for the
// repo, trying all authentication-backends possible.
func (d *Driver) resolveRegistryAuthentication(driverConfig *TaskConfig, repo string) (*docker.AuthConfiguration, error) {
	return firstValidAuth(repo, []authBackend{
		authFromTaskConfig(driverConfig),
		authFromDockerConfig(d.config.Auth.Config),
		authFromHelper(d.config.Auth.Helper),
	})
}

// loadImage creates an image by loading it from the file system
func (d *Driver) loadImage(task *drivers.TaskConfig, driverConfig *TaskConfig, client *docker.Client) (id string, err error) {

	archive := filepath.Join(task.TaskDir().LocalDir, driverConfig.LoadImage)
	d.logger.Debug("loading image from disk", "archive", archive)

	f, err := os.Open(archive)
	if err != nil {
		return "", fmt.Errorf("unable to open image archive: %v", err)
	}

	if err := client.LoadImage(docker.LoadImageOptions{InputStream: f}); err != nil {
		return "", err
	}
	f.Close()

	dockerImage, err := client.InspectImage(driverConfig.Image)
	if err != nil {
		return "", recoverableErrTimeouts(err)
	}

	d.coordinator.IncrementImageReference(dockerImage.ID, driverConfig.Image, task.ID)
	return dockerImage.ID, nil
}

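// convertAllocPathsForWindowsLCOW rewrites the alloc, local, and secrets
// directory environment variables into Linux-style paths when a Linux
// container is run on a Windows host (LCOW).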
func (d *Driver) convertAllocPathsForWindowsLCOW(task *drivers.TaskConfig, image string) error {
	imageConfig, err := client.InspectImage(image)
	if err != nil {
		return fmt.Errorf("the image does not exist: %v", err)
	}
	// LCOW: if we are running a Linux container on Windows, we need to mount it
	// correctly, as c:\ does not exist on unix
	if imageConfig.OS == "linux" {
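		// Drop the two-character drive prefix (e.g. "c:") from each path and
		// convert backslashes to forward slashes.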
		a := []rune(task.Env[taskenv.AllocDir])
		task.Env[taskenv.AllocDir] = strings.ReplaceAll(string(a[2:]), "\\", "/")
		l := []rune(task.Env[taskenv.TaskLocalDir])
		task.Env[taskenv.TaskLocalDir] = strings.ReplaceAll(string(l[2:]), "\\", "/")
		s := []rune(task.Env[taskenv.SecretsDir])
		task.Env[taskenv.SecretsDir] = strings.ReplaceAll(string(s[2:]), "\\", "/")
	}
	return nil
}

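// containerBinds builds the "host-path:container-path[:mode]" bind strings for
// the task's alloc, local, and secrets directories plus any user-configured
// volumes.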
func (d *Driver) containerBinds(task *drivers.TaskConfig, driverConfig *TaskConfig) ([]string, error) {
	allocDirBind := fmt.Sprintf("%s:%s", task.TaskDir().SharedAllocDir, task.Env[taskenv.AllocDir])
	taskLocalBind := fmt.Sprintf("%s:%s", task.TaskDir().LocalDir, task.Env[taskenv.TaskLocalDir])
	secretDirBind := fmt.Sprintf("%s:%s", task.TaskDir().SecretsDir, task.Env[taskenv.SecretsDir])
	binds := []string{allocDirBind, taskLocalBind, secretDirBind}

	taskLocalBindVolume := driverConfig.VolumeDriver == ""

	if !d.config.Volumes.Enabled && !taskLocalBindVolume {
		return nil, fmt.Errorf("volumes are not enabled; cannot use volume driver %q", driverConfig.VolumeDriver)
	}

	for _, userbind := range driverConfig.Volumes {
		// This assumes host OS = docker container OS.
		// Not true when we support Linux containers on Windows.
		src, dst, mode, err := parseVolumeSpec(userbind, runtime.GOOS)
		if err != nil {
			return nil, fmt.Errorf("invalid docker volume %q: %v", userbind, err)
		}

		// Paths inside the task dir are always allowed when using the default
		// driver, and relative paths are always allowed as they mount within
		// the container.
		// When a VolumeDriver is set, we assume we receive a binding in the
		// format volume-name:container-dest.
		// Otherwise, we assume we receive a relative path binding in the
		// format relative/to/task:/also/in/container.
		if taskLocalBindVolume {
			src = expandPath(task.TaskDir().Dir, src)
		} else {
			// Resolve dotted path segments
			src = filepath.Clean(src)
		}

		if !d.config.Volumes.Enabled && !isParentPath(task.AllocDir, src) {
			return nil, fmt.Errorf("volumes are not enabled; cannot mount host paths: %+q", userbind)
		}

		bind := src + ":" + dst
		if mode != "" {
			bind += ":" + mode
		}
		binds = append(binds, bind)
	}

	if selinuxLabel := d.config.Volumes.SelinuxLabel; selinuxLabel != "" {
		// Apply SELinux Label to each volume
		for i := range binds {
			binds[i] = fmt.Sprintf("%s:%s", binds[i], selinuxLabel)
		}
	}

	return binds, nil
}

var userMountToUnixMount = map[string]string{
	// Empty string maps to `rprivate` for backwards compatibility in restored
	// older tasks, where mount propagation will not be present.
	"": "rprivate",
	nstructs.VolumeMountPropagationPrivate:       "rprivate",
	nstructs.VolumeMountPropagationHostToTask:    "rslave",
	nstructs.VolumeMountPropagationBidirectional: "rshared",
}

// parseSecurityOpts reads the security_opt values, loading any referenced local
// seccomp profile files so their contents can be sent to the Docker daemon.
// This code is modified slightly from the docker CLI code:
// https://github.com/docker/cli/blob/8ef8547eb6934b28497d309d21e280bcd25145f5/cli/command/container/opts.go#L840
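// For example, an entry like "seccomp=/local/profile.json" (path illustrative)
// is rewritten to "seccomp=<compacted JSON contents of that file>", while
// entries such as "no-new-privileges" or "seccomp=unconfined" pass through
// unchanged.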
func parseSecurityOpts(securityOpts []string) ([]string, error) {
	for key, opt := range securityOpts {
		con := strings.SplitN(opt, "=", 2)
		if len(con) == 1 && con[0] != "no-new-privileges" {
			if strings.Contains(opt, ":") {
				con = strings.SplitN(opt, ":", 2)
			} else {
				return securityOpts, fmt.Errorf("invalid security_opt: %q", opt)
			}
		}
		if con[0] == "seccomp" && con[1] != "unconfined" {
			f, err := ioutil.ReadFile(con[1])
			if err != nil {
				return securityOpts, fmt.Errorf("opening seccomp profile (%s) failed: %v", con[1], err)
			}
			b := bytes.NewBuffer(nil)
			if err := json.Compact(b, f); err != nil {
				return securityOpts, fmt.Errorf("compacting json for seccomp profile (%s) failed: %v", con[1], err)
			}
			securityOpts[key] = fmt.Sprintf("seccomp=%s", b.Bytes())
		}
	}

	return securityOpts, nil
}

// memoryLimits computes the memory and memory_reservation values passed along to
// the docker host config. These fields represent hard and soft/reserved memory
// limits from docker's perspective, respectively.
//
// The memory field on the task configuration can be interpreted as a hard or soft
// limit. Before Nomad v0.11.3, it was always a hard limit. Now, it is interpreted
// as a soft limit if the memory_hard_limit value is configured on the docker
// task driver configuration. When memory_hard_limit is set, the docker host
// config is configured such that the memory field is equal to the memory_hard_limit
// value, and the memory_reservation field is set to the task driver memory value.
//
// If memory_hard_limit is not set (i.e. zero value), then the memory field of
// the task resource config is interpreted as a hard limit. In this case the
// memory field is set to the task resource memory value and memory_reservation
// is left unset.
//
// Returns (memory (hard), memory_reservation (soft)) values in bytes.
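// For example (illustrative values): memory = 256 MB with memory_hard_limit = 512 MB
// yields memory = 512 MiB (hard) and memory_reservation = 256 MiB (soft); the same
// task without memory_hard_limit yields memory = 256 MiB (hard) and no reservation.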
func memoryLimits(driverHardLimitMB int64, taskMemory drivers.MemoryResources) (memory, reserve int64) {
	softBytes := taskMemory.MemoryMB * 1024 * 1024

	hard := driverHardLimitMB
	if taskMemory.MemoryMaxMB > hard {
		hard = taskMemory.MemoryMaxMB
	}

	if hard <= 0 {
		return softBytes, 0
	}
	return hard * 1024 * 1024, softBytes
}

// cgroupParent extracts the parent cgroup from the task's nomad cgroup (only for linux, cgroups v2).
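// Under the cgroups v2 layout the cpuset cgroup path typically has the form
// "<parent>/<allocID>-<task>.scope" (for example a "nomad.slice" parent), and
// SplitPath returns that leading parent component.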
func cgroupParent(resources *drivers.Resources) string {
	var parent string
	if cgutil.UseV2 && resources != nil && resources.LinuxResources != nil {
		parent, _ = cgutil.SplitPath(resources.LinuxResources.CpusetCgroupPath)
	}
	return parent
}

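// createContainerConfig translates a Nomad task and its docker driver
// configuration into the docker.CreateContainerOptions (container config plus
// host config) handed to the Docker API.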
func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *TaskConfig,
	imageID string) (docker.CreateContainerOptions, error) {

	// ensure that PortMap variables are populated early on
	task.Env = taskenv.SetPortMapEnvs(task.Env, driverConfig.PortMap)

	logger := d.logger.With("task_name", task.Name)
	var c docker.CreateContainerOptions
	if task.Resources == nil {
		// Guard against missing resources. We should never have been able to
		// schedule a job without specifying this.
		logger.Error("task.Resources is empty")
		return c, fmt.Errorf("task.Resources is empty")
	}
	binds, err := d.containerBinds(task, driverConfig)
	if err != nil {
		return c, err
	}
	logger.Trace("binding volumes", "volumes", binds)

	// create the config block that will later be consumed by go-dockerclient
	config := &docker.Config{
		Image:      imageID,
		Entrypoint: driverConfig.Entrypoint,
		Hostname:   driverConfig.Hostname,
		User:       task.User,
		Tty:        driverConfig.TTY,
		OpenStdin:  driverConfig.Interactive,
	}

	if driverConfig.WorkDir != "" {
		config.WorkingDir = driverConfig.WorkDir
	}

	containerRuntime := driverConfig.Runtime
	if _, ok := task.DeviceEnv[nvidiaVisibleDevices]; ok {
		if !d.gpuRuntime {
			return c, fmt.Errorf("requested docker runtime %q was not found", d.config.GPURuntimeName)
		}
		if containerRuntime != "" && containerRuntime != d.config.GPURuntimeName {
			return c, fmt.Errorf("conflicting runtime requests: gpu runtime %q conflicts with task runtime %q", d.config.GPURuntimeName, containerRuntime)
		}
		containerRuntime = d.config.GPURuntimeName
	}
	if _, ok := d.config.allowRuntimes[containerRuntime]; !ok && containerRuntime != "" {
		return c, fmt.Errorf("requested runtime %q is not allowed", containerRuntime)
	}

	memory, memoryReservation := memoryLimits(driverConfig.MemoryHardLimit, task.Resources.NomadResources.Memory)

	var pidsLimit int64

	// Pids limit defined in the Nomad plugin config. Defaults to 0 (unlimited).
	if d.config.PidsLimit > 0 {
		pidsLimit = d.config.PidsLimit
	}

	// Override the Nomad plugin config pids limit with the user-defined pids limit.
	if driverConfig.PidsLimit > 0 {
		if d.config.PidsLimit > 0 && driverConfig.PidsLimit > d.config.PidsLimit {
			return c, fmt.Errorf("pids_limit cannot be greater than nomad plugin config pids_limit: %d", d.config.PidsLimit)
		}
		pidsLimit = driverConfig.PidsLimit
	}

	hostConfig := &docker.HostConfig{
		CgroupParent: cgroupParent(task.Resources), // if applicable

		Memory:            memory,            // hard limit
		MemoryReservation: memoryReservation, // soft limit

		CPUShares: task.Resources.LinuxResources.CPUShares,

		// Binds are used to mount a host volume into the container. We mount a
		// local directory for storage and a shared alloc directory that can be
		// used to share data between different tasks in the same task group.
		Binds: binds,

		StorageOpt:   driverConfig.StorageOpt,
		VolumeDriver: driverConfig.VolumeDriver,

		PidsLimit: &pidsLimit,

		Runtime: containerRuntime,
	}

	// This translates to the docker create/run --cpuset-cpus option.
	// --cpuset-cpus limits the specific CPUs or cores a container can use.
	// Nomad natively manages cpusets; setting this option will override
	// Nomad-managed cpusets.
	if driverConfig.CPUSetCPUs != "" {
		hostConfig.CPUSetCPUs = driverConfig.CPUSetCPUs
	}

	// Enable tini (docker-init) init system.
	if driverConfig.Init {
		hostConfig.Init = driverConfig.Init
	}

	// Calculate CPU Quota
	// cfs_quota_us is the time per core, so we must
	// multiply the time by the number of cores available
	// See https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/6/html/resource_management_guide/sec-cpu
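	// For example (illustrative numbers): with percent_ticks = 0.25, a CFS period
	// of 100000us, and 4 cores, the quota becomes 0.25 * 100000 * 4 = 100000us.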
	if driverConfig.CPUHardLimit {
		numCores := runtime.NumCPU()
		if driverConfig.CPUCFSPeriod < 0 || driverConfig.CPUCFSPeriod > 1000000 {
			return c, fmt.Errorf("invalid value for cpu_cfs_period")
		}
		if driverConfig.CPUCFSPeriod == 0 {
			driverConfig.CPUCFSPeriod = task.Resources.LinuxResources.CPUPeriod
		}
		hostConfig.CPUPeriod = driverConfig.CPUCFSPeriod
		hostConfig.CPUQuota = int64(task.Resources.LinuxResources.PercentTicks*float64(driverConfig.CPUCFSPeriod)) * int64(numCores)
	}

	// Windows does not support MemorySwap/MemorySwappiness #2193
	if runtime.GOOS == "windows" {
		hostConfig.MemorySwap = 0
		hostConfig.MemorySwappiness = nil
	} else {
		hostConfig.MemorySwap = memory

		// disable swap explicitly in non-Windows environments
		var swappiness int64 = 0
		hostConfig.MemorySwappiness = &swappiness
	}

	loggingDriver := driverConfig.Logging.Type
	if loggingDriver == "" {
		loggingDriver = driverConfig.Logging.Driver
	}

	hostConfig.LogConfig = docker.LogConfig{
		Type:   loggingDriver,
		Config: driverConfig.Logging.Config,
	}

	if hostConfig.LogConfig.Type == "" && hostConfig.LogConfig.Config == nil {
		logger.Trace("no docker log driver provided, defaulting to plugin config")
		hostConfig.LogConfig.Type = d.config.Logging.Type
		hostConfig.LogConfig.Config = d.config.Logging.Config
	}

	logger.Debug("configured resources",
		"memory", hostConfig.Memory, "memory_reservation", hostConfig.MemoryReservation,
		"cpu_shares", hostConfig.CPUShares, "cpu_quota", hostConfig.CPUQuota,
		"cpu_period", hostConfig.CPUPeriod)

	logger.Debug("binding directories", "binds", hclog.Fmt("%#v", hostConfig.Binds))

	// set privileged mode
	if driverConfig.Privileged && !d.config.AllowPrivileged {
		return c, fmt.Errorf(`Docker privileged mode is disabled on this Nomad agent`)
	}
	hostConfig.Privileged = driverConfig.Privileged

	// set add/drop capabilities
	if hostConfig.CapAdd, hostConfig.CapDrop, err = capabilities.Delta(
		capabilities.DockerDefaults(), d.config.AllowCaps, driverConfig.CapAdd, driverConfig.CapDrop,
	); err != nil {
		return c, err
	}

	// set SHM size
	if driverConfig.ShmSize != 0 {
		hostConfig.ShmSize = driverConfig.ShmSize
	}

	// Setup devices
	for _, device := range driverConfig.Devices {
		dd, err := device.toDockerDevice()
		if err != nil {
			return c, err
		}
		hostConfig.Devices = append(hostConfig.Devices, dd)
	}
	for _, device := range task.Devices {
		hostConfig.Devices = append(hostConfig.Devices, docker.Device{
			PathOnHost:        device.HostPath,
			PathInContainer:   device.TaskPath,
			CgroupPermissions: device.Permissions,
		})
	}

// Setup mounts
|
|
|
|
for _, m := range driverConfig.Mounts {
|
2020-12-15 19:13:50 +00:00
|
|
|
hm, err := d.toDockerMount(&m, task)
|
2018-11-26 21:45:01 +00:00
|
|
|
if err != nil {
|
|
|
|
return c, err
|
2018-11-06 05:39:48 +00:00
|
|
|
}
|
2020-12-15 19:13:50 +00:00
|
|
|
hostConfig.Mounts = append(hostConfig.Mounts, *hm)
|
|
|
|
}
|
|
|
|
for _, m := range driverConfig.MountsList {
|
|
|
|
hm, err := d.toDockerMount(&m, task)
|
|
|
|
if err != nil {
|
|
|
|
return c, err
|
2018-11-06 05:39:48 +00:00
|
|
|
}
|
2020-12-15 19:13:50 +00:00
|
|
|
hostConfig.Mounts = append(hostConfig.Mounts, *hm)
|
2018-11-06 05:39:48 +00:00
|
|
|
}
|
volumes: Add support for mount propagation
This commit introduces support for configuring mount propagation when
mounting volumes with the `volume_mount` stanza on Linux targets.
Similar to Kubernetes, we expose 3 options for configuring mount
propagation:
- private, which is equivalent to `rprivate` on Linux, which does not allow the
container to see any new nested mounts after the chroot was created.
- host-to-task, which is equivalent to `rslave` on Linux, which allows new mounts
that have been created _outside of the container_ to be visible
inside the container after the chroot is created.
- bidirectional, which is equivalent to `rshared` on Linux, which allows both
the container to see new mounts created on the host, but
importantly _allows the container to create mounts that are
visible in other containers and on the host_.
private and host-to-task are safe, but bidirectional mounts can be
dangerous: if the code inside a container creates a mount and does
not clean it up before the container is torn down, it can cause bad
things to happen inside the kernel.
To add a layer of safety here, we require that the user has ReadWrite
permissions on the volume before allowing bidirectional mounts, as a
defense-in-depth / validation measure, although creating mounts should also require
a privileged execution environment inside the container.
2019-09-13 21:13:20 +00:00
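// A minimal sketch of the propagation mapping described in the commit message
// above, assuming the nstructs propagation constants go by these names (the
// real userMountToUnixMount table used further down is defined elsewhere in
// this package):
//
//	var userMountToUnixMount = map[string]string{
//		nstructs.VolumeMountPropagationPrivate:       "rprivate",
//		nstructs.VolumeMountPropagationHostToTask:    "rslave",
//		nstructs.VolumeMountPropagationBidirectional: "rshared",
//	}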
|
|
|
|
2021-06-16 18:55:22 +00:00
|
|
|
// Setup /etc/hosts
|
|
|
|
// If the task's network_mode is unset, our hostname and IP will come from
|
|
|
|
// the Nomad-owned network (if in use), so we need to generate an
|
|
|
|
// /etc/hosts file that matches the network rather than the default one
|
|
|
|
// that comes from the pause container
|
|
|
|
if task.NetworkIsolation != nil && driverConfig.NetworkMode == "" {
|
|
|
|
etcHostMount, err := hostnames.GenerateEtcHostsMount(
|
2021-06-30 15:10:04 +00:00
|
|
|
task.AllocDir, task.NetworkIsolation, driverConfig.ExtraHosts)
|
2021-06-16 18:55:22 +00:00
|
|
|
if err != nil {
|
|
|
|
return c, fmt.Errorf("failed to build mount for /etc/hosts: %v", err)
|
|
|
|
}
|
|
|
|
if etcHostMount != nil {
|
|
|
|
// erase the extra_hosts field if we have a mount so we don't get
|
|
|
|
// a conflicting options error from dockerd
|
|
|
|
driverConfig.ExtraHosts = nil
|
|
|
|
hostConfig.Mounts = append(hostConfig.Mounts, docker.HostMount{
|
|
|
|
Target: etcHostMount.TaskPath,
|
|
|
|
Source: etcHostMount.HostPath,
|
|
|
|
Type: "bind",
|
|
|
|
ReadOnly: etcHostMount.Readonly,
|
|
|
|
BindOptions: &docker.BindOptions{
|
|
|
|
Propagation: etcHostMount.PropagationMode,
|
|
|
|
},
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-08-17 14:22:08 +00:00
|
|
|
// Setup DNS
|
|
|
|
// If task DNS options are configured Nomad will manage the resolv.conf file
|
|
|
|
// Docker driver dns options are not compatible with task dns options
|
|
|
|
if task.DNS != nil {
|
|
|
|
dnsMount, err := resolvconf.GenerateDNSMount(task.TaskDir().Dir, task.DNS)
|
|
|
|
if err != nil {
|
|
|
|
return c, fmt.Errorf("failed to build mount for resolv.conf: %v", err)
|
|
|
|
}
|
|
|
|
hostConfig.Mounts = append(hostConfig.Mounts, docker.HostMount{
|
|
|
|
Target: dnsMount.TaskPath,
|
|
|
|
Source: dnsMount.HostPath,
|
|
|
|
Type: "bind",
|
|
|
|
ReadOnly: dnsMount.Readonly,
|
|
|
|
BindOptions: &docker.BindOptions{
|
|
|
|
Propagation: dnsMount.PropagationMode,
|
|
|
|
},
|
|
|
|
})
|
|
|
|
} else {
|
|
|
|
if len(driverConfig.DNSSearchDomains) > 0 {
|
|
|
|
hostConfig.DNSSearch = driverConfig.DNSSearchDomains
|
|
|
|
}
|
|
|
|
if len(driverConfig.DNSOptions) > 0 {
|
|
|
|
hostConfig.DNSOptions = driverConfig.DNSOptions
|
|
|
|
}
|
|
|
|
// set DNS servers
|
|
|
|
for _, ip := range driverConfig.DNSServers {
|
|
|
|
if net.ParseIP(ip) != nil {
|
|
|
|
hostConfig.DNS = append(hostConfig.DNS, ip)
|
|
|
|
} else {
|
|
|
|
logger.Error("invalid ip address for container dns server", "ip", ip)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
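// Illustrative example (values assumed): if the task defines a dns block with
// servers = ["10.0.0.1"], resolvconf.GenerateDNSMount writes a resolv.conf
// under the task directory and the bind mount above makes it the container's
// /etc/resolv.conf; otherwise driver-level dns_servers such as ["8.8.8.8"] are
// validated as IP addresses and passed straight through to hostConfig.DNS.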
|
|
|
|
|
2018-12-04 21:46:16 +00:00
|
|
|
for _, m := range task.Mounts {
|
volumes: Add support for mount propagation
2019-09-13 21:13:20 +00:00
|
|
|
hm := docker.HostMount{
|
2018-12-04 21:46:16 +00:00
|
|
|
Type: "bind",
|
|
|
|
Target: m.TaskPath,
|
|
|
|
Source: m.HostPath,
|
|
|
|
ReadOnly: m.Readonly,
|
volumes: Add support for mount propagation
2019-09-13 21:13:20 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// MountPropagation is only supported by Docker on Linux:
|
|
|
|
// https://docs.docker.com/storage/bind-mounts/#configure-bind-propagation
|
|
|
|
if runtime.GOOS == "linux" {
|
|
|
|
hm.BindOptions = &docker.BindOptions{
|
|
|
|
Propagation: userMountToUnixMount[m.PropagationMode],
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
hostConfig.Mounts = append(hostConfig.Mounts, hm)
|
2018-12-04 21:46:16 +00:00
|
|
|
}
|
2018-11-06 05:39:48 +00:00
|
|
|
|
|
|
|
hostConfig.ExtraHosts = driverConfig.ExtraHosts
|
|
|
|
|
|
|
|
hostConfig.IpcMode = driverConfig.IPCMode
|
|
|
|
hostConfig.PidMode = driverConfig.PidMode
|
|
|
|
hostConfig.UTSMode = driverConfig.UTSMode
|
|
|
|
hostConfig.UsernsMode = driverConfig.UsernsMode
|
|
|
|
hostConfig.SecurityOpt = driverConfig.SecurityOpt
|
|
|
|
hostConfig.Sysctls = driverConfig.Sysctl
|
|
|
|
|
2020-03-31 01:21:39 +00:00
|
|
|
hostConfig.SecurityOpt, err = parseSecurityOpts(driverConfig.SecurityOpt)
|
|
|
|
if err != nil {
|
|
|
|
return c, fmt.Errorf("failed to parse security_opt configuration: %v", err)
|
|
|
|
}
|
|
|
|
|
2018-11-06 05:39:48 +00:00
|
|
|
ulimits, err := sliceMergeUlimit(driverConfig.Ulimit)
|
|
|
|
if err != nil {
|
|
|
|
return c, fmt.Errorf("failed to parse ulimit configuration: %v", err)
|
|
|
|
}
|
|
|
|
hostConfig.Ulimits = ulimits
|
|
|
|
|
|
|
|
hostConfig.ReadonlyRootfs = driverConfig.ReadonlyRootfs
|
|
|
|
|
2019-06-15 02:16:31 +00:00
|
|
|
// set the docker network mode
|
2018-11-06 05:39:48 +00:00
|
|
|
hostConfig.NetworkMode = driverConfig.NetworkMode
|
2019-06-15 02:16:31 +00:00
|
|
|
|
|
|
|
// if the driver config does not specify a network mode then try to use the
|
|
|
|
// shared alloc network
|
2018-11-06 05:39:48 +00:00
|
|
|
if hostConfig.NetworkMode == "" {
|
2019-06-15 03:06:31 +00:00
|
|
|
if task.NetworkIsolation != nil && task.NetworkIsolation.Path != "" {
|
2019-06-15 02:16:31 +00:00
|
|
|
// find the previously created parent container to join networks with
|
2019-05-14 00:59:31 +00:00
|
|
|
netMode := fmt.Sprintf("container:%s", task.NetworkIsolation.Labels[dockerNetSpecLabelKey])
|
|
|
|
logger.Debug("configuring network mode for task group", "network_mode", netMode)
|
|
|
|
hostConfig.NetworkMode = netMode
|
|
|
|
} else {
|
|
|
|
// docker default
|
2019-07-24 08:20:55 +00:00
|
|
|
logger.Debug("networking mode not specified; using default")
|
|
|
|
hostConfig.NetworkMode = "default"
|
2019-05-14 00:59:31 +00:00
|
|
|
}
|
2018-11-06 05:39:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Setup port mapping and exposed ports
|
2020-08-11 22:30:22 +00:00
|
|
|
ports := newPublishedPorts(logger)
|
|
|
|
switch {
|
|
|
|
case task.Resources.Ports != nil && len(driverConfig.Ports) > 0:
|
|
|
|
// Do not set up docker port mapping if shared alloc networking is used
|
|
|
|
if strings.HasPrefix(hostConfig.NetworkMode, "container:") {
|
|
|
|
break
|
2018-11-06 05:39:48 +00:00
|
|
|
}
|
|
|
|
|
2020-08-11 22:30:22 +00:00
|
|
|
for _, port := range driverConfig.Ports {
|
|
|
|
if mapping, ok := task.Resources.Ports.Get(port); ok {
|
|
|
|
ports.add(mapping.Label, mapping.HostIP, mapping.Value, mapping.To)
|
|
|
|
} else {
|
|
|
|
return c, fmt.Errorf("Port %q not found, check network stanza", port)
|
2018-11-06 05:39:48 +00:00
|
|
|
}
|
2020-08-11 22:30:22 +00:00
|
|
|
}
|
|
|
|
case len(task.Resources.NomadResources.Networks) > 0:
|
|
|
|
network := task.Resources.NomadResources.Networks[0]
|
2018-11-06 05:39:48 +00:00
|
|
|
|
2020-08-11 22:30:22 +00:00
|
|
|
for _, port := range network.ReservedPorts {
|
|
|
|
ports.addMapped(port.Label, network.IP, port.Value, driverConfig.PortMap)
|
2018-11-06 05:39:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
for _, port := range network.DynamicPorts {
|
2020-08-11 22:30:22 +00:00
|
|
|
ports.addMapped(port.Label, network.IP, port.Value, driverConfig.PortMap)
|
2018-11-06 05:39:48 +00:00
|
|
|
}
|
|
|
|
|
2020-08-11 22:30:22 +00:00
|
|
|
default:
|
|
|
|
if len(driverConfig.PortMap) > 0 {
|
2020-09-03 19:17:37 +00:00
|
|
|
if task.Resources.Ports != nil {
|
|
|
|
return c, fmt.Errorf("'port_map' cannot map group network ports, use 'ports' instead")
|
|
|
|
}
|
2020-08-11 22:30:22 +00:00
|
|
|
return c, fmt.Errorf("Trying to map ports but no network interface is available")
|
|
|
|
}
|
2018-11-06 05:39:48 +00:00
|
|
|
}
|
2020-08-11 22:30:22 +00:00
|
|
|
hostConfig.PortBindings = ports.publishedPorts
|
|
|
|
config.ExposedPorts = ports.exposedPorts
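// Illustrative example (values assumed, not from a real job): with a group
// network port {Label: "http", Value: 25647, To: 8080} and a driver config of
// ports = ["http"], the switch above publishes host port 25647 and exposes
// container port 8080, roughly what `docker run -p 25647:8080` would do.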
|
2018-11-06 05:39:48 +00:00
|
|
|
|
|
|
|
// If the user specified a custom command to run, we'll inject it here.
|
|
|
|
if driverConfig.Command != "" {
|
|
|
|
// Validate command
|
|
|
|
if err := validateCommand(driverConfig.Command, "args"); err != nil {
|
|
|
|
return c, err
|
|
|
|
}
|
|
|
|
|
|
|
|
cmd := []string{driverConfig.Command}
|
|
|
|
if len(driverConfig.Args) != 0 {
|
|
|
|
cmd = append(cmd, driverConfig.Args...)
|
|
|
|
}
|
|
|
|
logger.Debug("setting container startup command", "command", strings.Join(cmd, " "))
|
|
|
|
config.Cmd = cmd
|
|
|
|
} else if len(driverConfig.Args) != 0 {
|
|
|
|
config.Cmd = driverConfig.Args
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(driverConfig.Labels) > 0 {
|
|
|
|
config.Labels = driverConfig.Labels
|
|
|
|
}
|
|
|
|
|
2019-10-18 18:45:45 +00:00
|
|
|
labels := make(map[string]string, len(driverConfig.Labels)+1)
|
2019-10-17 13:53:46 +00:00
|
|
|
for k, v := range driverConfig.Labels {
|
2019-10-18 18:45:45 +00:00
|
|
|
labels[k] = v
|
2019-10-17 13:53:46 +00:00
|
|
|
}
|
2021-03-08 13:59:52 +00:00
|
|
|
// main mandatory label
|
2019-10-18 18:45:45 +00:00
|
|
|
labels[dockerLabelAllocID] = task.AllocID
|
2021-03-08 13:59:52 +00:00
|
|
|
|
|
|
|
// optional labels, as configured in plugin configuration
|
|
|
|
for _, configurationExtraLabel := range d.config.ExtraLabels {
|
|
|
|
if glob.Glob(configurationExtraLabel, "job_name") {
|
|
|
|
labels[dockerLabelJobName] = task.JobName
|
|
|
|
}
|
|
|
|
if glob.Glob(configurationExtraLabel, "job_id") {
|
|
|
|
labels[dockerLabelJobID] = task.JobID
|
|
|
|
}
|
|
|
|
if glob.Glob(configurationExtraLabel, "task_group_name") {
|
|
|
|
labels[dockerLabelTaskGroupName] = task.TaskGroupName
|
|
|
|
}
|
|
|
|
if glob.Glob(configurationExtraLabel, "task_name") {
|
|
|
|
labels[dockerLabelTaskName] = task.Name
|
|
|
|
}
|
|
|
|
if glob.Glob(configurationExtraLabel, "namespace") {
|
|
|
|
labels[dockerLabelNamespace] = task.Namespace
|
|
|
|
}
|
|
|
|
if glob.Glob(configurationExtraLabel, "node_name") {
|
|
|
|
labels[dockerLabelNodeName] = task.NodeName
|
|
|
|
}
|
|
|
|
if glob.Glob(configurationExtraLabel, "node_id") {
|
|
|
|
labels[dockerLabelNodeID] = task.NodeID
|
|
|
|
}
|
|
|
|
}
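// Illustrative example (assumed plugin configuration): with
// extra_labels = ["job_*", "namespace"], the globs above match job_name and
// job_id as well as namespace, so those labels are attached to the container
// alongside the mandatory allocation ID label.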
|
|
|
|
|
2019-10-18 18:45:45 +00:00
|
|
|
config.Labels = labels
|
2019-10-17 13:53:46 +00:00
|
|
|
logger.Debug("applied labels on the container", "labels", config.Labels)
|
|
|
|
|
2018-11-06 05:39:48 +00:00
|
|
|
config.Env = task.EnvList()
|
|
|
|
|
2021-04-03 07:50:23 +00:00
|
|
|
containerName := fmt.Sprintf("%s-%s", strings.ReplaceAll(task.Name, "/", "_"), task.AllocID)
|
2018-11-06 05:39:48 +00:00
|
|
|
logger.Debug("setting container name", "container_name", containerName)
|
|
|
|
|
|
|
|
var networkingConfig *docker.NetworkingConfig
|
|
|
|
if len(driverConfig.NetworkAliases) > 0 || driverConfig.IPv4Address != "" || driverConfig.IPv6Address != "" {
|
|
|
|
networkingConfig = &docker.NetworkingConfig{
|
|
|
|
EndpointsConfig: map[string]*docker.EndpointConfig{
|
|
|
|
hostConfig.NetworkMode: {},
|
|
|
|
},
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(driverConfig.NetworkAliases) > 0 {
|
|
|
|
networkingConfig.EndpointsConfig[hostConfig.NetworkMode].Aliases = driverConfig.NetworkAliases
|
|
|
|
logger.Debug("setting container network aliases", "network_mode", hostConfig.NetworkMode,
|
|
|
|
"network_aliases", strings.Join(driverConfig.NetworkAliases, ", "))
|
|
|
|
}
|
|
|
|
|
|
|
|
if driverConfig.IPv4Address != "" || driverConfig.IPv6Address != "" {
|
|
|
|
networkingConfig.EndpointsConfig[hostConfig.NetworkMode].IPAMConfig = &docker.EndpointIPAMConfig{
|
|
|
|
IPv4Address: driverConfig.IPv4Address,
|
|
|
|
IPv6Address: driverConfig.IPv6Address,
|
|
|
|
}
|
|
|
|
logger.Debug("setting container network configuration", "network_mode", hostConfig.NetworkMode,
|
|
|
|
"ipv4_address", driverConfig.IPv4Address, "ipv6_address", driverConfig.IPv6Address)
|
|
|
|
}
|
|
|
|
|
|
|
|
if driverConfig.MacAddress != "" {
|
|
|
|
config.MacAddress = driverConfig.MacAddress
|
|
|
|
logger.Debug("setting container mac address", "mac_address", config.MacAddress)
|
|
|
|
}
|
|
|
|
|
2022-08-11 15:19:39 +00:00
|
|
|
if driverConfig.Healthchecks.Disabled() {
|
|
|
|
// Override any image-supplied health-check with the disable sentinel.
|
|
|
|
// https://github.com/docker/engine-api/blob/master/types/container/config.go#L16
|
|
|
|
config.Healthcheck = &docker.HealthConfig{Test: []string{"NONE"}}
|
|
|
|
logger.Debug("setting container healthchecks to be disabled")
|
|
|
|
}
|
|
|
|
|
2018-11-06 05:39:48 +00:00
|
|
|
return docker.CreateContainerOptions{
|
|
|
|
Name: containerName,
|
|
|
|
Config: config,
|
|
|
|
HostConfig: hostConfig,
|
|
|
|
NetworkingConfig: networkingConfig,
|
|
|
|
}, nil
|
|
|
|
}
|
|
|
|
|
2020-12-15 19:13:50 +00:00
|
|
|
func (d *Driver) toDockerMount(m *DockerMount, task *drivers.TaskConfig) (*docker.HostMount, error) {
|
|
|
|
hm, err := m.toDockerHostMount()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
switch hm.Type {
|
|
|
|
case "bind":
|
|
|
|
hm.Source = expandPath(task.TaskDir().Dir, hm.Source)
|
|
|
|
|
|
|
|
// paths inside the alloc dir are always allowed, as they mount within
|
|
|
|
// the container and are treated as relative to the task dir
|
|
|
|
if !d.config.Volumes.Enabled && !isParentPath(task.AllocDir, hm.Source) {
|
|
|
|
return nil, fmt.Errorf(
|
|
|
|
"volumes are not enabled; cannot mount host path: %q %q",
|
|
|
|
hm.Source, task.AllocDir)
|
|
|
|
}
|
|
|
|
case "tmpfs":
|
|
|
|
// no source, so no sandbox check required
|
|
|
|
default: // "volume", but also any new thing that comes along
|
|
|
|
if !d.config.Volumes.Enabled {
|
|
|
|
return nil, fmt.Errorf(
|
|
|
|
"volumes are not enabled; cannot mount volume: %q", hm.Source)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return &hm, nil
|
|
|
|
}
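// Illustrative behavior (assumed paths): with volumes disabled, a bind mount
// whose source is the relative path "local/config" expands under the task
// directory, which sits inside the allocation directory, and is allowed; a
// bind source of "/etc" falls outside the allocation directory and is
// rejected with the "volumes are not enabled" error above.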
|
|
|
|
|
2018-11-06 05:39:48 +00:00
|
|
|
// detectIP of Docker container. Returns the first IP found as well as true if
|
|
|
|
// the IP should be advertised (bridge network IPs return false). Returns an
|
|
|
|
// empty string and false if no IP could be found.
|
|
|
|
func (d *Driver) detectIP(c *docker.Container, driverConfig *TaskConfig) (string, bool) {
|
|
|
|
if c.NetworkSettings == nil {
|
|
|
|
// This should only happen if there's been a coding error (such
|
|
|
|
// as not calling InspectContainer after CreateContainer). Code
|
|
|
|
// defensively in case the Docker API changes subtly.
|
|
|
|
d.logger.Error("no network settings for container", "container_id", c.ID)
|
|
|
|
return "", false
|
|
|
|
}
|
|
|
|
|
|
|
|
ip, ipName := "", ""
|
|
|
|
auto := false
|
|
|
|
for name, net := range c.NetworkSettings.Networks {
|
|
|
|
if net.IPAddress == "" {
|
|
|
|
// Ignore networks without an IP address
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
ip = net.IPAddress
|
|
|
|
if driverConfig.AdvertiseIPv6Addr {
|
|
|
|
ip = net.GlobalIPv6Address
|
|
|
|
auto = true
|
|
|
|
}
|
|
|
|
ipName = name
|
|
|
|
|
|
|
|
// Don't auto-advertise IPs for default networks (bridge on
|
|
|
|
// Linux, nat on Windows)
|
|
|
|
if name != "bridge" && name != "nat" {
|
|
|
|
auto = true
|
|
|
|
}
|
|
|
|
|
|
|
|
break
|
|
|
|
}
|
|
|
|
|
|
|
|
if n := len(c.NetworkSettings.Networks); n > 1 {
|
|
|
|
d.logger.Warn("multiple Docker networks for container found but Nomad only supports 1",
|
|
|
|
"total_networks", n,
|
|
|
|
"container_id", c.ID,
|
|
|
|
"container_network", ipName)
|
|
|
|
}
|
|
|
|
|
|
|
|
return ip, auto
|
|
|
|
}
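// Illustrative behavior (values assumed): a container attached to a
// user-defined network "mynet" with IPAddress "172.18.0.5" yields
// ("172.18.0.5", true), while the same address on the default "bridge"
// network yields ("172.18.0.5", false) so it is not auto-advertised.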
|
|
|
|
|
2019-09-18 20:34:57 +00:00
|
|
|
// containerByName finds a container by name. It returns nil without an error
|
|
|
|
// when no matching container exists, and a recoverable error if inspection fails.
|
|
|
|
func (d *Driver) containerByName(name string) (*docker.Container, error) {
|
|
|
|
|
|
|
|
client, _, err := d.dockerClients()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
containers, err := client.ListContainers(docker.ListContainersOptions{
|
|
|
|
All: true,
|
|
|
|
})
|
|
|
|
if err != nil {
|
|
|
|
d.logger.Error("failed to query list of containers matching name",
|
|
|
|
"container_name", name)
|
|
|
|
return nil, recoverableErrTimeouts(
|
|
|
|
fmt.Errorf("Failed to query list of containers: %s", err))
|
|
|
|
}
|
|
|
|
|
|
|
|
// Docker prepends a / to container names, so match against the Nomad-generated name with that prefix
|
|
|
|
containerName := "/" + name
|
|
|
|
var (
|
|
|
|
shimContainer docker.APIContainers
|
|
|
|
found bool
|
|
|
|
)
|
|
|
|
OUTER:
|
|
|
|
for _, shimContainer = range containers {
|
|
|
|
d.logger.Trace("listed container", "names", hclog.Fmt("%+v", shimContainer.Names))
|
|
|
|
for _, name := range shimContainer.Names {
|
|
|
|
if name == containerName {
|
|
|
|
d.logger.Trace("Found container",
|
|
|
|
"container_name", containerName, "container_id", shimContainer.ID)
|
|
|
|
found = true
|
|
|
|
break OUTER
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if !found {
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
2020-12-10 15:29:18 +00:00
|
|
|
container, err := client.InspectContainerWithOptions(docker.InspectContainerOptions{
|
|
|
|
ID: shimContainer.ID,
|
|
|
|
})
|
2019-09-18 20:34:57 +00:00
|
|
|
if err != nil {
|
|
|
|
err = fmt.Errorf("Failed to inspect container %s: %s", shimContainer.ID, err)
|
|
|
|
|
|
|
|
// This error is always recoverable as it could
|
|
|
|
// be caused by races between listing
|
|
|
|
// containers and this container being removed.
|
|
|
|
// See #2802
|
|
|
|
return nil, nstructs.NewRecoverableError(err, true)
|
|
|
|
}
|
|
|
|
return container, nil
|
|
|
|
}
|
|
|
|
|
2018-11-06 05:39:48 +00:00
|
|
|
// validateCommand validates that the command is non-empty and contains no
|
|
|
|
// leading or trailing whitespace; argField names the config field where any
|
|
|
|
// additional arguments should go.
|
|
|
|
func validateCommand(command, argField string) error {
|
|
|
|
trimmed := strings.TrimSpace(command)
|
|
|
|
if len(trimmed) == 0 {
|
|
|
|
return fmt.Errorf("command empty: %q", command)
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(trimmed) != len(command) {
|
|
|
|
return fmt.Errorf("command contains extra white space: %q", command)
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
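// Illustrative behavior (assumed inputs): validateCommand("/bin/sh", "args")
// returns nil, validateCommand(" /bin/sh", "args") fails with the extra white
// space error, and validateCommand("", "args") fails as an empty command.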
|
|
|
|
|
|
|
|
func (d *Driver) WaitTask(ctx context.Context, taskID string) (<-chan *drivers.ExitResult, error) {
|
|
|
|
h, ok := d.tasks.Get(taskID)
|
|
|
|
if !ok {
|
|
|
|
return nil, drivers.ErrTaskNotFound
|
|
|
|
}
|
|
|
|
ch := make(chan *drivers.ExitResult)
|
|
|
|
go d.handleWait(ctx, ch, h)
|
|
|
|
return ch, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Driver) handleWait(ctx context.Context, ch chan *drivers.ExitResult, h *taskHandle) {
|
|
|
|
defer close(ch)
|
|
|
|
select {
|
|
|
|
case <-h.waitCh:
|
2018-11-21 01:41:32 +00:00
|
|
|
ch <- h.ExitResult()
|
2018-11-06 05:39:48 +00:00
|
|
|
case <-ctx.Done():
|
|
|
|
ch <- &drivers.ExitResult{
|
|
|
|
Err: ctx.Err(),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-09-30 16:36:26 +00:00
|
|
|
func (d *Driver) StopTask(taskID string, timeout time.Duration, signal string) error {
|
|
|
|
h, ok := d.tasks.Get(taskID)
|
|
|
|
if !ok {
|
|
|
|
return drivers.ErrTaskNotFound
|
|
|
|
}
|
|
|
|
|
2021-04-22 18:45:16 +00:00
|
|
|
return h.Kill(timeout, signal)
|
2018-11-06 05:39:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Driver) DestroyTask(taskID string, force bool) error {
|
|
|
|
h, ok := d.tasks.Get(taskID)
|
|
|
|
if !ok {
|
|
|
|
return drivers.ErrTaskNotFound
|
|
|
|
}
|
|
|
|
|
2020-12-10 15:29:18 +00:00
|
|
|
c, err := client.InspectContainerWithOptions(docker.InspectContainerOptions{
|
|
|
|
ID: h.containerID,
|
|
|
|
})
|
2018-11-06 05:39:48 +00:00
|
|
|
if err != nil {
|
2019-06-03 19:17:57 +00:00
|
|
|
switch err.(type) {
|
|
|
|
case *docker.NoSuchContainer:
|
2019-06-03 21:15:54 +00:00
|
|
|
h.logger.Info("container was removed out of band, will proceed with DestroyTask",
|
2019-06-03 19:17:57 +00:00
|
|
|
"error", err)
|
|
|
|
default:
|
|
|
|
return fmt.Errorf("failed to inspect container state: %v", err)
|
2019-05-29 22:38:43 +00:00
|
|
|
}
|
2019-06-03 19:17:57 +00:00
|
|
|
} else {
|
|
|
|
if c.State.Running {
|
|
|
|
if !force {
|
|
|
|
return fmt.Errorf("must call StopTask for the given task before Destroy or set force to true")
|
|
|
|
}
|
|
|
|
if err := h.client.StopContainer(h.containerID, 0); err != nil {
|
|
|
|
h.logger.Warn("failed to stop container during destroy", "error", err)
|
|
|
|
}
|
2019-05-29 22:38:43 +00:00
|
|
|
}
|
2018-11-06 05:39:48 +00:00
|
|
|
|
2019-06-03 19:17:57 +00:00
|
|
|
if h.removeContainerOnExit {
|
|
|
|
if err := h.client.RemoveContainer(docker.RemoveContainerOptions{ID: h.containerID, RemoveVolumes: true, Force: true}); err != nil {
|
|
|
|
h.logger.Error("error removing container", "error", err)
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
h.logger.Debug("not removing container due to config")
|
2019-05-29 22:38:43 +00:00
|
|
|
}
|
2018-11-06 05:39:48 +00:00
|
|
|
}
|
|
|
|
|
2018-11-09 04:38:47 +00:00
|
|
|
if err := d.cleanupImage(h); err != nil {
|
|
|
|
h.logger.Error("failed to cleanup image after destroying container",
|
|
|
|
"error", err)
|
|
|
|
}
|
|
|
|
|
2018-12-18 14:53:31 +00:00
|
|
|
d.tasks.Delete(taskID)
|
2018-11-09 04:38:47 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// cleanupImage removes a Docker image. No error is returned if the image
|
|
|
|
// doesn't exist or is still in use. Requires the global client to already be
|
|
|
|
// initialized.
|
|
|
|
func (d *Driver) cleanupImage(handle *taskHandle) error {
|
2018-11-20 02:32:08 +00:00
|
|
|
if !d.config.GC.Image {
|
2018-11-09 04:38:47 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2018-11-20 02:41:25 +00:00
|
|
|
d.coordinator.RemoveImage(handle.containerImage, handle.task.ID)
|
2018-11-09 04:38:47 +00:00
|
|
|
|
2018-11-06 05:39:48 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Driver) InspectTask(taskID string) (*drivers.TaskStatus, error) {
|
|
|
|
h, ok := d.tasks.Get(taskID)
|
|
|
|
if !ok {
|
|
|
|
return nil, drivers.ErrTaskNotFound
|
|
|
|
}
|
|
|
|
|
2020-12-10 15:29:18 +00:00
|
|
|
container, err := client.InspectContainerWithOptions(docker.InspectContainerOptions{
|
|
|
|
ID: h.containerID,
|
|
|
|
})
|
2018-11-20 02:41:25 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to inspect container %q: %v", h.containerID, err)
|
|
|
|
}
|
2018-11-09 04:38:47 +00:00
|
|
|
status := &drivers.TaskStatus{
|
|
|
|
ID: h.task.ID,
|
|
|
|
Name: h.task.Name,
|
2018-11-20 02:41:25 +00:00
|
|
|
StartedAt: container.State.StartedAt,
|
|
|
|
CompletedAt: container.State.FinishedAt,
|
2018-11-09 04:38:47 +00:00
|
|
|
DriverAttributes: map[string]string{
|
2018-11-20 02:41:25 +00:00
|
|
|
"container_id": container.ID,
|
2018-11-09 04:38:47 +00:00
|
|
|
},
|
|
|
|
NetworkOverride: h.net,
|
2018-11-21 01:41:32 +00:00
|
|
|
ExitResult: h.ExitResult(),
|
2018-11-09 04:38:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
status.State = drivers.TaskStateUnknown
|
2018-11-20 02:41:25 +00:00
|
|
|
if container.State.Running {
|
2018-11-09 04:38:47 +00:00
|
|
|
status.State = drivers.TaskStateRunning
|
|
|
|
}
|
2018-11-20 02:41:25 +00:00
|
|
|
if container.State.Dead {
|
2018-11-09 04:38:47 +00:00
|
|
|
status.State = drivers.TaskStateExited
|
|
|
|
}
|
|
|
|
|
|
|
|
return status, nil
|
2018-11-06 05:39:48 +00:00
|
|
|
}
|
|
|
|
|
2018-12-11 20:27:50 +00:00
|
|
|
func (d *Driver) TaskStats(ctx context.Context, taskID string, interval time.Duration) (<-chan *drivers.TaskResourceUsage, error) {
|
2018-11-06 05:39:48 +00:00
|
|
|
h, ok := d.tasks.Get(taskID)
|
|
|
|
if !ok {
|
|
|
|
return nil, drivers.ErrTaskNotFound
|
|
|
|
}
|
|
|
|
|
2018-12-11 20:27:50 +00:00
|
|
|
return h.Stats(ctx, interval)
|
2018-11-06 05:39:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Driver) TaskEvents(ctx context.Context) (<-chan *drivers.TaskEvent, error) {
|
|
|
|
return d.eventer.TaskEvents(ctx)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Driver) SignalTask(taskID string, signal string) error {
|
|
|
|
h, ok := d.tasks.Get(taskID)
|
|
|
|
if !ok {
|
|
|
|
return drivers.ErrTaskNotFound
|
|
|
|
}
|
|
|
|
|
|
|
|
sig, err := signals.Parse(signal)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed to parse signal: %v", err)
|
|
|
|
}
|
|
|
|
|
2020-12-02 21:22:38 +00:00
|
|
|
// TODO: review whether we can timeout in this and other Docker API
|
|
|
|
// calls without breaking the expected client behavior.
|
|
|
|
// see https://github.com/hashicorp/nomad/issues/9503
|
|
|
|
return h.Signal(context.Background(), sig)
|
2018-11-06 05:39:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Driver) ExecTask(taskID string, cmd []string, timeout time.Duration) (*drivers.ExecTaskResult, error) {
|
|
|
|
h, ok := d.tasks.Get(taskID)
|
|
|
|
if !ok {
|
|
|
|
return nil, drivers.ErrTaskNotFound
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(cmd) == 0 {
|
|
|
|
return nil, fmt.Errorf("cmd is required, but was empty")
|
|
|
|
}
|
|
|
|
|
2018-11-19 20:06:07 +00:00
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
|
|
|
defer cancel()
|
2018-11-06 05:39:48 +00:00
|
|
|
|
|
|
|
return h.Exec(ctx, cmd[0], cmd[1:])
|
|
|
|
}
|
|
|
|
|
2019-04-28 21:26:15 +00:00
|
|
|
var _ drivers.ExecTaskStreamingDriver = (*Driver)(nil)
|
|
|
|
|
|
|
|
func (d *Driver) ExecTaskStreaming(ctx context.Context, taskID string, opts *drivers.ExecOptions) (*drivers.ExitResult, error) {
|
|
|
|
defer opts.Stdout.Close()
|
|
|
|
defer opts.Stderr.Close()
|
|
|
|
|
|
|
|
done := make(chan interface{})
|
|
|
|
defer close(done)
|
|
|
|
|
|
|
|
h, ok := d.tasks.Get(taskID)
|
|
|
|
if !ok {
|
|
|
|
return nil, drivers.ErrTaskNotFound
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(opts.Command) == 0 {
|
|
|
|
return nil, fmt.Errorf("command is required but was empty")
|
|
|
|
}
|
|
|
|
|
|
|
|
createExecOpts := docker.CreateExecOptions{
|
|
|
|
AttachStdin: true,
|
|
|
|
AttachStdout: true,
|
|
|
|
AttachStderr: true,
|
|
|
|
Tty: opts.Tty,
|
|
|
|
Cmd: opts.Command,
|
|
|
|
Container: h.containerID,
|
|
|
|
Context: ctx,
|
|
|
|
}
|
|
|
|
exec, err := h.client.CreateExec(createExecOpts)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to create exec object: %v", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
go func() {
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
return
|
|
|
|
case <-done:
|
|
|
|
return
|
|
|
|
case s, ok := <-opts.ResizeCh:
|
|
|
|
if !ok {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
client.ResizeExecTTY(exec.ID, s.Height, s.Width)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
|
|
|
startOpts := docker.StartExecOptions{
|
|
|
|
Detach: false,
|
|
|
|
|
|
|
|
// When running in TTY, we must use a raw terminal.
|
|
|
|
// If not, we set RawTerminal to false to allow the docker client
|
|
|
|
// to interpret special stdout/stderr messages
|
|
|
|
Tty: opts.Tty,
|
|
|
|
RawTerminal: opts.Tty,
|
|
|
|
|
|
|
|
InputStream: opts.Stdin,
|
|
|
|
OutputStream: opts.Stdout,
|
|
|
|
ErrorStream: opts.Stderr,
|
|
|
|
Context: ctx,
|
|
|
|
}
|
|
|
|
if err := client.StartExec(exec.ID, startOpts); err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to start exec: %v", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// StartExec returns after process completes, but InspectExec seems to have a delay
|
|
|
|
// in getting the status code
|
|
|
|
|
|
|
|
const execTerminatingTimeout = 3 * time.Second
|
|
|
|
start := time.Now()
|
|
|
|
var res *docker.ExecInspect
|
2019-08-16 13:01:50 +00:00
|
|
|
for (res == nil || res.Running) && time.Since(start) <= execTerminatingTimeout {
|
2019-04-28 21:26:15 +00:00
|
|
|
res, err = client.InspectExec(exec.ID)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to inspect exec result: %v", err)
|
|
|
|
}
|
|
|
|
time.Sleep(50 * time.Millisecond)
|
|
|
|
}
|
|
|
|
|
|
|
|
if res == nil || res.Running {
|
|
|
|
return nil, fmt.Errorf("failed to retrieve exec result")
|
|
|
|
}
|
|
|
|
|
|
|
|
return &drivers.ExitResult{
|
|
|
|
ExitCode: res.ExitCode,
|
|
|
|
}, nil
|
|
|
|
}
|
|
|
|
|
2018-11-06 05:39:48 +00:00
|
|
|
// dockerClients creates two *docker.Client, one for long running operations and
|
|
|
|
// the other for shorter operations. In test / dev mode we can use ENV vars to
|
|
|
|
// connect to the docker daemon. In production mode we will read docker.endpoint
|
|
|
|
// from the config file.
|
|
|
|
func (d *Driver) dockerClients() (*docker.Client, *docker.Client, error) {
|
|
|
|
createClientsLock.Lock()
|
|
|
|
defer createClientsLock.Unlock()
|
|
|
|
|
|
|
|
if client != nil && waitClient != nil {
|
|
|
|
return client, waitClient, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
var err error
|
|
|
|
|
2020-09-30 16:36:26 +00:00
|
|
|
// Only initialize the client if it hasn't yet been done
|
2018-11-06 05:39:48 +00:00
|
|
|
if client == nil {
|
|
|
|
client, err = d.newDockerClient(dockerTimeout)
|
|
|
|
if err != nil {
|
|
|
|
return nil, nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Only initialize the waitClient if it hasn't yet been done
|
|
|
|
if waitClient == nil {
|
|
|
|
waitClient, err = d.newDockerClient(0 * time.Minute)
|
|
|
|
if err != nil {
|
|
|
|
return nil, nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return client, waitClient, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// newDockerClient creates a new *docker.Client with a configurable timeout
|
|
|
|
func (d *Driver) newDockerClient(timeout time.Duration) (*docker.Client, error) {
|
|
|
|
var err error
|
|
|
|
var merr multierror.Error
|
|
|
|
var newClient *docker.Client
|
|
|
|
|
|
|
|
// Default to using whatever is configured in docker.endpoint. If this is
|
|
|
|
// not specified we'll fall back on NewClientFromEnv which reads config from
|
|
|
|
// the DOCKER_* environment variables DOCKER_HOST, DOCKER_TLS_VERIFY, and
|
|
|
|
// DOCKER_CERT_PATH. This allows us to lock down the config in production
|
|
|
|
// but also accept the standard ENV configs for dev and test.
|
|
|
|
dockerEndpoint := d.config.Endpoint
|
|
|
|
if dockerEndpoint != "" {
|
|
|
|
cert := d.config.TLS.Cert
|
|
|
|
key := d.config.TLS.Key
|
|
|
|
ca := d.config.TLS.CA
|
|
|
|
|
|
|
|
if cert+key+ca != "" {
|
|
|
|
d.logger.Debug("using TLS client connection", "endpoint", dockerEndpoint)
|
|
|
|
newClient, err = docker.NewTLSClient(dockerEndpoint, cert, key, ca)
|
|
|
|
if err != nil {
|
|
|
|
merr.Errors = append(merr.Errors, err)
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
d.logger.Debug("using standard client connection", "endpoint", dockerEndpoint)
|
|
|
|
newClient, err = docker.NewClient(dockerEndpoint)
|
|
|
|
if err != nil {
|
|
|
|
merr.Errors = append(merr.Errors, err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
d.logger.Debug("using client connection initialized from environment")
|
|
|
|
newClient, err = docker.NewClientFromEnv()
|
|
|
|
if err != nil {
|
|
|
|
merr.Errors = append(merr.Errors, err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if timeout != 0 && newClient != nil {
|
|
|
|
newClient.SetTimeout(timeout)
|
|
|
|
}
|
|
|
|
return newClient, merr.ErrorOrNil()
|
|
|
|
}
|
|
|
|
|
|
|
|
func sliceMergeUlimit(ulimitsRaw map[string]string) ([]docker.ULimit, error) {
|
|
|
|
var ulimits []docker.ULimit
|
|
|
|
|
|
|
|
for name, ulimitRaw := range ulimitsRaw {
|
|
|
|
if len(ulimitRaw) == 0 {
|
|
|
|
return []docker.ULimit{}, fmt.Errorf("Malformed ulimit specification %v: %q, cannot be empty", name, ulimitRaw)
|
|
|
|
}
|
|
|
|
// hard limit is optional
|
2020-12-09 19:05:18 +00:00
|
|
|
if !strings.Contains(ulimitRaw, ":") {
|
2018-11-06 05:39:48 +00:00
|
|
|
ulimitRaw = ulimitRaw + ":" + ulimitRaw
|
|
|
|
}
|
|
|
|
|
|
|
|
splitted := strings.SplitN(ulimitRaw, ":", 2)
|
|
|
|
if len(splitted) < 2 {
|
|
|
|
return []docker.ULimit{}, fmt.Errorf("Malformed ulimit specification %v: %v", name, ulimitRaw)
|
|
|
|
}
|
|
|
|
soft, err := strconv.Atoi(splitted[0])
|
|
|
|
if err != nil {
|
|
|
|
return []docker.ULimit{}, fmt.Errorf("Malformed soft ulimit %v: %v", name, ulimitRaw)
|
|
|
|
}
|
|
|
|
hard, err := strconv.Atoi(splitted[1])
|
|
|
|
if err != nil {
|
|
|
|
return []docker.ULimit{}, fmt.Errorf("Malformed hard ulimit %v: %v", name, ulimitRaw)
|
|
|
|
}
|
|
|
|
|
|
|
|
ulimit := docker.ULimit{
|
|
|
|
Name: name,
|
|
|
|
Soft: int64(soft),
|
|
|
|
Hard: int64(hard),
|
|
|
|
}
|
|
|
|
ulimits = append(ulimits, ulimit)
|
|
|
|
}
|
|
|
|
return ulimits, nil
|
|
|
|
}
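// Illustrative usage (assumed values): sliceMergeUlimit(map[string]string{
// "nofile": "2048:4096", "nproc": "4242"}) produces a nofile limit with soft
// 2048 and hard 4096, plus an nproc limit whose soft and hard are both 4242,
// since a missing hard value falls back to the soft value above.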
|
2018-12-20 12:25:07 +00:00
|
|
|
|
2019-09-13 19:25:31 +00:00
|
|
|
func isDockerTransientError(err error) bool {
|
|
|
|
if err == nil {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
errMsg := err.Error()
|
|
|
|
for _, te := range dockerTransientErrs {
|
|
|
|
if strings.Contains(errMsg, te) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false
|
|
|
|
}
|