drivers/docker: refactor use of clients in docker driver (#17731)

* drivers/docker: refactor use of clients in docker driver

This PR refactors how we manage the two underlying clients used by the
docker driver for communicating with the docker daemon. We keep two
clients: one with a hard-coded timeout that applies to every operation,
intended for short-lived / async calls to docker; the other has no
timeout, and it is the caller's responsibility to set a context that
ensures the call eventually terminates.
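
As a rough sketch of that caller-managed-context pattern for the no-timeout
client (assuming code inside the drivers/docker package; the helper name
stopBounded and the 30-second padding are hypothetical, while
getInfinityClient and StopContainerWithContext are the accessor and
go-dockerclient call used in the diff below):

```go
package docker

import (
	"context"
	"time"
)

// stopBounded is a hypothetical caller of the no-timeout ("infinity")
// client: the client never times out on its own, so the caller bounds the
// call with a context instead.
func (d *Driver) stopBounded(containerID string, killTimeout time.Duration) error {
	infinityClient, err := d.getInfinityClient()
	if err != nil {
		return err
	}

	// Give the context a little slack beyond the kill timeout docker will
	// honor, so a graceful stop can still complete before we give up.
	ctx, cancel := context.WithTimeout(context.Background(), killTimeout+30*time.Second)
	defer cancel()

	return infinityClient.StopContainerWithContext(containerID, uint(killTimeout.Seconds()), ctx)
}
```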

The use of these two clients has been confusing, and mistakes were made
in a number of places where calls used the wrong client.

This PR makes it so that a user must explicitly call a function to get
the client that makes sense for that use case.
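
For illustration, a condensed sketch of the two accessors at a call site
(same package as the driver; the wrapper function inspectThenWait is
hypothetical, while getDockerClient, getInfinityClient,
InspectContainerWithOptions, and WaitContainer all appear in the diff below):

```go
package docker

import (
	docker "github.com/fsouza/go-dockerclient"
)

// inspectThenWait is a hypothetical helper showing which client each kind
// of call should use after this refactor.
func (d *Driver) inspectThenWait(containerID string) (int, error) {
	// Short, bounded call: use the client with the built-in timeout.
	dockerClient, err := d.getDockerClient()
	if err != nil {
		return 0, err
	}
	if _, err := dockerClient.InspectContainerWithOptions(docker.InspectContainerOptions{
		ID: containerID,
	}); err != nil {
		return 0, err
	}

	// Potentially unbounded call: use the no-timeout client; the caller or
	// the API semantics bound it, not the client.
	infinityClient, err := d.getInfinityClient()
	if err != nil {
		return 0, err
	}
	return infinityClient.WaitContainer(containerID)
}
```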

Fixes #17023

* cr: followup items
Seth Hoenig 2023-06-26 15:21:42 -05:00 committed by GitHub
parent 4c6906d873
commit d590123637
9 changed files with 188 additions and 124 deletions

.changelog/17731.txt (new file)
View File

@ -0,0 +1,3 @@
```release-note:bug
drivers/docker: Fixed a bug where long-running docker operations would incorrectly timeout
```

View File

@ -765,7 +765,7 @@ func (d *Driver) SetConfig(c *base.Config) error {
d.clientConfig = c.AgentConfig.Driver
}
dockerClient, _, err := d.dockerClients()
dockerClient, err := d.getDockerClient()
if err != nil {
return fmt.Errorf("failed to get docker client: %v", err)
}

View File

@ -30,6 +30,7 @@ import (
"github.com/hashicorp/nomad/drivers/shared/eventer"
"github.com/hashicorp/nomad/drivers/shared/hostnames"
"github.com/hashicorp/nomad/drivers/shared/resolvconf"
"github.com/hashicorp/nomad/helper"
nstructs "github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/plugins/base"
"github.com/hashicorp/nomad/plugins/drivers"
@ -39,19 +40,6 @@ import (
)
var (
// createClientsLock is a lock that protects reading/writing global client
// variables
createClientsLock sync.Mutex
// client is a docker client with a timeout of 5 minutes. This is for doing
// all operations with the docker daemon besides which are not long running
// such as creating, killing containers, etc.
client *docker.Client
// waitClient is a docker client with no timeouts. This is used for long
// running operations such as waiting on containers and collect stats
waitClient *docker.Client
dockerTransientErrs = []string{
"Client.Timeout exceeded while awaiting headers",
"EOF",
@ -159,6 +147,10 @@ type Driver struct {
detected bool
detectedLock sync.RWMutex
dockerClientLock sync.Mutex
dockerClient *docker.Client // for most docker api calls (use getDockerClient())
infinityClient *docker.Client // for wait and stop calls (use getInfinityClient())
danglingReconciler *containerReconciler
cpusetFixer CpusetFixer
}
@ -229,12 +221,17 @@ func (d *Driver) RecoverTask(handle *drivers.TaskHandle) error {
return fmt.Errorf("failed to decode driver task state: %v", err)
}
client, _, err := d.dockerClients()
dockerClient, err := d.getDockerClient()
if err != nil {
return fmt.Errorf("failed to get docker client: %v", err)
return fmt.Errorf("failed to get docker client: %w", err)
}
container, err := client.InspectContainerWithOptions(docker.InspectContainerOptions{
infinityClient, err := d.getInfinityClient()
if err != nil {
return fmt.Errorf("failed to get docker long operations client: %w", err)
}
container, err := dockerClient.InspectContainerWithOptions(docker.InspectContainerOptions{
ID: handleState.ContainerID,
})
if err != nil {
@ -242,8 +239,8 @@ func (d *Driver) RecoverTask(handle *drivers.TaskHandle) error {
}
h := &taskHandle{
client: client,
waitClient: waitClient,
dockerClient: dockerClient,
infinityClient: infinityClient,
logger: d.logger.With("container_id", container.ID),
task: handle.Config,
containerID: container.ID,
@ -261,14 +258,14 @@ func (d *Driver) RecoverTask(handle *drivers.TaskHandle) error {
h.dlogger, h.dloggerPluginClient, err = d.setupNewDockerLogger(container, handle.Config, time.Now())
if err != nil {
if err := client.StopContainer(handleState.ContainerID, 0); err != nil {
if err := dockerClient.StopContainer(handleState.ContainerID, 0); err != nil {
d.logger.Warn("failed to stop container during cleanup", "container_id", handleState.ContainerID, "error", err)
}
return fmt.Errorf("failed to setup replacement docker logger: %v", err)
}
if err := handle.SetDriverState(h.buildState()); err != nil {
if err := client.StopContainer(handleState.ContainerID, 0); err != nil {
if err := dockerClient.StopContainer(handleState.ContainerID, 0); err != nil {
d.logger.Warn("failed to stop container during cleanup", "container_id", handleState.ContainerID, "error", err)
}
return fmt.Errorf("failed to store driver state: %v", err)
@ -315,13 +312,19 @@ func (d *Driver) StartTask(cfg *drivers.TaskConfig) (*drivers.TaskHandle, *drive
handle := drivers.NewTaskHandle(taskHandleVersion)
handle.Config = cfg
// Initialize docker API clients
client, _, err := d.dockerClients()
// we'll need the normal docker client
dockerClient, err := d.getDockerClient()
if err != nil {
return nil, nil, fmt.Errorf("Failed to connect to docker daemon: %s", err)
return nil, nil, fmt.Errorf("Failed to create docker client: %v", err)
}
id, err := d.createImage(cfg, &driverConfig, client)
// and also the long operations client
infinityClient, err := d.getInfinityClient()
if err != nil {
return nil, nil, fmt.Errorf("Failed to create long operations docker client: %v", err)
}
id, err := d.createImage(cfg, &driverConfig, dockerClient)
if err != nil {
return nil, nil, err
}
@ -342,10 +345,10 @@ func (d *Driver) StartTask(cfg *drivers.TaskConfig) (*drivers.TaskHandle, *drive
startAttempts := 0
CREATE:
container, err := d.createContainer(client, containerCfg, driverConfig.Image)
container, err := d.createContainer(dockerClient, containerCfg, driverConfig.Image)
if err != nil {
d.logger.Error("failed to create container", "error", err)
client.RemoveContainer(docker.RemoveContainerOptions{
dockerClient.RemoveContainer(docker.RemoveContainerOptions{
ID: containerCfg.Name,
Force: true,
})
@ -361,7 +364,7 @@ CREATE:
// Start the container
if err := d.startContainer(container); err != nil {
d.logger.Error("failed to start container", "container_id", container.ID, "error", err)
client.RemoveContainer(docker.RemoveContainerOptions{
dockerClient.RemoveContainer(docker.RemoveContainerOptions{
ID: container.ID,
Force: true,
})
@ -376,17 +379,17 @@ CREATE:
// Inspect container to get all of the container metadata as much of the
// metadata (eg networking) isn't populated until the container is started
runningContainer, err := client.InspectContainerWithOptions(docker.InspectContainerOptions{
runningContainer, err := dockerClient.InspectContainerWithOptions(docker.InspectContainerOptions{
ID: container.ID,
})
if err != nil {
client.RemoveContainer(docker.RemoveContainerOptions{
dockerClient.RemoveContainer(docker.RemoveContainerOptions{
ID: container.ID,
Force: true,
})
msg := "failed to inspect started container"
d.logger.Error(msg, "error", err)
client.RemoveContainer(docker.RemoveContainerOptions{
dockerClient.RemoveContainer(docker.RemoveContainerOptions{
ID: container.ID,
Force: true,
})
@ -419,7 +422,7 @@ CREATE:
dlogger, pluginClient, err = d.setupNewDockerLogger(container, cfg, time.Unix(0, 0))
if err != nil {
d.logger.Error("an error occurred after container startup, terminating container", "container_id", container.ID)
client.RemoveContainer(docker.RemoveContainerOptions{ID: container.ID, Force: true})
dockerClient.RemoveContainer(docker.RemoveContainerOptions{ID: container.ID, Force: true})
return nil, nil, err
}
}
@ -435,8 +438,8 @@ CREATE:
// Return a driver handle
h := &taskHandle{
client: client,
waitClient: waitClient,
dockerClient: dockerClient,
infinityClient: infinityClient,
dlogger: dlogger,
dloggerPluginClient: pluginClient,
logger: d.logger.With("container_id", container.ID),
@ -455,7 +458,7 @@ CREATE:
dlogger.Stop()
pluginClient.Kill()
}
client.RemoveContainer(docker.RemoveContainerOptions{ID: container.ID, Force: true})
dockerClient.RemoveContainer(docker.RemoveContainerOptions{ID: container.ID, Force: true})
return nil, nil, err
}
@ -548,10 +551,15 @@ CREATE:
// startContainer starts the passed container. It attempts to handle any
// transient Docker errors.
func (d *Driver) startContainer(c *docker.Container) error {
dockerClient, err := d.getDockerClient()
if err != nil {
return err
}
// Start a container
attempted := 0
START:
startErr := client.StartContainer(c.ID, c.HostConfig)
startErr := dockerClient.StartContainer(c.ID, c.HostConfig)
if startErr == nil || strings.Contains(startErr.Error(), "Container already running") {
return nil
}
@ -691,7 +699,12 @@ func (d *Driver) loadImage(task *drivers.TaskConfig, driverConfig *TaskConfig, c
}
func (d *Driver) convertAllocPathsForWindowsLCOW(task *drivers.TaskConfig, image string) error {
imageConfig, err := client.InspectImage(image)
dockerClient, err := d.getDockerClient()
if err != nil {
return err
}
imageConfig, err := dockerClient.InspectImage(image)
if err != nil {
return fmt.Errorf("the image does not exist: %v", err)
}
@ -763,7 +776,7 @@ func (d *Driver) containerBinds(task *drivers.TaskConfig, driverConfig *TaskConf
func (d *Driver) findPauseContainer(allocID string) (string, error) {
_, dockerClient, err := d.dockerClients()
dockerClient, err := d.getDockerClient()
if err != nil {
return "", err
}
@ -799,7 +812,7 @@ func (d *Driver) findPauseContainer(allocID string) (string, error) {
// tracking. Basically just scan all containers and pull the ID from anything
// that has the Nomad Label and has Name with prefix "/nomad_init_".
func (d *Driver) recoverPauseContainers(ctx context.Context) {
_, dockerClient, err := d.dockerClients()
dockerClient, err := d.getDockerClient()
if err != nil {
d.logger.Error("failed to recover pause containers", "error", err)
return
@ -1459,11 +1472,11 @@ func (d *Driver) detectIP(c *docker.Container, driverConfig *TaskConfig) (string
// if the container is dead or can't be found.
func (d *Driver) containerByName(name string) (*docker.Container, error) {
client, _, err := d.dockerClients()
dockerClient, err := d.getDockerClient()
if err != nil {
return nil, err
}
containers, err := client.ListContainers(docker.ListContainersOptions{
containers, err := dockerClient.ListContainers(docker.ListContainersOptions{
All: true,
})
if err != nil {
@ -1495,7 +1508,7 @@ OUTER:
return nil, nil
}
container, err := client.InspectContainerWithOptions(docker.InspectContainerOptions{
container, err := dockerClient.InspectContainerWithOptions(docker.InspectContainerOptions{
ID: shimContainer.ID,
})
if err != nil {
@ -1563,7 +1576,12 @@ func (d *Driver) DestroyTask(taskID string, force bool) error {
return drivers.ErrTaskNotFound
}
c, err := client.InspectContainerWithOptions(docker.InspectContainerOptions{
dockerClient, err := d.getDockerClient()
if err != nil {
return err
}
c, err := dockerClient.InspectContainerWithOptions(docker.InspectContainerOptions{
ID: h.containerID,
})
if err != nil {
@ -1579,13 +1597,13 @@ func (d *Driver) DestroyTask(taskID string, force bool) error {
if !force {
return fmt.Errorf("must call StopTask for the given task before Destroy or set force to true")
}
if err := h.client.StopContainer(h.containerID, 0); err != nil {
if err := dockerClient.StopContainer(h.containerID, 0); err != nil {
h.logger.Warn("failed to stop container during destroy", "error", err)
}
}
if h.removeContainerOnExit {
if err := h.client.RemoveContainer(docker.RemoveContainerOptions{ID: h.containerID, RemoveVolumes: true, Force: true}); err != nil {
if err := dockerClient.RemoveContainer(docker.RemoveContainerOptions{ID: h.containerID, RemoveVolumes: true, Force: true}); err != nil {
h.logger.Error("error removing container", "error", err)
}
} else {
@ -1621,7 +1639,12 @@ func (d *Driver) InspectTask(taskID string) (*drivers.TaskStatus, error) {
return nil, drivers.ErrTaskNotFound
}
container, err := client.InspectContainerWithOptions(docker.InspectContainerOptions{
dockerClient, err := d.getDockerClient()
if err != nil {
return nil, err
}
container, err := dockerClient.InspectContainerWithOptions(docker.InspectContainerOptions{
ID: h.containerID,
})
if err != nil {
@ -1723,7 +1746,13 @@ func (d *Driver) ExecTaskStreaming(ctx context.Context, taskID string, opts *dri
Container: h.containerID,
Context: ctx,
}
exec, err := h.client.CreateExec(createExecOpts)
dockerClient, err := d.getDockerClient()
if err != nil {
return nil, err
}
exec, err := dockerClient.CreateExec(createExecOpts)
if err != nil {
return nil, fmt.Errorf("failed to create exec object: %v", err)
}
@ -1739,7 +1768,7 @@ func (d *Driver) ExecTaskStreaming(ctx context.Context, taskID string, opts *dri
if !ok {
return
}
client.ResizeExecTTY(exec.ID, s.Height, s.Width)
dockerClient.ResizeExecTTY(exec.ID, s.Height, s.Width)
}
}
}()
@ -1758,7 +1787,7 @@ func (d *Driver) ExecTaskStreaming(ctx context.Context, taskID string, opts *dri
ErrorStream: opts.Stderr,
Context: ctx,
}
if err := client.StartExec(exec.ID, startOpts); err != nil {
if err := dockerClient.StartExec(exec.ID, startOpts); err != nil {
return nil, fmt.Errorf("failed to start exec: %v", err)
}
@ -1769,7 +1798,7 @@ func (d *Driver) ExecTaskStreaming(ctx context.Context, taskID string, opts *dri
start := time.Now()
var res *docker.ExecInspect
for (res == nil || res.Running) && time.Since(start) <= execTerminatingTimeout {
res, err = client.InspectExec(exec.ID)
res, err = dockerClient.InspectExec(exec.ID)
if err != nil {
return nil, fmt.Errorf("failed to inspect exec result: %v", err)
}
@ -1785,37 +1814,37 @@ func (d *Driver) ExecTaskStreaming(ctx context.Context, taskID string, opts *dri
}, nil
}
// dockerClients creates two *docker.Client, one for long running operations and
// the other for shorter operations. In test / dev mode we can use ENV vars to
// connect to the docker daemon. In production mode we will read docker.endpoint
// from the config file.
func (d *Driver) dockerClients() (*docker.Client, *docker.Client, error) {
createClientsLock.Lock()
defer createClientsLock.Unlock()
func (d *Driver) getOrCreateClient(timeout time.Duration) (*docker.Client, error) {
var (
client *docker.Client
err error
)
if client != nil && waitClient != nil {
return client, waitClient, nil
helper.WithLock(&d.dockerClientLock, func() {
if timeout == 0 {
if d.infinityClient == nil {
d.infinityClient, err = d.newDockerClient(0)
}
client = d.infinityClient
} else {
if d.dockerClient == nil {
d.dockerClient, err = d.newDockerClient(timeout)
}
client = d.dockerClient
}
})
var err error
return client, err
}
// Only initialize the client if it hasn't yet been done
if client == nil {
client, err = d.newDockerClient(dockerTimeout)
if err != nil {
return nil, nil, err
}
}
// getInfinityClient creates a docker API client with no timeout.
func (d *Driver) getInfinityClient() (*docker.Client, error) {
return d.getOrCreateClient(0)
}
// Only initialize the waitClient if it hasn't yet been done
if waitClient == nil {
waitClient, err = d.newDockerClient(0 * time.Minute)
if err != nil {
return nil, nil, err
}
}
return client, waitClient, nil
// getDockerClient creates a docker API client with a hard-coded timeout.
func (d *Driver) getDockerClient() (*docker.Client, error) {
return d.getOrCreateClient(dockerTimeout)
}
// newDockerClient creates a new *docker.Client with a configurable timeout

View File

@ -25,6 +25,7 @@ import (
"github.com/hashicorp/nomad/plugins/drivers"
dtestutil "github.com/hashicorp/nomad/plugins/drivers/testutils"
tu "github.com/hashicorp/nomad/testutil"
"github.com/shoenig/test/must"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
@ -125,25 +126,25 @@ func TestDockerDriver_NetworkMode_Host(t *testing.T) {
copyImage(t, task.TaskDir(), "busybox.tar")
_, _, err := d.StartTask(task)
require.NoError(t, err)
must.NoError(t, err)
require.NoError(t, d.WaitUntilStarted(task.ID, 5*time.Second))
must.NoError(t, d.WaitUntilStarted(task.ID, 5*time.Second))
defer d.DestroyTask(task.ID, true)
dockerDriver, ok := d.Impl().(*Driver)
require.True(t, ok)
must.True(t, ok)
handle, ok := dockerDriver.tasks.Get(task.ID)
require.True(t, ok)
must.True(t, ok)
client := newTestDockerClient(t)
container, err := client.InspectContainer(handle.containerID)
if err != nil {
t.Fatalf("err: %v", err)
}
must.NoError(t, err)
actual := container.HostConfig.NetworkMode
require.Equal(t, expected, actual)
must.Eq(t, expected, actual)
}
func TestDockerDriver_CPUCFSPeriod(t *testing.T) {

View File

@ -98,7 +98,7 @@ func (d *Driver) buildFingerprint() *drivers.Fingerprint {
return fp
}
client, _, err := d.dockerClients()
dockerClient, err := d.getDockerClient()
if err != nil {
if d.fingerprintSuccessful() {
d.logger.Info("failed to initialize client", "error", err)
@ -110,10 +110,10 @@ func (d *Driver) buildFingerprint() *drivers.Fingerprint {
}
}
env, err := client.Version()
env, err := dockerClient.Version()
if err != nil {
if d.fingerprintSuccessful() {
d.logger.Debug("could not connect to docker daemon", "endpoint", client.Endpoint(), "error", err)
d.logger.Debug("could not connect to docker daemon", "endpoint", dockerClient.Endpoint(), "error", err)
}
d.setFingerprintFailure()
@ -143,7 +143,7 @@ func (d *Driver) buildFingerprint() *drivers.Fingerprint {
fp.Attributes["driver.docker.volumes.enabled"] = pstructs.NewBoolAttribute(true)
}
if nets, err := client.ListNetworks(); err != nil {
if nets, err := dockerClient.ListNetworks(); err != nil {
d.logger.Warn("error discovering bridge IP", "error", err)
} else {
for _, n := range nets {
@ -167,7 +167,7 @@ func (d *Driver) buildFingerprint() *drivers.Fingerprint {
}
}
if dockerInfo, err := client.Info(); err != nil {
if dockerInfo, err := dockerClient.Info(); err != nil {
d.logger.Warn("failed to get Docker system info", "error", err)
} else {
runtimeNames := make([]string, 0, len(dockerInfo.Runtimes))

View File

@ -25,8 +25,17 @@ import (
)
type taskHandle struct {
client *docker.Client
waitClient *docker.Client
// dockerClient is useful for normal docker API calls. It should be used
// for all calls that aren't Wait() or Stop() (and their variations).
dockerClient *docker.Client
// infinityClient is useful for
// - the Wait docker API call(s) (no limit on container lifetime)
// - the Stop docker API call(s) (context with task kill_timeout required)
// Do not use this client for any other docker API calls, instead use the
// normal dockerClient which includes a default timeout.
infinityClient *docker.Client
logger hclog.Logger
dlogger docklog.DockerLogger
dloggerPluginClient *plugin.Client
@ -80,7 +89,7 @@ func (h *taskHandle) Exec(ctx context.Context, cmd string, args []string) (*driv
Container: h.containerID,
Context: ctx,
}
exec, err := h.client.CreateExec(createExecOpts)
exec, err := h.dockerClient.CreateExec(createExecOpts)
if err != nil {
return nil, err
}
@ -95,12 +104,12 @@ func (h *taskHandle) Exec(ctx context.Context, cmd string, args []string) (*driv
ErrorStream: stderr,
Context: ctx,
}
if err := client.StartExec(exec.ID, startOpts); err != nil {
if err := h.dockerClient.StartExec(exec.ID, startOpts); err != nil {
return nil, err
}
execResult.Stdout = stdout.Bytes()
execResult.Stderr = stderr.Bytes()
res, err := client.InspectExec(exec.ID)
res, err := h.dockerClient.InspectExec(exec.ID)
if err != nil {
return execResult, err
}
@ -120,13 +129,14 @@ func (h *taskHandle) Signal(ctx context.Context, s os.Signal) error {
// MacOS signals to the correct signal number for docker. Or we change the
// interface to take a signal string and leave it up to driver to map?
dockerSignal := docker.Signal(sysSig)
opts := docker.KillContainerOptions{
ID: h.containerID,
Signal: dockerSignal,
Signal: docker.Signal(sysSig),
Context: ctx,
}
return h.client.KillContainer(opts)
// remember Kill just means send a signal; this is not the complex StopContainer case
return h.dockerClient.KillContainer(opts)
}
// parseSignal interprets the signal name into an os.Signal. If no name is
@ -159,7 +169,13 @@ func (h *taskHandle) Kill(killTimeout time.Duration, signal string) error {
// Signal is used to kill the container with the desired signal before
// calling StopContainer
if signal == "" {
err = h.client.StopContainer(h.containerID, uint(killTimeout.Seconds()))
// give the context timeout some wiggle room beyond the kill timeout
// docker will use, so we can happy path even in the force kill case
graciousTimeout := killTimeout + dockerTimeout
ctx, cancel := context.WithTimeout(context.Background(), graciousTimeout)
defer cancel()
apiTimeout := uint(killTimeout.Seconds())
err = h.infinityClient.StopContainerWithContext(h.containerID, apiTimeout, ctx)
} else {
ctx, cancel := context.WithTimeout(context.Background(), killTimeout)
defer cancel()
@ -191,8 +207,8 @@ func (h *taskHandle) Kill(killTimeout time.Duration, signal string) error {
case <-ctx.Done():
}
// Stop the container
err = h.client.StopContainer(h.containerID, 0)
// Stop the container forcefully.
err = h.dockerClient.StopContainer(h.containerID, 0)
}
if err != nil {
@ -230,7 +246,7 @@ func (h *taskHandle) shutdownLogger() {
func (h *taskHandle) run() {
defer h.shutdownLogger()
exitCode, werr := h.waitClient.WaitContainer(h.containerID)
exitCode, werr := h.infinityClient.WaitContainer(h.containerID)
if werr != nil {
h.logger.Error("failed to wait for container; already terminated")
}
@ -239,7 +255,7 @@ func (h *taskHandle) run() {
werr = fmt.Errorf("Docker container exited with non-zero exit code: %d", exitCode)
}
container, ierr := h.waitClient.InspectContainerWithOptions(docker.InspectContainerOptions{
container, ierr := h.dockerClient.InspectContainerWithOptions(docker.InspectContainerOptions{
ID: h.containerID,
})
oom := false
@ -264,8 +280,8 @@ func (h *taskHandle) run() {
close(h.doneCh)
// Stop the container just incase the docker daemon's wait returned
// incorrectly
if err := h.client.StopContainer(h.containerID, 0); err != nil {
// incorrectly.
if err := h.dockerClient.StopContainer(h.containerID, 0); err != nil {
_, noSuchContainer := err.(*docker.NoSuchContainer)
_, containerNotRunning := err.(*docker.ContainerNotRunning)
if !containerNotRunning && !noSuchContainer {

View File

@ -26,7 +26,7 @@ const (
func (d *Driver) CreateNetwork(allocID string, createSpec *drivers.NetworkCreateRequest) (*drivers.NetworkIsolationSpec, bool, error) {
// Initialize docker API clients
client, _, err := d.dockerClients()
dockerClient, err := d.getDockerClient()
if err != nil {
return nil, false, fmt.Errorf("failed to connect to docker daemon: %s", err)
}
@ -68,7 +68,7 @@ func (d *Driver) CreateNetwork(allocID string, createSpec *drivers.NetworkCreate
return specFromContainer(container, createSpec.Hostname), false, nil
}
container, err = d.createContainer(client, *config, d.config.InfraImage)
container, err = d.createContainer(dockerClient, *config, d.config.InfraImage)
if err != nil {
return nil, false, err
}
@ -79,7 +79,7 @@ func (d *Driver) CreateNetwork(allocID string, createSpec *drivers.NetworkCreate
// until the container is started, InspectContainerWithOptions
// returns a mostly-empty struct
container, err = client.InspectContainerWithOptions(docker.InspectContainerOptions{
container, err = dockerClient.InspectContainerWithOptions(docker.InspectContainerOptions{
ID: container.ID,
})
if err != nil {
@ -122,17 +122,17 @@ func (d *Driver) DestroyNetwork(allocID string, spec *drivers.NetworkIsolationSp
// let the background reconciliation keep trying
d.pauseContainers.remove(id)
client, _, err := d.dockerClients()
dockerClient, err := d.getDockerClient()
if err != nil {
return fmt.Errorf("failed to connect to docker daemon: %s", err)
}
timeout := uint(1) // this is the pause container, just kill it fast
if err := client.StopContainerWithContext(id, timeout, d.ctx); err != nil {
if err := dockerClient.StopContainerWithContext(id, timeout, d.ctx); err != nil {
d.logger.Warn("failed to stop pause container", "id", id, "error", err)
}
if err := client.RemoveContainer(docker.RemoveContainerOptions{
if err := dockerClient.RemoveContainer(docker.RemoveContainerOptions{
Force: true,
ID: id,
}); err != nil {
@ -144,7 +144,7 @@ func (d *Driver) DestroyNetwork(allocID string, spec *drivers.NetworkIsolationSp
// The Docker image ID is needed in order to correctly update the image
// reference count. Any error finding this, however, should not result
// in an error shutting down the allocrunner.
dockerImage, err := client.InspectImage(d.config.InfraImage)
dockerImage, err := dockerClient.InspectImage(d.config.InfraImage)
if err != nil {
d.logger.Warn("InspectImage failed for infra_image container destroy",
"image", d.config.InfraImage, "error", err)
@ -187,6 +187,11 @@ func (d *Driver) createSandboxContainerConfig(allocID string, createSpec *driver
func (d *Driver) pullInfraImage(allocID string) error {
repo, tag := parseDockerImage(d.config.InfraImage)
dockerClient, err := d.getDockerClient()
if err != nil {
return err
}
// There's a (narrow) time-of-check-time-of-use race here. If we call
// InspectImage and then a concurrent task shutdown happens before we call
// IncrementImageReference, we could end up removing the image, and it
@ -194,7 +199,7 @@ func (d *Driver) pullInfraImage(allocID string) error {
d.coordinator.imageLock.Lock()
if tag != "latest" {
dockerImage, err := client.InspectImage(d.config.InfraImage)
dockerImage, err := dockerClient.InspectImage(d.config.InfraImage)
if err != nil {
d.logger.Debug("InspectImage failed for infra_image container pull",
"image", d.config.InfraImage, "error", err)

View File

@ -24,8 +24,8 @@ import (
type containerReconciler struct {
ctx context.Context
config *ContainerGCConfig
client *docker.Client
logger hclog.Logger
getClient func() (*docker.Client, error)
isDriverHealthy func() bool
trackedContainers func() *set.Set[string]
@ -38,7 +38,7 @@ func newReconciler(d *Driver) *containerReconciler {
return &containerReconciler{
ctx: d.ctx,
config: &d.config.GC.DanglingContainers,
client: client,
getClient: d.getDockerClient,
logger: d.logger,
isDriverHealthy: func() bool { return d.previouslyDetected() && d.fingerprintSuccessful() },
@ -109,9 +109,14 @@ func (r *containerReconciler) removeDanglingContainersIteration() error {
return nil
}
dockerClient, err := r.getClient()
if err != nil {
return err
}
for _, id := range untracked.Slice() {
ctx, cancel := r.dockerAPIQueryContext()
err := client.RemoveContainer(docker.RemoveContainerOptions{
err := dockerClient.RemoveContainer(docker.RemoveContainerOptions{
Context: ctx,
ID: id,
Force: true,
@ -135,7 +140,12 @@ func (r *containerReconciler) untrackedContainers(tracked *set.Set[string], cuto
ctx, cancel := r.dockerAPIQueryContext()
defer cancel()
cc, err := client.ListContainers(docker.ListContainersOptions{
dockerClient, err := r.getClient()
if err != nil {
return nil, err
}
cc, err := dockerClient.ListContainers(docker.ListContainersOptions{
Context: ctx,
All: false, // only reconcile running containers
})

View File

@ -131,7 +131,7 @@ func (h *taskHandle) collectStats(ctx context.Context, destCh *usageSender, inte
}
// Stats blocks until an error has occurred, or doneCh has been closed
if err := h.client.Stats(statsOpts); err != nil && err != io.ErrClosedPipe {
if err := h.dockerClient.Stats(statsOpts); err != nil && err != io.ErrClosedPipe {
// An error occurred during stats collection, retry with backoff
h.logger.Debug("error collecting stats from container", "error", err)