d590123637
* drivers/docker: refactor use of clients in docker driver

  This PR refactors how we manage the two underlying clients used by the
  docker driver for communicating with the docker daemon. We keep two
  clients: one with a hard-coded timeout that applies to all operations no
  matter what, intended for short-lived / async calls to docker; the other
  has no timeout, and it is the caller's responsibility to set a context
  that ensures the call eventually terminates. The use of these two clients
  has been confusing, and mistakes were made in a number of places where
  calls used the wrong client. This PR makes it so that a caller must
  explicitly call a function to get the client that makes sense for that
  use case.

  Fixes #17023

* cr: followup items
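As a rough illustration of the access pattern described above, here is a hedged sketch, not code from this change: getShortLivedDockerClient and getLongLivedDockerClient are hypothetical accessor names used only for illustration, and the snippet assumes the same package and imports as the file below. A caller explicitly picks the client that fits the call and, for the untimed client, bounds the request with its own context.

// Sketch only; the accessor names below are hypothetical stand-ins, not the driver's real API.
func exampleClientUsage(d *Driver) error {
	// quick, synchronous call: a client with a built-in hard timeout
	shortClient, err := d.getShortLivedDockerClient() // hypothetical accessor
	if err != nil {
		return err
	}
	if err := shortClient.Ping(); err != nil {
		return err
	}

	// potentially long call: the untimed client, bounded by a caller-supplied
	// context (the reconciler below does this via dockerAPIQueryContext)
	longClient, err := d.getLongLivedDockerClient() // hypothetical accessor
	if err != nil {
		return err
	}
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	_, err = longClient.ListContainers(docker.ListContainersOptions{Context: ctx, All: false})
	return err
}

The reconciler in this file follows the second pattern: it fetches a client via getClient (wired to d.getDockerClient) and wraps each ListContainers / RemoveContainer call in a context obtained from dockerAPIQueryContext.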
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package docker

import (
	"context"
	"fmt"
	"regexp"
	"sync"
	"time"

	docker "github.com/fsouza/go-dockerclient"
	hclog "github.com/hashicorp/go-hclog"
	"github.com/hashicorp/go-set"
)

// containerReconciler detects and kills unexpectedly running containers.
//
// Due to Docker architecture and network-based communication, it is
// possible for Docker to start a container successfully, but have the
// creation API call fail with a network error. containerReconciler
// scans for these untracked containers and kills them.
type containerReconciler struct {
	ctx       context.Context
	config    *ContainerGCConfig
	logger    hclog.Logger
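	// getClient returns the docker client used for reconciler queries; each
	// call site bounds the request with a context from dockerAPIQueryContext.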
	getClient func() (*docker.Client, error)

	isDriverHealthy   func() bool
	trackedContainers func() *set.Set[string]
	isNomadContainer  func(c docker.APIContainers) bool

	once sync.Once
}

func newReconciler(d *Driver) *containerReconciler {
	return &containerReconciler{
		ctx:       d.ctx,
		config:    &d.config.GC.DanglingContainers,
		getClient: d.getDockerClient,
		logger:    d.logger,

		isDriverHealthy:   func() bool { return d.previouslyDetected() && d.fingerprintSuccessful() },
		trackedContainers: d.trackedContainers,
		isNomadContainer:  isNomadContainer,
	}
}

func (r *containerReconciler) Start() {
	if !r.config.Enabled {
		r.logger.Debug("skipping dangling containers handling; is disabled")
		return
	}

	r.once.Do(func() {
		go r.removeDanglingContainersGoroutine()
	})
}

func (r *containerReconciler) removeDanglingContainersGoroutine() {
	period := r.config.period

	lastIterSucceeded := true

	// ensure that we wait for at least a period or the creation grace
	// period before the first container GC iteration. The initial delay
	// is a grace period for restoring allocations before the driver may
	// kill containers launched by an earlier nomad process.
	initialDelay := period
	if r.config.CreationGrace > initialDelay {
		initialDelay = r.config.CreationGrace
	}

	timer := time.NewTimer(initialDelay)
	for {
		select {
		case <-timer.C:
			if r.isDriverHealthy() {
				err := r.removeDanglingContainersIteration()
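				// warn only at the start of a run of consecutive failures,
				// so repeated failing iterations do not spam the logs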
				if err != nil && lastIterSucceeded {
					r.logger.Warn("failed to remove dangling containers", "error", err)
				}
				lastIterSucceeded = (err == nil)
			}

			timer.Reset(period)
		case <-r.ctx.Done():
			return
		}
	}
}

func (r *containerReconciler) removeDanglingContainersIteration() error {
	cutoff := time.Now().Add(-r.config.CreationGrace)
	tracked := r.trackedContainers()
	untracked, err := r.untrackedContainers(tracked, cutoff)
	if err != nil {
		return fmt.Errorf("failed to find untracked containers: %v", err)
	}

	if untracked.Empty() {
		return nil
	}

	if r.config.DryRun {
		r.logger.Info("detected untracked containers", "container_ids", untracked)
		return nil
	}

	dockerClient, err := r.getClient()
	if err != nil {
		return err
	}

	for _, id := range untracked.Slice() {
		ctx, cancel := r.dockerAPIQueryContext()
		err := dockerClient.RemoveContainer(docker.RemoveContainerOptions{
			Context: ctx,
			ID:      id,
			Force:   true,
		})
		cancel()
		if err != nil {
			r.logger.Warn("failed to remove untracked container", "container_id", id, "error", err)
		} else {
			r.logger.Info("removed untracked container", "container_id", id)
		}
	}

	return nil
}

// untrackedContainers returns the IDs of containers that are suspected
// to have been started by Nomad but aren't tracked by this driver
func (r *containerReconciler) untrackedContainers(tracked *set.Set[string], cutoffTime time.Time) (*set.Set[string], error) {
	result := set.New[string](10)

	ctx, cancel := r.dockerAPIQueryContext()
	defer cancel()

	dockerClient, err := r.getClient()
	if err != nil {
		return nil, err
	}

	cc, err := dockerClient.ListContainers(docker.ListContainersOptions{
		Context: ctx,
		All:     false, // only reconcile running containers
	})
	if err != nil {
		return nil, fmt.Errorf("failed to list containers: %v", err)
	}

	cutoff := cutoffTime.Unix()

	for _, c := range cc {
		if tracked.Contains(c.ID) {
			continue
		}

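		// skip containers created within the grace period; they may belong
		// to a task this driver is still in the middle of starting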
		if c.Created > cutoff {
			continue
		}

		if !r.isNomadContainer(c) {
			continue
		}

		result.Insert(c.ID)
	}
	return result, nil
}

// dockerAPIQueryContext returns a context for a docker API call, with an
// appropriate timeout to protect against wedged or locked-up API calls.
//
// If a call times out, we'll try hitting the Docker API again on a
// subsequent iteration.
func (r *containerReconciler) dockerAPIQueryContext() (context.Context, context.CancelFunc) {
	// use a reasonable floor to avoid a very small limit
	timeout := 30 * time.Second

	if timeout < r.config.period {
		timeout = r.config.period
	}

	return context.WithTimeout(context.Background(), timeout)
}

func isNomadContainer(c docker.APIContainers) bool {
	if _, ok := c.Labels[dockerLabelAllocID]; ok {
		return true
	}

	// pre-0.10 containers aren't tagged or labeled in any way,
	// so use a cheap heuristic based on mount paths
	// before inspecting container details
	if !hasMount(c, "/alloc") ||
		!hasMount(c, "/local") ||
		!hasMount(c, "/secrets") ||
		!hasNomadName(c) {
		return false
	}

	return true
}

func hasMount(c docker.APIContainers, p string) bool {
	for _, m := range c.Mounts {
		if m.Destination == p {
			return true
		}
	}

	return false
}

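// nomadContainerNamePattern matches container names that embed a Nomad
// allocation ID, e.g. "/<task-name>-<alloc-uuid>"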
var nomadContainerNamePattern = regexp.MustCompile(`\/.*-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}`)

func hasNomadName(c docker.APIContainers) bool {
	for _, n := range c.Names {
		if nomadContainerNamePattern.MatchString(n) {
			return true
		}
	}
	return false
}

// trackedContainers returns the set of container IDs of containers that were
// started by Driver and are expected to be running. This includes both normal
// Task containers, as well as infra pause containers.
func (d *Driver) trackedContainers() *set.Set[string] {
	// collect the task containers
	ids := d.tasks.IDs()
	// now also accumulate pause containers
	return d.pauseContainers.union(ids)
}