2019-09-13 15:24:58 +00:00
|
|
|
package docker
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"fmt"
|
|
|
|
"regexp"
|
2019-10-18 18:35:15 +00:00
|
|
|
"sync"
|
2019-09-13 15:24:58 +00:00
|
|
|
"time"
|
|
|
|
|
|
|
|
docker "github.com/fsouza/go-dockerclient"
|
2019-10-17 12:37:18 +00:00
|
|
|
hclog "github.com/hashicorp/go-hclog"
|
2019-09-13 15:24:58 +00:00
|
|
|
)
|
|
|
|
|
2019-10-17 12:37:18 +00:00
|
|
|
// containerReconciler detects and kills unexpectedly running containers.
|
|
|
|
//
|
|
|
|
// Due to Docker architecture and network based communication, it is
|
|
|
|
// possible for Docker to start a container successfully, but have the
|
|
|
|
// creation API call fail with a network error. containerReconciler
|
|
|
|
// scans for these untracked containers and kill them.
|
|
|
|
type containerReconciler struct {
|
|
|
|
ctx context.Context
|
|
|
|
config *ContainerGCConfig
|
|
|
|
client *docker.Client
|
|
|
|
logger hclog.Logger
|
|
|
|
|
|
|
|
isDriverHealthy func() bool
|
|
|
|
trackedContainers func() map[string]bool
|
|
|
|
isNomadContainer func(c docker.APIContainers) bool
|
2019-10-18 18:35:15 +00:00
|
|
|
|
|
|
|
once sync.Once
|
2019-10-17 12:37:18 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func newReconciler(d *Driver) *containerReconciler {
|
|
|
|
return &containerReconciler{
|
|
|
|
ctx: d.ctx,
|
|
|
|
config: &d.config.GC.DanglingContainers,
|
|
|
|
client: client,
|
|
|
|
logger: d.logger,
|
|
|
|
|
|
|
|
isDriverHealthy: func() bool { return d.previouslyDetected() && d.fingerprintSuccessful() },
|
|
|
|
trackedContainers: d.trackedContainers,
|
|
|
|
isNomadContainer: isNomadContainer,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (r *containerReconciler) Start() {
|
|
|
|
if !r.config.Enabled {
|
|
|
|
r.logger.Debug("skipping dangling containers handling; is disabled")
|
2019-09-13 15:24:58 +00:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2019-10-18 18:35:15 +00:00
|
|
|
r.once.Do(func() {
|
|
|
|
go r.removeDanglingContainersGoroutine()
|
|
|
|
})
|
2019-10-17 12:37:18 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (r *containerReconciler) removeDanglingContainersGoroutine() {
|
|
|
|
period := r.config.period
|
2019-09-13 15:24:58 +00:00
|
|
|
|
2019-10-17 12:37:18 +00:00
|
|
|
lastIterSucceeded := true
|
2019-09-13 15:24:58 +00:00
|
|
|
|
2019-09-13 17:59:36 +00:00
|
|
|
// ensure that we wait for at least a period or creation timeout
|
|
|
|
// for first container GC iteration
|
|
|
|
// The initial period is a grace period for restore allocation
|
|
|
|
// before a driver may kill containers launched by an earlier nomad
|
|
|
|
// process.
|
|
|
|
initialDelay := period
|
2019-10-17 12:37:18 +00:00
|
|
|
if r.config.CreationGrace > initialDelay {
|
|
|
|
initialDelay = r.config.CreationGrace
|
2019-09-13 17:59:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
timer := time.NewTimer(initialDelay)
|
2019-09-13 15:24:58 +00:00
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-timer.C:
|
2019-10-17 12:37:18 +00:00
|
|
|
if r.isDriverHealthy() {
|
|
|
|
err := r.removeDanglingContainersIteration()
|
|
|
|
if err != nil && lastIterSucceeded {
|
|
|
|
r.logger.Warn("failed to remove dangling containers", "error", err)
|
2019-09-13 15:24:58 +00:00
|
|
|
}
|
2019-10-17 12:37:18 +00:00
|
|
|
lastIterSucceeded = (err == nil)
|
2019-09-13 15:24:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
timer.Reset(period)
|
2019-10-17 12:37:18 +00:00
|
|
|
case <-r.ctx.Done():
|
2019-09-13 15:24:58 +00:00
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-10-17 12:37:18 +00:00
|
|
|
func (r *containerReconciler) removeDanglingContainersIteration() error {
|
|
|
|
cutoff := time.Now().Add(-r.config.CreationGrace)
|
|
|
|
tracked := r.trackedContainers()
|
|
|
|
untracked, err := r.untrackedContainers(tracked, cutoff)
|
2019-09-13 15:24:58 +00:00
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed to find untracked containers: %v", err)
|
|
|
|
}
|
|
|
|
|
2019-10-17 12:37:18 +00:00
|
|
|
if len(untracked) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if r.config.DryRun {
|
|
|
|
r.logger.Info("detected untracked containers", "container_ids", untracked)
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-09-13 15:24:58 +00:00
|
|
|
for _, id := range untracked {
|
2019-10-18 19:03:58 +00:00
|
|
|
ctx, cancel := r.dockerAPIQueryContext()
|
2019-09-13 15:24:58 +00:00
|
|
|
err := client.RemoveContainer(docker.RemoveContainerOptions{
|
2019-10-18 19:03:58 +00:00
|
|
|
Context: ctx,
|
|
|
|
ID: id,
|
|
|
|
Force: true,
|
2019-09-13 15:24:58 +00:00
|
|
|
})
|
2019-10-18 19:03:58 +00:00
|
|
|
cancel()
|
2019-09-13 15:24:58 +00:00
|
|
|
if err != nil {
|
2019-10-17 12:37:18 +00:00
|
|
|
r.logger.Warn("failed to remove untracked container", "container_id", id, "error", err)
|
|
|
|
} else {
|
|
|
|
r.logger.Info("removed untracked container", "container_id", id)
|
2019-09-13 15:24:58 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-09-16 14:40:56 +00:00
|
|
|
// untrackedContainers returns the ids of containers that suspected
|
|
|
|
// to have been started by Nomad but aren't tracked by this driver
|
2019-10-17 12:37:18 +00:00
|
|
|
func (r *containerReconciler) untrackedContainers(tracked map[string]bool, cutoffTime time.Time) ([]string, error) {
|
2019-09-13 15:24:58 +00:00
|
|
|
result := []string{}
|
|
|
|
|
2019-10-18 19:03:58 +00:00
|
|
|
ctx, cancel := r.dockerAPIQueryContext()
|
|
|
|
defer cancel()
|
|
|
|
|
2019-10-17 12:37:18 +00:00
|
|
|
cc, err := client.ListContainers(docker.ListContainersOptions{
|
2019-10-18 19:03:58 +00:00
|
|
|
Context: ctx,
|
|
|
|
All: false, // only reconcile running containers
|
2019-10-17 12:37:18 +00:00
|
|
|
})
|
2019-09-13 15:24:58 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to list containers: %v", err)
|
|
|
|
}
|
|
|
|
|
2019-10-17 12:37:18 +00:00
|
|
|
cutoff := cutoffTime.Unix()
|
2019-09-13 15:24:58 +00:00
|
|
|
|
|
|
|
for _, c := range cc {
|
|
|
|
if tracked[c.ID] {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
if c.Created > cutoff {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2019-10-17 12:37:18 +00:00
|
|
|
if !r.isNomadContainer(c) {
|
2019-09-13 15:24:58 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
result = append(result, c.ID)
|
|
|
|
}
|
|
|
|
|
|
|
|
return result, nil
|
|
|
|
}
|
|
|
|
|
2019-10-18 19:03:58 +00:00
|
|
|
// dockerAPIQueryTimeout returns a context for docker API response with an appropriate timeout
|
|
|
|
// to protect against wedged locked-up API call.
|
|
|
|
//
|
|
|
|
// We'll try hitting Docker API on subsequent iteration.
|
|
|
|
func (r *containerReconciler) dockerAPIQueryContext() (context.Context, context.CancelFunc) {
|
|
|
|
// use a reasoanble floor to avoid very small limit
|
|
|
|
timeout := 30 * time.Second
|
|
|
|
|
|
|
|
if timeout < r.config.period {
|
|
|
|
timeout = r.config.period
|
|
|
|
}
|
|
|
|
|
|
|
|
return context.WithTimeout(context.Background(), timeout)
|
|
|
|
}
|
|
|
|
|
2019-10-17 12:37:18 +00:00
|
|
|
func isNomadContainer(c docker.APIContainers) bool {
|
2019-10-17 14:28:23 +00:00
|
|
|
if _, ok := c.Labels[dockerLabelAllocID]; ok {
|
2019-09-13 15:24:58 +00:00
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
// pre-0.10 containers aren't tagged or labeled in any way,
|
2019-10-17 12:37:18 +00:00
|
|
|
// so use cheap heuristic based on mount paths
|
2019-09-13 15:24:58 +00:00
|
|
|
// before inspecting container details
|
|
|
|
if !hasMount(c, "/alloc") ||
|
|
|
|
!hasMount(c, "/local") ||
|
|
|
|
!hasMount(c, "/secrets") ||
|
|
|
|
!hasNomadName(c) {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2019-10-17 12:37:18 +00:00
|
|
|
return true
|
2019-09-13 15:24:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func hasMount(c docker.APIContainers, p string) bool {
|
|
|
|
for _, m := range c.Mounts {
|
|
|
|
if m.Destination == p {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
// nomadContainerNamePattern matches container names that contain a slash
// followed, somewhere later, by a dash and a lowercase hexadecimal UUID
// (8-4-4-4-12 digits). The pattern is unanchored, so it matches anywhere
// within the name.
var nomadContainerNamePattern = regexp.MustCompile(`/.*-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}`)
|
|
|
|
|
|
|
|
func hasNomadName(c docker.APIContainers) bool {
|
|
|
|
for _, n := range c.Names {
|
|
|
|
if nomadContainerNamePattern.MatchString(n) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Driver) trackedContainers() map[string]bool {
|
|
|
|
d.tasks.lock.RLock()
|
|
|
|
defer d.tasks.lock.RUnlock()
|
|
|
|
|
|
|
|
r := make(map[string]bool, len(d.tasks.store))
|
|
|
|
for _, h := range d.tasks.store {
|
|
|
|
r[h.containerID] = true
|
|
|
|
}
|
|
|
|
|
|
|
|
return r
|
|
|
|
}
|