open-nomad/drivers/docker/reconciler.go
Mahmood Ali aa59280edc docker: periodically reconcile containers
When running at scale, it's possible that Docker Engine starts
containers successfully but gets wedged in a way where API call fails.
The Docker Engine may remain unavailable for arbitrary long time.

Here, we introduce a periodic reconcilation process that ensures that any
container started by nomad is tracked, and killed if is running
unexpectedly.

Basically, the periodic job inspects any container that isn't tracked in
its handlers.  A creation grace period is used to prevent killing newly
created containers that aren't registered yet.

Also, we aim to avoid killing unrelated containters started by host or
through raw_exec drivers.  The logic is to pattern against containers
environment variables and mounts to infer if they are an alloc docker
container.

Lastly, the periodic job can be disabled to avoid any interference if
need be.
2019-10-17 08:36:01 -04:00

165 lines
3.5 KiB
Go

package docker
import (
"context"
"fmt"
"regexp"
"strings"
"time"
docker "github.com/fsouza/go-dockerclient"
)
func (d *Driver) removeDanglingContainersGoroutine() {
if !d.config.GC.DanglingContainers.Enabled {
d.logger.Debug("skipping dangling containers handling; is disabled")
return
}
period := d.config.GC.DanglingContainers.period
succeeded := true
timer := time.NewTimer(period)
for {
select {
case <-timer.C:
if d.previouslyDetected() && d.fingerprintSuccessful() {
err := d.removeDanglingContainersIteration()
if err != nil && succeeded {
d.logger.Warn("failed to remove dangling containers", "error", err)
}
succeeded = (err == nil)
}
timer.Reset(period)
case <-d.ctx.Done():
return
}
}
}
func (d *Driver) removeDanglingContainersIteration() error {
tracked := d.trackedContainers()
untracked, err := d.untrackedContainers(tracked, d.config.GC.DanglingContainers.creationTimeout)
if err != nil {
return fmt.Errorf("failed to find untracked containers: %v", err)
}
for _, id := range untracked {
d.logger.Info("removing untracked container", "container_id", id)
err := client.RemoveContainer(docker.RemoveContainerOptions{
ID: id,
Force: true,
})
if err != nil {
d.logger.Warn("failed to remove untracked container", "container_id", id, "error", err)
}
}
return nil
}
// untrackedContainers returns the ids of containers that look
func (d *Driver) untrackedContainers(tracked map[string]bool, creationTimeout time.Duration) ([]string, error) {
result := []string{}
cc, err := client.ListContainers(docker.ListContainersOptions{})
if err != nil {
return nil, fmt.Errorf("failed to list containers: %v", err)
}
cutoff := time.Now().Add(-creationTimeout).Unix()
for _, c := range cc {
if tracked[c.ID] {
continue
}
if c.Created > cutoff {
continue
}
if !d.isNomadContainer(c) {
continue
}
result = append(result, c.ID)
}
return result, nil
}
func (d *Driver) isNomadContainer(c docker.APIContainers) bool {
if _, ok := c.Labels["com.hashicorp.nomad.alloc_id"]; ok {
return true
}
// pre-0.10 containers aren't tagged or labeled in any way,
// so use cheap heauristic based on mount paths
// before inspecting container details
if !hasMount(c, "/alloc") ||
!hasMount(c, "/local") ||
!hasMount(c, "/secrets") ||
!hasNomadName(c) {
return false
}
// double check before killing process
ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
defer cancel()
ci, err := client.InspectContainerWithContext(c.ID, ctx)
if err != nil {
return false
}
env := ci.Config.Env
return hasEnvVar(env, "NOMAD_ALLOC_ID") &&
hasEnvVar(env, "NOMAD_GROUP_NAME")
}
func hasMount(c docker.APIContainers, p string) bool {
for _, m := range c.Mounts {
if m.Destination == p {
return true
}
}
return false
}
var nomadContainerNamePattern = regexp.MustCompile(`\/.*-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}`)
func hasNomadName(c docker.APIContainers) bool {
for _, n := range c.Names {
if nomadContainerNamePattern.MatchString(n) {
return true
}
}
return false
}
func hasEnvVar(vars []string, key string) bool {
for _, v := range vars {
if strings.HasPrefix(v, key+"=") {
return true
}
}
return false
}
func (d *Driver) trackedContainers() map[string]bool {
d.tasks.lock.RLock()
defer d.tasks.lock.RUnlock()
r := make(map[string]bool, len(d.tasks.store))
for _, h := range d.tasks.store {
r[h.containerID] = true
}
return r
}