open-nomad/command/agent/consul/check_watcher.go

342 lines
8.9 KiB
Go

package consul
import (
"context"
"fmt"
"time"
log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/nomad/nomad/structs"
)
const (
// defaultPollFreq is the default interval between polls of the Consul
// Checks API. newCheckWatcher copies it into checkWatcher.pollFreq.
defaultPollFreq = 900 * time.Millisecond
)
// ChecksAPI is the part of the Consul API the checkWatcher requires.
type ChecksAPI interface {
// Checks returns all checks known to Consul. The checkWatcher looks
// results up by the check ID it was asked to watch, so the map is
// expected to be keyed by check ID. A non-nil error causes the current
// poll to be skipped.
Checks() (map[string]*api.AgentCheck, error)
}
// WorkloadRestarter allows the checkWatcher to restart tasks or entire task groups.
type WorkloadRestarter interface {
// Restart is called from asyncRestart in its own goroutine with a context
// bounded by a 10 second timeout, the task event carrying the restart
// reason, and failure set to true. A returned error is only logged at
// debug level.
Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error
}
// checkRestart handles restarting a task if a check is unhealthy. It pairs
// the immutable restart policy of one watched check with the mutable state
// apply uses to decide when the restart deadline has been reached.
type checkRestart struct {
// Identifiers of the watched check; allocID, taskName and checkName are
// used in log output.
allocID string
taskName string
checkID string
checkName string
taskKey string // allocID and taskName concatenated; groups all checks of one task
// task is asked to restart when the check stays unhealthy for timeLimit.
task WorkloadRestarter
// grace and interval are copied from the check definition by Watch. They
// are not read by the code visible in this file; graceUntil and timeLimit
// are the derived values that are actually consulted.
grace time.Duration
interval time.Duration
// timeLimit is how long the check may stay unhealthy before a restart is
// triggered. Watch computes it as Interval * (Limit - 1), so a limit of 1
// yields zero and the first unhealthy result restarts immediately.
timeLimit time.Duration
// ignoreWarnings makes apply treat a warning status as healthy.
ignoreWarnings bool
// Mutable fields
// unhealthyState is the time a check first went unhealthy. It is reset to
// the zero value whenever apply observes a healthy status.
unhealthyState time.Time
// graceUntil is when the check's grace period expires and unhealthy
// checks should be counted. Run also suppresses its "not found" warning
// until this time has passed.
graceUntil time.Time
logger log.Logger
}
// apply feeds one observed check status into the restart state machine and
// kicks off a restart when the check has been unhealthy for timeLimit.
//
// The caller supplies now so that every check evaluated during a single poll
// shares the same notion of time (which also keeps tests deterministic).
//
// A critical status is always unhealthy. A warning status is unhealthy unless
// ignoreWarnings is set. Every other status is healthy and clears any pending
// restart deadline. Unhealthy results observed before graceUntil are ignored.
//
// It returns true when a restart was triggered; the caller should then stop
// watching this check, since checks are watched again on task startup.
func (c *checkRestart) apply(ctx context.Context, now time.Time, status string) bool {
	unhealthy := status == api.HealthCritical ||
		(status == api.HealthWarning && !c.ignoreWarnings)

	if !unhealthy {
		// Healthy (or an ignored warning): drop any pending deadline.
		if !c.unhealthyState.IsZero() {
			c.logger.Debug("canceling restart because check became healthy")
			c.unhealthyState = time.Time{}
		}
		return false
	}

	// Failures inside the grace period are not counted.
	if now.Before(c.graceUntil) {
		return false
	}

	// Record the first failure; the deadline is measured from it.
	if c.unhealthyState.IsZero() {
		if c.timeLimit != 0 {
			c.logger.Debug("check became unhealthy. Will restart if check doesn't become healthy", "time_limit", c.timeLimit)
		}
		c.unhealthyState = now
	}

	// The deadline is inclusive: with limit=1 the time limit is zero and the
	// deadline equals the moment of the first failure.
	deadline := c.unhealthyState.Add(c.timeLimit)
	if now.Before(deadline) {
		return false
	}

	c.logger.Debug("restarting due to unhealthy check")

	// Restart in the background so the watch loop is never blocked.
	reason := fmt.Sprintf("healthcheck: check %q unhealthy", c.checkName)
	event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason)
	go asyncRestart(ctx, c.logger, c.task, event)
	return true
}
// asyncRestart mimics the pre-0.9 TaskRunner.Restart behavior and is meant to
// run in its own goroutine. It asks task to restart with the given event,
// always flagging the restart as a failure, and gives up after 10 seconds so
// the goroutine cannot block forever.
func asyncRestart(ctx context.Context, logger log.Logger, task WorkloadRestarter, event *structs.TaskEvent) {
	restartCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()

	// Restarts requested by the check watcher are always failures.
	err := task.Restart(restartCtx, event, true)
	if err == nil {
		return
	}

	// Nothing can be done about a failed restart here; the error is only
	// useful when debugging allocation lifecycle management.
	logger.Debug("failed to restart task", "error", err,
		"event_time", event.Time, "event_type", event.Type)
}
// checkWatchUpdate is the message sent over checkWatcher.checkUpdateCh to add,
// replace, or remove a watched check.
type checkWatchUpdate struct {
// checkID identifies the check being added or removed.
checkID string
// remove, when true, makes Run stop watching checkID; checkRestart is
// then unused (Unwatch leaves it nil).
remove bool
// checkRestart is the state Run stores under checkID when remove is false.
checkRestart *checkRestart
}
// checkWatcher watches Consul checks and restarts tasks when they're
// unhealthy. Checks are registered and removed through Watch and Unwatch;
// Run owns the set of watched checks and does the polling.
type checkWatcher struct {
// consul is the source of check results polled by Run.
consul ChecksAPI
// pollFreq is how often to poll the checks API and defaults to
// defaultPollFreq
pollFreq time.Duration
// checkUpdateCh is how watches (and removals) are sent to the main
// watching loop. newCheckWatcher creates it with a buffer of 8.
checkUpdateCh chan checkWatchUpdate
// done is closed when Run has exited, which unblocks Watch and Unwatch
// callers that would otherwise wait on checkUpdateCh.
done chan struct{}
// lastErr is true if the last Consul call failed. It is used to
// squelch repeated error messages.
lastErr bool
logger log.Logger
}
// newCheckWatcher builds a checkWatcher that polls consul at defaultPollFreq
// and logs under the "consul.health" name. The returned watcher is idle until
// its Run method is called.
func newCheckWatcher(logger log.Logger, consul ChecksAPI) *checkWatcher {
	w := new(checkWatcher)
	w.consul = consul
	w.pollFreq = defaultPollFreq
	// A small buffer lets a few Watch/Unwatch calls complete without
	// waiting for the Run loop to pick them up.
	w.checkUpdateCh = make(chan checkWatchUpdate, 8)
	w.done = make(chan struct{})
	w.logger = logger.ResetNamed("consul.health")
	return w
}
// Run is the main loop of the checkWatcher. It applies Watch/Unwatch updates,
// polls Consul every pollFreq while at least one check is watched, and
// restarts the tasks whose checks have been unhealthy for too long. It blocks
// until ctx is canceled and closes w.done on exit.
func (w *checkWatcher) Run(ctx context.Context) {
	defer close(w.done)

	// Watched checks keyed by check ID.
	watched := make(map[string]*checkRestart)

	// The poll timer only runs while there is something to watch.
	pollTimer := time.NewTimer(0)
	defer pollTimer.Stop()

	// haltPolling stops the timer and drains a pending tick, if any, so a
	// later Reset starts from a clean state.
	haltPolling := func() {
		pollTimer.Stop()
		select {
		case <-pollTimer.C:
		default:
		}
	}

	// Nothing is watched yet, so start with polling off.
	haltPolling()

	for {
		// Nothing left to watch: keep the timer off.
		if len(watched) == 0 {
			haltPolling()
		}

		select {
		case <-ctx.Done():
			return

		case upd := <-w.checkUpdateCh:
			if upd.remove {
				delete(watched, upd.checkID)
				continue
			}

			// Add a new check or replace an existing one.
			cr := upd.checkRestart
			watched[upd.checkID] = cr
			w.logger.Debug("watching check", "alloc_id", cr.allocID,
				"task", cr.taskName, "check", cr.checkName)

			// Going from zero to one watched check turns polling on.
			if len(watched) == 1 {
				haltPolling()
				pollTimer.Reset(w.pollFreq)
			}

		case <-pollTimer.C:
			pollTimer.Reset(w.pollFreq)

			// Every result from this poll is evaluated against the same
			// instant, taken just before the query.
			now := time.Now()

			results, err := w.consul.Checks()
			if err != nil {
				// Only the first failure in a row is logged.
				if !w.lastErr {
					w.lastErr = true
					w.logger.Error("failed retrieving health checks", "error", err)
				}
				continue
			}
			w.lastErr = false

			// Tasks restarted during this poll. A task is restarted at most
			// once per poll and all of its checks stop being watched.
			restarted := make(map[string]struct{})

			for id, cr := range watched {
				// Bail out promptly on shutdown.
				if ctx.Err() != nil {
					return
				}

				// The owning task was already restarted by another check.
				if _, done := restarted[cr.taskKey]; done {
					delete(watched, id)
					continue
				}

				res, found := results[id]
				if !found {
					// Registration may still be in flight during the grace
					// period, so stay quiet until it has passed.
					if now.After(cr.graceUntil) {
						w.logger.Warn("watched check not found in Consul", "check", cr.checkName, "check_id", id)
					}
					continue
				}

				if cr.apply(ctx, now, res.Status) {
					// Checks are registered and watched again on task
					// startup, so dropping them on restart is safe.
					delete(watched, id)
					restarted[cr.taskKey] = struct{}{}
				}
			}

			if len(restarted) == 0 {
				continue
			}

			// Sweep up checks of restarted tasks that were visited before
			// the restart was triggered (e.g. ones that were passing).
			for id, cr := range watched {
				if _, done := restarted[cr.taskKey]; done {
					delete(watched, id)
				}
			}
		}
	}
}
// Watch registers checkID with the watcher so that restarter is asked to
// restart when the check stays unhealthy. Checks that do not trigger restarts
// are ignored. The call blocks until the Run loop accepts the update (or the
// update is buffered), and returns without effect if Run has already exited.
func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.ServiceCheck, restarter WorkloadRestarter) {
	if !check.TriggersRestarts() {
		return
	}

	policy := check.CheckRestart

	cr := new(checkRestart)
	cr.allocID = allocID
	cr.taskName = taskName
	cr.checkID = checkID
	cr.checkName = check.Name
	// allocID + taskName identifies the task all of its checks belong to.
	cr.taskKey = fmt.Sprintf("%s%s", allocID, taskName)
	cr.task = restarter
	cr.interval = check.Interval
	cr.grace = policy.Grace
	cr.graceUntil = time.Now().Add(policy.Grace)
	// The first failure already counts toward the limit, so only Limit-1
	// further intervals have to elapse before restarting.
	cr.timeLimit = check.Interval * time.Duration(policy.Limit-1)
	cr.ignoreWarnings = policy.IgnoreWarnings
	cr.logger = w.logger.With("alloc_id", allocID, "task", taskName, "check", check.Name)

	select {
	case w.checkUpdateCh <- checkWatchUpdate{checkID: checkID, checkRestart: cr}:
		// handed over to the Run loop
	case <-w.done:
		// Run has exited; there is nobody left to watch the check
	}
}
// Unwatch asks the Run loop to stop watching the check identified by cid. It
// returns without effect if Run has already exited.
func (w *checkWatcher) Unwatch(cid string) {
	select {
	case <-w.done:
		// Run has exited; nothing is being watched anymore
	case w.checkUpdateCh <- checkWatchUpdate{checkID: cid, remove: true}:
		// removal handed over to the Run loop
	}
}