open-nomad/client/alloc_runner_health_watcher.go

package client

import (
	"context"
	"time"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// consulCheckLookupInterval is the  interval at which we check if the
	// Consul checks are healthy or unhealthy.
	consulCheckLookupInterval = 500 * time.Millisecond
)

// watchHealth is responsible for watching an allocation's task status and
// potentially consul health check status to determine if the allocation is
// healthy or unhealthy.
func (r *AllocRunner) watchHealth(ctx context.Context) {
	// See if we should watch the allocs health
	alloc := r.Alloc()
	if alloc.DeploymentID == "" {
		r.logger.Printf("[TRACE] client.alloc_watcher: exiting because alloc isn't part of a deployment")
		return
	}

	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation's task group. Exiting watcher")
		return
	}

	// Checks marks whether we should be watching for Consul health checks
	desiredChecks := 0
	var checkTicker *time.Ticker
	var checkCh <-chan time.Time

	u := tg.Update
	switch {
	case u == nil:
		r.logger.Printf("[TRACE] client.alloc_watcher: no update block for alloc %q. exiting", alloc.ID)
		return
	case u.HealthCheck == structs.UpdateStrategyHealthCheck_Manual:
		r.logger.Printf("[TRACE] client.alloc_watcher: update block has manual checks for alloc %q. exiting", alloc.ID)
		return
	case u.HealthCheck == structs.UpdateStrategyHealthCheck_Checks:
		for _, task := range tg.Tasks {
			for _, s := range task.Services {
				desiredChecks += len(s.Checks)
			}
		}

		checkTicker = time.NewTicker(consulCheckLookupInterval)
		checkCh = checkTicker.C
	}

	// Get a listener so we know when an allocation is updated.
	l := r.allocBroadcast.Listen()

	// Create a deadline timer for the health
	deadline := time.NewTimer(u.HealthyDeadline)

	// Create a healthy timer
	latestTaskHealthy := time.Unix(0, 0)
	latestChecksHealthy := time.Unix(0, 0)
	healthyTimer := time.NewTimer(0)
	if !healthyTimer.Stop() {
		<-healthyTimer.C
	}

	// Cleanup function
	defer func() {
		if !deadline.Stop() {
			<-deadline.C
		}
		if !healthyTimer.Stop() {
			<-healthyTimer.C
		}
		if checkTicker != nil {
			checkTicker.Stop()
		}
		l.Close()
	}()

	setHealth := func(h bool) {
		r.allocLock.Lock()
		r.allocHealth = helper.BoolToPtr(h)
		r.allocLock.Unlock()
		r.syncStatus()
	}

	var checks []*api.AgentCheck
	first := true
OUTER:
	for {
		if !first {
			select {
			case <-ctx.Done():
				return
			case newAlloc, ok := <-l.Ch:
				if !ok {
					return
				}

				alloc = newAlloc
				r.logger.Printf("[TRACE] client.alloc_watcher: new alloc version for %q", alloc.ID)
			case <-checkCh:
				newChecks, err := r.consulClient.Checks(alloc)
				if err != nil {
					r.logger.Printf("[WARN] client.alloc_watcher: failed to lookup consul checks for allocation %q: %v", alloc.ID, err)
				}

				checks = newChecks
			case <-deadline.C:
				// We have exceeded our deadline without being healthy.
				r.logger.Printf("[TRACE] client.alloc_watcher: alloc %q hit healthy deadline", alloc.ID)
				setHealth(false)
				return
			case <-healthyTimer.C:
				r.logger.Printf("[TRACE] client.alloc_watcher: alloc %q is healthy", alloc.ID)
				setHealth(true)
				return
			}
		}
		first = false

		// If the alloc is being stopped by the server just exit
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
			r.logger.Printf("[TRACE] client.alloc_watcher: desired status terminal for alloc %q", alloc.ID)
			return
		}

		if len(alloc.TaskStates) != len(tg.Tasks) {
			r.logger.Printf("[TRACE] client.alloc_watcher: all task runners haven't started")
			continue OUTER
		}

		// If the task is dead or has restarted, fail
		for _, tstate := range alloc.TaskStates {
			if tstate.Failed || !tstate.FinishedAt.IsZero() || tstate.Restarts != 0 {
				r.logger.Printf("[TRACE] client.alloc_watcher: setting health to false for alloc %q", alloc.ID)
				setHealth(false)
				return
			}
		}

		// If we should have checks and they aren't all healthy continue
		if len(checks) != desiredChecks {
			r.logger.Printf("[TRACE] client.alloc_watcher: continuing since all checks (want %d; got %d) haven't been registered for alloc %q", desiredChecks, len(checks), alloc.ID)
			continue OUTER
		}

		// Check if all the checks are passing
		for _, check := range checks {
			if check.Status != api.HealthPassing {
				r.logger.Printf("[TRACE] client.alloc_watcher: continuing since check %q isn't passing for alloc %q", check.CheckID, alloc.ID)
				latestChecksHealthy = time.Time{}
				continue OUTER
			}
		}
		if latestChecksHealthy.IsZero() {
			latestChecksHealthy = time.Now()
		}

		// Determine if the allocation is healthy
		for task, tstate := range alloc.TaskStates {
			if tstate.State != structs.TaskStateRunning {
				r.logger.Printf("[TRACE] client.alloc_watcher: continuing since task %q hasn't started for alloc %q", task, alloc.ID)
				continue OUTER
			}

			if tstate.StartedAt.After(latestTaskHealthy) {
				latestTaskHealthy = tstate.StartedAt
			}
		}

		// Don't need to set the timer if we are healthy and have marked
		// ourselves healthy.
		if alloc.DeploymentStatus != nil && alloc.DeploymentStatus.Healthy != nil && *alloc.DeploymentStatus.Healthy {
			continue OUTER
		}

		// Determine when we can mark ourselves as healthy.
		totalHealthy := latestTaskHealthy
		if totalHealthy.Before(latestChecksHealthy) {
			totalHealthy = latestChecksHealthy
		}
		d := time.Until(totalHealthy.Add(u.MinHealthyTime))

		if !healthyTimer.Stop() {
			select {
			case <-healthyTimer.C:
			default:
			}
		}

		healthyTimer.Reset(d)
		r.logger.Printf("[TRACE] client.alloc_watcher: setting healthy timer to %v for alloc %q", d, alloc.ID)
	}
}
initial watcher 2017-07-03 04:49:56 +00:00			`package client`

			`import (`
watcher per alloc 2017-07-03 22:03:42 +00:00			`"context"`
initial watcher 2017-07-03 04:49:56 +00:00			`"time"`

Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`"github.com/hashicorp/consul/api"`
initial watcher 2017-07-03 04:49:56 +00:00			`"github.com/hashicorp/nomad/helper"`
			`"github.com/hashicorp/nomad/nomad/structs"`
			`)`

Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`const (`
			`// consulCheckLookupInterval is the interval at which we check if the`
			`// Consul checks are healthy or unhealthy.`
			`consulCheckLookupInterval = 500 * time.Millisecond`
			`)`

initial watcher 2017-07-03 04:49:56 +00:00			`// watchHealth is responsible for watching an allocation's task status and`
			`// potentially consul health check status to determine if the allocation is`
			`// healthy or unhealthy.`
watcher per alloc 2017-07-03 22:03:42 +00:00			`func (r *AllocRunner) watchHealth(ctx context.Context) {`
			`// See if we should watch the allocs health`
Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`alloc := r.Alloc()`
watcher per alloc 2017-07-03 22:03:42 +00:00			`if alloc.DeploymentID == "" {`
			`r.logger.Printf("[TRACE] client.alloc_watcher: exiting because alloc isn't part of a deployment")`
			`return`
			`}`

initial watcher 2017-07-03 04:49:56 +00:00			`tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)`
			`if tg == nil {`
			`r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation's task group. Exiting watcher")`
			`return`
			`}`

			`// Checks marks whether we should be watching for Consul health checks`
Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`desiredChecks := 0`
			`var checkTicker *time.Ticker`
			`var checkCh <-chan time.Time`
initial watcher 2017-07-03 04:49:56 +00:00
Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`u := tg.Update`
initial watcher 2017-07-03 04:49:56 +00:00			`switch {`
			`case u == nil:`
			`r.logger.Printf("[TRACE] client.alloc_watcher: no update block for alloc %q. exiting", alloc.ID)`
			`return`
			`case u.HealthCheck == structs.UpdateStrategyHealthCheck_Manual:`
			`r.logger.Printf("[TRACE] client.alloc_watcher: update block has manual checks for alloc %q. exiting", alloc.ID)`
			`return`
			`case u.HealthCheck == structs.UpdateStrategyHealthCheck_Checks:`
Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`for _, task := range tg.Tasks {`
			`for _, s := range task.Services {`
			`desiredChecks += len(s.Checks)`
			`}`
			`}`

			`checkTicker = time.NewTicker(consulCheckLookupInterval)`
			`checkCh = checkTicker.C`
initial watcher 2017-07-03 04:49:56 +00:00			`}`

			`// Get a listener so we know when an allocation is updated.`
			`l := r.allocBroadcast.Listen()`

			`// Create a deadline timer for the health`
			`deadline := time.NewTimer(u.HealthyDeadline)`

			`// Create a healthy timer`
Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`latestTaskHealthy := time.Unix(0, 0)`
			`latestChecksHealthy := time.Unix(0, 0)`
initial watcher 2017-07-03 04:49:56 +00:00			`healthyTimer := time.NewTimer(0)`
			`if !healthyTimer.Stop() {`
			`<-healthyTimer.C`
			`}`

			`// Cleanup function`
			`defer func() {`
			`if !deadline.Stop() {`
			`<-deadline.C`
			`}`
			`if !healthyTimer.Stop() {`
			`<-healthyTimer.C`
			`}`
Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`if checkTicker != nil {`
			`checkTicker.Stop()`
			`}`
initial watcher 2017-07-03 04:49:56 +00:00			`l.Close()`
			`}()`

			`setHealth := func(h bool) {`
			`r.allocLock.Lock()`
			`r.allocHealth = helper.BoolToPtr(h)`
			`r.allocLock.Unlock()`
			`r.syncStatus()`
			`}`

Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`var checks []*api.AgentCheck`
initial watcher 2017-07-03 04:49:56 +00:00			`first := true`
			`OUTER:`
			`for {`
			`if !first {`
			`select {`
watcher per alloc 2017-07-03 22:03:42 +00:00			`case <-ctx.Done():`
initial watcher 2017-07-03 04:49:56 +00:00			`return`
			`case newAlloc, ok := <-l.Ch:`
			`if !ok {`
			`return`
			`}`

			`alloc = newAlloc`
			`r.logger.Printf("[TRACE] client.alloc_watcher: new alloc version for %q", alloc.ID)`
Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`case <-checkCh:`
			`newChecks, err := r.consulClient.Checks(alloc)`
			`if err != nil {`
Warn log 2017-07-06 16:57:49 +00:00			`r.logger.Printf("[WARN] client.alloc_watcher: failed to lookup consul checks for allocation %q: %v", alloc.ID, err)`
Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`}`

			`checks = newChecks`
initial watcher 2017-07-03 04:49:56 +00:00			`case <-deadline.C:`
			`// We have exceeded our deadline without being healthy.`
Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`r.logger.Printf("[TRACE] client.alloc_watcher: alloc %q hit healthy deadline", alloc.ID)`
initial watcher 2017-07-03 04:49:56 +00:00			`setHealth(false)`
Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`return`
initial watcher 2017-07-03 04:49:56 +00:00			`case <-healthyTimer.C:`
			`r.logger.Printf("[TRACE] client.alloc_watcher: alloc %q is healthy", alloc.ID)`
			`setHealth(true)`
Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`return`
initial watcher 2017-07-03 04:49:56 +00:00			`}`
			`}`
			`first = false`

			`// If the alloc is being stopped by the server just exit`
			`switch alloc.DesiredStatus {`
			`case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:`
			`r.logger.Printf("[TRACE] client.alloc_watcher: desired status terminal for alloc %q", alloc.ID)`
			`return`
			`}`

Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`if len(alloc.TaskStates) != len(tg.Tasks) {`
			`r.logger.Printf("[TRACE] client.alloc_watcher: all task runners haven't started")`
			`continue OUTER`
			`}`

initial watcher 2017-07-03 04:49:56 +00:00			`// If the task is dead or has restarted, fail`
			`for _, tstate := range alloc.TaskStates {`
			`if tstate.Failed \|\| !tstate.FinishedAt.IsZero() \|\| tstate.Restarts != 0 {`
			`r.logger.Printf("[TRACE] client.alloc_watcher: setting health to false for alloc %q", alloc.ID)`
			`setHealth(false)`
			`return`
			`}`
			`}`

Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`// If we should have checks and they aren't all healthy continue`
			`if len(checks) != desiredChecks {`
			`r.logger.Printf("[TRACE] client.alloc_watcher: continuing since all checks (want %d; got %d) haven't been registered for alloc %q", desiredChecks, len(checks), alloc.ID)`
			`continue OUTER`
			`}`

			`// Check if all the checks are passing`
			`for _, check := range checks {`
			`if check.Status != api.HealthPassing {`
			`r.logger.Printf("[TRACE] client.alloc_watcher: continuing since check %q isn't passing for alloc %q", check.CheckID, alloc.ID)`
			`latestChecksHealthy = time.Time{}`
			`continue OUTER`
			`}`
			`}`
			`if latestChecksHealthy.IsZero() {`
			`latestChecksHealthy = time.Now()`
			`}`

initial watcher 2017-07-03 04:49:56 +00:00			`// Determine if the allocation is healthy`
			`for task, tstate := range alloc.TaskStates {`
			`if tstate.State != structs.TaskStateRunning {`
			`r.logger.Printf("[TRACE] client.alloc_watcher: continuing since task %q hasn't started for alloc %q", task, alloc.ID)`
			`continue OUTER`
			`}`

Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`if tstate.StartedAt.After(latestTaskHealthy) {`
			`latestTaskHealthy = tstate.StartedAt`
initial watcher 2017-07-03 04:49:56 +00:00			`}`
			`}`

Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`// Don't need to set the timer if we are healthy and have marked`
			`// ourselves healthy.`
			`if alloc.DeploymentStatus != nil && alloc.DeploymentStatus.Healthy != nil && *alloc.DeploymentStatus.Healthy {`
initial watcher 2017-07-03 04:49:56 +00:00			`continue OUTER`
			`}`

Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00			`// Determine when we can mark ourselves as healthy.`
			`totalHealthy := latestTaskHealthy`
			`if totalHealthy.Before(latestChecksHealthy) {`
			`totalHealthy = latestChecksHealthy`
			`}`
			`d := time.Until(totalHealthy.Add(u.MinHealthyTime))`

initial watcher 2017-07-03 04:49:56 +00:00			`if !healthyTimer.Stop() {`
			`select {`
			`case <-healthyTimer.C:`
			`default:`
			`}`
			`}`
Client watches for allocation health using task state and Consul checks This PR adds watching of allocation health at the client. The client can watch for health based on the tasks running on time and also based on the consul checks passing. 2017-07-04 19:24:27 +00:00
initial watcher 2017-07-03 04:49:56 +00:00			`healthyTimer.Reset(d)`
			`r.logger.Printf("[TRACE] client.alloc_watcher: setting healthy timer to %v for alloc %q", d, alloc.ID)`
			`}`
			`}`