open-nomad/client/alloc_runner_health_watcher.go

package client

import (
	"context"
	"time"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

// watchHealth watches an allocation that is part of a deployment and records
// a single health verdict for it in r.allocHealth, syncing the alloc status
// afterwards.
//
// The watcher exits without recording anything when the alloc is not part of
// a deployment, its task group cannot be found, the group has no update block,
// or the update block uses manual health checks. Otherwise the alloc is
// re-evaluated every time a new version is broadcast, and the verdict is:
//
//   - unhealthy as soon as any task has failed, finished or restarted;
//   - unhealthy if u.HealthyDeadline elapses before a healthy verdict;
//   - healthy once every task has been running for at least u.MinHealthyTime.
//
// The function returns after a verdict is recorded, when ctx is cancelled,
// when the broadcast listener is closed, or when the server wants the alloc
// stopped or evicted.
//
// Consul check status is not consulted yet: the "checks" health check mode is
// currently evaluated from task states only.
func (r *AllocRunner) watchHealth(ctx context.Context) {
	// Get our alloc and the task group
	alloc := r.Alloc()

	// See if we should watch the allocs health
	if alloc.DeploymentID == "" {
		r.logger.Printf("[TRACE] client.alloc_watcher: exiting because alloc isn't part of a deployment")
		return
	}

	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation's task group. Exiting watcher")
		return
	}
	u := tg.Update

	switch {
	case u == nil:
		r.logger.Printf("[TRACE] client.alloc_watcher: no update block for alloc %q. exiting", alloc.ID)
		return
	case u.HealthCheck == structs.UpdateStrategyHealthCheck_Manual:
		r.logger.Printf("[TRACE] client.alloc_watcher: update block has manual checks for alloc %q. exiting", alloc.ID)
		return
	case u.HealthCheck == structs.UpdateStrategyHealthCheck_Checks:
		// Watching Consul checks is not implemented in this function yet, so
		// this mode is judged on task states like any other.
	}

	// Get a listener so we know when an allocation is updated.
	l := r.allocBroadcast.Listen()
	defer l.Close()

	// Create a deadline timer for the health. Stop is sufficient for cleanup:
	// the timer is local, so nothing can observe a stale value, and a blocking
	// drain here would hang forever whenever the timer had already been
	// stopped or its value had already been received.
	deadline := time.NewTimer(u.HealthyDeadline)
	defer deadline.Stop()

	// Create the healthy timer in a stopped, drained state so that its select
	// case cannot fire until it is armed further down.
	latestHealthyTime := time.Unix(0, 0)
	healthyTimer := time.NewTimer(0)
	if !healthyTimer.Stop() {
		<-healthyTimer.C
	}
	defer healthyTimer.Stop()

	// setHealth records the verdict under the alloc lock and then pushes the
	// updated status out.
	setHealth := func(h bool) {
		r.allocLock.Lock()
		r.allocHealth = helper.BoolToPtr(h)
		r.allocLock.Unlock()
		r.syncStatus()
	}

	// The first pass evaluates the alloc we already hold; later passes block
	// until something changes.
	first := true
OUTER:
	for {
		if !first {
			select {
			case <-ctx.Done():
				return
			case newAlloc, ok := <-l.Ch:
				if !ok {
					return
				}

				alloc = newAlloc
				r.logger.Printf("[TRACE] client.alloc_watcher: new alloc version for %q", alloc.ID)
			case <-deadline.C:
				// We have exceeded our deadline without being healthy. The
				// verdict is final, so stop watching; otherwise a later
				// healthy timer could overwrite it.
				r.logger.Printf("[TRACE] client.alloc_watcher: alloc %q hit healthy deadline", alloc.ID)
				setHealth(false)
				return
			case <-healthyTimer.C:
				// The verdict is final, so stop watching; otherwise the
				// still-armed deadline would later mark the alloc unhealthy.
				r.logger.Printf("[TRACE] client.alloc_watcher: alloc %q is healthy", alloc.ID)
				setHealth(true)
				return
			}
		}
		first = false

		// If the alloc is being stopped by the server just exit
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
			r.logger.Printf("[TRACE] client.alloc_watcher: desired status terminal for alloc %q", alloc.ID)
			return
		}

		// If the task is dead or has restarted, fail
		for _, tstate := range alloc.TaskStates {
			if tstate.Failed || !tstate.FinishedAt.IsZero() || tstate.Restarts != 0 {
				r.logger.Printf("[TRACE] client.alloc_watcher: setting health to false for alloc %q", alloc.ID)
				setHealth(false)
				return
			}
		}

		// An empty or partial set of task states must not be mistaken for
		// "every task is running".
		// NOTE(review): assumes TaskStates holds one entry per task in
		// tg.Tasks — confirm against the code that populates it.
		if len(alloc.TaskStates) < len(tg.Tasks) {
			continue OUTER
		}

		// Determine if the allocation is healthy
		for task, tstate := range alloc.TaskStates {
			if tstate.State != structs.TaskStateRunning {
				r.logger.Printf("[TRACE] client.alloc_watcher: continuing since task %q hasn't started for alloc %q", task, alloc.ID)
				continue OUTER
			}

			if tstate.StartedAt.After(latestHealthyTime) {
				latestHealthyTime = tstate.StartedAt
			}
		}

		// Every task is running. If they have already been up for the minimum
		// healthy time there is nothing to wait for.
		healthyThreshold := latestHealthyTime.Add(u.MinHealthyTime)
		d := time.Until(healthyThreshold)
		if d <= 0 {
			r.logger.Printf("[TRACE] client.alloc_watcher: alloc %q is healthy", alloc.ID)
			setHealth(true)
			return
		}

		// Start the time til we are healthy. The drain is non-blocking because
		// the timer may simply never have been armed.
		if !healthyTimer.Stop() {
			select {
			case <-healthyTimer.C:
			default:
			}
		}
		healthyTimer.Reset(d)
		r.logger.Printf("[TRACE] client.alloc_watcher: setting healthy timer to %v for alloc %q", d, alloc.ID)
	}
}
initial watcher 2017-07-03 04:49:56 +00:00			`package client`

			`import (`
watcher per alloc 2017-07-03 22:03:42 +00:00			`"context"`
initial watcher 2017-07-03 04:49:56 +00:00			`"time"`

			`"github.com/hashicorp/nomad/helper"`
			`"github.com/hashicorp/nomad/nomad/structs"`
			`)`

			`// watchHealth is responsible for watching an allocation's task status and`
			`// potentially consul health check status to determine if the allocation is`
			`// healthy or unhealthy.`
watcher per alloc 2017-07-03 22:03:42 +00:00			`func (r *AllocRunner) watchHealth(ctx context.Context) {`
initial watcher 2017-07-03 04:49:56 +00:00			`// Get our alloc and the task group`
			`alloc := r.Alloc()`
watcher per alloc 2017-07-03 22:03:42 +00:00
			`// See if we should watch the allocs health`
			`if alloc.DeploymentID == "" {`
			`r.logger.Printf("[TRACE] client.alloc_watcher: exiting because alloc isn't part of a deployment")`
			`return`
			`}`

initial watcher 2017-07-03 04:49:56 +00:00			`tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)`
			`if tg == nil {`
			`r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation's task group. Exiting watcher")`
			`return`
			`}`
			`u := tg.Update`

			`// Checks marks whether we should be watching for Consul health checks`
			`checks := false`
			`r.logger.Printf("XXX %v", checks)`

			`switch {`
			`case u == nil:`
			`r.logger.Printf("[TRACE] client.alloc_watcher: no update block for alloc %q. exiting", alloc.ID)`
			`return`
			`case u.HealthCheck == structs.UpdateStrategyHealthCheck_Manual:`
			`r.logger.Printf("[TRACE] client.alloc_watcher: update block has manual checks for alloc %q. exiting", alloc.ID)`
			`return`
			`case u.HealthCheck == structs.UpdateStrategyHealthCheck_Checks:`
			`checks = true`
			`}`

			`// Get a listener so we know when an allocation is updated.`
			`l := r.allocBroadcast.Listen()`

			`// Create a deadline timer for the health`
			`deadline := time.NewTimer(u.HealthyDeadline)`

			`// Create a healthy timer`
			`latestHealthyTime := time.Unix(0, 0)`
			`healthyTimer := time.NewTimer(0)`
			`if !healthyTimer.Stop() {`
			`<-healthyTimer.C`
			`}`

			`// Cleanup function`
			`defer func() {`
			`if !deadline.Stop() {`
			`<-deadline.C`
			`}`
			`if !healthyTimer.Stop() {`
			`<-healthyTimer.C`
			`}`
			`l.Close()`
			`}()`

			`setHealth := func(h bool) {`
			`r.allocLock.Lock()`
			`r.allocHealth = helper.BoolToPtr(h)`
			`r.allocLock.Unlock()`
			`r.syncStatus()`
			`}`

			`first := true`
			`OUTER:`
			`for {`
			`if !first {`
			`select {`
watcher per alloc 2017-07-03 22:03:42 +00:00			`case <-ctx.Done():`
initial watcher 2017-07-03 04:49:56 +00:00			`return`
			`case newAlloc, ok := <-l.Ch:`
			`if !ok {`
			`return`
			`}`

			`alloc = newAlloc`
			`r.logger.Printf("[TRACE] client.alloc_watcher: new alloc version for %q", alloc.ID)`
			`case <-deadline.C:`
			`// We have exceeded our deadline without being healthy.`
			`setHealth(false)`
			`case <-healthyTimer.C:`
			`r.logger.Printf("[TRACE] client.alloc_watcher: alloc %q is healthy", alloc.ID)`
			`setHealth(true)`
			`}`
			`}`
			`first = false`

			`// If the alloc is being stopped by the server just exit`
			`switch alloc.DesiredStatus {`
			`case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:`
			`r.logger.Printf("[TRACE] client.alloc_watcher: desired status terminal for alloc %q", alloc.ID)`
			`return`
			`}`

			`// If the task is dead or has restarted, fail`
			`for _, tstate := range alloc.TaskStates {`
			`if tstate.Failed \|\| !tstate.FinishedAt.IsZero() \|\| tstate.Restarts != 0 {`
			`r.logger.Printf("[TRACE] client.alloc_watcher: setting health to false for alloc %q", alloc.ID)`
			`setHealth(false)`
			`return`
			`}`
			`}`

			`// Determine if the allocation is healthy`
			`for task, tstate := range alloc.TaskStates {`
			`if tstate.State != structs.TaskStateRunning {`
			`r.logger.Printf("[TRACE] client.alloc_watcher: continuing since task %q hasn't started for alloc %q", task, alloc.ID)`
			`continue OUTER`
			`}`

			`if tstate.StartedAt.After(latestHealthyTime) {`
			`latestHealthyTime = tstate.StartedAt`
			`}`
			`}`

			`// If we are already healthy we don't set the timer`
			`healthyThreshold := latestHealthyTime.Add(u.MinHealthyTime)`
			`if time.Now().After(healthyThreshold) {`
			`continue OUTER`
			`}`

			`// Start the time til we are healthy`
			`if !healthyTimer.Stop() {`
			`select {`
			`case <-healthyTimer.C:`
			`default:`
			`}`
			`}`
			`d := time.Until(healthyThreshold)`
			`healthyTimer.Reset(d)`
			`r.logger.Printf("[TRACE] client.alloc_watcher: setting healthy timer to %v for alloc %q", d, alloc.ID)`
			`}`
			`}`