open-nomad/client/alloc_runner_health_watcher.go

147 lines
3.8 KiB
Go
Raw Normal View History

2017-07-03 04:49:56 +00:00
package client
import (
2017-07-03 22:03:42 +00:00
"context"
2017-07-03 04:49:56 +00:00
"time"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
)
// watchHealth is responsible for watching an allocation's task status and
// potentially consul health check status to determine if the allocation is
// healthy or unhealthy.
2017-07-03 22:03:42 +00:00
func (r *AllocRunner) watchHealth(ctx context.Context) {
2017-07-03 04:49:56 +00:00
// Get our alloc and the task group
alloc := r.Alloc()
2017-07-03 22:03:42 +00:00
// See if we should watch the allocs health
if alloc.DeploymentID == "" {
r.logger.Printf("[TRACE] client.alloc_watcher: exiting because alloc isn't part of a deployment")
return
}
2017-07-03 04:49:56 +00:00
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
if tg == nil {
r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation's task group. Exiting watcher")
return
}
u := tg.Update
// Checks marks whether we should be watching for Consul health checks
checks := false
r.logger.Printf("XXX %v", checks)
switch {
case u == nil:
r.logger.Printf("[TRACE] client.alloc_watcher: no update block for alloc %q. exiting", alloc.ID)
return
case u.HealthCheck == structs.UpdateStrategyHealthCheck_Manual:
r.logger.Printf("[TRACE] client.alloc_watcher: update block has manual checks for alloc %q. exiting", alloc.ID)
return
case u.HealthCheck == structs.UpdateStrategyHealthCheck_Checks:
checks = true
}
// Get a listener so we know when an allocation is updated.
l := r.allocBroadcast.Listen()
// Create a deadline timer for the health
deadline := time.NewTimer(u.HealthyDeadline)
// Create a healthy timer
latestHealthyTime := time.Unix(0, 0)
healthyTimer := time.NewTimer(0)
if !healthyTimer.Stop() {
<-healthyTimer.C
}
// Cleanup function
defer func() {
if !deadline.Stop() {
<-deadline.C
}
if !healthyTimer.Stop() {
<-healthyTimer.C
}
l.Close()
}()
setHealth := func(h bool) {
r.allocLock.Lock()
r.allocHealth = helper.BoolToPtr(h)
r.allocLock.Unlock()
r.syncStatus()
}
first := true
OUTER:
for {
if !first {
select {
2017-07-03 22:03:42 +00:00
case <-ctx.Done():
2017-07-03 04:49:56 +00:00
return
case newAlloc, ok := <-l.Ch:
if !ok {
return
}
alloc = newAlloc
r.logger.Printf("[TRACE] client.alloc_watcher: new alloc version for %q", alloc.ID)
case <-deadline.C:
// We have exceeded our deadline without being healthy.
setHealth(false)
case <-healthyTimer.C:
r.logger.Printf("[TRACE] client.alloc_watcher: alloc %q is healthy", alloc.ID)
setHealth(true)
}
}
first = false
// If the alloc is being stopped by the server just exit
switch alloc.DesiredStatus {
case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
r.logger.Printf("[TRACE] client.alloc_watcher: desired status terminal for alloc %q", alloc.ID)
return
}
// If the task is dead or has restarted, fail
for _, tstate := range alloc.TaskStates {
if tstate.Failed || !tstate.FinishedAt.IsZero() || tstate.Restarts != 0 {
r.logger.Printf("[TRACE] client.alloc_watcher: setting health to false for alloc %q", alloc.ID)
setHealth(false)
return
}
}
// Determine if the allocation is healthy
for task, tstate := range alloc.TaskStates {
if tstate.State != structs.TaskStateRunning {
r.logger.Printf("[TRACE] client.alloc_watcher: continuing since task %q hasn't started for alloc %q", task, alloc.ID)
continue OUTER
}
if tstate.StartedAt.After(latestHealthyTime) {
latestHealthyTime = tstate.StartedAt
}
}
// If we are already healthy we don't set the timer
healthyThreshold := latestHealthyTime.Add(u.MinHealthyTime)
if time.Now().After(healthyThreshold) {
continue OUTER
}
// Start the time til we are healthy
if !healthyTimer.Stop() {
select {
case <-healthyTimer.C:
default:
}
}
d := time.Until(healthyThreshold)
healthyTimer.Reset(d)
r.logger.Printf("[TRACE] client.alloc_watcher: setting healthy timer to %v for alloc %q", d, alloc.ID)
}
}