2017-07-03 04:49:56 +00:00
package client
import (
2017-07-03 22:03:42 +00:00
"context"
2017-07-03 04:49:56 +00:00
"time"
2017-07-04 19:24:27 +00:00
"github.com/hashicorp/consul/api"
2017-07-03 04:49:56 +00:00
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
)
2017-07-04 19:24:27 +00:00
// consulCheckLookupInterval controls how frequently the health watcher looks
// up the allocation's Consul checks to see whether they are healthy or
// unhealthy.
const consulCheckLookupInterval = 500 * time.Millisecond
2017-07-03 04:49:56 +00:00
// watchHealth is responsible for watching an allocation's task status and
// potentially consul health check status to determine if the allocation is
// healthy or unhealthy.
2017-07-03 22:03:42 +00:00
func ( r * AllocRunner ) watchHealth ( ctx context . Context ) {
// See if we should watch the allocs health
2017-07-04 19:24:27 +00:00
alloc := r . Alloc ( )
2017-07-03 22:03:42 +00:00
if alloc . DeploymentID == "" {
r . logger . Printf ( "[TRACE] client.alloc_watcher: exiting because alloc isn't part of a deployment" )
return
}
2017-07-03 04:49:56 +00:00
tg := alloc . Job . LookupTaskGroup ( alloc . TaskGroup )
if tg == nil {
r . logger . Printf ( "[ERR] client.alloc_watcher: failed to lookup allocation's task group. Exiting watcher" )
return
}
// Checks marks whether we should be watching for Consul health checks
2017-07-04 19:24:27 +00:00
desiredChecks := 0
var checkTicker * time . Ticker
var checkCh <- chan time . Time
2017-07-03 04:49:56 +00:00
2017-07-04 19:24:27 +00:00
u := tg . Update
2017-07-03 04:49:56 +00:00
switch {
case u == nil :
r . logger . Printf ( "[TRACE] client.alloc_watcher: no update block for alloc %q. exiting" , alloc . ID )
return
case u . HealthCheck == structs . UpdateStrategyHealthCheck_Manual :
r . logger . Printf ( "[TRACE] client.alloc_watcher: update block has manual checks for alloc %q. exiting" , alloc . ID )
return
case u . HealthCheck == structs . UpdateStrategyHealthCheck_Checks :
2017-07-04 19:24:27 +00:00
for _ , task := range tg . Tasks {
for _ , s := range task . Services {
desiredChecks += len ( s . Checks )
}
}
checkTicker = time . NewTicker ( consulCheckLookupInterval )
checkCh = checkTicker . C
2017-07-03 04:49:56 +00:00
}
// Get a listener so we know when an allocation is updated.
l := r . allocBroadcast . Listen ( )
// Create a deadline timer for the health
deadline := time . NewTimer ( u . HealthyDeadline )
// Create a healthy timer
2017-07-04 19:24:27 +00:00
latestTaskHealthy := time . Unix ( 0 , 0 )
latestChecksHealthy := time . Unix ( 0 , 0 )
2017-07-03 04:49:56 +00:00
healthyTimer := time . NewTimer ( 0 )
if ! healthyTimer . Stop ( ) {
<- healthyTimer . C
}
// Cleanup function
defer func ( ) {
if ! deadline . Stop ( ) {
<- deadline . C
}
if ! healthyTimer . Stop ( ) {
<- healthyTimer . C
}
2017-07-04 19:24:27 +00:00
if checkTicker != nil {
checkTicker . Stop ( )
}
2017-07-03 04:49:56 +00:00
l . Close ( )
} ( )
setHealth := func ( h bool ) {
r . allocLock . Lock ( )
r . allocHealth = helper . BoolToPtr ( h )
r . allocLock . Unlock ( )
r . syncStatus ( )
}
2017-07-04 19:24:27 +00:00
var checks [ ] * api . AgentCheck
2017-07-03 04:49:56 +00:00
first := true
OUTER :
for {
if ! first {
select {
2017-07-03 22:03:42 +00:00
case <- ctx . Done ( ) :
2017-07-03 04:49:56 +00:00
return
case newAlloc , ok := <- l . Ch :
if ! ok {
return
}
alloc = newAlloc
r . logger . Printf ( "[TRACE] client.alloc_watcher: new alloc version for %q" , alloc . ID )
2017-07-04 19:24:27 +00:00
case <- checkCh :
newChecks , err := r . consulClient . Checks ( alloc )
if err != nil {
r . logger . Printf ( "[TRACE] client.alloc_watcher: failed to lookup consul checks for allocation %q: %v" , alloc . ID , err )
}
checks = newChecks
2017-07-03 04:49:56 +00:00
case <- deadline . C :
// We have exceeded our deadline without being healthy.
2017-07-04 19:24:27 +00:00
r . logger . Printf ( "[TRACE] client.alloc_watcher: alloc %q hit healthy deadline" , alloc . ID )
2017-07-03 04:49:56 +00:00
setHealth ( false )
2017-07-04 19:24:27 +00:00
return
2017-07-03 04:49:56 +00:00
case <- healthyTimer . C :
r . logger . Printf ( "[TRACE] client.alloc_watcher: alloc %q is healthy" , alloc . ID )
setHealth ( true )
2017-07-04 19:24:27 +00:00
return
2017-07-03 04:49:56 +00:00
}
}
first = false
// If the alloc is being stopped by the server just exit
switch alloc . DesiredStatus {
case structs . AllocDesiredStatusStop , structs . AllocDesiredStatusEvict :
r . logger . Printf ( "[TRACE] client.alloc_watcher: desired status terminal for alloc %q" , alloc . ID )
return
}
2017-07-04 19:24:27 +00:00
if len ( alloc . TaskStates ) != len ( tg . Tasks ) {
r . logger . Printf ( "[TRACE] client.alloc_watcher: all task runners haven't started" )
continue OUTER
}
2017-07-03 04:49:56 +00:00
// If the task is dead or has restarted, fail
for _ , tstate := range alloc . TaskStates {
if tstate . Failed || ! tstate . FinishedAt . IsZero ( ) || tstate . Restarts != 0 {
r . logger . Printf ( "[TRACE] client.alloc_watcher: setting health to false for alloc %q" , alloc . ID )
setHealth ( false )
return
}
}
2017-07-04 19:24:27 +00:00
// If we should have checks and they aren't all healthy continue
if len ( checks ) != desiredChecks {
r . logger . Printf ( "[TRACE] client.alloc_watcher: continuing since all checks (want %d; got %d) haven't been registered for alloc %q" , desiredChecks , len ( checks ) , alloc . ID )
continue OUTER
}
// Check if all the checks are passing
for _ , check := range checks {
if check . Status != api . HealthPassing {
r . logger . Printf ( "[TRACE] client.alloc_watcher: continuing since check %q isn't passing for alloc %q" , check . CheckID , alloc . ID )
latestChecksHealthy = time . Time { }
continue OUTER
}
}
if latestChecksHealthy . IsZero ( ) {
latestChecksHealthy = time . Now ( )
}
2017-07-03 04:49:56 +00:00
// Determine if the allocation is healthy
for task , tstate := range alloc . TaskStates {
if tstate . State != structs . TaskStateRunning {
r . logger . Printf ( "[TRACE] client.alloc_watcher: continuing since task %q hasn't started for alloc %q" , task , alloc . ID )
continue OUTER
}
2017-07-04 19:24:27 +00:00
if tstate . StartedAt . After ( latestTaskHealthy ) {
latestTaskHealthy = tstate . StartedAt
2017-07-03 04:49:56 +00:00
}
}
2017-07-04 19:24:27 +00:00
// Don't need to set the timer if we are healthy and have marked
// ourselves healthy.
if alloc . DeploymentStatus != nil && alloc . DeploymentStatus . Healthy != nil && * alloc . DeploymentStatus . Healthy {
2017-07-03 04:49:56 +00:00
continue OUTER
}
2017-07-04 19:24:27 +00:00
// Determine when we can mark ourselves as healthy.
totalHealthy := latestTaskHealthy
if totalHealthy . Before ( latestChecksHealthy ) {
totalHealthy = latestChecksHealthy
}
d := time . Until ( totalHealthy . Add ( u . MinHealthyTime ) )
2017-07-03 04:49:56 +00:00
if ! healthyTimer . Stop ( ) {
select {
case <- healthyTimer . C :
default :
}
}
2017-07-04 19:24:27 +00:00
2017-07-03 04:49:56 +00:00
healthyTimer . Reset ( d )
r . logger . Printf ( "[TRACE] client.alloc_watcher: setting healthy timer to %v for alloc %q" , d , alloc . ID )
}
}