Vet and small improvement on watcher failure detection

This commit is contained in:
Alex Dadgar 2017-07-07 14:53:01 -07:00
parent 45712c6ca3
commit bf97a2455c
2 changed files with 27 additions and 4 deletions

View file

@ -497,6 +497,17 @@ func (r *AllocRunner) Alloc() *structs.Allocation {
alloc.ClientStatus = getClientStatus(r.taskStates)
r.taskStatusLock.RUnlock()
// If the client status is failed and we are part of a deployment, mark the
// alloc as unhealthy. This guards against the watcher not being started.
r.allocLock.Lock()
if alloc.ClientStatus == structs.AllocClientStatusFailed &&
alloc.DeploymentID != "" && !alloc.DeploymentStatus.IsUnhealthy() {
alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
Healthy: helper.BoolToPtr(false),
}
}
r.allocLock.Unlock()
return alloc
}
@ -670,10 +681,6 @@ func (r *AllocRunner) Run() {
defer close(r.waitCh)
go r.dirtySyncState()
// Start the watcher
wCtx, watcherCancel := context.WithCancel(r.ctx)
go r.watchHealth(wCtx)
// Find the task group to run in the allocation
alloc := r.Alloc()
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
@ -716,6 +723,10 @@ func (r *AllocRunner) Run() {
return
}
// Start the watcher
wCtx, watcherCancel := context.WithCancel(r.ctx)
go r.watchHealth(wCtx)
// Start the task runners
r.logger.Printf("[DEBUG] client: starting task runners for alloc '%s'", r.alloc.ID)
r.taskLock.Lock()
@ -787,6 +798,10 @@ OUTER:
// Block until we should destroy the state of the alloc
r.handleDestroy()
// Free up the watcher's context. The watcher has likely exited already
watcherCancel()
r.logger.Printf("[DEBUG] client: terminating runner for alloc '%s'", r.alloc.ID)
}

View file

@ -133,6 +133,14 @@ OUTER:
return
}
// If the alloc is marked as failed by the client, set the status to
// unhealthy
if alloc.ClientStatus == structs.AllocClientStatusFailed {
r.logger.Printf("[TRACE] client.alloc_watcher: client status failed for alloc %q", alloc.ID)
setHealth(false)
return
}
if len(alloc.TaskStates) != len(tg.Tasks) {
r.logger.Printf("[TRACE] client.alloc_watcher: all task runners haven't started")
continue OUTER