From bf97a2455c3455c771b422b6bf63210512fb425c Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Fri, 7 Jul 2017 14:53:01 -0700 Subject: [PATCH] Vet and small improvement on watcher failure detection --- client/alloc_runner.go | 23 +++++++++++++++++++---- client/alloc_runner_health_watcher.go | 8 ++++++++ 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/client/alloc_runner.go b/client/alloc_runner.go index a54544439..ebd21e95e 100644 --- a/client/alloc_runner.go +++ b/client/alloc_runner.go @@ -497,6 +497,17 @@ func (r *AllocRunner) Alloc() *structs.Allocation { alloc.ClientStatus = getClientStatus(r.taskStates) r.taskStatusLock.RUnlock() + // If the client status is failed and we are part of a deployment, mark the + // alloc as unhealthy. This guards against the watcher not being started. + r.allocLock.Lock() + if alloc.ClientStatus == structs.AllocClientStatusFailed && + alloc.DeploymentID != "" && !alloc.DeploymentStatus.IsUnhealthy() { + alloc.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(false), + } + } + r.allocLock.Unlock() + return alloc } @@ -670,10 +681,6 @@ func (r *AllocRunner) Run() { defer close(r.waitCh) go r.dirtySyncState() - // Start the watcher - wCtx, watcherCancel := context.WithCancel(r.ctx) - go r.watchHealth(wCtx) - // Find the task group to run in the allocation alloc := r.Alloc() tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup) @@ -716,6 +723,10 @@ func (r *AllocRunner) Run() { return } + // Start the watcher + wCtx, watcherCancel := context.WithCancel(r.ctx) + go r.watchHealth(wCtx) + // Start the task runners r.logger.Printf("[DEBUG] client: starting task runners for alloc '%s'", r.alloc.ID) r.taskLock.Lock() @@ -787,6 +798,10 @@ OUTER: // Block until we should destroy the state of the alloc r.handleDestroy() + + // Free up the context. 
It has likely exited already + watcherCancel() + r.logger.Printf("[DEBUG] client: terminating runner for alloc '%s'", r.alloc.ID) } diff --git a/client/alloc_runner_health_watcher.go b/client/alloc_runner_health_watcher.go index 92e5c4149..c27f6a16d 100644 --- a/client/alloc_runner_health_watcher.go +++ b/client/alloc_runner_health_watcher.go @@ -133,6 +133,14 @@ OUTER: return } + // If the alloc is marked as failed by the client set the status to + // unhealthy + if alloc.ClientStatus == structs.AllocClientStatusFailed { + r.logger.Printf("[TRACE] client.alloc_watcher: client status failed for alloc %q", alloc.ID) + setHealth(false) + return + } + if len(alloc.TaskStates) != len(tg.Tasks) { r.logger.Printf("[TRACE] client.alloc_watcher: all task runners haven't started") continue OUTER