Vet and small improvement on watcher failure detection

2017-07-07 14:53:01 -07:00 · 2017-07-07 14:53:01 -07:00 · bf97a2455c
parent 45712c6ca3
commit bf97a2455c
2 changed files with 27 additions and 4 deletions
--- a/client/alloc_runner.go
+++ b/client/alloc_runner.go
@ -497,6 +497,17 @@ func (r *AllocRunner) Alloc() *structs.Allocation {
 	alloc.ClientStatus = getClientStatus(r.taskStates)
 	r.taskStatusLock.RUnlock()

+	// If the client status is failed and we are part of a deployment, mark the
+	// alloc as unhealthy. This guards against the watcher not being started.
+	r.allocLock.Lock()
+	if alloc.ClientStatus == structs.AllocClientStatusFailed &&
+		alloc.DeploymentID != "" && !alloc.DeploymentStatus.IsUnhealthy() {
+		alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
+			Healthy: helper.BoolToPtr(false),
+		}
+	}
+	r.allocLock.Unlock()
+
 	return alloc
 }

@ -670,10 +681,6 @@ func (r *AllocRunner) Run() {
 	defer close(r.waitCh)
 	go r.dirtySyncState()

-	// Start the watcher
-	wCtx, watcherCancel := context.WithCancel(r.ctx)
-	go r.watchHealth(wCtx)
-
 	// Find the task group to run in the allocation
 	alloc := r.Alloc()
 	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
@ -716,6 +723,10 @@ func (r *AllocRunner) Run() {
 		return
 	}

+	// Start the watcher
+	wCtx, watcherCancel := context.WithCancel(r.ctx)
+	go r.watchHealth(wCtx)
+
 	// Start the task runners
 	r.logger.Printf("[DEBUG] client: starting task runners for alloc '%s'", r.alloc.ID)
 	r.taskLock.Lock()
@ -787,6 +798,10 @@ OUTER:

 	// Block until we should destroy the state of the alloc
 	r.handleDestroy()
+
+	// Free up the context. The watcher has likely exited already.
+	watcherCancel()
+
 	r.logger.Printf("[DEBUG] client: terminating runner for alloc '%s'", r.alloc.ID)
 }

--- a/client/alloc_runner_health_watcher.go
+++ b/client/alloc_runner_health_watcher.go
@ -133,6 +133,14 @@ OUTER:
 			return
 		}

+		// If the alloc is marked as failed by the client, set the status to
+		// unhealthy.
+		if alloc.ClientStatus == structs.AllocClientStatusFailed {
+			r.logger.Printf("[TRACE] client.alloc_watcher: client status failed for alloc %q", alloc.ID)
+			setHealth(false)
+			return
+		}
+
 		if len(alloc.TaskStates) != len(tg.Tasks) {
 			r.logger.Printf("[TRACE] client.alloc_watcher: all task runners haven't started")
 			continue OUTER