Merge pull request #7944 from hashicorp/b-health-checks-after-task-health

Allocs are healthy if service checks get healthy before task health
Mahmood Ali 2020-05-13 09:34:03 -04:00 committed by GitHub
commit 2622b88b00
2 changed files with 119 additions and 3 deletions
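
Reading the diff below: setCheckHealth now reports whether the combined healthy signal was actually sent, and when it was not (checks passed while tasks are still unhealthy) the Consul watcher clears primed so the healthy timer can fire again once the tasks catch up. What follows is a minimal, self-contained sketch of that gating pattern, not the Nomad Tracker itself; the names (healthGate, setChecksHealthy, setTasksHealthy) are hypothetical.

// Sketch: health is signaled only once BOTH task health and check health are
// true; a check-health update that arrives first reports "not propagated" so
// the caller knows to keep watching and retry later.
package main

import (
	"fmt"
	"sync"
)

type healthGate struct {
	mu            sync.Mutex
	tasksHealthy  bool
	checksHealthy bool
	healthyCh     chan bool // buffered so the signal never blocks
}

func newHealthGate() *healthGate {
	return &healthGate{healthyCh: make(chan bool, 1)}
}

// setChecksHealthy records check health and returns true only if the combined
// healthy signal was sent, i.e. no further monitoring is needed.
func (g *healthGate) setChecksHealthy() bool {
	g.mu.Lock()
	defer g.mu.Unlock()
	g.checksHealthy = true
	if !g.tasksHealthy {
		return false // tasks not healthy yet; caller should retry later
	}
	select {
	case g.healthyCh <- true:
	default:
	}
	return true
}

func (g *healthGate) setTasksHealthy() {
	g.mu.Lock()
	defer g.mu.Unlock()
	g.tasksHealthy = true
}

func main() {
	g := newHealthGate()
	if !g.setChecksHealthy() {
		// checks passed before the tasks; nothing was signaled yet
	}
	g.setTasksHealthy()
	g.setChecksHealthy() // both sides healthy now: signal is propagated
	fmt.Println(<-g.healthyCh)
}
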

client/allochealth/tracker.go

@@ -210,7 +210,8 @@ func (t *Tracker) setTaskHealth(healthy, terminal bool) {
 }
 
 // setCheckHealth is used to mark the checks as either healthy or unhealthy.
-func (t *Tracker) setCheckHealth(healthy bool) {
+// returns true if health is propagated and no more health monitoring is needed
+func (t *Tracker) setCheckHealth(healthy bool) bool {
 	t.l.Lock()
 	defer t.l.Unlock()
 
@@ -220,7 +221,7 @@ func (t *Tracker) setCheckHealth(healthy bool) {
 
 	// Only signal if we are healthy and so is the tasks
 	if !t.checksHealthy {
-		return
+		return false
 	}
 
 	select {
@@ -230,6 +231,7 @@ func (t *Tracker) setCheckHealth(healthy bool) {
 
 	// Shutdown the tracker
 	t.cancelFn()
+	return true
 }
 
 // markAllocStopped is used to mark the allocation as having stopped.
@@ -379,7 +381,12 @@ OUTER:
 				allocReg = newAllocReg
 			}
 		case <-healthyTimer.C:
-			t.setCheckHealth(true)
+			if t.setCheckHealth(true) {
+				// final health set and propagated
+				return
+			}
+			// tasks are unhealthy, reset and wait until all is healthy
+			primed = false
 		}
 
 		if allocReg == nil {

client/allochealth/tracker_test.go

@@ -227,3 +227,112 @@ func TestTracker_Healthy_IfBothTasksAndConsulChecksAreHealthy(t *testing.T) {
 		require.Fail(t, "expected a health status")
 	}
 }
+
+// TestTracker_Checks_Healthy_Before_TaskHealth asserts that we mark an alloc
+// healthy, if the checks pass before task health pass
+func TestTracker_Checks_Healthy_Before_TaskHealth(t *testing.T) {
+	t.Parallel()
+
+	alloc := mock.Alloc()
+	alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up
+	task := alloc.Job.TaskGroups[0].Tasks[0]
+
+	// new task starting unhealthy, without services
+	task2 := task.Copy()
+	task2.Name = task2.Name + "2"
+	task2.Services = nil
+	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
+
+	// Synthesize running alloc and tasks
+	alloc.ClientStatus = structs.AllocClientStatusRunning
+	alloc.TaskStates = map[string]*structs.TaskState{
+		task.Name: {
+			State:     structs.TaskStateRunning,
+			StartedAt: time.Now(),
+		},
+		task2.Name: {
+			State: structs.TaskStatePending,
+		},
+	}
+
+	// Make Consul response
+	check := &consulapi.AgentCheck{
+		Name:   task.Services[0].Checks[0].Name,
+		Status: consulapi.HealthPassing,
+	}
+	taskRegs := map[string]*agentconsul.ServiceRegistrations{
+		task.Name: {
+			Services: map[string]*agentconsul.ServiceRegistration{
+				task.Services[0].Name: {
+					Service: &consulapi.AgentService{
+						ID:      "foo",
+						Service: task.Services[0].Name,
+					},
+					Checks: []*consulapi.AgentCheck{check},
+				},
+			},
+		},
+	}
+
+	logger := testlog.HCLogger(t)
+	b := cstructs.NewAllocBroadcaster(logger)
+	defer b.Close()
+
+	// Don't reply on the first call
+	var called uint64
+	consul := consul.NewMockConsulServiceClient(t, logger)
+	consul.AllocRegistrationsFn = func(string) (*agentconsul.AllocRegistration, error) {
+		if atomic.AddUint64(&called, 1) == 1 {
+			return nil, nil
+		}
+
+		reg := &agentconsul.AllocRegistration{
+			Tasks: taskRegs,
+		}
+
+		return reg, nil
+	}
+
+	ctx, cancelFn := context.WithCancel(context.Background())
+	defer cancelFn()
+
+	checkInterval := 10 * time.Millisecond
+	tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul,
+		time.Millisecond, true)
+	tracker.checkLookupInterval = checkInterval
+	tracker.Start()
+
+	// assert that we don't get marked healthy
+	select {
+	case <-time.After(4 * checkInterval):
+		// still unhealthy, good
+	case h := <-tracker.HealthyCh():
+		require.Fail(t, "unexpected health event", h)
+	}
+	require.False(t, tracker.tasksHealthy)
+	require.False(t, tracker.checksHealthy)
+
+	// now set task to healthy
+	runningAlloc := alloc.Copy()
+	runningAlloc.TaskStates = map[string]*structs.TaskState{
+		task.Name: {
+			State:     structs.TaskStateRunning,
+			StartedAt: time.Now(),
+		},
+		task2.Name: {
+			State:     structs.TaskStateRunning,
+			StartedAt: time.Now(),
+		},
+	}
+	err := b.Send(runningAlloc)
+	require.NoError(t, err)
+
+	// eventually, it is marked as healthy
+	select {
+	case <-time.After(4 * checkInterval):
+		require.Fail(t, "timed out while waiting for health")
+	case h := <-tracker.HealthyCh():
+		require.True(t, h)
+	}
+}