Merge pull request #7944 from hashicorp/b-health-checks-after-task-health

Allocs are healthy if service checks get healthy before task health
Mahmood Ali 2020-05-13 09:34:03 -04:00 committed by GitHub
commit 2622b88b00
2 changed files with 119 additions and 3 deletions
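
Reading the diff below: setCheckHealth now reports whether the combined healthy signal was actually sent, and when it was not (checks passed while tasks are still unhealthy) the Consul watcher clears primed so the healthy timer can fire again once the tasks catch up. What follows is a minimal, self-contained sketch of that gating pattern, not the Nomad Tracker itself; the names (healthGate, setChecksHealthy, setTasksHealthy) are hypothetical.

// Sketch: health is signaled only once BOTH task health and check health are
// true; a check-health update that arrives first reports "not propagated" so
// the caller knows to keep watching and retry later.
package main

import (
	"fmt"
	"sync"
)

type healthGate struct {
	mu            sync.Mutex
	tasksHealthy  bool
	checksHealthy bool
	healthyCh     chan bool // buffered so the signal never blocks
}

func newHealthGate() *healthGate {
	return &healthGate{healthyCh: make(chan bool, 1)}
}

// setChecksHealthy records check health and returns true only if the combined
// healthy signal was sent, i.e. no further monitoring is needed.
func (g *healthGate) setChecksHealthy() bool {
	g.mu.Lock()
	defer g.mu.Unlock()
	g.checksHealthy = true
	if !g.tasksHealthy {
		return false // tasks not healthy yet; caller should retry later
	}
	select {
	case g.healthyCh <- true:
	default:
	}
	return true
}

func (g *healthGate) setTasksHealthy() {
	g.mu.Lock()
	defer g.mu.Unlock()
	g.tasksHealthy = true
}

func main() {
	g := newHealthGate()
	if !g.setChecksHealthy() {
		// checks passed before the tasks; nothing was signaled yet
	}
	g.setTasksHealthy()
	g.setChecksHealthy() // both sides healthy now: signal is propagated
	fmt.Println(<-g.healthyCh)
}
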

client/allochealth/tracker.go

@@ -210,7 +210,8 @@ func (t *Tracker) setTaskHealth(healthy, terminal bool) {
 }
 
 // setCheckHealth is used to mark the checks as either healthy or unhealthy.
-func (t *Tracker) setCheckHealth(healthy bool) {
+// returns true if health is propagated and no more health monitoring is needed
+func (t *Tracker) setCheckHealth(healthy bool) bool {
 	t.l.Lock()
 	defer t.l.Unlock()
 
@@ -220,7 +221,7 @@ func (t *Tracker) setCheckHealth(healthy bool) {
 
 	// Only signal if we are healthy and so is the tasks
 	if !t.checksHealthy {
-		return
+		return false
 	}
 
 	select {
@@ -230,6 +231,7 @@ func (t *Tracker) setCheckHealth(healthy bool) {
 
 	// Shutdown the tracker
 	t.cancelFn()
+	return true
 }
 
 // markAllocStopped is used to mark the allocation as having stopped.
@@ -379,7 +381,12 @@ OUTER:
 				allocReg = newAllocReg
 			}
 		case <-healthyTimer.C:
-			t.setCheckHealth(true)
+			if t.setCheckHealth(true) {
+				// final health set and propagated
+				return
+			}
+			// tasks are unhealthy, reset and wait until all is healthy
+			primed = false
 		}
 
 		if allocReg == nil {

client/allochealth/tracker_test.go

@@ -227,3 +227,112 @@ func TestTracker_Healthy_IfBothTasksAndConsulChecksAreHealthy(t *testing.T) {
 		require.Fail(t, "expected a health status")
 	}
 }
+
+// TestTracker_Checks_Healthy_Before_TaskHealth asserts that we mark an alloc
+// healthy, if the checks pass before task health pass
+func TestTracker_Checks_Healthy_Before_TaskHealth(t *testing.T) {
+	t.Parallel()
+
+	alloc := mock.Alloc()
+	alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up
+	task := alloc.Job.TaskGroups[0].Tasks[0]
+
+	// new task starting unhealthy, without services
+	task2 := task.Copy()
+	task2.Name = task2.Name + "2"
+	task2.Services = nil
+	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
+
+	// Synthesize running alloc and tasks
+	alloc.ClientStatus = structs.AllocClientStatusRunning
+	alloc.TaskStates = map[string]*structs.TaskState{
+		task.Name: {
+			State:     structs.TaskStateRunning,
+			StartedAt: time.Now(),
+		},
+		task2.Name: {
+			State: structs.TaskStatePending,
+		},
+	}
+
+	// Make Consul response
+	check := &consulapi.AgentCheck{
+		Name:   task.Services[0].Checks[0].Name,
+		Status: consulapi.HealthPassing,
+	}
+	taskRegs := map[string]*agentconsul.ServiceRegistrations{
+		task.Name: {
+			Services: map[string]*agentconsul.ServiceRegistration{
+				task.Services[0].Name: {
+					Service: &consulapi.AgentService{
+						ID:      "foo",
+						Service: task.Services[0].Name,
+					},
+					Checks: []*consulapi.AgentCheck{check},
+				},
+			},
+		},
+	}
+
+	logger := testlog.HCLogger(t)
+	b := cstructs.NewAllocBroadcaster(logger)
+	defer b.Close()
+
+	// Don't reply on the first call
+	var called uint64
+	consul := consul.NewMockConsulServiceClient(t, logger)
+	consul.AllocRegistrationsFn = func(string) (*agentconsul.AllocRegistration, error) {
+		if atomic.AddUint64(&called, 1) == 1 {
+			return nil, nil
+		}
+
+		reg := &agentconsul.AllocRegistration{
+			Tasks: taskRegs,
+		}
+
+		return reg, nil
+	}
+
+	ctx, cancelFn := context.WithCancel(context.Background())
+	defer cancelFn()
+
+	checkInterval := 10 * time.Millisecond
+	tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul,
+		time.Millisecond, true)
+	tracker.checkLookupInterval = checkInterval
+	tracker.Start()
+
+	// assert that we don't get marked healthy
+	select {
+	case <-time.After(4 * checkInterval):
+		// still unhealthy, good
+	case h := <-tracker.HealthyCh():
+		require.Fail(t, "unexpected health event", h)
+	}
+	require.False(t, tracker.tasksHealthy)
+	require.False(t, tracker.checksHealthy)
+
+	// now set task to healthy
+	runningAlloc := alloc.Copy()
+	runningAlloc.TaskStates = map[string]*structs.TaskState{
+		task.Name: {
+			State:     structs.TaskStateRunning,
+			StartedAt: time.Now(),
+		},
+		task2.Name: {
+			State:     structs.TaskStateRunning,
+			StartedAt: time.Now(),
+		},
+	}
+	err := b.Send(runningAlloc)
+	require.NoError(t, err)
+
+	// eventually, it is marked as healthy
+	select {
+	case <-time.After(4 * checkInterval):
+		require.Fail(t, "timed out while waiting for health")
+	case h := <-tracker.HealthyCh():
+		require.True(t, h)
+	}
+}