open-nomad/command/agent/consul/check_watcher_test.go

439 lines
13 KiB
Go

package consul
import (
"context"
"fmt"
"sync"
"testing"
"time"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/helper/testlog"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
"github.com/stretchr/testify/require"
)
// checkRestartRecord is used by a testFakeCtx to record when restarts occur
// due to a watched check.
type checkRestartRecord struct {
timestamp time.Time
source string
reason string
failure bool
}
// fakeCheckRestarter is a test implementation of TaskRestarter.
type fakeCheckRestarter struct {
// restarts is a slice of all of the restarts triggered by the checkWatcher
restarts []checkRestartRecord
// need the checkWatcher to re-Watch restarted tasks like TaskRunner
watcher *checkWatcher
// check to re-Watch on restarts
check *structs.ServiceCheck
allocID string
taskName string
checkName string
mu sync.Mutex
}
// newFakeCheckRestart creates a new TaskRestarter. It needs all of the
// parameters checkWatcher.Watch expects.
func newFakeCheckRestarter(w *checkWatcher, allocID, taskName, checkName string, c *structs.ServiceCheck) *fakeCheckRestarter {
return &fakeCheckRestarter{
watcher: w,
check: c,
allocID: allocID,
taskName: taskName,
checkName: checkName,
}
}
// Restart implements part of the TaskRestarter interface needed for check
// watching and is normally fulfilled by a TaskRunner.
//
// Restarts are recorded in the []restarts field and re-Watch the check.
//func (c *fakeCheckRestarter) Restart(source, reason string, failure bool) {
func (c *fakeCheckRestarter) Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error {
c.mu.Lock()
defer c.mu.Unlock()
restart := checkRestartRecord{
timestamp: time.Now(),
source: event.Type,
reason: event.DisplayMessage,
failure: failure,
}
c.restarts = append(c.restarts, restart)
// Re-Watch the check just like TaskRunner
c.watcher.Watch(c.allocID, c.taskName, c.checkName, c.check, c)
return nil
}
// String for debugging
func (c *fakeCheckRestarter) String() string {
c.mu.Lock()
defer c.mu.Unlock()
s := fmt.Sprintf("%s %s %s restarts:\n", c.allocID, c.taskName, c.checkName)
for _, r := range c.restarts {
s += fmt.Sprintf("%s - %s: %s (failure: %t)\n", r.timestamp, r.source, r.reason, r.failure)
}
return s
}
// GetRestarts for testing in a threadsafe way
func (c *fakeCheckRestarter) GetRestarts() []checkRestartRecord {
c.mu.Lock()
defer c.mu.Unlock()
o := make([]checkRestartRecord, len(c.restarts))
copy(o, c.restarts)
return o
}
// checkResponse is a response returned by the fakeChecksAPI after the given
// time.
type checkResponse struct {
at time.Time
id string
status string
}
// fakeChecksAPI implements the Checks() method for testing Consul.
type fakeChecksAPI struct {
// responses is a map of check ids to their status at a particular
// time. checkResponses must be in chronological order.
responses map[string][]checkResponse
mu sync.Mutex
}
func newFakeChecksAPI() *fakeChecksAPI {
return &fakeChecksAPI{responses: make(map[string][]checkResponse)}
}
// add a new check status to Consul at the given time.
func (c *fakeChecksAPI) add(id, status string, at time.Time) {
c.mu.Lock()
c.responses[id] = append(c.responses[id], checkResponse{at, id, status})
c.mu.Unlock()
}
func (c *fakeChecksAPI) ChecksWithFilterOpts(filter string, opts *api.QueryOptions) (map[string]*api.AgentCheck, error) {
c.mu.Lock()
defer c.mu.Unlock()
now := time.Now()
result := make(map[string]*api.AgentCheck, len(c.responses))
// Use the latest response for each check
for k, vs := range c.responses {
for _, v := range vs {
if v.at.After(now) {
break
}
result[k] = &api.AgentCheck{
CheckID: k,
Name: k,
Status: v.status,
}
}
}
return result, nil
}
// testWatcherSetup sets up a fakeChecksAPI and a real checkWatcher with a test
// logger and faster poll frequency.
func testWatcherSetup(t *testing.T) (*fakeChecksAPI, *checkWatcher) {
logger := testlog.HCLogger(t)
checksAPI := newFakeChecksAPI()
namespacesClient := NewNamespacesClient(NewMockNamespaces(nil), NewMockAgent(ossFeatures))
cw := newCheckWatcher(logger, checksAPI, namespacesClient)
cw.pollFreq = 10 * time.Millisecond
return checksAPI, cw
}
func testCheck() *structs.ServiceCheck {
return &structs.ServiceCheck{
Name: "testcheck",
Interval: 100 * time.Millisecond,
Timeout: 100 * time.Millisecond,
CheckRestart: &structs.CheckRestart{
Limit: 3,
Grace: 100 * time.Millisecond,
IgnoreWarnings: false,
},
}
}
// TestCheckWatcher_Skip asserts unwatched checks are ignored.
func TestCheckWatcher_Skip(t *testing.T) {
ci.Parallel(t)
// Create a check with restarting disabled
check := testCheck()
check.CheckRestart = nil
logger := testlog.HCLogger(t)
checksAPI := newFakeChecksAPI()
namespacesClient := NewNamespacesClient(NewMockNamespaces(nil), NewMockAgent(ossFeatures))
cw := newCheckWatcher(logger, checksAPI, namespacesClient)
restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check)
cw.Watch("testalloc1", "testtask1", "testcheck1", check, restarter1)
// Check should have been dropped as it's not watched
if n := len(cw.checkUpdateCh); n != 0 {
t.Fatalf("expected 0 checks to be enqueued for watching but found %d", n)
}
}
// TestCheckWatcher_Healthy asserts healthy tasks are not restarted.
func TestCheckWatcher_Healthy(t *testing.T) {
ci.Parallel(t)
fakeAPI, cw := testWatcherSetup(t)
check1 := testCheck()
restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
check2 := testCheck()
check2.CheckRestart.Limit = 1
check2.CheckRestart.Grace = 0
restarter2 := newFakeCheckRestarter(cw, "testalloc2", "testtask2", "testcheck2", check2)
cw.Watch("testalloc2", "testtask2", "testcheck2", check2, restarter2)
// Make both checks healthy from the beginning
fakeAPI.add("testcheck1", "passing", time.Time{})
fakeAPI.add("testcheck2", "passing", time.Time{})
// Run
ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
defer cancel()
cw.Run(ctx)
// Ensure restart was never called
if n := len(restarter1.restarts); n > 0 {
t.Errorf("expected check 1 to not be restarted but found %d:\n%s", n, restarter1)
}
if n := len(restarter2.restarts); n > 0 {
t.Errorf("expected check 2 to not be restarted but found %d:\n%s", n, restarter2)
}
}
// TestCheckWatcher_Unhealthy asserts unhealthy tasks are restarted exactly once.
func TestCheckWatcher_Unhealthy(t *testing.T) {
ci.Parallel(t)
fakeAPI, cw := testWatcherSetup(t)
check1 := testCheck()
restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
// Check has always been failing
fakeAPI.add("testcheck1", "critical", time.Time{})
// Run
ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
defer cancel()
cw.Run(ctx)
// Ensure restart was called exactly once
require.Len(t, restarter1.restarts, 1)
}
// TestCheckWatcher_HealthyWarning asserts checks in warning with
// ignore_warnings=true do not restart tasks.
func TestCheckWatcher_HealthyWarning(t *testing.T) {
ci.Parallel(t)
fakeAPI, cw := testWatcherSetup(t)
check1 := testCheck()
check1.CheckRestart.Limit = 1
check1.CheckRestart.Grace = 0
check1.CheckRestart.IgnoreWarnings = true
restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
// Check is always in warning but that's ok
fakeAPI.add("testcheck1", "warning", time.Time{})
// Run
ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
defer cancel()
cw.Run(ctx)
// Ensure restart was never called on check 1
if n := len(restarter1.restarts); n > 0 {
t.Errorf("expected check 1 to not be restarted but found %d", n)
}
}
// TestCheckWatcher_Flapping asserts checks that flap from healthy to unhealthy
// before the unhealthy limit is reached do not restart tasks.
func TestCheckWatcher_Flapping(t *testing.T) {
ci.Parallel(t)
fakeAPI, cw := testWatcherSetup(t)
check1 := testCheck()
check1.CheckRestart.Grace = 0
restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
// Check flaps and is never failing for the full 200ms needed to restart
now := time.Now()
fakeAPI.add("testcheck1", "passing", now)
fakeAPI.add("testcheck1", "critical", now.Add(100*time.Millisecond))
fakeAPI.add("testcheck1", "passing", now.Add(250*time.Millisecond))
fakeAPI.add("testcheck1", "critical", now.Add(300*time.Millisecond))
fakeAPI.add("testcheck1", "passing", now.Add(450*time.Millisecond))
ctx, cancel := context.WithTimeout(context.Background(), 600*time.Millisecond)
defer cancel()
cw.Run(ctx)
// Ensure restart was never called on check 1
if n := len(restarter1.restarts); n > 0 {
t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1)
}
}
// TestCheckWatcher_Unwatch asserts unwatching checks prevents restarts.
func TestCheckWatcher_Unwatch(t *testing.T) {
ci.Parallel(t)
fakeAPI, cw := testWatcherSetup(t)
// Unwatch immediately
check1 := testCheck()
check1.CheckRestart.Limit = 1
check1.CheckRestart.Grace = 100 * time.Millisecond
restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
cw.Unwatch("testcheck1")
// Always failing
fakeAPI.add("testcheck1", "critical", time.Time{})
ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
defer cancel()
cw.Run(ctx)
// Ensure restart was never called on check 1
if n := len(restarter1.restarts); n > 0 {
t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1)
}
}
// TestCheckWatcher_MultipleChecks asserts that when there are multiple checks
// for a single task, all checks should be removed when any of them restart the
// task to avoid multiple restarts.
func TestCheckWatcher_MultipleChecks(t *testing.T) {
ci.Parallel(t)
fakeAPI, cw := testWatcherSetup(t)
check1 := testCheck()
check1.CheckRestart.Limit = 1
restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
check2 := testCheck()
check2.CheckRestart.Limit = 1
restarter2 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck2", check2)
cw.Watch("testalloc1", "testtask1", "testcheck2", check2, restarter2)
check3 := testCheck()
check3.CheckRestart.Limit = 1
restarter3 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck3", check3)
cw.Watch("testalloc1", "testtask1", "testcheck3", check3, restarter3)
// check 2 & 3 fail long enough to cause 1 restart, but only 1 should restart
now := time.Now()
fakeAPI.add("testcheck1", "critical", now)
fakeAPI.add("testcheck1", "passing", now.Add(150*time.Millisecond))
fakeAPI.add("testcheck2", "critical", now)
fakeAPI.add("testcheck2", "passing", now.Add(150*time.Millisecond))
fakeAPI.add("testcheck3", "passing", time.Time{})
// Run
ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
defer cancel()
cw.Run(ctx)
// Ensure that restart was only called once on check 1 or 2. Since
// checks are in a map it's random which check triggers the restart
// first.
if n := len(restarter1.restarts) + len(restarter2.restarts); n != 1 {
t.Errorf("expected check 1 & 2 to be restarted 1 time but found %d\ncheck 1:\n%s\ncheck 2:%s",
n, restarter1, restarter2)
}
if n := len(restarter3.restarts); n != 0 {
t.Errorf("expected check 3 to not be restarted but found %d:\n%s", n, restarter3)
}
}
// TestCheckWatcher_Deadlock asserts that check watcher will not deadlock when
// attempting to restart a task even if its update queue is full.
// https://github.com/hashicorp/nomad/issues/5395
func TestCheckWatcher_Deadlock(t *testing.T) {
ci.Parallel(t)
fakeAPI, cw := testWatcherSetup(t)
// If TR.Restart blocks, restarting len(checkUpdateCh)+1 checks causes
// a deadlock due to checkWatcher.Run being blocked in
// checkRestart.apply and unable to process updates from the chan!
n := cap(cw.checkUpdateCh) + 1
checks := make([]*structs.ServiceCheck, n)
restarters := make([]*fakeCheckRestarter, n)
for i := 0; i < n; i++ {
c := testCheck()
r := newFakeCheckRestarter(cw,
fmt.Sprintf("alloc%d", i),
fmt.Sprintf("task%d", i),
fmt.Sprintf("check%d", i),
c,
)
checks[i] = c
restarters[i] = r
}
// Run
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
go cw.Run(ctx)
// Watch
for _, r := range restarters {
cw.Watch(r.allocID, r.taskName, r.checkName, r.check, r)
}
// Make them all fail
for _, r := range restarters {
fakeAPI.add(r.checkName, "critical", time.Time{})
}
// Ensure that restart was called exactly once on all checks
testutil.WaitForResult(func() (bool, error) {
for _, r := range restarters {
if n := len(r.GetRestarts()); n != 1 {
return false, fmt.Errorf("expected 1 restart but found %d", n)
}
}
return true, nil
}, func(err error) {
require.NoError(t, err)
})
}