b3ea68948b
Go 1.19 will forcefully format all your doc strings. To get this out of the way, here is one big commit with all the changes gofmt wants to make.
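For reference, a minimal, hypothetical sketch of the kind of rewrite gofmt applies to doc comments starting with Go 1.19 (the names below are made up and not taken from this diff): text indented inside a doc comment is treated as a code block and gets reindented to a single tab.

// Before gofmt in Go 1.19 (doc-comment example indented with spaces):
//
// Dial connects to the server, for example:
//
//    c, err := Dial(addr)

// After gofmt (the doc-comment code block is indented with one tab):
//
// Dial connects to the server, for example:
//
//	c, err := Dial(addr)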
438 lines
13 KiB
Go
package consul

import (
	"context"
	"fmt"
	"sync"
	"testing"
	"time"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/nomad/ci"
	"github.com/hashicorp/nomad/helper/testlog"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/require"
)

// checkRestartRecord is used by a testFakeCtx to record when restarts occur
// due to a watched check.
type checkRestartRecord struct {
	timestamp time.Time
	source    string
	reason    string
	failure   bool
}

// fakeCheckRestarter is a test implementation of TaskRestarter.
type fakeCheckRestarter struct {
	// restarts is a slice of all of the restarts triggered by the checkWatcher
	restarts []checkRestartRecord

	// need the checkWatcher to re-Watch restarted tasks like TaskRunner
	watcher *checkWatcher

	// check to re-Watch on restarts
	check     *structs.ServiceCheck
	allocID   string
	taskName  string
	checkName string

	mu sync.Mutex
}

// newFakeCheckRestarter creates a new TaskRestarter. It needs all of the
// parameters checkWatcher.Watch expects.
func newFakeCheckRestarter(w *checkWatcher, allocID, taskName, checkName string, c *structs.ServiceCheck) *fakeCheckRestarter {
	return &fakeCheckRestarter{
		watcher:   w,
		check:     c,
		allocID:   allocID,
		taskName:  taskName,
		checkName: checkName,
	}
}

// Restart implements part of the TaskRestarter interface needed for check
// watching and is normally fulfilled by a TaskRunner.
//
// Restarts are recorded in the restarts field and the check is re-Watched.
func (c *fakeCheckRestarter) Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error {
	c.mu.Lock()
	defer c.mu.Unlock()
	restart := checkRestartRecord{
		timestamp: time.Now(),
		source:    event.Type,
		reason:    event.DisplayMessage,
		failure:   failure,
	}
	c.restarts = append(c.restarts, restart)

	// Re-Watch the check just like TaskRunner
	c.watcher.Watch(c.allocID, c.taskName, c.checkName, c.check, c)
	return nil
}

// String for debugging
func (c *fakeCheckRestarter) String() string {
	c.mu.Lock()
	defer c.mu.Unlock()

	s := fmt.Sprintf("%s %s %s restarts:\n", c.allocID, c.taskName, c.checkName)
	for _, r := range c.restarts {
		s += fmt.Sprintf("%s - %s: %s (failure: %t)\n", r.timestamp, r.source, r.reason, r.failure)
	}
	return s
}

// GetRestarts for testing in a threadsafe way
func (c *fakeCheckRestarter) GetRestarts() []checkRestartRecord {
	c.mu.Lock()
	defer c.mu.Unlock()

	o := make([]checkRestartRecord, len(c.restarts))
	copy(o, c.restarts)
	return o
}

// checkResponse is a response returned by the fakeChecksAPI after the given
// time.
type checkResponse struct {
	at     time.Time
	id     string
	status string
}

// fakeChecksAPI implements the ChecksWithFilterOpts() method for testing Consul.
type fakeChecksAPI struct {
	// responses is a map of check ids to their status at a particular
	// time. checkResponses must be in chronological order.
	responses map[string][]checkResponse

	mu sync.Mutex
}

func newFakeChecksAPI() *fakeChecksAPI {
	return &fakeChecksAPI{responses: make(map[string][]checkResponse)}
}

// add a new check status to Consul at the given time.
func (c *fakeChecksAPI) add(id, status string, at time.Time) {
	c.mu.Lock()
	c.responses[id] = append(c.responses[id], checkResponse{at, id, status})
	c.mu.Unlock()
}

func (c *fakeChecksAPI) ChecksWithFilterOpts(filter string, opts *api.QueryOptions) (map[string]*api.AgentCheck, error) {
	c.mu.Lock()
	defer c.mu.Unlock()
	now := time.Now()
	result := make(map[string]*api.AgentCheck, len(c.responses))

	// Use the latest response for each check
	for k, vs := range c.responses {
		for _, v := range vs {
			if v.at.After(now) {
				break
			}
			result[k] = &api.AgentCheck{
				CheckID: k,
				Name:    k,
				Status:  v.status,
			}
		}
	}

	return result, nil
}

// testWatcherSetup sets up a fakeChecksAPI and a real checkWatcher with a test
// logger and faster poll frequency.
func testWatcherSetup(t *testing.T) (*fakeChecksAPI, *checkWatcher) {
	logger := testlog.HCLogger(t)
	checksAPI := newFakeChecksAPI()
	namespacesClient := NewNamespacesClient(NewMockNamespaces(nil), NewMockAgent(ossFeatures))
	cw := newCheckWatcher(logger, checksAPI, namespacesClient)
	cw.pollFreq = 10 * time.Millisecond
	return checksAPI, cw
}

func testCheck() *structs.ServiceCheck {
	return &structs.ServiceCheck{
		Name:     "testcheck",
		Interval: 100 * time.Millisecond,
		Timeout:  100 * time.Millisecond,
		CheckRestart: &structs.CheckRestart{
			Limit:          3,
			Grace:          100 * time.Millisecond,
			IgnoreWarnings: false,
		},
	}
}

// TestCheckWatcher_Skip asserts unwatched checks are ignored.
func TestCheckWatcher_Skip(t *testing.T) {
	ci.Parallel(t)

	// Create a check with restarting disabled
	check := testCheck()
	check.CheckRestart = nil

	logger := testlog.HCLogger(t)
	checksAPI := newFakeChecksAPI()
	namespacesClient := NewNamespacesClient(NewMockNamespaces(nil), NewMockAgent(ossFeatures))

	cw := newCheckWatcher(logger, checksAPI, namespacesClient)
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check, restarter1)

	// Check should have been dropped as it's not watched
	if n := len(cw.checkUpdateCh); n != 0 {
		t.Fatalf("expected 0 checks to be enqueued for watching but found %d", n)
	}
}

// TestCheckWatcher_Healthy asserts healthy tasks are not restarted.
func TestCheckWatcher_Healthy(t *testing.T) {
	ci.Parallel(t)

	fakeAPI, cw := testWatcherSetup(t)

	check1 := testCheck()
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)

	check2 := testCheck()
	check2.CheckRestart.Limit = 1
	check2.CheckRestart.Grace = 0
	restarter2 := newFakeCheckRestarter(cw, "testalloc2", "testtask2", "testcheck2", check2)
	cw.Watch("testalloc2", "testtask2", "testcheck2", check2, restarter2)

	// Make both checks healthy from the beginning
	fakeAPI.add("testcheck1", "passing", time.Time{})
	fakeAPI.add("testcheck2", "passing", time.Time{})

	// Run
	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer cancel()
	cw.Run(ctx)

	// Ensure restart was never called
	if n := len(restarter1.restarts); n > 0 {
		t.Errorf("expected check 1 to not be restarted but found %d:\n%s", n, restarter1)
	}
	if n := len(restarter2.restarts); n > 0 {
		t.Errorf("expected check 2 to not be restarted but found %d:\n%s", n, restarter2)
	}
}

// TestCheckWatcher_Unhealthy asserts unhealthy tasks are restarted exactly once.
func TestCheckWatcher_Unhealthy(t *testing.T) {
	ci.Parallel(t)

	fakeAPI, cw := testWatcherSetup(t)

	check1 := testCheck()
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)

	// Check has always been failing
	fakeAPI.add("testcheck1", "critical", time.Time{})

	// Run
	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer cancel()
	cw.Run(ctx)

	// Ensure restart was called exactly once
	require.Len(t, restarter1.restarts, 1)
}

// TestCheckWatcher_HealthyWarning asserts checks in warning with
// ignore_warnings=true do not restart tasks.
func TestCheckWatcher_HealthyWarning(t *testing.T) {
	ci.Parallel(t)

	fakeAPI, cw := testWatcherSetup(t)

	check1 := testCheck()
	check1.CheckRestart.Limit = 1
	check1.CheckRestart.Grace = 0
	check1.CheckRestart.IgnoreWarnings = true
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)

	// Check is always in warning but that's ok
	fakeAPI.add("testcheck1", "warning", time.Time{})

	// Run
	ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
	defer cancel()
	cw.Run(ctx)

	// Ensure restart was never called on check 1
	if n := len(restarter1.restarts); n > 0 {
		t.Errorf("expected check 1 to not be restarted but found %d", n)
	}
}

// TestCheckWatcher_Flapping asserts checks that flap from healthy to unhealthy
// before the unhealthy limit is reached do not restart tasks.
func TestCheckWatcher_Flapping(t *testing.T) {
	ci.Parallel(t)

	fakeAPI, cw := testWatcherSetup(t)

	check1 := testCheck()
	check1.CheckRestart.Grace = 0
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)

	// Check flaps and is never failing for the full 200ms needed to restart
	now := time.Now()
	fakeAPI.add("testcheck1", "passing", now)
	fakeAPI.add("testcheck1", "critical", now.Add(100*time.Millisecond))
	fakeAPI.add("testcheck1", "passing", now.Add(250*time.Millisecond))
	fakeAPI.add("testcheck1", "critical", now.Add(300*time.Millisecond))
	fakeAPI.add("testcheck1", "passing", now.Add(450*time.Millisecond))

	ctx, cancel := context.WithTimeout(context.Background(), 600*time.Millisecond)
	defer cancel()
	cw.Run(ctx)

	// Ensure restart was never called on check 1
	if n := len(restarter1.restarts); n > 0 {
		t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1)
	}
}

// TestCheckWatcher_Unwatch asserts unwatching checks prevents restarts.
func TestCheckWatcher_Unwatch(t *testing.T) {
	ci.Parallel(t)

	fakeAPI, cw := testWatcherSetup(t)

	// Unwatch immediately
	check1 := testCheck()
	check1.CheckRestart.Limit = 1
	check1.CheckRestart.Grace = 100 * time.Millisecond
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
	cw.Unwatch("testcheck1")

	// Always failing
	fakeAPI.add("testcheck1", "critical", time.Time{})

	ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
	defer cancel()
	cw.Run(ctx)

	// Ensure restart was never called on check 1
	if n := len(restarter1.restarts); n > 0 {
		t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1)
	}
}

// TestCheckWatcher_MultipleChecks asserts that when there are multiple checks
// for a single task, all checks should be removed when any of them restart the
// task to avoid multiple restarts.
func TestCheckWatcher_MultipleChecks(t *testing.T) {
	ci.Parallel(t)

	fakeAPI, cw := testWatcherSetup(t)

	check1 := testCheck()
	check1.CheckRestart.Limit = 1
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)

	check2 := testCheck()
	check2.CheckRestart.Limit = 1
	restarter2 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck2", check2)
	cw.Watch("testalloc1", "testtask1", "testcheck2", check2, restarter2)

	check3 := testCheck()
	check3.CheckRestart.Limit = 1
	restarter3 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck3", check3)
	cw.Watch("testalloc1", "testtask1", "testcheck3", check3, restarter3)

	// checks 1 & 2 fail long enough to cause 1 restart, but only 1 should restart
	now := time.Now()
	fakeAPI.add("testcheck1", "critical", now)
	fakeAPI.add("testcheck1", "passing", now.Add(150*time.Millisecond))
	fakeAPI.add("testcheck2", "critical", now)
	fakeAPI.add("testcheck2", "passing", now.Add(150*time.Millisecond))
	fakeAPI.add("testcheck3", "passing", time.Time{})

	// Run
	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer cancel()
	cw.Run(ctx)

	// Ensure that restart was only called once on check 1 or 2. Since
	// checks are in a map it's random which check triggers the restart
	// first.
	if n := len(restarter1.restarts) + len(restarter2.restarts); n != 1 {
		t.Errorf("expected check 1 & 2 to be restarted 1 time but found %d\ncheck 1:\n%s\ncheck 2:%s",
			n, restarter1, restarter2)
	}

	if n := len(restarter3.restarts); n != 0 {
		t.Errorf("expected check 3 to not be restarted but found %d:\n%s", n, restarter3)
	}
}

// TestCheckWatcher_Deadlock asserts that check watcher will not deadlock when
// attempting to restart a task even if its update queue is full.
// https://github.com/hashicorp/nomad/issues/5395
func TestCheckWatcher_Deadlock(t *testing.T) {
	ci.Parallel(t)

	fakeAPI, cw := testWatcherSetup(t)

	// If TR.Restart blocks, restarting len(checkUpdateCh)+1 checks causes
	// a deadlock due to checkWatcher.Run being blocked in
	// checkRestart.apply and unable to process updates from the chan!
	n := cap(cw.checkUpdateCh) + 1
	checks := make([]*structs.ServiceCheck, n)
	restarters := make([]*fakeCheckRestarter, n)
	for i := 0; i < n; i++ {
		c := testCheck()
		r := newFakeCheckRestarter(cw,
			fmt.Sprintf("alloc%d", i),
			fmt.Sprintf("task%d", i),
			fmt.Sprintf("check%d", i),
			c,
		)
		checks[i] = c
		restarters[i] = r
	}

	// Run
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go cw.Run(ctx)

	// Watch
	for _, r := range restarters {
		cw.Watch(r.allocID, r.taskName, r.checkName, r.check, r)
	}

	// Make them all fail
	for _, r := range restarters {
		fakeAPI.add(r.checkName, "critical", time.Time{})
	}

	// Ensure that restart was called exactly once on all checks
	testutil.WaitForResult(func() (bool, error) {
		for _, r := range restarters {
			if n := len(r.GetRestarts()); n != 1 {
				return false, fmt.Errorf("expected 1 restart but found %d", n)
			}
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})
}