client: add support for checks in nomad services
This PR adds support for specifying checks in services registered to the built-in nomad service provider. Currently only HTTP and TCP checks are supported, though more types could be added later.
This commit is contained in:
parent
2f444a43f0
commit
297d386bdc
|
@ -8,9 +8,8 @@ import (
|
|||
"io"
|
||||
"time"
|
||||
|
||||
metrics "github.com/armon/go-metrics"
|
||||
"github.com/armon/go-metrics"
|
||||
"github.com/hashicorp/go-msgpack/codec"
|
||||
|
||||
"github.com/hashicorp/nomad/acl"
|
||||
cstructs "github.com/hashicorp/nomad/client/structs"
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
|
@ -137,6 +136,29 @@ func (a *Allocations) Stats(args *cstructs.AllocStatsRequest, reply *cstructs.Al
|
|||
return nil
|
||||
}
|
||||
|
||||
// Checks is used to retrieve nomad service discovery check status information.
|
||||
func (a *Allocations) Checks(args *cstructs.AllocChecksRequest, reply *cstructs.AllocChecksResponse) error {
|
||||
defer metrics.MeasureSince([]string{"client", "allocations", "checks"}, time.Now())
|
||||
|
||||
// Get the allocation
|
||||
alloc, err := a.c.GetAlloc(args.AllocID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Check read-job permission
|
||||
if aclObj, aclErr := a.c.ResolveToken(args.AuthToken); aclErr != nil {
|
||||
return aclErr
|
||||
} else if aclObj != nil && !aclObj.AllowNsOp(alloc.Namespace, acl.NamespaceCapabilityReadJob) {
|
||||
return nstructs.ErrPermissionDenied
|
||||
}
|
||||
|
||||
// Get the status information for the allocation
|
||||
reply.Results = a.c.checkStore.List(alloc.ID)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// exec is used to execute command in a running task
|
||||
func (a *Allocations) exec(conn io.ReadWriteCloser) {
|
||||
defer metrics.MeasureSince([]string{"client", "allocations", "exec"}, time.Now())
|
||||
|
|
|
@ -23,6 +23,7 @@ import (
|
|||
nconfig "github.com/hashicorp/nomad/nomad/structs/config"
|
||||
"github.com/hashicorp/nomad/plugins/drivers"
|
||||
"github.com/hashicorp/nomad/testutil"
|
||||
"github.com/shoenig/test/must"
|
||||
"github.com/stretchr/testify/require"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
@ -529,6 +530,92 @@ func TestAllocations_Stats_ACL(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestAlloc_Checks(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
client, cleanup := TestClient(t, nil)
|
||||
t.Cleanup(func() {
|
||||
must.NoError(t, cleanup())
|
||||
})
|
||||
|
||||
now := time.Date(2022, 3, 4, 5, 6, 7, 8, time.UTC).Unix()
|
||||
|
||||
qr1 := &nstructs.CheckQueryResult{
|
||||
ID: "abc123",
|
||||
Mode: "healthiness",
|
||||
Status: "passing",
|
||||
Output: "nomad: http ok",
|
||||
Timestamp: now,
|
||||
Group: "group",
|
||||
Task: "task",
|
||||
Service: "service",
|
||||
Check: "check",
|
||||
}
|
||||
|
||||
qr2 := &nstructs.CheckQueryResult{
|
||||
ID: "def456",
|
||||
Mode: "readiness",
|
||||
Status: "passing",
|
||||
Output: "nomad: http ok",
|
||||
Timestamp: now,
|
||||
Group: "group",
|
||||
Service: "service2",
|
||||
Check: "check",
|
||||
}
|
||||
|
||||
t.Run("alloc does not exist", func(t *testing.T) {
|
||||
request := cstructs.AllocChecksRequest{AllocID: "d3e34248-4843-be75-d4fd-4899975cfb38"}
|
||||
var response cstructs.AllocChecksResponse
|
||||
err := client.ClientRPC("Allocations.Checks", &request, &response)
|
||||
must.EqError(t, err, `Unknown allocation "d3e34248-4843-be75-d4fd-4899975cfb38"`)
|
||||
})
|
||||
|
||||
t.Run("no checks for alloc", func(t *testing.T) {
|
||||
alloc := mock.Alloc()
|
||||
must.NoError(t, client.addAlloc(alloc, ""))
|
||||
|
||||
request := cstructs.AllocChecksRequest{AllocID: alloc.ID}
|
||||
var response cstructs.AllocChecksResponse
|
||||
err := client.ClientRPC("Allocations.Checks", &request, &response)
|
||||
must.NoError(t, err)
|
||||
must.MapEmpty(t, response.Results)
|
||||
})
|
||||
|
||||
t.Run("two in one alloc", func(t *testing.T) {
|
||||
alloc := mock.Alloc()
|
||||
must.NoError(t, client.addAlloc(alloc, ""))
|
||||
must.NoError(t, client.checkStore.Set(alloc.ID, qr1))
|
||||
must.NoError(t, client.checkStore.Set(alloc.ID, qr2))
|
||||
|
||||
request := cstructs.AllocChecksRequest{AllocID: alloc.ID}
|
||||
var response cstructs.AllocChecksResponse
|
||||
err := client.ClientRPC("Allocations.Checks", &request, &response)
|
||||
must.NoError(t, err)
|
||||
must.MapEq(t, map[nstructs.CheckID]*nstructs.CheckQueryResult{
|
||||
"abc123": qr1,
|
||||
"def456": qr2,
|
||||
}, response.Results)
|
||||
})
|
||||
|
||||
t.Run("ignore unrelated alloc", func(t *testing.T) {
|
||||
alloc1 := mock.Alloc()
|
||||
must.NoError(t, client.addAlloc(alloc1, ""))
|
||||
|
||||
alloc2 := mock.Alloc()
|
||||
must.NoError(t, client.addAlloc(alloc2, ""))
|
||||
must.NoError(t, client.checkStore.Set(alloc1.ID, qr1))
|
||||
must.NoError(t, client.checkStore.Set(alloc2.ID, qr2))
|
||||
|
||||
request := cstructs.AllocChecksRequest{AllocID: alloc1.ID}
|
||||
var response cstructs.AllocChecksResponse
|
||||
err := client.ClientRPC("Allocations.Checks", &request, &response)
|
||||
must.NoError(t, err)
|
||||
must.MapEq(t, map[nstructs.CheckID]*nstructs.CheckQueryResult{
|
||||
"abc123": qr1,
|
||||
}, response.Results)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAlloc_ExecStreaming(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
require := require.New(t)
|
||||
|
|
|
@ -8,9 +8,11 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/hashicorp/consul/api"
|
||||
hclog "github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
|
||||
cstructs "github.com/hashicorp/nomad/client/structs"
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
|
||||
|
@ -18,9 +20,9 @@ const (
|
|||
// AllocHealthEventSource is the source used for emitting task events
|
||||
AllocHealthEventSource = "Alloc Unhealthy"
|
||||
|
||||
// consulCheckLookupInterval is the interval at which we check if the
|
||||
// Consul checks are healthy or unhealthy.
|
||||
consulCheckLookupInterval = 500 * time.Millisecond
|
||||
// checkLookupInterval is the pace at which we check if the Consul or Nomad
|
||||
// checks for an allocation are healthy or unhealthy.
|
||||
checkLookupInterval = 500 * time.Millisecond
|
||||
)
|
||||
|
||||
// Tracker tracks the health of an allocation and makes health events watchable
|
||||
|
@ -40,23 +42,30 @@ type Tracker struct {
|
|||
// considered healthy
|
||||
minHealthyTime time.Duration
|
||||
|
||||
// checkLookupInterval is the interval at which we check if the
|
||||
// Consul checks are healthy or unhealthy.
|
||||
// checkLookupInterval is the repeated interval after which which we check
|
||||
// if the Consul checks are healthy or unhealthy.
|
||||
checkLookupInterval time.Duration
|
||||
|
||||
// useChecks specifies whether to use Consul healh checks or not
|
||||
// useChecks specifies whether to consider Consul and Nomad service checks.
|
||||
useChecks bool
|
||||
|
||||
// consulCheckCount is the number of checks the task group will attempt to
|
||||
// register
|
||||
// consulCheckCount is the total number of Consul service checks in the task
|
||||
// group including task level checks.
|
||||
consulCheckCount int
|
||||
|
||||
// nomadCheckCount is the total the number of Nomad service checks in the task
|
||||
// group including task level checks.
|
||||
nomadCheckCount int
|
||||
|
||||
// allocUpdates is a listener for retrieving new alloc updates
|
||||
allocUpdates *cstructs.AllocListener
|
||||
|
||||
// consulClient is used to look up the state of the task's checks
|
||||
// consulClient is used to look up the status of Consul service checks
|
||||
consulClient serviceregistration.Handler
|
||||
|
||||
// checkStore is used to lookup the status of Nomad service checks
|
||||
checkStore checkstore.Shim
|
||||
|
||||
// healthy is used to signal whether we have determined the allocation to be
|
||||
// healthy or unhealthy
|
||||
healthy chan bool
|
||||
|
@ -69,11 +78,11 @@ type Tracker struct {
|
|||
// These tasks may terminate without affecting alloc health
|
||||
lifecycleTasks map[string]string
|
||||
|
||||
// l is used to lock shared fields listed below
|
||||
l sync.Mutex
|
||||
// lock is used to lock shared fields listed below
|
||||
lock sync.Mutex
|
||||
|
||||
// tasksHealthy marks whether all the tasks have met their health check
|
||||
// (disregards Consul)
|
||||
// (disregards Consul and Nomad checks)
|
||||
tasksHealthy bool
|
||||
|
||||
// allocFailed marks whether the allocation failed
|
||||
|
@ -82,22 +91,31 @@ type Tracker struct {
|
|||
// checksHealthy marks whether all the task's Consul checks are healthy
|
||||
checksHealthy bool
|
||||
|
||||
// taskHealth contains the health state for each task
|
||||
// taskHealth contains the health state for each task in the allocation
|
||||
// name -> state
|
||||
taskHealth map[string]*taskHealthState
|
||||
|
||||
// logger is for logging things
|
||||
logger hclog.Logger
|
||||
}
|
||||
|
||||
// NewTracker returns a health tracker for the given allocation. An alloc
|
||||
// listener and consul API object are given so that the watcher can detect
|
||||
// health changes.
|
||||
func NewTracker(parentCtx context.Context, logger hclog.Logger, alloc *structs.Allocation,
|
||||
allocUpdates *cstructs.AllocListener, consulClient serviceregistration.Handler,
|
||||
minHealthyTime time.Duration, useChecks bool) *Tracker {
|
||||
// NewTracker returns a health tracker for the given allocation.
|
||||
//
|
||||
// Depending on job configuration, an allocation's health takes into consideration
|
||||
// - An alloc listener
|
||||
// - Consul checks (via consul API)
|
||||
// - Nomad checks (via client state)
|
||||
func NewTracker(
|
||||
parentCtx context.Context,
|
||||
logger hclog.Logger,
|
||||
alloc *structs.Allocation,
|
||||
allocUpdates *cstructs.AllocListener,
|
||||
consulClient serviceregistration.Handler,
|
||||
checkStore checkstore.Shim,
|
||||
minHealthyTime time.Duration,
|
||||
useChecks bool,
|
||||
) *Tracker {
|
||||
|
||||
// Do not create a named sub-logger as the hook controlling
|
||||
// this struct should pass in an appropriately named
|
||||
// sub-logger.
|
||||
t := &Tracker{
|
||||
healthy: make(chan bool, 1),
|
||||
allocStopped: make(chan struct{}),
|
||||
|
@ -107,7 +125,8 @@ func NewTracker(parentCtx context.Context, logger hclog.Logger, alloc *structs.A
|
|||
useChecks: useChecks,
|
||||
allocUpdates: allocUpdates,
|
||||
consulClient: consulClient,
|
||||
checkLookupInterval: consulCheckLookupInterval,
|
||||
checkStore: checkStore,
|
||||
checkLookupInterval: checkLookupInterval,
|
||||
logger: logger,
|
||||
lifecycleTasks: map[string]string{},
|
||||
}
|
||||
|
@ -120,24 +139,42 @@ func NewTracker(parentCtx context.Context, logger hclog.Logger, alloc *structs.A
|
|||
t.lifecycleTasks[task.Name] = task.Lifecycle.Hook
|
||||
}
|
||||
|
||||
for _, s := range task.Services {
|
||||
t.consulCheckCount += len(s.Checks)
|
||||
}
|
||||
c, n := countChecks(task.Services)
|
||||
t.consulCheckCount += c
|
||||
t.nomadCheckCount += n
|
||||
}
|
||||
|
||||
for _, s := range t.tg.Services {
|
||||
t.consulCheckCount += len(s.Checks)
|
||||
}
|
||||
c, n := countChecks(t.tg.Services)
|
||||
t.consulCheckCount += c
|
||||
t.nomadCheckCount += n
|
||||
|
||||
t.ctx, t.cancelFn = context.WithCancel(parentCtx)
|
||||
return t
|
||||
}
|
||||
|
||||
func countChecks(services []*structs.Service) (consul, nomad int) {
|
||||
for _, service := range services {
|
||||
switch service.Provider {
|
||||
case structs.ServiceProviderNomad:
|
||||
nomad += len(service.Checks)
|
||||
default:
|
||||
consul += len(service.Checks)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Start starts the watcher.
|
||||
func (t *Tracker) Start() {
|
||||
go t.watchTaskEvents()
|
||||
if t.useChecks {
|
||||
|
||||
switch {
|
||||
case !t.useChecks:
|
||||
return
|
||||
case t.consulCheckCount > 0:
|
||||
go t.watchConsulEvents()
|
||||
case t.nomadCheckCount > 0:
|
||||
go t.watchNomadEvents()
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -157,8 +194,8 @@ func (t *Tracker) AllocStoppedCh() <-chan struct{} {
|
|||
// health has been determined. Only tasks that have contributed to the
|
||||
// allocation being unhealthy will have an event.
|
||||
func (t *Tracker) TaskEvents() map[string]*structs.TaskEvent {
|
||||
t.l.Lock()
|
||||
defer t.l.Unlock()
|
||||
t.lock.Lock()
|
||||
defer t.lock.Unlock()
|
||||
|
||||
// Nothing to do since the failure wasn't task related
|
||||
if t.allocFailed {
|
||||
|
@ -180,10 +217,11 @@ func (t *Tracker) TaskEvents() map[string]*structs.TaskEvent {
|
|||
}
|
||||
|
||||
// setTaskHealth is used to set the tasks health as healthy or unhealthy. If the
|
||||
// allocation is terminal, health is immediately broadcasted.
|
||||
// allocation is terminal, health is immediately broadcast.
|
||||
func (t *Tracker) setTaskHealth(healthy, terminal bool) {
|
||||
t.l.Lock()
|
||||
defer t.l.Unlock()
|
||||
t.lock.Lock()
|
||||
defer t.lock.Unlock()
|
||||
|
||||
t.tasksHealthy = healthy
|
||||
|
||||
// if unhealthy, force waiting for new checks health status
|
||||
|
@ -192,15 +230,23 @@ func (t *Tracker) setTaskHealth(healthy, terminal bool) {
|
|||
return
|
||||
}
|
||||
|
||||
// If we are marked healthy but we also require Consul to be healthy and it
|
||||
// isn't yet, return, unless the task is terminal
|
||||
requireConsul := t.useChecks && t.consulCheckCount > 0
|
||||
if !terminal && healthy && requireConsul && !t.checksHealthy {
|
||||
// If we are marked healthy but we also require Consul checks to be healthy
|
||||
// and they are not yet, return, unless the task is terminal.
|
||||
usesConsulChecks := t.useChecks && t.consulCheckCount > 0
|
||||
if !terminal && healthy && usesConsulChecks && !t.checksHealthy {
|
||||
return
|
||||
}
|
||||
|
||||
// If we are marked healthy but also require Nomad checks to be healthy and
|
||||
// they are not yet, return, unless the task is terminal.
|
||||
usesNomadChecks := t.useChecks && t.nomadCheckCount > 0
|
||||
if !terminal && healthy && usesNomadChecks && !t.checksHealthy {
|
||||
return
|
||||
}
|
||||
|
||||
select {
|
||||
case t.healthy <- healthy:
|
||||
// nothing
|
||||
default:
|
||||
}
|
||||
|
||||
|
@ -210,9 +256,13 @@ func (t *Tracker) setTaskHealth(healthy, terminal bool) {
|
|||
|
||||
// setCheckHealth is used to mark the checks as either healthy or unhealthy.
|
||||
// returns true if health is propagated and no more health monitoring is needed
|
||||
//
|
||||
// todo: this is currently being shared by watchConsulEvents and watchNomadEvents,
|
||||
// and must be split up if/when we support registering services (and thus checks)
|
||||
// of different providers.
|
||||
func (t *Tracker) setCheckHealth(healthy bool) bool {
|
||||
t.l.Lock()
|
||||
defer t.l.Unlock()
|
||||
t.lock.Lock()
|
||||
defer t.lock.Unlock()
|
||||
|
||||
// check health should always be false if tasks are unhealthy
|
||||
// as checks might be missing from unhealthy tasks
|
||||
|
@ -225,10 +275,11 @@ func (t *Tracker) setCheckHealth(healthy bool) bool {
|
|||
|
||||
select {
|
||||
case t.healthy <- healthy:
|
||||
// nothing
|
||||
default:
|
||||
}
|
||||
|
||||
// Shutdown the tracker
|
||||
// Shutdown the tracker, things are healthy so nothing to do
|
||||
t.cancelFn()
|
||||
return true
|
||||
}
|
||||
|
@ -244,13 +295,8 @@ func (t *Tracker) markAllocStopped() {
|
|||
func (t *Tracker) watchTaskEvents() {
|
||||
alloc := t.alloc
|
||||
allStartedTime := time.Time{}
|
||||
healthyTimer := time.NewTimer(0)
|
||||
if !healthyTimer.Stop() {
|
||||
select {
|
||||
case <-healthyTimer.C:
|
||||
default:
|
||||
}
|
||||
}
|
||||
|
||||
waiter := newHealthyFuture()
|
||||
|
||||
for {
|
||||
// If the alloc is being stopped by the server just exit
|
||||
|
@ -262,7 +308,7 @@ func (t *Tracker) watchTaskEvents() {
|
|||
}
|
||||
|
||||
// Store the task states
|
||||
t.l.Lock()
|
||||
t.lock.Lock()
|
||||
for task, state := range alloc.TaskStates {
|
||||
//TODO(schmichael) for now skip unknown tasks as
|
||||
//they're task group services which don't currently
|
||||
|
@ -271,7 +317,7 @@ func (t *Tracker) watchTaskEvents() {
|
|||
v.state = state
|
||||
}
|
||||
}
|
||||
t.l.Unlock()
|
||||
t.lock.Unlock()
|
||||
|
||||
// Detect if the alloc is unhealthy or if all tasks have started yet
|
||||
latestStartTime := time.Time{}
|
||||
|
@ -308,9 +354,9 @@ func (t *Tracker) watchTaskEvents() {
|
|||
// individual tasks failed, that means something failed at the alloc
|
||||
// level.
|
||||
if alloc.ClientStatus == structs.AllocClientStatusFailed {
|
||||
t.l.Lock()
|
||||
t.lock.Lock()
|
||||
t.allocFailed = true
|
||||
t.l.Unlock()
|
||||
t.lock.Unlock()
|
||||
|
||||
t.setTaskHealth(false, true)
|
||||
return
|
||||
|
@ -321,17 +367,12 @@ func (t *Tracker) watchTaskEvents() {
|
|||
t.setTaskHealth(false, false)
|
||||
|
||||
// Avoid the timer from firing at the old start time
|
||||
if !healthyTimer.Stop() {
|
||||
select {
|
||||
case <-healthyTimer.C:
|
||||
default:
|
||||
}
|
||||
}
|
||||
waiter.disable()
|
||||
|
||||
// Set the timer since all tasks are started
|
||||
if !latestStartTime.IsZero() {
|
||||
allStartedTime = latestStartTime
|
||||
healthyTimer.Reset(t.minHealthyTime)
|
||||
waiter.wait(t.minHealthyTime)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -343,33 +384,66 @@ func (t *Tracker) watchTaskEvents() {
|
|||
return
|
||||
}
|
||||
alloc = newAlloc
|
||||
case <-healthyTimer.C:
|
||||
case <-waiter.C():
|
||||
t.setTaskHealth(true, false)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// watchConsulEvents is a watcher for the health of the allocation's Consul
|
||||
// checks. If all checks report healthy the watcher will exit after the
|
||||
// MinHealthyTime has been reached, Otherwise the watcher will continue to
|
||||
// check unhealthy checks until the ctx is cancelled
|
||||
func (t *Tracker) watchConsulEvents() {
|
||||
// checkTicker is the ticker that triggers us to look at the checks in
|
||||
// Consul
|
||||
checkTicker := time.NewTicker(t.checkLookupInterval)
|
||||
defer checkTicker.Stop()
|
||||
// healthyFuture is used to fire after checks have been healthy for MinHealthyTime
|
||||
type healthyFuture struct {
|
||||
timer *time.Timer
|
||||
}
|
||||
|
||||
// healthyTimer fires when the checks have been healthy for the
|
||||
// MinHealthyTime
|
||||
healthyTimer := time.NewTimer(0)
|
||||
if !healthyTimer.Stop() {
|
||||
// newHealthyFuture will create a healthyFuture in a disabled state, and
|
||||
// will do nothing until a call to wait takes place
|
||||
func newHealthyFuture() *healthyFuture {
|
||||
timer := time.NewTimer(0)
|
||||
ht := &healthyFuture{timer: timer}
|
||||
ht.disable()
|
||||
return ht
|
||||
}
|
||||
|
||||
// disable the healthyFuture from triggering
|
||||
func (h *healthyFuture) disable() {
|
||||
if !h.timer.Stop() {
|
||||
// must ensure channel is clear
|
||||
// https://pkg.go.dev/time#Timer.Stop
|
||||
select {
|
||||
case <-healthyTimer.C:
|
||||
case <-h.timer.C:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// primed marks whether the healthy timer has been set
|
||||
// wait will reset the healthyFuture to trigger after dur passes.
|
||||
func (h *healthyFuture) wait(dur time.Duration) {
|
||||
// must ensure timer is stopped
|
||||
// https://pkg.go.dev/time#Timer.Reset
|
||||
h.disable()
|
||||
h.timer.Reset(dur)
|
||||
}
|
||||
|
||||
// C returns a channel on which the future will send when ready.
|
||||
func (h *healthyFuture) C() <-chan time.Time {
|
||||
return h.timer.C
|
||||
}
|
||||
|
||||
// watchConsulEvents is a watcher for the health of the allocation's Consul
|
||||
// checks. If all checks report healthy the watcher will exit after the
|
||||
// MinHealthyTime has been reached, otherwise the watcher will continue to
|
||||
// check unhealthy checks until the ctx is cancelled.
|
||||
//
|
||||
// Does not watch Nomad service checks; see watchNomadEvents for those.
|
||||
func (t *Tracker) watchConsulEvents() {
|
||||
// checkTicker is the ticker that triggers us to look at the checks in Consul
|
||||
checkTicker := time.NewTicker(t.checkLookupInterval)
|
||||
defer checkTicker.Stop()
|
||||
|
||||
// waiter is used to fire when the checks have been healthy for the MinHealthyTime
|
||||
waiter := newHealthyFuture()
|
||||
|
||||
// primed marks whether the healthy waiter has been set
|
||||
primed := false
|
||||
|
||||
// Store whether the last Consul checks call was successful or not
|
||||
|
@ -381,8 +455,12 @@ func (t *Tracker) watchConsulEvents() {
|
|||
OUTER:
|
||||
for {
|
||||
select {
|
||||
|
||||
// we are shutting down
|
||||
case <-t.ctx.Done():
|
||||
return
|
||||
|
||||
// it is time to check the checks
|
||||
case <-checkTicker.C:
|
||||
newAllocReg, err := t.consulClient.AllocRegistrations(t.alloc.ID)
|
||||
if err != nil {
|
||||
|
@ -395,12 +473,15 @@ OUTER:
|
|||
consulChecksErr = false
|
||||
allocReg = newAllocReg
|
||||
}
|
||||
case <-healthyTimer.C:
|
||||
|
||||
// enough time has passed with healthy checks
|
||||
case <-waiter.C():
|
||||
if t.setCheckHealth(true) {
|
||||
// final health set and propagated
|
||||
return
|
||||
}
|
||||
// tasks are unhealthy, reset and wait until all is healthy
|
||||
// checks are healthy but tasks are unhealthy,
|
||||
// reset and wait until all is healthy
|
||||
primed = false
|
||||
}
|
||||
|
||||
|
@ -409,7 +490,7 @@ OUTER:
|
|||
}
|
||||
|
||||
// Store the task registrations
|
||||
t.l.Lock()
|
||||
t.lock.Lock()
|
||||
for task, reg := range allocReg.Tasks {
|
||||
//TODO(schmichael) for now skip unknown tasks as
|
||||
//they're task group services which don't currently
|
||||
|
@ -418,7 +499,7 @@ OUTER:
|
|||
v.taskRegistrations = reg
|
||||
}
|
||||
}
|
||||
t.l.Unlock()
|
||||
t.lock.Unlock()
|
||||
|
||||
// Detect if all the checks are passing
|
||||
passed := true
|
||||
|
@ -427,16 +508,16 @@ OUTER:
|
|||
for _, treg := range allocReg.Tasks {
|
||||
for _, sreg := range treg.Services {
|
||||
for _, check := range sreg.Checks {
|
||||
onupdate := sreg.CheckOnUpdate[check.CheckID]
|
||||
onUpdate := sreg.CheckOnUpdate[check.CheckID]
|
||||
switch check.Status {
|
||||
case api.HealthPassing:
|
||||
continue
|
||||
case api.HealthWarning:
|
||||
if onupdate == structs.OnUpdateIgnoreWarn || onupdate == structs.OnUpdateIgnore {
|
||||
if onUpdate == structs.OnUpdateIgnoreWarn || onUpdate == structs.OnUpdateIgnore {
|
||||
continue
|
||||
}
|
||||
case api.HealthCritical:
|
||||
if onupdate == structs.OnUpdateIgnore {
|
||||
if onUpdate == structs.OnUpdateIgnore {
|
||||
continue
|
||||
}
|
||||
default:
|
||||
|
@ -452,25 +533,94 @@ OUTER:
|
|||
if !passed {
|
||||
// Reset the timer since we have transitioned back to unhealthy
|
||||
if primed {
|
||||
if !healthyTimer.Stop() {
|
||||
select {
|
||||
case <-healthyTimer.C:
|
||||
default:
|
||||
}
|
||||
}
|
||||
primed = false
|
||||
waiter.disable()
|
||||
}
|
||||
} else if !primed {
|
||||
// Reset the timer to fire after MinHealthyTime
|
||||
if !healthyTimer.Stop() {
|
||||
select {
|
||||
case <-healthyTimer.C:
|
||||
default:
|
||||
primed = true
|
||||
waiter.disable()
|
||||
waiter.wait(t.minHealthyTime)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// watchNomadEvents is a watcher for the health of the allocation's Nomad checks.
|
||||
// If all checks report healthy the watcher will exit after the MinHealthyTime has
|
||||
// been reached, otherwise the watcher will continue to check unhealthy checks until
|
||||
// the ctx is cancelled.
|
||||
//
|
||||
// Does not watch Consul service checks; see watchConsulEvents for those.
|
||||
func (t *Tracker) watchNomadEvents() {
|
||||
// checkTicker is the ticker that triggers us to look at the checks in Nomad
|
||||
checkTicker, cancel := helper.NewSafeTimer(t.checkLookupInterval)
|
||||
defer cancel()
|
||||
|
||||
// waiter is used to fire when the checks have been healthy for the MinHealthyTime
|
||||
waiter := newHealthyFuture()
|
||||
|
||||
// allocID of the allocation we are watching checks for
|
||||
allocID := t.alloc.ID
|
||||
|
||||
// primed marks whether the healthy waiter has been set
|
||||
primed := false
|
||||
|
||||
// latest set of nomad check results
|
||||
var results map[structs.CheckID]*structs.CheckQueryResult
|
||||
|
||||
for {
|
||||
select {
|
||||
|
||||
// we are shutting down
|
||||
case <-t.ctx.Done():
|
||||
return
|
||||
|
||||
// it is time to check the checks
|
||||
case <-checkTicker.C:
|
||||
results = t.checkStore.List(allocID)
|
||||
checkTicker.Reset(t.checkLookupInterval)
|
||||
|
||||
// enough time has passed with healthy checks
|
||||
case <-waiter.C():
|
||||
if t.setCheckHealth(true) { // todo(shoenig) this needs to be split between Consul and Nomad
|
||||
return // final health set and propagated
|
||||
}
|
||||
// checks are healthy but tasks are unhealthy, reset and wait
|
||||
primed = false
|
||||
}
|
||||
|
||||
// scan to see if any checks are failing
|
||||
passing := true
|
||||
for _, result := range results {
|
||||
switch result.Status {
|
||||
case structs.CheckSuccess:
|
||||
continue
|
||||
case structs.CheckFailure:
|
||||
if result.Mode == structs.Readiness {
|
||||
continue
|
||||
}
|
||||
passing = false
|
||||
default:
|
||||
// i.e. pending check; do not consider healthy or ready
|
||||
passing = false
|
||||
}
|
||||
|
||||
if !passing {
|
||||
break // 1+ check is failing; no need to continue
|
||||
}
|
||||
}
|
||||
|
||||
if !passing {
|
||||
// at least one check is failing, transition to unhealthy
|
||||
t.setCheckHealth(false)
|
||||
primed = false
|
||||
waiter.disable()
|
||||
}
|
||||
|
||||
if passing && !primed {
|
||||
// healthy but not yet primed, set timer to wait
|
||||
primed = true
|
||||
healthyTimer.Reset(t.minHealthyTime)
|
||||
waiter.wait(t.minHealthyTime)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -488,15 +638,13 @@ type taskHealthState struct {
|
|||
// strategy of the group. It returns true if the task has contributed to the
|
||||
// allocation being unhealthy and if so, an event description of why.
|
||||
func (t *taskHealthState) event(deadline time.Time, healthyDeadline, minHealthyTime time.Duration, useChecks bool) (string, bool) {
|
||||
requireChecks := false
|
||||
desiredChecks := 0
|
||||
for _, s := range t.task.Services {
|
||||
if nc := len(s.Checks); nc > 0 {
|
||||
requireChecks = true
|
||||
desiredChecks += nc
|
||||
}
|
||||
}
|
||||
requireChecks = requireChecks && useChecks
|
||||
requireChecks := (desiredChecks > 0) && useChecks
|
||||
|
||||
if t.state != nil {
|
||||
if t.state.Failed {
|
||||
|
@ -507,7 +655,7 @@ func (t *taskHealthState) event(deadline time.Time, healthyDeadline, minHealthyT
|
|||
case structs.TaskStatePending:
|
||||
return fmt.Sprintf("Task not running by healthy_deadline of %v", healthyDeadline), true
|
||||
case structs.TaskStateDead:
|
||||
// hook tasks are healthy when dead successfully
|
||||
// non-sidecar hook lifecycle tasks are healthy if they exit with success
|
||||
if t.task.Lifecycle == nil || t.task.Lifecycle.Sidecar {
|
||||
return "Unhealthy because of dead task", true
|
||||
}
|
||||
|
|
|
@ -10,16 +10,19 @@ import (
|
|||
consulapi "github.com/hashicorp/consul/api"
|
||||
"github.com/hashicorp/nomad/ci"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration"
|
||||
regMock "github.com/hashicorp/nomad/client/serviceregistration/mock"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
|
||||
regmock "github.com/hashicorp/nomad/client/serviceregistration/mock"
|
||||
"github.com/hashicorp/nomad/client/state"
|
||||
cstructs "github.com/hashicorp/nomad/client/structs"
|
||||
"github.com/hashicorp/nomad/helper/testlog"
|
||||
"github.com/hashicorp/nomad/nomad/mock"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
"github.com/hashicorp/nomad/testutil"
|
||||
"github.com/shoenig/test/must"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestTracker_Checks_Healthy(t *testing.T) {
|
||||
func TestTracker_ConsulChecks_Healthy(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
alloc := mock.Alloc()
|
||||
|
@ -60,7 +63,7 @@ func TestTracker_Checks_Healthy(t *testing.T) {
|
|||
|
||||
// Don't reply on the first call
|
||||
var called uint64
|
||||
consul := regMock.NewServiceRegistrationHandler(logger)
|
||||
consul := regmock.NewServiceRegistrationHandler(logger)
|
||||
consul.AllocRegistrationsFn = func(string) (*serviceregistration.AllocRegistration, error) {
|
||||
if atomic.AddUint64(&called, 1) == 1 {
|
||||
return nil, nil
|
||||
|
@ -76,9 +79,9 @@ func TestTracker_Checks_Healthy(t *testing.T) {
|
|||
ctx, cancelFn := context.WithCancel(context.Background())
|
||||
defer cancelFn()
|
||||
|
||||
checks := checkstore.NewStore(logger, state.NewMemDB(logger))
|
||||
checkInterval := 10 * time.Millisecond
|
||||
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul,
|
||||
time.Millisecond, true)
|
||||
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true)
|
||||
tracker.checkLookupInterval = checkInterval
|
||||
tracker.Start()
|
||||
|
||||
|
@ -90,6 +93,143 @@ func TestTracker_Checks_Healthy(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestTracker_NomadChecks_Healthy(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
alloc := mock.Alloc()
|
||||
alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up
|
||||
alloc.Job.TaskGroups[0].Tasks[0].Services[0].Provider = "nomad"
|
||||
|
||||
logger := testlog.HCLogger(t)
|
||||
b := cstructs.NewAllocBroadcaster(logger)
|
||||
defer b.Close()
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
// Synthesize running alloc and tasks
|
||||
alloc.ClientStatus = structs.AllocClientStatusRunning
|
||||
alloc.TaskStates = map[string]*structs.TaskState{
|
||||
alloc.Job.TaskGroups[0].Tasks[0].Name: {
|
||||
State: structs.TaskStateRunning,
|
||||
StartedAt: time.Now(),
|
||||
},
|
||||
}
|
||||
|
||||
checks := checkstore.NewStore(logger, state.NewMemDB(logger))
|
||||
err := checks.Set(alloc.ID, &structs.CheckQueryResult{
|
||||
ID: "abc123",
|
||||
Mode: "healthiness",
|
||||
Status: "pending",
|
||||
Output: "nomad: waiting to run",
|
||||
Timestamp: time.Now().Unix(),
|
||||
Group: alloc.TaskGroup,
|
||||
Task: alloc.Job.TaskGroups[0].Tasks[0].Name,
|
||||
Service: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Name,
|
||||
Check: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Checks[0].Name,
|
||||
})
|
||||
must.NoError(t, err)
|
||||
|
||||
consul := regmock.NewServiceRegistrationHandler(logger)
|
||||
checkInterval := 10 * time.Millisecond
|
||||
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true)
|
||||
tracker.checkLookupInterval = checkInterval
|
||||
tracker.Start()
|
||||
|
||||
go func() {
|
||||
// wait a bit then update the check to passing
|
||||
time.Sleep(15 * time.Millisecond)
|
||||
must.NoError(t, checks.Set(alloc.ID, &structs.CheckQueryResult{
|
||||
ID: "abc123",
|
||||
Mode: "healthiness",
|
||||
Status: "success",
|
||||
Output: "nomad: http ok",
|
||||
Timestamp: time.Now().Unix(),
|
||||
Group: alloc.TaskGroup,
|
||||
Task: alloc.Job.TaskGroups[0].Tasks[0].Name,
|
||||
Service: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Name,
|
||||
Check: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Checks[0].Name,
|
||||
}))
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-time.After(4 * checkInterval):
|
||||
t.Fatalf("timed out while waiting for success")
|
||||
case healthy := <-tracker.HealthyCh():
|
||||
must.True(t, healthy)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTracker_NomadChecks_Unhealthy(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
alloc := mock.Alloc()
|
||||
alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up
|
||||
alloc.Job.TaskGroups[0].Tasks[0].Services[0].Provider = "nomad"
|
||||
|
||||
logger := testlog.HCLogger(t)
|
||||
b := cstructs.NewAllocBroadcaster(logger)
|
||||
defer b.Close()
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
// Synthesize running alloc and tasks
|
||||
alloc.ClientStatus = structs.AllocClientStatusRunning
|
||||
alloc.TaskStates = map[string]*structs.TaskState{
|
||||
alloc.Job.TaskGroups[0].Tasks[0].Name: {
|
||||
State: structs.TaskStateRunning,
|
||||
StartedAt: time.Now(),
|
||||
},
|
||||
}
|
||||
|
||||
checks := checkstore.NewStore(logger, state.NewMemDB(logger))
|
||||
err := checks.Set(alloc.ID, &structs.CheckQueryResult{
|
||||
ID: "abc123",
|
||||
Mode: "healthiness",
|
||||
Status: "pending", // start out pending
|
||||
Output: "nomad: waiting to run",
|
||||
Timestamp: time.Now().Unix(),
|
||||
Group: alloc.TaskGroup,
|
||||
Task: alloc.Job.TaskGroups[0].Tasks[0].Name,
|
||||
Service: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Name,
|
||||
Check: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Checks[0].Name,
|
||||
})
|
||||
must.NoError(t, err)
|
||||
|
||||
consul := regmock.NewServiceRegistrationHandler(logger)
|
||||
checkInterval := 10 * time.Millisecond
|
||||
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true)
|
||||
tracker.checkLookupInterval = checkInterval
|
||||
tracker.Start()
|
||||
|
||||
go func() {
|
||||
// wait a bit then update the check to failing
|
||||
time.Sleep(15 * time.Millisecond)
|
||||
must.NoError(t, checks.Set(alloc.ID, &structs.CheckQueryResult{
|
||||
ID: "abc123",
|
||||
Mode: "healthiness",
|
||||
Status: "failing",
|
||||
Output: "connection refused",
|
||||
Timestamp: time.Now().Unix(),
|
||||
Group: alloc.TaskGroup,
|
||||
Task: alloc.Job.TaskGroups[0].Tasks[0].Name,
|
||||
Service: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Name,
|
||||
Check: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Checks[0].Name,
|
||||
}))
|
||||
}()
|
||||
|
||||
// make sure we are always unhealthy across 4 check intervals
|
||||
for i := 0; i < 4; i++ {
|
||||
<-time.After(checkInterval)
|
||||
select {
|
||||
case <-tracker.HealthyCh():
|
||||
t.Fatalf("should not receive on healthy chan with failing check")
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestTracker_Checks_PendingPostStop_Healthy(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
|
@ -112,13 +252,13 @@ func TestTracker_Checks_PendingPostStop_Healthy(t *testing.T) {
|
|||
b := cstructs.NewAllocBroadcaster(logger)
|
||||
defer b.Close()
|
||||
|
||||
consul := regMock.NewServiceRegistrationHandler(logger)
|
||||
consul := regmock.NewServiceRegistrationHandler(logger)
|
||||
ctx, cancelFn := context.WithCancel(context.Background())
|
||||
defer cancelFn()
|
||||
|
||||
checks := checkstore.NewStore(logger, state.NewMemDB(logger))
|
||||
checkInterval := 10 * time.Millisecond
|
||||
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul,
|
||||
time.Millisecond, true)
|
||||
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true)
|
||||
tracker.checkLookupInterval = checkInterval
|
||||
tracker.Start()
|
||||
|
||||
|
@ -153,13 +293,13 @@ func TestTracker_Succeeded_PostStart_Healthy(t *testing.T) {
|
|||
b := cstructs.NewAllocBroadcaster(logger)
|
||||
defer b.Close()
|
||||
|
||||
consul := regMock.NewServiceRegistrationHandler(logger)
|
||||
consul := regmock.NewServiceRegistrationHandler(logger)
|
||||
ctx, cancelFn := context.WithCancel(context.Background())
|
||||
defer cancelFn()
|
||||
|
||||
checks := checkstore.NewStore(logger, state.NewMemDB(logger))
|
||||
checkInterval := 10 * time.Millisecond
|
||||
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul,
|
||||
alloc.Job.TaskGroups[0].Migrate.MinHealthyTime, true)
|
||||
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, alloc.Job.TaskGroups[0].Migrate.MinHealthyTime, true)
|
||||
tracker.checkLookupInterval = checkInterval
|
||||
tracker.Start()
|
||||
|
||||
|
@ -171,7 +311,7 @@ func TestTracker_Succeeded_PostStart_Healthy(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestTracker_Checks_Unhealthy(t *testing.T) {
|
||||
func TestTracker_ConsulChecks_Unhealthy(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
alloc := mock.Alloc()
|
||||
|
@ -220,7 +360,7 @@ func TestTracker_Checks_Unhealthy(t *testing.T) {
|
|||
|
||||
// Don't reply on the first call
|
||||
var called uint64
|
||||
consul := regMock.NewServiceRegistrationHandler(logger)
|
||||
consul := regmock.NewServiceRegistrationHandler(logger)
|
||||
consul.AllocRegistrationsFn = func(string) (*serviceregistration.AllocRegistration, error) {
|
||||
if atomic.AddUint64(&called, 1) == 1 {
|
||||
return nil, nil
|
||||
|
@ -236,9 +376,9 @@ func TestTracker_Checks_Unhealthy(t *testing.T) {
|
|||
ctx, cancelFn := context.WithCancel(context.Background())
|
||||
defer cancelFn()
|
||||
|
||||
checks := checkstore.NewStore(logger, state.NewMemDB(logger))
|
||||
checkInterval := 10 * time.Millisecond
|
||||
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul,
|
||||
time.Millisecond, true)
|
||||
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true)
|
||||
tracker.checkLookupInterval = checkInterval
|
||||
tracker.Start()
|
||||
|
||||
|
@ -249,9 +389,9 @@ func TestTracker_Checks_Unhealthy(t *testing.T) {
|
|||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
tracker.l.Lock()
|
||||
tracker.lock.Lock()
|
||||
require.False(t, tracker.checksHealthy)
|
||||
tracker.l.Unlock()
|
||||
tracker.lock.Unlock()
|
||||
|
||||
select {
|
||||
case v := <-tracker.HealthyCh():
|
||||
|
@ -270,8 +410,7 @@ func TestTracker_Healthy_IfBothTasksAndConsulChecksAreHealthy(t *testing.T) {
|
|||
ctx, cancelFn := context.WithCancel(context.Background())
|
||||
defer cancelFn()
|
||||
|
||||
tracker := NewTracker(ctx, logger, alloc, nil, nil,
|
||||
time.Millisecond, true)
|
||||
tracker := NewTracker(ctx, logger, alloc, nil, nil, nil, time.Millisecond, true)
|
||||
|
||||
assertNoHealth := func() {
|
||||
require.NoError(t, tracker.ctx.Err())
|
||||
|
@ -362,7 +501,7 @@ func TestTracker_Checks_Healthy_Before_TaskHealth(t *testing.T) {
|
|||
|
||||
// Don't reply on the first call
|
||||
var called uint64
|
||||
consul := regMock.NewServiceRegistrationHandler(logger)
|
||||
consul := regmock.NewServiceRegistrationHandler(logger)
|
||||
consul.AllocRegistrationsFn = func(string) (*serviceregistration.AllocRegistration, error) {
|
||||
if atomic.AddUint64(&called, 1) == 1 {
|
||||
return nil, nil
|
||||
|
@ -378,9 +517,9 @@ func TestTracker_Checks_Healthy_Before_TaskHealth(t *testing.T) {
|
|||
ctx, cancelFn := context.WithCancel(context.Background())
|
||||
defer cancelFn()
|
||||
|
||||
checks := checkstore.NewStore(logger, state.NewMemDB(logger))
|
||||
checkInterval := 10 * time.Millisecond
|
||||
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul,
|
||||
time.Millisecond, true)
|
||||
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true)
|
||||
tracker.checkLookupInterval = checkInterval
|
||||
tracker.Start()
|
||||
|
||||
|
@ -419,7 +558,7 @@ func TestTracker_Checks_Healthy_Before_TaskHealth(t *testing.T) {
|
|||
|
||||
}
|
||||
|
||||
func TestTracker_Checks_OnUpdate(t *testing.T) {
|
||||
func TestTracker_ConsulChecks_OnUpdate(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
cases := []struct {
|
||||
|
@ -504,7 +643,7 @@ func TestTracker_Checks_OnUpdate(t *testing.T) {
|
|||
|
||||
// Don't reply on the first call
|
||||
var called uint64
|
||||
consul := regMock.NewServiceRegistrationHandler(logger)
|
||||
consul := regmock.NewServiceRegistrationHandler(logger)
|
||||
consul.AllocRegistrationsFn = func(string) (*serviceregistration.AllocRegistration, error) {
|
||||
if atomic.AddUint64(&called, 1) == 1 {
|
||||
return nil, nil
|
||||
|
@ -520,9 +659,9 @@ func TestTracker_Checks_OnUpdate(t *testing.T) {
|
|||
ctx, cancelFn := context.WithCancel(context.Background())
|
||||
defer cancelFn()
|
||||
|
||||
checks := checkstore.NewStore(logger, state.NewMemDB(logger))
|
||||
checkInterval := 10 * time.Millisecond
|
||||
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul,
|
||||
time.Millisecond, true)
|
||||
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true)
|
||||
tracker.checkLookupInterval = checkInterval
|
||||
tracker.Start()
|
||||
|
||||
|
@ -548,3 +687,120 @@ func TestTracker_Checks_OnUpdate(t *testing.T) {
|
|||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestTracker_NomadChecks_OnUpdate(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
checkMode structs.CheckMode
|
||||
checkResult structs.CheckStatus
|
||||
expectedPass bool
|
||||
}{
|
||||
{
|
||||
name: "mode is healthiness and check is healthy",
|
||||
checkMode: structs.Healthiness,
|
||||
checkResult: structs.CheckSuccess,
|
||||
expectedPass: true,
|
||||
},
|
||||
{
|
||||
name: "mode is healthiness and check is unhealthy",
|
||||
checkMode: structs.Healthiness,
|
||||
checkResult: structs.CheckFailure,
|
||||
expectedPass: false,
|
||||
},
|
||||
{
|
||||
name: "mode is readiness and check is healthy",
|
||||
checkMode: structs.Readiness,
|
||||
checkResult: structs.CheckSuccess,
|
||||
expectedPass: true,
|
||||
},
|
||||
{
|
||||
name: "mode is readiness and check is healthy",
|
||||
checkMode: structs.Readiness,
|
||||
checkResult: structs.CheckFailure,
|
||||
expectedPass: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
alloc := mock.Alloc()
|
||||
alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up
|
||||
alloc.Job.TaskGroups[0].Tasks[0].Services[0].Provider = "nomad"
|
||||
|
||||
logger := testlog.HCLogger(t)
|
||||
b := cstructs.NewAllocBroadcaster(logger)
|
||||
defer b.Close()
|
||||
|
||||
// Synthesize running alloc and tasks
|
||||
alloc.ClientStatus = structs.AllocClientStatusRunning
|
||||
alloc.TaskStates = map[string]*structs.TaskState{
|
||||
alloc.Job.TaskGroups[0].Tasks[0].Name: {
|
||||
State: structs.TaskStateRunning,
|
||||
StartedAt: time.Now(),
|
||||
},
|
||||
}
|
||||
|
||||
// Set a check that is pending
|
||||
checks := checkstore.NewStore(logger, state.NewMemDB(logger))
|
||||
err := checks.Set(alloc.ID, &structs.CheckQueryResult{
|
||||
ID: "abc123",
|
||||
Mode: tc.checkMode,
|
||||
Status: structs.CheckPending,
|
||||
Output: "nomad: waiting to run",
|
||||
Timestamp: time.Now().Unix(),
|
||||
Group: alloc.TaskGroup,
|
||||
Task: alloc.Job.TaskGroups[0].Tasks[0].Name,
|
||||
Service: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Name,
|
||||
Check: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Checks[0].Name,
|
||||
})
|
||||
must.NoError(t, err)
|
||||
|
||||
go func() {
|
||||
// wait a bit then update the check to passing
|
||||
time.Sleep(15 * time.Millisecond)
|
||||
must.NoError(t, checks.Set(alloc.ID, &structs.CheckQueryResult{
|
||||
ID: "abc123",
|
||||
Mode: tc.checkMode,
|
||||
Status: tc.checkResult,
|
||||
Output: "some output",
|
||||
Timestamp: time.Now().Unix(),
|
||||
Group: alloc.TaskGroup,
|
||||
Task: alloc.Job.TaskGroups[0].Tasks[0].Name,
|
||||
Service: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Name,
|
||||
Check: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Checks[0].Name,
|
||||
}))
|
||||
}()
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
consul := regmock.NewServiceRegistrationHandler(logger)
|
||||
minHealthyTime := 1 * time.Millisecond
|
||||
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, minHealthyTime, true)
|
||||
tracker.checkLookupInterval = 10 * time.Millisecond
|
||||
tracker.Start()
|
||||
|
||||
select {
|
||||
case <-time.After(8 * tracker.checkLookupInterval):
|
||||
if !tc.expectedPass {
|
||||
// tracker should still be running
|
||||
must.NoError(t, tracker.ctx.Err())
|
||||
return
|
||||
}
|
||||
t.Fatal("timed out while waiting for health")
|
||||
case h := <-tracker.HealthyCh():
|
||||
require.True(t, h)
|
||||
}
|
||||
|
||||
// For healthy checks, the tracker should stop watching
|
||||
select {
|
||||
case <-tracker.ctx.Done():
|
||||
// Ok, tracker should exit after reporting healthy
|
||||
default:
|
||||
t.Fatal("expected tracker to exit after reporting healthy")
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ import (
|
|||
"github.com/hashicorp/nomad/client/pluginmanager/csimanager"
|
||||
"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/wrapper"
|
||||
cstate "github.com/hashicorp/nomad/client/state"
|
||||
cstructs "github.com/hashicorp/nomad/client/structs"
|
||||
|
@ -41,6 +42,7 @@ type allocRunner struct {
|
|||
// Logger is the logger for the alloc runner.
|
||||
logger log.Logger
|
||||
|
||||
// clientConfig is the client configuration block.
|
||||
clientConfig *config.Config
|
||||
|
||||
// stateUpdater is used to emit updated alloc state
|
||||
|
@ -183,6 +185,9 @@ type allocRunner struct {
|
|||
// to perform service and check registration and deregistration.
|
||||
serviceRegWrapper *wrapper.HandlerWrapper
|
||||
|
||||
// checkStore contains check status information
|
||||
checkStore checkstore.Shim
|
||||
|
||||
// getter is an interface for retrieving artifacts.
|
||||
getter cinterfaces.ArtifactGetter
|
||||
}
|
||||
|
@ -229,6 +234,7 @@ func NewAllocRunner(config *Config) (*allocRunner, error) {
|
|||
serversContactedCh: config.ServersContactedCh,
|
||||
rpcClient: config.RPCClient,
|
||||
serviceRegWrapper: config.ServiceRegWrapper,
|
||||
checkStore: config.CheckStore,
|
||||
getter: config.Getter,
|
||||
}
|
||||
|
||||
|
|
|
@ -149,21 +149,22 @@ func (ar *allocRunner) initRunnerHooks(config *clientconfig.Config) error {
|
|||
newCgroupHook(ar.Alloc(), ar.cpusetManager),
|
||||
newUpstreamAllocsHook(hookLogger, ar.prevAllocWatcher),
|
||||
newDiskMigrationHook(hookLogger, ar.prevAllocMigrator, ar.allocDir),
|
||||
newAllocHealthWatcherHook(hookLogger, alloc, hs, ar.Listener(), ar.consulClient),
|
||||
newAllocHealthWatcherHook(hookLogger, alloc, hs, ar.Listener(), ar.consulClient, ar.checkStore),
|
||||
newNetworkHook(hookLogger, ns, alloc, nm, nc, ar, builtTaskEnv),
|
||||
newGroupServiceHook(groupServiceHookConfig{
|
||||
alloc: alloc,
|
||||
namespace: alloc.ServiceProviderNamespace(),
|
||||
serviceRegWrapper: ar.serviceRegWrapper,
|
||||
restarter: ar,
|
||||
taskEnvBuilder: envBuilder,
|
||||
networkStatusGetter: ar,
|
||||
logger: hookLogger,
|
||||
shutdownDelayCtx: ar.shutdownDelayCtx,
|
||||
alloc: alloc,
|
||||
namespace: alloc.ServiceProviderNamespace(),
|
||||
serviceRegWrapper: ar.serviceRegWrapper,
|
||||
restarter: ar,
|
||||
taskEnvBuilder: envBuilder,
|
||||
networkStatus: ar,
|
||||
logger: hookLogger,
|
||||
shutdownDelayCtx: ar.shutdownDelayCtx,
|
||||
}),
|
||||
newConsulGRPCSocketHook(hookLogger, alloc, ar.allocDir, config.ConsulConfig),
|
||||
newConsulHTTPSocketHook(hookLogger, alloc, ar.allocDir, config.ConsulConfig),
|
||||
newCSIHook(alloc, hookLogger, ar.csiManager, ar.rpcClient, ar, hrs, ar.clientConfig.Node.SecretID),
|
||||
newChecksHook(hookLogger, alloc, ar.checkStore, ar),
|
||||
}
|
||||
|
||||
return nil
|
||||
|
|
|
@ -1076,10 +1076,10 @@ func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) {
|
|||
|
||||
// Assert that we have an event explaining why we are unhealthy.
|
||||
require.Len(t, lastUpdate.TaskStates, 1)
|
||||
state := lastUpdate.TaskStates[task.Name]
|
||||
require.NotNil(t, state)
|
||||
require.NotEmpty(t, state.Events)
|
||||
last := state.Events[len(state.Events)-1]
|
||||
taskState := lastUpdate.TaskStates[task.Name]
|
||||
require.NotNil(t, taskState)
|
||||
require.NotEmpty(t, taskState.Events)
|
||||
last := taskState.Events[len(taskState.Events)-1]
|
||||
require.Equal(t, allochealth.AllocHealthEventSource, last.Type)
|
||||
require.Contains(t, last.Message, "by healthy_deadline")
|
||||
}
|
||||
|
|
|
@ -0,0 +1,270 @@
|
|||
package allocrunner
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/checks"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
|
||||
const (
|
||||
// checksHookName is the name of this hook as appears in logs
|
||||
checksHookName = "checks_hook"
|
||||
)
|
||||
|
||||
// observers maintains a map from check_id -> observer for a particular check. Each
|
||||
// observer in the map must share the same context.
|
||||
type observers map[structs.CheckID]*observer
|
||||
|
||||
// An observer is used to execute a particular check on its interval and update the
|
||||
// check store with those results.
|
||||
type observer struct {
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
checker checks.Checker
|
||||
checkStore checkstore.Shim
|
||||
|
||||
qc *checks.QueryContext
|
||||
check *structs.ServiceCheck
|
||||
allocID string
|
||||
}
|
||||
|
||||
// start checking our check on its interval
|
||||
func (o *observer) start() {
|
||||
// compromise between immediate (too early) and waiting full interval (slow)
|
||||
firstWait := o.check.Interval / 2
|
||||
|
||||
timer, cancel := helper.NewSafeTimer(firstWait)
|
||||
defer cancel()
|
||||
|
||||
for {
|
||||
select {
|
||||
|
||||
// exit the observer
|
||||
case <-o.ctx.Done():
|
||||
return
|
||||
|
||||
// time to execute the check
|
||||
case <-timer.C:
|
||||
query := checks.GetCheckQuery(o.check)
|
||||
result := o.checker.Do(o.ctx, o.qc, query)
|
||||
|
||||
// and put the results into the store
|
||||
_ = o.checkStore.Set(o.allocID, result)
|
||||
|
||||
// setup timer for next interval
|
||||
timer.Reset(o.check.Interval)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// stop checking our check - this will also interrupt an in-progress execution
|
||||
func (o *observer) stop() {
|
||||
o.cancel()
|
||||
}
|
||||
|
||||
// checksHook manages checks of Nomad service registrations, at both the group and
|
||||
// task level, by storing / removing them from the Client state store.
|
||||
//
|
||||
// Does not manage Consul service checks; see groupServiceHook instead.
|
||||
type checksHook struct {
|
||||
logger hclog.Logger
|
||||
network structs.NetworkStatus
|
||||
shim checkstore.Shim
|
||||
checker checks.Checker
|
||||
allocID string
|
||||
|
||||
// fields that get re-initialized on allocation update
|
||||
lock sync.RWMutex
|
||||
ctx context.Context
|
||||
stop func()
|
||||
observers observers
|
||||
alloc *structs.Allocation
|
||||
}
|
||||
|
||||
func newChecksHook(
|
||||
logger hclog.Logger,
|
||||
alloc *structs.Allocation,
|
||||
shim checkstore.Shim,
|
||||
network structs.NetworkStatus,
|
||||
) *checksHook {
|
||||
h := &checksHook{
|
||||
logger: logger.Named(checksHookName),
|
||||
allocID: alloc.ID,
|
||||
alloc: alloc,
|
||||
shim: shim,
|
||||
network: network,
|
||||
checker: checks.New(logger),
|
||||
}
|
||||
h.initialize(alloc)
|
||||
return h
|
||||
}
|
||||
|
||||
// initialize the dynamic fields of checksHook, which is to say setup all the
|
||||
// observers and query context things associated with the alloc.
|
||||
//
|
||||
// Should be called during initial setup only.
|
||||
func (h *checksHook) initialize(alloc *structs.Allocation) {
|
||||
h.lock.Lock()
|
||||
defer h.lock.Unlock()
|
||||
|
||||
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
|
||||
if tg == nil {
|
||||
return
|
||||
}
|
||||
|
||||
// fresh context and stop function for this allocation
|
||||
h.ctx, h.stop = context.WithCancel(context.Background())
|
||||
|
||||
// fresh set of observers
|
||||
h.observers = make(observers)
|
||||
|
||||
// set the initial alloc
|
||||
h.alloc = alloc
|
||||
}
|
||||
|
||||
// observe will create the observer for each service in services.
|
||||
// services must use only nomad service provider.
|
||||
//
|
||||
// Caller must hold h.lock.
|
||||
func (h *checksHook) observe(alloc *structs.Allocation, services []*structs.Service) {
|
||||
var ports structs.AllocatedPorts
|
||||
var networks structs.Networks
|
||||
if alloc.AllocatedResources != nil {
|
||||
ports = alloc.AllocatedResources.Shared.Ports
|
||||
networks = alloc.AllocatedResources.Shared.Networks
|
||||
}
|
||||
|
||||
for _, service := range services {
|
||||
for _, check := range service.Checks {
|
||||
|
||||
// remember the initialization time
|
||||
now := time.Now().UTC().Unix()
|
||||
|
||||
// create the deterministic check id for this check
|
||||
id := structs.NomadCheckID(alloc.ID, alloc.TaskGroup, check)
|
||||
|
||||
// an observer for this check already exists
|
||||
if _, exists := h.observers[id]; exists {
|
||||
continue
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(h.ctx)
|
||||
|
||||
// create the observer for this check
|
||||
h.observers[id] = &observer{
|
||||
ctx: ctx,
|
||||
cancel: cancel,
|
||||
check: check.Copy(),
|
||||
checkStore: h.shim,
|
||||
checker: h.checker,
|
||||
allocID: h.allocID,
|
||||
qc: &checks.QueryContext{
|
||||
ID: id,
|
||||
CustomAddress: service.Address,
|
||||
ServicePortLabel: service.PortLabel,
|
||||
Ports: ports,
|
||||
Networks: networks,
|
||||
NetworkStatus: h.network,
|
||||
Group: alloc.Name,
|
||||
Task: service.TaskName,
|
||||
Service: service.Name,
|
||||
Check: check.Name,
|
||||
},
|
||||
}
|
||||
|
||||
// insert a pending result into state store for each check
|
||||
result := checks.Stub(id, structs.GetCheckMode(check), now, alloc.Name, service.TaskName, service.Name, check.Name)
|
||||
if err := h.shim.Set(h.allocID, result); err != nil {
|
||||
h.logger.Error("failed to set initial check status", "id", h.allocID, "error", err)
|
||||
continue
|
||||
}
|
||||
|
||||
// start the observer
|
||||
go h.observers[id].start()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (h *checksHook) Name() string {
|
||||
return checksHookName
|
||||
}
|
||||
|
||||
func (h *checksHook) Prerun() error {
|
||||
h.lock.Lock()
|
||||
defer h.lock.Unlock()
|
||||
|
||||
group := h.alloc.Job.LookupTaskGroup(h.alloc.TaskGroup)
|
||||
if group == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// create and start observers of nomad service checks in alloc
|
||||
h.observe(h.alloc, group.NomadServices())
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *checksHook) Update(request *interfaces.RunnerUpdateRequest) error {
|
||||
h.lock.Lock()
|
||||
defer h.lock.Unlock()
|
||||
|
||||
group := request.Alloc.Job.LookupTaskGroup(request.Alloc.TaskGroup)
|
||||
if group == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// get all group and task level services using nomad provider
|
||||
services := group.NomadServices()
|
||||
|
||||
// create a set of the updated set of checks
|
||||
next := make([]structs.CheckID, 0, len(h.observers))
|
||||
for _, service := range services {
|
||||
for _, check := range service.Checks {
|
||||
next = append(next, structs.NomadCheckID(
|
||||
request.Alloc.ID,
|
||||
request.Alloc.TaskGroup,
|
||||
check,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
// stop the observers of the checks we are removing
|
||||
remove := h.shim.Difference(request.Alloc.ID, next)
|
||||
for _, id := range remove {
|
||||
h.observers[id].stop()
|
||||
delete(h.observers, id)
|
||||
}
|
||||
|
||||
// remove checks that are no longer part of the allocation
|
||||
if err := h.shim.Remove(request.Alloc.ID, remove); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// remember this new alloc
|
||||
h.alloc = request.Alloc
|
||||
|
||||
// ensure we are observing new checks (idempotent)
|
||||
h.observe(request.Alloc, services)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *checksHook) PreKill() {
|
||||
h.lock.Lock()
|
||||
defer h.lock.Unlock()
|
||||
|
||||
// terminate our hook context, which threads down to all observers
|
||||
h.stop()
|
||||
|
||||
// purge all checks for this allocation from the client state store
|
||||
if err := h.shim.Purge(h.allocID); err != nil {
|
||||
h.logger.Error("failed to purge check results", "alloc_id", h.allocID, "error", err)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,312 @@
|
|||
package allocrunner
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/nomad/ci"
|
||||
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
|
||||
"github.com/hashicorp/nomad/client/state"
|
||||
"github.com/hashicorp/nomad/helper/testlog"
|
||||
"github.com/hashicorp/nomad/nomad/mock"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
"github.com/hashicorp/nomad/testutil"
|
||||
"github.com/shoenig/test/must"
|
||||
)
|
||||
|
||||
var (
|
||||
_ interfaces.RunnerPrerunHook = (*checksHook)(nil)
|
||||
_ interfaces.RunnerUpdateHook = (*checksHook)(nil)
|
||||
_ interfaces.RunnerPreKillHook = (*checksHook)(nil)
|
||||
)
|
||||
|
||||
func makeCheckStore(logger hclog.Logger) checkstore.Shim {
|
||||
db := state.NewMemDB(logger)
|
||||
checkStore := checkstore.NewStore(logger, db)
|
||||
return checkStore
|
||||
}
|
||||
|
||||
func allocWithNomadChecks(addr, port string, onGroup bool) *structs.Allocation {
|
||||
alloc := mock.Alloc()
|
||||
group := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
|
||||
|
||||
task := "task-one"
|
||||
if onGroup {
|
||||
task = ""
|
||||
}
|
||||
|
||||
services := []*structs.Service{
|
||||
{
|
||||
Name: "service-one",
|
||||
TaskName: "web",
|
||||
PortLabel: port,
|
||||
AddressMode: "auto",
|
||||
Address: addr,
|
||||
Provider: "nomad",
|
||||
Checks: []*structs.ServiceCheck{
|
||||
{
|
||||
Name: "check-ok",
|
||||
Type: "http",
|
||||
Path: "/",
|
||||
Protocol: "http",
|
||||
PortLabel: port,
|
||||
AddressMode: "auto",
|
||||
Interval: 250 * time.Millisecond,
|
||||
Timeout: 1 * time.Second,
|
||||
Method: "GET",
|
||||
TaskName: task,
|
||||
},
|
||||
{
|
||||
Name: "check-error",
|
||||
Type: "http",
|
||||
Path: "/fail",
|
||||
Protocol: "http",
|
||||
PortLabel: port,
|
||||
AddressMode: "auto",
|
||||
Interval: 250 * time.Millisecond,
|
||||
Timeout: 1 * time.Second,
|
||||
Method: "GET",
|
||||
TaskName: task,
|
||||
},
|
||||
{
|
||||
Name: "check-hang",
|
||||
Type: "http",
|
||||
Path: "/hang",
|
||||
Protocol: "http",
|
||||
PortLabel: port,
|
||||
AddressMode: "auto",
|
||||
Interval: 250 * time.Millisecond,
|
||||
Timeout: 500 * time.Millisecond,
|
||||
Method: "GET",
|
||||
TaskName: task,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
switch onGroup {
|
||||
case true:
|
||||
group.Tasks[0].Services = nil
|
||||
group.Services = services
|
||||
case false:
|
||||
group.Services = nil
|
||||
group.Tasks[0].Services = services
|
||||
}
|
||||
return alloc
|
||||
}
|
||||
|
||||
func allocWithDifferentNomadChecks(id, addr, port string) *structs.Allocation {
|
||||
alloc := allocWithNomadChecks(addr, port, true)
|
||||
alloc.ID = id
|
||||
group := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
|
||||
|
||||
group.Services[0].Checks[2].Path = "/" // the hanging check is now ok
|
||||
|
||||
// append 4th check, this one is failing
|
||||
group.Services[0].Checks = append(group.Services[0].Checks, &structs.ServiceCheck{
|
||||
Name: "check-error-2",
|
||||
Type: "http",
|
||||
Path: "/fail",
|
||||
Protocol: "http",
|
||||
PortLabel: port,
|
||||
AddressMode: "auto",
|
||||
Interval: 250 * time.Millisecond,
|
||||
Timeout: 1 * time.Second,
|
||||
Method: "GET",
|
||||
})
|
||||
return alloc
|
||||
}
|
||||
|
||||
var checkHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch r.URL.Path {
|
||||
case "/fail":
|
||||
w.WriteHeader(500)
|
||||
_, _ = io.WriteString(w, "500 problem")
|
||||
case "/hang":
|
||||
time.Sleep(2 * time.Second)
|
||||
_, _ = io.WriteString(w, "too slow")
|
||||
default:
|
||||
w.WriteHeader(200)
|
||||
_, _ = io.WriteString(w, "200 ok")
|
||||
}
|
||||
})
|
||||
|
||||
func TestCheckHook_Checks_ResultsSet(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
logger := testlog.HCLogger(t)
|
||||
|
||||
// create an http server with various responses
|
||||
ts := httptest.NewServer(checkHandler)
|
||||
defer ts.Close()
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
onGroup bool
|
||||
}{
|
||||
{name: "group-level", onGroup: true},
|
||||
{name: "task-level", onGroup: false},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
checkStore := makeCheckStore(logger)
|
||||
|
||||
// get the address and port for http server
|
||||
tokens := strings.Split(ts.URL, ":")
|
||||
addr, port := strings.TrimPrefix(tokens[1], "//"), tokens[2]
|
||||
|
||||
network := mock.NewNetworkStatus(addr)
|
||||
|
||||
alloc := allocWithNomadChecks(addr, port, tc.onGroup)
|
||||
|
||||
h := newChecksHook(logger, alloc, checkStore, network)
|
||||
|
||||
// initialize is called; observers are created but not started yet
|
||||
must.MapEmpty(t, h.observers)
|
||||
|
||||
// calling pre-run starts the observers
|
||||
err := h.Prerun()
|
||||
must.NoError(t, err)
|
||||
|
||||
testutil.WaitForResultUntil(
|
||||
2*time.Second,
|
||||
func() (bool, error) {
|
||||
results := checkStore.List(alloc.ID)
|
||||
passing, failing, pending := 0, 0, 0
|
||||
for _, result := range results {
|
||||
switch result.Status {
|
||||
case structs.CheckSuccess:
|
||||
passing++
|
||||
case structs.CheckFailure:
|
||||
failing++
|
||||
case structs.CheckPending:
|
||||
pending++
|
||||
}
|
||||
}
|
||||
if passing != 1 || failing != 2 || pending != 0 {
|
||||
fmt.Printf("results %v\n", results)
|
||||
return false, fmt.Errorf(
|
||||
"expected 1 passing, 2 failing, 0 pending, got %d passing, %d failing, %d pending",
|
||||
passing, failing, pending,
|
||||
)
|
||||
}
|
||||
return true, nil
|
||||
},
|
||||
func(err error) {
|
||||
t.Fatalf(err.Error())
|
||||
},
|
||||
)
|
||||
|
||||
h.PreKill() // stop observers, cleanup
|
||||
|
||||
// assert shim no longer contains results for the alloc
|
||||
results := checkStore.List(alloc.ID)
|
||||
must.MapEmpty(t, results)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckHook_Checks_UpdateSet(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
logger := testlog.HCLogger(t)
|
||||
|
||||
// create an http server with various responses
|
||||
ts := httptest.NewServer(checkHandler)
|
||||
defer ts.Close()
|
||||
|
||||
// get the address and port for http server
|
||||
tokens := strings.Split(ts.URL, ":")
|
||||
addr, port := strings.TrimPrefix(tokens[1], "//"), tokens[2]
|
||||
|
||||
shim := makeCheckStore(logger)
|
||||
|
||||
network := mock.NewNetworkStatus(addr)
|
||||
|
||||
alloc := allocWithNomadChecks(addr, port, true)
|
||||
|
||||
h := newChecksHook(logger, alloc, shim, network)
|
||||
|
||||
// calling pre-run starts the observers
|
||||
err := h.Prerun()
|
||||
must.NoError(t, err)
|
||||
|
||||
// initial set of checks
|
||||
testutil.WaitForResultUntil(
|
||||
2*time.Second,
|
||||
func() (bool, error) {
|
||||
results := shim.List(alloc.ID)
|
||||
passing, failing, pending := 0, 0, 0
|
||||
for _, result := range results {
|
||||
switch result.Status {
|
||||
case structs.CheckSuccess:
|
||||
passing++
|
||||
case structs.CheckFailure:
|
||||
failing++
|
||||
case structs.CheckPending:
|
||||
pending++
|
||||
}
|
||||
}
|
||||
if passing != 1 || failing != 2 || pending != 0 {
|
||||
fmt.Printf("results %v\n", results)
|
||||
return false, fmt.Errorf(
|
||||
"(initial set) expected 1 passing, 2 failing, 0 pending, got %d passing, %d failing, %d pending",
|
||||
passing, failing, pending,
|
||||
)
|
||||
}
|
||||
return true, nil
|
||||
},
|
||||
func(err error) {
|
||||
t.Fatalf(err.Error())
|
||||
},
|
||||
)
|
||||
|
||||
request := &interfaces.RunnerUpdateRequest{
|
||||
Alloc: allocWithDifferentNomadChecks(alloc.ID, addr, port),
|
||||
}
|
||||
|
||||
err = h.Update(request)
|
||||
must.NoError(t, err)
|
||||
|
||||
// updated set of checks
|
||||
testutil.WaitForResultUntil(
|
||||
2*time.Second,
|
||||
func() (bool, error) {
|
||||
results := shim.List(alloc.ID)
|
||||
passing, failing, pending := 0, 0, 0
|
||||
for _, result := range results {
|
||||
switch result.Status {
|
||||
case structs.CheckSuccess:
|
||||
passing++
|
||||
case structs.CheckFailure:
|
||||
failing++
|
||||
case structs.CheckPending:
|
||||
pending++
|
||||
}
|
||||
}
|
||||
if passing != 2 || failing != 2 || pending != 0 {
|
||||
fmt.Printf("results %v\n", results)
|
||||
return false, fmt.Errorf(
|
||||
"(updated set) expected 2 passing, 2 failing, 0 pending, got %d passing, %d failing, %d pending",
|
||||
passing, failing, pending,
|
||||
)
|
||||
}
|
||||
return true, nil
|
||||
},
|
||||
func(err error) {
|
||||
t.Fatalf(err.Error())
|
||||
},
|
||||
)
|
||||
|
||||
h.PreKill() // stop observers, cleanup
|
||||
|
||||
// assert shim no longer contains results for the alloc
|
||||
results := shim.List(alloc.ID)
|
||||
must.MapEmpty(t, results)
|
||||
}
|
|
@ -12,6 +12,7 @@ import (
|
|||
"github.com/hashicorp/nomad/client/pluginmanager/csimanager"
|
||||
"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/wrapper"
|
||||
cstate "github.com/hashicorp/nomad/client/state"
|
||||
"github.com/hashicorp/nomad/client/vaultclient"
|
||||
|
@ -87,6 +88,9 @@ type Config struct {
|
|||
// to perform service and check registration and deregistration.
|
||||
ServiceRegWrapper *wrapper.HandlerWrapper
|
||||
|
||||
// CheckStore contains check result information.
|
||||
CheckStore checkstore.Shim
|
||||
|
||||
// Getter is an interface for retrieving artifacts.
|
||||
Getter interfaces.ArtifactGetter
|
||||
}
|
||||
|
|
|
@ -11,6 +11,7 @@ import (
|
|||
"github.com/hashicorp/nomad/client/serviceregistration/wrapper"
|
||||
"github.com/hashicorp/nomad/client/taskenv"
|
||||
agentconsul "github.com/hashicorp/nomad/command/agent/consul"
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
|
||||
|
@ -18,21 +19,17 @@ const (
|
|||
groupServiceHookName = "group_services"
|
||||
)
|
||||
|
||||
type networkStatusGetter interface {
|
||||
NetworkStatus() *structs.AllocNetworkStatus
|
||||
}
|
||||
|
||||
// groupServiceHook manages task group Consul service registration and
|
||||
// deregistration.
|
||||
type groupServiceHook struct {
|
||||
allocID string
|
||||
jobID string
|
||||
group string
|
||||
restarter agentconsul.WorkloadRestarter
|
||||
prerun bool
|
||||
deregistered bool
|
||||
networkStatusGetter networkStatusGetter
|
||||
shutdownDelayCtx context.Context
|
||||
allocID string
|
||||
jobID string
|
||||
group string
|
||||
restarter agentconsul.WorkloadRestarter
|
||||
prerun bool
|
||||
deregistered bool
|
||||
networkStatus structs.NetworkStatus
|
||||
shutdownDelayCtx context.Context
|
||||
|
||||
// namespace is the Nomad or Consul namespace in which service
|
||||
// registrations will be made. This field may be updated.
|
||||
|
@ -58,12 +55,12 @@ type groupServiceHook struct {
|
|||
}
|
||||
|
||||
type groupServiceHookConfig struct {
|
||||
alloc *structs.Allocation
|
||||
restarter agentconsul.WorkloadRestarter
|
||||
taskEnvBuilder *taskenv.Builder
|
||||
networkStatusGetter networkStatusGetter
|
||||
shutdownDelayCtx context.Context
|
||||
logger log.Logger
|
||||
alloc *structs.Allocation
|
||||
restarter agentconsul.WorkloadRestarter
|
||||
taskEnvBuilder *taskenv.Builder
|
||||
networkStatus structs.NetworkStatus
|
||||
shutdownDelayCtx context.Context
|
||||
logger log.Logger
|
||||
|
||||
// namespace is the Nomad or Consul namespace in which service
|
||||
// registrations will be made.
|
||||
|
@ -83,18 +80,18 @@ func newGroupServiceHook(cfg groupServiceHookConfig) *groupServiceHook {
|
|||
}
|
||||
|
||||
h := &groupServiceHook{
|
||||
allocID: cfg.alloc.ID,
|
||||
jobID: cfg.alloc.JobID,
|
||||
group: cfg.alloc.TaskGroup,
|
||||
restarter: cfg.restarter,
|
||||
namespace: cfg.namespace,
|
||||
taskEnvBuilder: cfg.taskEnvBuilder,
|
||||
delay: shutdownDelay,
|
||||
networkStatusGetter: cfg.networkStatusGetter,
|
||||
logger: cfg.logger.Named(groupServiceHookName),
|
||||
serviceRegWrapper: cfg.serviceRegWrapper,
|
||||
services: tg.Services,
|
||||
shutdownDelayCtx: cfg.shutdownDelayCtx,
|
||||
allocID: cfg.alloc.ID,
|
||||
jobID: cfg.alloc.JobID,
|
||||
group: cfg.alloc.TaskGroup,
|
||||
restarter: cfg.restarter,
|
||||
namespace: cfg.namespace,
|
||||
taskEnvBuilder: cfg.taskEnvBuilder,
|
||||
delay: shutdownDelay,
|
||||
networkStatus: cfg.networkStatus,
|
||||
logger: cfg.logger.Named(groupServiceHookName),
|
||||
serviceRegWrapper: cfg.serviceRegWrapper,
|
||||
services: tg.Services,
|
||||
shutdownDelayCtx: cfg.shutdownDelayCtx,
|
||||
}
|
||||
|
||||
if cfg.alloc.AllocatedResources != nil {
|
||||
|
@ -210,10 +207,13 @@ func (h *groupServiceHook) preKillLocked() {
|
|||
|
||||
h.logger.Debug("delay before killing tasks", "group", h.group, "shutdown_delay", h.delay)
|
||||
|
||||
timer, cancel := helper.NewSafeTimer(h.delay)
|
||||
defer cancel()
|
||||
|
||||
select {
|
||||
// Wait for specified shutdown_delay unless ignored
|
||||
// This will block an agent from shutting down.
|
||||
case <-time.After(h.delay):
|
||||
case <-timer.C:
|
||||
case <-h.shutdownDelayCtx.Done():
|
||||
}
|
||||
}
|
||||
|
@ -241,8 +241,8 @@ func (h *groupServiceHook) getWorkloadServices() *serviceregistration.WorkloadSe
|
|||
interpolatedServices := taskenv.InterpolateServices(h.taskEnvBuilder.Build(), h.services)
|
||||
|
||||
var netStatus *structs.AllocNetworkStatus
|
||||
if h.networkStatusGetter != nil {
|
||||
netStatus = h.networkStatusGetter.NetworkStatus()
|
||||
if h.networkStatus != nil {
|
||||
netStatus = h.networkStatus.NetworkStatus()
|
||||
}
|
||||
|
||||
// Create task services struct with request's driver metadata
|
|
@ -6,35 +6,39 @@ import (
|
|||
"sync"
|
||||
"time"
|
||||
|
||||
log "github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/nomad/client/allochealth"
|
||||
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
|
||||
cstructs "github.com/hashicorp/nomad/client/structs"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
|
||||
// healthMutator is able to set/clear alloc health.
|
||||
// healthSetter is able to set/clear alloc health.
|
||||
type healthSetter interface {
|
||||
// HasHealth returns true if health is already set.
|
||||
HasHealth() bool
|
||||
|
||||
// Set health via the mutator
|
||||
// SetHealth via the mutator.
|
||||
SetHealth(healthy, isDeploy bool, taskEvents map[string]*structs.TaskEvent)
|
||||
|
||||
// Clear health when the deployment ID changes
|
||||
// ClearHealth for when the deployment ID changes.
|
||||
ClearHealth()
|
||||
}
|
||||
|
||||
// allocHealthWatcherHook is responsible for watching an allocation's task
|
||||
// status and (optionally) Consul health check status to determine if the
|
||||
// allocation is health or unhealthy. Used by deployments and migrations.
|
||||
// allocation is healthy or unhealthy. Used by deployments and migrations.
|
||||
type allocHealthWatcherHook struct {
|
||||
healthSetter healthSetter
|
||||
|
||||
// consul client used to monitor health checks
|
||||
// consul client used to monitor Consul service health checks
|
||||
consul serviceregistration.Handler
|
||||
|
||||
// checkStore is used to monitor Nomad service health checks
|
||||
checkStore checkstore.Shim
|
||||
|
||||
// listener is given to trackers to listen for alloc updates and closed
|
||||
// when the alloc is destroyed.
|
||||
listener *cstructs.AllocListener
|
||||
|
@ -64,11 +68,11 @@ type allocHealthWatcherHook struct {
|
|||
// hold hookLock to access.
|
||||
isDeploy bool
|
||||
|
||||
logger log.Logger
|
||||
logger hclog.Logger
|
||||
}
|
||||
|
||||
func newAllocHealthWatcherHook(logger log.Logger, alloc *structs.Allocation, hs healthSetter,
|
||||
listener *cstructs.AllocListener, consul serviceregistration.Handler) interfaces.RunnerHook {
|
||||
func newAllocHealthWatcherHook(logger hclog.Logger, alloc *structs.Allocation, hs healthSetter,
|
||||
listener *cstructs.AllocListener, consul serviceregistration.Handler, checkStore checkstore.Shim) interfaces.RunnerHook {
|
||||
|
||||
// Neither deployments nor migrations care about the health of
|
||||
// non-service jobs so never watch their health
|
||||
|
@ -85,6 +89,7 @@ func newAllocHealthWatcherHook(logger log.Logger, alloc *structs.Allocation, hs
|
|||
cancelFn: func() {}, // initialize to prevent nil func panics
|
||||
watchDone: closedDone,
|
||||
consul: consul,
|
||||
checkStore: checkStore,
|
||||
healthSetter: hs,
|
||||
listener: listener,
|
||||
}
|
||||
|
@ -132,8 +137,9 @@ func (h *allocHealthWatcherHook) init() error {
|
|||
|
||||
h.logger.Trace("watching", "deadline", deadline, "checks", useChecks, "min_healthy_time", minHealthyTime)
|
||||
// Create a new tracker, start it, and watch for health results.
|
||||
tracker := allochealth.NewTracker(ctx, h.logger, h.alloc,
|
||||
h.listener, h.consul, minHealthyTime, useChecks)
|
||||
tracker := allochealth.NewTracker(
|
||||
ctx, h.logger, h.alloc, h.listener, h.consul, h.checkStore, minHealthyTime, useChecks,
|
||||
)
|
||||
tracker.Start()
|
||||
|
||||
// Create a new done chan and start watching for health updates
|
||||
|
@ -194,7 +200,7 @@ func (h *allocHealthWatcherHook) Postrun() error {
|
|||
|
||||
func (h *allocHealthWatcherHook) Shutdown() {
|
||||
// Same as Postrun
|
||||
h.Postrun()
|
||||
_ = h.Postrun()
|
||||
}
|
||||
|
||||
// watchHealth watches alloc health until it is set, the alloc is stopped, the
|
||||
|
|
|
@ -96,7 +96,8 @@ func TestHealthHook_PrerunPostrun(t *testing.T) {
|
|||
consul := regMock.NewServiceRegistrationHandler(logger)
|
||||
hs := &mockHealthSetter{}
|
||||
|
||||
h := newAllocHealthWatcherHook(logger, mock.Alloc(), hs, b.Listen(), consul)
|
||||
checks := new(mock.CheckShim)
|
||||
h := newAllocHealthWatcherHook(logger, mock.Alloc(), hs, b.Listen(), consul, checks)
|
||||
|
||||
// Assert we implemented the right interfaces
|
||||
prerunh, ok := h.(interfaces.RunnerPrerunHook)
|
||||
|
@ -134,7 +135,8 @@ func TestHealthHook_PrerunUpdatePostrun(t *testing.T) {
|
|||
consul := regMock.NewServiceRegistrationHandler(logger)
|
||||
hs := &mockHealthSetter{}
|
||||
|
||||
h := newAllocHealthWatcherHook(logger, alloc.Copy(), hs, b.Listen(), consul).(*allocHealthWatcherHook)
|
||||
checks := new(mock.CheckShim)
|
||||
h := newAllocHealthWatcherHook(logger, alloc.Copy(), hs, b.Listen(), consul, checks).(*allocHealthWatcherHook)
|
||||
|
||||
// Prerun
|
||||
require.NoError(h.Prerun())
|
||||
|
@ -173,7 +175,8 @@ func TestHealthHook_UpdatePrerunPostrun(t *testing.T) {
|
|||
consul := regMock.NewServiceRegistrationHandler(logger)
|
||||
hs := &mockHealthSetter{}
|
||||
|
||||
h := newAllocHealthWatcherHook(logger, alloc.Copy(), hs, b.Listen(), consul).(*allocHealthWatcherHook)
|
||||
checks := new(mock.CheckShim)
|
||||
h := newAllocHealthWatcherHook(logger, alloc.Copy(), hs, b.Listen(), consul, checks).(*allocHealthWatcherHook)
|
||||
|
||||
// Set a DeploymentID to cause ClearHealth to be called
|
||||
alloc.DeploymentID = uuid.Generate()
|
||||
|
@ -214,7 +217,8 @@ func TestHealthHook_Postrun(t *testing.T) {
|
|||
consul := regMock.NewServiceRegistrationHandler(logger)
|
||||
hs := &mockHealthSetter{}
|
||||
|
||||
h := newAllocHealthWatcherHook(logger, mock.Alloc(), hs, b.Listen(), consul).(*allocHealthWatcherHook)
|
||||
checks := new(mock.CheckShim)
|
||||
h := newAllocHealthWatcherHook(logger, mock.Alloc(), hs, b.Listen(), consul, checks).(*allocHealthWatcherHook)
|
||||
|
||||
// Postrun
|
||||
require.NoError(h.Postrun())
|
||||
|
@ -280,7 +284,8 @@ func TestHealthHook_SetHealth_healthy(t *testing.T) {
|
|||
|
||||
hs := newMockHealthSetter()
|
||||
|
||||
h := newAllocHealthWatcherHook(logger, alloc.Copy(), hs, b.Listen(), consul).(*allocHealthWatcherHook)
|
||||
checks := new(mock.CheckShim)
|
||||
h := newAllocHealthWatcherHook(logger, alloc.Copy(), hs, b.Listen(), consul, checks).(*allocHealthWatcherHook)
|
||||
|
||||
// Prerun
|
||||
require.NoError(h.Prerun())
|
||||
|
@ -368,7 +373,8 @@ func TestHealthHook_SetHealth_unhealthy(t *testing.T) {
|
|||
|
||||
hs := newMockHealthSetter()
|
||||
|
||||
h := newAllocHealthWatcherHook(logger, alloc.Copy(), hs, b.Listen(), consul).(*allocHealthWatcherHook)
|
||||
checks := new(mock.CheckShim)
|
||||
h := newAllocHealthWatcherHook(logger, alloc.Copy(), hs, b.Listen(), consul, checks).(*allocHealthWatcherHook)
|
||||
|
||||
// Prerun
|
||||
require.NoError(h.Prerun())
|
||||
|
@ -389,7 +395,7 @@ func TestHealthHook_SetHealth_unhealthy(t *testing.T) {
|
|||
func TestHealthHook_SystemNoop(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
h := newAllocHealthWatcherHook(testlog.HCLogger(t), mock.SystemAlloc(), nil, nil, nil)
|
||||
h := newAllocHealthWatcherHook(testlog.HCLogger(t), mock.SystemAlloc(), nil, nil, nil, nil)
|
||||
|
||||
// Assert that it's the noop impl
|
||||
_, ok := h.(noopAllocHealthWatcherHook)
|
||||
|
@ -410,7 +416,7 @@ func TestHealthHook_SystemNoop(t *testing.T) {
|
|||
func TestHealthHook_BatchNoop(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
h := newAllocHealthWatcherHook(testlog.HCLogger(t), mock.BatchAlloc(), nil, nil, nil)
|
||||
h := newAllocHealthWatcherHook(testlog.HCLogger(t), mock.BatchAlloc(), nil, nil, nil, nil)
|
||||
|
||||
// Assert that it's the noop impl
|
||||
_, ok := h.(noopAllocHealthWatcherHook)
|
||||
|
|
|
@ -4,19 +4,19 @@ import (
|
|||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
|
||||
// RunnnerHook is a lifecycle hook into the life cycle of an allocation runner.
|
||||
// RunnerHook is a lifecycle hook into the life cycle of an allocation runner.
|
||||
type RunnerHook interface {
|
||||
Name() string
|
||||
}
|
||||
|
||||
// RunnerPrerunHooks are executed before calling TaskRunner.Run for
|
||||
// A RunnerPrerunHook is executed before calling TaskRunner.Run for
|
||||
// non-terminal allocations. Terminal allocations do *not* call prerun.
|
||||
type RunnerPrerunHook interface {
|
||||
RunnerHook
|
||||
Prerun() error
|
||||
}
|
||||
|
||||
// RunnerPreKillHooks are executed inside of KillTasks before
|
||||
// A RunnerPreKillHook is executed inside of KillTasks before
|
||||
// iterating and killing each task. It will run before the Leader
|
||||
// task is killed.
|
||||
type RunnerPreKillHook interface {
|
||||
|
@ -25,7 +25,7 @@ type RunnerPreKillHook interface {
|
|||
PreKill()
|
||||
}
|
||||
|
||||
// RunnerPostrunHooks are executed after calling TaskRunner.Run, even for
|
||||
// A RunnerPostrunHook is executed after calling TaskRunner.Run, even for
|
||||
// terminal allocations. Therefore Postrun hooks must be safe to call without
|
||||
// first calling Prerun hooks.
|
||||
type RunnerPostrunHook interface {
|
||||
|
@ -33,7 +33,7 @@ type RunnerPostrunHook interface {
|
|||
Postrun() error
|
||||
}
|
||||
|
||||
// RunnerDestroyHooks are executed after AllocRunner.Run has exited and must
|
||||
// A RunnerDestroyHook is executed after AllocRunner.Run has exited and must
|
||||
// make a best effort cleanup allocation resources. Destroy hooks must be safe
|
||||
// to call without first calling Prerun.
|
||||
type RunnerDestroyHook interface {
|
||||
|
@ -41,10 +41,10 @@ type RunnerDestroyHook interface {
|
|||
Destroy() error
|
||||
}
|
||||
|
||||
// RunnerUpdateHooks are executed when an allocation update is received from
|
||||
// A RunnerUpdateHook is executed when an allocation update is received from
|
||||
// the server. Update is called concurrently with AllocRunner execution and
|
||||
// therefore must be safe for concurrent access with other hook methods. Calls
|
||||
// to Update are serialized so allocaiton updates will always be processed in
|
||||
// to Update are serialized so allocation updates will always be processed in
|
||||
// order.
|
||||
type RunnerUpdateHook interface {
|
||||
RunnerHook
|
||||
|
@ -55,7 +55,7 @@ type RunnerUpdateRequest struct {
|
|||
Alloc *structs.Allocation
|
||||
}
|
||||
|
||||
// RunnerTaskRestartHooks are executed just before the allocation runner is
|
||||
// A RunnerTaskRestartHook is executed just before the allocation runner is
|
||||
// going to restart all tasks.
|
||||
type RunnerTaskRestartHook interface {
|
||||
RunnerHook
|
||||
|
|
|
@ -14,6 +14,7 @@ import (
|
|||
"github.com/hashicorp/nomad/client/devicemanager"
|
||||
"github.com/hashicorp/nomad/client/lib/cgutil"
|
||||
"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/mock"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/wrapper"
|
||||
"github.com/hashicorp/nomad/client/state"
|
||||
|
@ -67,12 +68,14 @@ func testAllocRunnerConfig(t *testing.T, alloc *structs.Allocation) (*Config, fu
|
|||
consulRegMock := mock.NewServiceRegistrationHandler(clientConf.Logger)
|
||||
nomadRegMock := mock.NewServiceRegistrationHandler(clientConf.Logger)
|
||||
|
||||
stateDB := new(state.NoopDB)
|
||||
|
||||
conf := &Config{
|
||||
// Copy the alloc in case the caller edits and reuses it
|
||||
Alloc: alloc.Copy(),
|
||||
Logger: clientConf.Logger,
|
||||
ClientConfig: clientConf,
|
||||
StateDB: state.NoopDB{},
|
||||
StateDB: stateDB,
|
||||
Consul: consulRegMock,
|
||||
ConsulSI: consul.NewMockServiceIdentitiesClient(),
|
||||
Vault: vaultclient.NewMockVaultClient(),
|
||||
|
@ -84,6 +87,7 @@ func testAllocRunnerConfig(t *testing.T, alloc *structs.Allocation) (*Config, fu
|
|||
CpusetManager: new(cgutil.NoopCpusetManager),
|
||||
ServersContactedCh: make(chan struct{}),
|
||||
ServiceRegWrapper: wrapper.NewHandlerWrapper(clientConf.Logger, consulRegMock, nomadRegMock),
|
||||
CheckStore: checkstore.NewStore(clientConf.Logger, stateDB),
|
||||
Getter: getter.TestDefaultGetter(t),
|
||||
}
|
||||
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
package allocrunner
|
|
@ -37,6 +37,7 @@ import (
|
|||
"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
|
||||
"github.com/hashicorp/nomad/client/servers"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/nsd"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/wrapper"
|
||||
"github.com/hashicorp/nomad/client/state"
|
||||
|
@ -166,7 +167,7 @@ type AllocRunner interface {
|
|||
}
|
||||
|
||||
// Client is used to implement the client interaction with Nomad. Clients
|
||||
// are expected to register as a schedulable node to the servers, and to
|
||||
// are expected to register as a schedule-able node to the servers, and to
|
||||
// run allocations as determined by the servers.
|
||||
type Client struct {
|
||||
config *config.Config
|
||||
|
@ -237,6 +238,10 @@ type Client struct {
|
|||
// registrations.
|
||||
nomadService serviceregistration.Handler
|
||||
|
||||
// checkStore is used to store group and task checks and their current pass/fail
|
||||
// status.
|
||||
checkStore checkstore.Shim
|
||||
|
||||
// serviceRegWrapper wraps the consulService and nomadService
|
||||
// implementations so that the alloc and task runner service hooks can call
|
||||
// this without needing to identify which backend provider should be used.
|
||||
|
@ -693,6 +698,10 @@ func (c *Client) init() error {
|
|||
c.cpusetManager = new(cgutil.NoopCpusetManager)
|
||||
}
|
||||
}
|
||||
|
||||
// setup the check store
|
||||
c.checkStore = checkstore.NewStore(c.logger, c.stateDB)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -1175,6 +1184,7 @@ func (c *Client) restoreState() error {
|
|||
DriverManager: c.drivermanager,
|
||||
ServersContactedCh: c.serversContactedCh,
|
||||
ServiceRegWrapper: c.serviceRegWrapper,
|
||||
CheckStore: c.checkStore,
|
||||
RPCClient: c,
|
||||
Getter: c.getter,
|
||||
}
|
||||
|
@ -2513,6 +2523,7 @@ func (c *Client) addAlloc(alloc *structs.Allocation, migrateToken string) error
|
|||
DeviceManager: c.devicemanager,
|
||||
DriverManager: c.drivermanager,
|
||||
ServiceRegWrapper: c.serviceRegWrapper,
|
||||
CheckStore: c.checkStore,
|
||||
RPCClient: c,
|
||||
Getter: c.getter,
|
||||
}
|
||||
|
|
|
@ -0,0 +1,147 @@
|
|||
package checkstore
|
||||
|
||||
import (
|
||||
"sync"
|
||||
|
||||
"github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/checks"
|
||||
"github.com/hashicorp/nomad/client/state"
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
"golang.org/x/exp/slices"
|
||||
)
|
||||
|
||||
// A Shim is used to track the latest check status information, one layer above
|
||||
// the client persistent store so we can do efficient indexing, etc.
|
||||
type Shim interface {
|
||||
// Set the latest result for a specific check.
|
||||
Set(allocID string, result *structs.CheckQueryResult) error
|
||||
|
||||
// List the latest results for a specific allocation.
|
||||
List(allocID string) map[structs.CheckID]*structs.CheckQueryResult
|
||||
|
||||
// Difference returns the set of IDs being stored that are not in ids.
|
||||
Difference(allocID string, ids []structs.CheckID) []structs.CheckID
|
||||
|
||||
// Remove will remove ids from the cache and persistent store.
|
||||
Remove(allocID string, ids []structs.CheckID) error
|
||||
|
||||
// Purge results for a specific allocation.
|
||||
Purge(allocID string) error
|
||||
}
|
||||
|
||||
type shim struct {
|
||||
log hclog.Logger
|
||||
|
||||
db state.StateDB
|
||||
|
||||
lock sync.RWMutex
|
||||
current checks.ClientResults
|
||||
}
|
||||
|
||||
// NewStore creates a new store.
|
||||
func NewStore(log hclog.Logger, db state.StateDB) Shim {
|
||||
s := &shim{
|
||||
log: log.Named("check_store"),
|
||||
db: db,
|
||||
current: make(checks.ClientResults),
|
||||
}
|
||||
s.restore()
|
||||
return s
|
||||
}
|
||||
|
||||
func (s *shim) restore() {
|
||||
s.lock.Lock()
|
||||
defer s.lock.Unlock()
|
||||
|
||||
results, err := s.db.GetCheckResults()
|
||||
if err != nil {
|
||||
s.log.Error("failed to restore health check results", "error", err)
|
||||
return
|
||||
}
|
||||
|
||||
for id, m := range results {
|
||||
s.current[id] = helper.CopyMap(m)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *shim) Set(allocID string, qr *structs.CheckQueryResult) error {
|
||||
s.lock.Lock()
|
||||
defer s.lock.Unlock()
|
||||
|
||||
if _, exists := s.current[allocID]; !exists {
|
||||
s.current[allocID] = make(map[structs.CheckID]*structs.CheckQueryResult)
|
||||
}
|
||||
|
||||
// lookup existing result
|
||||
previous, exists := s.current[allocID][qr.ID]
|
||||
|
||||
// only insert a stub if no result exists yet
|
||||
if qr.Status == structs.CheckPending && exists {
|
||||
return nil
|
||||
}
|
||||
|
||||
s.log.Trace("setting check status", "alloc_id", allocID, "check_id", qr.ID, "status", qr.Status)
|
||||
|
||||
// always keep in-memory shim up to date with latest result
|
||||
s.current[allocID][qr.ID] = qr
|
||||
|
||||
// only update persistent store if status changes (optimization)
|
||||
// on Client restart restored check results may be outdated but the status
|
||||
// is the same as the most recent result
|
||||
if !exists || previous.Status != qr.Status {
|
||||
return s.db.PutCheckResult(allocID, qr)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *shim) List(allocID string) map[structs.CheckID]*structs.CheckQueryResult {
|
||||
s.lock.RLock()
|
||||
defer s.lock.RUnlock()
|
||||
|
||||
m, exists := s.current[allocID]
|
||||
if !exists {
|
||||
return nil
|
||||
}
|
||||
|
||||
return helper.CopyMap(m)
|
||||
}
|
||||
|
||||
func (s *shim) Purge(allocID string) error {
|
||||
s.lock.Lock()
|
||||
defer s.lock.Unlock()
|
||||
|
||||
// remove from our map
|
||||
delete(s.current, allocID)
|
||||
|
||||
// remove from persistent store
|
||||
return s.db.PurgeCheckResults(allocID)
|
||||
}
|
||||
|
||||
func (s *shim) Remove(allocID string, ids []structs.CheckID) error {
|
||||
s.lock.Lock()
|
||||
defer s.lock.Unlock()
|
||||
|
||||
// remove from cache
|
||||
for _, id := range ids {
|
||||
delete(s.current[allocID], id)
|
||||
}
|
||||
|
||||
// remove from persistent store
|
||||
return s.db.DeleteCheckResults(allocID, ids)
|
||||
}
|
||||
|
||||
func (s *shim) Difference(allocID string, ids []structs.CheckID) []structs.CheckID {
|
||||
s.lock.Lock()
|
||||
defer s.lock.Unlock()
|
||||
|
||||
var remove []structs.CheckID
|
||||
for id := range s.current[allocID] {
|
||||
if !slices.Contains(ids, id) {
|
||||
remove = append(remove, id)
|
||||
}
|
||||
}
|
||||
|
||||
return remove
|
||||
}
|
|
@ -0,0 +1,379 @@
|
|||
package checkstore
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/hashicorp/nomad/ci"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/checks"
|
||||
"github.com/hashicorp/nomad/client/state"
|
||||
"github.com/hashicorp/nomad/helper/testlog"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
"github.com/shoenig/test/must"
|
||||
"golang.org/x/exp/slices"
|
||||
)
|
||||
|
||||
var (
|
||||
success = structs.CheckSuccess
|
||||
failure = structs.CheckFailure
|
||||
pending = structs.CheckPending
|
||||
)
|
||||
|
||||
func newQR(id string, status structs.CheckStatus) *structs.CheckQueryResult {
|
||||
return &structs.CheckQueryResult{
|
||||
ID: structs.CheckID(id),
|
||||
Status: status,
|
||||
}
|
||||
}
|
||||
|
||||
// alias for brevity
|
||||
type qrMap = map[structs.CheckID]*structs.CheckQueryResult
|
||||
|
||||
func TestShim_New(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
logger := testlog.HCLogger(t)
|
||||
|
||||
t.Run("restore empty", func(t *testing.T) {
|
||||
db := state.NewMemDB(logger)
|
||||
s := NewStore(logger, db)
|
||||
m := s.List("none")
|
||||
must.MapEmpty(t, m)
|
||||
})
|
||||
|
||||
t.Run("restore checks", func(t *testing.T) {
|
||||
db := state.NewMemDB(logger)
|
||||
must.NoError(t, db.PutCheckResult("alloc1", newQR("id1", success)))
|
||||
must.NoError(t, db.PutCheckResult("alloc1", newQR("id2", failure)))
|
||||
must.NoError(t, db.PutCheckResult("alloc2", newQR("id3", pending)))
|
||||
s := NewStore(logger, db)
|
||||
m1 := s.List("alloc1")
|
||||
must.MapEq(t, qrMap{
|
||||
"id1": newQR("id1", success),
|
||||
"id2": newQR("id2", failure),
|
||||
}, m1)
|
||||
m2 := s.List("alloc2")
|
||||
must.MapEq(t, qrMap{"id3": newQR("id3", pending)}, m2)
|
||||
})
|
||||
}
|
||||
|
||||
func TestShim_Set(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
logger := testlog.HCLogger(t)
|
||||
|
||||
t.Run("insert pending", func(t *testing.T) {
|
||||
db := state.NewMemDB(logger)
|
||||
s := NewStore(logger, db)
|
||||
|
||||
// insert initial pending check into empty database
|
||||
qr1 := newQR("id1", pending)
|
||||
qr1.Timestamp = 1
|
||||
must.NoError(t, s.Set("alloc1", qr1))
|
||||
|
||||
// ensure underlying db has check
|
||||
internal, err := db.GetCheckResults()
|
||||
must.NoError(t, err)
|
||||
must.Eq(t, checks.ClientResults{"alloc1": {"id1": qr1}}, internal)
|
||||
})
|
||||
|
||||
t.Run("ignore followup pending", func(t *testing.T) {
|
||||
db := state.NewMemDB(logger)
|
||||
s := NewStore(logger, db)
|
||||
|
||||
// insert a check
|
||||
qr1 := newQR("id1", success)
|
||||
qr1.Timestamp = 1
|
||||
must.NoError(t, s.Set("alloc1", qr1))
|
||||
|
||||
// insert a followup pending check (e.g. client restart)
|
||||
qr2 := newQR("id1", pending)
|
||||
qr2.Timestamp = 2
|
||||
t.Run("into existing", func(t *testing.T) {
|
||||
must.NoError(t, s.Set("alloc1", qr2))
|
||||
})
|
||||
|
||||
// ensure shim maintains success result
|
||||
list := s.List("alloc1")
|
||||
must.Eq(t, qrMap{"id1": qr1}, list)
|
||||
|
||||
// ensure underlying db maintains success result
|
||||
internal, err := db.GetCheckResults()
|
||||
must.NoError(t, err)
|
||||
must.Eq(t, checks.ClientResults{"alloc1": {"id1": qr1}}, internal)
|
||||
})
|
||||
|
||||
t.Run("insert status change", func(t *testing.T) {
|
||||
db := state.NewMemDB(logger)
|
||||
s := NewStore(logger, db)
|
||||
|
||||
// insert initial check, success
|
||||
qr1 := newQR("id1", success)
|
||||
must.NoError(t, s.Set("alloc1", qr1))
|
||||
|
||||
// insert followup check, failure
|
||||
qr2 := newQR("id1", failure)
|
||||
must.NoError(t, s.Set("alloc1", qr2))
|
||||
|
||||
// ensure shim sees newest status result
|
||||
list := s.List("alloc1")
|
||||
must.Eq(t, qrMap{"id1": qr2}, list)
|
||||
|
||||
// ensure underlying db sees newest status result
|
||||
internal, err := db.GetCheckResults()
|
||||
must.NoError(t, err)
|
||||
must.Eq(t, checks.ClientResults{"alloc1": {"id1": qr2}}, internal)
|
||||
})
|
||||
|
||||
t.Run("insert status same", func(t *testing.T) {
|
||||
db := state.NewMemDB(logger)
|
||||
s := NewStore(logger, db)
|
||||
|
||||
// insert initial check, success
|
||||
qr1 := newQR("id1", success)
|
||||
qr1.Timestamp = 1
|
||||
must.NoError(t, s.Set("alloc1", qr1))
|
||||
|
||||
// insert followup check, also success
|
||||
qr2 := newQR("id1", success)
|
||||
qr2.Timestamp = 2
|
||||
must.NoError(t, s.Set("alloc1", qr2))
|
||||
|
||||
// ensure shim sees newest status result
|
||||
list := s.List("alloc1")
|
||||
must.Eq(t, qrMap{"id1": qr2}, list)
|
||||
|
||||
// ensure underlying db sees stale result (optimization)
|
||||
internal, err := db.GetCheckResults()
|
||||
must.NoError(t, err)
|
||||
must.Eq(t, checks.ClientResults{"alloc1": {"id1": qr1}}, internal)
|
||||
})
|
||||
}
|
||||
|
||||
func TestShim_List(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
logger := testlog.HCLogger(t)
|
||||
|
||||
t.Run("list empty", func(t *testing.T) {
|
||||
db := state.NewMemDB(logger)
|
||||
s := NewStore(logger, db)
|
||||
|
||||
list := s.List("alloc1")
|
||||
must.MapEmpty(t, list)
|
||||
})
|
||||
|
||||
t.Run("list mix", func(t *testing.T) {
|
||||
db := state.NewMemDB(logger)
|
||||
s := NewStore(logger, db)
|
||||
|
||||
// insert some checks
|
||||
must.NoError(t, s.Set("alloc1", newQR("id1", success)))
|
||||
must.NoError(t, s.Set("alloc1", newQR("id2", failure)))
|
||||
must.NoError(t, s.Set("alloc2", newQR("id1", pending)))
|
||||
|
||||
list1 := s.List("alloc1")
|
||||
must.MapEq(t, qrMap{
|
||||
"id1": newQR("id1", success),
|
||||
"id2": newQR("id2", failure),
|
||||
}, list1)
|
||||
|
||||
list2 := s.List("alloc2")
|
||||
must.MapEq(t, qrMap{
|
||||
"id1": newQR("id1", pending),
|
||||
}, list2)
|
||||
|
||||
internal, err := db.GetCheckResults()
|
||||
must.NoError(t, err)
|
||||
must.MapEq(t, checks.ClientResults{
|
||||
"alloc1": {
|
||||
"id1": newQR("id1", success),
|
||||
"id2": newQR("id2", failure),
|
||||
},
|
||||
"alloc2": {
|
||||
"id1": newQR("id1", pending),
|
||||
},
|
||||
}, internal)
|
||||
})
|
||||
}
|
||||
|
||||
func TestShim_Difference(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
logger := testlog.HCLogger(t)
|
||||
|
||||
t.Run("empty store", func(t *testing.T) {
|
||||
db := state.NewMemDB(logger)
|
||||
s := NewStore(logger, db)
|
||||
|
||||
ids := []structs.CheckID{"id1", "id2", "id3"}
|
||||
unwanted := s.Difference("alloc1", ids)
|
||||
must.Empty(t, unwanted)
|
||||
})
|
||||
|
||||
t.Run("empty unwanted", func(t *testing.T) {
|
||||
db := state.NewMemDB(logger)
|
||||
s := NewStore(logger, db)
|
||||
|
||||
// insert some checks
|
||||
must.NoError(t, s.Set("alloc1", newQR("id1", success)))
|
||||
must.NoError(t, s.Set("alloc1", newQR("id2", failure)))
|
||||
must.NoError(t, s.Set("alloc2", newQR("id1", pending)))
|
||||
|
||||
var ids []structs.CheckID
|
||||
var exp = []structs.CheckID{"id1", "id2"}
|
||||
unwanted := s.Difference("alloc1", ids)
|
||||
slices.Sort(unwanted)
|
||||
must.Eq(t, exp, unwanted)
|
||||
})
|
||||
|
||||
t.Run("subset unwanted", func(t *testing.T) {
|
||||
db := state.NewMemDB(logger)
|
||||
s := NewStore(logger, db)
|
||||
|
||||
// insert some checks
|
||||
must.NoError(t, s.Set("alloc1", newQR("id1", success)))
|
||||
must.NoError(t, s.Set("alloc1", newQR("id2", failure)))
|
||||
must.NoError(t, s.Set("alloc1", newQR("id3", success)))
|
||||
must.NoError(t, s.Set("alloc1", newQR("id4", success)))
|
||||
must.NoError(t, s.Set("alloc1", newQR("id5", pending)))
|
||||
|
||||
ids := []structs.CheckID{"id1", "id3", "id4"}
|
||||
exp := []structs.CheckID{"id2", "id5"}
|
||||
unwanted := s.Difference("alloc1", ids)
|
||||
slices.Sort(unwanted)
|
||||
must.Eq(t, exp, unwanted)
|
||||
})
|
||||
|
||||
t.Run("unexpected unwanted", func(t *testing.T) {
|
||||
db := state.NewMemDB(logger)
|
||||
s := NewStore(logger, db)
|
||||
|
||||
// insert some checks
|
||||
must.NoError(t, s.Set("alloc1", newQR("id1", success)))
|
||||
must.NoError(t, s.Set("alloc1", newQR("id2", failure)))
|
||||
must.NoError(t, s.Set("alloc1", newQR("id3", success)))
|
||||
|
||||
ids := []structs.CheckID{"id1", "id4"}
|
||||
exp := []structs.CheckID{"id2", "id3"}
|
||||
unwanted := s.Difference("alloc1", ids)
|
||||
slices.Sort(unwanted)
|
||||
must.Eq(t, exp, unwanted)
|
||||
})
|
||||
}
|
||||
|
||||
func TestShim_Remove(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
logger := testlog.HCLogger(t)
|
||||
|
||||
t.Run("remove from empty store", func(t *testing.T) {
|
||||
db := state.NewMemDB(logger)
|
||||
s := NewStore(logger, db)
|
||||
|
||||
ids := []structs.CheckID{"id1", "id2"}
|
||||
err := s.Remove("alloc1", ids)
|
||||
must.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("remove empty set from store", func(t *testing.T) {
|
||||
db := state.NewMemDB(logger)
|
||||
s := NewStore(logger, db)
|
||||
|
||||
// insert some checks
|
||||
must.NoError(t, s.Set("alloc1", newQR("id1", success)))
|
||||
must.NoError(t, s.Set("alloc1", newQR("id2", failure)))
|
||||
must.NoError(t, s.Set("alloc2", newQR("id1", pending)))
|
||||
|
||||
var ids []structs.CheckID
|
||||
err := s.Remove("alloc1", ids)
|
||||
must.NoError(t, err)
|
||||
|
||||
// ensure shim still contains checks
|
||||
list := s.List("alloc1")
|
||||
must.Eq(t, qrMap{"id1": newQR("id1", success), "id2": newQR("id2", failure)}, list)
|
||||
|
||||
// ensure underlying db still contains all checks
|
||||
internal, err := db.GetCheckResults()
|
||||
must.NoError(t, err)
|
||||
must.Eq(t, checks.ClientResults{
|
||||
"alloc1": {
|
||||
"id1": newQR("id1", success),
|
||||
"id2": newQR("id2", failure),
|
||||
},
|
||||
"alloc2": {
|
||||
"id1": newQR("id1", pending),
|
||||
},
|
||||
}, internal)
|
||||
})
|
||||
|
||||
t.Run("remove subset from store", func(t *testing.T) {
|
||||
db := state.NewMemDB(logger)
|
||||
s := NewStore(logger, db)
|
||||
|
||||
// insert some checks
|
||||
must.NoError(t, s.Set("alloc1", newQR("id1", success)))
|
||||
must.NoError(t, s.Set("alloc1", newQR("id2", failure)))
|
||||
must.NoError(t, s.Set("alloc1", newQR("id3", success)))
|
||||
must.NoError(t, s.Set("alloc1", newQR("id4", pending)))
|
||||
must.NoError(t, s.Set("alloc2", newQR("id1", pending)))
|
||||
must.NoError(t, s.Set("alloc2", newQR("id2", success)))
|
||||
|
||||
ids := []structs.CheckID{"id1", "id4"}
|
||||
err := s.Remove("alloc1", ids)
|
||||
must.NoError(t, err)
|
||||
|
||||
// ensure shim still contains remaining checks
|
||||
list := s.List("alloc1")
|
||||
must.Eq(t, qrMap{"id2": newQR("id2", failure), "id3": newQR("id3", success)}, list)
|
||||
|
||||
// ensure underlying db still contains remaining checks
|
||||
internal, err := db.GetCheckResults()
|
||||
must.NoError(t, err)
|
||||
must.MapEq(t, checks.ClientResults{
|
||||
"alloc1": {
|
||||
"id2": newQR("id2", failure),
|
||||
"id3": newQR("id3", success),
|
||||
},
|
||||
"alloc2": {
|
||||
"id1": newQR("id1", pending),
|
||||
"id2": newQR("id2", success),
|
||||
},
|
||||
}, internal)
|
||||
})
|
||||
}
|
||||
|
||||
func TestShim_Purge(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
logger := testlog.HCLogger(t)
|
||||
|
||||
t.Run("purge from empty", func(t *testing.T) {
|
||||
db := state.NewMemDB(logger)
|
||||
s := NewStore(logger, db)
|
||||
|
||||
err := s.Purge("alloc1")
|
||||
must.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("purge one alloc", func(t *testing.T) {
|
||||
db := state.NewMemDB(logger)
|
||||
s := NewStore(logger, db)
|
||||
|
||||
// insert some checks
|
||||
must.NoError(t, s.Set("alloc1", newQR("id1", success)))
|
||||
must.NoError(t, s.Set("alloc1", newQR("id2", failure)))
|
||||
must.NoError(t, s.Set("alloc2", newQR("id1", pending)))
|
||||
|
||||
err := s.Purge("alloc1")
|
||||
must.NoError(t, err)
|
||||
|
||||
// ensure alloc1 is gone from shim
|
||||
list1 := s.List("alloc1")
|
||||
must.MapEmpty(t, list1)
|
||||
|
||||
// ensure alloc2 remains in shim
|
||||
list2 := s.List("alloc2")
|
||||
must.MapEq(t, qrMap{"id1": newQR("id1", pending)}, list2)
|
||||
|
||||
// ensure alloc is gone from underlying db
|
||||
internal, err := db.GetCheckResults()
|
||||
must.NoError(t, err)
|
||||
must.MapEq(t, checks.ClientResults{
|
||||
"alloc2": {"id1": newQR("id1", pending)},
|
||||
}, internal)
|
||||
})
|
||||
}
|
|
@ -0,0 +1,209 @@
|
|||
package checks
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/go-cleanhttp"
|
||||
"github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
"oss.indeed.com/go/libtime"
|
||||
)
|
||||
|
||||
const (
|
||||
// maxTimeoutHTTP is a fail-safe value for the HTTP client, ensuring a Nomad
|
||||
// Client does not leak goroutines hanging on to unresponsive endpoints.
|
||||
maxTimeoutHTTP = 10 * time.Minute
|
||||
)
|
||||
|
||||
// Checker executes a check given an allocation-specific context, and produces
|
||||
// a resulting structs.CheckQueryResult
|
||||
type Checker interface {
|
||||
Do(context.Context, *QueryContext, *Query) *structs.CheckQueryResult
|
||||
}
|
||||
|
||||
// New creates a new Checker capable of executing HTTP and TCP checks.
|
||||
func New(log hclog.Logger) Checker {
|
||||
httpClient := cleanhttp.DefaultPooledClient()
|
||||
httpClient.Timeout = maxTimeoutHTTP
|
||||
return &checker{
|
||||
log: log.Named("checks"),
|
||||
httpClient: httpClient,
|
||||
clock: libtime.SystemClock(),
|
||||
}
|
||||
}
|
||||
|
||||
type checker struct {
|
||||
log hclog.Logger
|
||||
clock libtime.Clock
|
||||
httpClient *http.Client
|
||||
}
|
||||
|
||||
func (c *checker) now() int64 {
|
||||
return c.clock.Now().UTC().Unix()
|
||||
}
|
||||
|
||||
// Do will execute the Query given the QueryContext and produce a structs.CheckQueryResult
|
||||
func (c *checker) Do(ctx context.Context, qc *QueryContext, q *Query) *structs.CheckQueryResult {
|
||||
var qr *structs.CheckQueryResult
|
||||
|
||||
timeout, cancel := context.WithTimeout(ctx, q.Timeout)
|
||||
defer cancel()
|
||||
|
||||
switch q.Type {
|
||||
case "http":
|
||||
qr = c.checkHTTP(timeout, qc, q)
|
||||
default:
|
||||
qr = c.checkTCP(timeout, qc, q)
|
||||
}
|
||||
|
||||
qr.ID = qc.ID
|
||||
qr.Group = qc.Group
|
||||
qr.Task = qc.Task
|
||||
qr.Service = qc.Service
|
||||
qr.Check = qc.Check
|
||||
return qr
|
||||
}
|
||||
|
||||
// resolve the address to use when executing Query given a QueryContext
|
||||
func address(qc *QueryContext, q *Query) (string, error) {
|
||||
mode := q.AddressMode
|
||||
if mode == "" { // determine resolution for check address
|
||||
if qc.CustomAddress != "" {
|
||||
// if the service is using a custom address, enable the check to
|
||||
// inherit that custom address
|
||||
mode = structs.AddressModeAuto
|
||||
} else {
|
||||
// otherwise a check defaults to the host address
|
||||
mode = structs.AddressModeHost
|
||||
}
|
||||
}
|
||||
|
||||
label := q.PortLabel
|
||||
if label == "" {
|
||||
label = qc.ServicePortLabel
|
||||
}
|
||||
|
||||
status := qc.NetworkStatus.NetworkStatus()
|
||||
addr, port, err := serviceregistration.GetAddress(
|
||||
qc.CustomAddress, // custom address
|
||||
mode, // check address mode
|
||||
label, // port label
|
||||
qc.Networks, // allocation networks
|
||||
nil, // driver network (not supported)
|
||||
qc.Ports, // ports
|
||||
status, // allocation network status
|
||||
)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if port > 0 {
|
||||
addr = net.JoinHostPort(addr, strconv.Itoa(port))
|
||||
}
|
||||
return addr, nil
|
||||
}
|
||||
|
||||
func (c *checker) checkTCP(ctx context.Context, qc *QueryContext, q *Query) *structs.CheckQueryResult {
|
||||
qr := &structs.CheckQueryResult{
|
||||
Mode: q.Mode,
|
||||
Timestamp: c.now(),
|
||||
Status: structs.CheckPending,
|
||||
}
|
||||
|
||||
addr, err := address(qc, q)
|
||||
if err != nil {
|
||||
qr.Output = err.Error()
|
||||
qr.Status = structs.CheckFailure
|
||||
return qr
|
||||
}
|
||||
|
||||
if _, err = new(net.Dialer).DialContext(ctx, "tcp", addr); err != nil {
|
||||
qr.Output = err.Error()
|
||||
qr.Status = structs.CheckFailure
|
||||
return qr
|
||||
}
|
||||
|
||||
qr.Output = "nomad: tcp ok"
|
||||
qr.Status = structs.CheckSuccess
|
||||
return qr
|
||||
}
|
||||
|
||||
func (c *checker) checkHTTP(ctx context.Context, qc *QueryContext, q *Query) *structs.CheckQueryResult {
|
||||
qr := &structs.CheckQueryResult{
|
||||
Mode: q.Mode,
|
||||
Timestamp: c.now(),
|
||||
Status: structs.CheckPending,
|
||||
}
|
||||
|
||||
addr, err := address(qc, q)
|
||||
if err != nil {
|
||||
qr.Output = err.Error()
|
||||
qr.Status = structs.CheckFailure
|
||||
return qr
|
||||
}
|
||||
|
||||
u := (&url.URL{
|
||||
Scheme: q.Protocol,
|
||||
Host: addr,
|
||||
Path: q.Path,
|
||||
}).String()
|
||||
|
||||
request, err := http.NewRequest(q.Method, u, nil)
|
||||
if err != nil {
|
||||
qr.Output = fmt.Sprintf("nomad: %s", err.Error())
|
||||
qr.Status = structs.CheckFailure
|
||||
return qr
|
||||
}
|
||||
request = request.WithContext(ctx)
|
||||
|
||||
result, err := c.httpClient.Do(request)
|
||||
if err != nil {
|
||||
qr.Output = fmt.Sprintf("nomad: %s", err.Error())
|
||||
qr.Status = structs.CheckFailure
|
||||
return qr
|
||||
}
|
||||
|
||||
// match the result status code to the http status code
|
||||
qr.StatusCode = result.StatusCode
|
||||
|
||||
switch {
|
||||
case result.StatusCode == 200:
|
||||
qr.Status = structs.CheckSuccess
|
||||
qr.Output = "nomad: http ok"
|
||||
return qr
|
||||
case result.StatusCode < 400:
|
||||
qr.Status = structs.CheckSuccess
|
||||
default:
|
||||
qr.Status = structs.CheckFailure
|
||||
}
|
||||
|
||||
// status code was not 200; read the response body and set that as the
|
||||
// check result output content
|
||||
qr.Output = limitRead(result.Body)
|
||||
|
||||
return qr
|
||||
}
|
||||
|
||||
const (
|
||||
// outputSizeLimit is the maximum number of bytes to read and store of an http
|
||||
// check output. Set to 3kb which fits in 1 page with room for other fields.
|
||||
outputSizeLimit = 3 * 1024
|
||||
)
|
||||
|
||||
func limitRead(r io.Reader) string {
|
||||
b := make([]byte, 0, outputSizeLimit)
|
||||
output := bytes.NewBuffer(b)
|
||||
limited := io.LimitReader(r, outputSizeLimit)
|
||||
if _, err := io.Copy(output, limited); err != nil {
|
||||
return fmt.Sprintf("nomad: %s", err.Error())
|
||||
}
|
||||
return output.String()
|
||||
}
|
|
@ -0,0 +1,342 @@
|
|||
package checks
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/nomad/ci"
|
||||
"github.com/hashicorp/nomad/helper/freeport"
|
||||
"github.com/hashicorp/nomad/helper/testlog"
|
||||
"github.com/hashicorp/nomad/nomad/mock"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
"github.com/shoenig/test/must"
|
||||
"oss.indeed.com/go/libtime/libtimetest"
|
||||
)
|
||||
|
||||
func TestChecker_Do_HTTP(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
// an example response that will be truncated
|
||||
tooLong, truncate := bigResponse()
|
||||
|
||||
// create an http server with various responses
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch r.URL.Path {
|
||||
case "/fail":
|
||||
w.WriteHeader(500)
|
||||
_, _ = io.WriteString(w, "500 problem")
|
||||
case "/hang":
|
||||
time.Sleep(1 * time.Second)
|
||||
_, _ = io.WriteString(w, "too slow")
|
||||
case "/long-fail":
|
||||
w.WriteHeader(500)
|
||||
_, _ = io.WriteString(w, tooLong)
|
||||
case "/long-not-fail":
|
||||
w.WriteHeader(201)
|
||||
_, _ = io.WriteString(w, tooLong)
|
||||
default:
|
||||
w.WriteHeader(200)
|
||||
_, _ = io.WriteString(w, "200 ok")
|
||||
}
|
||||
}))
|
||||
defer ts.Close()
|
||||
|
||||
// get the address and port for http server
|
||||
tokens := strings.Split(ts.URL, ":")
|
||||
addr, port := strings.TrimPrefix(tokens[1], "//"), tokens[2]
|
||||
|
||||
// create a mock clock so we can assert time is set
|
||||
now := time.Date(2022, 1, 2, 3, 4, 5, 6, time.UTC)
|
||||
clock := libtimetest.NewClockMock(t).NowMock.Return(now)
|
||||
|
||||
makeQueryContext := func() *QueryContext {
|
||||
return &QueryContext{
|
||||
ID: "abc123",
|
||||
CustomAddress: addr,
|
||||
ServicePortLabel: port,
|
||||
Networks: nil,
|
||||
NetworkStatus: mock.NewNetworkStatus(addr),
|
||||
Ports: nil,
|
||||
Group: "group",
|
||||
Task: "task",
|
||||
Service: "service",
|
||||
Check: "check",
|
||||
}
|
||||
}
|
||||
|
||||
makeQuery := func(
|
||||
kind structs.CheckMode,
|
||||
path string,
|
||||
) *Query {
|
||||
return &Query{
|
||||
Mode: kind,
|
||||
Type: "http",
|
||||
Timeout: 100 * time.Millisecond,
|
||||
AddressMode: "auto",
|
||||
PortLabel: port,
|
||||
Protocol: "http",
|
||||
Path: path,
|
||||
Method: "GET",
|
||||
}
|
||||
}
|
||||
|
||||
makeExpResult := func(
|
||||
kind structs.CheckMode,
|
||||
status structs.CheckStatus,
|
||||
code int,
|
||||
output string,
|
||||
) *structs.CheckQueryResult {
|
||||
return &structs.CheckQueryResult{
|
||||
ID: "abc123",
|
||||
Mode: kind,
|
||||
Status: status,
|
||||
StatusCode: code,
|
||||
Output: output,
|
||||
Timestamp: now.Unix(),
|
||||
Group: "group",
|
||||
Task: "task",
|
||||
Service: "service",
|
||||
Check: "check",
|
||||
}
|
||||
}
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
qc *QueryContext
|
||||
q *Query
|
||||
expResult *structs.CheckQueryResult
|
||||
}{{
|
||||
name: "200 healthiness",
|
||||
qc: makeQueryContext(),
|
||||
q: makeQuery(structs.Healthiness, "/"),
|
||||
expResult: makeExpResult(
|
||||
structs.Healthiness,
|
||||
structs.CheckSuccess,
|
||||
http.StatusOK,
|
||||
"nomad: http ok",
|
||||
),
|
||||
}, {
|
||||
name: "200 readiness",
|
||||
qc: makeQueryContext(),
|
||||
q: makeQuery(structs.Readiness, "/"),
|
||||
expResult: makeExpResult(
|
||||
structs.Readiness,
|
||||
structs.CheckSuccess,
|
||||
http.StatusOK,
|
||||
"nomad: http ok",
|
||||
),
|
||||
}, {
|
||||
name: "500 healthiness",
|
||||
qc: makeQueryContext(),
|
||||
q: makeQuery(structs.Healthiness, "fail"),
|
||||
expResult: makeExpResult(
|
||||
structs.Healthiness,
|
||||
structs.CheckFailure,
|
||||
http.StatusInternalServerError,
|
||||
"500 problem",
|
||||
),
|
||||
}, {
|
||||
name: "hang",
|
||||
qc: makeQueryContext(),
|
||||
q: makeQuery(structs.Healthiness, "hang"),
|
||||
expResult: makeExpResult(
|
||||
structs.Healthiness,
|
||||
structs.CheckFailure,
|
||||
0,
|
||||
fmt.Sprintf(`nomad: Get "%s/hang": context deadline exceeded`, ts.URL),
|
||||
),
|
||||
}, {
|
||||
name: "500 truncate",
|
||||
qc: makeQueryContext(),
|
||||
q: makeQuery(structs.Healthiness, "long-fail"),
|
||||
expResult: makeExpResult(
|
||||
structs.Healthiness,
|
||||
structs.CheckFailure,
|
||||
http.StatusInternalServerError,
|
||||
truncate,
|
||||
),
|
||||
}, {
|
||||
name: "201 truncate",
|
||||
qc: makeQueryContext(),
|
||||
q: makeQuery(structs.Healthiness, "long-not-fail"),
|
||||
expResult: makeExpResult(
|
||||
structs.Healthiness,
|
||||
structs.CheckSuccess,
|
||||
http.StatusCreated,
|
||||
truncate,
|
||||
),
|
||||
}}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
logger := testlog.HCLogger(t)
|
||||
|
||||
c := New(logger)
|
||||
c.(*checker).clock = clock
|
||||
|
||||
ctx := context.Background()
|
||||
result := c.Do(ctx, tc.qc, tc.q)
|
||||
must.Eq(t, tc.expResult, result)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// bigResponse creates a response payload larger than the maximum outputSizeLimit
|
||||
// as well as the same response but truncated to length of outputSizeLimit
|
||||
func bigResponse() (string, string) {
|
||||
size := outputSizeLimit + 5
|
||||
b := make([]byte, size, size)
|
||||
for i := 0; i < size; i++ {
|
||||
b[i] = 'a'
|
||||
}
|
||||
s := string(b)
|
||||
return s, s[:outputSizeLimit]
|
||||
}
|
||||
|
||||
func TestChecker_Do_TCP(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
// create a mock clock so we can assert time is set
|
||||
now := time.Date(2022, 1, 2, 3, 4, 5, 6, time.UTC)
|
||||
clock := libtimetest.NewClockMock(t).NowMock.Return(now)
|
||||
|
||||
makeQueryContext := func(address string, port int) *QueryContext {
|
||||
return &QueryContext{
|
||||
ID: "abc123",
|
||||
CustomAddress: address,
|
||||
ServicePortLabel: fmt.Sprintf("%d", port),
|
||||
Networks: nil,
|
||||
NetworkStatus: mock.NewNetworkStatus(address),
|
||||
Ports: nil,
|
||||
Group: "group",
|
||||
Task: "task",
|
||||
Service: "service",
|
||||
Check: "check",
|
||||
}
|
||||
}
|
||||
|
||||
makeQuery := func(
|
||||
kind structs.CheckMode,
|
||||
port int,
|
||||
) *Query {
|
||||
return &Query{
|
||||
Mode: kind,
|
||||
Type: "tcp",
|
||||
Timeout: 100 * time.Millisecond,
|
||||
AddressMode: "auto",
|
||||
PortLabel: fmt.Sprintf("%d", port),
|
||||
}
|
||||
}
|
||||
|
||||
makeExpResult := func(
|
||||
kind structs.CheckMode,
|
||||
status structs.CheckStatus,
|
||||
output string,
|
||||
) *structs.CheckQueryResult {
|
||||
return &structs.CheckQueryResult{
|
||||
ID: "abc123",
|
||||
Mode: kind,
|
||||
Status: status,
|
||||
Output: output,
|
||||
Timestamp: now.Unix(),
|
||||
Group: "group",
|
||||
Task: "task",
|
||||
Service: "service",
|
||||
Check: "check",
|
||||
}
|
||||
}
|
||||
|
||||
ports := freeport.MustTake(3)
|
||||
defer freeport.Return(ports)
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
qc *QueryContext
|
||||
q *Query
|
||||
tcpMode string // "ok", "off", "hang"
|
||||
tcpPort int
|
||||
expResult *structs.CheckQueryResult
|
||||
}{{
|
||||
name: "tcp ok",
|
||||
qc: makeQueryContext("localhost", ports[0]),
|
||||
q: makeQuery(structs.Healthiness, ports[0]),
|
||||
tcpMode: "ok",
|
||||
tcpPort: ports[0],
|
||||
expResult: makeExpResult(
|
||||
structs.Healthiness,
|
||||
structs.CheckSuccess,
|
||||
"nomad: tcp ok",
|
||||
),
|
||||
}, {
|
||||
name: "tcp not listening",
|
||||
qc: makeQueryContext("127.0.0.1", ports[1]),
|
||||
q: makeQuery(structs.Healthiness, ports[1]),
|
||||
tcpMode: "off",
|
||||
tcpPort: ports[1],
|
||||
expResult: makeExpResult(
|
||||
structs.Healthiness,
|
||||
structs.CheckFailure,
|
||||
fmt.Sprintf("dial tcp 127.0.0.1:%d: connect: connection refused", ports[1]),
|
||||
),
|
||||
}, {
|
||||
name: "tcp slow accept",
|
||||
qc: makeQueryContext("localhost", ports[2]),
|
||||
q: makeQuery(structs.Healthiness, ports[2]),
|
||||
tcpMode: "hang",
|
||||
tcpPort: ports[2],
|
||||
expResult: makeExpResult(
|
||||
structs.Healthiness,
|
||||
structs.CheckFailure,
|
||||
"dial tcp: lookup localhost: i/o timeout",
|
||||
),
|
||||
}}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
logger := testlog.HCLogger(t)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
c := New(logger)
|
||||
c.(*checker).clock = clock
|
||||
|
||||
switch tc.tcpMode {
|
||||
case "ok":
|
||||
// simulate tcp server by listening
|
||||
go tcpServer(ctx, tc.tcpPort)
|
||||
case "hang":
|
||||
// simulate tcp hang by setting an already expired context
|
||||
timeout, stop := context.WithDeadline(ctx, now.Add(-1*time.Second))
|
||||
defer stop()
|
||||
ctx = timeout
|
||||
case "off":
|
||||
// simulate tcp dead connection by not listening
|
||||
}
|
||||
|
||||
result := c.Do(ctx, tc.qc, tc.q)
|
||||
must.Eq(t, tc.expResult, result)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func tcpServer(ctx context.Context, port int) {
|
||||
var lc net.ListenConfig
|
||||
l, _ := lc.Listen(ctx, "tcp", net.JoinHostPort(
|
||||
"localhost", fmt.Sprintf("%d", port),
|
||||
))
|
||||
defer func() {
|
||||
_ = l.Close()
|
||||
}()
|
||||
con, _ := l.Accept()
|
||||
defer func() {
|
||||
_ = con.Close()
|
||||
}()
|
||||
}
|
|
@ -0,0 +1,102 @@
|
|||
package checks
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
|
||||
// GetCheckQuery extracts the needed info from c to actually execute the check.
|
||||
func GetCheckQuery(c *structs.ServiceCheck) *Query {
|
||||
var protocol = c.Protocol // ensure appropriate default
|
||||
if c.Type == "http" && protocol == "" {
|
||||
protocol = "http"
|
||||
}
|
||||
return &Query{
|
||||
Mode: structs.GetCheckMode(c),
|
||||
Type: c.Type,
|
||||
Timeout: c.Timeout,
|
||||
AddressMode: c.AddressMode,
|
||||
PortLabel: c.PortLabel,
|
||||
Path: c.Path,
|
||||
Method: c.Method,
|
||||
Protocol: protocol,
|
||||
}
|
||||
}
|
||||
|
||||
// A Query is derived from a ServiceCheck and contains the minimal
|
||||
// amount of information needed to actually execute that check.
|
||||
type Query struct {
|
||||
Mode structs.CheckMode // readiness or healthiness
|
||||
Type string // tcp or http
|
||||
|
||||
Timeout time.Duration // connection / request timeout
|
||||
|
||||
AddressMode string // host, driver, or alloc
|
||||
PortLabel string // label or value
|
||||
|
||||
Protocol string // http checks only (http or https)
|
||||
Path string // http checks only
|
||||
Method string // http checks only
|
||||
}
|
||||
|
||||
// A QueryContext contains allocation and service parameters necessary for
|
||||
// address resolution.
|
||||
type QueryContext struct {
|
||||
ID structs.CheckID
|
||||
CustomAddress string
|
||||
ServicePortLabel string
|
||||
Networks structs.Networks
|
||||
NetworkStatus structs.NetworkStatus
|
||||
Ports structs.AllocatedPorts
|
||||
|
||||
Group string
|
||||
Task string
|
||||
Service string
|
||||
Check string
|
||||
}
|
||||
|
||||
// Stub creates a temporary QueryResult for the check of ID in the Pending state
|
||||
// so we can represent the status of not being checked yet.
|
||||
func Stub(
|
||||
id structs.CheckID, kind structs.CheckMode, now int64,
|
||||
group, task, service, check string,
|
||||
) *structs.CheckQueryResult {
|
||||
return &structs.CheckQueryResult{
|
||||
ID: id,
|
||||
Mode: kind,
|
||||
Status: structs.CheckPending,
|
||||
Output: "nomad: waiting to run",
|
||||
Timestamp: now,
|
||||
Group: group,
|
||||
Task: task,
|
||||
Service: service,
|
||||
Check: check,
|
||||
}
|
||||
}
|
||||
|
||||
// AllocationResults is a view of the check_id -> latest result for group and task
|
||||
// checks in an allocation.
|
||||
type AllocationResults map[structs.CheckID]*structs.CheckQueryResult
|
||||
|
||||
// diff returns the set of IDs in ids that are not in m.
|
||||
func (m AllocationResults) diff(ids []structs.CheckID) []structs.CheckID {
|
||||
var missing []structs.CheckID
|
||||
for _, id := range ids {
|
||||
if _, exists := m[id]; !exists {
|
||||
missing = append(missing, id)
|
||||
}
|
||||
}
|
||||
return missing
|
||||
}
|
||||
|
||||
// ClientResults is a holistic view of alloc_id -> check_id -> latest result
|
||||
// group and task checks across all allocations on a client.
|
||||
type ClientResults map[string]AllocationResults
|
||||
|
||||
func (cr ClientResults) Insert(allocID string, result *structs.CheckQueryResult) {
|
||||
if _, exists := cr[allocID]; !exists {
|
||||
cr[allocID] = make(AllocationResults)
|
||||
}
|
||||
cr[allocID][result.ID] = result
|
||||
}
|
|
@ -0,0 +1,171 @@
|
|||
package checks
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
"github.com/shoenig/test/must"
|
||||
)
|
||||
|
||||
func TestChecks_GetCheckQuery(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
cType string
|
||||
protocol string
|
||||
onUpdate string
|
||||
expMode structs.CheckMode
|
||||
expProtocol string
|
||||
}{
|
||||
{
|
||||
name: "http check and http set",
|
||||
cType: "http",
|
||||
protocol: "http",
|
||||
onUpdate: "checks",
|
||||
expMode: structs.Healthiness,
|
||||
expProtocol: "http",
|
||||
},
|
||||
{
|
||||
name: "http check and https set",
|
||||
cType: "http",
|
||||
protocol: "https",
|
||||
onUpdate: "checks",
|
||||
expMode: structs.Healthiness,
|
||||
expProtocol: "https",
|
||||
},
|
||||
{
|
||||
name: "http check and protocol unset",
|
||||
cType: "http",
|
||||
protocol: "",
|
||||
onUpdate: "checks",
|
||||
expMode: structs.Healthiness,
|
||||
expProtocol: "http", // inherit default
|
||||
},
|
||||
{
|
||||
name: "tcp check and protocol unset",
|
||||
cType: "tcp",
|
||||
protocol: "",
|
||||
onUpdate: "checks",
|
||||
expMode: structs.Healthiness,
|
||||
expProtocol: "",
|
||||
},
|
||||
{
|
||||
name: "http check and http set",
|
||||
cType: "http",
|
||||
protocol: "http",
|
||||
onUpdate: "checks",
|
||||
expMode: structs.Healthiness,
|
||||
expProtocol: "http",
|
||||
},
|
||||
{
|
||||
name: "on-update ignore",
|
||||
cType: "http",
|
||||
protocol: "http",
|
||||
onUpdate: "ignore",
|
||||
expMode: structs.Readiness,
|
||||
expProtocol: "http",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
serviceCheck := &structs.ServiceCheck{
|
||||
Type: tc.cType,
|
||||
Path: "/",
|
||||
Protocol: tc.protocol,
|
||||
PortLabel: "web",
|
||||
AddressMode: "host",
|
||||
Interval: 10 * time.Second,
|
||||
Timeout: 2 * time.Second,
|
||||
Method: "GET",
|
||||
OnUpdate: tc.onUpdate,
|
||||
}
|
||||
query := GetCheckQuery(serviceCheck)
|
||||
must.Eq(t, tc.expMode, query.Mode)
|
||||
must.Eq(t, tc.expProtocol, query.Protocol)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestChecks_Stub(t *testing.T) {
|
||||
now := time.Date(2020, 1, 2, 3, 4, 5, 6, time.UTC).Unix()
|
||||
result := Stub(
|
||||
"abc123", // check id
|
||||
structs.Healthiness, // kind
|
||||
now, // timestamp
|
||||
"group", "task", "service", "check",
|
||||
)
|
||||
must.Eq(t, &structs.CheckQueryResult{
|
||||
ID: "abc123",
|
||||
Mode: structs.Healthiness,
|
||||
Status: structs.CheckPending,
|
||||
Output: "nomad: waiting to run",
|
||||
Timestamp: now,
|
||||
Group: "group",
|
||||
Task: "task",
|
||||
Service: "service",
|
||||
Check: "check",
|
||||
}, result)
|
||||
}
|
||||
|
||||
func TestChecks_AllocationResults_diff(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
results []structs.CheckID
|
||||
ids []structs.CheckID
|
||||
exp []structs.CheckID
|
||||
}{{
|
||||
name: "empty results and empty ids",
|
||||
results: nil,
|
||||
ids: nil,
|
||||
exp: nil,
|
||||
}, {
|
||||
name: "empty results and nonempty ids",
|
||||
results: nil,
|
||||
ids: []structs.CheckID{"aaa", "bbb"},
|
||||
exp: []structs.CheckID{"aaa", "bbb"},
|
||||
}, {
|
||||
name: "nonempty results and empty ids",
|
||||
results: []structs.CheckID{"aaa", "bbb"},
|
||||
ids: nil,
|
||||
exp: nil,
|
||||
}, {
|
||||
name: "mix",
|
||||
results: []structs.CheckID{"aaa", "ccc", "ddd", "fff"},
|
||||
ids: []structs.CheckID{"bbb", "ccc", "fff", "eee"},
|
||||
exp: []structs.CheckID{"bbb", "eee"},
|
||||
}}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
ar := make(AllocationResults)
|
||||
for _, id := range tc.results {
|
||||
ar[id] = nil
|
||||
}
|
||||
result := ar.diff(tc.ids)
|
||||
must.Eq(t, tc.exp, result)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestChecks_ClientResults_Insert(t *testing.T) {
|
||||
cr := make(ClientResults)
|
||||
cr.Insert("alloc1", &structs.CheckQueryResult{ID: "qr1", Check: "c1"})
|
||||
cr.Insert("alloc2", &structs.CheckQueryResult{ID: "qr2", Check: "c2"})
|
||||
cr.Insert("alloc3", &structs.CheckQueryResult{ID: "qr3", Check: "c3"})
|
||||
cr.Insert("alloc2", &structs.CheckQueryResult{ID: "qr2", Check: "c4"}) // overwrite
|
||||
cr.Insert("alloc3", &structs.CheckQueryResult{ID: "qr4", Check: "c5"})
|
||||
exp := ClientResults{
|
||||
"alloc1": {
|
||||
"qr1": &structs.CheckQueryResult{ID: "qr1", Check: "c1"},
|
||||
},
|
||||
"alloc2": {
|
||||
"qr2": &structs.CheckQueryResult{ID: "qr2", Check: "c4"},
|
||||
},
|
||||
"alloc3": {
|
||||
"qr3": &structs.CheckQueryResult{ID: "qr3", Check: "c3"},
|
||||
"qr4": &structs.CheckQueryResult{ID: "qr4", Check: "c5"},
|
||||
},
|
||||
}
|
||||
must.Eq(t, exp, cr)
|
||||
}
|
|
@ -108,6 +108,7 @@ func (a *AllocRegistration) NumChecks() int {
|
|||
// ServiceRegistrations holds the status of services registered for a
|
||||
// particular task or task group.
|
||||
type ServiceRegistrations struct {
|
||||
// Services maps service_id -> service registration
|
||||
Services map[string]*ServiceRegistration
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package state
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
@ -11,6 +12,7 @@ import (
|
|||
dmstate "github.com/hashicorp/nomad/client/devicemanager/state"
|
||||
"github.com/hashicorp/nomad/client/dynamicplugins"
|
||||
driverstate "github.com/hashicorp/nomad/client/pluginmanager/drivermanager/state"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/checks"
|
||||
"github.com/hashicorp/nomad/helper/boltdd"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
"go.etcd.io/bbolt"
|
||||
|
@ -29,7 +31,9 @@ allocations/
|
|||
|--> network_status -> networkStatusEntry{*structs.AllocNetworkStatus}
|
||||
|--> task-<name>/
|
||||
|--> local_state -> *trstate.LocalState # Local-only state
|
||||
|--> task_state -> *structs.TaskState # Sync'd to servers
|
||||
|--> task_state -> *structs.TaskState # Syncs to servers
|
||||
|--> checks/
|
||||
|--> check-<id> -> *structs.CheckState # Syncs to servers
|
||||
|
||||
devicemanager/
|
||||
|--> plugin_state -> *dmstate.PluginState
|
||||
|
@ -73,6 +77,9 @@ var (
|
|||
// stored under
|
||||
allocNetworkStatusKey = []byte("network_status")
|
||||
|
||||
// checkResultsBucket is the bucket name in which check query results are stored
|
||||
checkResultsBucket = []byte("check_results")
|
||||
|
||||
// allocations -> $allocid -> task-$taskname -> the keys below
|
||||
taskLocalStateKey = []byte("local_state")
|
||||
taskStateKey = []byte("task_state")
|
||||
|
@ -176,6 +183,7 @@ func (s *BoltStateDB) Name() string {
|
|||
func (s *BoltStateDB) GetAllAllocations() ([]*structs.Allocation, map[string]error, error) {
|
||||
var allocs []*structs.Allocation
|
||||
var errs map[string]error
|
||||
|
||||
err := s.db.View(func(tx *boltdd.Tx) error {
|
||||
allocs, errs = s.getAllAllocations(tx)
|
||||
return nil
|
||||
|
@ -195,7 +203,7 @@ type allocEntry struct {
|
|||
}
|
||||
|
||||
func (s *BoltStateDB) getAllAllocations(tx *boltdd.Tx) ([]*structs.Allocation, map[string]error) {
|
||||
allocs := []*structs.Allocation{}
|
||||
allocs := make([]*structs.Allocation, 0, 10)
|
||||
errs := map[string]error{}
|
||||
|
||||
allocationsBkt := tx.Bucket(allocationsBucketName)
|
||||
|
@ -717,6 +725,71 @@ func (s *BoltStateDB) GetDynamicPluginRegistryState() (*dynamicplugins.RegistryS
|
|||
return ps, nil
|
||||
}
|
||||
|
||||
func keyForCheck(allocID string, checkID structs.CheckID) []byte {
|
||||
return []byte(fmt.Sprintf("%s_%s", allocID, checkID))
|
||||
}
|
||||
|
||||
// PutCheckResult puts qr into the state store.
|
||||
func (s *BoltStateDB) PutCheckResult(allocID string, qr *structs.CheckQueryResult) error {
|
||||
return s.db.Update(func(tx *boltdd.Tx) error {
|
||||
bkt, err := tx.CreateBucketIfNotExists(checkResultsBucket)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
key := keyForCheck(allocID, qr.ID)
|
||||
return bkt.Put(key, qr)
|
||||
})
|
||||
}
|
||||
|
||||
// GetCheckResults gets the check results associated with allocID from the state store.
|
||||
func (s *BoltStateDB) GetCheckResults() (checks.ClientResults, error) {
|
||||
m := make(checks.ClientResults)
|
||||
|
||||
err := s.db.View(func(tx *boltdd.Tx) error {
|
||||
bkt := tx.Bucket(checkResultsBucket)
|
||||
if bkt == nil {
|
||||
return nil // nothing set yet
|
||||
}
|
||||
|
||||
if err := boltdd.Iterate(bkt, nil, func(key []byte, qr structs.CheckQueryResult) {
|
||||
parts := bytes.SplitN(key, []byte("_"), 2)
|
||||
allocID, _ := parts[0], parts[1]
|
||||
m.Insert(string(allocID), &qr)
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
})
|
||||
return m, err
|
||||
}
|
||||
|
||||
func (s *BoltStateDB) DeleteCheckResults(allocID string, checkIDs []structs.CheckID) error {
|
||||
return s.db.Update(func(tx *boltdd.Tx) error {
|
||||
bkt := tx.Bucket(checkResultsBucket)
|
||||
if bkt == nil {
|
||||
return nil // nothing set yet
|
||||
}
|
||||
|
||||
for _, id := range checkIDs {
|
||||
key := keyForCheck(allocID, id)
|
||||
if err := bkt.Delete(key); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
func (s *BoltStateDB) PurgeCheckResults(allocID string) error {
|
||||
return s.db.Update(func(tx *boltdd.Tx) error {
|
||||
bkt := tx.Bucket(checkResultsBucket)
|
||||
if bkt == nil {
|
||||
return nil // nothing set yet
|
||||
}
|
||||
return bkt.DeletePrefix([]byte(allocID + "_"))
|
||||
})
|
||||
}
|
||||
|
||||
// init initializes metadata entries in a newly created state database.
|
||||
func (s *BoltStateDB) init() error {
|
||||
return s.db.Update(func(tx *boltdd.Tx) error {
|
|
@ -7,6 +7,7 @@ import (
|
|||
dmstate "github.com/hashicorp/nomad/client/devicemanager/state"
|
||||
"github.com/hashicorp/nomad/client/dynamicplugins"
|
||||
driverstate "github.com/hashicorp/nomad/client/pluginmanager/drivermanager/state"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/checks"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
|
||||
|
@ -80,8 +81,6 @@ func (m *ErrDB) PutDynamicPluginRegistryState(state *dynamicplugins.RegistryStat
|
|||
return fmt.Errorf("Error!")
|
||||
}
|
||||
|
||||
// GetDevicePluginState stores the device manager's plugin state or returns an
|
||||
// error.
|
||||
func (m *ErrDB) GetDevicePluginState() (*dmstate.PluginState, error) {
|
||||
return nil, fmt.Errorf("Error!")
|
||||
}
|
||||
|
@ -94,9 +93,22 @@ func (m *ErrDB) PutDriverPluginState(ps *driverstate.PluginState) error {
|
|||
return fmt.Errorf("Error!")
|
||||
}
|
||||
|
||||
func (m *ErrDB) Close() error {
|
||||
func (m *ErrDB) PutCheckResult(allocID string, qr *structs.CheckQueryResult) error {
|
||||
return fmt.Errorf("Error!")
|
||||
}
|
||||
|
||||
// Ensure *ErrDB implements StateDB
|
||||
var _ StateDB = (*ErrDB)(nil)
|
||||
func (m *ErrDB) GetCheckResults() (checks.ClientResults, error) {
|
||||
return nil, fmt.Errorf("Error!")
|
||||
}
|
||||
|
||||
func (m *ErrDB) DeleteCheckResults(allocID string, checkIDs []structs.CheckID) error {
|
||||
return fmt.Errorf("Error!")
|
||||
}
|
||||
|
||||
func (m *ErrDB) PurgeCheckResults(allocID string) error {
|
||||
return fmt.Errorf("Error!")
|
||||
}
|
||||
|
||||
func (m *ErrDB) Close() error {
|
||||
return fmt.Errorf("Error!")
|
||||
}
|
|
@ -3,11 +3,13 @@ package state
|
|||
import (
|
||||
"sync"
|
||||
|
||||
hclog "github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/nomad/client/allocrunner/taskrunner/state"
|
||||
dmstate "github.com/hashicorp/nomad/client/devicemanager/state"
|
||||
"github.com/hashicorp/nomad/client/dynamicplugins"
|
||||
driverstate "github.com/hashicorp/nomad/client/pluginmanager/drivermanager/state"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/checks"
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
|
||||
|
@ -27,6 +29,9 @@ type MemDB struct {
|
|||
localTaskState map[string]map[string]*state.LocalState
|
||||
taskState map[string]map[string]*structs.TaskState
|
||||
|
||||
// alloc_id -> check_id -> result
|
||||
checks checks.ClientResults
|
||||
|
||||
// devicemanager -> plugin-state
|
||||
devManagerPs *dmstate.PluginState
|
||||
|
||||
|
@ -49,6 +54,7 @@ func NewMemDB(logger hclog.Logger) *MemDB {
|
|||
networkStatus: make(map[string]*structs.AllocNetworkStatus),
|
||||
localTaskState: make(map[string]map[string]*state.LocalState),
|
||||
taskState: make(map[string]map[string]*structs.TaskState),
|
||||
checks: make(checks.ClientResults),
|
||||
logger: logger,
|
||||
}
|
||||
}
|
||||
|
@ -73,7 +79,7 @@ func (m *MemDB) GetAllAllocations() ([]*structs.Allocation, map[string]error, er
|
|||
return allocs, map[string]error{}, nil
|
||||
}
|
||||
|
||||
func (m *MemDB) PutAllocation(alloc *structs.Allocation, opts ...WriteOption) error {
|
||||
func (m *MemDB) PutAllocation(alloc *structs.Allocation, _ ...WriteOption) error {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
m.allocs[alloc.ID] = alloc
|
||||
|
@ -99,7 +105,7 @@ func (m *MemDB) GetNetworkStatus(allocID string) (*structs.AllocNetworkStatus, e
|
|||
return m.networkStatus[allocID], nil
|
||||
}
|
||||
|
||||
func (m *MemDB) PutNetworkStatus(allocID string, ns *structs.AllocNetworkStatus, opts ...WriteOption) error {
|
||||
func (m *MemDB) PutNetworkStatus(allocID string, ns *structs.AllocNetworkStatus, _ ...WriteOption) error {
|
||||
m.mu.Lock()
|
||||
m.networkStatus[allocID] = ns
|
||||
defer m.mu.Unlock()
|
||||
|
@ -175,7 +181,7 @@ func (m *MemDB) DeleteTaskBucket(allocID, taskName string) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (m *MemDB) DeleteAllocationBucket(allocID string, opts ...WriteOption) error {
|
||||
func (m *MemDB) DeleteAllocationBucket(allocID string, _ ...WriteOption) error {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
|
@ -227,6 +233,40 @@ func (m *MemDB) PutDynamicPluginRegistryState(ps *dynamicplugins.RegistryState)
|
|||
return nil
|
||||
}
|
||||
|
||||
func (m *MemDB) PutCheckResult(allocID string, qr *structs.CheckQueryResult) error {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
if _, exists := m.checks[allocID]; !exists {
|
||||
m.checks[allocID] = make(checks.AllocationResults)
|
||||
}
|
||||
|
||||
m.checks[allocID][qr.ID] = qr
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MemDB) GetCheckResults() (checks.ClientResults, error) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
return helper.CopyMap(m.checks), nil
|
||||
}
|
||||
|
||||
func (m *MemDB) DeleteCheckResults(allocID string, checkIDs []structs.CheckID) error {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
for _, id := range checkIDs {
|
||||
delete(m.checks[allocID], id)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MemDB) PurgeCheckResults(allocID string) error {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
delete(m.checks, allocID)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MemDB) Close() error {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
|
@ -5,6 +5,7 @@ import (
|
|||
dmstate "github.com/hashicorp/nomad/client/devicemanager/state"
|
||||
"github.com/hashicorp/nomad/client/dynamicplugins"
|
||||
driverstate "github.com/hashicorp/nomad/client/pluginmanager/drivermanager/state"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/checks"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
|
||||
|
@ -87,6 +88,22 @@ func (n NoopDB) GetDynamicPluginRegistryState() (*dynamicplugins.RegistryState,
|
|||
return nil, nil
|
||||
}
|
||||
|
||||
func (n NoopDB) PutCheckResult(allocID string, qr *structs.CheckQueryResult) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (n NoopDB) GetCheckResults() (checks.ClientResults, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (n NoopDB) DeleteCheckResults(allocID string, checkIDs []structs.CheckID) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (n NoopDB) PurgeCheckResults(allocID string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (n NoopDB) Close() error {
|
||||
return nil
|
||||
}
|
|
@ -16,23 +16,32 @@ import (
|
|||
"github.com/hashicorp/nomad/nomad/mock"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
"github.com/kr/pretty"
|
||||
"github.com/shoenig/test/must"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// assert each implementation satisfies StateDB interface
|
||||
var (
|
||||
_ StateDB = (*BoltStateDB)(nil)
|
||||
_ StateDB = (*MemDB)(nil)
|
||||
_ StateDB = (*NoopDB)(nil)
|
||||
_ StateDB = (*ErrDB)(nil)
|
||||
)
|
||||
|
||||
func setupBoltStateDB(t *testing.T) *BoltStateDB {
|
||||
dir := t.TempDir()
|
||||
|
||||
db, err := NewBoltStateDB(testlog.HCLogger(t), dir)
|
||||
if err != nil {
|
||||
if err := os.RemoveAll(dir); err != nil {
|
||||
t.Logf("error removing boltdb dir: %v", err)
|
||||
if rmErr := os.RemoveAll(dir); rmErr != nil {
|
||||
t.Logf("error removing boltdb dir: %v", rmErr)
|
||||
}
|
||||
t.Fatalf("error creating boltdb: %v", err)
|
||||
}
|
||||
|
||||
t.Cleanup(func() {
|
||||
if err := db.Close(); err != nil {
|
||||
t.Errorf("error closing boltdb: %v", err)
|
||||
if closeErr := db.Close(); closeErr != nil {
|
||||
t.Errorf("error closing boltdb: %v", closeErr)
|
||||
}
|
||||
})
|
||||
|
||||
|
@ -40,14 +49,12 @@ func setupBoltStateDB(t *testing.T) *BoltStateDB {
|
|||
}
|
||||
|
||||
func testDB(t *testing.T, f func(*testing.T, StateDB)) {
|
||||
boltdb := setupBoltStateDB(t)
|
||||
dbs := []StateDB{
|
||||
setupBoltStateDB(t),
|
||||
NewMemDB(testlog.HCLogger(t)),
|
||||
}
|
||||
|
||||
memdb := NewMemDB(testlog.HCLogger(t))
|
||||
|
||||
impls := []StateDB{boltdb, memdb}
|
||||
|
||||
for _, db := range impls {
|
||||
db := db
|
||||
for _, db := range dbs {
|
||||
t.Run(db.Name(), func(t *testing.T) {
|
||||
f(t, db)
|
||||
})
|
||||
|
@ -374,6 +381,79 @@ func TestStateDB_DynamicRegistry(t *testing.T) {
|
|||
})
|
||||
}
|
||||
|
||||
func TestStateDB_CheckResult_keyForCheck(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
allocID := "alloc1"
|
||||
checkID := structs.CheckID("id1")
|
||||
result := keyForCheck(allocID, checkID)
|
||||
exp := allocID + "_" + string(checkID)
|
||||
must.Eq(t, exp, string(result))
|
||||
}
|
||||
|
||||
func TestStateDB_CheckResult(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
qr := func(id string) *structs.CheckQueryResult {
|
||||
return &structs.CheckQueryResult{
|
||||
ID: structs.CheckID(id),
|
||||
Mode: "healthiness",
|
||||
Status: "passing",
|
||||
Output: "nomad: tcp ok",
|
||||
Timestamp: 1,
|
||||
Group: "group",
|
||||
Task: "task",
|
||||
Service: "service",
|
||||
Check: "check",
|
||||
}
|
||||
}
|
||||
|
||||
testDB(t, func(t *testing.T, db StateDB) {
|
||||
t.Run("put and get", func(t *testing.T) {
|
||||
err := db.PutCheckResult("alloc1", qr("abc123"))
|
||||
must.NoError(t, err)
|
||||
results, err := db.GetCheckResults()
|
||||
must.NoError(t, err)
|
||||
must.MapContainsKeys(t, results, []string{"alloc1"})
|
||||
must.MapContainsKeys(t, results["alloc1"], []structs.CheckID{"abc123"})
|
||||
})
|
||||
})
|
||||
|
||||
testDB(t, func(t *testing.T, db StateDB) {
|
||||
t.Run("delete", func(t *testing.T) {
|
||||
must.NoError(t, db.PutCheckResult("alloc1", qr("id1")))
|
||||
must.NoError(t, db.PutCheckResult("alloc1", qr("id2")))
|
||||
must.NoError(t, db.PutCheckResult("alloc1", qr("id3")))
|
||||
must.NoError(t, db.PutCheckResult("alloc1", qr("id4")))
|
||||
must.NoError(t, db.PutCheckResult("alloc2", qr("id5")))
|
||||
err := db.DeleteCheckResults("alloc1", []structs.CheckID{"id2", "id3"})
|
||||
must.NoError(t, err)
|
||||
results, err := db.GetCheckResults()
|
||||
must.NoError(t, err)
|
||||
must.MapContainsKeys(t, results, []string{"alloc1", "alloc2"})
|
||||
must.MapContainsKeys(t, results["alloc1"], []structs.CheckID{"id1", "id4"})
|
||||
must.MapContainsKeys(t, results["alloc2"], []structs.CheckID{"id5"})
|
||||
})
|
||||
})
|
||||
|
||||
testDB(t, func(t *testing.T, db StateDB) {
|
||||
t.Run("purge", func(t *testing.T) {
|
||||
must.NoError(t, db.PutCheckResult("alloc1", qr("id1")))
|
||||
must.NoError(t, db.PutCheckResult("alloc1", qr("id2")))
|
||||
must.NoError(t, db.PutCheckResult("alloc1", qr("id3")))
|
||||
must.NoError(t, db.PutCheckResult("alloc1", qr("id4")))
|
||||
must.NoError(t, db.PutCheckResult("alloc2", qr("id5")))
|
||||
err := db.PurgeCheckResults("alloc1")
|
||||
must.NoError(t, err)
|
||||
results, err := db.GetCheckResults()
|
||||
must.NoError(t, err)
|
||||
must.MapContainsKeys(t, results, []string{"alloc2"})
|
||||
must.MapContainsKeys(t, results["alloc2"], []structs.CheckID{"id5"})
|
||||
})
|
||||
})
|
||||
|
||||
}
|
||||
|
||||
// TestStateDB_Upgrade asserts calling Upgrade on new databases always
|
||||
// succeeds.
|
||||
func TestStateDB_Upgrade(t *testing.T) {
|
||||
|
|
|
@ -5,6 +5,7 @@ import (
|
|||
dmstate "github.com/hashicorp/nomad/client/devicemanager/state"
|
||||
"github.com/hashicorp/nomad/client/dynamicplugins"
|
||||
driverstate "github.com/hashicorp/nomad/client/pluginmanager/drivermanager/state"
|
||||
"github.com/hashicorp/nomad/client/serviceregistration/checks"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
|
||||
|
@ -28,14 +29,16 @@ type StateDB interface {
|
|||
// not be stored.
|
||||
PutAllocation(*structs.Allocation, ...WriteOption) error
|
||||
|
||||
// Get/Put DeploymentStatus get and put the allocation's deployment
|
||||
// status. It may be nil.
|
||||
// GetDeploymentStatus gets the allocation's deployment status. It may be nil.
|
||||
GetDeploymentStatus(allocID string) (*structs.AllocDeploymentStatus, error)
|
||||
|
||||
// PutDeploymentStatus sets the allocation's deployment status. It may be nil.
|
||||
PutDeploymentStatus(allocID string, ds *structs.AllocDeploymentStatus) error
|
||||
|
||||
// Get/Put NetworkStatus get and put the allocation's network
|
||||
// status. It may be nil.
|
||||
// GetNetworkStatus gets the allocation's network status. It may be nil.
|
||||
GetNetworkStatus(allocID string) (*structs.AllocNetworkStatus, error)
|
||||
|
||||
// PutNetworkStatus puts the allocation's network status. It may be nil.
|
||||
PutNetworkStatus(allocID string, ns *structs.AllocNetworkStatus, opts ...WriteOption) error
|
||||
|
||||
// GetTaskRunnerState returns the LocalState and TaskState for a
|
||||
|
@ -43,7 +46,7 @@ type StateDB interface {
|
|||
// error is encountered only the error will be non-nil.
|
||||
GetTaskRunnerState(allocID, taskName string) (*state.LocalState, *structs.TaskState, error)
|
||||
|
||||
// PutTaskRunnerLocalTask stores the LocalState for a TaskRunner or
|
||||
// PutTaskRunnerLocalState stores the LocalState for a TaskRunner or
|
||||
// returns an error.
|
||||
PutTaskRunnerLocalState(allocID, taskName string, val *state.LocalState) error
|
||||
|
||||
|
@ -81,6 +84,18 @@ type StateDB interface {
|
|||
// PutDynamicPluginRegistryState is used to store the dynamic plugin manager's state.
|
||||
PutDynamicPluginRegistryState(state *dynamicplugins.RegistryState) error
|
||||
|
||||
// PutCheckResult sets the query result for the check implied in qr.
|
||||
PutCheckResult(allocID string, qr *structs.CheckQueryResult) error
|
||||
|
||||
// DeleteCheckResults removes the given set of check results.
|
||||
DeleteCheckResults(allocID string, checkIDs []structs.CheckID) error
|
||||
|
||||
// PurgeCheckResults removes all check results of the given allocation.
|
||||
PurgeCheckResults(allocID string) error
|
||||
|
||||
// GetCheckResults is used to restore the set of check results on this Client.
|
||||
GetCheckResults() (checks.ClientResults, error)
|
||||
|
||||
// Close the database. Unsafe for further use after calling regardless
|
||||
// of return value.
|
||||
Close() error
|
||||
|
|
|
@ -4,7 +4,7 @@ import (
|
|||
"errors"
|
||||
"sync"
|
||||
|
||||
hclog "github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
|
||||
|
@ -32,7 +32,7 @@ type AllocBroadcaster struct {
|
|||
// nextId is the next id to assign in listener map
|
||||
nextId int
|
||||
|
||||
// closed is true if broadcsater is closed
|
||||
// closed is true if broadcaster is closed.
|
||||
closed bool
|
||||
|
||||
// last alloc sent to prime new listeners
|
||||
|
|
|
@ -182,6 +182,20 @@ type AllocExecRequest struct {
|
|||
structs.QueryOptions
|
||||
}
|
||||
|
||||
// AllocChecksRequest is used to request the latest nomad service discovery
|
||||
// check status information of a given allocation.
|
||||
type AllocChecksRequest struct {
|
||||
structs.QueryOptions
|
||||
AllocID string
|
||||
}
|
||||
|
||||
// AllocChecksResponse is used to return the latest nomad service discovery
|
||||
// check status information of a given allocation.
|
||||
type AllocChecksResponse struct {
|
||||
structs.QueryMeta
|
||||
Results map[structs.CheckID]*structs.CheckQueryResult
|
||||
}
|
||||
|
||||
// AllocStatsRequest is used to request the resource usage of a given
|
||||
// allocation, potentially filtering by task
|
||||
type AllocStatsRequest struct {
|
||||
|
|
|
@ -84,6 +84,8 @@ func (s *HTTPServer) AllocSpecificRequest(resp http.ResponseWriter, req *http.Re
|
|||
}
|
||||
|
||||
switch tokens[1] {
|
||||
case "checks":
|
||||
return s.allocChecks(allocID, resp, req)
|
||||
case "stop":
|
||||
return s.allocStop(allocID, resp, req)
|
||||
case "services":
|
||||
|
@ -213,6 +215,8 @@ func (s *HTTPServer) ClientAllocRequest(resp http.ResponseWriter, req *http.Requ
|
|||
}
|
||||
allocID := tokens[0]
|
||||
switch tokens[1] {
|
||||
case "checks":
|
||||
return s.allocChecks(allocID, resp, req)
|
||||
case "stats":
|
||||
return s.allocStats(allocID, resp, req)
|
||||
case "exec":
|
||||
|
@ -436,6 +440,39 @@ func (s *HTTPServer) allocStats(allocID string, resp http.ResponseWriter, req *h
|
|||
return reply.Stats, rpcErr
|
||||
}
|
||||
|
||||
func (s *HTTPServer) allocChecks(allocID string, resp http.ResponseWriter, req *http.Request) (any, error) {
|
||||
// Build the request and parse the ACL token
|
||||
args := cstructs.AllocChecksRequest{
|
||||
AllocID: allocID,
|
||||
}
|
||||
s.parse(resp, req, &args.QueryOptions.Region, &args.QueryOptions)
|
||||
|
||||
// Determine the handler to use
|
||||
useLocalClient, useClientRPC, useServerRPC := s.rpcHandlerForAlloc(allocID)
|
||||
|
||||
// Make the RPC
|
||||
var reply cstructs.AllocChecksResponse
|
||||
var rpcErr error
|
||||
switch {
|
||||
case useLocalClient:
|
||||
rpcErr = s.agent.Client().ClientRPC("Allocations.Checks", &args, &reply)
|
||||
case useClientRPC:
|
||||
rpcErr = s.agent.Client().RPC("Allocations.Checks", &args, &reply)
|
||||
case useServerRPC:
|
||||
rpcErr = s.agent.Server().RPC("Allocations.Checks", &args, &reply)
|
||||
default:
|
||||
rpcErr = CodedError(400, "No local Node and node_id not provided")
|
||||
}
|
||||
|
||||
if rpcErr != nil {
|
||||
if structs.IsErrNoNodeConn(rpcErr) || structs.IsErrUnknownAllocation(rpcErr) {
|
||||
rpcErr = CodedError(404, rpcErr.Error())
|
||||
}
|
||||
}
|
||||
|
||||
return reply.Results, rpcErr
|
||||
}
|
||||
|
||||
func (s *HTTPServer) allocExec(allocID string, resp http.ResponseWriter, req *http.Request) (interface{}, error) {
|
||||
// Build the request and parse the ACL token
|
||||
task := req.URL.Query().Get("task")
|
||||
|
|
|
@ -87,7 +87,7 @@ func commandAssetsConnectShortNomad() (*asset, error) {
|
|||
return nil, err
|
||||
}
|
||||
|
||||
info := bindataFileInfo{name: "command/assets/connect-short.nomad", size: 1071, mode: os.FileMode(420), modTime: time.Unix(1654712611, 0)}
|
||||
info := bindataFileInfo{name: "command/assets/connect-short.nomad", size: 1071, mode: os.FileMode(436), modTime: time.Unix(1656778128, 0)}
|
||||
a := &asset{bytes: bytes, info: info}
|
||||
return a, nil
|
||||
}
|
||||
|
@ -107,7 +107,7 @@ func commandAssetsConnectNomad() (*asset, error) {
|
|||
return nil, err
|
||||
}
|
||||
|
||||
info := bindataFileInfo{name: "command/assets/connect.nomad", size: 18126, mode: os.FileMode(420), modTime: time.Unix(1654712611, 0)}
|
||||
info := bindataFileInfo{name: "command/assets/connect.nomad", size: 18126, mode: os.FileMode(436), modTime: time.Unix(1656778128, 0)}
|
||||
a := &asset{bytes: bytes, info: info}
|
||||
return a, nil
|
||||
}
|
||||
|
@ -127,7 +127,7 @@ func commandAssetsExampleShortNomad() (*asset, error) {
|
|||
return nil, err
|
||||
}
|
||||
|
||||
info := bindataFileInfo{name: "command/assets/example-short.nomad", size: 369, mode: os.FileMode(420), modTime: time.Unix(1654554902, 0)}
|
||||
info := bindataFileInfo{name: "command/assets/example-short.nomad", size: 369, mode: os.FileMode(436), modTime: time.Unix(1656778128, 0)}
|
||||
a := &asset{bytes: bytes, info: info}
|
||||
return a, nil
|
||||
}
|
||||
|
@ -147,7 +147,7 @@ func commandAssetsExampleNomad() (*asset, error) {
|
|||
return nil, err
|
||||
}
|
||||
|
||||
info := bindataFileInfo{name: "command/assets/example.nomad", size: 16351, mode: os.FileMode(420), modTime: time.Unix(1654554902, 0)}
|
||||
info := bindataFileInfo{name: "command/assets/example.nomad", size: 16351, mode: os.FileMode(436), modTime: time.Unix(1656778128, 0)}
|
||||
a := &asset{bytes: bytes, info: info}
|
||||
return a, nil
|
||||
}
|
||||
|
|
|
@ -356,6 +356,8 @@ func (b *Bucket) Get(key []byte, obj interface{}) error {
|
|||
// Iterate iterates each key in Bucket b that starts with prefix. fn is called on
|
||||
// the key and msg-pack decoded value. If prefix is empty or nil, all keys in the
|
||||
// bucket are iterated.
|
||||
//
|
||||
// b must already exist.
|
||||
func Iterate[T any](b *Bucket, prefix []byte, fn func([]byte, T)) error {
|
||||
c := b.boltBucket.Cursor()
|
||||
for k, data := c.Seek(prefix); k != nil && bytes.HasPrefix(k, prefix); k, data = c.Next() {
|
||||
|
@ -371,6 +373,8 @@ func Iterate[T any](b *Bucket, prefix []byte, fn func([]byte, T)) error {
|
|||
// DeletePrefix removes all keys starting with prefix from the bucket. If no keys
|
||||
// with prefix exist then nothing is done and a nil error is returned. Returns an
|
||||
// error if the bucket was created from a read-only transaction.
|
||||
//
|
||||
// b must already exist.
|
||||
func (b *Bucket) DeletePrefix(prefix []byte) error {
|
||||
c := b.boltBucket.Cursor()
|
||||
for k, _ := c.Seek(prefix); k != nil && bytes.HasPrefix(k, prefix); k, _ = c.Next() {
|
||||
|
|
|
@ -245,6 +245,8 @@ func SliceStringIsSubset(larger, smaller []string) (bool, []string) {
|
|||
}
|
||||
|
||||
// SliceStringContains returns whether item exists at least once in list.
|
||||
//
|
||||
// Deprecated; use slices.Contains instead.
|
||||
func SliceStringContains(list []string, item string) bool {
|
||||
for _, s := range list {
|
||||
if s == item {
|
||||
|
|
|
@ -10,14 +10,13 @@ import (
|
|||
metrics "github.com/armon/go-metrics"
|
||||
log "github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/go-msgpack/codec"
|
||||
"github.com/hashicorp/nomad/acl"
|
||||
cstructs "github.com/hashicorp/nomad/client/structs"
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
|
||||
"github.com/hashicorp/nomad/acl"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
|
||||
// ClientAllocations is used to forward RPC requests to the targed Nomad client's
|
||||
// ClientAllocations is used to forward RPC requests to the targeted Nomad client's
|
||||
// Allocation endpoint.
|
||||
type ClientAllocations struct {
|
||||
srv *Server
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
package mock
|
||||
|
||||
import (
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
|
||||
// CheckShim is a mock implementation of checkstore.Shim
|
||||
//
|
||||
// So far the implementation does nothing.
|
||||
type CheckShim struct{}
|
||||
|
||||
func (s *CheckShim) Set(allocID string, result *structs.CheckQueryResult) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *CheckShim) List(allocID string) map[structs.CheckID]*structs.CheckQueryResult {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *CheckShim) Difference(allocID string, ids []structs.CheckID) []structs.CheckID {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *CheckShim) Remove(allocID string, ids []structs.CheckID) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *CheckShim) Purge(allocID string) error {
|
||||
return nil
|
||||
}
|
|
@ -0,0 +1,19 @@
|
|||
package mock
|
||||
|
||||
import (
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
|
||||
// NetworkStatus is a mock implementation of structs.NetworkStatus
|
||||
type NetworkStatus struct {
|
||||
address string
|
||||
}
|
||||
|
||||
// NewNetworkStatus creates a mock NetworkStatus based on address.
|
||||
func NewNetworkStatus(address string) structs.NetworkStatus {
|
||||
return &NetworkStatus{address: address}
|
||||
}
|
||||
|
||||
func (ns *NetworkStatus) NetworkStatus() *structs.AllocNetworkStatus {
|
||||
return &structs.AllocNetworkStatus{Address: ns.address}
|
||||
}
|
|
@ -0,0 +1,104 @@
|
|||
package structs
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/nomad/ci"
|
||||
"github.com/shoenig/test/must"
|
||||
)
|
||||
|
||||
func TestChecks_NomadCheckID(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
orig := ServiceCheck{
|
||||
Name: "c1",
|
||||
Type: "http",
|
||||
Path: "/health",
|
||||
Protocol: "https",
|
||||
PortLabel: "web",
|
||||
AddressMode: "host",
|
||||
Interval: 1 * time.Minute,
|
||||
Timeout: 10 * time.Second,
|
||||
Method: "GET",
|
||||
TaskName: "t1",
|
||||
OnUpdate: "ignore",
|
||||
}
|
||||
|
||||
different := func(a, b ServiceCheck) bool {
|
||||
idA := NomadCheckID("id", "group", &a)
|
||||
idB := NomadCheckID("id", "group", &b)
|
||||
return idA != idB
|
||||
}
|
||||
|
||||
t.Run("same", func(t *testing.T) {
|
||||
c := orig
|
||||
must.False(t, different(orig, c))
|
||||
})
|
||||
|
||||
t.Run("different name", func(t *testing.T) {
|
||||
c := orig
|
||||
c.Name = "c2"
|
||||
must.True(t, different(orig, c))
|
||||
})
|
||||
|
||||
t.Run("different type", func(t *testing.T) {
|
||||
c := orig
|
||||
c.Type = "tcp"
|
||||
must.True(t, different(orig, c))
|
||||
})
|
||||
|
||||
t.Run("different path", func(t *testing.T) {
|
||||
c := orig
|
||||
c.Path = "/metrics"
|
||||
must.True(t, different(orig, c))
|
||||
})
|
||||
|
||||
t.Run("different protocol", func(t *testing.T) {
|
||||
c := orig
|
||||
c.Protocol = "http"
|
||||
must.True(t, different(orig, c))
|
||||
})
|
||||
|
||||
t.Run("different port label", func(t *testing.T) {
|
||||
c := orig
|
||||
c.PortLabel = "ingress"
|
||||
must.True(t, different(orig, c))
|
||||
})
|
||||
|
||||
t.Run("different address mode", func(t *testing.T) {
|
||||
c := orig
|
||||
c.AddressMode = "bridge"
|
||||
must.True(t, different(orig, c))
|
||||
})
|
||||
|
||||
t.Run("different interval", func(t *testing.T) {
|
||||
c := orig
|
||||
c.Interval = 1 * time.Second
|
||||
must.True(t, different(orig, c))
|
||||
})
|
||||
|
||||
t.Run("different timeout", func(t *testing.T) {
|
||||
c := orig
|
||||
c.Timeout = 5 * time.Second
|
||||
must.True(t, different(orig, c))
|
||||
})
|
||||
|
||||
t.Run("different method", func(t *testing.T) {
|
||||
c := orig
|
||||
c.Method = "POST"
|
||||
must.True(t, different(orig, c))
|
||||
})
|
||||
|
||||
t.Run("different task", func(t *testing.T) {
|
||||
c := orig
|
||||
c.TaskName = "task2"
|
||||
must.True(t, different(orig, c))
|
||||
})
|
||||
|
||||
t.Run("different on update", func(t *testing.T) {
|
||||
c := orig
|
||||
c.OnUpdate = "checks"
|
||||
must.True(t, different(orig, c))
|
||||
})
|
||||
}
|
|
@ -0,0 +1,92 @@
|
|||
package structs
|
||||
|
||||
import (
|
||||
"crypto/md5"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// The CheckMode of a Nomad check is either Healthiness or Readiness.
|
||||
type CheckMode string
|
||||
|
||||
const (
|
||||
// A Healthiness check is useful in the context of ensuring a service
|
||||
// is capable of performing its duties. This is an indicator that a check's
|
||||
// on_update configuration is set to "check_result", implying that Deployments
|
||||
// will not move forward while the check is failing.
|
||||
Healthiness CheckMode = "healthiness"
|
||||
|
||||
// A Readiness check is useful in the context of ensuring a service is
|
||||
// should be performing its duties (regardless of healthiness). This is an
|
||||
// indicator that the check's on_update configuration is set to "ignore",
|
||||
// implying that Deployments will move forward regardless if the check is
|
||||
// failing.
|
||||
Readiness CheckMode = "readiness"
|
||||
)
|
||||
|
||||
// GetCheckMode determines whether the check is readiness or healthiness.
|
||||
func GetCheckMode(c *ServiceCheck) CheckMode {
|
||||
if c != nil && c.OnUpdate == "ignore" {
|
||||
return Readiness
|
||||
}
|
||||
return Healthiness
|
||||
}
|
||||
|
||||
// An CheckID is unique to a check.
|
||||
type CheckID string
|
||||
|
||||
// A CheckQueryResult represents the outcome of a single execution of a Nomad service
|
||||
// check. It records the result, the output, and when the execution took place.
|
||||
// Advanced check math (e.g. success_before_passing) are left to the calling
|
||||
// context.
|
||||
type CheckQueryResult struct {
|
||||
ID CheckID
|
||||
Mode CheckMode
|
||||
Status CheckStatus
|
||||
StatusCode int `json:",omitempty"`
|
||||
Output string
|
||||
Timestamp int64
|
||||
|
||||
// check coordinates
|
||||
Group string
|
||||
Task string `json:",omitempty"`
|
||||
Service string
|
||||
Check string
|
||||
}
|
||||
|
||||
func (r *CheckQueryResult) String() string {
|
||||
return fmt.Sprintf("(%s %s %s %v)", r.ID, r.Mode, r.Status, r.Timestamp)
|
||||
}
|
||||
|
||||
// A CheckStatus is the result of executing a check. The status of a query is
|
||||
// ternary - success, failure, or pending (not yet executed). Deployments treat
|
||||
// pending and failure as the same - a deployment does not continue until a check
|
||||
// is passing (unless on_update=ignore).
|
||||
type CheckStatus string
|
||||
|
||||
const (
|
||||
CheckSuccess CheckStatus = "success"
|
||||
CheckFailure CheckStatus = "failure"
|
||||
CheckPending CheckStatus = "pending"
|
||||
)
|
||||
|
||||
// NomadCheckID returns an ID unique to the nomad service check.
|
||||
//
|
||||
// Checks of group-level services have no task.
|
||||
func NomadCheckID(allocID, group string, c *ServiceCheck) CheckID {
|
||||
sum := md5.New()
|
||||
hashString(sum, allocID)
|
||||
hashString(sum, group)
|
||||
hashString(sum, c.TaskName)
|
||||
hashString(sum, c.Name)
|
||||
hashString(sum, c.Type)
|
||||
hashString(sum, c.PortLabel)
|
||||
hashString(sum, c.OnUpdate)
|
||||
hashString(sum, c.AddressMode)
|
||||
hashDuration(sum, c.Interval)
|
||||
hashDuration(sum, c.Timeout)
|
||||
hashString(sum, c.Protocol)
|
||||
hashString(sum, c.Path)
|
||||
hashString(sum, c.Method)
|
||||
h := sum.Sum(nil)
|
||||
return CheckID(fmt.Sprintf("%x", h))
|
||||
}
|
|
@ -2,6 +2,7 @@ package structs
|
|||
|
||||
import (
|
||||
"crypto/sha1"
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
"hash"
|
||||
|
@ -19,6 +20,7 @@ import (
|
|||
"github.com/hashicorp/nomad/helper"
|
||||
"github.com/hashicorp/nomad/helper/args"
|
||||
"github.com/mitchellh/copystructure"
|
||||
"golang.org/x/exp/slices"
|
||||
)
|
||||
|
||||
const (
|
||||
|
@ -41,7 +43,7 @@ const (
|
|||
|
||||
// ServiceCheck represents the Consul health check.
|
||||
type ServiceCheck struct {
|
||||
Name string // Name of the check, defaults to id
|
||||
Name string // Name of the check, defaults to a generated label
|
||||
Type string // Type of the check - tcp, http, docker and script
|
||||
Command string // Command is the command to run for script checks
|
||||
Args []string // Args is a list of arguments for script checks
|
||||
|
@ -66,6 +68,12 @@ type ServiceCheck struct {
|
|||
OnUpdate string
|
||||
}
|
||||
|
||||
// IsReadiness returns whether the configuration of the ServiceCheck is effectively
|
||||
// a readiness check - i.e. check failures do not affect a deployment.
|
||||
func (sc *ServiceCheck) IsReadiness() bool {
|
||||
return sc != nil && sc.OnUpdate == "ignore"
|
||||
}
|
||||
|
||||
// Copy the stanza recursively. Returns nil if nil.
|
||||
func (sc *ServiceCheck) Copy() *ServiceCheck {
|
||||
if sc == nil {
|
||||
|
@ -207,48 +215,49 @@ func (sc *ServiceCheck) Canonicalize(serviceName string) {
|
|||
}
|
||||
}
|
||||
|
||||
// validate a Service's ServiceCheck
|
||||
func (sc *ServiceCheck) validate() error {
|
||||
// Validate Type
|
||||
// validateCommon validates the parts of ServiceCheck shared across providers.
|
||||
func (sc *ServiceCheck) validateCommon(allowableTypes []string) error {
|
||||
// validate the type is allowable (different between nomad, consul checks)
|
||||
checkType := strings.ToLower(sc.Type)
|
||||
if !slices.Contains(allowableTypes, checkType) {
|
||||
s := strings.Join(allowableTypes, ", ")
|
||||
return fmt.Errorf(`invalid check type (%q), must be one of %s`, checkType, s)
|
||||
}
|
||||
|
||||
// validate specific check types
|
||||
switch checkType {
|
||||
case ServiceCheckGRPC:
|
||||
case ServiceCheckTCP:
|
||||
case ServiceCheckHTTP:
|
||||
if sc.Path == "" {
|
||||
return fmt.Errorf("http type must have a valid http path")
|
||||
return fmt.Errorf("http type must have http path")
|
||||
}
|
||||
checkPath, err := url.Parse(sc.Path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("http type must have a valid http path")
|
||||
checkPath, pathErr := url.Parse(sc.Path)
|
||||
if pathErr != nil {
|
||||
return fmt.Errorf("http type must have valid http path")
|
||||
}
|
||||
if checkPath.IsAbs() {
|
||||
return fmt.Errorf("http type must have a relative http path")
|
||||
return fmt.Errorf("http type must have relative http path")
|
||||
}
|
||||
|
||||
case ServiceCheckScript:
|
||||
if sc.Command == "" {
|
||||
return fmt.Errorf("script type must have a valid script path")
|
||||
}
|
||||
|
||||
default:
|
||||
return fmt.Errorf(`invalid type (%+q), must be one of "http", "tcp", or "script" type`, sc.Type)
|
||||
}
|
||||
|
||||
// Validate interval and timeout
|
||||
// validate interval
|
||||
if sc.Interval == 0 {
|
||||
return fmt.Errorf("missing required value interval. Interval cannot be less than %v", minCheckInterval)
|
||||
} else if sc.Interval < minCheckInterval {
|
||||
return fmt.Errorf("interval (%v) cannot be lower than %v", sc.Interval, minCheckInterval)
|
||||
}
|
||||
|
||||
// validate timeout
|
||||
if sc.Timeout == 0 {
|
||||
return fmt.Errorf("missing required value timeout. Timeout cannot be less than %v", minCheckInterval)
|
||||
} else if sc.Timeout < minCheckTimeout {
|
||||
return fmt.Errorf("timeout (%v) is lower than required minimum timeout %v", sc.Timeout, minCheckInterval)
|
||||
}
|
||||
|
||||
// Validate InitialStatus
|
||||
// validate the initial status
|
||||
switch sc.InitialStatus {
|
||||
case "":
|
||||
case api.HealthPassing:
|
||||
|
@ -256,10 +265,9 @@ func (sc *ServiceCheck) validate() error {
|
|||
case api.HealthCritical:
|
||||
default:
|
||||
return fmt.Errorf(`invalid initial check state (%s), must be one of %q, %q, %q or empty`, sc.InitialStatus, api.HealthPassing, api.HealthWarning, api.HealthCritical)
|
||||
|
||||
}
|
||||
|
||||
// Validate AddressMode
|
||||
// validate address_mode
|
||||
switch sc.AddressMode {
|
||||
case "", AddressModeHost, AddressModeDriver, AddressModeAlloc:
|
||||
// Ok
|
||||
|
@ -269,7 +277,7 @@ func (sc *ServiceCheck) validate() error {
|
|||
return fmt.Errorf("invalid address_mode %q", sc.AddressMode)
|
||||
}
|
||||
|
||||
// Validate OnUpdate
|
||||
// validate on_update
|
||||
switch sc.OnUpdate {
|
||||
case "", OnUpdateIgnore, OnUpdateRequireHealthy, OnUpdateIgnoreWarn:
|
||||
// OK
|
||||
|
@ -277,9 +285,102 @@ func (sc *ServiceCheck) validate() error {
|
|||
return fmt.Errorf("on_update must be %q, %q, or %q; got %q", OnUpdateRequireHealthy, OnUpdateIgnoreWarn, OnUpdateIgnore, sc.OnUpdate)
|
||||
}
|
||||
|
||||
// validate check_restart and on_update do not conflict
|
||||
if sc.CheckRestart != nil {
|
||||
// CheckRestart and OnUpdate Ignore are incompatible If OnUpdate treats
|
||||
// an error has healthy, and the deployment succeeds followed by check
|
||||
// restart restarting failing checks, the deployment is left in an odd
|
||||
// state
|
||||
if sc.OnUpdate == OnUpdateIgnore {
|
||||
return fmt.Errorf("on_update value %q is not compatible with check_restart", sc.OnUpdate)
|
||||
}
|
||||
// CheckRestart IgnoreWarnings must be true if a check has defined OnUpdate
|
||||
// ignore_warnings
|
||||
if !sc.CheckRestart.IgnoreWarnings && sc.OnUpdate == OnUpdateIgnoreWarn {
|
||||
return fmt.Errorf("on_update value %q not supported with check_restart ignore_warnings value %q", sc.OnUpdate, strconv.FormatBool(sc.CheckRestart.IgnoreWarnings))
|
||||
}
|
||||
}
|
||||
|
||||
// validate check_restart
|
||||
if err := sc.CheckRestart.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// validate a Service's ServiceCheck in the context of the Nomad provider.
|
||||
func (sc *ServiceCheck) validateNomad() error {
|
||||
allowable := []string{ServiceCheckTCP, ServiceCheckHTTP}
|
||||
if err := sc.validateCommon(allowable); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// expose is connect (consul) specific
|
||||
if sc.Expose {
|
||||
return fmt.Errorf("expose may only be set for Consul service checks")
|
||||
}
|
||||
|
||||
// nomad checks do not have warnings
|
||||
if sc.OnUpdate == "ignore_warnings" {
|
||||
return fmt.Errorf("on_update may only be set to ignore_warnings for Consul service checks")
|
||||
}
|
||||
|
||||
// below are temporary limitations on checks in nomad
|
||||
// https://github.com/hashicorp/team-nomad/issues/354
|
||||
|
||||
// check_restart not yet supported on nomad
|
||||
if sc.CheckRestart != nil {
|
||||
return fmt.Errorf("check_restart may only be set for Consul service checks")
|
||||
}
|
||||
|
||||
// address_mode="driver" not yet supported on nomad
|
||||
if sc.AddressMode == "driver" {
|
||||
return fmt.Errorf("address_mode = driver may only be set for Consul service checks")
|
||||
}
|
||||
|
||||
if sc.Type == "http" {
|
||||
if sc.Method != "" && sc.Method != "GET" {
|
||||
// unset turns into GET
|
||||
return fmt.Errorf("http checks may only use GET method in Nomad services")
|
||||
}
|
||||
|
||||
if len(sc.Header) > 0 {
|
||||
return fmt.Errorf("http checks may not set headers in Nomad services")
|
||||
}
|
||||
|
||||
if len(sc.Body) > 0 {
|
||||
return fmt.Errorf("http checks may not set Body in Nomad services")
|
||||
}
|
||||
}
|
||||
|
||||
// success_before_passing is consul only
|
||||
if sc.SuccessBeforePassing != 0 {
|
||||
return fmt.Errorf("success_before_passing may only be set for Consul service checks")
|
||||
}
|
||||
|
||||
// failures_before_critical is consul only
|
||||
if sc.FailuresBeforeCritical != 0 {
|
||||
return fmt.Errorf("failures_before_critical may only be set for Consul service checks")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// validate a Service's ServiceCheck in the context of the Consul provider.
|
||||
func (sc *ServiceCheck) validateConsul() error {
|
||||
allowable := []string{ServiceCheckGRPC, ServiceCheckTCP, ServiceCheckHTTP, ServiceCheckScript}
|
||||
if err := sc.validateCommon(allowable); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
checkType := strings.ToLower(sc.Type)
|
||||
|
||||
// Note that we cannot completely validate the Expose field yet - we do not
|
||||
// know whether this ServiceCheck belongs to a connect-enabled group-service.
|
||||
// Instead, such validation will happen in a job admission controller.
|
||||
//
|
||||
// Consul only.
|
||||
if sc.Expose {
|
||||
// We can however immediately ensure expose is configured only for HTTP
|
||||
// and gRPC checks.
|
||||
|
@ -292,6 +393,8 @@ func (sc *ServiceCheck) validate() error {
|
|||
|
||||
// passFailCheckTypes are intersection of check types supported by both Consul
|
||||
// and Nomad when using the pass/fail check threshold features.
|
||||
//
|
||||
// Consul only.
|
||||
passFailCheckTypes := []string{"tcp", "http", "grpc"}
|
||||
|
||||
if sc.SuccessBeforePassing < 0 {
|
||||
|
@ -306,23 +409,7 @@ func (sc *ServiceCheck) validate() error {
|
|||
return fmt.Errorf("failures_before_critical not supported for check of type %q", sc.Type)
|
||||
}
|
||||
|
||||
// Check that CheckRestart and OnUpdate do not conflict
|
||||
if sc.CheckRestart != nil {
|
||||
// CheckRestart and OnUpdate Ignore are incompatible If OnUpdate treats
|
||||
// an error has healthy, and the deployment succeeds followed by check
|
||||
// restart restarting erroring checks, the deployment is left in an odd
|
||||
// state
|
||||
if sc.OnUpdate == OnUpdateIgnore {
|
||||
return fmt.Errorf("on_update value %q is not compatible with check_restart", sc.OnUpdate)
|
||||
}
|
||||
// CheckRestart IgnoreWarnings must be true if a check has defined OnUpdate
|
||||
// ignore_warnings
|
||||
if !sc.CheckRestart.IgnoreWarnings && sc.OnUpdate == OnUpdateIgnoreWarn {
|
||||
return fmt.Errorf("on_update value %q not supported with check_restart ignore_warnings value %q", sc.OnUpdate, strconv.FormatBool(sc.CheckRestart.IgnoreWarnings))
|
||||
}
|
||||
}
|
||||
|
||||
return sc.CheckRestart.Validate()
|
||||
return nil
|
||||
}
|
||||
|
||||
// RequiresPort returns whether the service check requires the task has a port.
|
||||
|
@ -401,6 +488,10 @@ func hashIntIfNonZero(h hash.Hash, name string, i int) {
|
|||
}
|
||||
}
|
||||
|
||||
func hashDuration(h hash.Hash, dur time.Duration) {
|
||||
_ = binary.Write(h, binary.LittleEndian, dur)
|
||||
}
|
||||
|
||||
func hashHeader(h hash.Hash, m map[string][]string) {
|
||||
// maintain backwards compatibility for ID stability
|
||||
// using the %v formatter on a map with string keys produces consistent
|
||||
|
@ -621,14 +712,21 @@ func (s *Service) Validate() error {
|
|||
return mErr.ErrorOrNil()
|
||||
}
|
||||
|
||||
func (s *Service) validateCheckPort(c *ServiceCheck) error {
|
||||
if s.PortLabel == "" && c.PortLabel == "" && c.RequiresPort() {
|
||||
return fmt.Errorf("Check %s invalid: check requires a port but neither check nor service %+q have a port", c.Name, s.Name)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// validateConsulService performs validation on a service which is using the
|
||||
// consul provider.
|
||||
func (s *Service) validateConsulService(mErr *multierror.Error) {
|
||||
|
||||
// check checks
|
||||
for _, c := range s.Checks {
|
||||
if s.PortLabel == "" && c.PortLabel == "" && c.RequiresPort() {
|
||||
mErr.Errors = append(mErr.Errors, fmt.Errorf("Check %s invalid: check requires a port but neither check nor service %+q have a port", c.Name, s.Name))
|
||||
// validat ethe check port
|
||||
if err := s.validateCheckPort(c); err != nil {
|
||||
mErr.Errors = append(mErr.Errors, err)
|
||||
continue
|
||||
}
|
||||
|
||||
|
@ -640,7 +738,8 @@ func (s *Service) validateConsulService(mErr *multierror.Error) {
|
|||
continue
|
||||
}
|
||||
|
||||
if err := c.validate(); err != nil {
|
||||
// validate the consul check
|
||||
if err := c.validateConsul(); err != nil {
|
||||
mErr.Errors = append(mErr.Errors, fmt.Errorf("Check %s invalid: %v", c.Name, err))
|
||||
}
|
||||
}
|
||||
|
@ -662,12 +761,18 @@ func (s *Service) validateConsulService(mErr *multierror.Error) {
|
|||
// validateNomadService performs validation on a service which is using the
|
||||
// nomad provider.
|
||||
func (s *Service) validateNomadService(mErr *multierror.Error) {
|
||||
// check checks
|
||||
for _, c := range s.Checks {
|
||||
// validate the check port
|
||||
if err := s.validateCheckPort(c); err != nil {
|
||||
mErr.Errors = append(mErr.Errors, err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Service blocks for the Nomad provider do not support checks. We perform
|
||||
// a nil check, as an empty check list is nilled within the service
|
||||
// canonicalize function.
|
||||
if s.Checks != nil {
|
||||
mErr.Errors = append(mErr.Errors, errors.New("Service with provider nomad cannot include Check blocks"))
|
||||
// validate the nomad check
|
||||
if err := c.validateNomad(); err != nil {
|
||||
mErr.Errors = append(mErr.Errors, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Services using the Nomad provider do not support Consul connect.
|
||||
|
|
|
@ -8,6 +8,7 @@ import (
|
|||
"github.com/hashicorp/go-multierror"
|
||||
"github.com/hashicorp/nomad/ci"
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
"github.com/shoenig/test/must"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
@ -68,7 +69,7 @@ func TestServiceCheck_validate_PassingTypes(t *testing.T) {
|
|||
Interval: 1 * time.Second,
|
||||
Timeout: 2 * time.Second,
|
||||
SuccessBeforePassing: 3,
|
||||
}).validate()
|
||||
}).validateConsul()
|
||||
require.NoError(t, err)
|
||||
}
|
||||
})
|
||||
|
@ -81,7 +82,7 @@ func TestServiceCheck_validate_PassingTypes(t *testing.T) {
|
|||
Interval: 1 * time.Second,
|
||||
Timeout: 2 * time.Second,
|
||||
SuccessBeforePassing: 3,
|
||||
}).validate()
|
||||
}).validateConsul()
|
||||
require.EqualError(t, err, `success_before_passing not supported for check of type "script"`)
|
||||
})
|
||||
}
|
||||
|
@ -98,7 +99,7 @@ func TestServiceCheck_validate_FailingTypes(t *testing.T) {
|
|||
Interval: 1 * time.Second,
|
||||
Timeout: 2 * time.Second,
|
||||
FailuresBeforeCritical: 3,
|
||||
}).validate()
|
||||
}).validateConsul()
|
||||
require.NoError(t, err)
|
||||
}
|
||||
})
|
||||
|
@ -112,7 +113,7 @@ func TestServiceCheck_validate_FailingTypes(t *testing.T) {
|
|||
Timeout: 2 * time.Second,
|
||||
SuccessBeforePassing: 0,
|
||||
FailuresBeforeCritical: 3,
|
||||
}).validate()
|
||||
}).validateConsul()
|
||||
require.EqualError(t, err, `failures_before_critical not supported for check of type "script"`)
|
||||
})
|
||||
}
|
||||
|
@ -129,7 +130,7 @@ func TestServiceCheck_validate_PassFailZero_on_scripts(t *testing.T) {
|
|||
Timeout: 2 * time.Second,
|
||||
SuccessBeforePassing: 0, // script checks should still pass validation
|
||||
FailuresBeforeCritical: 0, // script checks should still pass validation
|
||||
}).validate()
|
||||
}).validateConsul()
|
||||
require.NoError(t, err)
|
||||
})
|
||||
}
|
||||
|
@ -150,7 +151,7 @@ func TestServiceCheck_validate_OnUpdate_CheckRestart_Conflict(t *testing.T) {
|
|||
Grace: 5 * time.Second,
|
||||
},
|
||||
OnUpdate: "ignore_warnings",
|
||||
}).validate()
|
||||
}).validateConsul()
|
||||
require.EqualError(t, err, `on_update value "ignore_warnings" not supported with check_restart ignore_warnings value "false"`)
|
||||
})
|
||||
|
||||
|
@ -167,7 +168,7 @@ func TestServiceCheck_validate_OnUpdate_CheckRestart_Conflict(t *testing.T) {
|
|||
Grace: 5 * time.Second,
|
||||
},
|
||||
OnUpdate: "ignore",
|
||||
}).validate()
|
||||
}).validateConsul()
|
||||
require.EqualError(t, err, `on_update value "ignore" is not compatible with check_restart`)
|
||||
})
|
||||
|
||||
|
@ -184,11 +185,129 @@ func TestServiceCheck_validate_OnUpdate_CheckRestart_Conflict(t *testing.T) {
|
|||
Grace: 5 * time.Second,
|
||||
},
|
||||
OnUpdate: "ignore_warnings",
|
||||
}).validate()
|
||||
}).validateConsul()
|
||||
require.NoError(t, err)
|
||||
})
|
||||
}
|
||||
|
||||
func TestServiceCheck_validateNomad(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
testCases := []struct {
|
||||
name string
|
||||
sc *ServiceCheck
|
||||
exp string
|
||||
}{
|
||||
{name: "grpc", sc: &ServiceCheck{Type: ServiceCheckGRPC}, exp: `invalid check type ("grpc"), must be one of tcp, http`},
|
||||
{name: "script", sc: &ServiceCheck{Type: ServiceCheckScript}, exp: `invalid check type ("script"), must be one of tcp, http`},
|
||||
{
|
||||
name: "expose",
|
||||
sc: &ServiceCheck{
|
||||
Type: ServiceCheckTCP,
|
||||
Expose: true, // consul only
|
||||
Interval: 3 * time.Second,
|
||||
Timeout: 1 * time.Second,
|
||||
},
|
||||
exp: `expose may only be set for Consul service checks`,
|
||||
}, {
|
||||
name: "on_update ignore_warnings",
|
||||
sc: &ServiceCheck{
|
||||
Type: ServiceCheckTCP,
|
||||
Interval: 3 * time.Second,
|
||||
Timeout: 1 * time.Second,
|
||||
OnUpdate: "ignore_warnings",
|
||||
},
|
||||
exp: `on_update may only be set to ignore_warnings for Consul service checks`,
|
||||
},
|
||||
{
|
||||
name: "success_before_passing",
|
||||
sc: &ServiceCheck{
|
||||
Type: ServiceCheckTCP,
|
||||
SuccessBeforePassing: 3, // consul only
|
||||
Interval: 3 * time.Second,
|
||||
Timeout: 1 * time.Second,
|
||||
},
|
||||
exp: `success_before_passing may only be set for Consul service checks`,
|
||||
},
|
||||
{
|
||||
name: "failures_before_critical",
|
||||
sc: &ServiceCheck{
|
||||
Type: ServiceCheckTCP,
|
||||
FailuresBeforeCritical: 3, // consul only
|
||||
Interval: 3 * time.Second,
|
||||
Timeout: 1 * time.Second,
|
||||
},
|
||||
exp: `failures_before_critical may only be set for Consul service checks`,
|
||||
},
|
||||
{
|
||||
name: "check_restart",
|
||||
sc: &ServiceCheck{
|
||||
Type: ServiceCheckTCP,
|
||||
Interval: 3 * time.Second,
|
||||
Timeout: 1 * time.Second,
|
||||
CheckRestart: new(CheckRestart),
|
||||
},
|
||||
exp: `check_restart may only be set for Consul service checks`,
|
||||
},
|
||||
{
|
||||
name: "address mode driver",
|
||||
sc: &ServiceCheck{
|
||||
Type: ServiceCheckTCP,
|
||||
Interval: 3 * time.Second,
|
||||
Timeout: 1 * time.Second,
|
||||
AddressMode: "driver",
|
||||
},
|
||||
exp: `address_mode = driver may only be set for Consul service checks`,
|
||||
},
|
||||
{
|
||||
name: "http non GET",
|
||||
sc: &ServiceCheck{
|
||||
Type: ServiceCheckHTTP,
|
||||
Interval: 3 * time.Second,
|
||||
Timeout: 1 * time.Second,
|
||||
Path: "/health",
|
||||
Method: "HEAD",
|
||||
},
|
||||
exp: `http checks may only use GET method in Nomad services`,
|
||||
},
|
||||
{
|
||||
name: "http with headers",
|
||||
sc: &ServiceCheck{
|
||||
Type: ServiceCheckHTTP,
|
||||
Interval: 3 * time.Second,
|
||||
Timeout: 1 * time.Second,
|
||||
Path: "/health",
|
||||
Method: "GET",
|
||||
Header: map[string][]string{"foo": {"bar"}},
|
||||
},
|
||||
exp: `http checks may not set headers in Nomad services`,
|
||||
},
|
||||
{
|
||||
name: "http with body",
|
||||
sc: &ServiceCheck{
|
||||
Type: ServiceCheckHTTP,
|
||||
Interval: 3 * time.Second,
|
||||
Timeout: 1 * time.Second,
|
||||
Path: "/health",
|
||||
Method: "GET",
|
||||
Body: "blah",
|
||||
},
|
||||
exp: `http checks may not set Body in Nomad services`,
|
||||
},
|
||||
}
|
||||
|
||||
for _, testCase := range testCases {
|
||||
t.Run(testCase.name, func(t *testing.T) {
|
||||
err := testCase.sc.validateNomad()
|
||||
if testCase.exp == "" {
|
||||
must.NoError(t, err)
|
||||
} else {
|
||||
must.EqError(t, err, testCase.exp)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestService_Hash(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
|
@ -1551,6 +1670,30 @@ func TestService_Validate(t *testing.T) {
|
|||
expErrStr: "Consul Connect must be exclusively native",
|
||||
name: "Native Connect with Sidecar",
|
||||
},
|
||||
{
|
||||
input: &Service{
|
||||
Name: "testservice",
|
||||
Provider: "nomad",
|
||||
PortLabel: "port",
|
||||
Checks: []*ServiceCheck{
|
||||
{
|
||||
Name: "servicecheck",
|
||||
Type: "http",
|
||||
Path: "/",
|
||||
Interval: 1 * time.Second,
|
||||
Timeout: 3 * time.Second,
|
||||
},
|
||||
{
|
||||
Name: "servicecheck",
|
||||
Type: "tcp",
|
||||
Interval: 1 * time.Second,
|
||||
Timeout: 3 * time.Second,
|
||||
},
|
||||
},
|
||||
},
|
||||
expErr: false,
|
||||
name: "provider nomad with checks",
|
||||
},
|
||||
{
|
||||
input: &Service{
|
||||
Name: "testservice",
|
||||
|
@ -1558,12 +1701,12 @@ func TestService_Validate(t *testing.T) {
|
|||
Checks: []*ServiceCheck{
|
||||
{
|
||||
Name: "servicecheck",
|
||||
Type: "script",
|
||||
},
|
||||
},
|
||||
},
|
||||
expErr: true,
|
||||
expErrStr: "Service with provider nomad cannot include Check blocks",
|
||||
name: "provider nomad with checks",
|
||||
expErr: true,
|
||||
name: "provider nomad with invalid check type",
|
||||
},
|
||||
{
|
||||
input: &Service{
|
||||
|
@ -1602,6 +1745,8 @@ func TestService_Validate(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestService_Validate_Address(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
try := func(mode, advertise string, exp error) {
|
||||
s := &Service{Name: "s1", Provider: "consul", AddressMode: mode, Address: advertise}
|
||||
result := s.Validate()
|
||||
|
@ -1702,7 +1847,7 @@ func TestService_validateNomadService(t *testing.T) {
|
|||
Provider: "nomad",
|
||||
},
|
||||
inputErr: &multierror.Error{},
|
||||
expectedOutputErrors: []error{},
|
||||
expectedOutputErrors: nil,
|
||||
name: "valid service",
|
||||
},
|
||||
{
|
||||
|
@ -1711,13 +1856,18 @@ func TestService_validateNomadService(t *testing.T) {
|
|||
PortLabel: "http",
|
||||
Namespace: "default",
|
||||
Provider: "nomad",
|
||||
Checks: []*ServiceCheck{
|
||||
{Name: "some-check"},
|
||||
},
|
||||
Checks: []*ServiceCheck{{
|
||||
Name: "webapp",
|
||||
Type: ServiceCheckHTTP,
|
||||
Method: "GET",
|
||||
Path: "/health",
|
||||
Interval: 3 * time.Second,
|
||||
Timeout: 1 * time.Second,
|
||||
}},
|
||||
},
|
||||
inputErr: &multierror.Error{},
|
||||
expectedOutputErrors: []error{errors.New("Service with provider nomad cannot include Check blocks")},
|
||||
name: "invalid service due to checks",
|
||||
expectedOutputErrors: nil,
|
||||
name: "valid service with checks",
|
||||
},
|
||||
{
|
||||
inputService: &Service{
|
||||
|
@ -1739,26 +1889,22 @@ func TestService_validateNomadService(t *testing.T) {
|
|||
PortLabel: "http",
|
||||
Namespace: "default",
|
||||
Provider: "nomad",
|
||||
Connect: &ConsulConnect{
|
||||
Native: true,
|
||||
},
|
||||
Checks: []*ServiceCheck{
|
||||
{Name: "some-check"},
|
||||
},
|
||||
},
|
||||
inputErr: &multierror.Error{},
|
||||
expectedOutputErrors: []error{
|
||||
errors.New("Service with provider nomad cannot include Check blocks"),
|
||||
errors.New("Service with provider nomad cannot include Connect blocks"),
|
||||
errors.New(`invalid check type (""), must be one of tcp, http`),
|
||||
},
|
||||
name: "invalid service due to checks and connect",
|
||||
name: "bad nomad check",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
tc.inputService.validateNomadService(tc.inputErr)
|
||||
require.ElementsMatch(t, tc.expectedOutputErrors, tc.inputErr.Errors)
|
||||
must.Eq(t, tc.expectedOutputErrors, tc.inputErr.Errors)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,9 +25,6 @@ import (
|
|||
"time"
|
||||
|
||||
jwt "github.com/golang-jwt/jwt/v4"
|
||||
"github.com/hashicorp/nomad/helper/escapingfs"
|
||||
"golang.org/x/crypto/blake2b"
|
||||
|
||||
"github.com/hashicorp/cronexpr"
|
||||
"github.com/hashicorp/go-msgpack/codec"
|
||||
"github.com/hashicorp/go-multierror"
|
||||
|
@ -38,12 +35,14 @@ import (
|
|||
"github.com/hashicorp/nomad/helper"
|
||||
"github.com/hashicorp/nomad/helper/args"
|
||||
"github.com/hashicorp/nomad/helper/constraints/semver"
|
||||
"github.com/hashicorp/nomad/helper/escapingfs"
|
||||
"github.com/hashicorp/nomad/helper/uuid"
|
||||
"github.com/hashicorp/nomad/lib/cpuset"
|
||||
"github.com/hashicorp/nomad/lib/kheap"
|
||||
psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
|
||||
"github.com/miekg/dns"
|
||||
"github.com/mitchellh/copystructure"
|
||||
"golang.org/x/crypto/blake2b"
|
||||
)
|
||||
|
||||
var (
|
||||
|
@ -4818,7 +4817,7 @@ func (jc *JobChildrenSummary) Copy() *JobChildrenSummary {
|
|||
return njc
|
||||
}
|
||||
|
||||
// TaskGroup summarizes the state of all the allocations of a particular
|
||||
// TaskGroupSummary summarizes the state of all the allocations of a particular
|
||||
// TaskGroup
|
||||
type TaskGroupSummary struct {
|
||||
Queued int
|
||||
|
@ -4909,9 +4908,9 @@ func (u *UpdateStrategy) Copy() *UpdateStrategy {
|
|||
return nil
|
||||
}
|
||||
|
||||
copy := new(UpdateStrategy)
|
||||
*copy = *u
|
||||
return copy
|
||||
c := new(UpdateStrategy)
|
||||
*c = *u
|
||||
return c
|
||||
}
|
||||
|
||||
func (u *UpdateStrategy) Validate() error {
|
||||
|
@ -6314,6 +6313,25 @@ func (tg *TaskGroup) Canonicalize(job *Job) {
|
|||
}
|
||||
}
|
||||
|
||||
// NomadServices returns a list of all group and task - level services in tg that
|
||||
// are making use of the nomad service provider.
|
||||
func (tg *TaskGroup) NomadServices() []*Service {
|
||||
var services []*Service
|
||||
for _, service := range tg.Services {
|
||||
if service.Provider == ServiceProviderNomad {
|
||||
services = append(services, service)
|
||||
}
|
||||
}
|
||||
for _, task := range tg.Tasks {
|
||||
for _, service := range task.Services {
|
||||
if service.Provider == ServiceProviderNomad {
|
||||
services = append(services, service)
|
||||
}
|
||||
}
|
||||
}
|
||||
return services
|
||||
}
|
||||
|
||||
// Validate is used to check a task group for reasonable configuration
|
||||
func (tg *TaskGroup) Validate(j *Job) error {
|
||||
var mErr multierror.Error
|
||||
|
@ -7817,7 +7835,7 @@ func (wc *WaitConfig) Validate() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// AllocState records a single event that changes the state of the whole allocation
|
||||
// AllocStateField records a single event that changes the state of the whole allocation
|
||||
type AllocStateField uint8
|
||||
|
||||
const (
|
||||
|
@ -10652,6 +10670,12 @@ func (a *AllocNetworkStatus) Copy() *AllocNetworkStatus {
|
|||
}
|
||||
}
|
||||
|
||||
// NetworkStatus is an interface satisfied by alloc runner, for acquiring the
|
||||
// network status of an allocation.
|
||||
type NetworkStatus interface {
|
||||
NetworkStatus() *AllocNetworkStatus
|
||||
}
|
||||
|
||||
// AllocDeploymentStatus captures the status of the allocation as part of the
|
||||
// deployment. This can include things like if the allocation has been marked as
|
||||
// healthy.
|
||||
|
|
|
@ -1999,7 +1999,7 @@ func TestTask_Validate_Service_Check(t *testing.T) {
|
|||
Interval: 10 * time.Second,
|
||||
}
|
||||
|
||||
err := invalidCheck.validate()
|
||||
err := invalidCheck.validateConsul()
|
||||
if err == nil || !strings.Contains(err.Error(), "Timeout cannot be less") {
|
||||
t.Fatalf("expected a timeout validation error but received: %q", err)
|
||||
}
|
||||
|
@ -2011,12 +2011,12 @@ func TestTask_Validate_Service_Check(t *testing.T) {
|
|||
Timeout: 2 * time.Second,
|
||||
}
|
||||
|
||||
if err := check1.validate(); err != nil {
|
||||
if err := check1.validateConsul(); err != nil {
|
||||
t.Fatalf("err: %v", err)
|
||||
}
|
||||
|
||||
check1.InitialStatus = "foo"
|
||||
err = check1.validate()
|
||||
err = check1.validateConsul()
|
||||
if err == nil {
|
||||
t.Fatal("Expected an error")
|
||||
}
|
||||
|
@ -2026,19 +2026,19 @@ func TestTask_Validate_Service_Check(t *testing.T) {
|
|||
}
|
||||
|
||||
check1.InitialStatus = api.HealthCritical
|
||||
err = check1.validate()
|
||||
err = check1.validateConsul()
|
||||
if err != nil {
|
||||
t.Fatalf("err: %v", err)
|
||||
}
|
||||
|
||||
check1.InitialStatus = api.HealthPassing
|
||||
err = check1.validate()
|
||||
err = check1.validateConsul()
|
||||
if err != nil {
|
||||
t.Fatalf("err: %v", err)
|
||||
}
|
||||
|
||||
check1.InitialStatus = ""
|
||||
err = check1.validate()
|
||||
err = check1.validateConsul()
|
||||
if err != nil {
|
||||
t.Fatalf("err: %v", err)
|
||||
}
|
||||
|
@ -2051,22 +2051,22 @@ func TestTask_Validate_Service_Check(t *testing.T) {
|
|||
Path: "/foo/bar",
|
||||
}
|
||||
|
||||
err = check2.validate()
|
||||
err = check2.validateConsul()
|
||||
if err != nil {
|
||||
t.Fatalf("err: %v", err)
|
||||
}
|
||||
|
||||
check2.Path = ""
|
||||
err = check2.validate()
|
||||
err = check2.validateConsul()
|
||||
if err == nil {
|
||||
t.Fatal("Expected an error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "valid http path") {
|
||||
if !strings.Contains(err.Error(), "http type must have http path") {
|
||||
t.Fatalf("err: %v", err)
|
||||
}
|
||||
|
||||
check2.Path = "http://www.example.com"
|
||||
err = check2.validate()
|
||||
err = check2.validateConsul()
|
||||
if err == nil {
|
||||
t.Fatal("Expected an error")
|
||||
}
|
||||
|
@ -2082,7 +2082,7 @@ func TestTask_Validate_Service_Check(t *testing.T) {
|
|||
Timeout: 1 * time.Second,
|
||||
Path: "/health",
|
||||
Expose: true,
|
||||
}).validate())
|
||||
}).validateConsul())
|
||||
})
|
||||
t.Run("type tcp", func(t *testing.T) {
|
||||
require.EqualError(t, (&ServiceCheck{
|
||||
|
@ -2090,7 +2090,7 @@ func TestTask_Validate_Service_Check(t *testing.T) {
|
|||
Interval: 1 * time.Second,
|
||||
Timeout: 1 * time.Second,
|
||||
Expose: true,
|
||||
}).validate(), "expose may only be set on HTTP or gRPC checks")
|
||||
}).validateConsul(), "expose may only be set on HTTP or gRPC checks")
|
||||
})
|
||||
})
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue