open-nomad/client/serviceregistration/watcher.go

package serviceregistration

import (
	"context"
	"fmt"
	"time"

	"github.com/hashicorp/go-hclog"
	"github.com/hashicorp/go-set"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

// key identifies a task within an allocation. It is the concatenation of the
// allocID and the taskName, so every watched check of the same task carries
// the same key.
type key string

// restarter holds the check_restart state of one watched check and knows how
// to restart the task the check belongs to.
type restarter struct {
	// Identity of the check and of its task. taskKey is the allocID +
	// taskName composite shared by all checks of the same task.
	allocID   string
	taskName  string
	checkID   string
	checkName string
	taskKey   key

	// logger is used for restart-related messages of this check, and task is
	// the handle through which the restart is requested. grace and interval
	// are recorded from the check definition. timeLimit is how long the check
	// may stay unhealthy (once past the grace period) before a restart is
	// triggered. ignoreWarnings makes a "warning" status count as healthy.
	logger         hclog.Logger
	task           WorkloadRestarter
	grace          time.Duration
	interval       time.Duration
	timeLimit      time.Duration
	ignoreWarnings bool

	// unhealthyState is the time a check first went unhealthy. Set to the
	// zero value if the check passes before timeLimit.
	unhealthyState time.Time

	// graceUntil is when the check's grace period expires and unhealthy
	// checks should be counted.
	graceUntil time.Time
}

// apply feeds the latest status of the check into the restart state and kicks
// off a task restart once the check has stayed unhealthy for the whole time
// limit. The caller supplies now so that every check evaluated in one polling
// pass shares the same notion of time (which also keeps tests simple).
//
// The result is true when a restart was triggered. The caller should then
// stop watching this check; checks are added again on task startup.
func (r *restarter) apply(ctx context.Context, now time.Time, status string) bool {
	// Classify the reported status. Anything not listed here is considered ok.
	unhealthy := false
	switch status {
	case "critical", string(structs.CheckFailure), string(structs.CheckPending):
		// consul: critical; nomad: failure and pending
		unhealthy = true
	case "warning": // consul
		// Warnings only count when they are not being ignored
		unhealthy = !r.ignoreWarnings
	}

	if !unhealthy {
		// Forget any restart that was pending for this check
		if !r.unhealthyState.IsZero() {
			r.logger.Debug("canceling restart because check became healthy")
			r.unhealthyState = time.Time{}
		}
		return false
	}

	// Failures observed during the grace period are not counted
	if now.Before(r.graceUntil) {
		return false
	}

	// Remember when the current streak of failures began
	if r.unhealthyState.IsZero() {
		r.unhealthyState = now
		if r.timeLimit != 0 {
			r.logger.Debug("check became unhealthy. Will restart if check doesn't become healthy", "time_limit", r.timeLimit)
		}
	}

	// The deadline is inclusive: if limit=1 the deadline is the first failure
	// itself, so only a strictly earlier now means "keep waiting".
	deadline := r.unhealthyState.Add(r.timeLimit)
	if now.Before(deadline) {
		return false
	}

	// Still unhealthy at the deadline: ask the TaskRunner to restart the task
	// as a failure, without blocking the caller.
	r.logger.Debug("restarting due to unhealthy check")
	restartEvent := structs.NewTaskEvent(structs.TaskRestartSignal).
		SetRestartReason(fmt.Sprintf("healthcheck: check %q unhealthy", r.checkName))
	go asyncRestart(ctx, r.logger, r.task, restartEvent)
	return true
}

// asyncRestart requests a restart of task for the given event, mimicking the
// pre-0.9 TaskRunner.Restart behavior. It is meant to run in its own
// goroutine.
func asyncRestart(ctx context.Context, logger hclog.Logger, task WorkloadRestarter, event *structs.TaskEvent) {
	// A restart requested by the check watcher always counts as a failure
	const isFailure = true

	// The restart itself is asynchronous, so bound how long this goroutine
	// may wait on it instead of letting it block forever.
	restartCtx, release := context.WithTimeout(ctx, 10*time.Second)
	defer release()

	err := task.Restart(restartCtx, event, isFailure)
	if err == nil {
		return
	}

	// Nothing can be done about a failed restart; the message only helps when
	// debugging allocation lifecycle management.
	logger.Debug("failed to restart task", "error", err, "event_time", event.Time, "event_type", event.Type)
}

// CheckStatusGetter is the source of check statuses polled by the check
// watcher. It is implemented per-provider.
type CheckStatusGetter interface {
	// Get returns a map from CheckID -> (minimal) CheckStatus, or an error if
	// the statuses could not be retrieved.
	Get() (map[string]string, error)
}

// checkWatchUpdate is a request to add a check to, or remove a check from,
// the watcher. checkID names the check; when remove is true the check is
// dropped, otherwise restart is stored as the state to watch for that check.
type checkWatchUpdate struct {
	checkID string
	remove  bool
	restart *restarter
}

// A CheckWatcher watches for check failures and restarts tasks according to
// their check_restart policy.
type CheckWatcher interface {
	// Run the CheckWatcher. Maintains a background process to continuously
	// monitor active checks. Must be called before Watch or Unwatch. Must be
	// called as a goroutine.
	Run(ctx context.Context)

	// Watch the given check. If the check status enters a failing state, the
	// task associated with the check will be restarted according to its
	// check_restart policy via wr. allocID and taskName identify the task the
	// check belongs to; checkID identifies the check itself.
	Watch(allocID, taskName, checkID string, check *structs.ServiceCheck, wr WorkloadRestarter)

	// Unwatch will cause the CheckWatcher to no longer monitor the check of
	// the given checkID.
	Unwatch(checkID string)
}

// UniversalCheckWatcher is an implementation of CheckWatcher capable of watching
// checks in the Nomad or Consul service providers.
type UniversalCheckWatcher struct {
	// logger is the watcher's own logger; getter is where check statuses are
	// read from on every poll
	logger hclog.Logger
	getter CheckStatusGetter

	// pollFrequency is how often to poll the checks API
	pollFrequency time.Duration

	// checkUpdateCh sends watches/removals to the main loop
	checkUpdateCh chan checkWatchUpdate

	// done is closed when Run has exited
	done chan struct{}

	// failedPreviousInterval is used to indicate whether something went wrong during
	// the previous poll interval - if so we can silence ongoing errors
	failedPreviousInterval bool
}

// NewCheckWatcher creates a UniversalCheckWatcher that reads check statuses
// from getter and logs under the "watch.checks" name. The returned watcher
// polls once per second.
func NewCheckWatcher(logger hclog.Logger, getter CheckStatusGetter) *UniversalCheckWatcher {
	watcher := new(UniversalCheckWatcher)
	watcher.logger = logger.ResetNamed("watch.checks")
	watcher.getter = getter
	watcher.pollFrequency = 1 * time.Second
	watcher.checkUpdateCh = make(chan checkWatchUpdate, 8)
	watcher.done = make(chan struct{})
	return watcher
}

// Watch starts watching the check identified by checkID and restarts its task
// through wr if the check stays unhealthy. Checks without check_restart
// configured are ignored.
func (w *UniversalCheckWatcher) Watch(allocID, taskName, checkID string, check *structs.ServiceCheck, wr WorkloadRestarter) {
	// check_restart not set; no-op
	if !check.TriggersRestarts() {
		return
	}

	policy := check.CheckRestart

	state := new(restarter)
	state.allocID = allocID
	state.taskName = taskName
	state.checkID = checkID
	state.checkName = check.Name
	state.taskKey = key(allocID + taskName)
	state.task = wr
	state.interval = check.Interval
	state.grace = policy.Grace
	state.graceUntil = time.Now().Add(policy.Grace)
	// the first failure already counts, so wait for Limit-1 more intervals
	state.timeLimit = check.Interval * time.Duration(policy.Limit-1)
	state.ignoreWarnings = policy.IgnoreWarnings
	state.logger = w.logger.With("alloc_id", allocID, "task", taskName, "check", check.Name)

	request := checkWatchUpdate{checkID: checkID, restart: state}

	// hand the new watch to the main loop, unless it has already exited
	select {
	case w.checkUpdateCh <- request:
	case <-w.done:
	}
}

// Unwatch stops watching the check identified by checkID.
func (w *UniversalCheckWatcher) Unwatch(checkID string) {
	removal := checkWatchUpdate{checkID: checkID, remove: true}

	// hand the removal to the main loop, unless it has already exited
	select {
	case w.checkUpdateCh <- removal:
	case <-w.done:
	}
}

// Run is the main loop of the watcher. It owns the set of watched checks,
// applies the additions and removals sent by Watch and Unwatch, and polls the
// check statuses every pollFrequency for as long as at least one check is
// being watched.
//
// Run blocks until ctx is cancelled, so it must be called as a goroutine. The
// done channel is closed when Run returns, which releases any Watch or
// Unwatch call still waiting to deliver its update.
func (w *UniversalCheckWatcher) Run(ctx context.Context) {
	defer close(w.done)

	// map of checkID to their restarter handle (contains only checks we are watching)
	watched := make(map[string]*restarter)

	checkTimer, cleanupCheckTimer := helper.NewSafeTimer(0)
	defer cleanupCheckTimer()

	// stopCheckTimer stops the timer and drains a tick that may already be
	// waiting in the channel, so a later Reset starts from a clean state.
	stopCheckTimer := func() { // todo: refactor using that other pattern
		checkTimer.Stop()
		select {
		case <-checkTimer.C:
		default:
		}
	}

	// initialize with checkTimer disabled
	stopCheckTimer()

	for {
		// disable polling if there are no checks
		if len(watched) == 0 {
			stopCheckTimer()
		}

		select {
		// caller cancelled us; goodbye
		case <-ctx.Done():
			return

		// received an update; add or remove check
		case update := <-w.checkUpdateCh:
			if update.remove {
				delete(watched, update.checkID)
				continue
			}

			watched[update.checkID] = update.restart
			allocID := update.restart.allocID
			taskName := update.restart.taskName
			checkName := update.restart.checkName
			w.logger.Trace("now watching check", "alloc_id", allocID, "task", taskName, "check", checkName)

			// turn on the timer if we are now active
			if len(watched) == 1 {
				stopCheckTimer()
				checkTimer.Reset(w.pollFrequency)
			}

		// poll time; refresh check statuses
		case now := <-checkTimer.C:
			w.interval(ctx, now, watched)
			checkTimer.Reset(w.pollFrequency)
		}
	}
}

// interval performs one polling pass. It fetches the current check statuses
// from the getter, applies each watched check's status to its restarter, and
// removes from watched every check whose task is being restarted (those
// checks are watched again when the task starts back up).
//
// now is the timestamp of the pass and is shared by all checks. watched is
// modified in place.
//
// If the statuses cannot be retrieved the pass is skipped entirely. The error
// is logged only for the first failure of a streak, so a prolonged outage
// does not flood the log.
func (w *UniversalCheckWatcher) interval(ctx context.Context, now time.Time, watched map[string]*restarter) {
	statuses, err := w.getter.Get()
	if err != nil {
		// Always bail out on error: carrying on without statuses would make
		// every watched check look like it is missing.
		if !w.failedPreviousInterval {
			w.failedPreviousInterval = true
			w.logger.Error("failed to retrieve check statuses", "error", err)
		}
		return
	}
	w.failedPreviousInterval = false

	// keep track of tasks restarted this interval
	restarts := set.New[key](len(statuses))

	// iterate over status of all checks, and update the status of checks
	// we care about watching
	for checkID, checkRestarter := range watched {
		if ctx.Err() != nil {
			return //  short circuit; caller cancelled us
		}

		if restarts.Contains(checkRestarter.taskKey) {
			// skip; task is already being restarted
			delete(watched, checkID)
			continue
		}

		status, exists := statuses[checkID]
		if !exists {
			// warn only if outside grace period; avoiding race with check registration
			if now.After(checkRestarter.graceUntil) {
				w.logger.Warn("watched check not found", "check_id", checkID)
			}
			continue
		}

		if checkRestarter.apply(ctx, now, status) {
			// check will be re-registered & re-watched on startup
			delete(watched, checkID)
			restarts.Insert(checkRestarter.taskKey)
		}
	}

	// purge passing checks of tasks that are being restarted; this catches
	// checks that were visited before their task's restart was triggered
	if restarts.Size() > 0 {
		for checkID, checkRestarter := range watched {
			if restarts.Contains(checkRestarter.taskKey) {
				delete(watched, checkID)
			}
		}
	}
}
client: refactor check watcher to be reusable This PR refactors agent/consul/check_watcher into client/serviceregistration, and abstracts away the Consul-specific check lookups. In doing so we should be able to reuse the existing check watcher logic for also watching NSD checks in a followup PR. A chunk of consul/unit_test.go is removed - we'll cover that in e2e tests in a follow PR if needed. In the long run I'd like to remove this whole file. 2022-09-09 17:47:22 +00:00			`package serviceregistration`

			`import (`
			`"context"`
			`"fmt"`
			`"time"`

			`"github.com/hashicorp/go-hclog"`
			`"github.com/hashicorp/go-set"`
			`"github.com/hashicorp/nomad/helper"`
			`"github.com/hashicorp/nomad/nomad/structs"`
			`)`

			`// composite of allocID + taskName for uniqueness`
			`type key string`

			`type restarter struct {`
			`allocID string`
			`taskName string`
			`checkID string`
			`checkName string`
			`taskKey key`

			`logger hclog.Logger`
			`task WorkloadRestarter`
			`grace time.Duration`
			`interval time.Duration`
			`timeLimit time.Duration`
			`ignoreWarnings bool`

			`// unhealthyState is the time a check first went unhealthy. Set to the`
			`// zero value if the check passes before timeLimit.`
			`unhealthyState time.Time`

			`// graceUntil is when the check's grace period expires and unhealthy`
			`// checks should be counted.`
			`graceUntil time.Time`
			`}`

			`// apply restart state for check and restart task if necessary. Current`
			`// timestamp is passed in so all check updates have the same view of time (and`
			`// to ease testing).`
			`//`
			`// Returns true if a restart was triggered in which case this check should be`
			`// removed (checks are added on task startup).`
			`func (r *restarter) apply(ctx context.Context, now time.Time, status string) bool {`
			`healthy := func() {`
			`if !r.unhealthyState.IsZero() {`
			`r.logger.Debug("canceling restart because check became healthy")`
			`r.unhealthyState = time.Time{}`
			`}`
			`}`
			`switch status {`
cleanup: create interface for check watcher and mock it in nsd tests (#14577) * cleanup: create interface for check watcher and mock it in nsd tests * cleanup: add comments for check watcher interface 2022-09-14 13:25:20 +00:00			`case "critical": // consul`
			`case string(structs.CheckFailure): // nomad`
			`case string(structs.CheckPending): // nomad`
			`case "warning": // consul`
client: refactor check watcher to be reusable This PR refactors agent/consul/check_watcher into client/serviceregistration, and abstracts away the Consul-specific check lookups. In doing so we should be able to reuse the existing check watcher logic for also watching NSD checks in a followup PR. A chunk of consul/unit_test.go is removed - we'll cover that in e2e tests in a follow PR if needed. In the long run I'd like to remove this whole file. 2022-09-09 17:47:22 +00:00			`if r.ignoreWarnings {`
			`// Warnings are ignored, reset state and exit`
			`healthy()`
			`return false`
			`}`
			`default:`
			`// All other statuses are ok, reset state and exit`
			`healthy()`
			`return false`
			`}`

			`if now.Before(r.graceUntil) {`
			`// In grace period, exit`
			`return false`
			`}`

			`if r.unhealthyState.IsZero() {`
			`// First failure, set restart deadline`
			`if r.timeLimit != 0 {`
			`r.logger.Debug("check became unhealthy. Will restart if check doesn't become healthy", "time_limit", r.timeLimit)`
			`}`
			`r.unhealthyState = now`
			`}`

			`// restart timeLimit after start of this check becoming unhealthy`
			`restartAt := r.unhealthyState.Add(r.timeLimit)`

			`// Must test >= because if limit=1, restartAt == first failure`
			`if now.Equal(restartAt) \|\| now.After(restartAt) {`
			`// hasn't become healthy by deadline, restart!`
			`r.logger.Debug("restarting due to unhealthy check")`

			`// Tell TaskRunner to restart due to failure`
			`reason := fmt.Sprintf("healthcheck: check %q unhealthy", r.checkName)`
			`event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason)`
			`go asyncRestart(ctx, r.logger, r.task, event)`
			`return true`
			`}`

			`return false`
			`}`

			`// asyncRestart mimics the pre-0.9 TaskRunner.Restart behavior and is intended`
			`// to be called in a goroutine.`
			`func asyncRestart(ctx context.Context, logger hclog.Logger, task WorkloadRestarter, event *structs.TaskEvent) {`
			`// Check watcher restarts are always failures`
			`const failure = true`

			`// Restarting is asynchronous so there's no reason to allow this`
			`// goroutine to block indefinitely.`
			`ctx, cancel := context.WithTimeout(ctx, 10*time.Second)`
			`defer cancel()`

			`if err := task.Restart(ctx, event, failure); err != nil {`
			`// Restart errors are not actionable and only relevant when`
			`// debugging allocation lifecycle management.`
			`logger.Debug("failed to restart task", "error", err, "event_time", event.Time, "event_type", event.Type)`
			`}`
			`}`

			`// CheckStatusGetter is implemented per-provider.`
			`type CheckStatusGetter interface {`
			`// Get returns a map from CheckID -> (minimal) CheckStatus`
servicedisco: implement check_restart for nomad service checks This PR implements support for check_restart for checks registered in the Nomad service provider. Unlike Consul, Nomad service checks never report a "warning" status, and so the check_restart.ignore_warnings configuration is not valid for Nomad service checks. 2022-09-12 20:23:21 +00:00			`Get() (map[string]string, error)`
client: refactor check watcher to be reusable This PR refactors agent/consul/check_watcher into client/serviceregistration, and abstracts away the Consul-specific check lookups. In doing so we should be able to reuse the existing check watcher logic for also watching NSD checks in a followup PR. A chunk of consul/unit_test.go is removed - we'll cover that in e2e tests in a follow PR if needed. In the long run I'd like to remove this whole file. 2022-09-09 17:47:22 +00:00			`}`

			`// checkWatchUpdates add or remove checks from the watcher`
			`type checkWatchUpdate struct {`
			`checkID string`
			`remove bool`
			`restart *restarter`
			`}`

cleanup: create interface for check watcher and mock it in nsd tests (#14577) * cleanup: create interface for check watcher and mock it in nsd tests * cleanup: add comments for check watcher interface 2022-09-14 13:25:20 +00:00			`// A CheckWatcher watches for check failures and restarts tasks according to`
			`// their check_restart policy.`
			`type CheckWatcher interface {`
			`// Run the CheckWatcher. Maintains a background process to continuously`
			`// monitor active checks. Must be called before Watch or Unwatch. Must be`
			`// called as a goroutine.`
			`Run(ctx context.Context)`

			`// Watch the given check. If the check status enters a failing state, the`
			`// task associated with the check will be restarted according to its check_restart`
			`// policy via wr.`
			`Watch(allocID, taskName, checkID string, check *structs.ServiceCheck, wr WorkloadRestarter)`

			`// Unwatch will cause the CheckWatcher to no longer monitor the check of given checkID.`
			`Unwatch(checkID string)`
			`}`

			`// UniversalCheckWatcher is an implementation of CheckWatcher capable of watching`
			`// checks in the Nomad or Consul service providers.`
			`type UniversalCheckWatcher struct {`
client: refactor check watcher to be reusable This PR refactors agent/consul/check_watcher into client/serviceregistration, and abstracts away the Consul-specific check lookups. In doing so we should be able to reuse the existing check watcher logic for also watching NSD checks in a followup PR. A chunk of consul/unit_test.go is removed - we'll cover that in e2e tests in a follow PR if needed. In the long run I'd like to remove this whole file. 2022-09-09 17:47:22 +00:00			`logger hclog.Logger`
			`getter CheckStatusGetter`

			`// pollFrequency is how often to poll the checks API`
			`pollFrequency time.Duration`

			`// checkUpdateCh sends watches/removals to the main loop`
			`checkUpdateCh chan checkWatchUpdate`

			`// done is closed when Run has exited`
			`done chan struct{}`

			`// failedPreviousInterval is used to indicate whether something went wrong during`
			`// the previous poll interval - if so we can silence ongoing errors`
			`failedPreviousInterval bool`
			`}`

cleanup: create interface for check watcher and mock it in nsd tests (#14577) * cleanup: create interface for check watcher and mock it in nsd tests * cleanup: add comments for check watcher interface 2022-09-14 13:25:20 +00:00			`func NewCheckWatcher(logger hclog.Logger, getter CheckStatusGetter) *UniversalCheckWatcher {`
			`return &UniversalCheckWatcher{`
client: refactor check watcher to be reusable This PR refactors agent/consul/check_watcher into client/serviceregistration, and abstracts away the Consul-specific check lookups. In doing so we should be able to reuse the existing check watcher logic for also watching NSD checks in a followup PR. A chunk of consul/unit_test.go is removed - we'll cover that in e2e tests in a follow PR if needed. In the long run I'd like to remove this whole file. 2022-09-09 17:47:22 +00:00			`logger: logger.ResetNamed("watch.checks"),`
			`getter: getter,`
			`pollFrequency: 1 * time.Second,`
			`checkUpdateCh: make(chan checkWatchUpdate, 8),`
			`done: make(chan struct{}),`
			`}`
			`}`

			`// Watch a check and restart its task if unhealthy.`
cleanup: create interface for check watcher and mock it in nsd tests (#14577) * cleanup: create interface for check watcher and mock it in nsd tests * cleanup: add comments for check watcher interface 2022-09-14 13:25:20 +00:00			`func (w UniversalCheckWatcher) Watch(allocID, taskName, checkID string, check structs.ServiceCheck, wr WorkloadRestarter) {`
client: refactor check watcher to be reusable This PR refactors agent/consul/check_watcher into client/serviceregistration, and abstracts away the Consul-specific check lookups. In doing so we should be able to reuse the existing check watcher logic for also watching NSD checks in a followup PR. A chunk of consul/unit_test.go is removed - we'll cover that in e2e tests in a follow PR if needed. In the long run I'd like to remove this whole file. 2022-09-09 17:47:22 +00:00			`if !check.TriggersRestarts() {`
			`return // check_restart not set; no-op`
			`}`

			`c := &restarter{`
			`allocID: allocID,`
			`taskName: taskName,`
			`checkID: checkID,`
			`checkName: check.Name,`
			`taskKey: key(allocID + taskName),`
			`task: wr,`
			`interval: check.Interval,`
			`grace: check.CheckRestart.Grace,`
			`graceUntil: time.Now().Add(check.CheckRestart.Grace),`
			`timeLimit: check.Interval * time.Duration(check.CheckRestart.Limit-1),`
			`ignoreWarnings: check.CheckRestart.IgnoreWarnings,`
			`logger: w.logger.With("alloc_id", allocID, "task", taskName, "check", check.Name),`
			`}`

			`select {`
			`case w.checkUpdateCh <- checkWatchUpdate{`
			`checkID: checkID,`
			`restart: c,`
			`}: // activate watch`
			`case <-w.done: // exited; nothing to do`
			`}`
			`}`

			`// Unwatch a check.`
cleanup: create interface for check watcher and mock it in nsd tests (#14577) * cleanup: create interface for check watcher and mock it in nsd tests * cleanup: add comments for check watcher interface 2022-09-14 13:25:20 +00:00			`func (w *UniversalCheckWatcher) Unwatch(checkID string) {`
client: refactor check watcher to be reusable This PR refactors agent/consul/check_watcher into client/serviceregistration, and abstracts away the Consul-specific check lookups. In doing so we should be able to reuse the existing check watcher logic for also watching NSD checks in a followup PR. A chunk of consul/unit_test.go is removed - we'll cover that in e2e tests in a follow PR if needed. In the long run I'd like to remove this whole file. 2022-09-09 17:47:22 +00:00			`select {`
			`case w.checkUpdateCh <- checkWatchUpdate{`
			`checkID: checkID,`
			`remove: true,`
			`}: // deactivate watch`
			`case <-w.done: // exited; nothing to do`
			`}`
			`}`

cleanup: create interface for check watcher and mock it in nsd tests (#14577) * cleanup: create interface for check watcher and mock it in nsd tests * cleanup: add comments for check watcher interface 2022-09-14 13:25:20 +00:00			`func (w *UniversalCheckWatcher) Run(ctx context.Context) {`
client: refactor check watcher to be reusable This PR refactors agent/consul/check_watcher into client/serviceregistration, and abstracts away the Consul-specific check lookups. In doing so we should be able to reuse the existing check watcher logic for also watching NSD checks in a followup PR. A chunk of consul/unit_test.go is removed - we'll cover that in e2e tests in a follow PR if needed. In the long run I'd like to remove this whole file. 2022-09-09 17:47:22 +00:00			`defer close(w.done)`

			`// map of checkID to their restarter handle (contains only checks we are watching)`
			`watched := make(map[string]*restarter)`

			`checkTimer, cleanupCheckTimer := helper.NewSafeTimer(0)`
			`defer cleanupCheckTimer()`

			`stopCheckTimer := func() { // todo: refactor using that other pattern`
			`checkTimer.Stop()`
			`select {`
			`case <-checkTimer.C:`
			`default:`
			`}`
			`}`

			`// initialize with checkTimer disabled`
			`stopCheckTimer()`

			`for {`
			`// disable polling if there are no checks`
			`if len(watched) == 0 {`
			`stopCheckTimer()`
			`}`

			`select {`
			`// caller cancelled us; goodbye`
			`case <-ctx.Done():`
			`return`

			`// received an update; add or remove check`
			`case update := <-w.checkUpdateCh:`
			`if update.remove {`
			`delete(watched, update.checkID)`
			`continue`
			`}`

			`watched[update.checkID] = update.restart`
			`allocID := update.restart.allocID`
			`taskName := update.restart.taskName`
			`checkName := update.restart.checkName`
			`w.logger.Trace("now watching check", "alloc_i", allocID, "task", taskName, "check", checkName)`

			`// turn on the timer if we are now active`
			`if len(watched) == 1 {`
			`stopCheckTimer()`
			`checkTimer.Reset(w.pollFrequency)`
			`}`

			`// poll time; refresh check statuses`
			`case now := <-checkTimer.C:`
			`w.interval(ctx, now, watched)`
			`checkTimer.Reset(w.pollFrequency)`
			`}`
			`}`
			`}`

cleanup: create interface for check watcher and mock it in nsd tests (#14577) * cleanup: create interface for check watcher and mock it in nsd tests * cleanup: add comments for check watcher interface 2022-09-14 13:25:20 +00:00			`func (w UniversalCheckWatcher) interval(ctx context.Context, now time.Time, watched map[string]restarter) {`
client: refactor check watcher to be reusable This PR refactors agent/consul/check_watcher into client/serviceregistration, and abstracts away the Consul-specific check lookups. In doing so we should be able to reuse the existing check watcher logic for also watching NSD checks in a followup PR. A chunk of consul/unit_test.go is removed - we'll cover that in e2e tests in a follow PR if needed. In the long run I'd like to remove this whole file. 2022-09-09 17:47:22 +00:00			`statuses, err := w.getter.Get()`
			`if err != nil && !w.failedPreviousInterval {`
			`w.failedPreviousInterval = true`
			`w.logger.Error("failed to retrieve check statuses", "error", err)`
			`return`
			`}`
			`w.failedPreviousInterval = false`

			`// keep track of tasks restarted this interval`
			`restarts := set.New[key](len(statuses))`

			`// iterate over status of all checks, and update the status of checks`
			`// we care about watching`
			`for checkID, checkRestarter := range watched {`
			`if ctx.Err() != nil {`
			`return // short circuit; caller cancelled us`
			`}`

			`if restarts.Contains(checkRestarter.taskKey) {`
			`// skip; task is already being restarted`
			`delete(watched, checkID)`
			`continue`
			`}`

			`status, exists := statuses[checkID]`
			`if !exists {`
			`// warn only if outside grace period; avoiding race with check registration`
			`if now.After(checkRestarter.graceUntil) {`
			`w.logger.Warn("watched check not found", "check_id", checkID)`
			`}`
			`continue`
			`}`

servicedisco: implement check_restart for nomad service checks This PR implements support for check_restart for checks registered in the Nomad service provider. Unlike Consul, Nomad service checks never report a "warning" status, and so the check_restart.ignore_warnings configuration is not valid for Nomad service checks. 2022-09-12 20:23:21 +00:00			`if checkRestarter.apply(ctx, now, status) {`
client: refactor check watcher to be reusable This PR refactors agent/consul/check_watcher into client/serviceregistration, and abstracts away the Consul-specific check lookups. In doing so we should be able to reuse the existing check watcher logic for also watching NSD checks in a followup PR. A chunk of consul/unit_test.go is removed - we'll cover that in e2e tests in a follow PR if needed. In the long run I'd like to remove this whole file. 2022-09-09 17:47:22 +00:00			`// check will be re-registered & re-watched on startup`
			`delete(watched, checkID)`
			`restarts.Insert(checkRestarter.taskKey)`
			`}`
			`}`

			`// purge passing checks of tasks that are being restarted`
			`if restarts.Size() > 0 {`
			`for checkID, checkRestarter := range watched {`
			`if restarts.Contains(checkRestarter.taskKey) {`
			`delete(watched, checkID)`
			`}`
			`}`
			`}`
			`}`