2023-04-10 15:36:59 +00:00
|
|
|
// Copyright (c) HashiCorp, Inc.
|
|
|
|
// SPDX-License-Identifier: MPL-2.0
|
|
|
|
|
2018-10-04 23:22:01 +00:00
|
|
|
package allocrunner
|
2018-08-30 21:33:50 +00:00
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"fmt"
|
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
|
2022-06-07 14:18:19 +00:00
|
|
|
"github.com/hashicorp/go-hclog"
|
2018-08-30 21:33:50 +00:00
|
|
|
"github.com/hashicorp/nomad/client/allochealth"
|
2018-10-04 23:22:01 +00:00
|
|
|
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
|
2022-03-15 08:38:30 +00:00
|
|
|
"github.com/hashicorp/nomad/client/serviceregistration"
|
2022-06-07 14:18:19 +00:00
|
|
|
"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
|
2018-08-30 21:33:50 +00:00
|
|
|
cstructs "github.com/hashicorp/nomad/client/structs"
|
2023-03-10 19:43:31 +00:00
|
|
|
"github.com/hashicorp/nomad/client/taskenv"
|
2018-08-30 21:33:50 +00:00
|
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
|
|
)
|
|
|
|
|
2022-06-07 14:18:19 +00:00
|
|
|
// healthSetter is able to set/clear alloc health.
|
2018-09-11 00:07:59 +00:00
|
|
|
type healthSetter interface {
|
2018-12-07 01:24:43 +00:00
|
|
|
// HasHealth returns true if health is already set.
|
|
|
|
HasHealth() bool
|
|
|
|
|
2022-06-07 14:18:19 +00:00
|
|
|
// SetHealth via the mutator.
|
2018-08-30 21:33:50 +00:00
|
|
|
SetHealth(healthy, isDeploy bool, taskEvents map[string]*structs.TaskEvent)
|
|
|
|
|
2022-06-07 14:18:19 +00:00
|
|
|
// ClearHealth for when the deployment ID changes.
|
2018-08-30 21:33:50 +00:00
|
|
|
ClearHealth()
|
|
|
|
}
|
|
|
|
|
|
|
|
// allocHealthWatcherHook is responsible for watching an allocation's task
|
|
|
|
// status and (optionally) Consul health check status to determine if the
|
2022-06-07 14:18:19 +00:00
|
|
|
// allocation is healthy or unhealthy. Used by deployments and migrations.
|
2018-08-30 21:33:50 +00:00
|
|
|
type allocHealthWatcherHook struct {
|
2018-09-11 00:07:59 +00:00
|
|
|
healthSetter healthSetter
|
2018-08-30 21:33:50 +00:00
|
|
|
|
2022-06-07 14:18:19 +00:00
|
|
|
// consul client used to monitor Consul service health checks
|
2022-03-15 08:38:30 +00:00
|
|
|
consul serviceregistration.Handler
|
2018-08-30 21:33:50 +00:00
|
|
|
|
2022-06-07 14:18:19 +00:00
|
|
|
// checkStore is used to monitor Nomad service health checks
|
|
|
|
checkStore checkstore.Shim
|
|
|
|
|
2018-08-30 21:33:50 +00:00
|
|
|
// listener is given to trackers to listen for alloc updates and closed
|
|
|
|
// when the alloc is destroyed.
|
|
|
|
listener *cstructs.AllocListener
|
|
|
|
|
|
|
|
// hookLock is held by hook methods to prevent concurrent access by
|
|
|
|
// Update and synchronous hooks.
|
|
|
|
hookLock sync.Mutex
|
|
|
|
|
2018-09-14 00:25:01 +00:00
|
|
|
// watchDone is created before calling watchHealth and is closed when
|
|
|
|
// watchHealth exits. Must be passed into watchHealth to avoid races.
|
|
|
|
// Initialized already closed as Update may be called before Prerun.
|
|
|
|
watchDone chan struct{}
|
2018-08-30 21:33:50 +00:00
|
|
|
|
|
|
|
// ranOnce is set once Prerun or Update have run at least once. This
|
|
|
|
// prevents Prerun from running if an Update has already been
|
|
|
|
// processed. Must hold hookLock to access.
|
|
|
|
ranOnce bool
|
|
|
|
|
2018-09-11 00:48:09 +00:00
|
|
|
// cancelFn stops the health watching/setting goroutine. Wait on
|
|
|
|
// watchLock to block until the watcher exits.
|
2018-08-30 21:33:50 +00:00
|
|
|
cancelFn context.CancelFunc
|
|
|
|
|
|
|
|
// alloc set by new func or Update. Must hold hookLock to access.
|
|
|
|
alloc *structs.Allocation
|
|
|
|
|
2023-03-10 19:43:31 +00:00
|
|
|
// taskEnvBuilder is the current builder used to build task environments
|
|
|
|
// for the group and each of its tasks. Must hold hookLock to modify.
|
|
|
|
taskEnvBuilder *taskenv.Builder
|
|
|
|
|
|
|
|
// taskEnvBuilderFactory creates a new *taskenv.Builder instance.
|
|
|
|
taskEnvBuilderFactory func() *taskenv.Builder
|
|
|
|
|
2018-08-30 21:33:50 +00:00
|
|
|
// isDeploy is true if monitoring a deployment. Set in init(). Must
|
|
|
|
// hold hookLock to access.
|
|
|
|
isDeploy bool
|
|
|
|
|
2022-06-07 14:18:19 +00:00
|
|
|
logger hclog.Logger
|
2018-08-30 21:33:50 +00:00
|
|
|
}
|
|
|
|
|
2023-03-10 19:43:31 +00:00
|
|
|
func newAllocHealthWatcherHook(
|
|
|
|
logger hclog.Logger,
|
|
|
|
alloc *structs.Allocation,
|
|
|
|
taskEnvBuilderFactory func() *taskenv.Builder,
|
|
|
|
hs healthSetter,
|
|
|
|
listener *cstructs.AllocListener,
|
|
|
|
consul serviceregistration.Handler,
|
|
|
|
checkStore checkstore.Shim,
|
|
|
|
) interfaces.RunnerHook {
|
2018-08-30 21:33:50 +00:00
|
|
|
|
|
|
|
// Neither deployments nor migrations care about the health of
|
|
|
|
// non-service jobs so never watch their health
|
|
|
|
if alloc.Job.Type != structs.JobTypeService {
|
|
|
|
return noopAllocHealthWatcherHook{}
|
|
|
|
}
|
|
|
|
|
2018-09-14 00:25:01 +00:00
|
|
|
// Initialize watchDone with a closed chan in case Update runs before Prerun
|
|
|
|
closedDone := make(chan struct{})
|
|
|
|
close(closedDone)
|
|
|
|
|
2018-08-30 21:33:50 +00:00
|
|
|
h := &allocHealthWatcherHook{
|
2023-03-10 19:43:31 +00:00
|
|
|
alloc: alloc,
|
|
|
|
taskEnvBuilderFactory: taskEnvBuilderFactory,
|
|
|
|
taskEnvBuilder: taskEnvBuilderFactory(),
|
|
|
|
cancelFn: func() {}, // initialize to prevent nil func panics
|
|
|
|
watchDone: closedDone,
|
|
|
|
consul: consul,
|
|
|
|
checkStore: checkStore,
|
|
|
|
healthSetter: hs,
|
|
|
|
listener: listener,
|
2018-08-30 21:33:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
h.logger = logger.Named(h.Name())
|
|
|
|
return h
|
|
|
|
}
|
|
|
|
|
|
|
|
func (h *allocHealthWatcherHook) Name() string {
|
|
|
|
return "alloc_health_watcher"
|
|
|
|
}
|
|
|
|
|
|
|
|
// init starts the allochealth.Tracker and watchHealth goroutine on either
|
|
|
|
// Prerun or Update. Caller must set/update alloc and logger fields.
|
|
|
|
//
|
|
|
|
// Not threadsafe so the caller should lock since Updates occur concurrently.
|
|
|
|
func (h *allocHealthWatcherHook) init() error {
|
|
|
|
// No need to watch health as it's already set
|
2018-12-07 01:24:43 +00:00
|
|
|
if h.healthSetter.HasHealth() {
|
2019-02-15 23:42:46 +00:00
|
|
|
h.logger.Trace("not watching; already has health set")
|
2018-08-30 21:33:50 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
tg := h.alloc.Job.LookupTaskGroup(h.alloc.TaskGroup)
|
|
|
|
if tg == nil {
|
|
|
|
return fmt.Errorf("task group %q does not exist in job %q", h.alloc.TaskGroup, h.alloc.Job.ID)
|
|
|
|
}
|
|
|
|
|
|
|
|
h.isDeploy = h.alloc.DeploymentID != ""
|
|
|
|
|
|
|
|
// No need to watch allocs for deployments that rely on operators
|
|
|
|
// manually setting health
|
2019-09-02 17:30:09 +00:00
|
|
|
if h.isDeploy && (tg.Update.IsEmpty() || tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Manual) {
|
2018-08-30 21:33:50 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Define the deadline, health method, min healthy time from the
|
|
|
|
// deployment if this is a deployment; otherwise from the migration
|
|
|
|
// strategy.
|
|
|
|
deadline, useChecks, minHealthyTime := getHealthParams(time.Now(), tg, h.isDeploy)
|
|
|
|
|
2019-02-15 23:42:46 +00:00
|
|
|
// Create a context that is canceled when the tracker should shutdown.
|
2018-08-30 21:33:50 +00:00
|
|
|
ctx := context.Background()
|
2019-02-15 23:42:46 +00:00
|
|
|
ctx, h.cancelFn = context.WithCancel(ctx)
|
2018-08-30 21:33:50 +00:00
|
|
|
|
2019-02-15 23:42:46 +00:00
|
|
|
h.logger.Trace("watching", "deadline", deadline, "checks", useChecks, "min_healthy_time", minHealthyTime)
|
2018-08-30 21:33:50 +00:00
|
|
|
// Create a new tracker, start it, and watch for health results.
|
2022-06-07 14:18:19 +00:00
|
|
|
tracker := allochealth.NewTracker(
|
2023-03-10 19:43:31 +00:00
|
|
|
ctx, h.logger, h.alloc, h.listener, h.taskEnvBuilder, h.consul, h.checkStore, minHealthyTime, useChecks,
|
2022-06-07 14:18:19 +00:00
|
|
|
)
|
2018-08-30 21:33:50 +00:00
|
|
|
tracker.Start()
|
2018-09-14 00:25:01 +00:00
|
|
|
|
|
|
|
// Create a new done chan and start watching for health updates
|
|
|
|
h.watchDone = make(chan struct{})
|
2019-02-15 23:42:46 +00:00
|
|
|
go h.watchHealth(ctx, deadline, tracker, h.watchDone)
|
2018-08-30 21:33:50 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-03-12 22:03:54 +00:00
|
|
|
func (h *allocHealthWatcherHook) Prerun() error {
|
2018-08-30 21:33:50 +00:00
|
|
|
h.hookLock.Lock()
|
|
|
|
defer h.hookLock.Unlock()
|
|
|
|
|
|
|
|
if h.ranOnce {
|
|
|
|
// An Update beat Prerun to running the watcher; noop
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
h.ranOnce = true
|
|
|
|
return h.init()
|
|
|
|
}
|
|
|
|
|
|
|
|
func (h *allocHealthWatcherHook) Update(req *interfaces.RunnerUpdateRequest) error {
|
|
|
|
h.hookLock.Lock()
|
|
|
|
defer h.hookLock.Unlock()
|
|
|
|
|
|
|
|
// Prevent Prerun from running after an Update
|
|
|
|
h.ranOnce = true
|
|
|
|
|
|
|
|
// Cancel the old watcher and create a new one
|
|
|
|
h.cancelFn()
|
|
|
|
|
2018-09-11 00:48:09 +00:00
|
|
|
// Wait until the watcher exits
|
2018-09-14 00:25:01 +00:00
|
|
|
<-h.watchDone
|
2018-08-30 21:33:50 +00:00
|
|
|
|
|
|
|
// Deployment has changed, reset status
|
|
|
|
if req.Alloc.DeploymentID != h.alloc.DeploymentID {
|
2018-09-11 00:07:59 +00:00
|
|
|
h.healthSetter.ClearHealth()
|
2018-08-30 21:33:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Update alloc
|
|
|
|
h.alloc = req.Alloc
|
|
|
|
|
2023-03-10 19:43:31 +00:00
|
|
|
// Create a new taskEnvBuilder with the updated alloc and a nil task
|
|
|
|
h.taskEnvBuilder = h.taskEnvBuilderFactory().UpdateTask(req.Alloc, nil)
|
|
|
|
|
2018-08-30 21:33:50 +00:00
|
|
|
return h.init()
|
|
|
|
}
|
|
|
|
|
2018-12-06 20:30:31 +00:00
|
|
|
func (h *allocHealthWatcherHook) Postrun() error {
|
2018-08-30 21:33:50 +00:00
|
|
|
h.hookLock.Lock()
|
|
|
|
defer h.hookLock.Unlock()
|
|
|
|
|
|
|
|
h.cancelFn()
|
|
|
|
h.listener.Close()
|
|
|
|
|
2018-09-11 00:48:09 +00:00
|
|
|
// Wait until the watcher exits
|
2018-09-14 00:25:01 +00:00
|
|
|
<-h.watchDone
|
2018-08-30 21:33:50 +00:00
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2018-11-14 18:29:07 +00:00
|
|
|
func (h *allocHealthWatcherHook) Shutdown() {
|
2018-12-06 20:30:31 +00:00
|
|
|
// Same as Postrun
|
2022-06-07 14:18:19 +00:00
|
|
|
_ = h.Postrun()
|
2018-11-14 18:29:07 +00:00
|
|
|
}
|
|
|
|
|
2019-02-15 23:42:46 +00:00
|
|
|
// watchHealth watches alloc health until it is set, the alloc is stopped, the
|
|
|
|
// deadline is reached, or the context is canceled. watchHealth will be
|
|
|
|
// canceled and restarted on Updates so calls are serialized with a lock.
|
|
|
|
func (h *allocHealthWatcherHook) watchHealth(ctx context.Context, deadline time.Time, tracker *allochealth.Tracker, done chan<- struct{}) {
|
2018-09-14 00:25:01 +00:00
|
|
|
defer close(done)
|
2018-08-30 21:33:50 +00:00
|
|
|
|
2019-02-15 23:42:46 +00:00
|
|
|
// Default to unhealthy for the deadline reached case
|
|
|
|
healthy := false
|
|
|
|
|
2018-08-30 21:33:50 +00:00
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
2019-02-15 23:42:46 +00:00
|
|
|
// Graceful shutdown
|
2018-08-30 21:33:50 +00:00
|
|
|
return
|
|
|
|
|
|
|
|
case <-tracker.AllocStoppedCh():
|
2019-02-15 23:42:46 +00:00
|
|
|
// Allocation has stopped so no need to set health
|
2018-08-30 21:33:50 +00:00
|
|
|
return
|
|
|
|
|
2020-12-09 19:05:18 +00:00
|
|
|
case <-time.After(time.Until(deadline)):
|
2019-02-15 23:42:46 +00:00
|
|
|
// Time is up! Fallthrough to set unhealthy.
|
|
|
|
h.logger.Trace("deadline reached; setting unhealthy", "deadline", deadline)
|
2018-08-30 21:33:50 +00:00
|
|
|
|
2019-02-15 23:42:46 +00:00
|
|
|
case healthy = <-tracker.HealthyCh():
|
|
|
|
// Health received. Fallthrough to set it.
|
2018-08-30 21:33:50 +00:00
|
|
|
}
|
2019-02-15 23:42:46 +00:00
|
|
|
|
|
|
|
h.logger.Trace("health set", "healthy", healthy)
|
|
|
|
|
|
|
|
// If this is an unhealthy deployment emit events for tasks
|
|
|
|
var taskEvents map[string]*structs.TaskEvent
|
|
|
|
if !healthy && h.isDeploy {
|
|
|
|
taskEvents = tracker.TaskEvents()
|
|
|
|
}
|
|
|
|
|
|
|
|
h.healthSetter.SetHealth(healthy, h.isDeploy, taskEvents)
|
2018-08-30 21:33:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// getHealthParams returns the health watcher parameters which vary based on
|
|
|
|
// whether this allocation is in a deployment or migration.
|
|
|
|
func getHealthParams(now time.Time, tg *structs.TaskGroup, isDeploy bool) (deadline time.Time, useChecks bool, minHealthyTime time.Duration) {
|
|
|
|
if isDeploy {
|
|
|
|
deadline = now.Add(tg.Update.HealthyDeadline)
|
|
|
|
minHealthyTime = tg.Update.MinHealthyTime
|
|
|
|
useChecks = tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
|
|
|
|
} else {
|
|
|
|
strategy := tg.Migrate
|
|
|
|
if strategy == nil {
|
|
|
|
// For backwards compat with pre-0.8 allocations that
|
|
|
|
// don't have a migrate strategy set.
|
|
|
|
strategy = structs.DefaultMigrateStrategy()
|
|
|
|
}
|
|
|
|
|
|
|
|
deadline = now.Add(strategy.HealthyDeadline)
|
|
|
|
minHealthyTime = strategy.MinHealthyTime
|
|
|
|
useChecks = strategy.HealthCheck == structs.MigrateStrategyHealthChecks
|
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// noopAllocHealthWatcherHook is the do-nothing hook that
// newAllocHealthWatcherHook hands back for allocations whose health never
// needs to be monitored.
type noopAllocHealthWatcherHook struct{}

// Name returns the same identifier as the real health watcher hook.
func (noopAllocHealthWatcherHook) Name() string {
	const hookName = "alloc_health_watcher"
	return hookName
}
|