open-nomad/client/allocrunner/taskrunner/script_check_hook.go
Tim Gross fa4da93578
interpolate environment for services in script checks (#6916)
In 0.10.2 (specifically 387b016) we added interpolation to group
service blocks and centralized the logic for task environment
interpolation. This wasn't also added to script checks, which caused a
regression where the IDs for script checks for services w/
interpolated fields (ex. the service name) didn't match the service ID
that was registered with Consul.

This changeset calls the same taskenv interpolation logic during
`script_check` configuration, and adds tests to reduce the risk of
future regressions by comparing the IDs of service hook and the check hook.
2020-01-09 08:12:54 -05:00

398 lines
11 KiB
Go

package taskrunner
import (
"context"
"fmt"
"sync"
"time"
"github.com/hashicorp/consul/api"
log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
tinterfaces "github.com/hashicorp/nomad/client/allocrunner/taskrunner/interfaces"
"github.com/hashicorp/nomad/client/consul"
"github.com/hashicorp/nomad/client/taskenv"
agentconsul "github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/nomad/structs"
)
var _ interfaces.TaskPoststartHook = &scriptCheckHook{}
var _ interfaces.TaskUpdateHook = &scriptCheckHook{}
var _ interfaces.TaskStopHook = &scriptCheckHook{}
// default max amount of time to wait for all scripts on shutdown.
const defaultShutdownWait = time.Minute
type scriptCheckHookConfig struct {
alloc *structs.Allocation
task *structs.Task
consul consul.ConsulServiceAPI
logger log.Logger
shutdownWait time.Duration
}
// scriptCheckHook implements a task runner hook for running script
// checks in the context of a task
type scriptCheckHook struct {
consul consul.ConsulServiceAPI
alloc *structs.Allocation
task *structs.Task
logger log.Logger
shutdownWait time.Duration // max time to wait for scripts to shutdown
shutdownCh chan struct{} // closed when all scripts should shutdown
// The following fields can be changed by Update()
driverExec tinterfaces.ScriptExecutor
taskEnv *taskenv.TaskEnv
// These maintain state and are populated by Poststart() or Update()
scripts map[string]*scriptCheck
runningScripts map[string]*taskletHandle
// Since Update() may be called concurrently with any other hook all
// hook methods must be fully serialized
mu sync.Mutex
}
// newScriptCheckHook returns a hook without any scriptChecks.
// They will get created only once their task environment is ready
// in Poststart() or Update()
func newScriptCheckHook(c scriptCheckHookConfig) *scriptCheckHook {
h := &scriptCheckHook{
consul: c.consul,
alloc: c.alloc,
task: c.task,
scripts: make(map[string]*scriptCheck),
runningScripts: make(map[string]*taskletHandle),
shutdownWait: defaultShutdownWait,
shutdownCh: make(chan struct{}),
}
if c.shutdownWait != 0 {
h.shutdownWait = c.shutdownWait // override for testing
}
h.logger = c.logger.Named(h.Name())
return h
}
func (h *scriptCheckHook) Name() string {
return "script_checks"
}
// Prestart implements interfaces.TaskPrestartHook. It stores the
// initial structs.Task
func (h *scriptCheckHook) Prestart(ctx context.Context, req *interfaces.TaskPrestartRequest, _ *interfaces.TaskPrestartResponse) error {
h.mu.Lock()
defer h.mu.Unlock()
h.task = req.Task
return nil
}
// PostStart implements interfaces.TaskPoststartHook. It creates new
// script checks with the current task context (driver and env), and
// starts up the scripts.
func (h *scriptCheckHook) Poststart(ctx context.Context, req *interfaces.TaskPoststartRequest, _ *interfaces.TaskPoststartResponse) error {
h.mu.Lock()
defer h.mu.Unlock()
if req.DriverExec == nil {
h.logger.Debug("driver doesn't support script checks")
return nil
}
h.driverExec = req.DriverExec
h.taskEnv = req.TaskEnv
return h.upsertChecks()
}
// Updated implements interfaces.TaskUpdateHook. It creates new
// script checks with the current task context (driver and env and possibly
// new structs.Task), and starts up the scripts.
func (h *scriptCheckHook) Update(ctx context.Context, req *interfaces.TaskUpdateRequest, _ *interfaces.TaskUpdateResponse) error {
h.mu.Lock()
defer h.mu.Unlock()
task := req.Alloc.LookupTask(h.task.Name)
if task == nil {
return fmt.Errorf("task %q not found in updated alloc", h.task.Name)
}
h.alloc = req.Alloc
h.task = task
h.taskEnv = req.TaskEnv
return h.upsertChecks()
}
func (h *scriptCheckHook) upsertChecks() error {
// Create new script checks struct with new task context
oldScriptChecks := h.scripts
h.scripts = h.newScriptChecks()
// Run new or replacement scripts
for id, script := range h.scripts {
// If it's already running, cancel and replace
if oldScript, running := h.runningScripts[id]; running {
oldScript.cancel()
}
// Start and store the handle
h.runningScripts[id] = script.run()
}
// Cancel scripts we no longer want
for id := range oldScriptChecks {
if _, ok := h.scripts[id]; !ok {
if oldScript, running := h.runningScripts[id]; running {
oldScript.cancel()
}
}
}
return nil
}
// Stop implements interfaces.TaskStopHook and blocks waiting for running
// scripts to finish (or for the shutdownWait timeout to expire).
func (h *scriptCheckHook) Stop(ctx context.Context, req *interfaces.TaskStopRequest, resp *interfaces.TaskStopResponse) error {
h.mu.Lock()
defer h.mu.Unlock()
close(h.shutdownCh)
deadline := time.After(h.shutdownWait)
err := fmt.Errorf("timed out waiting for script checks to exit")
for _, script := range h.runningScripts {
select {
case <-script.wait():
case <-ctx.Done():
// the caller is passing the background context, so
// we should never really see this outside of testing
case <-deadline:
// at this point the Consul client has been cleaned
// up so we don't want to hang onto this.
return err
}
}
return nil
}
func (h *scriptCheckHook) newScriptChecks() map[string]*scriptCheck {
scriptChecks := make(map[string]*scriptCheck)
interpolatedTaskServices := taskenv.InterpolateServices(h.taskEnv, h.task.Services)
for _, service := range interpolatedTaskServices {
for _, check := range service.Checks {
if check.Type != structs.ServiceCheckScript {
continue
}
serviceID := agentconsul.MakeAllocServiceID(
h.alloc.ID, h.task.Name, service)
sc := newScriptCheck(&scriptCheckConfig{
allocID: h.alloc.ID,
taskName: h.task.Name,
check: check,
serviceID: serviceID,
agent: h.consul,
driverExec: h.driverExec,
taskEnv: h.taskEnv,
logger: h.logger,
shutdownCh: h.shutdownCh,
})
if sc != nil {
scriptChecks[sc.id] = sc
}
}
}
// Walk back through the task group to see if there are script checks
// associated with the task. If so, we'll create scriptCheck tasklets
// for them. The group-level service and any check restart behaviors it
// needs are entirely encapsulated within the group service hook which
// watches Consul for status changes.
tg := h.alloc.Job.LookupTaskGroup(h.alloc.TaskGroup)
interpolatedGroupServices := taskenv.InterpolateServices(h.taskEnv, tg.Services)
for _, service := range interpolatedGroupServices {
for _, check := range service.Checks {
if check.Type != structs.ServiceCheckScript {
continue
}
if check.TaskName != h.task.Name {
continue
}
groupTaskName := "group-" + tg.Name
serviceID := agentconsul.MakeAllocServiceID(
h.alloc.ID, groupTaskName, service)
sc := newScriptCheck(&scriptCheckConfig{
allocID: h.alloc.ID,
taskName: groupTaskName,
check: check,
serviceID: serviceID,
agent: h.consul,
driverExec: h.driverExec,
taskEnv: h.taskEnv,
logger: h.logger,
shutdownCh: h.shutdownCh,
isGroup: true,
})
if sc != nil {
scriptChecks[sc.id] = sc
}
}
}
return scriptChecks
}
// heartbeater is the subset of consul agent functionality needed by script
// checks to heartbeat
type heartbeater interface {
UpdateTTL(id, output, status string) error
}
// scriptCheck runs script checks via a interfaces.ScriptExecutor and updates the
// appropriate check's TTL when the script succeeds.
type scriptCheck struct {
id string
agent heartbeater
check *structs.ServiceCheck
lastCheckOk bool // true if the last check was ok; otherwise false
tasklet
}
// scriptCheckConfig is a parameter struct for newScriptCheck
type scriptCheckConfig struct {
allocID string
taskName string
serviceID string
check *structs.ServiceCheck
agent heartbeater
driverExec tinterfaces.ScriptExecutor
taskEnv *taskenv.TaskEnv
logger log.Logger
shutdownCh chan struct{}
isGroup bool
}
// newScriptCheck constructs a scriptCheck. we're only going to
// configure the immutable fields of scriptCheck here, with the
// rest being configured during the Poststart hook so that we have
// the rest of the task execution environment
func newScriptCheck(config *scriptCheckConfig) *scriptCheck {
// Guard against not having a valid taskEnv. This can be the case if the
// PreKilling or Exited hook is run before Poststart.
if config.taskEnv == nil || config.driverExec == nil {
return nil
}
orig := config.check
sc := &scriptCheck{
agent: config.agent,
check: config.check.Copy(),
lastCheckOk: true, // start logging on first failure
}
// we can't use the promoted fields of tasklet in the struct literal
sc.Command = config.taskEnv.ReplaceEnv(config.check.Command)
sc.Args = config.taskEnv.ParseAndReplace(config.check.Args)
sc.Interval = config.check.Interval
sc.Timeout = config.check.Timeout
sc.exec = config.driverExec
sc.callback = newScriptCheckCallback(sc)
sc.logger = config.logger
sc.shutdownCh = config.shutdownCh
sc.check.Command = sc.Command
sc.check.Args = sc.Args
if config.isGroup {
// group services don't have access to a task environment
// at creation, so their checks get registered before the
// check can be interpolated here. if we don't use the
// original checkID, they can't be updated.
sc.id = agentconsul.MakeCheckID(config.serviceID, orig)
} else {
sc.id = agentconsul.MakeCheckID(config.serviceID, sc.check)
}
return sc
}
// Copy does a *shallow* copy of script checks.
func (sc *scriptCheck) Copy() *scriptCheck {
newSc := sc
return newSc
}
// closes over the script check and returns the taskletCallback for
// when the script check executes.
func newScriptCheckCallback(s *scriptCheck) taskletCallback {
return func(ctx context.Context, params execResult) {
output := params.output
code := params.code
err := params.err
state := api.HealthCritical
switch code {
case 0:
state = api.HealthPassing
case 1:
state = api.HealthWarning
}
var outputMsg string
if err != nil {
state = api.HealthCritical
outputMsg = err.Error()
} else {
outputMsg = string(output)
}
// heartbeat the check to Consul
err = s.updateTTL(ctx, outputMsg, state)
select {
case <-ctx.Done():
// check has been removed; don't report errors
return
default:
}
if err != nil {
if s.lastCheckOk {
s.lastCheckOk = false
s.logger.Warn("updating check failed", "error", err)
} else {
s.logger.Debug("updating check still failing", "error", err)
}
} else if !s.lastCheckOk {
// Succeeded for the first time or after failing; log
s.lastCheckOk = true
s.logger.Info("updating check succeeded")
}
}
}
const (
updateTTLBackoffBaseline = 1 * time.Second
updateTTLBackoffLimit = 3 * time.Second
)
// updateTTL updates the state to Consul, performing an expontential backoff
// in the case where the check isn't registered in Consul to avoid a race between
// service registration and the first check.
func (s *scriptCheck) updateTTL(ctx context.Context, msg, state string) error {
for attempts := 0; ; attempts++ {
err := s.agent.UpdateTTL(s.id, msg, state)
if err == nil {
return nil
}
// Handle the retry case
backoff := (1 << (2 * uint64(attempts))) * updateTTLBackoffBaseline
if backoff > updateTTLBackoffLimit {
return err
}
// Wait till retrying
select {
case <-ctx.Done():
return err
case <-time.After(backoff):
}
}
}