2017-02-01 00:43:57 +00:00
|
|
|
package consul
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"log"
|
|
|
|
"time"
|
|
|
|
|
2017-04-18 23:23:39 +00:00
|
|
|
metrics "github.com/armon/go-metrics"
|
2017-02-01 00:43:57 +00:00
|
|
|
"github.com/hashicorp/consul/api"
|
2017-04-12 20:26:55 +00:00
|
|
|
"github.com/hashicorp/nomad/client/driver"
|
2017-02-01 00:43:57 +00:00
|
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
|
|
)
|
|
|
|
|
|
|
|
// heartbeater describes the single piece of Consul agent behaviour that a
// script check depends on: pushing a TTL update for a check.
type heartbeater interface {
	// UpdateTTL reports the given output and status for the check
	// identified by id.
	UpdateTTL(id string, output string, status string) error
}
|
|
|
|
|
2017-04-12 20:47:38 +00:00
|
|
|
// scriptHandle is what scriptCheck.run hands back to its caller. It is used
// for cancelling a running scriptCheck and for waiting until it has shut
// down.
type scriptHandle struct {
	// cancel stops the script check.
	cancel func()

	// exitCh is the channel exposed through wait; scriptCheck.run closes it
	// when its goroutine returns.
	exitCh chan struct{}
}
|
|
|
|
|
|
|
|
// wait returns a chan that's closed when the script exits
|
|
|
|
func (s *scriptHandle) wait() <-chan struct{} {
|
2017-04-12 20:47:38 +00:00
|
|
|
return s.exitCh
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
2017-04-12 20:47:38 +00:00
|
|
|
// scriptCheck runs script checks via a ScriptExecutor and updates the
|
|
|
|
// appropriate check's TTL when the script succeeds.
|
2017-02-01 00:43:57 +00:00
|
|
|
type scriptCheck struct {
|
2017-04-13 23:43:38 +00:00
|
|
|
allocID string
|
|
|
|
taskName string
|
|
|
|
|
2017-04-12 20:47:38 +00:00
|
|
|
id string
|
|
|
|
check *structs.ServiceCheck
|
|
|
|
exec driver.ScriptExecutor
|
|
|
|
agent heartbeater
|
2017-02-01 00:43:57 +00:00
|
|
|
|
|
|
|
// lastCheckOk is true if the last check was ok; otherwise false
|
|
|
|
lastCheckOk bool
|
|
|
|
|
|
|
|
logger *log.Logger
|
|
|
|
shutdownCh <-chan struct{}
|
|
|
|
}
|
|
|
|
|
2017-04-12 20:47:38 +00:00
|
|
|
// newScriptCheck creates a new scriptCheck. run() should be called once the
|
|
|
|
// initial check is registered with Consul.
|
2017-04-13 23:43:38 +00:00
|
|
|
func newScriptCheck(allocID, taskName, checkID string, check *structs.ServiceCheck,
|
|
|
|
exec driver.ScriptExecutor, agent heartbeater, logger *log.Logger,
|
|
|
|
shutdownCh <-chan struct{}) *scriptCheck {
|
2017-04-12 20:26:55 +00:00
|
|
|
|
2017-02-01 00:43:57 +00:00
|
|
|
return &scriptCheck{
|
2017-04-13 23:43:38 +00:00
|
|
|
allocID: allocID,
|
|
|
|
taskName: taskName,
|
|
|
|
id: checkID,
|
2017-02-01 00:43:57 +00:00
|
|
|
check: check,
|
|
|
|
exec: exec,
|
|
|
|
agent: agent,
|
|
|
|
lastCheckOk: true, // start logging on first failure
|
|
|
|
logger: logger,
|
|
|
|
shutdownCh: shutdownCh,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// run this script check and return its cancel func. If the shutdownCh is
|
|
|
|
// closed the check will be run once more before exiting.
|
|
|
|
func (s *scriptCheck) run() *scriptHandle {
|
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
2017-04-12 20:47:38 +00:00
|
|
|
exitCh := make(chan struct{})
|
2017-02-01 00:43:57 +00:00
|
|
|
go func() {
|
2017-04-12 20:47:38 +00:00
|
|
|
defer close(exitCh)
|
2017-02-01 00:43:57 +00:00
|
|
|
timer := time.NewTimer(0)
|
|
|
|
defer timer.Stop()
|
|
|
|
for {
|
|
|
|
// Block until check is removed, Nomad is shutting
|
|
|
|
// down, or the check interval is up
|
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
// check has been removed
|
|
|
|
return
|
|
|
|
case <-s.shutdownCh:
|
|
|
|
// unblock but don't exit until after we heartbeat once more
|
|
|
|
case <-timer.C:
|
|
|
|
timer.Reset(s.check.Interval)
|
|
|
|
}
|
2017-04-18 23:23:39 +00:00
|
|
|
metrics.IncrCounter([]string{"client", "consul", "script_runs"}, 1)
|
2017-02-01 00:43:57 +00:00
|
|
|
|
|
|
|
// Execute check script with timeout
|
|
|
|
execctx, cancel := context.WithTimeout(ctx, s.check.Timeout)
|
|
|
|
output, code, err := s.exec.Exec(execctx, s.check.Command, s.check.Args)
|
|
|
|
switch execctx.Err() {
|
|
|
|
case context.Canceled:
|
|
|
|
// check removed during execution; exit
|
2017-04-19 20:05:41 +00:00
|
|
|
cancel()
|
2017-02-01 00:43:57 +00:00
|
|
|
return
|
|
|
|
case context.DeadlineExceeded:
|
2017-04-18 23:23:39 +00:00
|
|
|
metrics.IncrCounter([]string{"client", "consul", "script_timeouts"}, 1)
|
2017-04-18 22:28:44 +00:00
|
|
|
// If no error was returned, set one to make sure the task goes critical
|
|
|
|
if err == nil {
|
|
|
|
err = context.DeadlineExceeded
|
|
|
|
}
|
|
|
|
|
|
|
|
// Log deadline exceeded every time as it's a
|
|
|
|
// distinct issue from checks returning
|
|
|
|
// failures
|
2017-04-13 23:43:38 +00:00
|
|
|
s.logger.Printf("[WARN] consul.checks: check %q for task %q alloc %q timed out (%s)",
|
|
|
|
s.check.Name, s.taskName, s.allocID, s.check.Timeout)
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
2017-04-12 20:47:38 +00:00
|
|
|
|
2017-02-01 00:43:57 +00:00
|
|
|
// cleanup context
|
|
|
|
cancel()
|
|
|
|
|
|
|
|
state := api.HealthCritical
|
|
|
|
switch code {
|
|
|
|
case 0:
|
|
|
|
state = api.HealthPassing
|
|
|
|
case 1:
|
|
|
|
state = api.HealthWarning
|
|
|
|
}
|
2017-04-12 20:47:38 +00:00
|
|
|
|
|
|
|
var outputMsg string
|
2017-02-01 00:43:57 +00:00
|
|
|
if err != nil {
|
|
|
|
state = api.HealthCritical
|
2017-04-12 20:47:38 +00:00
|
|
|
outputMsg = err.Error()
|
|
|
|
} else {
|
|
|
|
outputMsg = string(output)
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Actually heartbeat the check
|
2017-04-12 20:47:38 +00:00
|
|
|
err = s.agent.UpdateTTL(s.id, outputMsg, state)
|
2017-02-01 00:43:57 +00:00
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
// check has been removed; don't report errors
|
|
|
|
return
|
|
|
|
default:
|
|
|
|
}
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
if s.lastCheckOk {
|
|
|
|
s.lastCheckOk = false
|
2017-04-13 23:43:38 +00:00
|
|
|
s.logger.Printf("[WARN] consul.checks: update for task %q alloc %q check %q failed: %v",
|
|
|
|
s.taskName, s.allocID, s.check.Name, err)
|
2017-02-01 00:43:57 +00:00
|
|
|
} else {
|
2017-04-13 23:43:38 +00:00
|
|
|
s.logger.Printf("[DEBUG] consul.checks: update for task %q alloc %q check %q still failing: %v",
|
|
|
|
s.taskName, s.allocID, s.check.Name, err)
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
} else if !s.lastCheckOk {
|
|
|
|
// Succeeded for the first time or after failing; log
|
|
|
|
s.lastCheckOk = true
|
2017-04-13 23:43:38 +00:00
|
|
|
s.logger.Printf("[INFO] consul.checks: update for task %q alloc %q check %q succeeded",
|
|
|
|
s.taskName, s.allocID, s.check.Name)
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
select {
|
|
|
|
case <-s.shutdownCh:
|
2017-04-12 20:47:38 +00:00
|
|
|
// We've been told to exit and just heartbeated so exit
|
2017-02-01 00:43:57 +00:00
|
|
|
return
|
|
|
|
default:
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}()
|
2017-04-12 20:47:38 +00:00
|
|
|
return &scriptHandle{cancel: cancel, exitCh: exitCh}
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|