Kill check processes after the timeout is reached (#3567)
* Kill check processes after the timeout is reached: kill the subprocess spawned by a script check once the timeout is reached. Previously, Consul just marked the check critical and left the subprocess running. Fixes #3565. * Set err to non-nil when a timeout occurs. * Fix the check timeout test. * Kill the entire process subtree on check timeout. * Add a docs note about Windows subprocess termination.
This commit is contained in:
parent
144a938bcc
commit
eea2bd2753
|
@ -104,13 +104,16 @@ func (c *CheckMonitor) check() {
|
|||
// Create the command
|
||||
var cmd *exec.Cmd
|
||||
var err error
|
||||
var cmdDisplay string
|
||||
if len(c.ScriptArgs) > 0 {
|
||||
cmdDisplay = fmt.Sprintf("%v", c.ScriptArgs)
|
||||
cmd, err = ExecSubprocess(c.ScriptArgs)
|
||||
} else {
|
||||
cmdDisplay = c.Script
|
||||
cmd, err = ExecScript(c.Script)
|
||||
}
|
||||
if err != nil {
|
||||
c.Logger.Printf("[ERR] agent: failed to setup invoke '%s': %s", c.Script, err)
|
||||
c.Logger.Printf("[ERR] agent: failed to setup invoke '%s': %s", cmdDisplay, err)
|
||||
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
|
||||
return
|
||||
}
|
||||
|
@ -119,28 +122,39 @@ func (c *CheckMonitor) check() {
|
|||
output, _ := circbuf.NewBuffer(CheckBufSize)
|
||||
cmd.Stdout = output
|
||||
cmd.Stderr = output
|
||||
SetSysProcAttr(cmd)
|
||||
|
||||
// Start the check
|
||||
if err := cmd.Start(); err != nil {
|
||||
c.Logger.Printf("[ERR] agent: failed to invoke '%s': %s", c.Script, err)
|
||||
c.Logger.Printf("[ERR] agent: failed to invoke '%s': %s", cmdDisplay, err)
|
||||
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
// Wait for the check to complete
|
||||
errCh := make(chan error, 2)
|
||||
waitCh := make(chan error, 1)
|
||||
go func() {
|
||||
errCh <- cmd.Wait()
|
||||
waitCh <- cmd.Wait()
|
||||
}()
|
||||
go func() {
|
||||
if c.Timeout > 0 {
|
||||
time.Sleep(c.Timeout)
|
||||
|
||||
timeout := 30 * time.Second
|
||||
if c.Timeout > 0 {
|
||||
timeout = c.Timeout
|
||||
}
|
||||
select {
|
||||
case <-time.After(timeout):
|
||||
if err := KillCommandSubtree(cmd); err != nil {
|
||||
c.Logger.Printf("[WARN] Timed out running check '%s': error killing process: %v", cmdDisplay, err)
|
||||
} else {
|
||||
time.Sleep(30 * time.Second)
|
||||
c.Logger.Printf("[WARN] Timed out (%s) running check '%s'", timeout.String(), cmdDisplay)
|
||||
}
|
||||
errCh <- fmt.Errorf("Timed out running check '%s'", c.Script)
|
||||
}()
|
||||
err = <-errCh
|
||||
|
||||
err = fmt.Errorf("Timed out running check '%s'", cmdDisplay)
|
||||
<-waitCh
|
||||
|
||||
case err = <-waitCh:
|
||||
// The process returned before the timeout, proceed normally
|
||||
}
|
||||
|
||||
// Get the output, add a message about truncation
|
||||
outputStr := string(output.Bytes())
|
||||
|
@ -150,7 +164,7 @@ func (c *CheckMonitor) check() {
|
|||
}
|
||||
|
||||
c.Logger.Printf("[DEBUG] agent: Check '%s' script '%s' output: %s",
|
||||
c.CheckID, c.Script, outputStr)
|
||||
c.CheckID, cmdDisplay, outputStr)
|
||||
|
||||
// Check if the check passed
|
||||
if err == nil {
|
||||
|
|
|
@ -5,6 +5,7 @@ package agent
|
|||
import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// ExecScript returns a command to execute a script through a shell.
|
||||
|
@ -15,3 +16,12 @@ func ExecScript(script string) (*exec.Cmd, error) {
|
|||
}
|
||||
return exec.Command(shell, "-c", script), nil
|
||||
}
|
||||
|
||||
// SetSysProcAttr configures cmd so that, once started, the child is placed in
// a new process group whose ID equals the child's own PID. KillCommandSubtree
// depends on this: it signals the group identified by the child's PID.
//
// Attributes already present on cmd.SysProcAttr are preserved; only the
// process-group fields are overridden. It must be called before cmd.Start.
func SetSysProcAttr(cmd *exec.Cmd) {
	if cmd.SysProcAttr == nil {
		cmd.SysProcAttr = &syscall.SysProcAttr{}
	}
	cmd.SysProcAttr.Setpgid = true
	// Pgid 0 means "use the child's PID as the group ID", which is what
	// KillCommandSubtree assumes when it negates the PID.
	cmd.SysProcAttr.Pgid = 0
}
|
||||
|
||||
// KillCommandSubtree kills the command process and any child processes that
// are still in its process group, by sending SIGKILL to the group whose ID is
// the PID of cmd's process. This only reaches the whole subtree when the
// command was started in its own process group (see SetSysProcAttr).
//
// It returns os.ErrInvalid when cmd has no usable process (nil command,
// command not started, or a non-positive PID); otherwise it returns the result
// of the kill system call.
func KillCommandSubtree(cmd *exec.Cmd) error {
	// A non-positive PID must never be negated and passed to kill: 0 would
	// signal the caller's own process group and -(-1) would target PID 1.
	if cmd == nil || cmd.Process == nil || cmd.Process.Pid <= 0 {
		return os.ErrInvalid
	}
	return syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
}
|
||||
|
|
|
@ -22,3 +22,9 @@ func ExecScript(script string) (*exec.Cmd, error) {
|
|||
}
|
||||
return cmd, nil
|
||||
}
|
||||
|
||||
// SetSysProcAttr is a no-op in this variant of the file: cmd is left exactly
// as it was passed in.
// NOTE(review): presumably this is the Windows build, where no process-group
// attribute is set before the check command starts — confirm against the file name.
func SetSysProcAttr(cmd *exec.Cmd) {}
|
||||
|
||||
// KillCommandSubtree kills the process started for cmd by calling
// cmd.Process.Kill and returns that call's error. Despite the name, this
// variant does not signal a process group; it acts only on cmd's own process.
// cmd.Process is dereferenced without a check, so cmd must already have been
// started.
// NOTE(review): presumably this is the Windows build — confirm against the file name.
func KillCommandSubtree(cmd *exec.Cmd) error {
|
||||
return cmd.Process.Kill()
|
||||
}
|
||||
|
|
|
@ -24,9 +24,11 @@ There are five different kinds of checks:
|
|||
a script check is limited to 4KB. Output larger than this will be truncated.
|
||||
By default, Script checks will be configured with a timeout equal to 30 seconds.
|
||||
It is possible to configure a custom Script check timeout value by specifying the
|
||||
`timeout` field in the check definition. In Consul 0.9.0 and later, the agent
|
||||
must be configured with [`enable_script_checks`](/docs/agent/options.html#_enable_script_checks)
|
||||
set to `true` in order to enable script checks.
|
||||
`timeout` field in the check definition. On Windows, Consul will wait for any child processes
|
||||
spawned by the script to finish once the timeout is reached (instead of killing them immediately,
|
||||
as on other platforms). In Consul 0.9.0 and later, the agent must be configured with
|
||||
[`enable_script_checks`](/docs/agent/options.html#_enable_script_checks) set to `true`
|
||||
in order to enable script checks.
|
||||
|
||||
* HTTP + Interval - These checks make an HTTP `GET` request every Interval (e.g.
|
||||
every 30 seconds) to the specified URL. The status of the service depends on
|
||||
|
|
Loading…
Reference in New Issue