Kill check processes after the timeout is reached (#3567)
* Kill check processes after the timeout is reached Kill the subprocess spawned by a script check once the timeout is reached. Previously Consul just marked the check critical and left the subprocess around. Fixes #3565. * Set err to non-nil when timeout occurs * Fix check timeout test * Kill entire process subtree on check timeout * Add a docs note about windows subprocess termination
This commit is contained in:
parent
144a938bcc
commit
eea2bd2753
|
@ -104,13 +104,16 @@ func (c *CheckMonitor) check() {
|
||||||
// Create the command
|
// Create the command
|
||||||
var cmd *exec.Cmd
|
var cmd *exec.Cmd
|
||||||
var err error
|
var err error
|
||||||
|
var cmdDisplay string
|
||||||
if len(c.ScriptArgs) > 0 {
|
if len(c.ScriptArgs) > 0 {
|
||||||
|
cmdDisplay = fmt.Sprintf("%v", c.ScriptArgs)
|
||||||
cmd, err = ExecSubprocess(c.ScriptArgs)
|
cmd, err = ExecSubprocess(c.ScriptArgs)
|
||||||
} else {
|
} else {
|
||||||
|
cmdDisplay = c.Script
|
||||||
cmd, err = ExecScript(c.Script)
|
cmd, err = ExecScript(c.Script)
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
c.Logger.Printf("[ERR] agent: failed to setup invoke '%s': %s", c.Script, err)
|
c.Logger.Printf("[ERR] agent: failed to setup invoke '%s': %s", cmdDisplay, err)
|
||||||
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
|
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
@ -119,28 +122,39 @@ func (c *CheckMonitor) check() {
|
||||||
output, _ := circbuf.NewBuffer(CheckBufSize)
|
output, _ := circbuf.NewBuffer(CheckBufSize)
|
||||||
cmd.Stdout = output
|
cmd.Stdout = output
|
||||||
cmd.Stderr = output
|
cmd.Stderr = output
|
||||||
|
SetSysProcAttr(cmd)
|
||||||
|
|
||||||
// Start the check
|
// Start the check
|
||||||
if err := cmd.Start(); err != nil {
|
if err := cmd.Start(); err != nil {
|
||||||
c.Logger.Printf("[ERR] agent: failed to invoke '%s': %s", c.Script, err)
|
c.Logger.Printf("[ERR] agent: failed to invoke '%s': %s", cmdDisplay, err)
|
||||||
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
|
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wait for the check to complete
|
// Wait for the check to complete
|
||||||
errCh := make(chan error, 2)
|
waitCh := make(chan error, 1)
|
||||||
go func() {
|
go func() {
|
||||||
errCh <- cmd.Wait()
|
waitCh <- cmd.Wait()
|
||||||
}()
|
}()
|
||||||
go func() {
|
|
||||||
|
timeout := 30 * time.Second
|
||||||
if c.Timeout > 0 {
|
if c.Timeout > 0 {
|
||||||
time.Sleep(c.Timeout)
|
timeout = c.Timeout
|
||||||
} else {
|
}
|
||||||
time.Sleep(30 * time.Second)
|
select {
|
||||||
|
case <-time.After(timeout):
|
||||||
|
if err := KillCommandSubtree(cmd); err != nil {
|
||||||
|
c.Logger.Printf("[WARN] Timed out running check '%s': error killing process: %v", cmdDisplay, err)
|
||||||
|
} else {
|
||||||
|
c.Logger.Printf("[WARN] Timed out (%s) running check '%s'", timeout.String(), cmdDisplay)
|
||||||
|
}
|
||||||
|
|
||||||
|
err = fmt.Errorf("Timed out running check '%s'", cmdDisplay)
|
||||||
|
<-waitCh
|
||||||
|
|
||||||
|
case err = <-waitCh:
|
||||||
|
// The process returned before the timeout, proceed normally
|
||||||
}
|
}
|
||||||
errCh <- fmt.Errorf("Timed out running check '%s'", c.Script)
|
|
||||||
}()
|
|
||||||
err = <-errCh
|
|
||||||
|
|
||||||
// Get the output, add a message about truncation
|
// Get the output, add a message about truncation
|
||||||
outputStr := string(output.Bytes())
|
outputStr := string(output.Bytes())
|
||||||
|
@ -150,7 +164,7 @@ func (c *CheckMonitor) check() {
|
||||||
}
|
}
|
||||||
|
|
||||||
c.Logger.Printf("[DEBUG] agent: Check '%s' script '%s' output: %s",
|
c.Logger.Printf("[DEBUG] agent: Check '%s' script '%s' output: %s",
|
||||||
c.CheckID, c.Script, outputStr)
|
c.CheckID, cmdDisplay, outputStr)
|
||||||
|
|
||||||
// Check if the check passed
|
// Check if the check passed
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
|
|
@ -5,6 +5,7 @@ package agent
|
||||||
import (
|
import (
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
|
"syscall"
|
||||||
)
|
)
|
||||||
|
|
||||||
// ExecScript returns a command to execute a script through a shell.
|
// ExecScript returns a command to execute a script through a shell.
|
||||||
|
@ -15,3 +16,12 @@ func ExecScript(script string) (*exec.Cmd, error) {
|
||||||
}
|
}
|
||||||
return exec.Command(shell, "-c", script), nil
|
return exec.Command(shell, "-c", script), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func SetSysProcAttr(cmd *exec.Cmd) {
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
}
|
||||||
|
|
||||||
|
// KillCommandSubtree kills the command process and any child processes
|
||||||
|
func KillCommandSubtree(cmd *exec.Cmd) error {
|
||||||
|
return syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
|
|
@ -22,3 +22,9 @@ func ExecScript(script string) (*exec.Cmd, error) {
|
||||||
}
|
}
|
||||||
return cmd, nil
|
return cmd, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func SetSysProcAttr(cmd *exec.Cmd) {}
|
||||||
|
|
||||||
|
func KillCommandSubtree(cmd *exec.Cmd) error {
|
||||||
|
return cmd.Process.Kill()
|
||||||
|
}
|
||||||
|
|
|
@ -24,9 +24,11 @@ There are five different kinds of checks:
|
||||||
a script check is limited to 4KB. Output larger than this will be truncated.
|
a script check is limited to 4KB. Output larger than this will be truncated.
|
||||||
By default, Script checks will be configured with a timeout equal to 30 seconds.
|
By default, Script checks will be configured with a timeout equal to 30 seconds.
|
||||||
It is possible to configure a custom Script check timeout value by specifying the
|
It is possible to configure a custom Script check timeout value by specifying the
|
||||||
`timeout` field in the check definition. In Consul 0.9.0 and later, the agent
|
`timeout` field in the check definition. On Windows, Consul will wait for any child processes
|
||||||
must be configured with [`enable_script_checks`](/docs/agent/options.html#_enable_script_checks)
|
spawned by the script to finish once the timeout is reached (instead of killing them immediately,
|
||||||
set to `true` in order to enable script checks.
|
as on other platforms). In Consul 0.9.0 and later, the agent must be configured with
|
||||||
|
[`enable_script_checks`](/docs/agent/options.html#_enable_script_checks) set to `true`
|
||||||
|
in order to enable script checks.
|
||||||
|
|
||||||
* HTTP + Interval - These checks make an HTTP `GET` request every Interval (e.g.
|
* HTTP + Interval - These checks make an HTTP `GET` request every Interval (e.g.
|
||||||
every 30 seconds) to the specified URL. The status of the service depends on
|
every 30 seconds) to the specified URL. The status of the service depends on
|
||||||
|
|
Loading…
Reference in New Issue