diff --git a/agent/check.go b/agent/check.go
index 3d86579f9..b2dd254bb 100644
--- a/agent/check.go
+++ b/agent/check.go
@@ -104,13 +104,16 @@ func (c *CheckMonitor) check() {
 	// Create the command
 	var cmd *exec.Cmd
 	var err error
+	var cmdDisplay string
 	if len(c.ScriptArgs) > 0 {
+		cmdDisplay = fmt.Sprintf("%v", c.ScriptArgs)
 		cmd, err = ExecSubprocess(c.ScriptArgs)
 	} else {
+		cmdDisplay = c.Script
 		cmd, err = ExecScript(c.Script)
 	}
 	if err != nil {
-		c.Logger.Printf("[ERR] agent: failed to setup invoke '%s': %s", c.Script, err)
+		c.Logger.Printf("[ERR] agent: failed to setup invoke '%s': %s", cmdDisplay, err)
 		c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
 		return
 	}
@@ -119,28 +122,39 @@ func (c *CheckMonitor) check() {
 	output, _ := circbuf.NewBuffer(CheckBufSize)
 	cmd.Stdout = output
 	cmd.Stderr = output
+	SetSysProcAttr(cmd)
 
 	// Start the check
 	if err := cmd.Start(); err != nil {
-		c.Logger.Printf("[ERR] agent: failed to invoke '%s': %s", c.Script, err)
+		c.Logger.Printf("[ERR] agent: failed to invoke '%s': %s", cmdDisplay, err)
 		c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
 		return
 	}
 
 	// Wait for the check to complete
-	errCh := make(chan error, 2)
+	waitCh := make(chan error, 1)
 	go func() {
-		errCh <- cmd.Wait()
+		waitCh <- cmd.Wait()
 	}()
-	go func() {
-		if c.Timeout > 0 {
-			time.Sleep(c.Timeout)
+
+	timeout := 30 * time.Second
+	if c.Timeout > 0 {
+		timeout = c.Timeout
+	}
+	select {
+	case <-time.After(timeout):
+		if err := KillCommandSubtree(cmd); err != nil {
+			c.Logger.Printf("[WARN] Timed out running check '%s': error killing process: %v", cmdDisplay, err)
 		} else {
-			time.Sleep(30 * time.Second)
+			c.Logger.Printf("[WARN] Timed out (%s) running check '%s'", timeout.String(), cmdDisplay)
 		}
-		errCh <- fmt.Errorf("Timed out running check '%s'", c.Script)
-	}()
-	err = <-errCh
+
+		err = fmt.Errorf("Timed out running check '%s'", cmdDisplay)
+		<-waitCh
+
+	case err = <-waitCh:
+		// The process returned before the timeout, proceed normally
+	}
 
 	// Get the output, add a message about truncation
 	outputStr := string(output.Bytes())
@@ -150,7 +164,7 @@ func (c *CheckMonitor) check() {
 	}
 
 	c.Logger.Printf("[DEBUG] agent: Check '%s' script '%s' output: %s",
-		c.CheckID, c.Script, outputStr)
+		c.CheckID, cmdDisplay, outputStr)
 
 	// Check if the check passed
 	if err == nil {
diff --git a/agent/util_other.go b/agent/util_other.go
index fd3472aed..d9ff0f8ba 100644
--- a/agent/util_other.go
+++ b/agent/util_other.go
@@ -5,6 +5,7 @@ package agent
 import (
 	"os"
 	"os/exec"
+	"syscall"
 )
 
 // ExecScript returns a command to execute a script through a shell.
@@ -15,3 +16,12 @@ func ExecScript(script string) (*exec.Cmd, error) {
 	}
 	return exec.Command(shell, "-c", script), nil
 }
+
+func SetSysProcAttr(cmd *exec.Cmd) {
+	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+}
+
+// KillCommandSubtree kills the command process and any child processes
+func KillCommandSubtree(cmd *exec.Cmd) error {
+	return syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
+}
diff --git a/agent/util_windows.go b/agent/util_windows.go
index a9b8517f7..acb3b0cc8 100644
--- a/agent/util_windows.go
+++ b/agent/util_windows.go
@@ -22,3 +22,9 @@ func ExecScript(script string) (*exec.Cmd, error) {
 	}
 	return cmd, nil
 }
+
+func SetSysProcAttr(cmd *exec.Cmd) {}
+
+func KillCommandSubtree(cmd *exec.Cmd) error {
+	return cmd.Process.Kill()
+}
diff --git a/website/source/docs/agent/checks.html.md b/website/source/docs/agent/checks.html.md
index b66445f4a..6b1595d0d 100644
--- a/website/source/docs/agent/checks.html.md
+++ b/website/source/docs/agent/checks.html.md
@@ -24,9 +24,11 @@ There are five different kinds of checks:
   a script check is limited to 4KB. Output larger than this will be truncated.
   By default, Script checks will be configured with a timeout equal to 30 seconds.
   It is possible to configure a custom Script check timeout value by specifying the
-  `timeout` field in the check definition. In Consul 0.9.0 and later, the agent
-  must be configured with [`enable_script_checks`](/docs/agent/options.html#_enable_script_checks)
-  set to `true` in order to enable script checks.
+  `timeout` field in the check definition. On Windows, Consul will wait for any child processes
+  spawned by the script to finish once the timeout is reached (instead of killing them immediately,
+  as on other platforms). In Consul 0.9.0 and later, the agent must be configured with
+  [`enable_script_checks`](/docs/agent/options.html#_enable_script_checks) set to `true`
+  in order to enable script checks.
 
 * HTTP + Interval - These checks make an HTTP `GET` request every Interval (e.g.
   every 30 seconds) to the specified URL. The status of the service depends on