Kill check processes after the timeout is reached (#3567)

* Kill check processes after the timeout is reached Kill the subprocess spawned by a script check once the timeout is reached. Previously Consul just marked the check critical and left the subprocess around. Fixes #3565. * Set err to non-nil when timeout occurs * Fix check timeout test * Kill entire process subtree on check timeout * Add a docs note about windows subprocess termination
2017-10-11 11:57:39 -07:00 · 2017-10-11 11:57:39 -07:00 · eea2bd2753
parent 144a938bcc
commit eea2bd2753
4 changed files with 47 additions and 15 deletions
--- a/agent/check.go
+++ b/agent/check.go
@ -104,13 +104,16 @@ func (c *CheckMonitor) check() {
 	// Create the command
 	var cmd *exec.Cmd
 	var err error
+	var cmdDisplay string
 	if len(c.ScriptArgs) > 0 {
+		cmdDisplay = fmt.Sprintf("%v", c.ScriptArgs)
 		cmd, err = ExecSubprocess(c.ScriptArgs)
 	} else {
+		cmdDisplay = c.Script
 		cmd, err = ExecScript(c.Script)
 	}
 	if err != nil {
-		c.Logger.Printf("[ERR] agent: failed to setup invoke '%s': %s", c.Script, err)
+		c.Logger.Printf("[ERR] agent: failed to setup invoke '%s': %s", cmdDisplay, err)
 		c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
 		return
 	}
@ -119,28 +122,39 @@ func (c *CheckMonitor) check() {
 	output, _ := circbuf.NewBuffer(CheckBufSize)
 	cmd.Stdout = output
 	cmd.Stderr = output
+	SetSysProcAttr(cmd)

 	// Start the check
 	if err := cmd.Start(); err != nil {
-		c.Logger.Printf("[ERR] agent: failed to invoke '%s': %s", c.Script, err)
+		c.Logger.Printf("[ERR] agent: failed to invoke '%s': %s", cmdDisplay, err)
 		c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
 		return
 	}

 	// Wait for the check to complete
-	errCh := make(chan error, 2)
+	waitCh := make(chan error, 1)
 	go func() {
-		errCh <- cmd.Wait()
+		waitCh <- cmd.Wait()
 	}()
-	go func() {
-		if c.Timeout > 0 {
-			time.Sleep(c.Timeout)
+
+	timeout := 30 * time.Second
+	if c.Timeout > 0 {
+		timeout = c.Timeout
+	}
+	select {
+	case <-time.After(timeout):
+		if err := KillCommandSubtree(cmd); err != nil {
+			c.Logger.Printf("[WARN] Timed out running check '%s': error killing process: %v", cmdDisplay, err)
 		} else {
-			time.Sleep(30 * time.Second)
+			c.Logger.Printf("[WARN] Timed out (%s) running check '%s'", timeout.String(), cmdDisplay)
 		}
-		errCh <- fmt.Errorf("Timed out running check '%s'", c.Script)
-	}()
-	err = <-errCh
+
+		err = fmt.Errorf("Timed out running check '%s'", cmdDisplay)
+		<-waitCh
+
+	case err = <-waitCh:
+		// The process returned before the timeout, proceed normally
+	}

 	// Get the output, add a message about truncation
 	outputStr := string(output.Bytes())
@ -150,7 +164,7 @@ func (c *CheckMonitor) check() {
 	}

 	c.Logger.Printf("[DEBUG] agent: Check '%s' script '%s' output: %s",
-		c.CheckID, c.Script, outputStr)
+		c.CheckID, cmdDisplay, outputStr)

 	// Check if the check passed
 	if err == nil {
--- a/agent/util_other.go
+++ b/agent/util_other.go
@ -5,6 +5,7 @@ package agent
 import (
 	"os"
 	"os/exec"
+	"syscall"
 )

 // ExecScript returns a command to execute a script through a shell.
@ -15,3 +16,12 @@ func ExecScript(script string) (*exec.Cmd, error) {
 	}
 	return exec.Command(shell, "-c", script), nil
 }
+
+func SetSysProcAttr(cmd *exec.Cmd) {
+	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+}
+
+// KillCommandSubtree kills the command process and any child processes
+func KillCommandSubtree(cmd *exec.Cmd) error {
+	return syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
+}
--- a/agent/util_windows.go
+++ b/agent/util_windows.go
@ -22,3 +22,9 @@ func ExecScript(script string) (*exec.Cmd, error) {
 	}
 	return cmd, nil
 }
+
+func SetSysProcAttr(cmd *exec.Cmd) {}
+
+func KillCommandSubtree(cmd *exec.Cmd) error {
+	return cmd.Process.Kill()
+}
--- a/website/source/docs/agent/checks.html.md
+++ b/website/source/docs/agent/checks.html.md
@ -24,9 +24,11 @@ There are five different kinds of checks:
  a script check is limited to 4KB. Output larger than this will be truncated.
  By default, Script checks will be configured with a timeout equal to 30 seconds.
  It is possible to configure a custom Script check timeout value by specifying the
-  `timeout` field in the check definition. In Consul 0.9.0 and later, the agent
-  must be configured with [`enable_script_checks`](/docs/agent/options.html#_enable_script_checks)
-  set to `true` in order to enable script checks.
+  `timeout` field in the check definition. On Windows, Consul will wait for any child processes
+  spawned by the script to finish once the timeout is reached (instead of killing them immediately,
+  as on other platforms). In Consul 0.9.0 and later, the agent must be configured with
+  [`enable_script_checks`](/docs/agent/options.html#_enable_script_checks) set to `true`
+  in order to enable script checks.

 * HTTP + Interval - These checks make an HTTP `GET` request every Interval (e.g.
  every 30 seconds) to the specified URL. The status of the service depends on