Merge pull request #1785 from hashicorp/f-check-put-api

Adds a PUT-based API for TTL checks and retains output on timeouts.
2016-03-02 20:17:50 -08:00 · 2016-03-02 20:17:50 -08:00 · 61afe388fa
parent ecc617008a bfe6f073fe
commit 61afe388fa
6 changed files with 248 additions and 14 deletions
--- a/command/agent/agent_endpoint.go
+++ b/command/agent/agent_endpoint.go
@ -167,6 +167,58 @@ func (s *HTTPServer) AgentCheckFail(resp http.ResponseWriter, req *http.Request)
 	return nil, nil
 }

+// checkUpdate is the payload for a PUT to AgentCheckUpdate.
+type checkUpdate struct {
+	// Status us one of the structs.Health* states, "passing", "warning", or
+	// "critical".
+	Status string
+
+	// Output is the information to post to the UI for operators as the
+	// output of the process that decided to hit the TTL check. This is
+	// different from the note field that's associated with the check
+	// itself.
+	Output string
+}
+
+// AgentCheckUpdate is a PUT-based alternative to the GET-based Pass/Warn/Fail
+// APIs.
+func (s *HTTPServer) AgentCheckUpdate(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
+	if req.Method != "PUT" {
+		resp.WriteHeader(405)
+		return nil, nil
+	}
+
+	var update checkUpdate
+	if err := decodeBody(req, &update, nil); err != nil {
+		resp.WriteHeader(400)
+		resp.Write([]byte(fmt.Sprintf("Request decode failed: %v", err)))
+		return nil, nil
+	}
+
+	switch update.Status {
+	case structs.HealthPassing:
+	case structs.HealthWarning:
+	case structs.HealthCritical:
+	default:
+		resp.WriteHeader(400)
+		resp.Write([]byte(fmt.Sprintf("Invalid check status: '%s'", update.Status)))
+		return nil, nil
+	}
+
+	total := len(update.Output)
+	if total > CheckBufSize {
+		update.Output = fmt.Sprintf("%s ... (captured %d of %d bytes)",
+			update.Output[:CheckBufSize], CheckBufSize, total)
+	}
+
+	checkID := strings.TrimPrefix(req.URL.Path, "/v1/agent/check/update/")
+	if err := s.agent.UpdateCheck(checkID, update.Status, update.Output); err != nil {
+		return nil, err
+	}
+	s.syncChanges()
+	return nil, nil
+}
+
 func (s *HTTPServer) AgentRegisterService(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
 	var args ServiceDefinition
 	// Fixup the type decode of TTL or Interval if a check if provided
--- a/command/agent/agent_endpoint_test.go
+++ b/command/agent/agent_endpoint_test.go
@ -7,6 +7,7 @@ import (
 	"net/http/httptest"
 	"os"
 	"reflect"
+	"strings"
 	"testing"
 	"time"

@ -428,7 +429,6 @@ func TestHTTPAgentPassCheck(t *testing.T) {
 		t.Fatalf("err: %v", err)
 	}

-	// Register node
 	req, err := http.NewRequest("GET", "/v1/agent/check/pass/test", nil)
 	if err != nil {
 		t.Fatalf("err: %v", err)
@ -461,7 +461,6 @@ func TestHTTPAgentWarnCheck(t *testing.T) {
 		t.Fatalf("err: %v", err)
 	}

-	// Register node
 	req, err := http.NewRequest("GET", "/v1/agent/check/warn/test", nil)
 	if err != nil {
 		t.Fatalf("err: %v", err)
@ -494,7 +493,6 @@ func TestHTTPAgentFailCheck(t *testing.T) {
 		t.Fatalf("err: %v", err)
 	}

-	// Register node
 	req, err := http.NewRequest("GET", "/v1/agent/check/fail/test", nil)
 	if err != nil {
 		t.Fatalf("err: %v", err)
@ -515,6 +513,134 @@ func TestHTTPAgentFailCheck(t *testing.T) {
 	}
 }

+func TestHTTPAgentUpdateCheck(t *testing.T) {
+	dir, srv := makeHTTPServer(t)
+	defer os.RemoveAll(dir)
+	defer srv.Shutdown()
+	defer srv.agent.Shutdown()
+
+	chk := &structs.HealthCheck{Name: "test", CheckID: "test"}
+	chkType := &CheckType{TTL: 15 * time.Second}
+	if err := srv.agent.AddCheck(chk, chkType, false, ""); err != nil {
+		t.Fatalf("err: %v", err)
+	}
+
+	cases := []checkUpdate{
+		checkUpdate{"passing", "hello-passing"},
+		checkUpdate{"critical", "hello-critical"},
+		checkUpdate{"warning", "hello-warning"},
+	}
+
+	for _, c := range cases {
+		req, err := http.NewRequest("PUT", "/v1/agent/check/update/test", nil)
+		if err != nil {
+			t.Fatalf("err: %v", err)
+		}
+		req.Body = encodeReq(c)
+
+		resp := httptest.NewRecorder()
+		obj, err := srv.AgentCheckUpdate(resp, req)
+		if err != nil {
+			t.Fatalf("err: %v", err)
+		}
+		if obj != nil {
+			t.Fatalf("bad: %v", obj)
+		}
+		if resp.Code != 200 {
+			t.Fatalf("expected 200, got %d", resp.Code)
+		}
+
+		state := srv.agent.state.Checks()["test"]
+		if state.Status != c.Status || state.Output != c.Output {
+			t.Fatalf("bad: %v", state)
+		}
+	}
+
+	// Make sure abusive levels of output are capped.
+	{
+		req, err := http.NewRequest("PUT", "/v1/agent/check/update/test", nil)
+		if err != nil {
+			t.Fatalf("err: %v", err)
+		}
+
+		update := checkUpdate{
+			Status: "passing",
+			Output: strings.Repeat("-= bad -=", 5*CheckBufSize),
+		}
+		req.Body = encodeReq(update)
+
+		resp := httptest.NewRecorder()
+		obj, err := srv.AgentCheckUpdate(resp, req)
+		if err != nil {
+			t.Fatalf("err: %v", err)
+		}
+		if obj != nil {
+			t.Fatalf("bad: %v", obj)
+		}
+		if resp.Code != 200 {
+			t.Fatalf("expected 200, got %d", resp.Code)
+		}
+
+		// Since we append some notes about truncating, we just do a
+		// rough check that the output buffer was cut down so this test
+		// isn't super brittle.
+		state := srv.agent.state.Checks()["test"]
+		if state.Status != structs.HealthPassing || len(state.Output) > 2*CheckBufSize {
+			t.Fatalf("bad: %v", state)
+		}
+	}
+
+	// Check a bogus status.
+	{
+		req, err := http.NewRequest("PUT", "/v1/agent/check/update/test", nil)
+		if err != nil {
+			t.Fatalf("err: %v", err)
+		}
+
+		update := checkUpdate{
+			Status: "itscomplicated",
+		}
+		req.Body = encodeReq(update)
+
+		resp := httptest.NewRecorder()
+		obj, err := srv.AgentCheckUpdate(resp, req)
+		if err != nil {
+			t.Fatalf("err: %v", err)
+		}
+		if obj != nil {
+			t.Fatalf("bad: %v", obj)
+		}
+		if resp.Code != 400 {
+			t.Fatalf("expected 400, got %d", resp.Code)
+		}
+	}
+
+	// Check a bogus verb.
+	{
+		req, err := http.NewRequest("POST", "/v1/agent/check/update/test", nil)
+		if err != nil {
+			t.Fatalf("err: %v", err)
+		}
+
+		update := checkUpdate{
+			Status: "passing",
+		}
+		req.Body = encodeReq(update)
+
+		resp := httptest.NewRecorder()
+		obj, err := srv.AgentCheckUpdate(resp, req)
+		if err != nil {
+			t.Fatalf("err: %v", err)
+		}
+		if obj != nil {
+			t.Fatalf("bad: %v", obj)
+		}
+		if resp.Code != 405 {
+			t.Fatalf("expected 405, got %d", resp.Code)
+		}
+	}
+}
+
 func TestHTTPAgentRegisterService(t *testing.T) {
 	dir, srv := makeHTTPServer(t)
 	defer os.RemoveAll(dir)
--- a/command/agent/check.go
+++ b/command/agent/check.go
@ -232,6 +232,9 @@ type CheckTTL struct {

 	timer *time.Timer

+	lastOutput     string
+	lastOutputLock sync.RWMutex
+
 	stop     bool
 	stopCh   chan struct{}
 	stopLock sync.Mutex
@ -265,7 +268,7 @@ func (c *CheckTTL) run() {
 		case <-c.timer.C:
 			c.Logger.Printf("[WARN] agent: Check '%v' missed TTL, is now critical",
 				c.CheckID)
-			c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, "TTL expired")
+			c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, c.getExpiredOutput())

 		case <-c.stopCh:
 			return
@ -273,12 +276,31 @@ func (c *CheckTTL) run() {
 	}
 }

+// getExpiredOutput formats the output for the case when the TTL is expired.
+func (c *CheckTTL) getExpiredOutput() string {
+	c.lastOutputLock.RLock()
+	defer c.lastOutputLock.RUnlock()
+
+	const prefix = "TTL expired"
+	if c.lastOutput == "" {
+		return prefix
+	}
+
+	return fmt.Sprintf("%s (last output before timeout follows): %s", prefix, c.lastOutput)
+}
+
 // SetStatus is used to update the status of the check,
 // and to renew the TTL. If expired, TTL is restarted.
 func (c *CheckTTL) SetStatus(status, output string) {
 	c.Logger.Printf("[DEBUG] agent: Check '%v' status is now %v",
 		c.CheckID, status)
 	c.Notify.UpdateCheck(c.CheckID, status, output)
+
+	// Store the last output so we can retain it if the TTL expires.
+	c.lastOutputLock.Lock()
+	c.lastOutput = output
+	c.lastOutputLock.Unlock()
+
 	c.timer.Reset(c.TTL)
 }

--- a/command/agent/check_test.go
+++ b/command/agent/check_test.go
@ -9,6 +9,7 @@ import (
 	"net/http/httptest"
 	"os"
 	"os/exec"
+	"strings"
 	"sync"
 	"testing"
 	"time"
@ -150,7 +151,7 @@ func TestCheckTTL(t *testing.T) {
 	defer check.Stop()

 	time.Sleep(50 * time.Millisecond)
-	check.SetStatus(structs.HealthPassing, "")
+	check.SetStatus(structs.HealthPassing, "test-output")

 	if mock.updates["foo"] != 1 {
 		t.Fatalf("should have 1 updates %v", mock.updates)
@ -176,6 +177,10 @@ func TestCheckTTL(t *testing.T) {
 	if mock.state["foo"] != structs.HealthCritical {
 		t.Fatalf("should be critical %v", mock.state)
 	}
+
+	if !strings.Contains(mock.output["foo"], "test-output") {
+		t.Fatalf("should have retained output %v", mock.output)
+	}
 }

 func mockHTTPServer(responseCode int) *httptest.Server {
--- a/command/agent/http.go
+++ b/command/agent/http.go
@ -232,6 +232,7 @@ func (s *HTTPServer) registerHandlers(enableDebug bool) {
 	s.mux.HandleFunc("/v1/agent/check/pass/", s.wrap(s.AgentCheckPass))
 	s.mux.HandleFunc("/v1/agent/check/warn/", s.wrap(s.AgentCheckWarn))
 	s.mux.HandleFunc("/v1/agent/check/fail/", s.wrap(s.AgentCheckFail))
+	s.mux.HandleFunc("/v1/agent/check/update/", s.wrap(s.AgentCheckUpdate))

 	s.mux.HandleFunc("/v1/agent/service/register", s.wrap(s.AgentRegisterService))
 	s.mux.HandleFunc("/v1/agent/service/deregister/", s.wrap(s.AgentDeregisterService))
--- a/website/source/docs/agent/http/agent.html.markdown
+++ b/website/source/docs/agent/http/agent.html.markdown
@ -25,9 +25,10 @@ The following endpoints are supported:
 * [`/v1/agent/force-leave/<node>`](#agent_force_leave)>: Forces removal of a node
 * [`/v1/agent/check/register`](#agent_check_register) : Registers a new local check
 * [`/v1/agent/check/deregister/<checkID>`](#agent_check_deregister) : Deregisters a local check
-* [`/v1/agent/check/pass/<checkID>`](#agent_check_pass) : Marks a local test as passing
-* [`/v1/agent/check/warn/<checkID>`](#agent_check_warn) : Marks a local test as warning
-* [`/v1/agent/check/fail/<checkID>`](#agent_check_fail) : Marks a local test as critical
+* [`/v1/agent/check/pass/<checkID>`](#agent_check_pass) : Marks a local check as passing
+* [`/v1/agent/check/warn/<checkID>`](#agent_check_warn) : Marks a local check as warning
+* [`/v1/agent/check/fail/<checkID>`](#agent_check_fail) : Marks a local check as critical
+* [`/v1/agent/check/update/<checkID>`](#agent_check_update) : Updates a local check
 * [`/v1/agent/service/register`](#agent_service_register) : Registers a new local service
 * [`/v1/agent/service/deregister/<serviceID>`](#agent_service_deregister) : Deregisters a local service
 * [`/v1/agent/service/maintenance/<serviceID>`](#agent_service_maintenance) : Manages service maintenance mode
@ -310,8 +311,9 @@ This endpoint is used with a check that is of the [TTL type](/docs/agent/checks.
 When this endpoint is accessed via a GET, the status of the check is set to `passing`
 and the TTL clock is reset.

-The optional "?note=" query parameter can be used to associate a human-readable message 
-with the status of the check.
+The optional "?note=" query parameter can be used to associate a human-readable message
+with the status of the check. This will be passed through to the check's `Output` field
+in the check endpoints.

 The return code is 200 on success.

@ -321,8 +323,9 @@ This endpoint is used with a check that is of the [TTL type](/docs/agent/checks.
 When this endpoint is accessed via a GET, the status of the check is set to `warning`,
 and the TTL clock is reset.

-The optional "?note=" query parameter can be used to associate a human-readable message 
-with the status of the check.
+The optional "?note=" query parameter can be used to associate a human-readable message
+with the status of the check. This will be passed through to the check's `Output` field
+in the check endpoints.

 The return code is 200 on success.

@ -332,8 +335,33 @@ This endpoint is used with a check that is of the [TTL type](/docs/agent/checks.
 When this endpoint is accessed via a GET, the status of the check is set to `critical`,
 and the TTL clock is reset.

-The optional "?note=" query parameter can be used to associate a human-readable message 
-with the status of the check.
+The optional "?note=" query parameter can be used to associate a human-readable message
+with the status of the check. This will be passed through to the check's `Output` field
+in the check endpoints.
+
+The return code is 200 on success.
+
+### <a name="agent_check_update"></a> /v1/agent/check/update/\<checkId\>
+
+This endpoint is used with a check that is of the [TTL type](/docs/agent/checks.html).
+When this endpoint is accessed with a PUT, the status and output of the check are
+updated and the TTL clock is reset.
+
+This endpoint expects a JSON request body to be put. The request body must look like:
+
+```javascript
+{
+  "Status": "passing",
+  "Output": "curl reported a failure:\n\n..."
+}
+```
+
+The `Status` field is mandatory, and must be set to "passing", "warning", or "critical".
+
+`Output` is an optional field that will associate a human-readable message with the status
+of the check, such as the output of the checking script or process. This will be truncated
+if it exceeds 4KB in size. This will be passed through to the check's `Output` field in the
+check endpoints.

 The return code is 200 on success.