diff --git a/command/agent/agent.go b/command/agent/agent.go index c34063a86..13f7c35b9 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -6,6 +6,7 @@ import ( "io" "log" "net" + "net/http" "os" "path/filepath" "strconv" @@ -51,11 +52,16 @@ type Agent struct { state localState // checkMonitors maps the check ID to an associated monitor - // checkTTLs maps the check ID to an associated check TTL - // checkLock protects updates to either checkMonitors map[string]*CheckMonitor - checkTTLs map[string]*CheckTTL - checkLock sync.Mutex + + // checkHTTPs maps the check ID to an associated HTTP check + checkHTTPs map[string]*CheckHTTP + + // checkTTLs maps the check ID to an associated check TTL + checkTTLs map[string]*CheckTTL + + // checkLock protects updates to the check* maps + checkLock sync.Mutex // eventCh is used to receive user events eventCh chan serf.UserEvent @@ -111,6 +117,7 @@ func Create(config *Config, logOutput io.Writer) (*Agent, error) { logOutput: logOutput, checkMonitors: make(map[string]*CheckMonitor), checkTTLs: make(map[string]*CheckTTL), + checkHTTPs: make(map[string]*CheckHTTP), eventCh: make(chan serf.UserEvent, 1024), eventBuf: make([]*UserEvent, 256), shutdownCh: make(chan struct{}), @@ -382,6 +389,10 @@ func (a *Agent) Shutdown() error { chk.Stop() } + for _, chk := range a.checkHTTPs { + chk.Stop() + } + a.logger.Println("[INFO] agent: requesting shutdown") var err error if a.server != nil { @@ -661,6 +672,29 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist ttl.Start() a.checkTTLs[check.CheckID] = ttl + } else if chkType.IsHTTP() { + if existing, ok := a.checkHTTPs[check.CheckID]; ok { + existing.Stop() + } + if chkType.Interval < MinInterval { + a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", + check.CheckID, MinInterval)) + chkType.Interval = MinInterval + } + + http := &CheckHTTP{ + Notify: &a.state, + CheckID: check.CheckID, + HTTP: chkType.HTTP, 
+			Interval:   chkType.Interval,
+			Logger:     a.logger,
+			httpClient: &http.Client{
+				Timeout: chkType.Interval,
+			},
+		}
+		http.Start()
+		a.checkHTTPs[check.CheckID] = http
+
 	} else {
 		if existing, ok := a.checkMonitors[check.CheckID]; ok {
 			existing.Stop()
diff --git a/command/agent/check.go b/command/agent/check.go
index 17e2fb5f0..2a383ae16 100644
--- a/command/agent/check.go
+++ b/command/agent/check.go
@@ -5,6 +5,7 @@ import (
 	"github.com/armon/circbuf"
 	"github.com/hashicorp/consul/consul/structs"
 	"log"
+	"net/http"
 	"os/exec"
 	"sync"
 	"syscall"
@@ -23,10 +24,14 @@ const (
 )
 
 // CheckType is used to create either the CheckMonitor
-// or the CheckTTL. Only one of TTL or Script/Interval
-// needs to be provided
+// or the CheckHTTP or the CheckTTL.
+// Three types are supported: Script, HTTP, and TTL.
+// Script and HTTP both require Interval.
+// Only one of the types needs to be provided:
+// TTL or Script/Interval or HTTP/Interval.
 type CheckType struct {
 	Script   string
+	HTTP     string
 	Interval time.Duration
 	TTL      time.Duration
 
@@ -36,7 +41,7 @@ type CheckType struct {
 
 // Valid checks if the CheckType is valid
 func (c *CheckType) Valid() bool {
-	return c.IsTTL() || c.IsMonitor()
+	return c.IsTTL() || c.IsMonitor() || c.IsHTTP()
 }
 
 // IsTTL checks if this is a TTL type
@@ -49,6 +54,11 @@ func (c *CheckType) IsMonitor() bool {
 	return c.Script != "" && c.Interval != 0
 }
 
+// IsHTTP checks if this is an HTTP type
+func (c *CheckType) IsHTTP() bool {
+	return c.HTTP != "" && c.Interval != 0
+}
+
 // CheckNotifier interface is used by the CheckMonitor
 // to notify when a check has a status update. The update
 // should take care to be idempotent.
@@ -244,3 +254,108 @@ type persistedCheck struct {
 	Check   *structs.HealthCheck
 	ChkType *CheckType
 }
+
+// CheckHTTP is used to periodically make an HTTP request to
+// determine the health of a given check.
+// The check is passing if the response code is 200.
+// The check is warning if the response code is 503.
+// The check is critical if the response code is anything else
+// or if the request returns an error.
+type CheckHTTP struct {
+	Notify   CheckNotifier
+	CheckID  string
+	HTTP     string
+	Interval time.Duration
+	Logger   *log.Logger
+
+	// httpClient issues the requests. Start installs a client whose
+	// timeout is Interval when none has been set.
+	httpClient *http.Client
+	stop       bool
+	stopCh     chan struct{}
+	stopLock   sync.Mutex
+}
+
+// Start is used to start an HTTP check.
+// The check runs until Stop is called.
+func (c *CheckHTTP) Start() {
+	c.stopLock.Lock()
+	defer c.stopLock.Unlock()
+
+	// A CheckHTTP built without a client would otherwise hit a nil
+	// pointer dereference in check(); default to one bounded by Interval.
+	if c.httpClient == nil {
+		c.httpClient = &http.Client{Timeout: c.Interval}
+	}
+
+	c.stop = false
+	c.stopCh = make(chan struct{})
+	go c.run()
+}
+
+// Stop is used to stop an HTTP check. It is a no-op if the check
+// was never started or has already been stopped.
+func (c *CheckHTTP) Stop() {
+	c.stopLock.Lock()
+	defer c.stopLock.Unlock()
+	// stopCh is nil until Start runs, and closing a nil channel panics.
+	if !c.stop && c.stopCh != nil {
+		c.stop = true
+		close(c.stopCh)
+	}
+}
+
+// run is invoked by a goroutine to run until Stop() is called
+func (c *CheckHTTP) run() {
+	// Get the randomized initial pause time
+	initialPauseTime := randomStagger(c.Interval)
+	c.Logger.Printf("[DEBUG] agent: pausing %v before first HTTP request of %s", initialPauseTime, c.HTTP)
+	next := time.After(initialPauseTime)
+	for {
+		select {
+		case <-next:
+			c.check()
+			next = time.After(c.Interval)
+		case <-c.stopCh:
+			return
+		}
+	}
+}
+
+// check is invoked periodically to perform the HTTP check.
+// The reported output is always "<status> from <url>", or the
+// request error when no response was received.
+func (c *CheckHTTP) check() {
+	resp, err := c.httpClient.Get(c.HTTP)
+	if err != nil {
+		c.Logger.Printf("[WARN] agent: http request failed '%s': %s", c.HTTP, err)
+		c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, err.Error())
+		return
+	}
+	defer resp.Body.Close()
+
+	// Only the status is inspected; the body is never read
+	result := fmt.Sprintf("%s from %s", resp.Status, c.HTTP)
+
+	switch resp.StatusCode {
+
+	// PASSING
+	case http.StatusOK:
+		c.Logger.Printf("[DEBUG] agent: http check '%v' is passing", c.CheckID)
+		c.Notify.UpdateCheck(c.CheckID, structs.HealthPassing, result)
+
+	// WARNING
+	// 503 Service Unavailable
+	//	The server is currently unable to handle the request due to
+	//	a temporary overloading or maintenance of the server.
+	//	http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
+	case http.StatusServiceUnavailable:
+		c.Logger.Printf("[WARN] agent: http check '%v' is now warning", c.CheckID)
+		c.Notify.UpdateCheck(c.CheckID, structs.HealthWarning, result)
+
+	// CRITICAL
+	default:
+		c.Logger.Printf("[WARN] agent: http check '%v' is now critical", c.CheckID)
+		c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, result)
+	}
+}
diff --git a/website/source/docs/agent/checks.html.markdown b/website/source/docs/agent/checks.html.markdown index 8a31a0462..b3ee93260 100644 --- a/website/source/docs/agent/checks.html.markdown +++ b/website/source/docs/agent/checks.html.markdown @@ -13,13 +13,18 @@ application level health checks. A health check is considered to be application level if it associated with a service. A check is defined in a configuration file, or added at runtime over the HTTP interface. -There are two different kinds of checks: +There are three different kinds of checks: * Script + Interval - These checks depend on invoking an external application that does the health check and exits with an appropriate exit code, potentially generating some output. A script is paired with an invocation interval (e.g. every 30 seconds). This is similar to the Nagios plugin system. + * HTTP + Interval - These checks make an `HTTP GET` request every Interval (e.g. + every 30 seconds) to the specified URL. The status of the service depends on the HTTP Response Code. + `200` is passing, `503` is warning and anything else is failing. + This type of check should be preferred over a script that for example uses `curl`. + * Time to Live (TTL) - These checks retain their last known state for a given TTL. The state of the check must be updated periodically over the HTTP interface.
If an external system fails to update the status within a given TTL, the check is @@ -43,6 +48,19 @@ A check definition that is a script looks like: } ``` +An HTTP based check looks like: + +```javascript +{ + "check": { + "id": "api", + "name": "HTTP API on port 5000", + "http": "http://localhost:5000/health", + "interval": "10s" + } +} +``` + A TTL based check is very similar: ```javascript @@ -56,7 +74,7 @@ A TTL based check is very similar: } ``` -Both types of definitions must include a `name`, and may optionally +Each type of definition must include a `name`, and may optionally provide an `id` and `notes` field. The `id` is set to the `name` if not provided. It is required that all checks have a unique ID per node, so if names might conflict then unique ID's should be provided. @@ -102,6 +120,12 @@ key in your configuration file. }, { "id": "chk2", + "name": "/health", + "http": "http://localhost:5000/health", + "interval": "15s" + }, + { + "id": "chk3", "name": "cpu", "script": "/bin/check_cpu", "interval": "10s" diff --git a/website/source/docs/agent/http.html.markdown b/website/source/docs/agent/http.html.markdown index 84efaf524..b67bab02a 100644 --- a/website/source/docs/agent/http.html.markdown +++ b/website/source/docs/agent/http.html.markdown @@ -422,7 +422,7 @@ The endpoint always returns 200. The register endpoint is used to add a new check to the local agent. There is more documentation on checks [here](/docs/agent/checks.html). -Checks are either a script or TTL type. The agent is responsible for managing +Checks are of script, HTTP, or TTL type. The agent is responsible for managing the status of the check and keeping the Catalog in sync. The register endpoint expects a JSON request body to be PUT.
The request @@ -434,20 +434,25 @@ body must look like: "Name": "Memory utilization", "Notes": "Ensure we don't oversubscribe memory", "Script": "/usr/local/bin/check_mem.py", + "HTTP": "http://example.com", "Interval": "10s", "TTL": "15s" } ``` -The `Name` field is mandatory, as is either `Script` and `Interval` -or `TTL`. Only one of `Script` and `Interval` or `TTL` should be provided. +The `Name` field is mandatory, as is exactly one of `Script`, `HTTP` or `TTL`. +`Script` and `HTTP` also require that `Interval` be set. + If an `ID` is not provided, it is set to `Name`. You cannot have duplicate `ID` entries per agent, so it may be necessary to provide an ID. The `Notes` field is not used by Consul, and is meant to be human readable. If a `Script` is provided, the check type is a script, and Consul will -evaluate the script every `Interval` to update the status. If a `TTL` type -is used, then the TTL update APIs must be used to periodically update +evaluate the script every `Interval` to update the status. + +An `HTTP` check will perform an HTTP GET request to the value of `HTTP` (expected to be a URL) every `Interval`. If the response is `200`, the check is passing; if the response is `503`, the check is warning; otherwise the check is critical. + +If a `TTL` type is used, then the TTL update APIs must be used to periodically update the state of the check. The return code is 200 on success. @@ -515,6 +520,7 @@ body must look like: "Port": 8000, "Check": { "Script": "/usr/local/bin/check_redis.py", + "HTTP": "http://localhost:5000/health", "Interval": "10s", "TTL": "15s" } @@ -523,8 +529,10 @@ body must look like: The `Name` field is mandatory, If an `ID` is not provided, it is set to `Name`. You cannot have duplicate `ID` entries per agent, so it may be necessary to provide an ID. -`Tags`, `Address`, `Port` and `Check` are optional. If `Check` is provided, only one of `Script` and `Interval` -or `TTL` should be provided.
There is more information about checks [here](/docs/agent/checks.html). +`Tags`, `Address`, `Port` and `Check` are optional. +If `Check` is provided, only one of `Script`, `HTTP` or `TTL` should be provided. +`Script` and `HTTP` also require `Interval`. +There is more information about checks [here](/docs/agent/checks.html). The `Address` will default to that of the agent if not provided. The created check will be named "service:\". diff --git a/website/source/docs/agent/services.html.markdown b/website/source/docs/agent/services.html.markdown index 2a4bce7a5..95ee7ba82 100644 --- a/website/source/docs/agent/services.html.markdown +++ b/website/source/docs/agent/services.html.markdown @@ -55,7 +55,8 @@ a node has any failing system-level check, the DNS interface will omit that node from any service query. There is more information about [checks here](/docs/agent/checks.html). The -check must be of the script or TTL type. If it is a script type, `script` and +check must be of the script, HTTP or TTL type. If it is a script type, `script` and +`interval` must be provided. If it is an HTTP type, `http` and `interval` must be provided. If it is a TTL type, then only `ttl` must be provided. The check name is automatically generated as "service:".