command/agent: Add simple HTTP check type

These checks make an `HTTP GET` request every Interval to the specified URL.
The status of the service depends on the HTTP Response Code.
`200` is passing, `503` is warning and anything else is failing.
This commit is contained in:
Nicholas Capo 2015-01-09 16:43:24 -06:00
parent e129236564
commit f2649edcfc
5 changed files with 184 additions and 17 deletions

View File

@ -6,6 +6,7 @@ import (
"io"
"log"
"net"
"net/http"
"os"
"path/filepath"
"strconv"
@ -51,11 +52,16 @@ type Agent struct {
state localState
// checkMonitors maps the check ID to an associated monitor
// checkTTLs maps the check ID to an associated check TTL
// checkLock protects updates to either
checkMonitors map[string]*CheckMonitor
checkTTLs map[string]*CheckTTL
checkLock sync.Mutex
// checkHTTPs maps the check ID to an associated HTTP check
checkHTTPs map[string]*CheckHTTP
// checkTTLs maps the check ID to an associated check TTL
checkTTLs map[string]*CheckTTL
// checkLock protects updates to the check* maps
checkLock sync.Mutex
// eventCh is used to receive user events
eventCh chan serf.UserEvent
@ -111,6 +117,7 @@ func Create(config *Config, logOutput io.Writer) (*Agent, error) {
logOutput: logOutput,
checkMonitors: make(map[string]*CheckMonitor),
checkTTLs: make(map[string]*CheckTTL),
checkHTTPs: make(map[string]*CheckHTTP),
eventCh: make(chan serf.UserEvent, 1024),
eventBuf: make([]*UserEvent, 256),
shutdownCh: make(chan struct{}),
@ -382,6 +389,10 @@ func (a *Agent) Shutdown() error {
chk.Stop()
}
for _, chk := range a.checkHTTPs {
chk.Stop()
}
a.logger.Println("[INFO] agent: requesting shutdown")
var err error
if a.server != nil {
@ -661,6 +672,29 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist
ttl.Start()
a.checkTTLs[check.CheckID] = ttl
} else if chkType.IsHTTP() {
if existing, ok := a.checkHTTPs[check.CheckID]; ok {
existing.Stop()
}
if chkType.Interval < MinInterval {
a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
check.CheckID, MinInterval))
chkType.Interval = MinInterval
}
http := &CheckHTTP{
Notify: &a.state,
CheckID: check.CheckID,
HTTP: chkType.HTTP,
Interval: chkType.Interval,
Logger: a.logger,
httpClient: &http.Client{
Timeout: chkType.Interval,
},
}
http.Start()
a.checkHTTPs[check.CheckID] = http
} else {
if existing, ok := a.checkMonitors[check.CheckID]; ok {
existing.Stop()

View File

@ -5,6 +5,7 @@ import (
"github.com/armon/circbuf"
"github.com/hashicorp/consul/consul/structs"
"io"
"io/ioutil"
"log"
"net/http"
"os/exec"
"sync"
"syscall"
@ -23,10 +24,14 @@ const (
)
// CheckType is used to create either the CheckMonitor
// or the CheckTTL. Only one of TTL or Script/Interval
// needs to be provided
// or the CheckTTL or the CheckHTTP.
// Three types are supported: Script, HTTP, and TTL
// Script and HTTP both require Interval
// Only one of the types needs to be provided
// TTL or Script/Interval or HTTP/Interval
type CheckType struct {
Script string
HTTP string
Interval time.Duration
TTL time.Duration
@ -36,7 +41,7 @@ type CheckType struct {
// Valid reports whether the CheckType matches one of the supported
// check kinds, as decided by the Is* predicates it calls.
func (c *CheckType) Valid() bool {
return c.IsTTL() || c.IsMonitor()
return c.IsTTL() || c.IsMonitor() || c.IsHTTP()
}
// IsTTL checks if this is a TTL type
@ -49,6 +54,11 @@ func (c *CheckType) IsMonitor() bool {
return c.Script != "" && c.Interval != 0
}
// IsHTTP reports whether this CheckType describes an HTTP check:
// the HTTP field must be non-empty and Interval must be non-zero.
func (c *CheckType) IsHTTP() bool {
	if c.HTTP == "" {
		return false
	}
	return c.Interval != 0
}
// CheckNotifier interface is used by the CheckMonitor
// to notify when a check has a status update. The update
// should take care to be idempotent.
@ -244,3 +254,93 @@ type persistedCheck struct {
Check *structs.HealthCheck
ChkType *CheckType
}
// CheckHTTP is used to periodically make an HTTP GET request to
// determine the health of a given check.
// The check is passing if the response code is 200.
// The check is warning if the response code is 503.
// The check is critical if the response code is anything else
// or if the request returns an error.
// Start must be called to begin checking; Stop ends it.
type CheckHTTP struct {
// Notify receives the result of every check via UpdateCheck.
Notify CheckNotifier
// CheckID is the ID reported to Notify with each result.
CheckID string
// HTTP is the URL that is requested with GET.
HTTP string
// Interval is the delay between consecutive checks; the initial
// delay is derived from it via randomStagger.
Interval time.Duration
// Logger receives debug and warning messages about the check.
Logger *log.Logger

// httpClient performs the GET request.
httpClient *http.Client
// stop is set to true by Stop and reset to false by Start.
stop bool
// stopCh is created by Start and closed by Stop to end the run loop.
stopCh chan struct{}
// stopLock is held by Start and Stop while they touch stop and stopCh.
stopLock sync.Mutex
}
// Start begins the HTTP check by launching the run loop in its own
// goroutine. The loop keeps going until Stop is called.
func (c *CheckHTTP) Start() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()

	// Fresh stop channel and cleared flag for this run.
	c.stopCh, c.stop = make(chan struct{}), false
	go c.run()
}
// Stop ends the HTTP check by closing the stop channel. Calling it
// again after the check has been stopped is a no-op.
func (c *CheckHTTP) Stop() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()

	// Already stopped: closing the channel twice would panic.
	if c.stop {
		return
	}
	c.stop = true
	close(c.stopCh)
}
// run is the body of the check goroutine. It waits a randomized
// initial delay, then performs a check every Interval until the
// stop channel is closed.
func (c *CheckHTTP) run() {
	// Randomized pause before the first request.
	stagger := randomStagger(c.Interval)
	c.Logger.Printf("[DEBUG] agent: pausing %v before first HTTP request of %s", stagger, c.HTTP)

	// The timer is re-armed only after check() has returned, so the
	// interval is measured from the end of one check to the next.
	for timer := time.After(stagger); ; timer = time.After(c.Interval) {
		select {
		case <-c.stopCh:
			return
		case <-timer:
			c.check()
		}
	}
}
// check is invoked periodically to perform the HTTP check.
// It issues a GET to c.HTTP and reports the outcome through c.Notify:
// 200 is passing, 503 is warning, and any other status code or a
// request error is critical.
func (c *CheckHTTP) check() {
	// Upper bound on how much of the response body is drained.
	const drainLimit = 4 * 1024

	resp, err := c.httpClient.Get(c.HTTP)
	if err != nil {
		c.Logger.Printf("[WARN] agent: http request failed '%s': %s", c.HTTP, err)
		c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, err.Error())
		return
	}
	defer resp.Body.Close()

	// Read the body before closing it: per the net/http documentation the
	// transport may not be able to reuse a keep-alive connection unless
	// the body is read to EOF and closed. The drain is bounded so a huge
	// response cannot stall the check. The error (including io.EOF for
	// short bodies) is ignored on purpose: only the status code matters.
	_, _ = io.CopyN(ioutil.Discard, resp.Body, drainLimit)

	// Use the same output format for every status so the reported text
	// always identifies the URL that was probed.
	result := fmt.Sprintf("%s from %s", resp.Status, c.HTTP)

	switch resp.StatusCode {
	// PASSING
	case http.StatusOK:
		c.Logger.Printf("[DEBUG] agent: http check '%v' is passing", c.CheckID)
		c.Notify.UpdateCheck(c.CheckID, structs.HealthPassing, result)

	// WARNING
	// 503 Service Unavailable
	//	The server is currently unable to handle the request due to
	//	a temporary overloading or maintenance of the server.
	//	http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
	case http.StatusServiceUnavailable:
		c.Logger.Printf("[WARN] agent: http check '%v' is now warning", c.CheckID)
		c.Notify.UpdateCheck(c.CheckID, structs.HealthWarning, result)

	// CRITICAL
	default:
		c.Logger.Printf("[WARN] agent: http check '%v' is now critical", c.CheckID)
		c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, result)
	}
}

View File

@ -13,13 +13,18 @@ application level health checks. A health check is considered to be application
level if it is associated with a service. A check is defined in a configuration file,
or added at runtime over the HTTP interface.
There are two different kinds of checks:
There are three different kinds of checks:
* Script + Interval - These checks depend on invoking an external application
that does the health check and exits with an appropriate exit code, potentially
generating some output. A script is paired with an invocation interval (e.g.
every 30 seconds). This is similar to the Nagios plugin system.
* HTTP + Interval - These checks make an `HTTP GET` request every Interval (e.g.
every 30 seconds) to the specified URL. The status of the service depends on the HTTP Response Code.
`200` is passing, `503` is warning and anything else is failing.
This type of check should be preferred over a script that uses, for example, `curl`.
* Time to Live (TTL) - These checks retain their last known state for a given TTL.
The state of the check must be updated periodically over the HTTP interface. If an
external system fails to update the status within a given TTL, the check is
@ -43,6 +48,19 @@ A check definition that is a script looks like:
}
```
An HTTP based check looks like:
```javascript
{
"check": {
"id": "api",
"name": "HTTP API on port 5000",
"http": "http://localhost:5000/health",
"interval": "10s"
}
}
```
A TTL based check is very similar:
```javascript
@ -56,7 +74,7 @@ A TTL based check is very similar:
}
```
Both types of definitions must include a `name`, and may optionally
Each type of definition must include a `name`, and may optionally
provide an `id` and `notes` field. The `id` is set to the `name` if not
provided. It is required that all checks have a unique ID per node, so if names
might conflict then unique ID's should be provided.
@ -102,6 +120,12 @@ key in your configuration file.
},
{
"id": "chk2",
"name": "/health",
"http": "http://localhost:5000/health",
"interval": "15s"
},
{
"id": "chk3",
"name": "cpu",
"script": "/bin/check_cpu",
"interval": "10s"

View File

@ -422,7 +422,7 @@ The endpoint always returns 200.
The register endpoint is used to add a new check to the local agent.
There is more documentation on checks [here](/docs/agent/checks.html).
Checks are either a script or TTL type. The agent is responsible for managing
Checks are of script, HTTP, or TTL type. The agent is responsible for managing
the status of the check and keeping the Catalog in sync.
The register endpoint expects a JSON request body to be PUT. The request
@ -434,20 +434,25 @@ body must look like:
"Name": "Memory utilization",
"Notes": "Ensure we don't oversubscribe memory",
"Script": "/usr/local/bin/check_mem.py",
"HTTP": "http://example.com",
"Interval": "10s",
"TTL": "15s"
}
```
The `Name` field is mandatory, as is either `Script` and `Interval`
or `TTL`. Only one of `Script` and `Interval` or `TTL` should be provided.
The `Name` field is mandatory, as is one of `Script`, `HTTP` or `TTL`.
`Script` and `HTTP` also require that `Interval` be set.
If an `ID` is not provided, it is set to `Name`. You cannot have duplicate
`ID` entries per agent, so it may be necessary to provide an ID. The `Notes`
field is not used by Consul, and is meant to be human readable.
If a `Script` is provided, the check type is a script, and Consul will
evaluate the script every `Interval` to update the status. If a `TTL` type
is used, then the TTL update APIs must be used to periodically update
evaluate the script every `Interval` to update the status.
An `HTTP` check will perform an HTTP GET request to the value of `HTTP` (expected to be a URL) every `Interval`. If the response is `200` the check is passing, if the response is `503` the check is warning, otherwise the check is critical.
If a `TTL` type is used, then the TTL update APIs must be used to periodically update
the state of the check.
The return code is 200 on success.
@ -515,6 +520,7 @@ body must look like:
"Port": 8000,
"Check": {
"Script": "/usr/local/bin/check_redis.py",
"HTTP": "http://localhost:5000/health",
"Interval": "10s",
"TTL": "15s"
}
@ -523,8 +529,10 @@ body must look like:
The `Name` field is mandatory. If an `ID` is not provided, it is set to `Name`.
You cannot have duplicate `ID` entries per agent, so it may be necessary to provide an ID.
`Tags`, `Address`, `Port` and `Check` are optional. If `Check` is provided, only one of `Script` and `Interval`
or `TTL` should be provided. There is more information about checks [here](/docs/agent/checks.html).
`Tags`, `Address`, `Port` and `Check` are optional.
If `Check` is provided, only one of `Script`, `HTTP` or `TTL` should be provided.
`Script` and `HTTP` also require `Interval`.
There is more information about checks [here](/docs/agent/checks.html).
The `Address` will default to that of the agent if not provided.
The created check will be named "service:\<ServiceId\>".

View File

@ -55,7 +55,8 @@ a node has any failing system-level check, the DNS interface will omit that
node from any service query.
There is more information about [checks here](/docs/agent/checks.html). The
check must be of the script or TTL type. If it is a script type, `script` and
check must be of the script, HTTP or TTL type. If it is a script type, `script` and
`interval` must be provided. If it is an HTTP type, `http` and
`interval` must be provided. If it is a TTL type, then only `ttl` must be
provided. The check name is automatically generated as "service:<service-id>".