2016-04-10 08:13:25 +00:00
|
|
|
package command
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"strconv"
|
|
|
|
"strings"
|
2016-04-10 08:45:10 +00:00
|
|
|
"time"
|
2016-04-10 08:13:25 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
// Exit codes returned by the agent check command. The command's help text
// describes its exit code as Nagios compatible.
const (
	// HealthPass is returned when every check succeeds.
	HealthPass = 0
	// HealthWarn is returned when the agent reports neither server nor
	// client stats.
	HealthWarn = 1
	// HealthCritical is returned when a check fails or the agent cannot be
	// queried.
	HealthCritical = 2
	// HealthUnknown is declared alongside the other codes; presumably it
	// maps to the Nagios UNKNOWN state (TODO confirm against callers).
	HealthUnknown = 3
)
|
|
|
|
|
|
|
|
type AgentCheckCommand struct {
|
|
|
|
Meta
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *AgentCheckCommand) Help() string {
|
|
|
|
helpText := `
|
2016-12-24 20:39:33 +00:00
|
|
|
Usage: nomad check [options]
|
2016-06-17 00:34:09 +00:00
|
|
|
|
2016-04-10 08:13:25 +00:00
|
|
|
Display state of the Nomad agent. The exit code of the command is Nagios
|
|
|
|
compatible and could be used with alerting systems.
|
|
|
|
|
|
|
|
General Options:
|
|
|
|
|
|
|
|
` + generalOptionsUsage() + `
|
|
|
|
|
|
|
|
Agent Check Options:
|
2016-06-17 00:34:09 +00:00
|
|
|
|
2016-04-10 08:13:25 +00:00
|
|
|
-min-peers
|
|
|
|
Minimum number of peers that a server is expected to know.
|
2016-04-10 08:45:10 +00:00
|
|
|
|
|
|
|
-min-servers
|
2017-08-07 21:13:05 +00:00
|
|
|
Minimum number of servers that a client is expected to know.
|
2016-04-10 08:13:25 +00:00
|
|
|
`
|
|
|
|
|
|
|
|
return strings.TrimSpace(helpText)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *AgentCheckCommand) Synopsis() string {
|
|
|
|
return "Displays health of the local Nomad agent"
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *AgentCheckCommand) Run(args []string) int {
|
2016-04-10 08:45:10 +00:00
|
|
|
var minPeers, minServers int
|
2016-04-10 08:13:25 +00:00
|
|
|
|
|
|
|
flags := c.Meta.FlagSet("check", FlagSetClient)
|
|
|
|
flags.Usage = func() { c.Ui.Output(c.Help()) }
|
|
|
|
flags.IntVar(&minPeers, "min-peers", 0, "")
|
2016-04-10 08:45:10 +00:00
|
|
|
flags.IntVar(&minServers, "min-servers", 1, "")
|
2016-04-10 08:13:25 +00:00
|
|
|
|
|
|
|
if err := flags.Parse(args); err != nil {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
|
|
|
|
client, err := c.Meta.Client()
|
|
|
|
if err != nil {
|
|
|
|
c.Ui.Error(fmt.Sprintf("error initializing client: %s", err))
|
|
|
|
return HealthCritical
|
|
|
|
}
|
|
|
|
|
|
|
|
info, err := client.Agent().Self()
|
|
|
|
if err != nil {
|
|
|
|
c.Ui.Output(fmt.Sprintf("unable to query agent info: %v", err))
|
|
|
|
return HealthCritical
|
|
|
|
}
|
2017-03-08 14:50:54 +00:00
|
|
|
if _, ok := info.Stats["nomad"]; ok {
|
|
|
|
return c.checkServerHealth(info.Stats, minPeers)
|
2016-04-10 08:13:25 +00:00
|
|
|
}
|
|
|
|
|
2017-03-08 14:50:54 +00:00
|
|
|
if clientStats, ok := info.Stats["client"]; ok {
|
|
|
|
return c.checkClientHealth(clientStats, minServers)
|
2016-04-10 08:13:25 +00:00
|
|
|
}
|
|
|
|
return HealthWarn
|
|
|
|
}
|
|
|
|
|
2016-04-10 08:45:10 +00:00
|
|
|
// checkServerHealth returns the health of a server.
|
|
|
|
// TODO Add more rules for determining server health
|
2017-03-08 14:50:54 +00:00
|
|
|
func (c *AgentCheckCommand) checkServerHealth(info map[string]map[string]string, minPeers int) int {
|
|
|
|
raft := info["raft"]
|
|
|
|
knownPeers, err := strconv.Atoi(raft["num_peers"])
|
2016-04-10 08:13:25 +00:00
|
|
|
if err != nil {
|
|
|
|
c.Ui.Output(fmt.Sprintf("unable to get known peers: %v", err))
|
|
|
|
return HealthCritical
|
|
|
|
}
|
|
|
|
|
|
|
|
if knownPeers < minPeers {
|
|
|
|
c.Ui.Output(fmt.Sprintf("known peers: %v, is less than expected number of peers: %v", knownPeers, minPeers))
|
|
|
|
return HealthCritical
|
|
|
|
}
|
|
|
|
return HealthPass
|
|
|
|
}
|
|
|
|
|
2016-05-15 16:41:34 +00:00
|
|
|
// checkClientHealth returns the health of a client
|
2017-03-08 14:50:54 +00:00
|
|
|
func (c *AgentCheckCommand) checkClientHealth(clientStats map[string]string, minServers int) int {
|
|
|
|
knownServers, err := strconv.Atoi(clientStats["known_servers"])
|
2016-04-10 08:45:10 +00:00
|
|
|
if err != nil {
|
|
|
|
c.Ui.Output(fmt.Sprintf("unable to get known servers: %v", err))
|
|
|
|
return HealthCritical
|
|
|
|
}
|
|
|
|
|
2017-03-08 14:50:54 +00:00
|
|
|
heartbeatTTL, err := time.ParseDuration(clientStats["heartbeat_ttl"])
|
2016-04-10 08:45:10 +00:00
|
|
|
if err != nil {
|
|
|
|
c.Ui.Output(fmt.Sprintf("unable to parse heartbeat TTL: %v", err))
|
|
|
|
return HealthCritical
|
|
|
|
}
|
|
|
|
|
2017-03-08 14:50:54 +00:00
|
|
|
lastHeartbeat, err := time.ParseDuration(clientStats["last_heartbeat"])
|
2016-04-10 08:45:10 +00:00
|
|
|
if err != nil {
|
|
|
|
c.Ui.Output(fmt.Sprintf("unable to parse last heartbeat: %v", err))
|
|
|
|
return HealthCritical
|
|
|
|
}
|
|
|
|
|
|
|
|
if lastHeartbeat > heartbeatTTL {
|
|
|
|
c.Ui.Output(fmt.Sprintf("last heartbeat was %q time ago, expected heartbeat ttl: %q", lastHeartbeat, heartbeatTTL))
|
|
|
|
return HealthCritical
|
|
|
|
}
|
|
|
|
|
|
|
|
if knownServers < minServers {
|
|
|
|
c.Ui.Output(fmt.Sprintf("known servers: %v, is less than expected number of servers: %v", knownServers, minServers))
|
|
|
|
return HealthCritical
|
|
|
|
}
|
|
|
|
|
2016-04-10 08:13:25 +00:00
|
|
|
return HealthPass
|
|
|
|
}
|