Add an API method for determining the best status

Given a list of HealthChecks, this determines the "best" status for the
collective group. This is useful for nodes and services, which may have
multiple checks associated with them.
This commit is contained in:
Seth Vargo 2016-11-29 16:15:20 -05:00
parent 89981b147b
commit 1c55429a38
No known key found for this signature in database
GPG Key ID: 905A90C2949E8787
6 changed files with 213 additions and 17 deletions

View File

@ -2,6 +2,8 @@ package api
import ( import (
"fmt" "fmt"
"log"
"strings"
) )
const ( const (
@ -11,6 +13,15 @@ const (
HealthPassing = "passing" HealthPassing = "passing"
HealthWarning = "warning" HealthWarning = "warning"
HealthCritical = "critical" HealthCritical = "critical"
HealthMaint = "maintenance"
)
const (
// NodeMaint is the special key set by a node in maintenance mode.
NodeMaint = "_node_maintenance"
// ServiceMaintPrefix is the prefix for a service in maintenance mode.
ServiceMaintPrefix = "_service_maintenance:"
) )
// HealthCheck is used to represent a single check // HealthCheck is used to represent a single check
@ -25,6 +36,52 @@ type HealthCheck struct {
ServiceName string ServiceName string
} }
// HealthChecks is a collection of pointers to HealthCheck structs. It is a
// named slice type so that helper methods, such as AggregatedStatus, can be
// defined on a whole group of checks.
type HealthChecks []*HealthCheck
// AggregatedStatus returns the single status that best represents the whole
// list of health checks. Because a given entry may have many service and
// node-level health checks attached, the most severe status present wins,
// using the following precedence:
//
//	maintenance > critical > warning > passing
//
// A check counts as maintenance when its CheckID equals NodeMaint or starts
// with ServiceMaintPrefix; the Status field of such a check is not inspected.
// An empty or nil list is reported as passing, and nil entries are ignored.
// If a non-maintenance check carries a status other than passing, warning or
// critical, a warning is logged and the empty string is returned immediately,
// regardless of any checks already seen.
func (c HealthChecks) AggregatedStatus() string {
	var warning, critical, maintenance bool
	for _, check := range c {
		// Skip nil entries rather than panicking on a nil dereference.
		if check == nil {
			continue
		}

		id := string(check.CheckID)
		if id == NodeMaint || strings.HasPrefix(id, ServiceMaintPrefix) {
			maintenance = true
			continue
		}

		switch check.Status {
		case HealthPassing:
			// Passing is the fallback result, so there is nothing to record.
		case HealthWarning:
			warning = true
		case HealthCritical:
			critical = true
		default:
			log.Printf("[WARN] unknown status %q", check.Status)
			return ""
		}
	}

	// Report the most severe condition observed.
	switch {
	case maintenance:
		return HealthMaint
	case critical:
		return HealthCritical
	case warning:
		return HealthWarning
	default:
		return HealthPassing
	}
}
// ServiceEntry is used for the health service endpoint // ServiceEntry is used for the health service endpoint
type ServiceEntry struct { type ServiceEntry struct {
Node *Node Node *Node

View File

@ -38,6 +38,139 @@ func TestHealth_Node(t *testing.T) {
}) })
} }
// TestHealthChecks_AggregatedStatus runs a table of check lists through
// HealthChecks.AggregatedStatus and compares each result with the expected
// status string. The table covers the empty list, each individual status,
// both maintenance markers, an unknown status, and mixed lists that exercise
// the precedence order.
func TestHealthChecks_AggregatedStatus(t *testing.T) {
	t.Parallel()

	type scenario struct {
		label string
		input HealthChecks
		want  string
	}

	// The position of each scenario is part of its subtest name, so the order
	// of this table is significant.
	scenarios := []scenario{
		{
			label: "empty",
			input: nil,
			want:  HealthPassing,
		},
		{
			label: "passing",
			input: HealthChecks{{Status: HealthPassing}},
			want:  HealthPassing,
		},
		{
			label: "warning",
			input: HealthChecks{{Status: HealthWarning}},
			want:  HealthWarning,
		},
		{
			label: "critical",
			input: HealthChecks{{Status: HealthCritical}},
			want:  HealthCritical,
		},
		{
			label: "node_maintenance",
			input: HealthChecks{{CheckID: NodeMaint}},
			want:  HealthMaint,
		},
		{
			label: "service_maintenance",
			input: HealthChecks{{CheckID: ServiceMaintPrefix + "service"}},
			want:  HealthMaint,
		},
		{
			label: "unknown",
			input: HealthChecks{{Status: "nope-nope-noper"}},
			want:  "",
		},
		{
			label: "maintenance_over_critical",
			input: HealthChecks{
				{CheckID: NodeMaint},
				{Status: HealthCritical},
			},
			want: HealthMaint,
		},
		{
			label: "critical_over_warning",
			input: HealthChecks{
				{Status: HealthCritical},
				{Status: HealthWarning},
			},
			want: HealthCritical,
		},
		{
			label: "warning_over_passing",
			input: HealthChecks{
				{Status: HealthWarning},
				{Status: HealthPassing},
			},
			want: HealthWarning,
		},
		{
			label: "lots",
			input: HealthChecks{
				{Status: HealthPassing},
				{Status: HealthPassing},
				{Status: HealthPassing},
				{Status: HealthWarning},
			},
			want: HealthWarning,
		},
	}

	for idx, sc := range scenarios {
		t.Run(fmt.Sprintf("%d_%s", idx, sc.label), func(t *testing.T) {
			got := sc.input.AggregatedStatus()
			if sc.want != got {
				t.Errorf("\nexp: %#v\nact: %#v", sc.want, got)
			}
		})
	}
}
func TestHealth_Checks(t *testing.T) { func TestHealth_Checks(t *testing.T) {
t.Parallel() t.Parallel()
c, s := makeClient(t) c, s := makeClient(t)

View File

@ -34,10 +34,6 @@ const (
checksDir = "checks" checksDir = "checks"
checkStateDir = "checks/state" checkStateDir = "checks/state"
// The ID of the faux health checks for maintenance mode
serviceMaintCheckPrefix = "_service_maintenance"
nodeMaintCheckID = "_node_maintenance"
// Default reasons for node/service maintenance mode // Default reasons for node/service maintenance mode
defaultNodeMaintReason = "Maintenance mode is enabled for this node, " + defaultNodeMaintReason = "Maintenance mode is enabled for this node, " +
"but no reason was provided. This is a default message." "but no reason was provided. This is a default message."
@ -1532,7 +1528,7 @@ func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) {
// serviceMaintCheckID returns the ID of a given service's maintenance check // serviceMaintCheckID returns the ID of a given service's maintenance check
func serviceMaintCheckID(serviceID string) types.CheckID { func serviceMaintCheckID(serviceID string) types.CheckID {
return types.CheckID(fmt.Sprintf("%s:%s", serviceMaintCheckPrefix, serviceID)) return types.CheckID(structs.ServiceMaintPrefix + serviceID)
} }
// EnableServiceMaintenance will register a false health check against the given // EnableServiceMaintenance will register a false health check against the given
@ -1593,7 +1589,7 @@ func (a *Agent) DisableServiceMaintenance(serviceID string) error {
// EnableNodeMaintenance places a node into maintenance mode. // EnableNodeMaintenance places a node into maintenance mode.
func (a *Agent) EnableNodeMaintenance(reason, token string) { func (a *Agent) EnableNodeMaintenance(reason, token string) {
// Ensure node maintenance is not already enabled // Ensure node maintenance is not already enabled
if _, ok := a.state.Checks()[nodeMaintCheckID]; ok { if _, ok := a.state.Checks()[structs.NodeMaint]; ok {
return return
} }
@ -1605,7 +1601,7 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {
// Create and register the node maintenance check // Create and register the node maintenance check
check := &structs.HealthCheck{ check := &structs.HealthCheck{
Node: a.config.NodeName, Node: a.config.NodeName,
CheckID: nodeMaintCheckID, CheckID: structs.NodeMaint,
Name: "Node Maintenance Mode", Name: "Node Maintenance Mode",
Notes: reason, Notes: reason,
Status: structs.HealthCritical, Status: structs.HealthCritical,
@ -1616,10 +1612,10 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {
// DisableNodeMaintenance removes a node from maintenance mode // DisableNodeMaintenance removes a node from maintenance mode
func (a *Agent) DisableNodeMaintenance() { func (a *Agent) DisableNodeMaintenance() {
if _, ok := a.state.Checks()[nodeMaintCheckID]; !ok { if _, ok := a.state.Checks()[structs.NodeMaint]; !ok {
return return
} }
a.RemoveCheck(nodeMaintCheckID, true) a.RemoveCheck(structs.NodeMaint, true)
a.logger.Printf("[INFO] agent: Node left maintenance mode") a.logger.Printf("[INFO] agent: Node left maintenance mode")
} }

View File

@ -926,13 +926,13 @@ func TestHTTPAgent_EnableNodeMaintenance(t *testing.T) {
} }
// Ensure the maintenance check was registered // Ensure the maintenance check was registered
check, ok := srv.agent.state.Checks()[nodeMaintCheckID] check, ok := srv.agent.state.Checks()[structs.NodeMaint]
if !ok { if !ok {
t.Fatalf("should have registered maintenance check") t.Fatalf("should have registered maintenance check")
} }
// Check that the token was used // Check that the token was used
if token := srv.agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" { if token := srv.agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
t.Fatalf("expected 'mytoken', got '%s'", token) t.Fatalf("expected 'mytoken', got '%s'", token)
} }
@ -962,7 +962,7 @@ func TestHTTPAgent_DisableNodeMaintenance(t *testing.T) {
} }
// Ensure the maintenance check was removed // Ensure the maintenance check was removed
if _, ok := srv.agent.state.Checks()[nodeMaintCheckID]; ok { if _, ok := srv.agent.state.Checks()[structs.NodeMaint]; ok {
t.Fatalf("should have removed maintenance check") t.Fatalf("should have removed maintenance check")
} }
} }

View File

@ -1577,13 +1577,13 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
agent.EnableNodeMaintenance("broken", "mytoken") agent.EnableNodeMaintenance("broken", "mytoken")
// Make sure the critical health check was added // Make sure the critical health check was added
check, ok := agent.state.Checks()[nodeMaintCheckID] check, ok := agent.state.Checks()[structs.NodeMaint]
if !ok { if !ok {
t.Fatalf("should have registered critical node check") t.Fatalf("should have registered critical node check")
} }
// Check that the token was used to register the check // Check that the token was used to register the check
if token := agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" { if token := agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
t.Fatalf("expected 'mytoken', got: '%s'", token) t.Fatalf("expected 'mytoken', got: '%s'", token)
} }
@ -1596,7 +1596,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
agent.DisableNodeMaintenance() agent.DisableNodeMaintenance()
// Ensure the check was deregistered // Ensure the check was deregistered
if _, ok := agent.state.Checks()[nodeMaintCheckID]; ok { if _, ok := agent.state.Checks()[structs.NodeMaint]; ok {
t.Fatalf("should have deregistered critical node check") t.Fatalf("should have deregistered critical node check")
} }
@ -1604,7 +1604,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
agent.EnableNodeMaintenance("", "") agent.EnableNodeMaintenance("", "")
// Make sure the check was registered with the default note // Make sure the check was registered with the default note
check, ok = agent.state.Checks()[nodeMaintCheckID] check, ok = agent.state.Checks()[structs.NodeMaint]
if !ok { if !ok {
t.Fatalf("should have registered critical node check") t.Fatalf("should have registered critical node check")
} }

View File

@ -56,6 +56,15 @@ const (
HealthPassing = "passing" HealthPassing = "passing"
HealthWarning = "warning" HealthWarning = "warning"
HealthCritical = "critical" HealthCritical = "critical"
HealthMaint = "maintenance"
)
const (
// NodeMaint is the special key set by a node in maintenance mode.
NodeMaint = "_node_maintenance"
// ServiceMaintPrefix is the prefix for a service in maintenance mode.
ServiceMaintPrefix = "_service_maintenance:"
) )
func ValidStatus(s string) bool { func ValidStatus(s string) bool {
@ -412,6 +421,7 @@ func (c *HealthCheck) Clone() *HealthCheck {
return clone return clone
} }
// HealthChecks is a collection of HealthCheck structs.
type HealthChecks []*HealthCheck type HealthChecks []*HealthCheck
// CheckServiceNode is used to provide the node, its service // CheckServiceNode is used to provide the node, its service
@ -460,7 +470,7 @@ type NodeInfo struct {
Address string Address string
TaggedAddresses map[string]string TaggedAddresses map[string]string
Services []*NodeService Services []*NodeService
Checks []*HealthCheck Checks HealthChecks
} }
// NodeDump is used to dump all the nodes with all their // NodeDump is used to dump all the nodes with all their