Add an API method for determining the best status
Given a list of HealthChecks, this determines the "best" status for the collective group. This is useful for nodes and services, which may have multiple checks associated with them.
This commit is contained in:
parent
89981b147b
commit
1c55429a38
|
@ -2,6 +2,8 @@ package api
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const (
|
||||
|
@ -11,6 +13,15 @@ const (
|
|||
HealthPassing = "passing"
|
||||
HealthWarning = "warning"
|
||||
HealthCritical = "critical"
|
||||
HealthMaint = "maintenance"
|
||||
)
|
||||
|
||||
const (
|
||||
// NodeMaint is the special key set by a node in maintenance mode.
|
||||
NodeMaint = "_node_maintenance"
|
||||
|
||||
// ServiceMaintPrefix is the prefix for a service in maintenance mode.
|
||||
ServiceMaintPrefix = "_service_maintenance:"
|
||||
)
|
||||
|
||||
// HealthCheck is used to represent a single check
|
||||
|
@ -25,6 +36,52 @@ type HealthCheck struct {
|
|||
ServiceName string
|
||||
}
|
||||
|
||||
// HealthChecks is a collection of HealthCheck structs.
|
||||
type HealthChecks []*HealthCheck
|
||||
|
||||
// AggregatedStatus returns the "best" status for the list of health checks.
|
||||
// Because a given entry may have many service and node-level health checks
|
||||
// attached, this function determines the best representative of the status as
|
||||
// as single string using the following heuristic:
|
||||
//
|
||||
// maintenance > critical > warning > passing
|
||||
//
|
||||
func (c HealthChecks) AggregatedStatus() string {
|
||||
var passing, warning, critical, maintenance bool
|
||||
for _, check := range c {
|
||||
id := string(check.CheckID)
|
||||
if id == NodeMaint || strings.HasPrefix(id, ServiceMaintPrefix) {
|
||||
maintenance = true
|
||||
continue
|
||||
}
|
||||
|
||||
switch check.Status {
|
||||
case HealthPassing:
|
||||
passing = true
|
||||
case HealthWarning:
|
||||
warning = true
|
||||
case HealthCritical:
|
||||
critical = true
|
||||
default:
|
||||
log.Printf("[WARN] unknown status %q", check.Status)
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
switch {
|
||||
case maintenance:
|
||||
return HealthMaint
|
||||
case critical:
|
||||
return HealthCritical
|
||||
case warning:
|
||||
return HealthWarning
|
||||
case passing:
|
||||
return HealthPassing
|
||||
default:
|
||||
return HealthPassing
|
||||
}
|
||||
}
|
||||
|
||||
// ServiceEntry is used for the health service endpoint
|
||||
type ServiceEntry struct {
|
||||
Node *Node
|
||||
|
|
|
@ -38,6 +38,139 @@ func TestHealth_Node(t *testing.T) {
|
|||
})
|
||||
}
|
||||
|
||||
func TestHealthChecks_AggregatedStatus(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
checks HealthChecks
|
||||
exp string
|
||||
}{
|
||||
{
|
||||
"empty",
|
||||
nil,
|
||||
HealthPassing,
|
||||
},
|
||||
{
|
||||
"passing",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
Status: HealthPassing,
|
||||
},
|
||||
},
|
||||
HealthPassing,
|
||||
},
|
||||
{
|
||||
"warning",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
Status: HealthWarning,
|
||||
},
|
||||
},
|
||||
HealthWarning,
|
||||
},
|
||||
{
|
||||
"critical",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
Status: HealthCritical,
|
||||
},
|
||||
},
|
||||
HealthCritical,
|
||||
},
|
||||
{
|
||||
"node_maintenance",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
CheckID: NodeMaint,
|
||||
},
|
||||
},
|
||||
HealthMaint,
|
||||
},
|
||||
{
|
||||
"service_maintenance",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
CheckID: ServiceMaintPrefix + "service",
|
||||
},
|
||||
},
|
||||
HealthMaint,
|
||||
},
|
||||
{
|
||||
"unknown",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
Status: "nope-nope-noper",
|
||||
},
|
||||
},
|
||||
"",
|
||||
},
|
||||
{
|
||||
"maintenance_over_critical",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
CheckID: NodeMaint,
|
||||
},
|
||||
&HealthCheck{
|
||||
Status: HealthCritical,
|
||||
},
|
||||
},
|
||||
HealthMaint,
|
||||
},
|
||||
{
|
||||
"critical_over_warning",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
Status: HealthCritical,
|
||||
},
|
||||
&HealthCheck{
|
||||
Status: HealthWarning,
|
||||
},
|
||||
},
|
||||
HealthCritical,
|
||||
},
|
||||
{
|
||||
"warning_over_passing",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
Status: HealthWarning,
|
||||
},
|
||||
&HealthCheck{
|
||||
Status: HealthPassing,
|
||||
},
|
||||
},
|
||||
HealthWarning,
|
||||
},
|
||||
{
|
||||
"lots",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
Status: HealthPassing,
|
||||
},
|
||||
&HealthCheck{
|
||||
Status: HealthPassing,
|
||||
},
|
||||
&HealthCheck{
|
||||
Status: HealthPassing,
|
||||
},
|
||||
&HealthCheck{
|
||||
Status: HealthWarning,
|
||||
},
|
||||
},
|
||||
HealthWarning,
|
||||
},
|
||||
}
|
||||
|
||||
for i, tc := range cases {
|
||||
t.Run(fmt.Sprintf("%d_%s", i, tc.name), func(t *testing.T) {
|
||||
act := tc.checks.AggregatedStatus()
|
||||
if tc.exp != act {
|
||||
t.Errorf("\nexp: %#v\nact: %#v", tc.exp, act)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestHealth_Checks(t *testing.T) {
|
||||
t.Parallel()
|
||||
c, s := makeClient(t)
|
||||
|
|
|
@ -34,10 +34,6 @@ const (
|
|||
checksDir = "checks"
|
||||
checkStateDir = "checks/state"
|
||||
|
||||
// The ID of the faux health checks for maintenance mode
|
||||
serviceMaintCheckPrefix = "_service_maintenance"
|
||||
nodeMaintCheckID = "_node_maintenance"
|
||||
|
||||
// Default reasons for node/service maintenance mode
|
||||
defaultNodeMaintReason = "Maintenance mode is enabled for this node, " +
|
||||
"but no reason was provided. This is a default message."
|
||||
|
@ -1532,7 +1528,7 @@ func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) {
|
|||
|
||||
// serviceMaintCheckID returns the ID of a given service's maintenance check
|
||||
func serviceMaintCheckID(serviceID string) types.CheckID {
|
||||
return types.CheckID(fmt.Sprintf("%s:%s", serviceMaintCheckPrefix, serviceID))
|
||||
return types.CheckID(structs.ServiceMaintPrefix + serviceID)
|
||||
}
|
||||
|
||||
// EnableServiceMaintenance will register a false health check against the given
|
||||
|
@ -1593,7 +1589,7 @@ func (a *Agent) DisableServiceMaintenance(serviceID string) error {
|
|||
// EnableNodeMaintenance places a node into maintenance mode.
|
||||
func (a *Agent) EnableNodeMaintenance(reason, token string) {
|
||||
// Ensure node maintenance is not already enabled
|
||||
if _, ok := a.state.Checks()[nodeMaintCheckID]; ok {
|
||||
if _, ok := a.state.Checks()[structs.NodeMaint]; ok {
|
||||
return
|
||||
}
|
||||
|
||||
|
@ -1605,7 +1601,7 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {
|
|||
// Create and register the node maintenance check
|
||||
check := &structs.HealthCheck{
|
||||
Node: a.config.NodeName,
|
||||
CheckID: nodeMaintCheckID,
|
||||
CheckID: structs.NodeMaint,
|
||||
Name: "Node Maintenance Mode",
|
||||
Notes: reason,
|
||||
Status: structs.HealthCritical,
|
||||
|
@ -1616,10 +1612,10 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {
|
|||
|
||||
// DisableNodeMaintenance removes a node from maintenance mode
|
||||
func (a *Agent) DisableNodeMaintenance() {
|
||||
if _, ok := a.state.Checks()[nodeMaintCheckID]; !ok {
|
||||
if _, ok := a.state.Checks()[structs.NodeMaint]; !ok {
|
||||
return
|
||||
}
|
||||
a.RemoveCheck(nodeMaintCheckID, true)
|
||||
a.RemoveCheck(structs.NodeMaint, true)
|
||||
a.logger.Printf("[INFO] agent: Node left maintenance mode")
|
||||
}
|
||||
|
||||
|
|
|
@ -926,13 +926,13 @@ func TestHTTPAgent_EnableNodeMaintenance(t *testing.T) {
|
|||
}
|
||||
|
||||
// Ensure the maintenance check was registered
|
||||
check, ok := srv.agent.state.Checks()[nodeMaintCheckID]
|
||||
check, ok := srv.agent.state.Checks()[structs.NodeMaint]
|
||||
if !ok {
|
||||
t.Fatalf("should have registered maintenance check")
|
||||
}
|
||||
|
||||
// Check that the token was used
|
||||
if token := srv.agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" {
|
||||
if token := srv.agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
|
||||
t.Fatalf("expected 'mytoken', got '%s'", token)
|
||||
}
|
||||
|
||||
|
@ -962,7 +962,7 @@ func TestHTTPAgent_DisableNodeMaintenance(t *testing.T) {
|
|||
}
|
||||
|
||||
// Ensure the maintenance check was removed
|
||||
if _, ok := srv.agent.state.Checks()[nodeMaintCheckID]; ok {
|
||||
if _, ok := srv.agent.state.Checks()[structs.NodeMaint]; ok {
|
||||
t.Fatalf("should have removed maintenance check")
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1577,13 +1577,13 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
|
|||
agent.EnableNodeMaintenance("broken", "mytoken")
|
||||
|
||||
// Make sure the critical health check was added
|
||||
check, ok := agent.state.Checks()[nodeMaintCheckID]
|
||||
check, ok := agent.state.Checks()[structs.NodeMaint]
|
||||
if !ok {
|
||||
t.Fatalf("should have registered critical node check")
|
||||
}
|
||||
|
||||
// Check that the token was used to register the check
|
||||
if token := agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" {
|
||||
if token := agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
|
||||
t.Fatalf("expected 'mytoken', got: '%s'", token)
|
||||
}
|
||||
|
||||
|
@ -1596,7 +1596,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
|
|||
agent.DisableNodeMaintenance()
|
||||
|
||||
// Ensure the check was deregistered
|
||||
if _, ok := agent.state.Checks()[nodeMaintCheckID]; ok {
|
||||
if _, ok := agent.state.Checks()[structs.NodeMaint]; ok {
|
||||
t.Fatalf("should have deregistered critical node check")
|
||||
}
|
||||
|
||||
|
@ -1604,7 +1604,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
|
|||
agent.EnableNodeMaintenance("", "")
|
||||
|
||||
// Make sure the check was registered with the default note
|
||||
check, ok = agent.state.Checks()[nodeMaintCheckID]
|
||||
check, ok = agent.state.Checks()[structs.NodeMaint]
|
||||
if !ok {
|
||||
t.Fatalf("should have registered critical node check")
|
||||
}
|
||||
|
|
|
@ -56,6 +56,15 @@ const (
|
|||
HealthPassing = "passing"
|
||||
HealthWarning = "warning"
|
||||
HealthCritical = "critical"
|
||||
HealthMaint = "maintenance"
|
||||
)
|
||||
|
||||
const (
|
||||
// NodeMaint is the special key set by a node in maintenance mode.
|
||||
NodeMaint = "_node_maintenance"
|
||||
|
||||
// ServiceMaintPrefix is the prefix for a service in maintenance mode.
|
||||
ServiceMaintPrefix = "_service_maintenance:"
|
||||
)
|
||||
|
||||
func ValidStatus(s string) bool {
|
||||
|
@ -412,6 +421,7 @@ func (c *HealthCheck) Clone() *HealthCheck {
|
|||
return clone
|
||||
}
|
||||
|
||||
// HealthChecks is a collection of HealthCheck structs.
|
||||
type HealthChecks []*HealthCheck
|
||||
|
||||
// CheckServiceNode is used to provide the node, its service
|
||||
|
@ -460,7 +470,7 @@ type NodeInfo struct {
|
|||
Address string
|
||||
TaggedAddresses map[string]string
|
||||
Services []*NodeService
|
||||
Checks []*HealthCheck
|
||||
Checks HealthChecks
|
||||
}
|
||||
|
||||
// NodeDump is used to dump all the nodes with all their
|
||||
|
|
Loading…
Reference in New Issue