Add an API method for determining the best status
Given a list of HealthChecks, this determines the "best" status for the collective group. This is useful for nodes and services, which may have multiple checks associated with them.
This commit is contained in:
parent
89981b147b
commit
1c55429a38
|
@ -2,6 +2,8 @@ package api
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
@ -11,6 +13,15 @@ const (
|
||||||
HealthPassing = "passing"
|
HealthPassing = "passing"
|
||||||
HealthWarning = "warning"
|
HealthWarning = "warning"
|
||||||
HealthCritical = "critical"
|
HealthCritical = "critical"
|
||||||
|
HealthMaint = "maintenance"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
// NodeMaint is the special key set by a node in maintenance mode.
|
||||||
|
NodeMaint = "_node_maintenance"
|
||||||
|
|
||||||
|
// ServiceMaintPrefix is the prefix for a service in maintenance mode.
|
||||||
|
ServiceMaintPrefix = "_service_maintenance:"
|
||||||
)
|
)
|
||||||
|
|
||||||
// HealthCheck is used to represent a single check
|
// HealthCheck is used to represent a single check
|
||||||
|
@ -25,6 +36,52 @@ type HealthCheck struct {
|
||||||
ServiceName string
|
ServiceName string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// HealthChecks is a collection of HealthCheck structs.
|
||||||
|
type HealthChecks []*HealthCheck
|
||||||
|
|
||||||
|
// AggregatedStatus returns the "best" status for the list of health checks.
|
||||||
|
// Because a given entry may have many service and node-level health checks
|
||||||
|
// attached, this function determines the best representative of the status as
|
||||||
|
// as single string using the following heuristic:
|
||||||
|
//
|
||||||
|
// maintenance > critical > warning > passing
|
||||||
|
//
|
||||||
|
func (c HealthChecks) AggregatedStatus() string {
|
||||||
|
var passing, warning, critical, maintenance bool
|
||||||
|
for _, check := range c {
|
||||||
|
id := string(check.CheckID)
|
||||||
|
if id == NodeMaint || strings.HasPrefix(id, ServiceMaintPrefix) {
|
||||||
|
maintenance = true
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
switch check.Status {
|
||||||
|
case HealthPassing:
|
||||||
|
passing = true
|
||||||
|
case HealthWarning:
|
||||||
|
warning = true
|
||||||
|
case HealthCritical:
|
||||||
|
critical = true
|
||||||
|
default:
|
||||||
|
log.Printf("[WARN] unknown status %q", check.Status)
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case maintenance:
|
||||||
|
return HealthMaint
|
||||||
|
case critical:
|
||||||
|
return HealthCritical
|
||||||
|
case warning:
|
||||||
|
return HealthWarning
|
||||||
|
case passing:
|
||||||
|
return HealthPassing
|
||||||
|
default:
|
||||||
|
return HealthPassing
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ServiceEntry is used for the health service endpoint
|
// ServiceEntry is used for the health service endpoint
|
||||||
type ServiceEntry struct {
|
type ServiceEntry struct {
|
||||||
Node *Node
|
Node *Node
|
||||||
|
|
|
@ -38,6 +38,139 @@ func TestHealth_Node(t *testing.T) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestHealthChecks_AggregatedStatus(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
checks HealthChecks
|
||||||
|
exp string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
"empty",
|
||||||
|
nil,
|
||||||
|
HealthPassing,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"passing",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthPassing,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
HealthPassing,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"warning",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthWarning,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
HealthWarning,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"critical",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthCritical,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
HealthCritical,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"node_maintenance",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
CheckID: NodeMaint,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
HealthMaint,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"service_maintenance",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
CheckID: ServiceMaintPrefix + "service",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
HealthMaint,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"unknown",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
Status: "nope-nope-noper",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"maintenance_over_critical",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
CheckID: NodeMaint,
|
||||||
|
},
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthCritical,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
HealthMaint,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"critical_over_warning",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthCritical,
|
||||||
|
},
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthWarning,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
HealthCritical,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"warning_over_passing",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthWarning,
|
||||||
|
},
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthPassing,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
HealthWarning,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"lots",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthPassing,
|
||||||
|
},
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthPassing,
|
||||||
|
},
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthPassing,
|
||||||
|
},
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthWarning,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
HealthWarning,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, tc := range cases {
|
||||||
|
t.Run(fmt.Sprintf("%d_%s", i, tc.name), func(t *testing.T) {
|
||||||
|
act := tc.checks.AggregatedStatus()
|
||||||
|
if tc.exp != act {
|
||||||
|
t.Errorf("\nexp: %#v\nact: %#v", tc.exp, act)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestHealth_Checks(t *testing.T) {
|
func TestHealth_Checks(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
c, s := makeClient(t)
|
c, s := makeClient(t)
|
||||||
|
|
|
@ -34,10 +34,6 @@ const (
|
||||||
checksDir = "checks"
|
checksDir = "checks"
|
||||||
checkStateDir = "checks/state"
|
checkStateDir = "checks/state"
|
||||||
|
|
||||||
// The ID of the faux health checks for maintenance mode
|
|
||||||
serviceMaintCheckPrefix = "_service_maintenance"
|
|
||||||
nodeMaintCheckID = "_node_maintenance"
|
|
||||||
|
|
||||||
// Default reasons for node/service maintenance mode
|
// Default reasons for node/service maintenance mode
|
||||||
defaultNodeMaintReason = "Maintenance mode is enabled for this node, " +
|
defaultNodeMaintReason = "Maintenance mode is enabled for this node, " +
|
||||||
"but no reason was provided. This is a default message."
|
"but no reason was provided. This is a default message."
|
||||||
|
@ -1532,7 +1528,7 @@ func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) {
|
||||||
|
|
||||||
// serviceMaintCheckID returns the ID of a given service's maintenance check
|
// serviceMaintCheckID returns the ID of a given service's maintenance check
|
||||||
func serviceMaintCheckID(serviceID string) types.CheckID {
|
func serviceMaintCheckID(serviceID string) types.CheckID {
|
||||||
return types.CheckID(fmt.Sprintf("%s:%s", serviceMaintCheckPrefix, serviceID))
|
return types.CheckID(structs.ServiceMaintPrefix + serviceID)
|
||||||
}
|
}
|
||||||
|
|
||||||
// EnableServiceMaintenance will register a false health check against the given
|
// EnableServiceMaintenance will register a false health check against the given
|
||||||
|
@ -1593,7 +1589,7 @@ func (a *Agent) DisableServiceMaintenance(serviceID string) error {
|
||||||
// EnableNodeMaintenance places a node into maintenance mode.
|
// EnableNodeMaintenance places a node into maintenance mode.
|
||||||
func (a *Agent) EnableNodeMaintenance(reason, token string) {
|
func (a *Agent) EnableNodeMaintenance(reason, token string) {
|
||||||
// Ensure node maintenance is not already enabled
|
// Ensure node maintenance is not already enabled
|
||||||
if _, ok := a.state.Checks()[nodeMaintCheckID]; ok {
|
if _, ok := a.state.Checks()[structs.NodeMaint]; ok {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1605,7 +1601,7 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {
|
||||||
// Create and register the node maintenance check
|
// Create and register the node maintenance check
|
||||||
check := &structs.HealthCheck{
|
check := &structs.HealthCheck{
|
||||||
Node: a.config.NodeName,
|
Node: a.config.NodeName,
|
||||||
CheckID: nodeMaintCheckID,
|
CheckID: structs.NodeMaint,
|
||||||
Name: "Node Maintenance Mode",
|
Name: "Node Maintenance Mode",
|
||||||
Notes: reason,
|
Notes: reason,
|
||||||
Status: structs.HealthCritical,
|
Status: structs.HealthCritical,
|
||||||
|
@ -1616,10 +1612,10 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {
|
||||||
|
|
||||||
// DisableNodeMaintenance removes a node from maintenance mode
|
// DisableNodeMaintenance removes a node from maintenance mode
|
||||||
func (a *Agent) DisableNodeMaintenance() {
|
func (a *Agent) DisableNodeMaintenance() {
|
||||||
if _, ok := a.state.Checks()[nodeMaintCheckID]; !ok {
|
if _, ok := a.state.Checks()[structs.NodeMaint]; !ok {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
a.RemoveCheck(nodeMaintCheckID, true)
|
a.RemoveCheck(structs.NodeMaint, true)
|
||||||
a.logger.Printf("[INFO] agent: Node left maintenance mode")
|
a.logger.Printf("[INFO] agent: Node left maintenance mode")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -926,13 +926,13 @@ func TestHTTPAgent_EnableNodeMaintenance(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ensure the maintenance check was registered
|
// Ensure the maintenance check was registered
|
||||||
check, ok := srv.agent.state.Checks()[nodeMaintCheckID]
|
check, ok := srv.agent.state.Checks()[structs.NodeMaint]
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatalf("should have registered maintenance check")
|
t.Fatalf("should have registered maintenance check")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check that the token was used
|
// Check that the token was used
|
||||||
if token := srv.agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" {
|
if token := srv.agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
|
||||||
t.Fatalf("expected 'mytoken', got '%s'", token)
|
t.Fatalf("expected 'mytoken', got '%s'", token)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -962,7 +962,7 @@ func TestHTTPAgent_DisableNodeMaintenance(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ensure the maintenance check was removed
|
// Ensure the maintenance check was removed
|
||||||
if _, ok := srv.agent.state.Checks()[nodeMaintCheckID]; ok {
|
if _, ok := srv.agent.state.Checks()[structs.NodeMaint]; ok {
|
||||||
t.Fatalf("should have removed maintenance check")
|
t.Fatalf("should have removed maintenance check")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1577,13 +1577,13 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
|
||||||
agent.EnableNodeMaintenance("broken", "mytoken")
|
agent.EnableNodeMaintenance("broken", "mytoken")
|
||||||
|
|
||||||
// Make sure the critical health check was added
|
// Make sure the critical health check was added
|
||||||
check, ok := agent.state.Checks()[nodeMaintCheckID]
|
check, ok := agent.state.Checks()[structs.NodeMaint]
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatalf("should have registered critical node check")
|
t.Fatalf("should have registered critical node check")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check that the token was used to register the check
|
// Check that the token was used to register the check
|
||||||
if token := agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" {
|
if token := agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
|
||||||
t.Fatalf("expected 'mytoken', got: '%s'", token)
|
t.Fatalf("expected 'mytoken', got: '%s'", token)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1596,7 +1596,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
|
||||||
agent.DisableNodeMaintenance()
|
agent.DisableNodeMaintenance()
|
||||||
|
|
||||||
// Ensure the check was deregistered
|
// Ensure the check was deregistered
|
||||||
if _, ok := agent.state.Checks()[nodeMaintCheckID]; ok {
|
if _, ok := agent.state.Checks()[structs.NodeMaint]; ok {
|
||||||
t.Fatalf("should have deregistered critical node check")
|
t.Fatalf("should have deregistered critical node check")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1604,7 +1604,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
|
||||||
agent.EnableNodeMaintenance("", "")
|
agent.EnableNodeMaintenance("", "")
|
||||||
|
|
||||||
// Make sure the check was registered with the default note
|
// Make sure the check was registered with the default note
|
||||||
check, ok = agent.state.Checks()[nodeMaintCheckID]
|
check, ok = agent.state.Checks()[structs.NodeMaint]
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatalf("should have registered critical node check")
|
t.Fatalf("should have registered critical node check")
|
||||||
}
|
}
|
||||||
|
|
|
@ -56,6 +56,15 @@ const (
|
||||||
HealthPassing = "passing"
|
HealthPassing = "passing"
|
||||||
HealthWarning = "warning"
|
HealthWarning = "warning"
|
||||||
HealthCritical = "critical"
|
HealthCritical = "critical"
|
||||||
|
HealthMaint = "maintenance"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
// NodeMaint is the special key set by a node in maintenance mode.
|
||||||
|
NodeMaint = "_node_maintenance"
|
||||||
|
|
||||||
|
// ServiceMaintPrefix is the prefix for a service in maintenance mode.
|
||||||
|
ServiceMaintPrefix = "_service_maintenance:"
|
||||||
)
|
)
|
||||||
|
|
||||||
func ValidStatus(s string) bool {
|
func ValidStatus(s string) bool {
|
||||||
|
@ -412,6 +421,7 @@ func (c *HealthCheck) Clone() *HealthCheck {
|
||||||
return clone
|
return clone
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// HealthChecks is a collection of HealthCheck structs.
|
||||||
type HealthChecks []*HealthCheck
|
type HealthChecks []*HealthCheck
|
||||||
|
|
||||||
// CheckServiceNode is used to provide the node, its service
|
// CheckServiceNode is used to provide the node, its service
|
||||||
|
@ -460,7 +470,7 @@ type NodeInfo struct {
|
||||||
Address string
|
Address string
|
||||||
TaggedAddresses map[string]string
|
TaggedAddresses map[string]string
|
||||||
Services []*NodeService
|
Services []*NodeService
|
||||||
Checks []*HealthCheck
|
Checks HealthChecks
|
||||||
}
|
}
|
||||||
|
|
||||||
// NodeDump is used to dump all the nodes with all their
|
// NodeDump is used to dump all the nodes with all their
|
||||||
|
|
Loading…
Reference in New Issue