Allow ignoring checks by ID when defining a PreparedQuery. Fixes #3727.

This commit is contained in:
Paul Banks 2018-04-10 13:28:27 +01:00
parent d5b8b75614
commit 2ed0d2afcd
No known key found for this signature in database
GPG Key ID: C25A851A849B8221
10 changed files with 182 additions and 8 deletions

View File

@ -6,6 +6,7 @@ import (
"testing"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/types"
"github.com/mitchellh/copystructure"
)
@ -32,6 +33,15 @@ var (
"${agent.segment}",
},
},
IgnoreCheckIDs: []types.CheckID{
"${name.full}",
"${name.prefix}",
"${name.suffix}",
"${match(0)}",
"${match(1)}",
"${match(2)}",
"${agent.segment}",
},
Tags: []string{
"${name.full}",
"${name.prefix}",
@ -124,6 +134,7 @@ func TestTemplate_Compile(t *testing.T) {
query.Template.Type = structs.QueryTemplateTypeNamePrefixMatch
query.Template.Regexp = "^(hello)there$"
query.Service.Service = "${name.full}"
query.Service.IgnoreCheckIDs = []types.CheckID{"${match(1)}", "${agent.segment}"}
query.Service.Tags = []string{"${match(1)}", "${agent.segment}"}
backup, err := copystructure.Copy(query)
if err != nil {
@ -151,6 +162,10 @@ func TestTemplate_Compile(t *testing.T) {
},
Service: structs.ServiceQuery{
Service: "hellothere",
IgnoreCheckIDs: []types.CheckID{
"hello",
"segment-foo",
},
Tags: []string{
"hello",
"segment-foo",

View File

@ -496,7 +496,8 @@ func (p *PreparedQuery) execute(query *structs.PreparedQuery,
}
// Filter out any unhealthy nodes.
nodes = nodes.Filter(query.Service.OnlyPassing)
nodes = nodes.FilterIgnore(query.Service.OnlyPassing,
query.Service.IgnoreCheckIDs)
// Apply the node metadata filters, if any.
if len(query.Service.NodeMeta) > 0 {

View File

@ -17,6 +17,7 @@ import (
"github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/testrpc"
"github.com/hashicorp/consul/testutil/retry"
"github.com/hashicorp/consul/types"
"github.com/hashicorp/net-rpc-msgpackrpc"
"github.com/hashicorp/serf/coordinate"
)
@ -2076,6 +2077,41 @@ func TestPreparedQuery_Execute(t *testing.T) {
}
}
// Make the query ignore all our health checks (which have "failing" ID
// implicitly from their name).
query.Query.Service.IgnoreCheckIDs = []types.CheckID{"failing"}
if err := msgpackrpc.CallWithCodec(codec1, "PreparedQuery.Apply", &query, &query.Query.ID); err != nil {
t.Fatalf("err: %v", err)
}
// We should end up with 10 nodes again
{
req := structs.PreparedQueryExecuteRequest{
Datacenter: "dc1",
QueryIDOrName: query.Query.ID,
QueryOptions: structs.QueryOptions{Token: execToken},
}
var reply structs.PreparedQueryExecuteResponse
if err := msgpackrpc.CallWithCodec(codec1, "PreparedQuery.Execute", &req, &reply); err != nil {
t.Fatalf("err: %v", err)
}
if len(reply.Nodes) != 10 ||
reply.Datacenter != "dc1" ||
reply.Service != query.Query.Service.Service ||
!reflect.DeepEqual(reply.DNS, query.Query.DNS) ||
!reply.QueryMeta.KnownLeader {
t.Fatalf("bad: %v", reply)
}
}
// Undo that so all the following tests aren't broken!
query.Query.Service.IgnoreCheckIDs = nil
if err := msgpackrpc.CallWithCodec(codec1, "PreparedQuery.Apply", &query, &query.Query.ID); err != nil {
t.Fatalf("err: %v", err)
}
// Make the query more picky by adding a tag filter. This just proves we
// call into the tag filter, it is tested more thoroughly in a separate
// test.

View File

@ -10,6 +10,7 @@ import (
"testing"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/types"
)
// MockPreparedQuery is a fake endpoint that we inject into the Consul server
@ -87,9 +88,10 @@ func TestPreparedQuery_Create(t *testing.T) {
NearestN: 4,
Datacenters: []string{"dc1", "dc2"},
},
OnlyPassing: true,
Tags: []string{"foo", "bar"},
NodeMeta: map[string]string{"somekey": "somevalue"},
IgnoreCheckIDs: []types.CheckID{"broken_check"},
OnlyPassing: true,
Tags: []string{"foo", "bar"},
NodeMeta: map[string]string{"somekey": "somevalue"},
},
DNS: structs.QueryDNSOptions{
TTL: "10s",
@ -122,9 +124,10 @@ func TestPreparedQuery_Create(t *testing.T) {
"NearestN": 4,
"Datacenters": []string{"dc1", "dc2"},
},
"OnlyPassing": true,
"Tags": []string{"foo", "bar"},
"NodeMeta": map[string]string{"somekey": "somevalue"},
"IgnoreCheckIDs": []string{"broken_check"},
"OnlyPassing": true,
"Tags": []string{"foo", "bar"},
"NodeMeta": map[string]string{"somekey": "somevalue"},
},
"DNS": map[string]interface{}{
"TTL": "10s",

View File

@ -1,5 +1,7 @@
package structs
import "github.com/hashicorp/consul/types"
// QueryDatacenterOptions sets options about how we fail over if there are no
// healthy nodes in the local datacenter.
type QueryDatacenterOptions struct {
@ -34,6 +36,12 @@ type ServiceQuery struct {
// discarded)
OnlyPassing bool
// IgnoreCheckIDs is an optional list of health check IDs to ignore when
// considering which nodes are healthy. It is useful as an emergency measure
// to temporarily override some health check that is producing false negatives
// for example.
IgnoreCheckIDs []types.CheckID
// Near allows the query to always prefer the node nearest the given
// node. If the node does not exist, results are returned in their
// normal randomly-shuffled order. Supplying the magic "_agent" value

View File

@ -580,16 +580,33 @@ func (nodes CheckServiceNodes) Shuffle() {
// check if that option is selected). Note that this returns the filtered
// results AND modifies the receiver for performance.
func (nodes CheckServiceNodes) Filter(onlyPassing bool) CheckServiceNodes {
return nodes.FilterIgnore(onlyPassing, nil)
}
// FilterIgnore removes nodes that are failing health checks just like Filter.
// It also ignores the status of any check with an ID present in ignoreCheckIDs
// as if that check didn't exist. Note that this returns the filtered results
// AND modifies the receiver for performance.
func (nodes CheckServiceNodes) FilterIgnore(onlyPassing bool,
ignoreCheckIDs []types.CheckID) CheckServiceNodes {
n := len(nodes)
OUTER:
for i := 0; i < n; i++ {
node := nodes[i]
INNER:
for _, check := range node.Checks {
for _, ignore := range ignoreCheckIDs {
if check.CheckID == ignore {
// Skip this _check_ but keep looking at other checks for this node.
continue INNER
}
}
if check.Status == api.HealthCritical ||
(onlyPassing && check.Status != api.HealthPassing) {
nodes[i], nodes[n-1] = nodes[n-1], CheckServiceNode{}
n--
i--
// Skip this _node_ now we've swapped it off the end of the list.
continue OUTER
}
}

View File

@ -441,6 +441,20 @@ func TestStructs_CheckServiceNodes_Filter(t *testing.T) {
},
},
},
CheckServiceNode{
Node: &Node{
Node: "node4",
Address: "127.0.0.4",
},
Checks: HealthChecks{
// This check has a different ID to the others to ensure it is not
// ignored by accident
&HealthCheck{
CheckID: "failing2",
Status: api.HealthCritical,
},
},
},
}
// Test the case where warnings are allowed.
@ -473,6 +487,26 @@ func TestStructs_CheckServiceNodes_Filter(t *testing.T) {
t.Fatalf("bad: %v", filtered)
}
}
// Allow failing checks to be ignored (note that the test checks have empty
// CheckID which is valid).
{
twiddle := make(CheckServiceNodes, len(nodes))
if n := copy(twiddle, nodes); n != len(nodes) {
t.Fatalf("bad: %d", n)
}
filtered := twiddle.FilterIgnore(true, []types.CheckID{""})
expected := CheckServiceNodes{
nodes[0],
nodes[1],
nodes[2], // Node 3's critical check should be ignored.
// Node 4 should still be failing since it's got a critical check with a
// non-ignored ID.
}
if !reflect.DeepEqual(filtered, expected) {
t.Fatalf("bad: %v", filtered)
}
}
}
func TestStructs_DirEntry_Clone(t *testing.T) {

View File

@ -34,6 +34,12 @@ type ServiceQuery struct {
// local datacenter.
Failover QueryDatacenterOptions
// IgnoreCheckIDs is an optional list of health check IDs to ignore when
// considering which nodes are healthy. It is useful as an emergency measure
// to temporarily override some health check that is producing false negatives
// for example.
IgnoreCheckIDs []string
// If OnlyPassing is true then we will only include nodes with passing
// health checks (critical AND warning checks will cause a node to be
// discarded)

View File

@ -116,6 +116,53 @@ func TestAPI_PreparedQuery(t *testing.T) {
t.Fatalf("bad datacenter: %v", results)
}
// Add new node with failing health check.
reg2 := reg
reg2.Node = "failingnode"
reg2.Check = &AgentCheck{
Node: "failingnode",
ServiceID: "redis1",
ServiceName: "redis",
Name: "failingcheck",
Status: "critical",
}
retry.Run(t, func(r *retry.R) {
if _, err := catalog.Register(reg2, nil); err != nil {
r.Fatal(err)
}
if _, _, err := catalog.Node("failingnode", nil); err != nil {
r.Fatal(err)
}
})
// Execute by ID. Should return only healthy node.
results, _, err = query.Execute(def.ID, nil)
if err != nil {
t.Fatalf("err: %s", err)
}
if len(results.Nodes) != 1 || results.Nodes[0].Node.Node != "foobar" {
t.Fatalf("bad: %v", results)
}
if wan, ok := results.Nodes[0].Node.TaggedAddresses["wan"]; !ok || wan != "127.0.0.1" {
t.Fatalf("bad: %v", results)
}
// Update PQ with ignore rule for the failing check
def.Service.IgnoreCheckIDs = []string{"failingcheck"}
_, err = query.Update(def, nil)
if err != nil {
t.Fatalf("err: %s", err)
}
// Execute by ID. Should return BOTH nodes ignoring the failing check.
results, _, err = query.Execute(def.ID, nil)
if err != nil {
t.Fatalf("err: %s", err)
}
if len(results.Nodes) != 2 {
t.Fatalf("got %d nodes, want 2", len(results.Nodes))
}
// Delete it.
_, err = query.Delete(def.ID, nil)
if err != nil {

View File

@ -25,7 +25,7 @@ section for more details about how prepared queries work with Consul's ACL syste
### Prepared Query Templates
Consul 0.6.4 and later support prepared query templates. These are created
similar to static templates, except with some additional fields and features.
similar to static queries, except with some additional fields and features.
Here is an example prepared query template:
```json
@ -206,6 +206,13 @@ The table below shows this endpoint's support for
failover, even if it is selected by both `NearestN` and is listed in
`Datacenters`.
- `IgnoreCheckIDs` `(array<string>: nil)` - Specifies a list of check IDs that
should be ignored when filtering unhealthy instances. This is mostly useful
in an emergency or as a temporary measure when a health check is found to be
unreliable. Being able to ignore it in centrally-defined queries can be
simpler than de-registering the check as an interim solution until the check
can be fixed.
- `OnlyPassing` `(bool: false)` - Specifies the behavior of the query's health
check filtering. If this is set to false, the results will include nodes
with checks in the passing as well as the warning states. If this is set to