Merge pull request #4022 from hashicorp/pq-filter-checks
Allow ignoring checks by ID when defining a PreparedQuery. Fixes #3727.
This commit is contained in:
commit
48f2a74c9e
|
@ -6,6 +6,7 @@ import (
|
|||
"testing"
|
||||
|
||||
"github.com/hashicorp/consul/agent/structs"
|
||||
"github.com/hashicorp/consul/types"
|
||||
"github.com/mitchellh/copystructure"
|
||||
)
|
||||
|
||||
|
@ -32,6 +33,15 @@ var (
|
|||
"${agent.segment}",
|
||||
},
|
||||
},
|
||||
IgnoreCheckIDs: []types.CheckID{
|
||||
"${name.full}",
|
||||
"${name.prefix}",
|
||||
"${name.suffix}",
|
||||
"${match(0)}",
|
||||
"${match(1)}",
|
||||
"${match(2)}",
|
||||
"${agent.segment}",
|
||||
},
|
||||
Tags: []string{
|
||||
"${name.full}",
|
||||
"${name.prefix}",
|
||||
|
@ -124,6 +134,7 @@ func TestTemplate_Compile(t *testing.T) {
|
|||
query.Template.Type = structs.QueryTemplateTypeNamePrefixMatch
|
||||
query.Template.Regexp = "^(hello)there$"
|
||||
query.Service.Service = "${name.full}"
|
||||
query.Service.IgnoreCheckIDs = []types.CheckID{"${match(1)}", "${agent.segment}"}
|
||||
query.Service.Tags = []string{"${match(1)}", "${agent.segment}"}
|
||||
backup, err := copystructure.Copy(query)
|
||||
if err != nil {
|
||||
|
@ -151,6 +162,10 @@ func TestTemplate_Compile(t *testing.T) {
|
|||
},
|
||||
Service: structs.ServiceQuery{
|
||||
Service: "hellothere",
|
||||
IgnoreCheckIDs: []types.CheckID{
|
||||
"hello",
|
||||
"segment-foo",
|
||||
},
|
||||
Tags: []string{
|
||||
"hello",
|
||||
"segment-foo",
|
||||
|
|
|
@ -496,7 +496,8 @@ func (p *PreparedQuery) execute(query *structs.PreparedQuery,
|
|||
}
|
||||
|
||||
// Filter out any unhealthy nodes.
|
||||
nodes = nodes.Filter(query.Service.OnlyPassing)
|
||||
nodes = nodes.FilterIgnore(query.Service.OnlyPassing,
|
||||
query.Service.IgnoreCheckIDs)
|
||||
|
||||
// Apply the node metadata filters, if any.
|
||||
if len(query.Service.NodeMeta) > 0 {
|
||||
|
|
|
@ -17,6 +17,7 @@ import (
|
|||
"github.com/hashicorp/consul/api"
|
||||
"github.com/hashicorp/consul/testrpc"
|
||||
"github.com/hashicorp/consul/testutil/retry"
|
||||
"github.com/hashicorp/consul/types"
|
||||
"github.com/hashicorp/net-rpc-msgpackrpc"
|
||||
"github.com/hashicorp/serf/coordinate"
|
||||
)
|
||||
|
@ -2076,6 +2077,41 @@ func TestPreparedQuery_Execute(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
// Make the query ignore all our health checks (which have "failing" ID
|
||||
// implicitly from their name).
|
||||
query.Query.Service.IgnoreCheckIDs = []types.CheckID{"failing"}
|
||||
if err := msgpackrpc.CallWithCodec(codec1, "PreparedQuery.Apply", &query, &query.Query.ID); err != nil {
|
||||
t.Fatalf("err: %v", err)
|
||||
}
|
||||
|
||||
// We should end up with 10 nodes again
|
||||
{
|
||||
req := structs.PreparedQueryExecuteRequest{
|
||||
Datacenter: "dc1",
|
||||
QueryIDOrName: query.Query.ID,
|
||||
QueryOptions: structs.QueryOptions{Token: execToken},
|
||||
}
|
||||
|
||||
var reply structs.PreparedQueryExecuteResponse
|
||||
if err := msgpackrpc.CallWithCodec(codec1, "PreparedQuery.Execute", &req, &reply); err != nil {
|
||||
t.Fatalf("err: %v", err)
|
||||
}
|
||||
|
||||
if len(reply.Nodes) != 10 ||
|
||||
reply.Datacenter != "dc1" ||
|
||||
reply.Service != query.Query.Service.Service ||
|
||||
!reflect.DeepEqual(reply.DNS, query.Query.DNS) ||
|
||||
!reply.QueryMeta.KnownLeader {
|
||||
t.Fatalf("bad: %v", reply)
|
||||
}
|
||||
}
|
||||
|
||||
// Undo that so all the following tests aren't broken!
|
||||
query.Query.Service.IgnoreCheckIDs = nil
|
||||
if err := msgpackrpc.CallWithCodec(codec1, "PreparedQuery.Apply", &query, &query.Query.ID); err != nil {
|
||||
t.Fatalf("err: %v", err)
|
||||
}
|
||||
|
||||
// Make the query more picky by adding a tag filter. This just proves we
|
||||
// call into the tag filter, it is tested more thoroughly in a separate
|
||||
// test.
|
||||
|
|
|
@ -10,6 +10,7 @@ import (
|
|||
"testing"
|
||||
|
||||
"github.com/hashicorp/consul/agent/structs"
|
||||
"github.com/hashicorp/consul/types"
|
||||
)
|
||||
|
||||
// MockPreparedQuery is a fake endpoint that we inject into the Consul server
|
||||
|
@ -87,9 +88,10 @@ func TestPreparedQuery_Create(t *testing.T) {
|
|||
NearestN: 4,
|
||||
Datacenters: []string{"dc1", "dc2"},
|
||||
},
|
||||
OnlyPassing: true,
|
||||
Tags: []string{"foo", "bar"},
|
||||
NodeMeta: map[string]string{"somekey": "somevalue"},
|
||||
IgnoreCheckIDs: []types.CheckID{"broken_check"},
|
||||
OnlyPassing: true,
|
||||
Tags: []string{"foo", "bar"},
|
||||
NodeMeta: map[string]string{"somekey": "somevalue"},
|
||||
},
|
||||
DNS: structs.QueryDNSOptions{
|
||||
TTL: "10s",
|
||||
|
@ -122,9 +124,10 @@ func TestPreparedQuery_Create(t *testing.T) {
|
|||
"NearestN": 4,
|
||||
"Datacenters": []string{"dc1", "dc2"},
|
||||
},
|
||||
"OnlyPassing": true,
|
||||
"Tags": []string{"foo", "bar"},
|
||||
"NodeMeta": map[string]string{"somekey": "somevalue"},
|
||||
"IgnoreCheckIDs": []string{"broken_check"},
|
||||
"OnlyPassing": true,
|
||||
"Tags": []string{"foo", "bar"},
|
||||
"NodeMeta": map[string]string{"somekey": "somevalue"},
|
||||
},
|
||||
"DNS": map[string]interface{}{
|
||||
"TTL": "10s",
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
package structs
|
||||
|
||||
import "github.com/hashicorp/consul/types"
|
||||
|
||||
// QueryDatacenterOptions sets options about how we fail over if there are no
|
||||
// healthy nodes in the local datacenter.
|
||||
type QueryDatacenterOptions struct {
|
||||
|
@ -34,6 +36,12 @@ type ServiceQuery struct {
|
|||
// discarded)
|
||||
OnlyPassing bool
|
||||
|
||||
// IgnoreCheckIDs is an optional list of health check IDs to ignore when
|
||||
// considering which nodes are healthy. It is useful as an emergency measure
|
||||
// to temporarily override some health check that is producing false negatives
|
||||
// for example.
|
||||
IgnoreCheckIDs []types.CheckID
|
||||
|
||||
// Near allows the query to always prefer the node nearest the given
|
||||
// node. If the node does not exist, results are returned in their
|
||||
// normal randomly-shuffled order. Supplying the magic "_agent" value
|
||||
|
|
|
@ -580,16 +580,33 @@ func (nodes CheckServiceNodes) Shuffle() {
|
|||
// check if that option is selected). Note that this returns the filtered
|
||||
// results AND modifies the receiver for performance.
|
||||
func (nodes CheckServiceNodes) Filter(onlyPassing bool) CheckServiceNodes {
|
||||
return nodes.FilterIgnore(onlyPassing, nil)
|
||||
}
|
||||
|
||||
// FilterIgnore removes nodes that are failing health checks just like Filter.
|
||||
// It also ignores the status of any check with an ID present in ignoreCheckIDs
|
||||
// as if that check didn't exist. Note that this returns the filtered results
|
||||
// AND modifies the receiver for performance.
|
||||
func (nodes CheckServiceNodes) FilterIgnore(onlyPassing bool,
|
||||
ignoreCheckIDs []types.CheckID) CheckServiceNodes {
|
||||
n := len(nodes)
|
||||
OUTER:
|
||||
for i := 0; i < n; i++ {
|
||||
node := nodes[i]
|
||||
INNER:
|
||||
for _, check := range node.Checks {
|
||||
for _, ignore := range ignoreCheckIDs {
|
||||
if check.CheckID == ignore {
|
||||
// Skip this _check_ but keep looking at other checks for this node.
|
||||
continue INNER
|
||||
}
|
||||
}
|
||||
if check.Status == api.HealthCritical ||
|
||||
(onlyPassing && check.Status != api.HealthPassing) {
|
||||
nodes[i], nodes[n-1] = nodes[n-1], CheckServiceNode{}
|
||||
n--
|
||||
i--
|
||||
// Skip this _node_ now we've swapped it off the end of the list.
|
||||
continue OUTER
|
||||
}
|
||||
}
|
||||
|
|
|
@ -441,6 +441,20 @@ func TestStructs_CheckServiceNodes_Filter(t *testing.T) {
|
|||
},
|
||||
},
|
||||
},
|
||||
CheckServiceNode{
|
||||
Node: &Node{
|
||||
Node: "node4",
|
||||
Address: "127.0.0.4",
|
||||
},
|
||||
Checks: HealthChecks{
|
||||
// This check has a different ID to the others to ensure it is not
|
||||
// ignored by accident
|
||||
&HealthCheck{
|
||||
CheckID: "failing2",
|
||||
Status: api.HealthCritical,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
// Test the case where warnings are allowed.
|
||||
|
@ -473,6 +487,26 @@ func TestStructs_CheckServiceNodes_Filter(t *testing.T) {
|
|||
t.Fatalf("bad: %v", filtered)
|
||||
}
|
||||
}
|
||||
|
||||
// Allow failing checks to be ignored (note that the test checks have empty
|
||||
// CheckID which is valid).
|
||||
{
|
||||
twiddle := make(CheckServiceNodes, len(nodes))
|
||||
if n := copy(twiddle, nodes); n != len(nodes) {
|
||||
t.Fatalf("bad: %d", n)
|
||||
}
|
||||
filtered := twiddle.FilterIgnore(true, []types.CheckID{""})
|
||||
expected := CheckServiceNodes{
|
||||
nodes[0],
|
||||
nodes[1],
|
||||
nodes[2], // Node 3's critical check should be ignored.
|
||||
// Node 4 should still be failing since it's got a critical check with a
|
||||
// non-ignored ID.
|
||||
}
|
||||
if !reflect.DeepEqual(filtered, expected) {
|
||||
t.Fatalf("bad: %v", filtered)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestStructs_DirEntry_Clone(t *testing.T) {
|
||||
|
|
|
@ -34,6 +34,12 @@ type ServiceQuery struct {
|
|||
// local datacenter.
|
||||
Failover QueryDatacenterOptions
|
||||
|
||||
// IgnoreCheckIDs is an optional list of health check IDs to ignore when
|
||||
// considering which nodes are healthy. It is useful as an emergency measure
|
||||
// to temporarily override some health check that is producing false negatives
|
||||
// for example.
|
||||
IgnoreCheckIDs []string
|
||||
|
||||
// If OnlyPassing is true then we will only include nodes with passing
|
||||
// health checks (critical AND warning checks will cause a node to be
|
||||
// discarded)
|
||||
|
|
|
@ -116,6 +116,53 @@ func TestAPI_PreparedQuery(t *testing.T) {
|
|||
t.Fatalf("bad datacenter: %v", results)
|
||||
}
|
||||
|
||||
// Add new node with failing health check.
|
||||
reg2 := reg
|
||||
reg2.Node = "failingnode"
|
||||
reg2.Check = &AgentCheck{
|
||||
Node: "failingnode",
|
||||
ServiceID: "redis1",
|
||||
ServiceName: "redis",
|
||||
Name: "failingcheck",
|
||||
Status: "critical",
|
||||
}
|
||||
retry.Run(t, func(r *retry.R) {
|
||||
if _, err := catalog.Register(reg2, nil); err != nil {
|
||||
r.Fatal(err)
|
||||
}
|
||||
if _, _, err := catalog.Node("failingnode", nil); err != nil {
|
||||
r.Fatal(err)
|
||||
}
|
||||
})
|
||||
|
||||
// Execute by ID. Should return only healthy node.
|
||||
results, _, err = query.Execute(def.ID, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("err: %s", err)
|
||||
}
|
||||
if len(results.Nodes) != 1 || results.Nodes[0].Node.Node != "foobar" {
|
||||
t.Fatalf("bad: %v", results)
|
||||
}
|
||||
if wan, ok := results.Nodes[0].Node.TaggedAddresses["wan"]; !ok || wan != "127.0.0.1" {
|
||||
t.Fatalf("bad: %v", results)
|
||||
}
|
||||
|
||||
// Update PQ with ignore rule for the failing check
|
||||
def.Service.IgnoreCheckIDs = []string{"failingcheck"}
|
||||
_, err = query.Update(def, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("err: %s", err)
|
||||
}
|
||||
|
||||
// Execute by ID. Should return BOTH nodes ignoring the failing check.
|
||||
results, _, err = query.Execute(def.ID, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("err: %s", err)
|
||||
}
|
||||
if len(results.Nodes) != 2 {
|
||||
t.Fatalf("got %d nodes, want 2", len(results.Nodes))
|
||||
}
|
||||
|
||||
// Delete it.
|
||||
_, err = query.Delete(def.ID, nil)
|
||||
if err != nil {
|
||||
|
|
|
@ -25,7 +25,7 @@ section for more details about how prepared queries work with Consul's ACL syste
|
|||
### Prepared Query Templates
|
||||
|
||||
Consul 0.6.4 and later support prepared query templates. These are created
|
||||
similar to static templates, except with some additional fields and features.
|
||||
similar to static queries, except with some additional fields and features.
|
||||
Here is an example prepared query template:
|
||||
|
||||
```json
|
||||
|
@ -206,6 +206,13 @@ The table below shows this endpoint's support for
|
|||
failover, even if it is selected by both `NearestN` and is listed in
|
||||
`Datacenters`.
|
||||
|
||||
- `IgnoreCheckIDs` `(array<string>: nil)` - Specifies a list of check IDs that
|
||||
should be ignored when filtering unhealthy instances. This is mostly useful
|
||||
in an emergency or as a temporary measure when a health check is found to be
|
||||
unreliable. Being able to ignore it in centrally-defined queries can be
|
||||
simpler than de-registering the check as an interim solution until the check
|
||||
can be fixed.
|
||||
|
||||
- `OnlyPassing` `(bool: false)` - Specifies the behavior of the query's health
|
||||
check filtering. If this is set to false, the results will include nodes
|
||||
with checks in the passing as well as the warning states. If this is set to
|
||||
|
|
Loading…
Reference in a new issue