rate: add prometheus definitions, docs, and clearer names (#15945)
commit cc02c78ce6
parent 6151bcfa75
@@ -10,8 +10,9 @@ import (
 	"sync/atomic"
 
 	"github.com/armon/go-metrics"
-	"github.com/hashicorp/consul/agent/consul/multilimiter"
 	"github.com/hashicorp/go-hclog"
+
+	"github.com/hashicorp/consul/agent/consul/multilimiter"
 )
 
 var (
@@ -214,7 +215,7 @@ func (h *Handler) Allow(op Operation) error {
 		"limit_enforced", enforced,
 	)
 
-	metrics.IncrCounterWithLabels([]string{"consul", "rate_limit"}, 1, []metrics.Label{
+	metrics.IncrCounterWithLabels([]string{"rpc", "rate_limit", "exceeded"}, 1, []metrics.Label{
 		{
 			Name:  "limit_type",
 			Value: l.desc,
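The hunk above shows only the first label; judging from the test expectations below, the counter carries `limit_type`, `op`, and `mode` labels. A hedged sketch of an equivalent emission with go-metrics (the helper name and example label values are illustrative, not part of this change):

```go
package example

import "github.com/armon/go-metrics"

// recordRateLimitExceeded is an illustrative helper (not part of the commit)
// showing the shape of the renamed counter: one increment per RPC that went
// over a configured limit, tagged with the limit description, the RPC name,
// and the enforcement mode.
func recordRateLimitExceeded(limitType, op, mode string) {
	metrics.IncrCounterWithLabels([]string{"rpc", "rate_limit", "exceeded"}, 1, []metrics.Label{
		{Name: "limit_type", Value: limitType}, // e.g. "global/write"
		{Name: "op", Value: op},                // e.g. "Foo.Bar"
		{Name: "mode", Value: mode},            // e.g. "permissive" or "enforcing"
	})
}
```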
@@ -104,7 +104,7 @@ func TestHandler(t *testing.T) {
 			expectErr:         nil,
 			expectLog:         true,
 			expectMetric:      true,
-			expectMetricName:  "consul.rate_limit;limit_type=global/write;op=Foo.Bar;mode=permissive",
+			expectMetricName:  "rpc.rate_limit.exceeded;limit_type=global/write;op=Foo.Bar;mode=permissive",
 			expectMetricCount: 1,
 		},
 		"global write limit exceeded (enforcing, leader)": {
@@ -121,7 +121,7 @@ func TestHandler(t *testing.T) {
 			expectErr:         ErrRetryLater,
 			expectLog:         true,
 			expectMetric:      true,
-			expectMetricName:  "consul.rate_limit;limit_type=global/write;op=Foo.Bar;mode=enforcing",
+			expectMetricName:  "rpc.rate_limit.exceeded;limit_type=global/write;op=Foo.Bar;mode=enforcing",
 			expectMetricCount: 1,
 		},
 		"global write limit exceeded (enforcing, follower)": {
@@ -138,7 +138,7 @@ func TestHandler(t *testing.T) {
 			expectErr:         ErrRetryElsewhere,
 			expectLog:         true,
 			expectMetric:      true,
-			expectMetricName:  "consul.rate_limit;limit_type=global/write;op=Foo.Bar;mode=enforcing",
+			expectMetricName:  "rpc.rate_limit.exceeded;limit_type=global/write;op=Foo.Bar;mode=enforcing",
 			expectMetricCount: 1,
 		},
 		"global read limit disabled": {
@@ -180,7 +180,7 @@ func TestHandler(t *testing.T) {
 			expectErr:         nil,
 			expectLog:         true,
 			expectMetric:      true,
-			expectMetricName:  "consul.rate_limit;limit_type=global/read;op=Foo.Bar;mode=permissive",
+			expectMetricName:  "rpc.rate_limit.exceeded;limit_type=global/read;op=Foo.Bar;mode=permissive",
 			expectMetricCount: 1,
 		},
 		"global read limit exceeded (enforcing, leader)": {
@@ -197,7 +197,7 @@ func TestHandler(t *testing.T) {
 			expectErr:         ErrRetryElsewhere,
 			expectLog:         true,
 			expectMetric:      true,
-			expectMetricName:  "consul.rate_limit;limit_type=global/read;op=Foo.Bar;mode=enforcing",
+			expectMetricName:  "rpc.rate_limit.exceeded;limit_type=global/read;op=Foo.Bar;mode=enforcing",
 			expectMetricCount: 1,
 		},
 		"global read limit exceeded (enforcing, follower)": {
@@ -214,7 +214,7 @@ func TestHandler(t *testing.T) {
 			expectErr:         ErrRetryElsewhere,
 			expectLog:         true,
 			expectMetric:      true,
-			expectMetricName:  "consul.rate_limit;limit_type=global/read;op=Foo.Bar;mode=enforcing",
+			expectMetricName:  "rpc.rate_limit.exceeded;limit_type=global/read;op=Foo.Bar;mode=enforcing",
 			expectMetricCount: 1,
 		},
 	}
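The expectMetricName strings above follow the go-metrics in-memory sink convention of flattening a metric key and its labels into a single `name;label=value;...` string. A minimal, self-contained sketch of that behaviour, assuming an InmemSink like the one the handler tests read back from (the label values are illustrative):

```go
package main

import (
	"fmt"
	"time"

	"github.com/armon/go-metrics"
)

func main() {
	// An in-memory sink records samples per interval and keys them by the
	// flattened metric name plus labels.
	sink := metrics.NewInmemSink(time.Minute, time.Minute)

	sink.IncrCounterWithLabels([]string{"rpc", "rate_limit", "exceeded"}, 1, []metrics.Label{
		{Name: "limit_type", Value: "global/write"},
		{Name: "op", Value: "Foo.Bar"},
		{Name: "mode", Value: "permissive"},
	})

	// Prints the flattened key the tests assert against, e.g.
	// rpc.rate_limit.exceeded;limit_type=global/write;op=Foo.Bar;mode=permissive
	for name := range sink.Data()[0].Counters {
		fmt.Println(name)
	}
}
```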
@@ -3,6 +3,10 @@ package rate
 import "github.com/armon/go-metrics/prometheus"
 
 var Counters = []prometheus.CounterDefinition{
+	{
+		Name: []string{"rpc", "rate_limit", "exceeded"},
+		Help: "Increments whenever an RPC is over a configured rate limit. Note: in permissive mode, the RPC will have still been allowed to proceed.",
+	},
 	{
 		Name: []string{"rpc", "rate_limit", "log_dropped"},
 		Help: "Increments whenever a log that is emitted because an RPC exceeded a rate limit gets dropped because the output buffer is full.",
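These definitions let a Prometheus sink pre-register the counters, with help text, before they are first incremented. A hedged sketch of how a definition list like this can be wired into a sink using the go-metrics Prometheus support (the setup below is illustrative, not how Consul itself wires it):

```go
package main

import (
	"time"

	"github.com/armon/go-metrics"
	"github.com/armon/go-metrics/prometheus"
)

func main() {
	// Pre-register counter definitions so they are exported with their help
	// text immediately, rather than appearing only after the first increment.
	sink, err := prometheus.NewPrometheusSinkFrom(prometheus.PrometheusOpts{
		Expiration: time.Minute,
		CounterDefinitions: []prometheus.CounterDefinition{
			{
				Name: []string{"rpc", "rate_limit", "exceeded"},
				Help: "Increments whenever an RPC is over a configured rate limit.",
			},
		},
	})
	if err != nil {
		panic(err)
	}

	// Route go-metrics calls (such as IncrCounterWithLabels) to this sink,
	// prefixed with the service name.
	if _, err := metrics.NewGlobal(metrics.DefaultConfig("consul"), sink); err != nil {
		panic(err)
	}
}
```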
@@ -172,25 +172,31 @@ func TestServerRequestRateLimit(t *testing.T) {
 }
 
 func checkForMetric(t *retry.R, metricsInfo *api.MetricsInfo, operationName string, expectedLimitType string) {
-	for _, counter := range metricsInfo.Counters {
-		if counter.Name == "consul.rate.limit" {
-			operation, ok := counter.Labels["op"]
-			require.True(t, ok)
-
-			limitType, ok := counter.Labels["limit_type"]
-			require.True(t, ok)
-
-			mode, ok := counter.Labels["mode"]
-			require.True(t, ok)
-
-			if operation == operationName {
-				require.Equal(t, 2, counter.Count)
-				require.Equal(t, expectedLimitType, limitType)
-				require.Equal(t, "disabled", mode)
-			}
-		}
+	const counterName = "rpc.rate_limit.exceeded"
+
+	var counter api.SampledValue
+	for _, c := range metricsInfo.Counters {
+		if c.Name == counterName {
+			counter = c
+			break
+		}
 	}
+	require.NotNilf(t, counter, "counter not found: %s", counterName)
+
+	operation, ok := counter.Labels["op"]
+	require.True(t, ok)
+
+	limitType, ok := counter.Labels["limit_type"]
+	require.True(t, ok)
+
+	mode, ok := counter.Labels["mode"]
+	require.True(t, ok)
+
+	if operation == operationName {
+		require.Equal(t, 2, counter.Count)
+		require.Equal(t, expectedLimitType, limitType)
+		require.Equal(t, "disabled", mode)
+	}
 }
 
 func checkLogsForMessage(t *retry.R, logs []string, msg string, operationName string, logType string, logShouldExist bool) {
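For orientation, a hedged sketch of how a helper like checkForMetric is typically driven: fetch the agent's metrics through the HTTP API inside a retry loop and assert on the renamed counter. The client wiring, the "KVS.Apply" operation name, and the "global/write" limit type below are illustrative assumptions, not values taken from this diff:

```go
import (
	"testing"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/consul/sdk/testutil/retry"
	"github.com/stretchr/testify/require"
)

// assertRateLimitMetric is illustrative only; it assumes an *api.Client that
// points at the test server generating rate-limited traffic.
func assertRateLimitMetric(t *testing.T, client *api.Client) {
	retry.Run(t, func(r *retry.R) {
		metricsInfo, err := client.Agent().Metrics()
		require.NoError(r, err)
		checkForMetric(r, metricsInfo, "KVS.Apply", "global/write")
	})
}
```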
@@ -477,6 +477,7 @@ These metrics are used to monitor the health of the Consul servers.
 | `consul.raft.transition.heartbeat_timeout` | The number of times an agent has transitioned to the Candidate state after receiving no heartbeat messages from the last known leader. | timeouts / interval | counter |
 | `consul.raft.verify_leader` | This metric doesn't have a direct correlation to leader changes. It counts the number of times an agent checks whether it is still the leader, for example on every consistent read. Depending on the load in the system, this count can be high, as it is incremented each time a consistent read completes. | checks / interval | counter |
 | `consul.rpc.accept_conn` | Increments when a server accepts an RPC connection. | connections | counter |
+| `consul.rpc.rate_limit.exceeded` | Increments whenever an RPC is over a configured rate limit. In permissive mode, the RPC is still allowed to proceed. | RPCs | counter |
 | `consul.rpc.rate_limit.log_dropped` | Increments whenever a log that is emitted because an RPC exceeded a rate limit gets dropped because the output buffer is full. | log messages dropped | counter |
 | `consul.catalog.register` | Measures the time it takes to complete a catalog register operation. | ms | timer |
 | `consul.catalog.deregister` | Measures the time it takes to complete a catalog deregister operation. | ms | timer |