rate: add prometheus definitions, docs, and clearer names (#15945)

Dan Upton 2023-02-03 12:01:57 +00:00 committed by GitHub
parent 6151bcfa75
commit cc02c78ce6
5 changed files with 35 additions and 23 deletions

View File

@@ -10,8 +10,9 @@ import (
"sync/atomic"
"github.com/armon/go-metrics"
-"github.com/hashicorp/consul/agent/consul/multilimiter"
"github.com/hashicorp/go-hclog"
+"github.com/hashicorp/consul/agent/consul/multilimiter"
)
var (
@@ -214,7 +215,7 @@ func (h *Handler) Allow(op Operation) error {
"limit_enforced", enforced,
)
-metrics.IncrCounterWithLabels([]string{"consul", "rate_limit"}, 1, []metrics.Label{
+metrics.IncrCounterWithLabels([]string{"rpc", "rate_limit", "exceeded"}, 1, []metrics.Label{
{
Name: "limit_type",
Value: l.desc,

View File

@@ -104,7 +104,7 @@ func TestHandler(t *testing.T) {
expectErr: nil,
expectLog: true,
expectMetric: true,
-expectMetricName: "consul.rate_limit;limit_type=global/write;op=Foo.Bar;mode=permissive",
+expectMetricName: "rpc.rate_limit.exceeded;limit_type=global/write;op=Foo.Bar;mode=permissive",
expectMetricCount: 1,
},
"global write limit exceeded (enforcing, leader)": {
@@ -121,7 +121,7 @@ func TestHandler(t *testing.T) {
expectErr: ErrRetryLater,
expectLog: true,
expectMetric: true,
-expectMetricName: "consul.rate_limit;limit_type=global/write;op=Foo.Bar;mode=enforcing",
+expectMetricName: "rpc.rate_limit.exceeded;limit_type=global/write;op=Foo.Bar;mode=enforcing",
expectMetricCount: 1,
},
"global write limit exceeded (enforcing, follower)": {
@@ -138,7 +138,7 @@ func TestHandler(t *testing.T) {
expectErr: ErrRetryElsewhere,
expectLog: true,
expectMetric: true,
-expectMetricName: "consul.rate_limit;limit_type=global/write;op=Foo.Bar;mode=enforcing",
+expectMetricName: "rpc.rate_limit.exceeded;limit_type=global/write;op=Foo.Bar;mode=enforcing",
expectMetricCount: 1,
},
"global read limit disabled": {
@@ -180,7 +180,7 @@ func TestHandler(t *testing.T) {
expectErr: nil,
expectLog: true,
expectMetric: true,
-expectMetricName: "consul.rate_limit;limit_type=global/read;op=Foo.Bar;mode=permissive",
+expectMetricName: "rpc.rate_limit.exceeded;limit_type=global/read;op=Foo.Bar;mode=permissive",
expectMetricCount: 1,
},
"global read limit exceeded (enforcing, leader)": {
@@ -197,7 +197,7 @@ func TestHandler(t *testing.T) {
expectErr: ErrRetryElsewhere,
expectLog: true,
expectMetric: true,
-expectMetricName: "consul.rate_limit;limit_type=global/read;op=Foo.Bar;mode=enforcing",
+expectMetricName: "rpc.rate_limit.exceeded;limit_type=global/read;op=Foo.Bar;mode=enforcing",
expectMetricCount: 1,
},
"global read limit exceeded (enforcing, follower)": {
@@ -214,7 +214,7 @@ func TestHandler(t *testing.T) {
expectErr: ErrRetryElsewhere,
expectLog: true,
expectMetric: true,
-expectMetricName: "consul.rate_limit;limit_type=global/read;op=Foo.Bar;mode=enforcing",
+expectMetricName: "rpc.rate_limit.exceeded;limit_type=global/read;op=Foo.Bar;mode=enforcing",
expectMetricCount: 1,
},
}
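For reference, the expectMetricName strings above are the flattened "name;label=value" keys produced by go-metrics' in-memory sink. A minimal, self-contained sketch of that flattening, assuming a bare in-memory sink with an empty service name (illustrative only; not the harness this test file actually uses):

```go
package main

import (
	"fmt"
	"time"

	"github.com/armon/go-metrics"
)

func main() {
	// In-memory sink: aggregates per interval and flattens labels into
	// "name;label=value" keys, the format matched by expectMetricName.
	sink := metrics.NewInmemSink(10*time.Second, time.Minute)

	cfg := metrics.DefaultConfig("") // empty service name: no prefix on the key
	if _, err := metrics.NewGlobal(cfg, sink); err != nil {
		panic(err)
	}

	// Same shape as the counter emitted by Handler.Allow in the previous file.
	metrics.IncrCounterWithLabels([]string{"rpc", "rate_limit", "exceeded"}, 1, []metrics.Label{
		{Name: "limit_type", Value: "global/write"},
		{Name: "op", Value: "Foo.Bar"},
		{Name: "mode", Value: "permissive"},
	})

	for _, interval := range sink.Data() {
		for key := range interval.Counters {
			// rpc.rate_limit.exceeded;limit_type=global/write;op=Foo.Bar;mode=permissive
			fmt.Println(key)
		}
	}
}
```

The labels only ride along as key suffixes in the in-memory sink; in the Prometheus sink defined in the next file they become real Prometheus labels on the counter.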

View File

@@ -3,6 +3,10 @@ package rate
import "github.com/armon/go-metrics/prometheus"
var Counters = []prometheus.CounterDefinition{
+{
+Name: []string{"rpc", "rate_limit", "exceeded"},
+Help: "Increments whenever an RPC is over a configured rate limit. Note: in permissive mode, the RPC will have still been allowed to proceed.",
+},
{
Name: []string{"rpc", "rate_limit", "log_dropped"},
Help: "Increments whenever a log that is emitted because an RPC exceeded a rate limit gets dropped because the output buffer is full.",

View File

@@ -172,25 +172,31 @@ func TestServerRequestRateLimit(t *testing.T) {
}
func checkForMetric(t *retry.R, metricsInfo *api.MetricsInfo, operationName string, expectedLimitType string) {
-for _, counter := range metricsInfo.Counters {
-if counter.Name == "consul.rate.limit" {
-operation, ok := counter.Labels["op"]
-require.True(t, ok)
-limitType, ok := counter.Labels["limit_type"]
-require.True(t, ok)
-mode, ok := counter.Labels["mode"]
-require.True(t, ok)
-if operation == operationName {
-require.Equal(t, 2, counter.Count)
-require.Equal(t, expectedLimitType, limitType)
-require.Equal(t, "disabled", mode)
-}
-}
-}
+const counterName = "rpc.rate_limit.exceeded"
+var counter api.SampledValue
+for _, c := range metricsInfo.Counters {
+if c.Name == counterName {
+counter = c
+break
+}
+}
+require.NotNilf(t, counter, "counter not found: %s", counterName)
+operation, ok := counter.Labels["op"]
+require.True(t, ok)
+limitType, ok := counter.Labels["limit_type"]
+require.True(t, ok)
+mode, ok := counter.Labels["mode"]
+require.True(t, ok)
+if operation == operationName {
+require.Equal(t, 2, counter.Count)
+require.Equal(t, expectedLimitType, limitType)
+require.Equal(t, "disabled", mode)
+}
}
func checkLogsForMessage(t *retry.R, logs []string, msg string, operationName string, logType string, logShouldExist bool) {

View File

@@ -477,6 +477,7 @@ These metrics are used to monitor the health of the Consul servers.
| `consul.raft.transition.heartbeat_timeout` | The number of times an agent has transitioned to the Candidate state after receiving no heartbeat messages from the last known leader. | timeouts / interval | counter |
| `consul.raft.verify_leader` | This metric doesn't have a direct correlation to the leader change. It just counts the number of times an agent checks if it is still the leader or not. For example, during every consistent read, the check is done. Depending on the load in the system, this metric count can be high as it is incremented each time a consistent read is completed. | checks / interval | Counter |
| `consul.rpc.accept_conn` | Increments when a server accepts an RPC connection. | connections | counter |
+| `consul.rpc.rate_limit.exceeded` | Increments whenever an RPC is over a configured rate limit. In permissive mode, the RPC is still allowed to proceed. | RPCs | counter |
| `consul.rpc.rate_limit.log_dropped` | Increments whenever a log that is emitted because an RPC exceeded a rate limit gets dropped because the output buffer is full. | log messages dropped | counter |
| `consul.catalog.register` | Measures the time it takes to complete a catalog register operation. | ms | timer |
| `consul.catalog.deregister` | Measures the time it takes to complete a catalog deregister operation. | ms | timer |