diff --git a/agent/cache/cache.go b/agent/cache/cache.go index 62dc8619b..b8d77f7a3 100644 --- a/agent/cache/cache.go +++ b/agent/cache/cache.go @@ -37,7 +37,7 @@ import ( var Gauges = []prometheus.GaugeDefinition{ { Name: []string{"consul", "cache", "entries_count"}, - Help: "", + Help: "Represents the number of entries in this cache.", }, } @@ -45,19 +45,19 @@ var Gauges = []prometheus.GaugeDefinition{ var Counters = []prometheus.CounterDefinition{ { Name: []string{"consul", "cache", "bypass"}, - Help: "", + Help: "Counts how many times a request bypassed the cache because no cache-key was provided.", }, { Name: []string{"consul", "cache", "fetch_success"}, - Help: "", + Help: "Counts the number of successful fetches by the cache.", }, { Name: []string{"consul", "cache", "fetch_error"}, - Help: "", + Help: "Counts the number of failed fetches by the cache.", }, { Name: []string{"consul", "cache", "evict_expired"}, - Help: "", + Help: "Counts the number of expired entries that are evicted.", }, } diff --git a/agent/consul/catalog_endpoint.go b/agent/consul/catalog_endpoint.go index f5d5b5633..4641db2d9 100644 --- a/agent/consul/catalog_endpoint.go +++ b/agent/consul/catalog_endpoint.go @@ -25,7 +25,7 @@ var CatalogCounters = []prometheus.CounterDefinition{ }, { Name: []string{"catalog", "connect", "query"}, - Help: "", + Help: "Increments for each connect-based catalog query for the given service.", }, { Name: []string{"catalog", "service", "query-tag"}, @@ -33,7 +33,7 @@ var CatalogCounters = []prometheus.CounterDefinition{ }, { Name: []string{"catalog", "connect", "query-tag"}, - Help: "", + Help: "Increments for each connect-based catalog query for the given service with the given tag.", }, { Name: []string{"catalog", "service", "query-tags"}, @@ -41,7 +41,7 @@ var CatalogCounters = []prometheus.CounterDefinition{ }, { Name: []string{"catalog", "connect", "query-tags"}, - Help: "", + Help: "Increments for each connect-based catalog query for the given service 
with the given tags.", }, { Name: []string{"catalog", "service", "not-found"}, @@ -49,7 +49,7 @@ var CatalogCounters = []prometheus.CounterDefinition{ }, { Name: []string{"catalog", "connect", "not-found"}, - Help: "", + Help: "Increments for each connect-based catalog query where the given service could not be found.", }, } diff --git a/agent/consul/fsm/commands_oss.go b/agent/consul/fsm/commands_oss.go index fae5eb1a8..d637a366d 100644 --- a/agent/consul/fsm/commands_oss.go +++ b/agent/consul/fsm/commands_oss.go @@ -53,43 +53,43 @@ var CommandsSummaries = []prometheus.SummaryDefinition{ }, { Name: []string{"consul", "fsm", "intention"}, - Help: "", + Help: "Deprecated - use fsm_intention instead", }, { Name: []string{"fsm", "intention"}, - Help: "", + Help: "Measures the time it takes to apply an intention operation to the FSM.", }, { Name: []string{"consul", "fsm", "ca"}, - Help: "", + Help: "Deprecated - use fsm_ca instead", + }, + { + Name: []string{"fsm", "ca"}, + Help: "Measures the time it takes to apply CA configuration operations to the FSM.", }, { Name: []string{"fsm", "ca", "leaf"}, - Help: "", + Help: "Measures the time it takes to apply an operation while signing a leaf certificate.", }, { Name: []string{"fsm", "acl", "token"}, - Help: "", - }, - { - Name: []string{"fsm", "ca", "leaf"}, - Help: "", + Help: "Measures the time it takes to apply an ACL token operation to the FSM.", }, { Name: []string{"fsm", "acl", "policy"}, - Help: "", + Help: "Measures the time it takes to apply an ACL policy operation to the FSM.", }, { Name: []string{"fsm", "acl", "bindingrule"}, - Help: "", + Help: "Measures the time it takes to apply an ACL binding rule operation to the FSM.", }, { Name: []string{"fsm", "acl", "authmethod"}, - Help: "", + Help: "Measures the time it takes to apply an ACL authmethod operation to the FSM.", }, { Name: []string{"fsm", "system_metadata"}, - Help: "", + Help: "Measures the time it takes to apply a system metadata operation to the 
FSM.", }, // TODO(kit): We generate the config-entry fsm summaries by reading off of the request. It is // possible to statically declare these when we know all of the names, but I didn't get to it @@ -378,8 +378,12 @@ func (c *FSM) applyIntentionOperation(buf []byte, index uint64) interface{} { panic(fmt.Errorf("failed to decode request: %v", err)) } + // TODO(kit): We should deprecate this first metric that writes the metrics_prefix itself, + // the config we use to flag this out, telemetry.disable_compat_1.9 is on the agent - how do + // we access it here? defer metrics.MeasureSinceWithLabels([]string{"consul", "fsm", "intention"}, time.Now(), []metrics.Label{{Name: "op", Value: string(req.Op)}}) + defer metrics.MeasureSinceWithLabels([]string{"fsm", "intention"}, time.Now(), []metrics.Label{{Name: "op", Value: string(req.Op)}}) @@ -474,6 +478,7 @@ func (c *FSM) applyConnectCAOperation(buf []byte, index uint64) interface{} { } } +// applyConnectCALeafOperation applies an operation while signing a leaf certificate. 
func (c *FSM) applyConnectCALeafOperation(buf []byte, index uint64) interface{} { var req structs.CALeafRequest if err := structs.Decode(buf, &req); err != nil { diff --git a/agent/consul/session_ttl.go b/agent/consul/session_ttl.go index 15c77a24a..27f7d79d4 100644 --- a/agent/consul/session_ttl.go +++ b/agent/consul/session_ttl.go @@ -16,11 +16,11 @@ var SessionGauges = []prometheus.GaugeDefinition{ }, { Name: []string{"raft", "applied_index"}, - Help: "", + Help: "Represents the raft applied index.", }, { Name: []string{"raft", "last_index"}, - Help: "", + Help: "Represents the raft last index.", }, } @@ -153,7 +153,7 @@ func (s *Server) clearAllSessionTimers() { s.sessionTimers.StopAll() } -// updateMetrics is a long running routine used to uddate a +// updateMetrics is a long running routine used to update a // number of server periodic metrics func (s *Server) updateMetrics() { for { diff --git a/website/pages/docs/agent/telemetry.mdx b/website/pages/docs/agent/telemetry.mdx index 2ed0254e0..3fb3b3505 100644 --- a/website/pages/docs/agent/telemetry.mdx +++ b/website/pages/docs/agent/telemetry.mdx @@ -194,8 +194,12 @@ These metrics are used to monitor the health of the Consul servers. | `consul.acl.resolveTokenLegacy` | This measures the time it takes to resolve an ACL token using the legacy ACL system. | ms | timer | | `consul.acl.ResolveToken` | This measures the time it takes to resolve an ACL token. | ms | timer | | `consul.acl.ResolveTokenToIdentity` | This measures the time it takes to resolve an ACL token to an Identity. | ms | timer | -| `consul.acl.token.cache_hit` | Increments if Consul is able to resolve a token's identity, or a legacy token, from the cache. | cache read op | counter | -| `consul.acl.token.cache_miss` | Increments if Consul cannot resolve a token's identity, or a legacy token, from the cache. 
| cache read op | counter | +| `consul.acl.token.cache_hit` | Increments if Consul is able to resolve a token's identity, or a legacy token, from the cache. | cache read op | counter | +| `consul.acl.token.cache_miss` | Increments if Consul cannot resolve a token's identity, or a legacy token, from the cache. | cache read op | counter | +| `consul.cache.bypass` | Counts how many times a request bypassed the cache because no cache-key was provided. | counter | counter | +| `consul.cache.fetch_success` | Counts the number of successful fetches by the cache. | counter | counter | +| `consul.cache.fetch_error` | Counts the number of failed fetches by the cache. | counter | counter | +| `consul.cache.evict_expired` | Counts the number of expired entries that are evicted. | counter | counter | | `consul.raft.fsm.snapshot` | This metric measures the time taken by the FSM to record the current state for the snapshot. | ms | timer | | `consul.raft.fsm.apply` | This metric gives the number of logs committed since the last interval. | commit logs / interval | counter | | `consul.raft.commitNumLogs` | This metric measures the count of logs processed for application to the FSM in a single batch. | logs | gauge | @@ -207,6 +211,8 @@ These metrics are used to monitor the health of the Consul servers. | `consul.raft.replication.heartbeat` | This metric measures the time taken to invoke appendEntries on a peer, so that it doesn’t timeout on a periodic basis. | ms | timer | | `consul.serf.snapshot.appendLine` | This metric measures the time taken by the Consul agent to append an entry into the existing log. | ms | timer | | `consul.serf.snapshot.compact` | This metric measures the time taken by the Consul agent to compact a log. This operation occurs only when the snapshot becomes large enough to justify the compaction . | ms | timer | +| `consul.raft.applied_index` | Represents the raft applied index. 
| index | gauge | +| `consul.raft.last_index` | Represents the raft last index. | index | gauge | | `consul.raft.state.leader` | This increments whenever a Consul server becomes a leader. If there are frequent leadership changes this may be indication that the servers are overloaded and aren't meeting the soft real-time requirements for Raft, or that there are networking problems between the servers. | leadership transitions / interval | counter | | `consul.raft.state.candidate` | This increments whenever a Consul server starts an election. If this increments without a leadership change occurring it could indicate that a single server is overloaded or is experiencing network connectivity issues. | election attempts / interval | counter | | `consul.raft.apply` | This counts the number of Raft transactions occurring over the interval, which is a general indicator of the write load on the Consul servers. | raft transactions / interval | counter | @@ -243,6 +249,14 @@ These metrics are used to monitor the health of the Consul servers. | `consul.fsm.txn` | This measures the time it takes to apply the given transaction update to the FSM. | ms | timer | | `consul.fsm.autopilot` | This measures the time it takes to apply the given autopilot update to the FSM. | ms | timer | | `consul.fsm.persist` | This measures the time it takes to persist the FSM to a raft snapshot. | ms | timer | +| `consul.fsm.intention` | Measures the time it takes to apply an intention operation to the FSM. | ms | timer | +| `consul.fsm.ca` | Measures the time it takes to apply CA configuration operations to the FSM. | ms | timer | +| `consul.fsm.ca.leaf` | Measures the time it takes to apply an operation while signing a leaf certificate. | ms | timer | +| `consul.fsm.acl.token` | Measures the time it takes to apply an ACL token operation to the FSM. | ms | timer | +| `consul.fsm.acl.policy` | Measures the time it takes to apply an ACL policy operation to the FSM. 
| ms | timer | +| `consul.fsm.acl.bindingrule` | Measures the time it takes to apply an ACL binding rule operation to the FSM. | ms | timer | +| `consul.fsm.acl.authmethod` | Measures the time it takes to apply an ACL authmethod operation to the FSM. | ms | timer | +| `consul.fsm.system_metadata` | Measures the time it takes to apply a system metadata operation to the FSM. | ms | timer | | `consul.kvs.apply` | This measures the time it takes to complete an update to the KV store. | ms | timer | | `consul.leader.barrier` | This measures the time spent waiting for the raft barrier upon gaining leadership. | ms | timer | | `consul.leader.reconcile` | This measures the time spent updating the raft store from the serf member information. | ms | timer | @@ -306,6 +320,10 @@ These metrics give insight into the health of the cluster as a whole. | `consul.catalog.service.query-tag..` | This increments for each catalog query for the given service with the given tag. | queries | counter | | `consul.catalog.service.query-tags..` | This increments for each catalog query for the given service with the given tags. | queries | counter | | `consul.catalog.service.not-found.` | This increments for each catalog query where the given service could not be found. | queries | counter | +| `consul.catalog.connect.query.` | This increments for each connect-based catalog query for the given service. | queries | counter | +| `consul.catalog.connect.query-tag..` | This increments for each connect-based catalog query for the given service with the given tag. | queries | counter | +| `consul.catalog.connect.query-tags..` | This increments for each connect-based catalog query for the given service with the given tags. | queries | counter | +| `consul.catalog.connect.not-found.` | This increments for each connect-based catalog query where the given service could not be found. | queries | counter | ## Connect Built-in Proxy Metrics