From 9533372ded417930ecc0215f2852418fc1ff51f2 Mon Sep 17 00:00:00 2001 From: Kit Patella Date: Thu, 12 Nov 2020 18:12:12 -0800 Subject: [PATCH] first pass on agent-configured prometheusDefs and adding defs for every consul metric --- agent/catalog_endpoint.go | 119 +++++++++++++++++++++- agent/consul/acl.go | 29 +++++- agent/consul/acl_endpoint.go | 70 ++++++++++++- agent/consul/autopilot.go | 12 +++ agent/consul/catalog_endpoint.go | 47 +++++++++ agent/consul/client.go | 16 +++ agent/consul/config_endpoint.go | 29 ++++++ agent/consul/federation_state_endpoint.go | 22 +++- agent/consul/intention_endpoint.go | 9 ++ agent/consul/kvs_endpoint.go | 8 ++ agent/consul/prepared_query_endpoint.go | 20 ++++ agent/consul/rpc.go | 42 ++++++++ agent/consul/server.go | 4 +- agent/consul/session_ttl.go | 24 +++++ agent/consul/txn_endpoint.go | 12 +++ agent/consul/usagemetrics/usagemetrics.go | 17 ++++ agent/dns.go | 20 ++++ agent/grpc/stats.go | 37 +++++++ agent/http.go | 8 ++ agent/local/state.go | 27 ++++- agent/setup.go | 94 ++++++++++++++++- connect/proxy/proxy.go | 4 +- lib/telemetry.go | 95 +++++------------ 23 files changed, 687 insertions(+), 78 deletions(-) diff --git a/agent/catalog_endpoint.go b/agent/catalog_endpoint.go index 60c5fc344..7e0ee9571 100644 --- a/agent/catalog_endpoint.go +++ b/agent/catalog_endpoint.go @@ -5,11 +5,128 @@ import ( "net/http" "strings" - metrics "github.com/armon/go-metrics" + "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" cachetype "github.com/hashicorp/consul/agent/cache-types" "github.com/hashicorp/consul/agent/structs" ) +// TODO(kit): Add help strings for each +var CatalogCounters = []prometheus.CounterDefinition{ + { + Name: []string{"consul", "client", "api", "catalog_register"}, + Help: "Increments whenever a Consul agent receives a catalog register request.", + }, + { + Name: []string{"consul", "client", "rpc", "error", "catalog_register"}, + Help: "", + }, + { + Name: []string{"consul", "client", "api", 
"success", "catalog_register"}, + Help: "", + }, + { + Name: []string{"consul", "client", "api", "catalog_deregister"}, + Help: "", + }, + { + Name: []string{"consul", "client", "api", "catalog_datacenters"}, + Help: "", + }, + { + Name: []string{"consul", "client", "rpc", "error", "catalog_deregister"}, + Help: "", + }, + { + Name: []string{"consul", "client", "api", "success", "catalog_nodes"}, + Help: "", + }, + { + Name: []string{"consul", "client", "rpc", "error", "catalog_nodes"}, + Help: "", + }, + { + Name: []string{"consul", "client", "api", "success", "catalog_deregister"}, + Help: "", + }, + { + Name: []string{"consul", "client", "rpc", "error", "catalog_datacenters"}, + Help: "", + }, + { + Name: []string{"consul", "client", "api", "success", "catalog_datacenters"}, + Help: "", + }, + { + Name: []string{"consul", "client", "api", "catalog_nodes"}, + Help: "", + }, + { + Name: []string{"consul", "client", "api", "catalog_services"}, + Help: "", + }, + { + Name: []string{"consul", "client", "rpc", "error", "catalog_services"}, + Help: "", + }, + { + Name: []string{"consul", "client", "api", "success", "catalog_services"}, + Help: "", + }, + { + Name: []string{"consul", "client", "api", "catalog_service_nodes"}, + Help: "", + }, + { + Name: []string{"consul", "client", "rpc", "error", "catalog_service_nodes"}, + Help: "", + }, + { + Name: []string{"consul", "client", "api", "success", "catalog_service_nodes"}, + Help: "", + }, + { + Name: []string{"consul", "client", "api", "error", "catalog_service_nodes"}, + Help: "", + }, + { + Name: []string{"consul", "client", "api", "catalog_node_services"}, + Help: "", + }, + { + Name: []string{"consul", "client", "api", "success", "catalog_node_services"}, + Help: "", + }, + { + Name: []string{"consul", "client", "rpc", "error", "catalog_node_services"}, + Help: "", + }, + { + Name: []string{"consul", "client", "api", "catalog_node_service_list"}, + Help: "", + }, + { + Name: []string{"consul", "client", "rpc", 
"error", "catalog_node_service_list"}, + Help: "", + }, + { + Name: []string{"consul", "client", "api", "success", "catalog_node_service_list"}, + Help: "", + }, + { + Name: []string{"consul", "client", "api", "catalog_gateway_services"}, + Help: "", + }, + { + Name: []string{"consul", "client", "rpc", "error", "catalog_gateway_services"}, + Help: "", + }, + { + Name: []string{"consul", "client", "api", "success", "catalog_gateway_services"}, + Help: "", + }, +} + func (s *HTTPHandlers) CatalogRegister(resp http.ResponseWriter, req *http.Request) (interface{}, error) { metrics.IncrCounterWithLabels([]string{"client", "api", "catalog_register"}, 1, []metrics.Label{{Name: "node", Value: s.nodeName()}}) diff --git a/agent/consul/acl.go b/agent/consul/acl.go index 7796c3756..d99b588e4 100644 --- a/agent/consul/acl.go +++ b/agent/consul/acl.go @@ -6,7 +6,8 @@ import ( "sync" "time" - metrics "github.com/armon/go-metrics" + "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/logging" @@ -15,6 +16,32 @@ import ( "golang.org/x/time/rate" ) +var ACLCounters = []prometheus.CounterDefinition{ + { + Name: []string{"consul", "acl", "token", "cache_hit"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "token", "cache_miss"}, + Help: "", + }, +} + +var ACLSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"consul", "acl", "resolveTokenLegacy"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "ResolveToken"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "ResolveTokenToIdentity"}, + Help: "", + }, +} + // These must be kept in sync with the constants in command/agent/acl.go. 
const ( // anonymousToken is the token ID we re-write to if there is no token ID diff --git a/agent/consul/acl_endpoint.go b/agent/consul/acl_endpoint.go index ccc9e1b2a..10c879467 100644 --- a/agent/consul/acl_endpoint.go +++ b/agent/consul/acl_endpoint.go @@ -11,7 +11,8 @@ import ( "regexp" "time" - metrics "github.com/armon/go-metrics" + "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/consul/authmethod" "github.com/hashicorp/consul/agent/consul/state" @@ -30,6 +31,62 @@ const ( aclBootstrapReset = "acl-bootstrap-reset" ) +// ACLEndpointSummaries lists the Prometheus summary definitions declared for the ACL endpoint, one entry per metric name. +var ACLEndpointSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"consul", "acl", "token", "clone"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "token", "upsert"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "token", "delete"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "policy", "upsert"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "policy", "delete"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "role", "upsert"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "role", "delete"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "bindingrule", "upsert"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "bindingrule", "delete"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "authmethod", "upsert"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "authmethod", "delete"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "login"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "logout"}, + Help: "", + }, +} + // Regex for matching var ( validPolicyName = regexp.MustCompile(`^[A-Za-z0-9\-_]{1,128}$`) diff --git
a/agent/consul/autopilot.go b/agent/consul/autopilot.go index dc5aa5da7..7bd8bc258 100644 --- a/agent/consul/autopilot.go +++ b/agent/consul/autopilot.go @@ -5,6 +5,7 @@ import ( "fmt" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/agent/metadata" "github.com/hashicorp/consul/types" "github.com/hashicorp/raft" @@ -12,6 +13,17 @@ import ( "github.com/hashicorp/serf/serf" ) +var AutopilotGauges = []prometheus.GaugeDefinition{ + { + Name: []string{"consul", "autopilot", "failure_tolerance"}, + Help: "", + }, + { + Name: []string{"consul", "autopilot", "healthy"}, + Help: "This tracks the overall health of the local server cluster. 1 if all servers are healthy, 0 if one or more are unhealthy.", + }, +} + // AutopilotDelegate is a Consul delegate for autopilot operations. type AutopilotDelegate struct { server *Server diff --git a/agent/consul/catalog_endpoint.go b/agent/consul/catalog_endpoint.go index 04be323cb..f8b41c97d 100644 --- a/agent/consul/catalog_endpoint.go +++ b/agent/consul/catalog_endpoint.go @@ -6,6 +6,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/structs" @@ -17,6 +18,52 @@ import ( "github.com/hashicorp/go-uuid" ) +var CatalogCounters = []prometheus.CounterDefinition{ + { + Name: []string{"consul", "catalog", "service", "query"}, + Help: "", + }, + { + Name: []string{"consul", "catalog", "connect", "query"}, + Help: "", + }, + { + Name: []string{"consul", "catalog", "service", "query-tag"}, + Help: "", + }, + { + Name: []string{"consul", "catalog", "connect", "query-tag"}, + Help: "", + }, + { + Name: []string{"consul", "catalog", "service", "query-tags"}, + Help: "", + }, + { + Name: []string{"consul", "catalog", "connect", "query-tags"}, + Help: "", + }, + { + Name: []string{"consul", "catalog", "service", "not-found"}, + Help: "", + 
}, + { + Name: []string{"consul", "catalog", "connect", "not-found"}, + Help: "", + }, +} + +var CatalogSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"consul", "catalog", "deregister"}, + Help: "", + }, + { + Name: []string{"consul", "catalog", "register"}, + Help: "", + }, +} + // Catalog endpoint is used to manipulate the service catalog type Catalog struct { srv *Server diff --git a/agent/consul/client.go b/agent/consul/client.go index b4cf90759..6f20d6a02 100644 --- a/agent/consul/client.go +++ b/agent/consul/client.go @@ -9,6 +9,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/agent/pool" "github.com/hashicorp/consul/agent/router" "github.com/hashicorp/consul/agent/structs" @@ -21,6 +22,21 @@ import ( "golang.org/x/time/rate" ) +var ClientCounters = []prometheus.CounterDefinition{ + { + Name: []string{"consul", "client", "rpc"}, + Help: "Increments whenever a Consul agent in client mode makes an RPC request to a Consul server.", + }, + { + Name: []string{"consul", "client", "rpc", "exceeded"}, + Help: "Increments whenever a Consul agent in client mode makes an RPC request to a Consul server gets rate limited by that agent's limits configuration.", + }, + { + Name: []string{"consul", "client", "rpc", "failed"}, + Help: "Increments whenever a Consul agent in client mode makes an RPC request to a Consul server and fails.", + }, +} + const ( // serfEventBacklog is the maximum number of unprocessed Serf Events // that will be held in queue before new serf events block. 
A diff --git a/agent/consul/config_endpoint.go b/agent/consul/config_endpoint.go index dc56faf94..41cf16dc2 100644 --- a/agent/consul/config_endpoint.go +++ b/agent/consul/config_endpoint.go @@ -4,6 +4,8 @@ import ( "fmt" "time" + "github.com/armon/go-metrics/prometheus" + metrics "github.com/armon/go-metrics" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/consul/state" @@ -12,6 +14,33 @@ import ( "github.com/mitchellh/copystructure" ) +var ConfigSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"consul", "config_entry", "apply"}, + Help: "", + }, + { + Name: []string{"consul", "config_entry", "get"}, + Help: "", + }, + { + Name: []string{"consul", "config_entry", "list"}, + Help: "", + }, + { + Name: []string{"consul", "config_entry", "listAll"}, + Help: "", + }, + { + Name: []string{"consul", "config_entry", "delete"}, + Help: "", + }, + { + Name: []string{"consul", "config_entry", "resolve_service_config"}, + Help: "", + }, +} + // The ConfigEntry endpoint is used to query centralized config information type ConfigEntry struct { srv *Server diff --git a/agent/consul/federation_state_endpoint.go b/agent/consul/federation_state_endpoint.go index a98ab83e8..2a71ed3f8 100644 --- a/agent/consul/federation_state_endpoint.go +++ b/agent/consul/federation_state_endpoint.go @@ -5,13 +5,33 @@ import ( "fmt" "time" - metrics "github.com/armon/go-metrics" + "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/structs" memdb "github.com/hashicorp/go-memdb" ) +var FederationStateSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"consul", "federation_state", "apply"}, + Help: "", + }, + { + Name: []string{"consul", "federation_state", "get"}, + Help: "", + }, + { + Name: []string{"consul", "federation_state", "list"}, + Help: "", + }, + { + Name: []string{"consul", "federation_state", 
"list_mesh_gateways"}, + Help: "", + }, +} + var ( errFederationStatesNotEnabled = errors.New("Federation states are currently disabled until all servers in the datacenter support the feature") ) diff --git a/agent/consul/intention_endpoint.go b/agent/consul/intention_endpoint.go index 95cb5183d..592ba3492 100644 --- a/agent/consul/intention_endpoint.go +++ b/agent/consul/intention_endpoint.go @@ -6,6 +6,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/connect" "github.com/hashicorp/consul/agent/consul/state" @@ -16,6 +17,13 @@ import ( "github.com/hashicorp/go-memdb" ) +var IntentionSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"consul", "intention", "apply"}, + Help: "", + }, +} + var ( // ErrIntentionNotFound is returned if the intention lookup failed. ErrIntentionNotFound = errors.New("Intention not found") @@ -252,6 +260,7 @@ func (s *Intention) Apply( if done, err := s.srv.ForwardRPC("Intention.Apply", args, args, reply); done { return err } + // TODO(Kit): Why do we have summaries for intentions both with and without the consul namespace? 
defer metrics.MeasureSince([]string{"consul", "intention", "apply"}, time.Now()) defer metrics.MeasureSince([]string{"intention", "apply"}, time.Now()) diff --git a/agent/consul/kvs_endpoint.go b/agent/consul/kvs_endpoint.go index 04dee57b6..8ded7366a 100644 --- a/agent/consul/kvs_endpoint.go +++ b/agent/consul/kvs_endpoint.go @@ -6,6 +6,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/structs" @@ -14,6 +15,13 @@ import ( "github.com/hashicorp/go-memdb" ) +var KVSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"consul", "kvs", "apply"}, + Help: "This measures the time it takes to complete an update to the KV store.", + }, +} + // KVS endpoint is used to manipulate the Key-Value store type KVS struct { srv *Server diff --git a/agent/consul/prepared_query_endpoint.go b/agent/consul/prepared_query_endpoint.go index bb13ff3cb..df484b2af 100644 --- a/agent/consul/prepared_query_endpoint.go +++ b/agent/consul/prepared_query_endpoint.go @@ -6,6 +6,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/structs" @@ -15,6 +16,25 @@ import ( "github.com/hashicorp/go-uuid" ) +var PreparedQuerySummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"consul", "prepared-query", "apply"}, + Help: "", + }, + { + Name: []string{"consul", "prepared-query", "explain"}, + Help: "", + }, + { + Name: []string{"consul", "prepared-query", "execute"}, + Help: "", + }, + { + Name: []string{"consul", "prepared-query", "execute_remote"}, + Help: "", + }, +} + // PreparedQuery manages the prepared query endpoint. 
type PreparedQuery struct { srv *Server diff --git a/agent/consul/rpc.go b/agent/consul/rpc.go index ac1096292..58e7e8210 100644 --- a/agent/consul/rpc.go +++ b/agent/consul/rpc.go @@ -13,6 +13,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/consul/wanfed" @@ -31,6 +32,47 @@ import ( "github.com/hashicorp/yamux" ) +var RPCCounters = []prometheus.CounterDefinition{ + { + Name: []string{"consul", "rpc", "accept_conn"}, + Help: "", + }, + { + Name: []string{"consul", "rpc", "raft_handoff"}, + Help: "", + }, + { + Name: []string{"consul", "rpc", "request_error"}, + Help: "", + }, + { + Name: []string{"consul", "rpc", "request"}, + Help: "", + }, + { + Name: []string{"consul", "rpc", "cross-dc"}, + Help: "", + }, + { + Name: []string{"consul", "rpc", "query"}, + Help: "", + }, +} + +var RPCGauges = []prometheus.GaugeDefinition{ + { + Name: []string{"consul", "rpc", "queries_blocking"}, + Help: "", + }, +} + +var RPCSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"consul", "rpc", "consistentRead"}, + Help: "", + }, +} + const ( // jitterFraction is a the limit to the amount of jitter we apply // to a user specified MaxQueryTime. We divide the specified time by diff --git a/agent/consul/server.go b/agent/consul/server.go index 13fece406..5db589d3a 100644 --- a/agent/consul/server.go +++ b/agent/consul/server.go @@ -17,7 +17,7 @@ import ( "sync/atomic" "time" - metrics "github.com/armon/go-metrics" + "github.com/armon/go-metrics" connlimit "github.com/hashicorp/go-connlimit" "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-memdb" @@ -50,6 +50,8 @@ import ( "github.com/hashicorp/consul/types" ) +// NOTE The "consul.client.rpc" and "consul.client.rpc.exceeded" counters are defined in consul/client.go + // These are the protocol versions that Consul can _understand_. 
These are // Consul-level protocol versions, that are used to configure the Serf // protocol versions. diff --git a/agent/consul/session_ttl.go b/agent/consul/session_ttl.go index 4afdc0e38..db4447c50 100644 --- a/agent/consul/session_ttl.go +++ b/agent/consul/session_ttl.go @@ -4,10 +4,34 @@ import ( "fmt" "time" + "github.com/armon/go-metrics/prometheus" + "github.com/armon/go-metrics" "github.com/hashicorp/consul/agent/structs" ) +var SessionGauges = []prometheus.GaugeDefinition{ + { + Name: []string{"consul", "session_ttl", "active"}, + Help: "", + }, + { + Name: []string{"consul", "raft", "applied_index"}, + Help: "", + }, + { + Name: []string{"consul", "raft", "last_index"}, + Help: "", + }, +} + +var SessionSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"consul", "session_ttl", "invalidate"}, + Help: "", + }, +} + const ( // maxInvalidateAttempts limits how many invalidate attempts are made maxInvalidateAttempts = 6 diff --git a/agent/consul/txn_endpoint.go b/agent/consul/txn_endpoint.go index 9819d6370..42539a991 100644 --- a/agent/consul/txn_endpoint.go +++ b/agent/consul/txn_endpoint.go @@ -5,12 +5,24 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/api" "github.com/hashicorp/go-hclog" ) +var TxnSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"consul", "txn", "apply"}, + Help: "This measures the time spent applying a transaction operation.", + }, + { + Name: []string{"consul", "txn", "read"}, + Help: "", + }, +} + // Txn endpoint is used to perform multi-object atomic transactions. 
type Txn struct { srv *Server diff --git a/agent/consul/usagemetrics/usagemetrics.go b/agent/consul/usagemetrics/usagemetrics.go index 259c6646e..fc8d9ce90 100644 --- a/agent/consul/usagemetrics/usagemetrics.go +++ b/agent/consul/usagemetrics/usagemetrics.go @@ -5,12 +5,29 @@ import ( "errors" "time" + "github.com/armon/go-metrics/prometheus" + "github.com/armon/go-metrics" "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/logging" "github.com/hashicorp/go-hclog" ) +var Gauges = []prometheus.GaugeDefinition{ + { + Name: []string{"consul", "state", "nodes"}, + Help: "", + }, + { + Name: []string{"consul", "state", "services"}, + Help: "", + }, + { + Name: []string{"consul", "state", "service_instances"}, + Help: "", + }, +} + // Config holds the settings for various parameters for the // UsageMetricsReporter type Config struct { diff --git a/agent/dns.go b/agent/dns.go index a9063e26f..6d541aeaa 100644 --- a/agent/dns.go +++ b/agent/dns.go @@ -10,6 +10,8 @@ import ( "sync/atomic" "time" + "github.com/armon/go-metrics/prometheus" + metrics "github.com/armon/go-metrics" radix "github.com/armon/go-radix" "github.com/coredns/coredns/plugin/pkg/dnsutil" @@ -26,6 +28,24 @@ import ( "github.com/hashicorp/consul/logging" ) +var DNSCounters = []prometheus.CounterDefinition{ + { + Name: []string{"dns", "stale_queries"}, + Help: "", + }, +} + +var DNSSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"dns", "ptr_query"}, + Help: "", + }, + { + Name: []string{"dns", "domain_query"}, + Help: "", + }, +} + const ( // UDP can fit ~25 A records in a 512B response, and ~14 AAAA // records. 
Limit further to prevent unintentional configuration diff --git a/agent/grpc/stats.go b/agent/grpc/stats.go index add3195f1..7f732316f 100644 --- a/agent/grpc/stats.go +++ b/agent/grpc/stats.go @@ -5,11 +5,48 @@ import ( "sync/atomic" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "google.golang.org/grpc" "google.golang.org/grpc/stats" ) var defaultMetrics = metrics.Default() +var StatsGauges = []prometheus.GaugeDefinition{ + { + Name: []string{"consul", "grpc", "server", "connections"}, + Help: "", + }, + { + Name: []string{"consul", "grpc", "client", "connections"}, + Help: "", + }, + { + Name: []string{"consul", "grpc", "server", "streams"}, + Help: "", + }, +} +var StatsCounters = []prometheus.CounterDefinition{ + { + Name: []string{"consul", "grpc", "client", "request", "count"}, + Help: "", + }, + { + Name: []string{"consul", "grpc", "server", "request", "count"}, + Help: "", + }, + { + Name: []string{"consul", "grpc", "client", "connection", "count"}, + Help: "", + }, + { + Name: []string{"consul", "grpc", "server", "connection", "count"}, + Help: "", + }, + { + Name: []string{"consul", "grpc", "server", "stream", "count"}, + Help: "", + }, +} // statsHandler is a grpc/stats.StatsHandler which emits connection and // request metrics to go-metrics. diff --git a/agent/http.go b/agent/http.go index 10233fa6b..244b9522b 100644 --- a/agent/http.go +++ b/agent/http.go @@ -17,6 +17,7 @@ import ( "github.com/NYTimes/gziphandler" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/cache" "github.com/hashicorp/consul/agent/config" @@ -31,6 +32,13 @@ import ( "github.com/pkg/errors" ) +var HTTPSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"consul", "api", "http"}, + Help: "", + }, +} + // MethodNotAllowedError should be returned by a handler when the HTTP method is not allowed. 
type MethodNotAllowedError struct { Method string diff --git a/agent/local/state.go b/agent/local/state.go index be0c481f3..d145f97d0 100644 --- a/agent/local/state.go +++ b/agent/local/state.go @@ -9,8 +9,8 @@ import ( "sync/atomic" "time" - metrics "github.com/armon/go-metrics" - + "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/token" @@ -20,6 +20,29 @@ import ( "github.com/hashicorp/go-hclog" ) +var StateCounters = []prometheus.CounterDefinition{ + { + Name: []string{"consul", "acl", "blocked", "service", "deregistration"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "blocked", "check", "deregistration"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "blocked", "service", "registration"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "blocked", "check", "registration"}, + Help: "", + }, + { + Name: []string{"consul", "acl", "blocked", "node", "registration"}, + Help: "", + }, +} + const fullSyncReadMaxStale = 2 * time.Second // Config is the configuration for the State. 
diff --git a/agent/setup.go b/agent/setup.go index 96265ef24..4297a3089 100644 --- a/agent/setup.go +++ b/agent/setup.go @@ -8,6 +8,10 @@ import ( "sync" "time" + "github.com/armon/go-metrics/prometheus" + "github.com/hashicorp/consul/agent/consul/usagemetrics" + "github.com/hashicorp/consul/agent/local" + "github.com/hashicorp/go-hclog" "google.golang.org/grpc/grpclog" grpcresolver "google.golang.org/grpc/resolver" @@ -72,7 +76,7 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer) (BaseDeps, error) return d, fmt.Errorf("failed to setup node ID: %w", err) } - d.MetricsHandler, err = lib.InitTelemetry(cfg.Telemetry) + d.MetricsHandler, err = lib.InitTelemetry(cfg.Telemetry, getPrometheusDefs()) if err != nil { return d, fmt.Errorf("failed to initialize telemetry: %w", err) } @@ -177,3 +181,91 @@ func registerWithGRPC(b grpcresolver.Builder) { defer registerLock.Unlock() grpcresolver.Register(b) } + +// getPrometheusDefs reaches into every slice of prometheus defs we've defined in each part of the agent, and appends +// all of our slices into one nice slice of definitions per metric type for the Consul agent to pass to go-metrics. +func getPrometheusDefs() lib.PrometheusDefs { + var gauges = [][]prometheus.GaugeDefinition{ + consul.AutopilotGauges, + consul.RPCGauges, + consul.SessionGauges, + grpc.StatsGauges, + usagemetrics.Gauges, + } + var gaugeDefs []prometheus.GaugeDefinition + for _, g := range gauges { + gaugeDefs = append(gaugeDefs, g...) + } + + raftCounters := []prometheus.CounterDefinition{ + // TODO(kit): "consul.raft..." metrics come from the raft lib and we should migrate these to a telemetry + // package within. In the mean time, we're going to define them here because it's important that they're always + // present for Consul users setting up dashboards. 
+ { + Name: []string{"consul", "raft", "apply"}, + Help: "This counts the number of Raft transactions occurring over the interval.", + }, + { + Name: []string{"consul", "raft", "state", "candidate"}, + Help: "This increments whenever a Consul server starts an election.", + }, + { + Name: []string{"consul", "raft", "state", "leader"}, + Help: "This increments whenever a Consul server becomes a leader.", + }, + } + + var counters = [][]prometheus.CounterDefinition{ + CatalogCounters, + consul.ACLCounters, + consul.CatalogCounters, + consul.ClientCounters, + consul.RPCCounters, + grpc.StatsCounters, + local.StateCounters, + raftCounters, + } + var counterDefs []prometheus.CounterDefinition + for _, c := range counters { + counterDefs = append(counterDefs, c...) + } + + raftSummaries := []prometheus.SummaryDefinition{ + // TODO(kit): "consul.raft..." metrics come from the raft lib and we should migrate these to a telemetry + // package within. In the mean time, we're going to define them here because it's important that they're always + // present for Consul users setting up dashboards. + { + Name: []string{"consul", "raft", "commitTime"}, + Help: "This measures the time it takes to commit a new entry to the Raft log on the leader.", + }, + { + Name: []string{"consul", "raft", "leader", "lastContact"}, + Help: "Measures the time since the leader was last able to contact the follower nodes when checking its leader lease.", + }, + } + + var summaries = [][]prometheus.SummaryDefinition{ + HTTPSummaries, + consul.ACLSummaries, + consul.ACLEndpointSummaries, + consul.CatalogSummaries, + consul.FederationStateSummaries, + consul.IntentionSummaries, + consul.KVSummaries, + consul.PreparedQuerySummaries, + consul.RPCSummaries, + consul.SessionSummaries, + consul.TxnSummaries, + raftSummaries, + } + var summaryDefs []prometheus.SummaryDefinition + for _, s := range summaries { + summaryDefs = append(summaryDefs, s...) 
+ } + + return lib.PrometheusDefs{ + Gauges: gaugeDefs, + Counters: counterDefs, + Summaries: summaryDefs, + } +} diff --git a/connect/proxy/proxy.go b/connect/proxy/proxy.go index 9dc27a06f..54df4e309 100644 --- a/connect/proxy/proxy.go +++ b/connect/proxy/proxy.go @@ -54,7 +54,9 @@ func (p *Proxy) Serve() error { // Initial setup // Setup telemetry if configured - _, err := lib.InitTelemetry(newCfg.Telemetry) + // NOTE(kit): As far as I can tell, all of the metrics in the proxy are generated at runtime, so we + // don't have any static metrics we initialize at start. + _, err := lib.InitTelemetry(newCfg.Telemetry, lib.EmptyPrometheusDefs()) if err != nil { p.logger.Error("proxy telemetry config error", "error", err) } diff --git a/lib/telemetry.go b/lib/telemetry.go index 33f7d2100..fe360172a 100644 --- a/lib/telemetry.go +++ b/lib/telemetry.go @@ -276,79 +276,17 @@ func dogstatdSink(cfg TelemetryConfig, hostname string) (metrics.MetricSink, err return sink, nil } -func prometheusSink(cfg TelemetryConfig, hostname string) (metrics.MetricSink, error) { +func prometheusSink(cfg TelemetryConfig, hostname string, defs PrometheusDefs) (metrics.MetricSink, error) { + if cfg.PrometheusRetentionTime.Nanoseconds() < 1 { return nil, nil } - // TODO(kit) define these in vars in the package/file they're used - gaugeDefs := []prometheus.GaugeDefinition{ - { - Name: []string{"consul", "autopilot", "healthy"}, - Help: "This tracks the overall health of the local server cluster. 
1 if all servers are healthy, 0 if one or more are unhealthy.", - }, - } - - // TODO(kit) define these in vars in the package/file they're used - counterDefs := []prometheus.CounterDefinition{ - { - Name: []string{"consul", "raft", "apply"}, - Help: "This counts the number of Raft transactions occurring over the interval.", - }, - { - Name: []string{"consul", "raft", "state", "candidate"}, - Help: "This increments whenever a Consul server starts an election.", - }, - { - Name: []string{"consul", "raft", "state", "leader"}, - Help: "This increments whenever a Consul server becomes a leader.", - }, - { - Name: []string{"consul", "client", "api", "catalog_register"}, - Help: "Increments whenever a Consul agent receives a catalog register request.", - }, - { - Name: []string{"consul", "runtime", "total_gc_pause_ns"}, - Help: "Number of nanoseconds consumed by stop-the-world garbage collection (GC) pauses since Consul started.", - }, - { - Name: []string{"consul", "client", "rpc"}, - Help: "Increments whenever a Consul agent in client mode makes an RPC request to a Consul server.", - }, - { - Name: []string{"consul", "client", "rpc", "exceeded"}, - Help: "Increments whenever a Consul agent in client mode makes an RPC request to a Consul server gets rate limited by that agent's limits configuration.", - }, - { - Name: []string{"consul", "client", "rpc", "failed"}, - Help: "Increments whenever a Consul agent in client mode makes an RPC request to a Consul server and fails.", - }, - } - - // TODO(kit) define these in vars in the package/file they're used - summaryDefs := []prometheus.SummaryDefinition{ - { - Name: []string{"consul", "kvs", "apply"}, - Help: "This measures the time it takes to complete an update to the KV store.", - }, - { - Name: []string{"consul", "txn", "apply"}, - Help: "This measures the time spent applying a transaction operation.", - }, - { - Name: []string{"consul", "raft", "commitTime"}, - Help: "This measures the time it takes to commit a new 
entry to the Raft log on the leader.", - }, - { - Name: []string{"consul", "raft", "leader", "lastContact"}, - Help: "Measures the time since the leader was last able to contact the follower nodes when checking its leader lease.", - }, - } prometheusOpts := prometheus.PrometheusOpts{ Expiration: cfg.PrometheusRetentionTime, - GaugeDefinitions: gaugeDefs, - CounterDefinitions: counterDefs, - SummaryDefinitions: summaryDefs, + GaugeDefinitions: defs.Gauges, + CounterDefinitions: defs.Counters, + SummaryDefinitions: defs.Summaries, } sink, err := prometheus.NewPrometheusSinkFrom(prometheusOpts) if err != nil { @@ -399,9 +337,25 @@ func circonusSink(cfg TelemetryConfig, hostname string) (metrics.MetricSink, err return sink, nil } +// PrometheusDefs wraps collections of metric definitions to pass into the PrometheusSink +type PrometheusDefs struct { + Gauges []prometheus.GaugeDefinition + Counters []prometheus.CounterDefinition + Summaries []prometheus.SummaryDefinition +} + +// EmptyPrometheusDefs returns a PrometheusDefs struct where each of the slices have zero elements, but not nil. +func EmptyPrometheusDefs() PrometheusDefs { + return PrometheusDefs{ + Gauges: []prometheus.GaugeDefinition{}, + Counters: []prometheus.CounterDefinition{}, + Summaries: []prometheus.SummaryDefinition{}, + } +} + // InitTelemetry configures go-metrics based on map of telemetry config // values as returned by Runtimecfg.Config(). 
-func InitTelemetry(cfg TelemetryConfig) (*metrics.InmemSink, error) { +func InitTelemetry(cfg TelemetryConfig, defs PrometheusDefs) (*metrics.InmemSink, error) { if cfg.Disable { return nil, nil } @@ -440,9 +394,15 @@ func InitTelemetry(cfg TelemetryConfig) (*metrics.InmemSink, error) { if err := addSink(circonusSink); err != nil { return nil, err } - if err := addSink(prometheusSink); err != nil { + + promSink, err := prometheusSink(cfg, metricsConf.HostName, defs) + if err != nil { return nil, err } + // prometheusSink yields a nil sink (and nil error) when the retention time is under 1ns; keep nil out of sinks. + if promSink != nil { + sinks = append(sinks, promSink) + } if len(sinks) > 0 { sinks = append(sinks, memSink)