From e454a9aae0d7aa9d93089088f39949b2014bbb60 Mon Sep 17 00:00:00 2001 From: Paul Ewing Date: Thu, 3 Jun 2021 08:25:53 -0700 Subject: [PATCH] usagemetrics: add cluster members to metrics API (#10340) This PR adds cluster members to the metrics API. The number of members per segment are reported as well as the total number of members. Tested by running a multi-node cluster locally and ensuring the numbers were correct. Also added unit test coverage to add the new expected gauges to existing test cases. --- .changelog/10340.txt | 3 + agent/consul/server.go | 12 ++- agent/consul/usagemetrics/usagemetrics.go | 86 +++++++++++++++++++ .../usagemetrics/usagemetrics_oss_test.go | 85 ++++++++++++++++-- .../consul/usagemetrics/usagemetrics_test.go | 35 +++++++- website/content/docs/agent/telemetry.mdx | 10 ++- 6 files changed, 218 insertions(+), 13 deletions(-) create mode 100644 .changelog/10340.txt diff --git a/.changelog/10340.txt b/.changelog/10340.txt new file mode 100644 index 000000000..ff2a882a2 --- /dev/null +++ b/.changelog/10340.txt @@ -0,0 +1,3 @@ +```release-note:improvement +telemetry: The usage data in the `metrics` API now includes cluster member counts, reporting clients on a per segment basis. +``` diff --git a/agent/consul/server.go b/agent/consul/server.go index a53916aa4..08677a6b1 100644 --- a/agent/consul/server.go +++ b/agent/consul/server.go @@ -569,7 +569,15 @@ func NewServer(config *Config, flat Deps) (*Server, error) { WithStateProvider(s.fsm). WithLogger(s.logger). WithDatacenter(s.config.Datacenter). - WithReportingInterval(s.config.MetricsReportingInterval), + WithReportingInterval(s.config.MetricsReportingInterval). 
+ WithGetMembersFunc(func() []serf.Member { + members, err := s.LANMembersAllSegments() + if err != nil { + return []serf.Member{} + } + + return members + }), ) if err != nil { s.Shutdown() @@ -1138,7 +1146,7 @@ func (s *Server) LANMembers() []serf.Member { return s.serfLAN.Members() } -// WANMembers is used to return the members of the LAN cluster +// WANMembers is used to return the members of the WAN cluster func (s *Server) WANMembers() []serf.Member { if s.serfWAN == nil { return nil diff --git a/agent/consul/usagemetrics/usagemetrics.go b/agent/consul/usagemetrics/usagemetrics.go index da09890e5..353e9a45d 100644 --- a/agent/consul/usagemetrics/usagemetrics.go +++ b/agent/consul/usagemetrics/usagemetrics.go @@ -11,6 +11,7 @@ import ( "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/logging" "github.com/hashicorp/go-hclog" + "github.com/hashicorp/serf/serf" ) var Gauges = []prometheus.GaugeDefinition{ @@ -26,8 +27,18 @@ var Gauges = []prometheus.GaugeDefinition{ Name: []string{"consul", "state", "service_instances"}, Help: "Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0.", }, + { + Name: []string{"consul", "members", "clients"}, + Help: "Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.", + }, + { + Name: []string{"consul", "members", "servers"}, + Help: "Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. 
Added in v1.9.6.", + }, } +type getMembersFunc func() []serf.Member + // Config holds the settings for various parameters for the // UsageMetricsReporter type Config struct { @@ -35,6 +46,7 @@ type Config struct { metricLabels []metrics.Label stateProvider StateProvider tickerInterval time.Duration + getMembersFunc getMembersFunc } // WithDatacenter adds the datacenter as a label to all metrics emitted by the @@ -63,6 +75,12 @@ func (c *Config) WithStateProvider(sp StateProvider) *Config { return c } +// WithGetMembersFunc specifies the function used to identify cluster members +func (c *Config) WithGetMembersFunc(fn getMembersFunc) *Config { + c.getMembersFunc = fn + return c +} + // StateProvider defines an inteface for retrieving a state.Store handle. In // non-test code, this is satisfied by the fsm.FSM struct. type StateProvider interface { @@ -77,6 +95,7 @@ type UsageMetricsReporter struct { metricLabels []metrics.Label stateProvider StateProvider tickerInterval time.Duration + getMembersFunc getMembersFunc } func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) { @@ -84,6 +103,10 @@ func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) { return nil, errors.New("must provide a StateProvider to usage reporter") } + if cfg.getMembersFunc == nil { + return nil, errors.New("must provide a getMembersFunc to usage reporter") + } + if cfg.logger == nil { cfg.logger = hclog.NewNullLogger() } @@ -98,6 +121,7 @@ func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) { stateProvider: cfg.stateProvider, metricLabels: cfg.metricLabels, tickerInterval: cfg.tickerInterval, + getMembersFunc: cfg.getMembersFunc, } return u, nil @@ -137,4 +161,66 @@ func (u *UsageMetricsReporter) runOnce() { } u.emitServiceUsage(serviceUsage) + + servers, clients := u.memberUsage() + u.emitMemberUsage(servers, clients) +} + +func (u *UsageMetricsReporter) memberUsage() (int, map[string]int) { + if u.getMembersFunc == nil { + return 0, nil 
+ } + + mems := u.getMembersFunc() + if len(mems) <= 0 { + u.logger.Warn("cluster reported zero members") + return 0, nil + } + + servers := 0 + clients := make(map[string]int) + + for _, m := range mems { + if m.Status != serf.StatusAlive { + continue + } + + switch m.Tags["role"] { + case "node": + clients[m.Tags["segment"]]++ + case "consul": + servers++ + } + } + + return servers, clients +} + +func (u *UsageMetricsReporter) emitMemberUsage(servers int, clients map[string]int) { + totalClients := 0 + + for seg, c := range clients { + segmentLabel := metrics.Label{Name: "segment", Value: seg} + labels := append([]metrics.Label{segmentLabel}, u.metricLabels...) + + metrics.SetGaugeWithLabels( + []string{"consul", "members", "clients"}, + float32(c), + labels, + ) + + totalClients += c + } + + metrics.SetGaugeWithLabels( + []string{"consul", "members", "clients"}, + float32(totalClients), + u.metricLabels, + ) + + metrics.SetGaugeWithLabels( + []string{"consul", "members", "servers"}, + float32(servers), + u.metricLabels, + ) } diff --git a/agent/consul/usagemetrics/usagemetrics_oss_test.go b/agent/consul/usagemetrics/usagemetrics_oss_test.go index d4919914f..e23201435 100644 --- a/agent/consul/usagemetrics/usagemetrics_oss_test.go +++ b/agent/consul/usagemetrics/usagemetrics_oss_test.go @@ -12,6 +12,7 @@ import ( "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/sdk/testutil" + "github.com/hashicorp/serf/serf" ) func newStateStore() (*state.Store, error) { @@ -21,6 +22,7 @@ func newStateStore() (*state.Store, error) { func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) { type testCase struct { modfiyStateStore func(t *testing.T, s *state.Store) + getMembersFunc getMembersFunc expectedGauges map[string]metrics.GaugeValue } cases := map[string]testCase{ @@ -45,24 +47,64 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) { {Name: "datacenter", Value: "dc1"}, }, }, + 
"consul.usage.test.consul.members.clients;datacenter=dc1": { + Name: "consul.usage.test.consul.members.clients", + Value: 0, + Labels: []metrics.Label{ + {Name: "datacenter", Value: "dc1"}, + }, + }, + "consul.usage.test.consul.members.servers;datacenter=dc1": { + Name: "consul.usage.test.consul.members.servers", + Value: 0, + Labels: []metrics.Label{ + {Name: "datacenter", Value: "dc1"}, + }, + }, }, + getMembersFunc: func() []serf.Member { return []serf.Member{} }, }, "nodes-and-services": { modfiyStateStore: func(t *testing.T, s *state.Store) { require.Nil(t, s.EnsureNode(1, &structs.Node{Node: "foo", Address: "127.0.0.1"})) require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"})) require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"})) + require.Nil(t, s.EnsureNode(4, &structs.Node{Node: "qux", Address: "127.0.0.3"})) // Typical services and some consul services spread across two nodes - require.Nil(t, s.EnsureService(4, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000})) - require.Nil(t, s.EnsureService(5, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000})) - require.Nil(t, s.EnsureService(6, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil})) - require.Nil(t, s.EnsureService(7, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil})) + require.Nil(t, s.EnsureService(5, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000})) + require.Nil(t, s.EnsureService(6, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000})) + require.Nil(t, s.EnsureService(7, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil})) + require.Nil(t, s.EnsureService(8, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil})) + }, + getMembersFunc: func() []serf.Member { + return []serf.Member{ + { + Name: "foo", + 
Tags: map[string]string{"role": "consul"}, + Status: serf.StatusAlive, + }, + { + Name: "bar", + Tags: map[string]string{"role": "consul"}, + Status: serf.StatusAlive, + }, + { + Name: "baz", + Tags: map[string]string{"role": "node", "segment": "a"}, + Status: serf.StatusAlive, + }, + { + Name: "qux", + Tags: map[string]string{"role": "node", "segment": "b"}, + Status: serf.StatusAlive, + }, + } }, expectedGauges: map[string]metrics.GaugeValue{ "consul.usage.test.consul.state.nodes;datacenter=dc1": { Name: "consul.usage.test.consul.state.nodes", - Value: 3, + Value: 4, Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, }, "consul.usage.test.consul.state.services;datacenter=dc1": { @@ -79,6 +121,36 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) { {Name: "datacenter", Value: "dc1"}, }, }, + "consul.usage.test.consul.members.clients;datacenter=dc1": { + Name: "consul.usage.test.consul.members.clients", + Value: 2, + Labels: []metrics.Label{ + {Name: "datacenter", Value: "dc1"}, + }, + }, + "consul.usage.test.consul.members.servers;datacenter=dc1": { + Name: "consul.usage.test.consul.members.servers", + Value: 2, + Labels: []metrics.Label{ + {Name: "datacenter", Value: "dc1"}, + }, + }, + "consul.usage.test.consul.members.clients;segment=a;datacenter=dc1": { + Name: "consul.usage.test.consul.members.clients", + Value: 1, + Labels: []metrics.Label{ + {Name: "segment", Value: "a"}, + {Name: "datacenter", Value: "dc1"}, + }, + }, + "consul.usage.test.consul.members.clients;segment=b;datacenter=dc1": { + Name: "consul.usage.test.consul.members.clients", + Value: 1, + Labels: []metrics.Label{ + {Name: "segment", Value: "b"}, + {Name: "datacenter", Value: "dc1"}, + }, + }, }, }, } @@ -102,7 +174,8 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) { new(Config). WithStateProvider(mockStateProvider). WithLogger(testutil.Logger(t)). - WithDatacenter("dc1"), + WithDatacenter("dc1"). 
+ WithGetMembersFunc(tcase.getMembersFunc), ) require.NoError(t, err) diff --git a/agent/consul/usagemetrics/usagemetrics_test.go b/agent/consul/usagemetrics/usagemetrics_test.go index cd34581c6..1c4be1d5b 100644 --- a/agent/consul/usagemetrics/usagemetrics_test.go +++ b/agent/consul/usagemetrics/usagemetrics_test.go @@ -11,6 +11,7 @@ import ( "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/sdk/testutil" + "github.com/hashicorp/serf/serf" ) type mockStateProvider struct { @@ -25,6 +26,7 @@ func (m *mockStateProvider) State() *state.Store { func TestUsageReporter_Run_Nodes(t *testing.T) { type testCase struct { modfiyStateStore func(t *testing.T, s *state.Store) + getMembersFunc getMembersFunc expectedGauges map[string]metrics.GaugeValue } cases := map[string]testCase{ @@ -36,6 +38,7 @@ func TestUsageReporter_Run_Nodes(t *testing.T) { Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, }, }, + getMembersFunc: func() []serf.Member { return []serf.Member{} }, }, "nodes": { modfiyStateStore: func(t *testing.T, s *state.Store) { @@ -43,12 +46,41 @@ func TestUsageReporter_Run_Nodes(t *testing.T) { require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"})) require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"})) }, + getMembersFunc: func() []serf.Member { + return []serf.Member{ + { + Name: "foo", + Tags: map[string]string{"role": "consul"}, + Status: serf.StatusAlive, + }, + { + Name: "bar", + Tags: map[string]string{"role": "consul"}, + Status: serf.StatusAlive, + }, + { + Name: "baz", + Tags: map[string]string{"role": "node"}, + Status: serf.StatusAlive, + }, + } + }, expectedGauges: map[string]metrics.GaugeValue{ "consul.usage.test.consul.state.nodes;datacenter=dc1": { Name: "consul.usage.test.consul.state.nodes", Value: 3, Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, }, + 
"consul.usage.test.consul.members.clients;datacenter=dc1": { + Name: "consul.usage.test.consul.members.clients", + Value: 1, + Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, + }, + "consul.usage.test.consul.members.servers;datacenter=dc1": { + Name: "consul.usage.test.consul.members.servers", + Value: 2, + Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, + }, }, }, } @@ -73,7 +105,8 @@ func TestUsageReporter_Run_Nodes(t *testing.T) { new(Config). WithStateProvider(mockStateProvider). WithLogger(testutil.Logger(t)). - WithDatacenter("dc1"), + WithDatacenter("dc1"). + WithGetMembersFunc(tcase.getMembersFunc), ) require.NoError(t, err) diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx index f94dca0ed..50dfcbd9c 100644 --- a/website/content/docs/agent/telemetry.mdx +++ b/website/content/docs/agent/telemetry.mdx @@ -255,14 +255,14 @@ reflect what would happen if an agent restarts now. | :-------------------------------- | :--------------------------------------------------------------- | :---- | :---- | | `consul.system.licenseExpiration` | Number of hours until the Consul Enterprise license will expire. | hours | gauge | -**Why they're important:** +**Why they're important:** This measurement indicates how many hours are left before the Consul Enterprise license expires. When the license expires some Consul Enterprise features will cease to work. An example of this is that after expiration, it is no longer possible to create -or modify resources in non-default namespaces or to manage namespace definitions themselves even though reads of namespaced +or modify resources in non-default namespaces or to manage namespace definitions themselves even though reads of namespaced resources will still work. -**What to look for:** +**What to look for:** This metric should be monitored to ensure that the license doesn't expire to prevent degradation of functionality. 
@@ -313,11 +313,13 @@ This is a full list of metrics emitted by Consul. | `consul.state.nodes` | Measures the current number of nodes registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge | | `consul.state.services` | Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge | | `consul.state.service_instances` | Measures the current number of unique service instances registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge | +| `consul.members.clients` | Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6. | number of clients | gauge | +| `consul.members.servers` | Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6. | number of servers | gauge | | `consul.dns.stale_queries` | Increments when an agent serves a query within the allowed stale threshold. | queries | counter | | `consul.dns.ptr_query.` | Measures the time spent handling a reverse DNS query for the given node. | ms | timer | | `consul.dns.domain_query.` | Measures the time spent handling a domain query for the given node. | ms | timer | | `consul.http...` | DEPRECATED IN 1.9: Tracks how long it takes to service the given HTTP request for the given verb and path. Paths do not include details like service or key names, for these an underscore will be present as a placeholder (eg. `consul.http.GET.v1.kv._`) | ms | timer | -| `consul.system.licenseExpiration` | This measures the number of hours remaining on the agents license. | hours | gauge | +| `consul.system.licenseExpiration` | This measures the number of hours remaining on the agents license. | hours | gauge | | `consul.version` | Measures the count of running agents. 
| agents | gauge | ## Server Health