usagemetrics: add cluster members to metrics API (#10340)
This PR adds cluster members to the metrics API. The number of members per segment is reported, as well as the total number of members. Tested by running a multi-node cluster locally and ensuring the numbers were correct. Also added unit test coverage to add the new expected gauges to existing test cases.
This commit is contained in:
parent
fd97cf9ecc
commit
e454a9aae0
|
@ -0,0 +1,3 @@
|
|||
```release-note:improvement
|
||||
telemetry: The usage data in the `metrics` API now includes cluster member counts, reporting clients on a per-segment basis.
|
||||
```
|
|
@ -569,7 +569,15 @@ func NewServer(config *Config, flat Deps) (*Server, error) {
|
|||
WithStateProvider(s.fsm).
|
||||
WithLogger(s.logger).
|
||||
WithDatacenter(s.config.Datacenter).
|
||||
WithReportingInterval(s.config.MetricsReportingInterval),
|
||||
WithReportingInterval(s.config.MetricsReportingInterval).
|
||||
WithGetMembersFunc(func() []serf.Member {
|
||||
members, err := s.LANMembersAllSegments()
|
||||
if err != nil {
|
||||
return []serf.Member{}
|
||||
}
|
||||
|
||||
return members
|
||||
}),
|
||||
)
|
||||
if err != nil {
|
||||
s.Shutdown()
|
||||
|
@ -1138,7 +1146,7 @@ func (s *Server) LANMembers() []serf.Member {
|
|||
return s.serfLAN.Members()
|
||||
}
|
||||
|
||||
// WANMembers is used to return the members of the LAN cluster
|
||||
// WANMembers is used to return the members of the WAN cluster
|
||||
func (s *Server) WANMembers() []serf.Member {
|
||||
if s.serfWAN == nil {
|
||||
return nil
|
||||
|
|
|
@ -11,6 +11,7 @@ import (
|
|||
"github.com/hashicorp/consul/agent/consul/state"
|
||||
"github.com/hashicorp/consul/logging"
|
||||
"github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/serf/serf"
|
||||
)
|
||||
|
||||
var Gauges = []prometheus.GaugeDefinition{
|
||||
|
@ -26,8 +27,18 @@ var Gauges = []prometheus.GaugeDefinition{
|
|||
Name: []string{"consul", "state", "service_instances"},
|
||||
Help: "Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0.",
|
||||
},
|
||||
{
|
||||
Name: []string{"consul", "members", "clients"},
|
||||
Help: "Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.",
|
||||
},
|
||||
{
|
||||
Name: []string{"consul", "members", "servers"},
|
||||
Help: "Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.",
|
||||
},
|
||||
}
|
||||
|
||||
// getMembersFunc is a callback that takes no arguments and returns a slice of
// serf.Member values. The usage reporter calls it to obtain the members it
// counts when emitting the member gauges.
type getMembersFunc func() []serf.Member
|
||||
|
||||
// Config holds the settings for various parameters for the
|
||||
// UsageMetricsReporter
|
||||
type Config struct {
|
||||
|
@ -35,6 +46,7 @@ type Config struct {
|
|||
metricLabels []metrics.Label
|
||||
stateProvider StateProvider
|
||||
tickerInterval time.Duration
|
||||
getMembersFunc getMembersFunc
|
||||
}
|
||||
|
||||
// WithDatacenter adds the datacenter as a label to all metrics emitted by the
|
||||
|
@ -63,6 +75,12 @@ func (c *Config) WithStateProvider(sp StateProvider) *Config {
|
|||
return c
|
||||
}
|
||||
|
||||
// WithGetMembersFunc specifies the function used to identify cluster members
|
||||
func (c *Config) WithGetMembersFunc(fn getMembersFunc) *Config {
|
||||
c.getMembersFunc = fn
|
||||
return c
|
||||
}
|
||||
|
||||
// StateProvider defines an interface for retrieving a state.Store handle. In
|
||||
// non-test code, this is satisfied by the fsm.FSM struct.
|
||||
type StateProvider interface {
|
||||
|
@ -77,6 +95,7 @@ type UsageMetricsReporter struct {
|
|||
metricLabels []metrics.Label
|
||||
stateProvider StateProvider
|
||||
tickerInterval time.Duration
|
||||
getMembersFunc getMembersFunc
|
||||
}
|
||||
|
||||
func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
|
||||
|
@ -84,6 +103,10 @@ func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
|
|||
return nil, errors.New("must provide a StateProvider to usage reporter")
|
||||
}
|
||||
|
||||
if cfg.getMembersFunc == nil {
|
||||
return nil, errors.New("must provide a getMembersFunc to usage reporter")
|
||||
}
|
||||
|
||||
if cfg.logger == nil {
|
||||
cfg.logger = hclog.NewNullLogger()
|
||||
}
|
||||
|
@ -98,6 +121,7 @@ func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
|
|||
stateProvider: cfg.stateProvider,
|
||||
metricLabels: cfg.metricLabels,
|
||||
tickerInterval: cfg.tickerInterval,
|
||||
getMembersFunc: cfg.getMembersFunc,
|
||||
}
|
||||
|
||||
return u, nil
|
||||
|
@ -137,4 +161,66 @@ func (u *UsageMetricsReporter) runOnce() {
|
|||
}
|
||||
|
||||
u.emitServiceUsage(serviceUsage)
|
||||
|
||||
servers, clients := u.memberUsage()
|
||||
u.emitMemberUsage(servers, clients)
|
||||
}
|
||||
|
||||
func (u *UsageMetricsReporter) memberUsage() (int, map[string]int) {
|
||||
if u.getMembersFunc == nil {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
mems := u.getMembersFunc()
|
||||
if len(mems) <= 0 {
|
||||
u.logger.Warn("cluster reported zero members")
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
servers := 0
|
||||
clients := make(map[string]int)
|
||||
|
||||
for _, m := range mems {
|
||||
if m.Status != serf.StatusAlive {
|
||||
continue
|
||||
}
|
||||
|
||||
switch m.Tags["role"] {
|
||||
case "node":
|
||||
clients[m.Tags["segment"]]++
|
||||
case "consul":
|
||||
servers++
|
||||
}
|
||||
}
|
||||
|
||||
return servers, clients
|
||||
}
|
||||
|
||||
func (u *UsageMetricsReporter) emitMemberUsage(servers int, clients map[string]int) {
|
||||
totalClients := 0
|
||||
|
||||
for seg, c := range clients {
|
||||
segmentLabel := metrics.Label{Name: "segment", Value: seg}
|
||||
labels := append([]metrics.Label{segmentLabel}, u.metricLabels...)
|
||||
|
||||
metrics.SetGaugeWithLabels(
|
||||
[]string{"consul", "members", "clients"},
|
||||
float32(c),
|
||||
labels,
|
||||
)
|
||||
|
||||
totalClients += c
|
||||
}
|
||||
|
||||
metrics.SetGaugeWithLabels(
|
||||
[]string{"consul", "members", "clients"},
|
||||
float32(totalClients),
|
||||
u.metricLabels,
|
||||
)
|
||||
|
||||
metrics.SetGaugeWithLabels(
|
||||
[]string{"consul", "members", "servers"},
|
||||
float32(servers),
|
||||
u.metricLabels,
|
||||
)
|
||||
}
|
||||
|
|
|
@ -12,6 +12,7 @@ import (
|
|||
"github.com/hashicorp/consul/agent/consul/state"
|
||||
"github.com/hashicorp/consul/agent/structs"
|
||||
"github.com/hashicorp/consul/sdk/testutil"
|
||||
"github.com/hashicorp/serf/serf"
|
||||
)
|
||||
|
||||
func newStateStore() (*state.Store, error) {
|
||||
|
@ -21,6 +22,7 @@ func newStateStore() (*state.Store, error) {
|
|||
func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
|
||||
type testCase struct {
|
||||
modfiyStateStore func(t *testing.T, s *state.Store)
|
||||
getMembersFunc getMembersFunc
|
||||
expectedGauges map[string]metrics.GaugeValue
|
||||
}
|
||||
cases := map[string]testCase{
|
||||
|
@ -45,24 +47,64 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
|
|||
{Name: "datacenter", Value: "dc1"},
|
||||
},
|
||||
},
|
||||
"consul.usage.test.consul.members.clients;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.members.clients",
|
||||
Value: 0,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "datacenter", Value: "dc1"},
|
||||
},
|
||||
},
|
||||
"consul.usage.test.consul.members.servers;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.members.servers",
|
||||
Value: 0,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "datacenter", Value: "dc1"},
|
||||
},
|
||||
},
|
||||
},
|
||||
getMembersFunc: func() []serf.Member { return []serf.Member{} },
|
||||
},
|
||||
"nodes-and-services": {
|
||||
modfiyStateStore: func(t *testing.T, s *state.Store) {
|
||||
require.Nil(t, s.EnsureNode(1, &structs.Node{Node: "foo", Address: "127.0.0.1"}))
|
||||
require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"}))
|
||||
require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"}))
|
||||
require.Nil(t, s.EnsureNode(4, &structs.Node{Node: "qux", Address: "127.0.0.3"}))
|
||||
|
||||
// Typical services and some consul services spread across two nodes
|
||||
require.Nil(t, s.EnsureService(4, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000}))
|
||||
require.Nil(t, s.EnsureService(5, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000}))
|
||||
require.Nil(t, s.EnsureService(6, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
|
||||
require.Nil(t, s.EnsureService(7, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
|
||||
require.Nil(t, s.EnsureService(5, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000}))
|
||||
require.Nil(t, s.EnsureService(6, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000}))
|
||||
require.Nil(t, s.EnsureService(7, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
|
||||
require.Nil(t, s.EnsureService(8, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
|
||||
},
|
||||
getMembersFunc: func() []serf.Member {
|
||||
return []serf.Member{
|
||||
{
|
||||
Name: "foo",
|
||||
Tags: map[string]string{"role": "consul"},
|
||||
Status: serf.StatusAlive,
|
||||
},
|
||||
{
|
||||
Name: "bar",
|
||||
Tags: map[string]string{"role": "consul"},
|
||||
Status: serf.StatusAlive,
|
||||
},
|
||||
{
|
||||
Name: "baz",
|
||||
Tags: map[string]string{"role": "node", "segment": "a"},
|
||||
Status: serf.StatusAlive,
|
||||
},
|
||||
{
|
||||
Name: "qux",
|
||||
Tags: map[string]string{"role": "node", "segment": "b"},
|
||||
Status: serf.StatusAlive,
|
||||
},
|
||||
}
|
||||
},
|
||||
expectedGauges: map[string]metrics.GaugeValue{
|
||||
"consul.usage.test.consul.state.nodes;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.state.nodes",
|
||||
Value: 3,
|
||||
Value: 4,
|
||||
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
|
||||
},
|
||||
"consul.usage.test.consul.state.services;datacenter=dc1": {
|
||||
|
@ -79,6 +121,36 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
|
|||
{Name: "datacenter", Value: "dc1"},
|
||||
},
|
||||
},
|
||||
"consul.usage.test.consul.members.clients;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.members.clients",
|
||||
Value: 2,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "datacenter", Value: "dc1"},
|
||||
},
|
||||
},
|
||||
"consul.usage.test.consul.members.servers;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.members.servers",
|
||||
Value: 2,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "datacenter", Value: "dc1"},
|
||||
},
|
||||
},
|
||||
"consul.usage.test.consul.members.clients;segment=a;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.members.clients",
|
||||
Value: 1,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "segment", Value: "a"},
|
||||
{Name: "datacenter", Value: "dc1"},
|
||||
},
|
||||
},
|
||||
"consul.usage.test.consul.members.clients;segment=b;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.members.clients",
|
||||
Value: 1,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "segment", Value: "b"},
|
||||
{Name: "datacenter", Value: "dc1"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
@ -102,7 +174,8 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
|
|||
new(Config).
|
||||
WithStateProvider(mockStateProvider).
|
||||
WithLogger(testutil.Logger(t)).
|
||||
WithDatacenter("dc1"),
|
||||
WithDatacenter("dc1").
|
||||
WithGetMembersFunc(tcase.getMembersFunc),
|
||||
)
|
||||
require.NoError(t, err)
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@ import (
|
|||
"github.com/hashicorp/consul/agent/consul/state"
|
||||
"github.com/hashicorp/consul/agent/structs"
|
||||
"github.com/hashicorp/consul/sdk/testutil"
|
||||
"github.com/hashicorp/serf/serf"
|
||||
)
|
||||
|
||||
type mockStateProvider struct {
|
||||
|
@ -25,6 +26,7 @@ func (m *mockStateProvider) State() *state.Store {
|
|||
func TestUsageReporter_Run_Nodes(t *testing.T) {
|
||||
type testCase struct {
|
||||
modfiyStateStore func(t *testing.T, s *state.Store)
|
||||
getMembersFunc getMembersFunc
|
||||
expectedGauges map[string]metrics.GaugeValue
|
||||
}
|
||||
cases := map[string]testCase{
|
||||
|
@ -36,6 +38,7 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
|
|||
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
|
||||
},
|
||||
},
|
||||
getMembersFunc: func() []serf.Member { return []serf.Member{} },
|
||||
},
|
||||
"nodes": {
|
||||
modfiyStateStore: func(t *testing.T, s *state.Store) {
|
||||
|
@ -43,12 +46,41 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
|
|||
require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"}))
|
||||
require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"}))
|
||||
},
|
||||
getMembersFunc: func() []serf.Member {
|
||||
return []serf.Member{
|
||||
{
|
||||
Name: "foo",
|
||||
Tags: map[string]string{"role": "consul"},
|
||||
Status: serf.StatusAlive,
|
||||
},
|
||||
{
|
||||
Name: "bar",
|
||||
Tags: map[string]string{"role": "consul"},
|
||||
Status: serf.StatusAlive,
|
||||
},
|
||||
{
|
||||
Name: "baz",
|
||||
Tags: map[string]string{"role": "node"},
|
||||
Status: serf.StatusAlive,
|
||||
},
|
||||
}
|
||||
},
|
||||
expectedGauges: map[string]metrics.GaugeValue{
|
||||
"consul.usage.test.consul.state.nodes;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.state.nodes",
|
||||
Value: 3,
|
||||
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
|
||||
},
|
||||
"consul.usage.test.consul.members.clients;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.members.clients",
|
||||
Value: 1,
|
||||
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
|
||||
},
|
||||
"consul.usage.test.consul.members.servers;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.members.servers",
|
||||
Value: 2,
|
||||
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
@ -73,7 +105,8 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
|
|||
new(Config).
|
||||
WithStateProvider(mockStateProvider).
|
||||
WithLogger(testutil.Logger(t)).
|
||||
WithDatacenter("dc1"),
|
||||
WithDatacenter("dc1").
|
||||
WithGetMembersFunc(tcase.getMembersFunc),
|
||||
)
|
||||
require.NoError(t, err)
|
||||
|
||||
|
|
|
@ -255,14 +255,14 @@ reflect what would happen if an agent restarts now.
|
|||
| :-------------------------------- | :--------------------------------------------------------------- | :---- | :---- |
|
||||
| `consul.system.licenseExpiration` | Number of hours until the Consul Enterprise license will expire. | hours | gauge |
|
||||
|
||||
**Why they're important:**
|
||||
**Why they're important:**
|
||||
|
||||
This measurement indicates how many hours are left before the Consul Enterprise license expires. When the license expires some
|
||||
Consul Enterprise features will cease to work. An example of this is that after expiration, it is no longer possible to create
|
||||
or modify resources in non-default namespaces or to manage namespace definitions themselves even though reads of namespaced
|
||||
or modify resources in non-default namespaces or to manage namespace definitions themselves even though reads of namespaced
|
||||
resources will still work.
|
||||
|
||||
**What to look for:**
|
||||
**What to look for:**
|
||||
|
||||
This metric should be monitored to ensure that the license doesn't expire to prevent degradation of functionality.
|
||||
|
||||
|
@ -313,11 +313,13 @@ This is a full list of metrics emitted by Consul.
|
|||
| `consul.state.nodes` | Measures the current number of nodes registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge |
|
||||
| `consul.state.services` | Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge |
|
||||
| `consul.state.service_instances` | Measures the current number of unique service instances registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge |
|
||||
| `consul.members.clients` | Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6. | number of clients | gauge |
|
||||
| `consul.members.servers` | Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6. | number of servers | gauge |
|
||||
| `consul.dns.stale_queries` | Increments when an agent serves a query within the allowed stale threshold. | queries | counter |
|
||||
| `consul.dns.ptr_query.` | Measures the time spent handling a reverse DNS query for the given node. | ms | timer |
|
||||
| `consul.dns.domain_query.` | Measures the time spent handling a domain query for the given node. | ms | timer |
|
||||
| `consul.http...` | DEPRECATED IN 1.9: Tracks how long it takes to service the given HTTP request for the given verb and path. Paths do not include details like service or key names, for these an underscore will be present as a placeholder (eg. `consul.http.GET.v1.kv._`) | ms | timer |
|
||||
| `consul.system.licenseExpiration` | <EnterpriseAlert inline /> This measures the number of hours remaining on the agent's license. | hours | gauge |
|
||||
| `consul.system.licenseExpiration` | <EnterpriseAlert inline /> This measures the number of hours remaining on the agent's license. | hours | gauge |
|
||||
| `consul.version` | Measures the count of running agents. | agents | gauge |
|
||||
|
||||
## Server Health
|
||||
|
|
Loading…
Reference in New Issue