usagemetrics: add cluster members to metrics API (#10340)
This PR adds cluster members to the metrics API. The number of members per segment is reported, as well as the total number of members. Tested by running a multi-node cluster locally and ensuring the numbers were correct. Also extended the existing unit test cases to cover the new expected gauges.
This commit is contained in:
parent
fd97cf9ecc
commit
e454a9aae0
|
@ -0,0 +1,3 @@
|
|||
```release-note:improvement
|
||||
telemetry: The usage data in the `metrics` API now includes cluster member counts, reporting clients on a per-segment basis.
|
||||
```
|
|
@ -569,7 +569,15 @@ func NewServer(config *Config, flat Deps) (*Server, error) {
|
|||
WithStateProvider(s.fsm).
|
||||
WithLogger(s.logger).
|
||||
WithDatacenter(s.config.Datacenter).
|
||||
WithReportingInterval(s.config.MetricsReportingInterval),
|
||||
WithReportingInterval(s.config.MetricsReportingInterval).
|
||||
WithGetMembersFunc(func() []serf.Member {
|
||||
members, err := s.LANMembersAllSegments()
|
||||
if err != nil {
|
||||
return []serf.Member{}
|
||||
}
|
||||
|
||||
return members
|
||||
}),
|
||||
)
|
||||
if err != nil {
|
||||
s.Shutdown()
|
||||
|
@ -1138,7 +1146,7 @@ func (s *Server) LANMembers() []serf.Member {
|
|||
return s.serfLAN.Members()
|
||||
}
|
||||
|
||||
// WANMembers is used to return the members of the LAN cluster
|
||||
// WANMembers is used to return the members of the WAN cluster
|
||||
func (s *Server) WANMembers() []serf.Member {
|
||||
if s.serfWAN == nil {
|
||||
return nil
|
||||
|
|
|
@ -11,6 +11,7 @@ import (
|
|||
"github.com/hashicorp/consul/agent/consul/state"
|
||||
"github.com/hashicorp/consul/logging"
|
||||
"github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/serf/serf"
|
||||
)
|
||||
|
||||
var Gauges = []prometheus.GaugeDefinition{
|
||||
|
@ -26,8 +27,18 @@ var Gauges = []prometheus.GaugeDefinition{
|
|||
Name: []string{"consul", "state", "service_instances"},
|
||||
Help: "Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0.",
|
||||
},
|
||||
{
|
||||
Name: []string{"consul", "members", "clients"},
|
||||
Help: "Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.",
|
||||
},
|
||||
{
|
||||
Name: []string{"consul", "members", "servers"},
|
||||
Help: "Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.",
|
||||
},
|
||||
}
|
||||
|
||||
type getMembersFunc func() []serf.Member
|
||||
|
||||
// Config holds the settings for various parameters for the
|
||||
// UsageMetricsReporter
|
||||
type Config struct {
|
||||
|
@ -35,6 +46,7 @@ type Config struct {
|
|||
metricLabels []metrics.Label
|
||||
stateProvider StateProvider
|
||||
tickerInterval time.Duration
|
||||
getMembersFunc getMembersFunc
|
||||
}
|
||||
|
||||
// WithDatacenter adds the datacenter as a label to all metrics emitted by the
|
||||
|
@ -63,6 +75,12 @@ func (c *Config) WithStateProvider(sp StateProvider) *Config {
|
|||
return c
|
||||
}
|
||||
|
||||
// WithGetMembersFunc specifies the function used to identify cluster members
|
||||
func (c *Config) WithGetMembersFunc(fn getMembersFunc) *Config {
|
||||
c.getMembersFunc = fn
|
||||
return c
|
||||
}
|
||||
|
||||
// StateProvider defines an interface for retrieving a state.Store handle. In
|
||||
// non-test code, this is satisfied by the fsm.FSM struct.
|
||||
type StateProvider interface {
|
||||
|
@ -77,6 +95,7 @@ type UsageMetricsReporter struct {
|
|||
metricLabels []metrics.Label
|
||||
stateProvider StateProvider
|
||||
tickerInterval time.Duration
|
||||
getMembersFunc getMembersFunc
|
||||
}
|
||||
|
||||
func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
|
||||
|
@ -84,6 +103,10 @@ func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
|
|||
return nil, errors.New("must provide a StateProvider to usage reporter")
|
||||
}
|
||||
|
||||
if cfg.getMembersFunc == nil {
|
||||
return nil, errors.New("must provide a getMembersFunc to usage reporter")
|
||||
}
|
||||
|
||||
if cfg.logger == nil {
|
||||
cfg.logger = hclog.NewNullLogger()
|
||||
}
|
||||
|
@ -98,6 +121,7 @@ func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
|
|||
stateProvider: cfg.stateProvider,
|
||||
metricLabels: cfg.metricLabels,
|
||||
tickerInterval: cfg.tickerInterval,
|
||||
getMembersFunc: cfg.getMembersFunc,
|
||||
}
|
||||
|
||||
return u, nil
|
||||
|
@ -137,4 +161,66 @@ func (u *UsageMetricsReporter) runOnce() {
|
|||
}
|
||||
|
||||
u.emitServiceUsage(serviceUsage)
|
||||
|
||||
servers, clients := u.memberUsage()
|
||||
u.emitMemberUsage(servers, clients)
|
||||
}
|
||||
|
||||
func (u *UsageMetricsReporter) memberUsage() (int, map[string]int) {
|
||||
if u.getMembersFunc == nil {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
mems := u.getMembersFunc()
|
||||
if len(mems) <= 0 {
|
||||
u.logger.Warn("cluster reported zero members")
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
servers := 0
|
||||
clients := make(map[string]int)
|
||||
|
||||
for _, m := range mems {
|
||||
if m.Status != serf.StatusAlive {
|
||||
continue
|
||||
}
|
||||
|
||||
switch m.Tags["role"] {
|
||||
case "node":
|
||||
clients[m.Tags["segment"]]++
|
||||
case "consul":
|
||||
servers++
|
||||
}
|
||||
}
|
||||
|
||||
return servers, clients
|
||||
}
|
||||
|
||||
func (u *UsageMetricsReporter) emitMemberUsage(servers int, clients map[string]int) {
|
||||
totalClients := 0
|
||||
|
||||
for seg, c := range clients {
|
||||
segmentLabel := metrics.Label{Name: "segment", Value: seg}
|
||||
labels := append([]metrics.Label{segmentLabel}, u.metricLabels...)
|
||||
|
||||
metrics.SetGaugeWithLabels(
|
||||
[]string{"consul", "members", "clients"},
|
||||
float32(c),
|
||||
labels,
|
||||
)
|
||||
|
||||
totalClients += c
|
||||
}
|
||||
|
||||
metrics.SetGaugeWithLabels(
|
||||
[]string{"consul", "members", "clients"},
|
||||
float32(totalClients),
|
||||
u.metricLabels,
|
||||
)
|
||||
|
||||
metrics.SetGaugeWithLabels(
|
||||
[]string{"consul", "members", "servers"},
|
||||
float32(servers),
|
||||
u.metricLabels,
|
||||
)
|
||||
}
|
||||
|
|
|
@ -12,6 +12,7 @@ import (
|
|||
"github.com/hashicorp/consul/agent/consul/state"
|
||||
"github.com/hashicorp/consul/agent/structs"
|
||||
"github.com/hashicorp/consul/sdk/testutil"
|
||||
"github.com/hashicorp/serf/serf"
|
||||
)
|
||||
|
||||
func newStateStore() (*state.Store, error) {
|
||||
|
@ -21,6 +22,7 @@ func newStateStore() (*state.Store, error) {
|
|||
func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
|
||||
type testCase struct {
|
||||
modfiyStateStore func(t *testing.T, s *state.Store)
|
||||
getMembersFunc getMembersFunc
|
||||
expectedGauges map[string]metrics.GaugeValue
|
||||
}
|
||||
cases := map[string]testCase{
|
||||
|
@ -45,24 +47,64 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
|
|||
{Name: "datacenter", Value: "dc1"},
|
||||
},
|
||||
},
|
||||
"consul.usage.test.consul.members.clients;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.members.clients",
|
||||
Value: 0,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "datacenter", Value: "dc1"},
|
||||
},
|
||||
},
|
||||
"consul.usage.test.consul.members.servers;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.members.servers",
|
||||
Value: 0,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "datacenter", Value: "dc1"},
|
||||
},
|
||||
},
|
||||
},
|
||||
getMembersFunc: func() []serf.Member { return []serf.Member{} },
|
||||
},
|
||||
"nodes-and-services": {
|
||||
modfiyStateStore: func(t *testing.T, s *state.Store) {
|
||||
require.Nil(t, s.EnsureNode(1, &structs.Node{Node: "foo", Address: "127.0.0.1"}))
|
||||
require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"}))
|
||||
require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"}))
|
||||
require.Nil(t, s.EnsureNode(4, &structs.Node{Node: "qux", Address: "127.0.0.3"}))
|
||||
|
||||
// Typical services and some consul services spread across two nodes
|
||||
require.Nil(t, s.EnsureService(4, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000}))
|
||||
require.Nil(t, s.EnsureService(5, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000}))
|
||||
require.Nil(t, s.EnsureService(6, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
|
||||
require.Nil(t, s.EnsureService(7, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
|
||||
require.Nil(t, s.EnsureService(5, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000}))
|
||||
require.Nil(t, s.EnsureService(6, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000}))
|
||||
require.Nil(t, s.EnsureService(7, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
|
||||
require.Nil(t, s.EnsureService(8, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
|
||||
},
|
||||
getMembersFunc: func() []serf.Member {
|
||||
return []serf.Member{
|
||||
{
|
||||
Name: "foo",
|
||||
Tags: map[string]string{"role": "consul"},
|
||||
Status: serf.StatusAlive,
|
||||
},
|
||||
{
|
||||
Name: "bar",
|
||||
Tags: map[string]string{"role": "consul"},
|
||||
Status: serf.StatusAlive,
|
||||
},
|
||||
{
|
||||
Name: "baz",
|
||||
Tags: map[string]string{"role": "node", "segment": "a"},
|
||||
Status: serf.StatusAlive,
|
||||
},
|
||||
{
|
||||
Name: "qux",
|
||||
Tags: map[string]string{"role": "node", "segment": "b"},
|
||||
Status: serf.StatusAlive,
|
||||
},
|
||||
}
|
||||
},
|
||||
expectedGauges: map[string]metrics.GaugeValue{
|
||||
"consul.usage.test.consul.state.nodes;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.state.nodes",
|
||||
Value: 3,
|
||||
Value: 4,
|
||||
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
|
||||
},
|
||||
"consul.usage.test.consul.state.services;datacenter=dc1": {
|
||||
|
@ -79,6 +121,36 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
|
|||
{Name: "datacenter", Value: "dc1"},
|
||||
},
|
||||
},
|
||||
"consul.usage.test.consul.members.clients;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.members.clients",
|
||||
Value: 2,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "datacenter", Value: "dc1"},
|
||||
},
|
||||
},
|
||||
"consul.usage.test.consul.members.servers;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.members.servers",
|
||||
Value: 2,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "datacenter", Value: "dc1"},
|
||||
},
|
||||
},
|
||||
"consul.usage.test.consul.members.clients;segment=a;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.members.clients",
|
||||
Value: 1,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "segment", Value: "a"},
|
||||
{Name: "datacenter", Value: "dc1"},
|
||||
},
|
||||
},
|
||||
"consul.usage.test.consul.members.clients;segment=b;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.members.clients",
|
||||
Value: 1,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "segment", Value: "b"},
|
||||
{Name: "datacenter", Value: "dc1"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
@ -102,7 +174,8 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
|
|||
new(Config).
|
||||
WithStateProvider(mockStateProvider).
|
||||
WithLogger(testutil.Logger(t)).
|
||||
WithDatacenter("dc1"),
|
||||
WithDatacenter("dc1").
|
||||
WithGetMembersFunc(tcase.getMembersFunc),
|
||||
)
|
||||
require.NoError(t, err)
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@ import (
|
|||
"github.com/hashicorp/consul/agent/consul/state"
|
||||
"github.com/hashicorp/consul/agent/structs"
|
||||
"github.com/hashicorp/consul/sdk/testutil"
|
||||
"github.com/hashicorp/serf/serf"
|
||||
)
|
||||
|
||||
type mockStateProvider struct {
|
||||
|
@ -25,6 +26,7 @@ func (m *mockStateProvider) State() *state.Store {
|
|||
func TestUsageReporter_Run_Nodes(t *testing.T) {
|
||||
type testCase struct {
|
||||
modfiyStateStore func(t *testing.T, s *state.Store)
|
||||
getMembersFunc getMembersFunc
|
||||
expectedGauges map[string]metrics.GaugeValue
|
||||
}
|
||||
cases := map[string]testCase{
|
||||
|
@ -36,6 +38,7 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
|
|||
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
|
||||
},
|
||||
},
|
||||
getMembersFunc: func() []serf.Member { return []serf.Member{} },
|
||||
},
|
||||
"nodes": {
|
||||
modfiyStateStore: func(t *testing.T, s *state.Store) {
|
||||
|
@ -43,12 +46,41 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
|
|||
require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"}))
|
||||
require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"}))
|
||||
},
|
||||
getMembersFunc: func() []serf.Member {
|
||||
return []serf.Member{
|
||||
{
|
||||
Name: "foo",
|
||||
Tags: map[string]string{"role": "consul"},
|
||||
Status: serf.StatusAlive,
|
||||
},
|
||||
{
|
||||
Name: "bar",
|
||||
Tags: map[string]string{"role": "consul"},
|
||||
Status: serf.StatusAlive,
|
||||
},
|
||||
{
|
||||
Name: "baz",
|
||||
Tags: map[string]string{"role": "node"},
|
||||
Status: serf.StatusAlive,
|
||||
},
|
||||
}
|
||||
},
|
||||
expectedGauges: map[string]metrics.GaugeValue{
|
||||
"consul.usage.test.consul.state.nodes;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.state.nodes",
|
||||
Value: 3,
|
||||
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
|
||||
},
|
||||
"consul.usage.test.consul.members.clients;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.members.clients",
|
||||
Value: 1,
|
||||
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
|
||||
},
|
||||
"consul.usage.test.consul.members.servers;datacenter=dc1": {
|
||||
Name: "consul.usage.test.consul.members.servers",
|
||||
Value: 2,
|
||||
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
@ -73,7 +105,8 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
|
|||
new(Config).
|
||||
WithStateProvider(mockStateProvider).
|
||||
WithLogger(testutil.Logger(t)).
|
||||
WithDatacenter("dc1"),
|
||||
WithDatacenter("dc1").
|
||||
WithGetMembersFunc(tcase.getMembersFunc),
|
||||
)
|
||||
require.NoError(t, err)
|
||||
|
||||
|
|
|
@ -313,6 +313,8 @@ This is a full list of metrics emitted by Consul.
|
|||
| `consul.state.nodes` | Measures the current number of nodes registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge |
|
||||
| `consul.state.services` | Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge |
|
||||
| `consul.state.service_instances` | Measures the current number of unique service instances registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge |
|
||||
| `consul.members.clients` | Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6. | number of clients | gauge |
|
||||
| `consul.members.servers` | Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6. | number of servers | gauge |
|
||||
| `consul.dns.stale_queries` | Increments when an agent serves a query within the allowed stale threshold. | queries | counter |
|
||||
| `consul.dns.ptr_query.` | Measures the time spent handling a reverse DNS query for the given node. | ms | timer |
|
||||
| `consul.dns.domain_query.` | Measures the time spent handling a domain query for the given node. | ms | timer |
|
||||
|
|
Loading…
Reference in New Issue