usagemetrics: add cluster members to metrics API (#10340)

This PR adds cluster members to the metrics API. The number of members per
segment is reported, as well as the total number of members.

Tested by running a multi-node cluster locally and ensuring the numbers were
correct. Also added unit test coverage to add the new expected gauges to
existing test cases.
This commit is contained in:
Paul Ewing 2021-06-03 08:25:53 -07:00 committed by GitHub
parent fd97cf9ecc
commit e454a9aae0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 218 additions and 13 deletions

3
.changelog/10340.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
telemetry: The usage data in the `metrics` API now includes cluster member counts, reporting clients on a per-segment basis.
```

View File

@ -569,7 +569,15 @@ func NewServer(config *Config, flat Deps) (*Server, error) {
WithStateProvider(s.fsm). WithStateProvider(s.fsm).
WithLogger(s.logger). WithLogger(s.logger).
WithDatacenter(s.config.Datacenter). WithDatacenter(s.config.Datacenter).
WithReportingInterval(s.config.MetricsReportingInterval), WithReportingInterval(s.config.MetricsReportingInterval).
WithGetMembersFunc(func() []serf.Member {
members, err := s.LANMembersAllSegments()
if err != nil {
return []serf.Member{}
}
return members
}),
) )
if err != nil { if err != nil {
s.Shutdown() s.Shutdown()
@ -1138,7 +1146,7 @@ func (s *Server) LANMembers() []serf.Member {
return s.serfLAN.Members() return s.serfLAN.Members()
} }
// WANMembers is used to return the members of the LAN cluster // WANMembers is used to return the members of the WAN cluster
func (s *Server) WANMembers() []serf.Member { func (s *Server) WANMembers() []serf.Member {
if s.serfWAN == nil { if s.serfWAN == nil {
return nil return nil

View File

@ -11,6 +11,7 @@ import (
"github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/logging" "github.com/hashicorp/consul/logging"
"github.com/hashicorp/go-hclog" "github.com/hashicorp/go-hclog"
"github.com/hashicorp/serf/serf"
) )
var Gauges = []prometheus.GaugeDefinition{ var Gauges = []prometheus.GaugeDefinition{
@ -26,8 +27,18 @@ var Gauges = []prometheus.GaugeDefinition{
Name: []string{"consul", "state", "service_instances"}, Name: []string{"consul", "state", "service_instances"},
Help: "Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0.", Help: "Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0.",
}, },
{
Name: []string{"consul", "members", "clients"},
Help: "Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.",
},
{
Name: []string{"consul", "members", "servers"},
Help: "Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.",
},
} }
// getMembersFunc is a callback that takes no arguments and returns a slice of
// serf members. The usage reporter calls it to obtain the cluster members it
// counts.
type getMembersFunc func() []serf.Member
// Config holds the settings for various parameters for the // Config holds the settings for various parameters for the
// UsageMetricsReporter // UsageMetricsReporter
type Config struct { type Config struct {
@ -35,6 +46,7 @@ type Config struct {
metricLabels []metrics.Label metricLabels []metrics.Label
stateProvider StateProvider stateProvider StateProvider
tickerInterval time.Duration tickerInterval time.Duration
getMembersFunc getMembersFunc
} }
// WithDatacenter adds the datacenter as a label to all metrics emitted by the // WithDatacenter adds the datacenter as a label to all metrics emitted by the
@ -63,6 +75,12 @@ func (c *Config) WithStateProvider(sp StateProvider) *Config {
return c return c
} }
// WithGetMembersFunc specifies the function used to identify cluster members.
// The function is stored on the Config as-is, and the receiver is returned so
// the call can be chained with the other With* builders.
func (c *Config) WithGetMembersFunc(fn getMembersFunc) *Config {
c.getMembersFunc = fn
return c
}
// StateProvider defines an interface for retrieving a state.Store handle. In // StateProvider defines an interface for retrieving a state.Store handle. In
// non-test code, this is satisfied by the fsm.FSM struct. // non-test code, this is satisfied by the fsm.FSM struct.
type StateProvider interface { type StateProvider interface {
@ -77,6 +95,7 @@ type UsageMetricsReporter struct {
metricLabels []metrics.Label metricLabels []metrics.Label
stateProvider StateProvider stateProvider StateProvider
tickerInterval time.Duration tickerInterval time.Duration
getMembersFunc getMembersFunc
} }
func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) { func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
@ -84,6 +103,10 @@ func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
return nil, errors.New("must provide a StateProvider to usage reporter") return nil, errors.New("must provide a StateProvider to usage reporter")
} }
if cfg.getMembersFunc == nil {
return nil, errors.New("must provide a getMembersFunc to usage reporter")
}
if cfg.logger == nil { if cfg.logger == nil {
cfg.logger = hclog.NewNullLogger() cfg.logger = hclog.NewNullLogger()
} }
@ -98,6 +121,7 @@ func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
stateProvider: cfg.stateProvider, stateProvider: cfg.stateProvider,
metricLabels: cfg.metricLabels, metricLabels: cfg.metricLabels,
tickerInterval: cfg.tickerInterval, tickerInterval: cfg.tickerInterval,
getMembersFunc: cfg.getMembersFunc,
} }
return u, nil return u, nil
@ -137,4 +161,66 @@ func (u *UsageMetricsReporter) runOnce() {
} }
u.emitServiceUsage(serviceUsage) u.emitServiceUsage(serviceUsage)
servers, clients := u.memberUsage()
u.emitMemberUsage(servers, clients)
}
// memberUsage counts the alive cluster members returned by getMembersFunc.
//
// The first result is the number of alive members whose "role" tag is
// "consul" (servers). The second maps each "segment" tag value to the number
// of alive members whose "role" tag is "node" (clients). Members that are not
// in the serf.StatusAlive state, or that carry any other role, are ignored.
//
// When no getMembersFunc is configured it returns (0, nil). When the function
// yields no members it also returns (0, nil), after logging a warning.
func (u *UsageMetricsReporter) memberUsage() (int, map[string]int) {
	if u.getMembersFunc == nil {
		return 0, nil
	}

	members := u.getMembersFunc()
	if len(members) == 0 {
		u.logger.Warn("cluster reported zero members")
		return 0, nil
	}

	var serverCount int
	clientsBySegment := make(map[string]int)
	for i := range members {
		member := &members[i]
		if member.Status != serf.StatusAlive {
			continue
		}

		role := member.Tags["role"]
		if role == "consul" {
			serverCount++
		} else if role == "node" {
			clientsBySegment[member.Tags["segment"]]++
		}
	}

	return serverCount, clientsBySegment
}
// emitMemberUsage publishes the given member counts as gauges.
//
// For every entry in clients it sets the {"consul", "members", "clients"}
// gauge to that entry's count, labelled with a "segment" label holding the map
// key followed by the reporter's metricLabels. It then sets the same gauge
// with only metricLabels to the sum of all per-segment counts, and finally
// sets the {"consul", "members", "servers"} gauge to servers.
//
// A nil or empty clients map emits no per-segment gauges; the total is then 0.
func (u *UsageMetricsReporter) emitMemberUsage(servers int, clients map[string]int) {
totalClients := 0
for seg, c := range clients {
segmentLabel := metrics.Label{Name: "segment", Value: seg}
// Appending to a fresh one-element slice yields a new label slice, so
// u.metricLabels itself is left untouched.
labels := append([]metrics.Label{segmentLabel}, u.metricLabels...)
metrics.SetGaugeWithLabels(
[]string{"consul", "members", "clients"},
float32(c),
labels,
)
totalClients += c
}
// Total across all segments, emitted without a segment label.
metrics.SetGaugeWithLabels(
[]string{"consul", "members", "clients"},
float32(totalClients),
u.metricLabels,
)
metrics.SetGaugeWithLabels(
[]string{"consul", "members", "servers"},
float32(servers),
u.metricLabels,
)
} }

View File

@ -12,6 +12,7 @@ import (
"github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/sdk/testutil" "github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/serf/serf"
) )
func newStateStore() (*state.Store, error) { func newStateStore() (*state.Store, error) {
@ -21,6 +22,7 @@ func newStateStore() (*state.Store, error) {
func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) { func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
type testCase struct { type testCase struct {
modfiyStateStore func(t *testing.T, s *state.Store) modfiyStateStore func(t *testing.T, s *state.Store)
getMembersFunc getMembersFunc
expectedGauges map[string]metrics.GaugeValue expectedGauges map[string]metrics.GaugeValue
} }
cases := map[string]testCase{ cases := map[string]testCase{
@ -45,24 +47,64 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
{Name: "datacenter", Value: "dc1"}, {Name: "datacenter", Value: "dc1"},
}, },
}, },
"consul.usage.test.consul.members.clients;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 0,
Labels: []metrics.Label{
{Name: "datacenter", Value: "dc1"},
}, },
}, },
"consul.usage.test.consul.members.servers;datacenter=dc1": {
Name: "consul.usage.test.consul.members.servers",
Value: 0,
Labels: []metrics.Label{
{Name: "datacenter", Value: "dc1"},
},
},
},
getMembersFunc: func() []serf.Member { return []serf.Member{} },
},
"nodes-and-services": { "nodes-and-services": {
modfiyStateStore: func(t *testing.T, s *state.Store) { modfiyStateStore: func(t *testing.T, s *state.Store) {
require.Nil(t, s.EnsureNode(1, &structs.Node{Node: "foo", Address: "127.0.0.1"})) require.Nil(t, s.EnsureNode(1, &structs.Node{Node: "foo", Address: "127.0.0.1"}))
require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"})) require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"}))
require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"})) require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"}))
require.Nil(t, s.EnsureNode(4, &structs.Node{Node: "qux", Address: "127.0.0.3"}))
// Typical services and some consul services spread across two nodes // Typical services and some consul services spread across two nodes
require.Nil(t, s.EnsureService(4, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000})) require.Nil(t, s.EnsureService(5, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000}))
require.Nil(t, s.EnsureService(5, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000})) require.Nil(t, s.EnsureService(6, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000}))
require.Nil(t, s.EnsureService(6, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil})) require.Nil(t, s.EnsureService(7, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
require.Nil(t, s.EnsureService(7, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil})) require.Nil(t, s.EnsureService(8, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
},
getMembersFunc: func() []serf.Member {
return []serf.Member{
{
Name: "foo",
Tags: map[string]string{"role": "consul"},
Status: serf.StatusAlive,
},
{
Name: "bar",
Tags: map[string]string{"role": "consul"},
Status: serf.StatusAlive,
},
{
Name: "baz",
Tags: map[string]string{"role": "node", "segment": "a"},
Status: serf.StatusAlive,
},
{
Name: "qux",
Tags: map[string]string{"role": "node", "segment": "b"},
Status: serf.StatusAlive,
},
}
}, },
expectedGauges: map[string]metrics.GaugeValue{ expectedGauges: map[string]metrics.GaugeValue{
"consul.usage.test.consul.state.nodes;datacenter=dc1": { "consul.usage.test.consul.state.nodes;datacenter=dc1": {
Name: "consul.usage.test.consul.state.nodes", Name: "consul.usage.test.consul.state.nodes",
Value: 3, Value: 4,
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
}, },
"consul.usage.test.consul.state.services;datacenter=dc1": { "consul.usage.test.consul.state.services;datacenter=dc1": {
@ -79,6 +121,36 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
{Name: "datacenter", Value: "dc1"}, {Name: "datacenter", Value: "dc1"},
}, },
}, },
"consul.usage.test.consul.members.clients;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 2,
Labels: []metrics.Label{
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.servers;datacenter=dc1": {
Name: "consul.usage.test.consul.members.servers",
Value: 2,
Labels: []metrics.Label{
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.clients;segment=a;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 1,
Labels: []metrics.Label{
{Name: "segment", Value: "a"},
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.clients;segment=b;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 1,
Labels: []metrics.Label{
{Name: "segment", Value: "b"},
{Name: "datacenter", Value: "dc1"},
},
},
}, },
}, },
} }
@ -102,7 +174,8 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
new(Config). new(Config).
WithStateProvider(mockStateProvider). WithStateProvider(mockStateProvider).
WithLogger(testutil.Logger(t)). WithLogger(testutil.Logger(t)).
WithDatacenter("dc1"), WithDatacenter("dc1").
WithGetMembersFunc(tcase.getMembersFunc),
) )
require.NoError(t, err) require.NoError(t, err)

View File

@ -11,6 +11,7 @@ import (
"github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/sdk/testutil" "github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/serf/serf"
) )
type mockStateProvider struct { type mockStateProvider struct {
@ -25,6 +26,7 @@ func (m *mockStateProvider) State() *state.Store {
func TestUsageReporter_Run_Nodes(t *testing.T) { func TestUsageReporter_Run_Nodes(t *testing.T) {
type testCase struct { type testCase struct {
modfiyStateStore func(t *testing.T, s *state.Store) modfiyStateStore func(t *testing.T, s *state.Store)
getMembersFunc getMembersFunc
expectedGauges map[string]metrics.GaugeValue expectedGauges map[string]metrics.GaugeValue
} }
cases := map[string]testCase{ cases := map[string]testCase{
@ -36,6 +38,7 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
}, },
}, },
getMembersFunc: func() []serf.Member { return []serf.Member{} },
}, },
"nodes": { "nodes": {
modfiyStateStore: func(t *testing.T, s *state.Store) { modfiyStateStore: func(t *testing.T, s *state.Store) {
@ -43,12 +46,41 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"})) require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"}))
require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"})) require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"}))
}, },
getMembersFunc: func() []serf.Member {
return []serf.Member{
{
Name: "foo",
Tags: map[string]string{"role": "consul"},
Status: serf.StatusAlive,
},
{
Name: "bar",
Tags: map[string]string{"role": "consul"},
Status: serf.StatusAlive,
},
{
Name: "baz",
Tags: map[string]string{"role": "node"},
Status: serf.StatusAlive,
},
}
},
expectedGauges: map[string]metrics.GaugeValue{ expectedGauges: map[string]metrics.GaugeValue{
"consul.usage.test.consul.state.nodes;datacenter=dc1": { "consul.usage.test.consul.state.nodes;datacenter=dc1": {
Name: "consul.usage.test.consul.state.nodes", Name: "consul.usage.test.consul.state.nodes",
Value: 3, Value: 3,
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
}, },
"consul.usage.test.consul.members.clients;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 1,
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
"consul.usage.test.consul.members.servers;datacenter=dc1": {
Name: "consul.usage.test.consul.members.servers",
Value: 2,
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
}, },
}, },
} }
@ -73,7 +105,8 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
new(Config). new(Config).
WithStateProvider(mockStateProvider). WithStateProvider(mockStateProvider).
WithLogger(testutil.Logger(t)). WithLogger(testutil.Logger(t)).
WithDatacenter("dc1"), WithDatacenter("dc1").
WithGetMembersFunc(tcase.getMembersFunc),
) )
require.NoError(t, err) require.NoError(t, err)

View File

@ -313,6 +313,8 @@ This is a full list of metrics emitted by Consul.
| `consul.state.nodes` | Measures the current number of nodes registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge | | `consul.state.nodes` | Measures the current number of nodes registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge |
| `consul.state.services` | Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge | | `consul.state.services` | Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge |
| `consul.state.service_instances` | Measures the current number of unique service instances registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge | | `consul.state.service_instances` | Measures the current number of unique service instances registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge |
| `consul.members.clients` | Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6. | number of clients | gauge |
| `consul.members.servers` | Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6. | number of servers | gauge |
| `consul.dns.stale_queries` | Increments when an agent serves a query within the allowed stale threshold. | queries | counter | | `consul.dns.stale_queries` | Increments when an agent serves a query within the allowed stale threshold. | queries | counter |
| `consul.dns.ptr_query.` | Measures the time spent handling a reverse DNS query for the given node. | ms | timer | | `consul.dns.ptr_query.` | Measures the time spent handling a reverse DNS query for the given node. | ms | timer |
| `consul.dns.domain_query.` | Measures the time spent handling a domain query for the given node. | ms | timer | | `consul.dns.domain_query.` | Measures the time spent handling a domain query for the given node. | ms | timer |