open-consul/agent/consul/usagemetrics/usagemetrics.go
Paul Ewing e454a9aae0
usagemetrics: add cluster members to metrics API (#10340)
This PR adds cluster members to the metrics API. The number of members per
segment are reported as well as the total number of members.

Tested by running a multi-node cluster locally and ensuring the numbers were
correct. Also added unit test coverage to add the new expected gauges to
existing test cases.
2021-06-03 08:25:53 -07:00

227 lines
5.9 KiB
Go

package usagemetrics
import (
"context"
"errors"
"time"
"github.com/armon/go-metrics/prometheus"
"github.com/armon/go-metrics"
"github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/serf/serf"
)
var Gauges = []prometheus.GaugeDefinition{
{
Name: []string{"consul", "state", "nodes"},
Help: "Measures the current number of nodes registered with Consul. It is only emitted by Consul servers. Added in v1.9.0.",
},
{
Name: []string{"consul", "state", "services"},
Help: "Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0.",
},
{
Name: []string{"consul", "state", "service_instances"},
Help: "Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0.",
},
{
Name: []string{"consul", "members", "clients"},
Help: "Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.",
},
{
Name: []string{"consul", "members", "servers"},
Help: "Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.",
},
}
type getMembersFunc func() []serf.Member
// Config holds the settings for various parameters for the
// UsageMetricsReporter
type Config struct {
logger hclog.Logger
metricLabels []metrics.Label
stateProvider StateProvider
tickerInterval time.Duration
getMembersFunc getMembersFunc
}
// WithDatacenter adds the datacenter as a label to all metrics emitted by the
// UsageMetricsReporter
func (c *Config) WithDatacenter(dc string) *Config {
c.metricLabels = append(c.metricLabels, metrics.Label{Name: "datacenter", Value: dc})
return c
}
// WithLogger takes a logger and creates a new, named sub-logger to use when
// running
func (c *Config) WithLogger(logger hclog.Logger) *Config {
c.logger = logger.Named(logging.UsageMetrics)
return c
}
// WithReportingInterval specifies the interval on which UsageMetricsReporter
// should emit metrics
func (c *Config) WithReportingInterval(dur time.Duration) *Config {
c.tickerInterval = dur
return c
}
func (c *Config) WithStateProvider(sp StateProvider) *Config {
c.stateProvider = sp
return c
}
// WithGetMembersFunc specifies the function used to identify cluster members
func (c *Config) WithGetMembersFunc(fn getMembersFunc) *Config {
c.getMembersFunc = fn
return c
}
// StateProvider defines an inteface for retrieving a state.Store handle. In
// non-test code, this is satisfied by the fsm.FSM struct.
type StateProvider interface {
State() *state.Store
}
// UsageMetricsReporter provides functionality for emitting usage metrics into
// the metrics stream. This makes it essentially a translation layer
// between the state store and metrics stream.
type UsageMetricsReporter struct {
logger hclog.Logger
metricLabels []metrics.Label
stateProvider StateProvider
tickerInterval time.Duration
getMembersFunc getMembersFunc
}
func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
if cfg.stateProvider == nil {
return nil, errors.New("must provide a StateProvider to usage reporter")
}
if cfg.getMembersFunc == nil {
return nil, errors.New("must provide a getMembersFunc to usage reporter")
}
if cfg.logger == nil {
cfg.logger = hclog.NewNullLogger()
}
if cfg.tickerInterval == 0 {
// Metrics are aggregated every 10 seconds, so we default to that.
cfg.tickerInterval = 10 * time.Second
}
u := &UsageMetricsReporter{
logger: cfg.logger,
stateProvider: cfg.stateProvider,
metricLabels: cfg.metricLabels,
tickerInterval: cfg.tickerInterval,
getMembersFunc: cfg.getMembersFunc,
}
return u, nil
}
// Run must be run in a goroutine, and can be stopped by closing or sending
// data to the passed in shutdownCh
func (u *UsageMetricsReporter) Run(ctx context.Context) {
ticker := time.NewTicker(u.tickerInterval)
for {
select {
case <-ctx.Done():
u.logger.Debug("usage metrics reporter shutting down")
ticker.Stop()
return
case <-ticker.C:
u.runOnce()
}
}
}
func (u *UsageMetricsReporter) runOnce() {
state := u.stateProvider.State()
_, nodes, err := state.NodeCount()
if err != nil {
u.logger.Warn("failed to retrieve nodes from state store", "error", err)
}
metrics.SetGaugeWithLabels(
[]string{"consul", "state", "nodes"},
float32(nodes),
u.metricLabels,
)
_, serviceUsage, err := state.ServiceUsage()
if err != nil {
u.logger.Warn("failed to retrieve services from state store", "error", err)
}
u.emitServiceUsage(serviceUsage)
servers, clients := u.memberUsage()
u.emitMemberUsage(servers, clients)
}
func (u *UsageMetricsReporter) memberUsage() (int, map[string]int) {
if u.getMembersFunc == nil {
return 0, nil
}
mems := u.getMembersFunc()
if len(mems) <= 0 {
u.logger.Warn("cluster reported zero members")
return 0, nil
}
servers := 0
clients := make(map[string]int)
for _, m := range mems {
if m.Status != serf.StatusAlive {
continue
}
switch m.Tags["role"] {
case "node":
clients[m.Tags["segment"]]++
case "consul":
servers++
}
}
return servers, clients
}
func (u *UsageMetricsReporter) emitMemberUsage(servers int, clients map[string]int) {
totalClients := 0
for seg, c := range clients {
segmentLabel := metrics.Label{Name: "segment", Value: seg}
labels := append([]metrics.Label{segmentLabel}, u.metricLabels...)
metrics.SetGaugeWithLabels(
[]string{"consul", "members", "clients"},
float32(c),
labels,
)
totalClients += c
}
metrics.SetGaugeWithLabels(
[]string{"consul", "members", "clients"},
float32(totalClients),
u.metricLabels,
)
metrics.SetGaugeWithLabels(
[]string{"consul", "members", "servers"},
float32(servers),
u.metricLabels,
)
}