e454a9aae0
This PR adds cluster member counts to the metrics API. The number of members per segment is reported, as well as the total number of members. Tested by running a multi-node cluster locally and verifying the numbers were correct. Also added unit test coverage so the existing test cases expect the new gauges.
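For illustration, here is a minimal sketch (not the PR's actual test code) of the kind of stubbed members function the new gauges can be driven from; the fakeMembers name and the specific members are hypothetical:

package usagemetrics

import "github.com/hashicorp/serf/serf"

// fakeMembers is a hypothetical stand-in for getMembersFunc: one server plus
// two client agents, one of them in the "alpha" network segment.
func fakeMembers() []serf.Member {
	return []serf.Member{
		{Name: "server-1", Status: serf.StatusAlive, Tags: map[string]string{"role": "consul"}},
		{Name: "client-1", Status: serf.StatusAlive, Tags: map[string]string{"role": "node", "segment": ""}},
		{Name: "client-2", Status: serf.StatusAlive, Tags: map[string]string{"role": "node", "segment": "alpha"}},
	}
}

Wired in via WithGetMembersFunc, this should yield consul.members.servers = 1, a per-segment consul.members.clients of 1 for segment "" and 1 for segment "alpha", and a total consul.members.clients of 2 emitted without the segment label.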
227 lines
5.9 KiB
Go
package usagemetrics

import (
	"context"
	"errors"
	"time"

	"github.com/armon/go-metrics/prometheus"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/consul/agent/consul/state"
	"github.com/hashicorp/consul/logging"
	"github.com/hashicorp/go-hclog"
	"github.com/hashicorp/serf/serf"
)

var Gauges = []prometheus.GaugeDefinition{
	{
		Name: []string{"consul", "state", "nodes"},
		Help: "Measures the current number of nodes registered with Consul. It is only emitted by Consul servers. Added in v1.9.0.",
	},
	{
		Name: []string{"consul", "state", "services"},
		Help: "Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0.",
	},
	{
		Name: []string{"consul", "state", "service_instances"},
		Help: "Measures the current number of service instances registered with Consul. It is only emitted by Consul servers. Added in v1.9.0.",
	},
	{
		Name: []string{"consul", "members", "clients"},
		Help: "Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.",
	},
	{
		Name: []string{"consul", "members", "servers"},
		Help: "Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.",
	},
}

type getMembersFunc func() []serf.Member

// Config holds the settings for various parameters for the
// UsageMetricsReporter
type Config struct {
	logger         hclog.Logger
	metricLabels   []metrics.Label
	stateProvider  StateProvider
	tickerInterval time.Duration
	getMembersFunc getMembersFunc
}

// WithDatacenter adds the datacenter as a label to all metrics emitted by the
// UsageMetricsReporter
func (c *Config) WithDatacenter(dc string) *Config {
	c.metricLabels = append(c.metricLabels, metrics.Label{Name: "datacenter", Value: dc})
	return c
}

// WithLogger takes a logger and creates a new, named sub-logger to use when
// running
func (c *Config) WithLogger(logger hclog.Logger) *Config {
	c.logger = logger.Named(logging.UsageMetrics)
	return c
}

// WithReportingInterval specifies the interval on which UsageMetricsReporter
// should emit metrics
func (c *Config) WithReportingInterval(dur time.Duration) *Config {
	c.tickerInterval = dur
	return c
}

// WithStateProvider specifies the StateProvider to use when gathering
// state-store usage data
func (c *Config) WithStateProvider(sp StateProvider) *Config {
	c.stateProvider = sp
	return c
}

// WithGetMembersFunc specifies the function used to identify cluster members
func (c *Config) WithGetMembersFunc(fn getMembersFunc) *Config {
	c.getMembersFunc = fn
	return c
}
// StateProvider defines an interface for retrieving a state.Store handle. In
// non-test code, this is satisfied by the fsm.FSM struct.
type StateProvider interface {
	State() *state.Store
}

// UsageMetricsReporter provides functionality for emitting usage metrics into
// the metrics stream. This makes it essentially a translation layer
// between the state store and metrics stream.
type UsageMetricsReporter struct {
	logger         hclog.Logger
	metricLabels   []metrics.Label
	stateProvider  StateProvider
	tickerInterval time.Duration
	getMembersFunc getMembersFunc
}

func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
	if cfg.stateProvider == nil {
		return nil, errors.New("must provide a StateProvider to usage reporter")
	}

	if cfg.getMembersFunc == nil {
		return nil, errors.New("must provide a getMembersFunc to usage reporter")
	}

	if cfg.logger == nil {
		cfg.logger = hclog.NewNullLogger()
	}

	if cfg.tickerInterval == 0 {
		// Metrics are aggregated every 10 seconds, so we default to that.
		cfg.tickerInterval = 10 * time.Second
	}

	u := &UsageMetricsReporter{
		logger:         cfg.logger,
		stateProvider:  cfg.stateProvider,
		metricLabels:   cfg.metricLabels,
		tickerInterval: cfg.tickerInterval,
		getMembersFunc: cfg.getMembersFunc,
	}

	return u, nil
}
// Run must be run in a goroutine, and can be stopped by cancelling the
// passed in context
func (u *UsageMetricsReporter) Run(ctx context.Context) {
	ticker := time.NewTicker(u.tickerInterval)
	for {
		select {
		case <-ctx.Done():
			u.logger.Debug("usage metrics reporter shutting down")
			ticker.Stop()
			return
		case <-ticker.C:
			u.runOnce()
		}
	}
}

func (u *UsageMetricsReporter) runOnce() {
	state := u.stateProvider.State()
	_, nodes, err := state.NodeCount()
	if err != nil {
		u.logger.Warn("failed to retrieve nodes from state store", "error", err)
	}
	metrics.SetGaugeWithLabels(
		[]string{"consul", "state", "nodes"},
		float32(nodes),
		u.metricLabels,
	)

	_, serviceUsage, err := state.ServiceUsage()
	if err != nil {
		u.logger.Warn("failed to retrieve services from state store", "error", err)
	}

	u.emitServiceUsage(serviceUsage)

	servers, clients := u.memberUsage()
	u.emitMemberUsage(servers, clients)
}
func (u *UsageMetricsReporter) memberUsage() (int, map[string]int) {
	if u.getMembersFunc == nil {
		return 0, nil
	}

	mems := u.getMembersFunc()
	if len(mems) <= 0 {
		u.logger.Warn("cluster reported zero members")
		return 0, nil
	}

	servers := 0
	clients := make(map[string]int)

	for _, m := range mems {
		if m.Status != serf.StatusAlive {
			continue
		}

		switch m.Tags["role"] {
		case "node":
			// "node" is the serf role advertised by client agents; count
			// them per network segment.
			clients[m.Tags["segment"]]++
		case "consul":
			// "consul" is the serf role advertised by server agents.
			servers++
		}
	}

	return servers, clients
}

func (u *UsageMetricsReporter) emitMemberUsage(servers int, clients map[string]int) {
	totalClients := 0

	// Emit one client gauge per network segment, labeled with the segment
	// name, and accumulate the cluster-wide total.
	for seg, c := range clients {
		segmentLabel := metrics.Label{Name: "segment", Value: seg}
		labels := append([]metrics.Label{segmentLabel}, u.metricLabels...)

		metrics.SetGaugeWithLabels(
			[]string{"consul", "members", "clients"},
			float32(c),
			labels,
		)

		totalClients += c
	}

	// Emit the total client count without a segment label.
	metrics.SetGaugeWithLabels(
		[]string{"consul", "members", "clients"},
		float32(totalClients),
		u.metricLabels,
	)

	metrics.SetGaugeWithLabels(
		[]string{"consul", "members", "servers"},
		float32(servers),
		u.metricLabels,
	)
}
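For context, a rough sketch of how a caller might wire up and start the reporter. This is not taken from the diff: fsm is assumed to be something satisfying StateProvider, members an assumed func() []serf.Member (for example, a closure over the server's LAN serf members), and logger and ctx are assumed to exist.

	// Hypothetical wiring of the usage metrics reporter.
	cfg := new(Config).
		WithDatacenter("dc1").
		WithLogger(logger).
		WithReportingInterval(10 * time.Second).
		WithStateProvider(fsm).
		WithGetMembersFunc(members)

	reporter, err := NewUsageMetricsReporter(cfg)
	if err != nil {
		return err
	}

	// Run blocks until the passed context is cancelled, so start it in a goroutine.
	go reporter.Run(ctx)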