hoststats: add package for collecting host statistics, including cpu, memory and disk usage (#17038)

Nick Ethier 2023-05-30 14:43:29 -04:00 committed by GitHub
parent 265c50b1dc
commit e5cdb702e5
17 changed files with 598 additions and 11 deletions

.changelog/17038.txt Normal file

@@ -0,0 +1,3 @@
```release-note:improvement
agent: add new metrics to track cpu, disk and memory usage for server hosts (defaults to enabled)
```


@@ -1106,6 +1106,9 @@ func (b *builder) build() (rt RuntimeConfig, err error) {
 		LocalProxyConfigResyncInterval: 30 * time.Second,
 	}

+	// host metrics are enabled by default if consul is configured with HashiCorp Cloud Platform integration
+	rt.Telemetry.EnableHostMetrics = boolValWithDefault(c.Telemetry.EnableHostMetrics, rt.IsCloudEnabled())
+
 	rt.TLS, err = b.buildTLSConfig(rt, c.TLS)
 	if err != nil {
 		return RuntimeConfig{}, err
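Because `boolValWithDefault` only falls back to `rt.IsCloudEnabled()` when no explicit value is set, an operator can still turn the feature off on an HCP-linked server. A minimal sketch of such a config (the cloud values below are hypothetical placeholders):

```hcl
cloud {
  resource_id   = "org/example/project/example" # hypothetical HCP resource
  client_id     = "example-client-id"
  client_secret = "example-client-secret"
}

telemetry {
  # an explicit value always wins over the cloud-driven default of true
  enable_host_metrics = false
}
```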


@@ -556,3 +556,22 @@ func TestBuilder_parsePrefixFilter(t *testing.T) {
 			}
 		})
 	}
+
+func TestBuilder_hostMetricsWithCloud(t *testing.T) {
+	devMode := true
+	builderOpts := LoadOpts{
+		DevMode: &devMode,
+		DefaultConfig: FileSource{
+			Name:   "test",
+			Format: "hcl",
+			Data:   `cloud{ resource_id = "abc" client_id = "abc" client_secret = "abc"}`,
+		},
+	}
+
+	result, err := Load(builderOpts)
+	require.NoError(t, err)
+	require.Empty(t, result.Warnings)
+
+	cfg := result.RuntimeConfig
+	require.NotNil(t, cfg)
+	require.True(t, cfg.Telemetry.EnableHostMetrics)
+}


@@ -691,6 +691,7 @@ type Telemetry struct {
 	CirconusSubmissionInterval *string  `mapstructure:"circonus_submission_interval" json:"circonus_submission_interval,omitempty"`
 	CirconusSubmissionURL      *string  `mapstructure:"circonus_submission_url" json:"circonus_submission_url,omitempty"`
 	DisableHostname            *bool    `mapstructure:"disable_hostname" json:"disable_hostname,omitempty"`
+	EnableHostMetrics          *bool    `mapstructure:"enable_host_metrics" json:"enable_host_metrics,omitempty"`
 	DogstatsdAddr              *string  `mapstructure:"dogstatsd_addr" json:"dogstatsd_addr,omitempty"`
 	DogstatsdTags              []string `mapstructure:"dogstatsd_tags" json:"dogstatsd_tags,omitempty"`
 	RetryFailedConfiguration   *bool    `mapstructure:"retry_failed_connection" json:"retry_failed_connection,omitempty"`


@@ -46,6 +46,7 @@ type testCase struct {
 	desc             string
 	args             []string
 	setup            func() // TODO: accept a testing.T instead of panic
+	cleanup          func()
 	expected         func(rt *RuntimeConfig)
 	expectedErr      string
 	expectedWarnings []string
@@ -2308,9 +2309,9 @@ func TestLoad_IntegrationWithFlags(t *testing.T) {
 		},
 		setup: func() {
 			os.Setenv("HCP_RESOURCE_ID", "env-id")
-			t.Cleanup(func() {
-				os.Unsetenv("HCP_RESOURCE_ID")
-			})
+		},
+		cleanup: func() {
+			os.Unsetenv("HCP_RESOURCE_ID")
 		},
 		expected: func(rt *RuntimeConfig) {
 			rt.DataDir = dataDir
@@ -2321,6 +2322,7 @@ func TestLoad_IntegrationWithFlags(t *testing.T) {
 			// server things
 			rt.ServerMode = true
+			rt.Telemetry.EnableHostMetrics = true
 			rt.TLS.ServerMode = true
 			rt.LeaveOnTerm = false
 			rt.SkipLeaveOnInt = true
@@ -2337,9 +2339,9 @@ func TestLoad_IntegrationWithFlags(t *testing.T) {
 		},
 		setup: func() {
 			os.Setenv("HCP_RESOURCE_ID", "env-id")
-			t.Cleanup(func() {
-				os.Unsetenv("HCP_RESOURCE_ID")
-			})
+		},
+		cleanup: func() {
+			os.Unsetenv("HCP_RESOURCE_ID")
 		},
 		json: []string{`{
 			"cloud": {
@@ -2360,6 +2362,7 @@ func TestLoad_IntegrationWithFlags(t *testing.T) {
 			// server things
 			rt.ServerMode = true
+			rt.Telemetry.EnableHostMetrics = true
 			rt.TLS.ServerMode = true
 			rt.LeaveOnTerm = false
 			rt.SkipLeaveOnInt = true
@@ -6032,6 +6035,9 @@ func (tc testCase) run(format string, dataDir string) func(t *testing.T) {
 		expected.ACLResolverSettings.EnterpriseMeta = *structs.NodeEnterpriseMetaInPartition(expected.PartitionOrDefault())

 		prototest.AssertDeepEqual(t, expected, actual, cmpopts.EquateEmpty())
+
+		if tc.cleanup != nil {
+			tc.cleanup()
+		}
 	}
 }
@@ -6754,6 +6760,7 @@ func TestLoad_FullConfig(t *testing.T) {
 			Expiration: 15 * time.Second,
 			Name:       "ftO6DySn", // notice this is the same as the metrics prefix
 		},
+		EnableHostMetrics: true,
 	},
 	TLS: tlsutil.Config{
 		InternalRPC: tlsutil.ProtocolConfig{


@@ -465,6 +465,7 @@
 	"DisableHostname": false,
 	"DogstatsdAddr": "",
 	"DogstatsdTags": [],
+	"EnableHostMetrics": false,
 	"FilterDefault": false,
 	"MetricsPrefix": "",
 	"PrometheusOpts": {


@@ -690,6 +690,7 @@ telemetry {
 	circonus_check_tags = "prvO4uBl"
 	circonus_submission_interval = "DolzaflP"
 	circonus_submission_url = "gTcbS93G"
+	enable_host_metrics = true
 	disable_hostname = true
 	dogstatsd_addr = "0wSndumK"
 	dogstatsd_tags = [ "3N81zSUB","Xtj8AnXZ" ]


@@ -808,6 +808,7 @@
 	"circonus_check_tags": "prvO4uBl",
 	"circonus_submission_interval": "DolzaflP",
 	"circonus_submission_url": "gTcbS93G",
+	"enable_host_metrics": true,
 	"disable_hostname": true,
 	"dogstatsd_addr": "0wSndumK",
 	"dogstatsd_tags": [


@@ -4,6 +4,7 @@
 package agent

 import (
+	"context"
 	"fmt"
 	"io"
 	"net"
@@ -41,6 +42,7 @@ import (
 	"github.com/hashicorp/consul/agent/xds"
 	"github.com/hashicorp/consul/ipaddr"
 	"github.com/hashicorp/consul/lib"
+	"github.com/hashicorp/consul/lib/hoststats"
 	"github.com/hashicorp/consul/logging"
 	"github.com/hashicorp/consul/tlsutil"
 )
@@ -59,6 +61,7 @@ type BaseDeps struct {
 	WatchedFiles []string

 	deregisterBalancer, deregisterResolver func()
+	stopHostCollector                      context.CancelFunc
 }

 type ConfigLoader func(source config.Source) (config.LoadResult, error)
@@ -117,6 +120,11 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer, providedLogger hclog.InterceptLogger) (BaseDeps, error) {
 	if err != nil {
 		return d, fmt.Errorf("failed to initialize telemetry: %w", err)
 	}
+	if !cfg.Telemetry.Disable && cfg.Telemetry.EnableHostMetrics {
+		ctx, cancel := context.WithCancel(context.Background())
+		hoststats.NewCollector(ctx, d.Logger, cfg.DataDir)
+		d.stopHostCollector = cancel
+	}

 	d.TLSConfigurator, err = tlsutil.NewConfigurator(cfg.TLS, d.Logger)
 	if err != nil {
@@ -214,11 +222,10 @@ func (bd BaseDeps) Close() {
 	bd.AutoConfig.Stop()
 	bd.MetricsConfig.Cancel()

-	if fn := bd.deregisterBalancer; fn != nil {
-		fn()
-	}
-	if fn := bd.deregisterResolver; fn != nil {
-		fn()
-	}
+	for _, fn := range []func(){bd.deregisterBalancer, bd.deregisterResolver, bd.stopHostCollector} {
+		if fn != nil {
+			fn()
+		}
+	}
 }
@@ -297,6 +304,10 @@ func getPrometheusDefs(cfg *config.RuntimeConfig, isServer bool) ([]prometheus.G
 		serverGauges,
 	}

+	if cfg.Telemetry.EnableHostMetrics {
+		gauges = append(gauges, hoststats.Gauges)
+	}
+
 	// TODO(ffmmm): conditionally add only leader specific metrics to gauges, counters, summaries, etc
 	if isServer {
 		gauges = append(gauges,

lib/hoststats/collector.go Normal file

@@ -0,0 +1,189 @@
package hoststats

import (
	"context"
	"fmt"
	"math"
	"runtime"
	"sync"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/go-hclog"
	"github.com/shirou/gopsutil/v3/disk"
	"github.com/shirou/gopsutil/v3/host"
	"github.com/shirou/gopsutil/v3/mem"
)
// Collector collects host resource usage stats
type Collector struct {
numCores int
cpuCalculator map[string]*cpuStatsCalculator
hostStats *HostStats
hostStatsLock sync.RWMutex
dataDir string
metrics Metrics
baseLabels []metrics.Label
logger hclog.Logger
}
// NewCollector returns a Collector and starts its collection loop. The dataDir
// is passed in so that disk usage statistics can be reported for the mountpoint
// where the dataDir lives.
func NewCollector(ctx context.Context, logger hclog.Logger, dataDir string, opts ...CollectorOption) *Collector {
logger = logger.Named("host_stats")
collector := initCollector(logger, dataDir, opts...)
go collector.loop(ctx)
return collector
}
// initCollector initializes the Collector but does not start the collection loop
func initCollector(logger hclog.Logger, dataDir string, opts ...CollectorOption) *Collector {
numCores := runtime.NumCPU()
statsCalculator := make(map[string]*cpuStatsCalculator)
collector := &Collector{
cpuCalculator: statsCalculator,
numCores: numCores,
logger: logger,
dataDir: dataDir,
}
for _, opt := range opts {
opt(collector)
}
if collector.metrics == nil {
collector.metrics = metrics.Default()
}
return collector
}
func (c *Collector) loop(ctx context.Context) {
// Start collecting host stats right away and then keep collecting every
// collection interval
next := time.NewTimer(0)
defer next.Stop()
for {
select {
case <-next.C:
c.collect()
next.Reset(hostStatsCollectionInterval)
c.Stats().Emit(c.metrics, c.baseLabels)
case <-ctx.Done():
return
}
}
}
// collect will collect stats related to resource usage of the host
func (c *Collector) collect() {
hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()}
// Determine up-time
uptime, err := host.Uptime()
if err != nil {
c.logger.Error("failed to collect uptime stats", "error", err)
uptime = 0
}
hs.Uptime = uptime
// Collect memory stats
mstats, err := c.collectMemoryStats()
if err != nil {
c.logger.Error("failed to collect memory stats", "error", err)
mstats = &MemoryStats{}
}
hs.Memory = mstats
// Collect cpu stats
cpus, err := c.collectCPUStats()
if err != nil {
c.logger.Error("failed to collect cpu stats", "error", err)
cpus = []*CPUStats{}
}
hs.CPU = cpus
// Collect disk stats
diskStats, err := c.collectDiskStats(c.dataDir)
if err != nil {
c.logger.Error("failed to collect dataDir disk stats", "error", err)
}
hs.DataDirStats = diskStats
// Update the collected status object.
c.hostStatsLock.Lock()
c.hostStats = hs
c.hostStatsLock.Unlock()
}
func (c *Collector) collectDiskStats(dir string) (*DiskStats, error) {
usage, err := disk.Usage(dir)
if err != nil {
return nil, fmt.Errorf("failed to collect disk usage stats: %w", err)
}
return c.toDiskStats(usage), nil
}
func (c *Collector) collectMemoryStats() (*MemoryStats, error) {
memStats, err := mem.VirtualMemory()
if err != nil {
return nil, err
}
mem := &MemoryStats{
Total: memStats.Total,
Available: memStats.Available,
Used: memStats.Used,
UsedPercent: memStats.UsedPercent,
Free: memStats.Free,
}
return mem, nil
}
// Stats returns the host stats that have been collected
func (c *Collector) Stats() *HostStats {
c.hostStatsLock.RLock()
defer c.hostStatsLock.RUnlock()
if c.hostStats == nil {
return &HostStats{}
}
return c.hostStats.Clone()
}
// toDiskStats converts a disk.UsageStat into a DiskStats
func (c *Collector) toDiskStats(usage *disk.UsageStat) *DiskStats {
ds := DiskStats{
Size: usage.Total,
Used: usage.Used,
Available: usage.Free,
UsedPercent: usage.UsedPercent,
InodesUsedPercent: usage.InodesUsedPercent,
Path: usage.Path,
}
if math.IsNaN(ds.UsedPercent) {
ds.UsedPercent = 0.0
}
if math.IsNaN(ds.InodesUsedPercent) {
ds.InodesUsedPercent = 0.0
}
return &ds
}
// CollectorOption modifies a Collector during construction.
type CollectorOption func(c *Collector)

// WithMetrics overrides the sink the Collector emits stats to.
func WithMetrics(m *metrics.Metrics) CollectorOption {
return func(c *Collector) {
c.metrics = m
}
}

// WithBaseLabels sets the labels attached to every emitted metric.
func WithBaseLabels(labels []metrics.Label) CollectorOption {
return func(c *Collector) {
c.baseLabels = labels
}
}
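For orientation, here is a minimal, hypothetical sketch of wiring the collector up outside the agent; the data directory path and base label are invented, and the agent itself does the equivalent in `NewBaseDeps` above:

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/go-hclog"

	"github.com/hashicorp/consul/lib/hoststats"
)

func main() {
	// Cancelling the context stops the collection loop, mirroring what
	// BaseDeps.Close does via stopHostCollector.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	collector := hoststats.NewCollector(ctx, hclog.Default(), "/opt/consul/data", // hypothetical data_dir
		hoststats.WithBaseLabels([]metrics.Label{{Name: "node", Value: "server-1"}}),
	)

	// The loop emits gauges every collection interval; Stats returns the
	// most recent snapshot for direct inspection.
	time.Sleep(2 * time.Second)
	fmt.Printf("host uptime: %d seconds\n", collector.Stats().Uptime)
}
```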

lib/hoststats/cpu.go Normal file

@@ -0,0 +1,85 @@
package hoststats

import (
	"math"

	"github.com/shirou/gopsutil/v3/cpu"
)
// cpuStatsCalculator calculates cpu usage percentages
type cpuStatsCalculator struct {
prev cpu.TimesStat
prevBusy float64
prevTotal float64
}
// calculate returns the current cpu usage percentages.
// Since cpu.TimesStat captures the total time a cpu has spent in various states,
// this function tracks the last seen stat and derives each cpu state's utilization
// as a percentage of the total change in cpu time between calls.
// The first time calculate is called, the returned CPUStats will report 100% idle
// usage since there is no previous value to calculate against.
func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) *CPUStats {
// sum all non-idle counters to get the total busy cpu time
currentBusy := times.User + times.System + times.Nice + times.Iowait + times.Irq +
times.Softirq + times.Steal + times.Guest + times.GuestNice
// sum of the total cpu time
currentTotal := currentBusy + times.Idle
// calculate how much cpu time has passed since last calculation
deltaTotal := currentTotal - h.prevTotal
stats := &CPUStats{
CPU: times.CPU,
// calculate each percentage as the ratio of the change
// in each state's time to the total change in cpu time
Idle: ((times.Idle - h.prev.Idle) / deltaTotal) * 100,
User: ((times.User - h.prev.User) / deltaTotal) * 100,
System: ((times.System - h.prev.System) / deltaTotal) * 100,
Iowait: ((times.Iowait - h.prev.Iowait) / deltaTotal) * 100,
Total: ((currentBusy - h.prevBusy) / deltaTotal) * 100,
}
// Protect against any invalid values
if math.IsNaN(stats.Idle) || math.IsInf(stats.Idle, 0) {
stats.Idle = 100.0
}
if math.IsNaN(stats.User) || math.IsInf(stats.User, 0) {
stats.User = 0.0
}
if math.IsNaN(stats.System) || math.IsInf(stats.System, 0) {
stats.System = 0.0
}
if math.IsNaN(stats.Iowait) || math.IsInf(stats.Iowait, 0) {
stats.Iowait = 0.0
}
if math.IsNaN(stats.Total) || math.IsInf(stats.Total, 0) {
stats.Total = 0.0
}
h.prev = times
h.prevTotal = currentTotal
h.prevBusy = currentBusy
return stats
}
func (c *Collector) collectCPUStats() (cpus []*CPUStats, err error) {
cpuStats, err := cpu.Times(true)
if err != nil {
return nil, err
}
cs := make([]*CPUStats, len(cpuStats))
for idx, cpuStat := range cpuStats {
percentCalculator, ok := c.cpuCalculator[cpuStat.CPU]
if !ok {
percentCalculator = &cpuStatsCalculator{}
c.cpuCalculator[cpuStat.CPU] = percentCalculator
}
cs[idx] = percentCalculator.calculate(cpuStat)
}
return cs, nil
}
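To make the delta math concrete, here is a small in-package example (the cumulative tick values are hypothetical, and it must live in package hoststats because `cpuStatsCalculator` is unexported): between the two samples the cpu accumulates 20 ticks of user time and 80 ticks of idle time, so the second call reports 20% user and 80% idle.

```go
package hoststats

import (
	"fmt"

	"github.com/shirou/gopsutil/v3/cpu"
)

func Example_cpuPercentages() {
	calc := &cpuStatsCalculator{}

	// Prime the calculator with a first sample (hypothetical cumulative ticks).
	calc.calculate(cpu.TimesStat{CPU: "cpu0", User: 100, Idle: 900})

	// 20 more user ticks and 80 more idle ticks: deltaTotal is 100,
	// so User = 20/100 = 20% and Idle = 80/100 = 80%.
	stats := calc.calculate(cpu.TimesStat{CPU: "cpu0", User: 120, Idle: 980})
	fmt.Printf("user=%.0f%% idle=%.0f%% busy=%.0f%%\n", stats.User, stats.Idle, stats.Total)
	// Output: user=20% idle=80% busy=20%
}
```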

lib/hoststats/cpu_test.go Normal file

@@ -0,0 +1,58 @@
package hoststats

import (
	"math"
	"os"
	"testing"
	"time"

	"github.com/hashicorp/consul/sdk/testutil"
	"github.com/shirou/gopsutil/v3/cpu"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)
func TestHostStats_CPU(t *testing.T) {
logger := testutil.Logger(t)
cwd, err := os.Getwd()
assert.Nil(t, err)
hs := initCollector(logger, cwd)
// Collect twice so we can calculate percentages; busy-wait between the two
// collections to generate some work so that the cpu values change
hs.collect()
for begin := time.Now(); time.Since(begin) < 100*time.Millisecond; {
}
hs.collect()
stats := hs.Stats()
assert.NotZero(t, len(stats.CPU))
for _, cpu := range stats.CPU {
assert.False(t, math.IsNaN(cpu.Idle))
assert.False(t, math.IsNaN(cpu.Total))
assert.False(t, math.IsNaN(cpu.System))
assert.False(t, math.IsNaN(cpu.User))
assert.False(t, math.IsInf(cpu.Idle, 0))
assert.False(t, math.IsInf(cpu.Total, 0))
assert.False(t, math.IsInf(cpu.System, 0))
assert.False(t, math.IsInf(cpu.User, 0))
}
}
func TestCpuStatsCalculator_Nan(t *testing.T) {
times := cpu.TimesStat{
User: 0.0,
Idle: 100.0,
System: 0.0,
}
calculator := &cpuStatsCalculator{}
calculator.calculate(times)
stats := calculator.calculate(times)
require.Equal(t, 100.0, stats.Idle)
require.Zero(t, stats.User)
require.Zero(t, stats.System)
require.Zero(t, stats.Iowait)
require.Zero(t, stats.Total)
}

lib/hoststats/host.go Normal file

@@ -0,0 +1,92 @@
package hoststats

import (
	"time"

	"github.com/armon/go-metrics"
)
var hostStatsCollectionInterval = 10 * time.Second
// HostStats represents resource usage stats of the host running a Consul agent
type HostStats struct {
Memory *MemoryStats
CPU []*CPUStats
DataDirStats *DiskStats
Uptime uint64
Timestamp int64
}
// Clone returns a shallow copy of the stats; the nested Memory, CPU and
// DataDirStats values are shared with the original.
func (hs *HostStats) Clone() *HostStats {
clone := &HostStats{}
*clone = *hs
return clone
}
func (hs *HostStats) Emit(sink Metrics, baseLabels []metrics.Label) {
if hs.Memory != nil {
sink.SetGaugeWithLabels([]string{"host", "memory", "total"}, float32(hs.Memory.Total), baseLabels)
sink.SetGaugeWithLabels([]string{"host", "memory", "available"}, float32(hs.Memory.Available), baseLabels)
sink.SetGaugeWithLabels([]string{"host", "memory", "used"}, float32(hs.Memory.Used), baseLabels)
sink.SetGaugeWithLabels([]string{"host", "memory", "used_percent"}, float32(hs.Memory.UsedPercent), baseLabels)
sink.SetGaugeWithLabels([]string{"host", "memory", "free"}, float32(hs.Memory.Free), baseLabels)
}
for _, cpu := range hs.CPU {
labels := append(baseLabels, metrics.Label{
Name: "cpu",
Value: cpu.CPU,
})
sink.SetGaugeWithLabels([]string{"host", "cpu", "total"}, float32(cpu.Total), labels)
sink.SetGaugeWithLabels([]string{"host", "cpu", "user"}, float32(cpu.User), labels)
sink.SetGaugeWithLabels([]string{"host", "cpu", "idle"}, float32(cpu.Idle), labels)
sink.SetGaugeWithLabels([]string{"host", "cpu", "iowait"}, float32(cpu.Iowait), labels)
sink.SetGaugeWithLabels([]string{"host", "cpu", "system"}, float32(cpu.System), labels)
}
if hs.DataDirStats != nil {
diskLabels := append(baseLabels, metrics.Label{
Name: "path",
Value: hs.DataDirStats.Path,
})
sink.SetGaugeWithLabels([]string{"host", "disk", "size"}, float32(hs.DataDirStats.Size), diskLabels)
sink.SetGaugeWithLabels([]string{"host", "disk", "used"}, float32(hs.DataDirStats.Used), diskLabels)
sink.SetGaugeWithLabels([]string{"host", "disk", "available"}, float32(hs.DataDirStats.Available), diskLabels)
sink.SetGaugeWithLabels([]string{"host", "disk", "used_percent"}, float32(hs.DataDirStats.UsedPercent), diskLabels)
sink.SetGaugeWithLabels([]string{"host", "disk", "inodes_percent"}, float32(hs.DataDirStats.InodesUsedPercent), diskLabels)
}
sink.SetGaugeWithLabels([]string{"host", "uptime"}, float32(hs.Uptime), baseLabels)
}
// CPUStats represents stats related to cpu usage
type CPUStats struct {
CPU string
User float64
System float64
Idle float64
Iowait float64
Total float64
}
// MemoryStats represents stats related to virtual memory usage
type MemoryStats struct {
Total uint64
Available uint64
Used uint64
UsedPercent float64
Free uint64
}
// DiskStats represents stats related to disk usage
type DiskStats struct {
Path string
Size uint64
Used uint64
Available uint64
UsedPercent float64
InodesUsedPercent float64
}

lib/hoststats/metrics.go Normal file

@@ -0,0 +1,79 @@
package hoststats

import (
	"github.com/armon/go-metrics"
	"github.com/armon/go-metrics/prometheus"
)
// Metrics defines an interface for the methods used to emit data to the go-metrics library.
// `metrics.Default()` should always satisfy this interface.
type Metrics interface {
SetGaugeWithLabels(key []string, val float32, labels []metrics.Label)
}
var Gauges = []prometheus.GaugeDefinition{
{
Name: []string{"host", "memory", "total"},
Help: "Total physical memory in bytes",
},
{
Name: []string{"host", "memory", "available"},
Help: "Available physical memory in bytes",
},
{
Name: []string{"host", "memory", "free"},
Help: "Free physical memory in bytes",
},
{
Name: []string{"host", "memory", "used"},
Help: "Used physical memory in bytes",
},
{
Name: []string{"host", "memory", "used_percent"},
Help: "Percentage of physical memory in use",
},
{
Name: []string{"host", "cpu", "total"},
Help: "Total cpu utilization",
},
{
Name: []string{"host", "cpu", "user"},
Help: "User cpu utilization",
},
{
Name: []string{"host", "cpu", "idle"},
Help: "Idle cpu utilization",
},
{
Name: []string{"host", "cpu", "iowait"},
Help: "Iowait cpu utilization",
},
{
Name: []string{"host", "cpu", "system"},
Help: "System cpu utilization",
},
{
Name: []string{"host", "disk", "size"},
Help: "Size of disk in bytes",
},
{
Name: []string{"host", "disk", "used"},
Help: "Disk usage in bytes",
},
{
Name: []string{"host", "disk", "available"},
Help: "Available bytes on disk",
},
{
Name: []string{"host", "disk", "used_percent"},
Help: "Percentage of disk space usage",
},
{
Name: []string{"host", "disk", "inodes_percent"},
Help: "Percentage of disk inodes usage",
},
{
Name: []string{"host", "uptime"},
Help: "System uptime",
},
}
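Because `Metrics` is a single-method interface, a test double is trivial. A hypothetical in-package sketch that captures gauges handed to `Emit` instead of forwarding them to go-metrics:

```go
package hoststats

import (
	"strings"

	"github.com/armon/go-metrics"
)

// fakeSink is a hypothetical test double satisfying the Metrics interface.
type fakeSink struct {
	gauges map[string]float32
}

// SetGaugeWithLabels records the gauge value keyed by its dotted metric name.
func (f *fakeSink) SetGaugeWithLabels(key []string, val float32, labels []metrics.Label) {
	if f.gauges == nil {
		f.gauges = make(map[string]float32)
	}
	f.gauges[strings.Join(key, ".")] = val
}
```

A test could then call `stats.Emit(sink, nil)` and assert on entries such as `sink.gauges["host.memory.total"]`. Note that `WithMetrics` takes a concrete `*metrics.Metrics`, so this interface is the seam for `Emit`, not for the collector option.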


@@ -204,6 +204,11 @@ type TelemetryConfig struct {
 	// hcl: telemetry { statsite_address = string }
 	StatsiteAddr string `json:"statsite_address,omitempty" mapstructure:"statsite_address"`

+	// EnableHostMetrics will enable collection of metrics about the host system,
+	// such as cpu, memory and disk usage.
+	//
+	// hcl: telemetry { enable_host_metrics = (true|false) }
+	EnableHostMetrics bool `json:"enable_host_metrics,omitempty" mapstructure:"enable_host_metrics"`
+
 	// PrometheusOpts provides configuration for the PrometheusSink. Currently the only configuration
 	// we acquire from hcl is the retention time. We also use definition slices that are set in agent setup
 	// before being passed to InitTelemetry.


@@ -1831,6 +1831,9 @@ subsystem that provides Consul's service mesh capabilities.
   of global tags that will be added to all telemetry packets sent to DogStatsD.
   It is a list of strings, where each string looks like "my_tag_name:my_tag_value".

+- `enable_host_metrics` ((#telemetry-enable_host_metrics))
+  This enables reporting of host metrics such as cpu, memory, and disk usage.
+  Defaults to `false`, or to `true` when Consul is configured with HashiCorp
+  Cloud Platform integration.
+
 - `filter_default` ((#telemetry-filter_default))
   This controls whether to allow metrics that have not been specified by the filter.
   Defaults to `true`, which will allow all metrics when no filters are provided.


@@ -755,3 +755,32 @@ Consul attaches the following labels to metric values.
 | `peer_id`   | The ID of a peer connected to the reporting cluster or leader. | Any UUID |
 | `partition` | <EnterpriseAlert inline /> Name of the partition that the peering is created in. | Any defined partition name in the cluster |
+
+## Server Host Metrics
+
+Consul servers can report the following metrics about the host's system resources.
+This feature must be enabled in the [agent telemetry configuration](/consul/docs/agent/config/config-files#telemetry-enable_host_metrics).
+Note that if the Consul server is operating inside a container, these metrics
+still report host resource usage and do not reflect any resource limits placed
+on the container.
+
+**Requirements:**
+
+- Consul 1.15.3+
+
+| Metric                            | Description                                                        | Unit    | Type  |
+| --------------------------------- | ------------------------------------------------------------------ | ------- | ----- |
+| `consul.host.memory.total`        | The total physical memory in bytes                                 | bytes   | gauge |
+| `consul.host.memory.available`    | The available physical memory in bytes                             | bytes   | gauge |
+| `consul.host.memory.free`         | The free physical memory in bytes                                  | bytes   | gauge |
+| `consul.host.memory.used`         | The used physical memory in bytes                                  | bytes   | gauge |
+| `consul.host.memory.used_percent` | The used physical memory as a percentage of total physical memory  | percent | gauge |
+| `consul.host.cpu.total`           | The host's total cpu utilization                                   | percent | gauge |
+| `consul.host.cpu.user`            | The cpu utilization in user space                                  | percent | gauge |
+| `consul.host.cpu.idle`            | The cpu utilization in idle state                                  | percent | gauge |
+| `consul.host.cpu.iowait`          | The cpu utilization in iowait state                                | percent | gauge |
+| `consul.host.cpu.system`          | The cpu utilization in system space                                | percent | gauge |
+| `consul.host.disk.size`           | The size in bytes of the data_dir disk                             | bytes   | gauge |
+| `consul.host.disk.used`           | The number of bytes used on the data_dir disk                      | bytes   | gauge |
+| `consul.host.disk.available`      | The number of bytes available on the data_dir disk                 | bytes   | gauge |
+| `consul.host.disk.used_percent`   | The percentage of disk space used on the data_dir disk             | percent | gauge |
+| `consul.host.disk.inodes_percent` | The percentage of inode usage on the data_dir disk                 | percent | gauge |
+| `consul.host.uptime`              | The uptime of the host in seconds                                  | seconds | gauge |
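For illustration, enabling the feature explicitly looks like this in an agent config file (the same stanza the full_config.hcl fixture above exercises):

```hcl
telemetry {
  enable_host_metrics = true
}
```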