hoststats: add package for collecting host statistics including cpu memory and disk usage (#17038)
This commit is contained in:
parent
265c50b1dc
commit
e5cdb702e5
|
@ -0,0 +1,3 @@
|
||||||
|
```release-note:improvement
|
||||||
|
agent: add new metrics to track CPU, disk, and memory usage for server hosts (disabled by default; enabled by default when HashiCorp Cloud Platform integration is configured)
|
||||||
|
```
|
|
@ -1106,6 +1106,9 @@ func (b *builder) build() (rt RuntimeConfig, err error) {
|
||||||
LocalProxyConfigResyncInterval: 30 * time.Second,
|
LocalProxyConfigResyncInterval: 30 * time.Second,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// host metrics are enabled by default if consul is configured with HashiCorp Cloud Platform integration
|
||||||
|
rt.Telemetry.EnableHostMetrics = boolValWithDefault(c.Telemetry.EnableHostMetrics, rt.IsCloudEnabled())
|
||||||
|
|
||||||
rt.TLS, err = b.buildTLSConfig(rt, c.TLS)
|
rt.TLS, err = b.buildTLSConfig(rt, c.TLS)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return RuntimeConfig{}, err
|
return RuntimeConfig{}, err
|
||||||
|
|
|
@ -556,3 +556,22 @@ func TestBuilder_parsePrefixFilter(t *testing.T) {
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBuidler_hostMetricsWithCloud(t *testing.T) {
|
||||||
|
devMode := true
|
||||||
|
builderOpts := LoadOpts{
|
||||||
|
DevMode: &devMode,
|
||||||
|
DefaultConfig: FileSource{
|
||||||
|
Name: "test",
|
||||||
|
Format: "hcl",
|
||||||
|
Data: `cloud{ resource_id = "abc" client_id = "abc" client_secret = "abc"}`,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
result, err := Load(builderOpts)
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Empty(t, result.Warnings)
|
||||||
|
cfg := result.RuntimeConfig
|
||||||
|
require.NotNil(t, cfg)
|
||||||
|
require.True(t, cfg.Telemetry.EnableHostMetrics)
|
||||||
|
}
|
||||||
|
|
|
@ -691,6 +691,7 @@ type Telemetry struct {
|
||||||
CirconusSubmissionInterval *string `mapstructure:"circonus_submission_interval" json:"circonus_submission_interval,omitempty"`
|
CirconusSubmissionInterval *string `mapstructure:"circonus_submission_interval" json:"circonus_submission_interval,omitempty"`
|
||||||
CirconusSubmissionURL *string `mapstructure:"circonus_submission_url" json:"circonus_submission_url,omitempty"`
|
CirconusSubmissionURL *string `mapstructure:"circonus_submission_url" json:"circonus_submission_url,omitempty"`
|
||||||
DisableHostname *bool `mapstructure:"disable_hostname" json:"disable_hostname,omitempty"`
|
DisableHostname *bool `mapstructure:"disable_hostname" json:"disable_hostname,omitempty"`
|
||||||
|
EnableHostMetrics *bool `mapstructure:"enable_host_metrics" json:"enable_host_metrics,omitempty"`
|
||||||
DogstatsdAddr *string `mapstructure:"dogstatsd_addr" json:"dogstatsd_addr,omitempty"`
|
DogstatsdAddr *string `mapstructure:"dogstatsd_addr" json:"dogstatsd_addr,omitempty"`
|
||||||
DogstatsdTags []string `mapstructure:"dogstatsd_tags" json:"dogstatsd_tags,omitempty"`
|
DogstatsdTags []string `mapstructure:"dogstatsd_tags" json:"dogstatsd_tags,omitempty"`
|
||||||
RetryFailedConfiguration *bool `mapstructure:"retry_failed_connection" json:"retry_failed_connection,omitempty"`
|
RetryFailedConfiguration *bool `mapstructure:"retry_failed_connection" json:"retry_failed_connection,omitempty"`
|
||||||
|
|
|
@ -46,6 +46,7 @@ type testCase struct {
|
||||||
desc string
|
desc string
|
||||||
args []string
|
args []string
|
||||||
setup func() // TODO: accept a testing.T instead of panic
|
setup func() // TODO: accept a testing.T instead of panic
|
||||||
|
cleanup func()
|
||||||
expected func(rt *RuntimeConfig)
|
expected func(rt *RuntimeConfig)
|
||||||
expectedErr string
|
expectedErr string
|
||||||
expectedWarnings []string
|
expectedWarnings []string
|
||||||
|
@ -2308,9 +2309,9 @@ func TestLoad_IntegrationWithFlags(t *testing.T) {
|
||||||
},
|
},
|
||||||
setup: func() {
|
setup: func() {
|
||||||
os.Setenv("HCP_RESOURCE_ID", "env-id")
|
os.Setenv("HCP_RESOURCE_ID", "env-id")
|
||||||
t.Cleanup(func() {
|
},
|
||||||
os.Unsetenv("HCP_RESOURCE_ID")
|
cleanup: func() {
|
||||||
})
|
os.Unsetenv("HCP_RESOURCE_ID")
|
||||||
},
|
},
|
||||||
expected: func(rt *RuntimeConfig) {
|
expected: func(rt *RuntimeConfig) {
|
||||||
rt.DataDir = dataDir
|
rt.DataDir = dataDir
|
||||||
|
@ -2321,6 +2322,7 @@ func TestLoad_IntegrationWithFlags(t *testing.T) {
|
||||||
|
|
||||||
// server things
|
// server things
|
||||||
rt.ServerMode = true
|
rt.ServerMode = true
|
||||||
|
rt.Telemetry.EnableHostMetrics = true
|
||||||
rt.TLS.ServerMode = true
|
rt.TLS.ServerMode = true
|
||||||
rt.LeaveOnTerm = false
|
rt.LeaveOnTerm = false
|
||||||
rt.SkipLeaveOnInt = true
|
rt.SkipLeaveOnInt = true
|
||||||
|
@ -2337,9 +2339,9 @@ func TestLoad_IntegrationWithFlags(t *testing.T) {
|
||||||
},
|
},
|
||||||
setup: func() {
|
setup: func() {
|
||||||
os.Setenv("HCP_RESOURCE_ID", "env-id")
|
os.Setenv("HCP_RESOURCE_ID", "env-id")
|
||||||
t.Cleanup(func() {
|
},
|
||||||
os.Unsetenv("HCP_RESOURCE_ID")
|
cleanup: func() {
|
||||||
})
|
os.Unsetenv("HCP_RESOURCE_ID")
|
||||||
},
|
},
|
||||||
json: []string{`{
|
json: []string{`{
|
||||||
"cloud": {
|
"cloud": {
|
||||||
|
@ -2360,6 +2362,7 @@ func TestLoad_IntegrationWithFlags(t *testing.T) {
|
||||||
|
|
||||||
// server things
|
// server things
|
||||||
rt.ServerMode = true
|
rt.ServerMode = true
|
||||||
|
rt.Telemetry.EnableHostMetrics = true
|
||||||
rt.TLS.ServerMode = true
|
rt.TLS.ServerMode = true
|
||||||
rt.LeaveOnTerm = false
|
rt.LeaveOnTerm = false
|
||||||
rt.SkipLeaveOnInt = true
|
rt.SkipLeaveOnInt = true
|
||||||
|
@ -6032,6 +6035,9 @@ func (tc testCase) run(format string, dataDir string) func(t *testing.T) {
|
||||||
expected.ACLResolverSettings.EnterpriseMeta = *structs.NodeEnterpriseMetaInPartition(expected.PartitionOrDefault())
|
expected.ACLResolverSettings.EnterpriseMeta = *structs.NodeEnterpriseMetaInPartition(expected.PartitionOrDefault())
|
||||||
|
|
||||||
prototest.AssertDeepEqual(t, expected, actual, cmpopts.EquateEmpty())
|
prototest.AssertDeepEqual(t, expected, actual, cmpopts.EquateEmpty())
|
||||||
|
if tc.cleanup != nil {
|
||||||
|
tc.cleanup()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6754,6 +6760,7 @@ func TestLoad_FullConfig(t *testing.T) {
|
||||||
Expiration: 15 * time.Second,
|
Expiration: 15 * time.Second,
|
||||||
Name: "ftO6DySn", // notice this is the same as the metrics prefix
|
Name: "ftO6DySn", // notice this is the same as the metrics prefix
|
||||||
},
|
},
|
||||||
|
EnableHostMetrics: true,
|
||||||
},
|
},
|
||||||
TLS: tlsutil.Config{
|
TLS: tlsutil.Config{
|
||||||
InternalRPC: tlsutil.ProtocolConfig{
|
InternalRPC: tlsutil.ProtocolConfig{
|
||||||
|
|
|
@ -465,6 +465,7 @@
|
||||||
"DisableHostname": false,
|
"DisableHostname": false,
|
||||||
"DogstatsdAddr": "",
|
"DogstatsdAddr": "",
|
||||||
"DogstatsdTags": [],
|
"DogstatsdTags": [],
|
||||||
|
"EnableHostMetrics": false,
|
||||||
"FilterDefault": false,
|
"FilterDefault": false,
|
||||||
"MetricsPrefix": "",
|
"MetricsPrefix": "",
|
||||||
"PrometheusOpts": {
|
"PrometheusOpts": {
|
||||||
|
|
|
@ -690,6 +690,7 @@ telemetry {
|
||||||
circonus_check_tags = "prvO4uBl"
|
circonus_check_tags = "prvO4uBl"
|
||||||
circonus_submission_interval = "DolzaflP"
|
circonus_submission_interval = "DolzaflP"
|
||||||
circonus_submission_url = "gTcbS93G"
|
circonus_submission_url = "gTcbS93G"
|
||||||
|
enable_host_metrics = true
|
||||||
disable_hostname = true
|
disable_hostname = true
|
||||||
dogstatsd_addr = "0wSndumK"
|
dogstatsd_addr = "0wSndumK"
|
||||||
dogstatsd_tags = [ "3N81zSUB","Xtj8AnXZ" ]
|
dogstatsd_tags = [ "3N81zSUB","Xtj8AnXZ" ]
|
||||||
|
|
|
@ -808,6 +808,7 @@
|
||||||
"circonus_check_tags": "prvO4uBl",
|
"circonus_check_tags": "prvO4uBl",
|
||||||
"circonus_submission_interval": "DolzaflP",
|
"circonus_submission_interval": "DolzaflP",
|
||||||
"circonus_submission_url": "gTcbS93G",
|
"circonus_submission_url": "gTcbS93G",
|
||||||
|
"enable_host_metrics": true,
|
||||||
"disable_hostname": true,
|
"disable_hostname": true,
|
||||||
"dogstatsd_addr": "0wSndumK",
|
"dogstatsd_addr": "0wSndumK",
|
||||||
"dogstatsd_tags": [
|
"dogstatsd_tags": [
|
||||||
|
|
|
@ -4,6 +4,7 @@
|
||||||
package agent
|
package agent
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"net"
|
"net"
|
||||||
|
@ -41,6 +42,7 @@ import (
|
||||||
"github.com/hashicorp/consul/agent/xds"
|
"github.com/hashicorp/consul/agent/xds"
|
||||||
"github.com/hashicorp/consul/ipaddr"
|
"github.com/hashicorp/consul/ipaddr"
|
||||||
"github.com/hashicorp/consul/lib"
|
"github.com/hashicorp/consul/lib"
|
||||||
|
"github.com/hashicorp/consul/lib/hoststats"
|
||||||
"github.com/hashicorp/consul/logging"
|
"github.com/hashicorp/consul/logging"
|
||||||
"github.com/hashicorp/consul/tlsutil"
|
"github.com/hashicorp/consul/tlsutil"
|
||||||
)
|
)
|
||||||
|
@ -59,6 +61,7 @@ type BaseDeps struct {
|
||||||
WatchedFiles []string
|
WatchedFiles []string
|
||||||
|
|
||||||
deregisterBalancer, deregisterResolver func()
|
deregisterBalancer, deregisterResolver func()
|
||||||
|
stopHostCollector context.CancelFunc
|
||||||
}
|
}
|
||||||
|
|
||||||
type ConfigLoader func(source config.Source) (config.LoadResult, error)
|
type ConfigLoader func(source config.Source) (config.LoadResult, error)
|
||||||
|
@ -117,6 +120,11 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer, providedLogger hcl
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return d, fmt.Errorf("failed to initialize telemetry: %w", err)
|
return d, fmt.Errorf("failed to initialize telemetry: %w", err)
|
||||||
}
|
}
|
||||||
|
if !cfg.Telemetry.Disable && cfg.Telemetry.EnableHostMetrics {
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
hoststats.NewCollector(ctx, d.Logger, cfg.DataDir)
|
||||||
|
d.stopHostCollector = cancel
|
||||||
|
}
|
||||||
|
|
||||||
d.TLSConfigurator, err = tlsutil.NewConfigurator(cfg.TLS, d.Logger)
|
d.TLSConfigurator, err = tlsutil.NewConfigurator(cfg.TLS, d.Logger)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -214,11 +222,10 @@ func (bd BaseDeps) Close() {
|
||||||
bd.AutoConfig.Stop()
|
bd.AutoConfig.Stop()
|
||||||
bd.MetricsConfig.Cancel()
|
bd.MetricsConfig.Cancel()
|
||||||
|
|
||||||
if fn := bd.deregisterBalancer; fn != nil {
|
for _, fn := range []func(){bd.deregisterBalancer, bd.deregisterResolver, bd.stopHostCollector} {
|
||||||
fn()
|
if fn != nil {
|
||||||
}
|
fn()
|
||||||
if fn := bd.deregisterResolver; fn != nil {
|
}
|
||||||
fn()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -297,6 +304,10 @@ func getPrometheusDefs(cfg *config.RuntimeConfig, isServer bool) ([]prometheus.G
|
||||||
serverGauges,
|
serverGauges,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if cfg.Telemetry.EnableHostMetrics {
|
||||||
|
gauges = append(gauges, hoststats.Gauges)
|
||||||
|
}
|
||||||
|
|
||||||
// TODO(ffmmm): conditionally add only leader specific metrics to gauges, counters, summaries, etc
|
// TODO(ffmmm): conditionally add only leader specific metrics to gauges, counters, summaries, etc
|
||||||
if isServer {
|
if isServer {
|
||||||
gauges = append(gauges,
|
gauges = append(gauges,
|
||||||
|
|
|
@ -0,0 +1,189 @@
|
||||||
|
package hoststats
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"runtime"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/armon/go-metrics"
|
||||||
|
"github.com/hashicorp/go-hclog"
|
||||||
|
"github.com/shirou/gopsutil/v3/disk"
|
||||||
|
"github.com/shirou/gopsutil/v3/host"
|
||||||
|
"github.com/shirou/gopsutil/v3/mem"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Collector collects host resource usage stats
|
||||||
|
type Collector struct {
|
||||||
|
numCores int
|
||||||
|
cpuCalculator map[string]*cpuStatsCalculator
|
||||||
|
hostStats *HostStats
|
||||||
|
hostStatsLock sync.RWMutex
|
||||||
|
dataDir string
|
||||||
|
|
||||||
|
metrics Metrics
|
||||||
|
baseLabels []metrics.Label
|
||||||
|
|
||||||
|
logger hclog.Logger
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewCollector returns a Collector. The dataDir is passed in
|
||||||
|
// so that we can present the disk related statistics for the mountpoint where the dataDir exists
|
||||||
|
func NewCollector(ctx context.Context, logger hclog.Logger, dataDir string, opts ...CollectorOption) *Collector {
|
||||||
|
logger = logger.Named("host_stats")
|
||||||
|
collector := initCollector(logger, dataDir)
|
||||||
|
go collector.loop(ctx)
|
||||||
|
return collector
|
||||||
|
}
|
||||||
|
|
||||||
|
// initCollector initializes the Collector but does not start the collection loop
|
||||||
|
func initCollector(logger hclog.Logger, dataDir string, opts ...CollectorOption) *Collector {
|
||||||
|
numCores := runtime.NumCPU()
|
||||||
|
statsCalculator := make(map[string]*cpuStatsCalculator)
|
||||||
|
collector := &Collector{
|
||||||
|
cpuCalculator: statsCalculator,
|
||||||
|
numCores: numCores,
|
||||||
|
logger: logger,
|
||||||
|
dataDir: dataDir,
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, opt := range opts {
|
||||||
|
opt(collector)
|
||||||
|
}
|
||||||
|
|
||||||
|
if collector.metrics == nil {
|
||||||
|
collector.metrics = metrics.Default()
|
||||||
|
}
|
||||||
|
return collector
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Collector) loop(ctx context.Context) {
|
||||||
|
// Start collecting host stats right away and then keep collecting every
|
||||||
|
// collection interval
|
||||||
|
next := time.NewTimer(0)
|
||||||
|
defer next.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-next.C:
|
||||||
|
c.collect()
|
||||||
|
next.Reset(hostStatsCollectionInterval)
|
||||||
|
c.Stats().Emit(c.metrics, c.baseLabels)
|
||||||
|
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// collect will collect stats related to resource usage of the host
|
||||||
|
func (c *Collector) collect() {
|
||||||
|
hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()}
|
||||||
|
|
||||||
|
// Determine up-time
|
||||||
|
uptime, err := host.Uptime()
|
||||||
|
if err != nil {
|
||||||
|
c.logger.Error("failed to collect uptime stats", "error", err)
|
||||||
|
uptime = 0
|
||||||
|
}
|
||||||
|
hs.Uptime = uptime
|
||||||
|
|
||||||
|
// Collect memory stats
|
||||||
|
mstats, err := c.collectMemoryStats()
|
||||||
|
if err != nil {
|
||||||
|
c.logger.Error("failed to collect memory stats", "error", err)
|
||||||
|
mstats = &MemoryStats{}
|
||||||
|
}
|
||||||
|
hs.Memory = mstats
|
||||||
|
|
||||||
|
// Collect cpu stats
|
||||||
|
cpus, err := c.collectCPUStats()
|
||||||
|
if err != nil {
|
||||||
|
c.logger.Error("failed to collect cpu stats", "error", err)
|
||||||
|
cpus = []*CPUStats{}
|
||||||
|
}
|
||||||
|
hs.CPU = cpus
|
||||||
|
|
||||||
|
// Collect disk stats
|
||||||
|
diskStats, err := c.collectDiskStats(c.dataDir)
|
||||||
|
if err != nil {
|
||||||
|
c.logger.Error("failed to collect dataDir disk stats", "error", err)
|
||||||
|
}
|
||||||
|
hs.DataDirStats = diskStats
|
||||||
|
|
||||||
|
// Update the collected status object.
|
||||||
|
c.hostStatsLock.Lock()
|
||||||
|
c.hostStats = hs
|
||||||
|
c.hostStatsLock.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Collector) collectDiskStats(dir string) (*DiskStats, error) {
|
||||||
|
usage, err := disk.Usage(dir)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to collect disk usage stats: %w", err)
|
||||||
|
}
|
||||||
|
return c.toDiskStats(usage), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Collector) collectMemoryStats() (*MemoryStats, error) {
|
||||||
|
memStats, err := mem.VirtualMemory()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
mem := &MemoryStats{
|
||||||
|
Total: memStats.Total,
|
||||||
|
Available: memStats.Available,
|
||||||
|
Used: memStats.Used,
|
||||||
|
UsedPercent: memStats.UsedPercent,
|
||||||
|
Free: memStats.Free,
|
||||||
|
}
|
||||||
|
|
||||||
|
return mem, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stats returns the host stats that has been collected
|
||||||
|
func (c *Collector) Stats() *HostStats {
|
||||||
|
c.hostStatsLock.RLock()
|
||||||
|
defer c.hostStatsLock.RUnlock()
|
||||||
|
|
||||||
|
if c.hostStats == nil {
|
||||||
|
return &HostStats{}
|
||||||
|
}
|
||||||
|
|
||||||
|
return c.hostStats.Clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
// toDiskStats merges UsageStat and PartitionStat to create a DiskStat
|
||||||
|
func (c *Collector) toDiskStats(usage *disk.UsageStat) *DiskStats {
|
||||||
|
ds := DiskStats{
|
||||||
|
Size: usage.Total,
|
||||||
|
Used: usage.Used,
|
||||||
|
Available: usage.Free,
|
||||||
|
UsedPercent: usage.UsedPercent,
|
||||||
|
InodesUsedPercent: usage.InodesUsedPercent,
|
||||||
|
Path: usage.Path,
|
||||||
|
}
|
||||||
|
if math.IsNaN(ds.UsedPercent) {
|
||||||
|
ds.UsedPercent = 0.0
|
||||||
|
}
|
||||||
|
if math.IsNaN(ds.InodesUsedPercent) {
|
||||||
|
ds.InodesUsedPercent = 0.0
|
||||||
|
}
|
||||||
|
|
||||||
|
return &ds
|
||||||
|
}
|
||||||
|
|
||||||
|
// CollectorOption customizes a Collector while it is being constructed.
type CollectorOption func(collector *Collector)
|
||||||
|
|
||||||
|
func WithMetrics(m *metrics.Metrics) CollectorOption {
|
||||||
|
return func(c *Collector) {
|
||||||
|
c.metrics = m
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func WithBaseLabels(labels []metrics.Label) CollectorOption {
|
||||||
|
return func(c *Collector) {
|
||||||
|
c.baseLabels = labels
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,85 @@
|
||||||
|
package hoststats
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
|
||||||
|
"github.com/shirou/gopsutil/v3/cpu"
|
||||||
|
)
|
||||||
|
|
||||||
|
// cpuStatsCalculator calculates cpu usage percentages
|
||||||
|
type cpuStatsCalculator struct {
|
||||||
|
prev cpu.TimesStat
|
||||||
|
prevBusy float64
|
||||||
|
prevTotal float64
|
||||||
|
}
|
||||||
|
|
||||||
|
// calculate the current cpu usage percentages.
|
||||||
|
// Since the cpu.TimesStat captures the total time a cpu spent in various states
|
||||||
|
// this function tracks the last seen stat and derives each cpu state's utilization
|
||||||
|
// as a percentage of the total change in cpu time between calls.
|
||||||
|
// The first time calculate is called CPUStats will report %100 idle
|
||||||
|
// usage since there is not a previous value to calculate against
|
||||||
|
func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) *CPUStats {
|
||||||
|
|
||||||
|
// sum all none idle counters to get the total busy cpu time
|
||||||
|
currentBusy := times.User + times.System + times.Nice + times.Iowait + times.Irq +
|
||||||
|
times.Softirq + times.Steal + times.Guest + times.GuestNice
|
||||||
|
// sum of the total cpu time
|
||||||
|
currentTotal := currentBusy + times.Idle
|
||||||
|
|
||||||
|
// calculate how much cpu time has passed since last calculation
|
||||||
|
deltaTotal := currentTotal - h.prevTotal
|
||||||
|
|
||||||
|
stats := &CPUStats{
|
||||||
|
CPU: times.CPU,
|
||||||
|
|
||||||
|
// calculate each percentage as the ratio of the change
|
||||||
|
// in each state's time to the total change in cpu time
|
||||||
|
Idle: ((times.Idle - h.prev.Idle) / deltaTotal) * 100,
|
||||||
|
User: ((times.User - h.prev.User) / deltaTotal) * 100,
|
||||||
|
System: ((times.System - h.prev.System) / deltaTotal) * 100,
|
||||||
|
Iowait: ((times.Iowait - h.prev.Iowait) / deltaTotal) * 100,
|
||||||
|
Total: ((currentBusy - h.prevBusy) / deltaTotal) * 100,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Protect against any invalid values
|
||||||
|
if math.IsNaN(stats.Idle) || math.IsInf(stats.Idle, 0) {
|
||||||
|
stats.Idle = 100.0
|
||||||
|
}
|
||||||
|
if math.IsNaN(stats.User) || math.IsInf(stats.User, 0) {
|
||||||
|
stats.User = 0.0
|
||||||
|
}
|
||||||
|
if math.IsNaN(stats.System) || math.IsInf(stats.System, 0) {
|
||||||
|
stats.System = 0.0
|
||||||
|
}
|
||||||
|
if math.IsNaN(stats.Iowait) || math.IsInf(stats.Iowait, 0) {
|
||||||
|
stats.Iowait = 0.0
|
||||||
|
}
|
||||||
|
if math.IsNaN(stats.Total) || math.IsInf(stats.Total, 0) {
|
||||||
|
stats.Total = 0.0
|
||||||
|
}
|
||||||
|
|
||||||
|
h.prev = times
|
||||||
|
h.prevTotal = currentTotal
|
||||||
|
h.prevBusy = currentBusy
|
||||||
|
return stats
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Collector) collectCPUStats() (cpus []*CPUStats, err error) {
|
||||||
|
|
||||||
|
cpuStats, err := cpu.Times(true)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
cs := make([]*CPUStats, len(cpuStats))
|
||||||
|
for idx, cpuStat := range cpuStats {
|
||||||
|
percentCalculator, ok := c.cpuCalculator[cpuStat.CPU]
|
||||||
|
if !ok {
|
||||||
|
percentCalculator = &cpuStatsCalculator{}
|
||||||
|
c.cpuCalculator[cpuStat.CPU] = percentCalculator
|
||||||
|
}
|
||||||
|
cs[idx] = percentCalculator.calculate(cpuStat)
|
||||||
|
}
|
||||||
|
|
||||||
|
return cs, nil
|
||||||
|
}
|
|
@ -0,0 +1,58 @@
|
||||||
|
package hoststats
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/hashicorp/consul/sdk/testutil"
|
||||||
|
"github.com/shirou/gopsutil/v3/cpu"
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestHostStats_CPU(t *testing.T) {
|
||||||
|
logger := testutil.Logger(t)
|
||||||
|
cwd, err := os.Getwd()
|
||||||
|
assert.Nil(t, err)
|
||||||
|
hs := initCollector(logger, cwd)
|
||||||
|
|
||||||
|
// Collect twice so we can calculate percents we need to generate some work
|
||||||
|
// so that the cpu values change
|
||||||
|
hs.collect()
|
||||||
|
for begin := time.Now(); time.Now().Sub(begin) < 100*time.Millisecond; {
|
||||||
|
}
|
||||||
|
hs.collect()
|
||||||
|
stats := hs.Stats()
|
||||||
|
assert.NotZero(t, len(stats.CPU))
|
||||||
|
|
||||||
|
for _, cpu := range stats.CPU {
|
||||||
|
assert.False(t, math.IsNaN(cpu.Idle))
|
||||||
|
assert.False(t, math.IsNaN(cpu.Total))
|
||||||
|
assert.False(t, math.IsNaN(cpu.System))
|
||||||
|
assert.False(t, math.IsNaN(cpu.User))
|
||||||
|
|
||||||
|
assert.False(t, math.IsInf(cpu.Idle, 0))
|
||||||
|
assert.False(t, math.IsInf(cpu.Total, 0))
|
||||||
|
assert.False(t, math.IsInf(cpu.System, 0))
|
||||||
|
assert.False(t, math.IsInf(cpu.User, 0))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCpuStatsCalculator_Nan(t *testing.T) {
|
||||||
|
times := cpu.TimesStat{
|
||||||
|
User: 0.0,
|
||||||
|
Idle: 100.0,
|
||||||
|
System: 0.0,
|
||||||
|
}
|
||||||
|
|
||||||
|
calculator := &cpuStatsCalculator{}
|
||||||
|
calculator.calculate(times)
|
||||||
|
stats := calculator.calculate(times)
|
||||||
|
require.Equal(t, 100.0, stats.Idle)
|
||||||
|
require.Zero(t, stats.User)
|
||||||
|
require.Zero(t, stats.System)
|
||||||
|
require.Zero(t, stats.Iowait)
|
||||||
|
require.Zero(t, stats.Total)
|
||||||
|
}
|
|
@ -0,0 +1,92 @@
|
||||||
|
package hoststats
|
||||||
|
|
||||||
|
import (
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/armon/go-metrics"
|
||||||
|
)
|
||||||
|
|
||||||
|
// hostStatsCollectionInterval is the delay between two consecutive host stats
// collections.
var hostStatsCollectionInterval = 10 * time.Second
|
||||||
|
|
||||||
|
// HostStats represents resource usage hoststats of the host running a Consul agent
|
||||||
|
type HostStats struct {
|
||||||
|
Memory *MemoryStats
|
||||||
|
CPU []*CPUStats
|
||||||
|
DataDirStats *DiskStats
|
||||||
|
Uptime uint64
|
||||||
|
Timestamp int64
|
||||||
|
}
|
||||||
|
|
||||||
|
func (hs *HostStats) Clone() *HostStats {
|
||||||
|
clone := &HostStats{}
|
||||||
|
*clone = *hs
|
||||||
|
return clone
|
||||||
|
}
|
||||||
|
|
||||||
|
func (hs *HostStats) Emit(sink Metrics, baseLabels []metrics.Label) {
|
||||||
|
|
||||||
|
if hs.Memory != nil {
|
||||||
|
sink.SetGaugeWithLabels([]string{"host", "memory", "total"}, float32(hs.Memory.Total), baseLabels)
|
||||||
|
sink.SetGaugeWithLabels([]string{"host", "memory", "available"}, float32(hs.Memory.Available), baseLabels)
|
||||||
|
sink.SetGaugeWithLabels([]string{"host", "memory", "used"}, float32(hs.Memory.Used), baseLabels)
|
||||||
|
sink.SetGaugeWithLabels([]string{"host", "memory", "used_percent"}, float32(hs.Memory.UsedPercent), baseLabels)
|
||||||
|
sink.SetGaugeWithLabels([]string{"host", "memory", "free"}, float32(hs.Memory.Free), baseLabels)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, cpu := range hs.CPU {
|
||||||
|
labels := append(baseLabels, metrics.Label{
|
||||||
|
Name: "cpu",
|
||||||
|
Value: cpu.CPU,
|
||||||
|
})
|
||||||
|
|
||||||
|
sink.SetGaugeWithLabels([]string{"host", "cpu", "total"}, float32(cpu.Total), labels)
|
||||||
|
sink.SetGaugeWithLabels([]string{"host", "cpu", "user"}, float32(cpu.User), labels)
|
||||||
|
sink.SetGaugeWithLabels([]string{"host", "cpu", "idle"}, float32(cpu.Idle), labels)
|
||||||
|
sink.SetGaugeWithLabels([]string{"host", "cpu", "iowait"}, float32(cpu.Iowait), labels)
|
||||||
|
sink.SetGaugeWithLabels([]string{"host", "cpu", "system"}, float32(cpu.System), labels)
|
||||||
|
}
|
||||||
|
|
||||||
|
if hs.DataDirStats != nil {
|
||||||
|
diskLabels := append(baseLabels, metrics.Label{
|
||||||
|
Name: "path",
|
||||||
|
Value: hs.DataDirStats.Path,
|
||||||
|
})
|
||||||
|
|
||||||
|
sink.SetGaugeWithLabels([]string{"host", "disk", "size"}, float32(hs.DataDirStats.Size), diskLabels)
|
||||||
|
sink.SetGaugeWithLabels([]string{"host", "disk", "used"}, float32(hs.DataDirStats.Used), diskLabels)
|
||||||
|
sink.SetGaugeWithLabels([]string{"host", "disk", "available"}, float32(hs.DataDirStats.Available), diskLabels)
|
||||||
|
sink.SetGaugeWithLabels([]string{"host", "disk", "used_percent"}, float32(hs.DataDirStats.UsedPercent), diskLabels)
|
||||||
|
sink.SetGaugeWithLabels([]string{"host", "disk", "inodes_percent"}, float32(hs.DataDirStats.InodesUsedPercent), diskLabels)
|
||||||
|
}
|
||||||
|
|
||||||
|
sink.SetGaugeWithLabels([]string{"host", "uptime"}, float32(hs.Uptime), baseLabels)
|
||||||
|
}
|
||||||
|
|
||||||
|
// CPUStats holds the usage of a single cpu. Every field other than CPU is a
// percentage of the cpu time that elapsed between two samples.
type CPUStats struct {
	// CPU is the cpu name taken from the underlying cpu.TimesStat.
	CPU string
	// User is the share of time spent in user space.
	User float64
	// System is the share of time spent in system space.
	System float64
	// Idle is the share of time spent idle.
	Idle float64
	// Iowait is the share of time spent waiting on I/O.
	Iowait float64
	// Total is the share of time spent in any non-idle state.
	Total float64
}
|
||||||
|
|
||||||
|
// MemoryStats holds virtual memory usage as reported by gopsutil's
// mem.VirtualMemory.
type MemoryStats struct {
	// Total, Available, Used and Free mirror the same-named gopsutil fields.
	Total     uint64
	Available uint64
	Used      uint64
	// UsedPercent mirrors gopsutil's UsedPercent field.
	UsedPercent float64
	Free        uint64
}
|
||||||
|
|
||||||
|
// DiskStats holds disk usage for a single path, as reported by gopsutil's
// disk.Usage.
type DiskStats struct {
	// Path is the path reported back by disk.Usage.
	Path string
	// Size is the total size of the disk.
	Size uint64
	// Used is the amount of the disk in use.
	Used uint64
	// Available is the amount of the disk that is free.
	Available uint64
	// UsedPercent is the percentage of disk space in use (0 when unknown).
	UsedPercent float64
	// InodesUsedPercent is the percentage of inodes in use (0 when unknown).
	InodesUsedPercent float64
}
|
|
@ -0,0 +1,79 @@
|
||||||
|
package hoststats
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/armon/go-metrics"
|
||||||
|
"github.com/armon/go-metrics/prometheus"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Metrics defines an interface for the methods used to emit data to the go-metrics library.
|
||||||
|
// `metrics.Default()` should always satisfy this interface.
|
||||||
|
type Metrics interface {
|
||||||
|
SetGaugeWithLabels(key []string, val float32, labels []metrics.Label)
|
||||||
|
}
|
||||||
|
|
||||||
|
var Gauges = []prometheus.GaugeDefinition{
|
||||||
|
{
|
||||||
|
Name: []string{"host", "memory", "total"},
|
||||||
|
Help: "Total physical memory in bytes",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: []string{"host", "memory", "available"},
|
||||||
|
Help: "Available physical memory in bytes",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: []string{"host", "memory", "free"},
|
||||||
|
Help: "Free physical memory in bytes",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: []string{"host", "memory", "used"},
|
||||||
|
Help: "Used physical memory in bytes",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: []string{"host", "memory", "used_percent"},
|
||||||
|
Help: "Percentage of physical memory in use",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: []string{"host", "cpu", "total"},
|
||||||
|
Help: "Total cpu utilization",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: []string{"host", "cpu", "user"},
|
||||||
|
Help: "User cpu utilization",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: []string{"host", "cpu", "idle"},
|
||||||
|
Help: "Idle cpu utilization",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: []string{"host", "cpu", "iowait"},
|
||||||
|
Help: "Iowait cpu utilization",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: []string{"host", "cpu", "system"},
|
||||||
|
Help: "System cpu utilization",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: []string{"host", "disk", "size"},
|
||||||
|
Help: "Size of disk in bytes",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: []string{"host", "disk", "used"},
|
||||||
|
Help: "Disk usage in bytes",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: []string{"host", "disk", "available"},
|
||||||
|
Help: "Available bytes on disk",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: []string{"host", "disk", "used_percent"},
|
||||||
|
Help: "Percentage of disk space usage",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: []string{"host", "disk", "inodes_percent"},
|
||||||
|
Help: "Percentage of disk inodes usage",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: []string{"host", "uptime"},
|
||||||
|
Help: "System uptime",
|
||||||
|
},
|
||||||
|
}
|
|
@ -204,6 +204,11 @@ type TelemetryConfig struct {
|
||||||
// hcl: telemetry { statsite_address = string }
|
// hcl: telemetry { statsite_address = string }
|
||||||
StatsiteAddr string `json:"statsite_address,omitempty" mapstructure:"statsite_address"`
|
StatsiteAddr string `json:"statsite_address,omitempty" mapstructure:"statsite_address"`
|
||||||
|
|
||||||
|
// EnableHostMetrics will enable metrics collected about the host system such as cpu memory and disk usage.
|
||||||
|
//
|
||||||
|
// hcl: telemetry { enable_host_metrics = (true|false) }
|
||||||
|
EnableHostMetrics bool `json:"enable_host_metrics,omitempty" mapstructure:"enable_host_metrics"`
|
||||||
|
|
||||||
// PrometheusOpts provides configuration for the PrometheusSink. Currently the only configuration
|
// PrometheusOpts provides configuration for the PrometheusSink. Currently the only configuration
|
||||||
// we acquire from hcl is the retention time. We also use definition slices that are set in agent setup
|
// we acquire from hcl is the retention time. We also use definition slices that are set in agent setup
|
||||||
// before being passed to InitTelemmetry.
|
// before being passed to InitTelemmetry.
|
||||||
|
|
|
@ -1831,6 +1831,9 @@ subsystem that provides Consul's service mesh capabilities.
|
||||||
of global tags that will be added to all telemetry packets sent to DogStatsD.
|
of global tags that will be added to all telemetry packets sent to DogStatsD.
|
||||||
It is a list of strings, where each string looks like "my_tag_name:my_tag_value".
|
It is a list of strings, where each string looks like "my_tag_name:my_tag_value".
|
||||||
|
|
||||||
|
- `enable_host_metrics` ((#telemetry-enable_host_metrics))
|
||||||
|
This enables reporting of host metrics about system resources. Defaults to `false`, unless HashiCorp Cloud Platform integration is configured, in which case it defaults to `true`.
|
||||||
|
|
||||||
- `filter_default` ((#telemetry-filter_default))
|
- `filter_default` ((#telemetry-filter_default))
|
||||||
This controls whether to allow metrics that have not been specified by the filter.
|
This controls whether to allow metrics that have not been specified by the filter.
|
||||||
Defaults to `true`, which will allow all metrics when no filters are provided.
|
Defaults to `true`, which will allow all metrics when no filters are provided.
|
||||||
|
|
|
@ -755,3 +755,32 @@ Consul attaches the following labels to metric values.
|
||||||
| `peer_id` | The ID of a peer connected to the reporting cluster or leader. | Any UUID |
|
| `peer_id` | The ID of a peer connected to the reporting cluster or leader. | Any UUID |
|
||||||
| `partition` | <EnterpriseAlert inline /> Name of the partition that the peering is created in. | Any defined partition name in the cluster |
|
| `partition` | <EnterpriseAlert inline /> Name of the partition that the peering is created in. | Any defined partition name in the cluster |
|
||||||
|
|
||||||
|
## Server Host Metrics
|
||||||
|
|
||||||
|
Consul servers can report the following metrics about the host's system resources.
|
||||||
|
This feature must be enabled in the [agent telemetry configuration](/consul/docs/agent/config/config-files#telemetry-enable_host_metrics).
|
||||||
|
Note that if the Consul server is operating inside a container these metrics
|
||||||
|
still report host resource usage and do not report any resource limits placed
|
||||||
|
on the container.
|
||||||
|
|
||||||
|
**Requirements:**
|
||||||
|
- Consul 1.15.3+
|
||||||
|
|
||||||
|
| Metric | Description | Unit | Type |
|
||||||
|
| ----------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------- | ------- |
|
||||||
|
| `consul.host.memory.total` | The total physical memory in bytes | bytes | gauge |
|
||||||
|
| `consul.host.memory.available` | The available physical memory in bytes | bytes | gauge |
|
||||||
|
| `consul.host.memory.free` | The free physical memory in bytes | bytes | gauge |
|
||||||
|
| `consul.host.memory.used` | The used physical memory in bytes | bytes | gauge |
|
||||||
|
| `consul.host.memory.used_percent` | The used physical memory as a percentage of total physical memory | percentage | gauge |
|
||||||
|
| `consul.host.cpu.total` | The host's total cpu utilization | percentage | gauge |
|
||||||
|
| `consul.host.cpu.user` | The cpu utilization in user space | percentage | gauge |
|
||||||
|
| `consul.host.cpu.idle` | The cpu utilization in idle state | percentage | gauge |
|
||||||
|
| `consul.host.cpu.iowait` | The cpu utilization in iowait state | percentage | gauge |
|
||||||
|
| `consul.host.cpu.system` | The cpu utilization in system space | percentage | gauge |
|
||||||
|
| `consul.host.disk.size` | The size in bytes of the data_dir disk | bytes | gauge |
|
||||||
|
| `consul.host.disk.used` | The number of bytes used on the data_dir disk | bytes | gauge |
|
||||||
|
| `consul.host.disk.available` | The number of bytes available on the data_dir disk | bytes | gauge |
|
||||||
|
| `consul.host.disk.used_percent` | The percentage of disk space used on the data_dir disk | percentage | gauge |
|
||||||
|
| `consul.host.disk.inodes_percent` | The percentage of inode usage on the data_dir disk | percentage | gauge |
|
||||||
|
| `consul.host.uptime` | The uptime of the host in seconds | seconds | gauge |
|
||||||
|
|
Loading…
Reference in New Issue