2016-05-09 15:55:19 +00:00
|
|
|
package stats
|
|
|
|
|
|
|
|
import (
|
2016-06-20 17:29:46 +00:00
|
|
|
"math"
|
2016-06-10 21:14:33 +00:00
|
|
|
"runtime"
|
2016-12-12 06:58:28 +00:00
|
|
|
"sync"
|
2016-05-27 21:15:51 +00:00
|
|
|
"time"
|
|
|
|
|
2018-08-29 22:05:03 +00:00
|
|
|
hclog "github.com/hashicorp/go-hclog"
|
2018-11-13 16:49:14 +00:00
|
|
|
"github.com/hashicorp/nomad/plugins/device"
|
2021-03-30 18:47:33 +00:00
|
|
|
"github.com/shirou/gopsutil/v3/cpu"
|
|
|
|
"github.com/shirou/gopsutil/v3/disk"
|
|
|
|
"github.com/shirou/gopsutil/v3/host"
|
|
|
|
"github.com/shirou/gopsutil/v3/mem"
|
2016-05-09 15:55:19 +00:00
|
|
|
)
|
|
|
|
|
2016-05-09 16:53:00 +00:00
|
|
|
// HostStats represents resource usage stats of the host running a Nomad client
|
2016-05-09 15:55:19 +00:00
|
|
|
type HostStats struct {
|
2016-06-10 21:14:33 +00:00
|
|
|
Memory *MemoryStats
|
|
|
|
CPU []*CPUStats
|
|
|
|
DiskStats []*DiskStats
|
2016-12-16 07:54:54 +00:00
|
|
|
AllocDirStats *DiskStats
|
2018-11-13 16:49:14 +00:00
|
|
|
DeviceStats []*DeviceGroupStats
|
2016-06-10 21:14:33 +00:00
|
|
|
Uptime uint64
|
|
|
|
Timestamp int64
|
|
|
|
CPUTicksConsumed float64
|
2016-05-09 15:55:19 +00:00
|
|
|
}
|
|
|
|
|
2018-03-11 18:42:29 +00:00
|
|
|
// MemoryStats describes the virtual memory usage of the host. Every field is
// copied from the value of the same name reported by gopsutil's
// mem.VirtualMemory.
type MemoryStats struct {
	// Total is the reported total amount of memory.
	Total uint64

	// Available is the reported amount of memory available for use.
	Available uint64

	// Used is the reported amount of memory in use.
	Used uint64

	// Free is the reported amount of free memory.
	Free uint64
}
|
|
|
|
|
2016-05-09 16:53:00 +00:00
|
|
|
// CPUStats describes the usage of a single CPU entry.
// NOTE(review): the values are presumably the percentages produced by
// HostCpuStatsCalculator.Calculate - confirm against collectCPUStats.
type CPUStats struct {
	// CPU identifies the CPU the values belong to.
	CPU string

	// User is the usage attributed to user mode.
	User float64

	// System is the usage attributed to system mode.
	System float64

	// Idle is the share of time spent idle.
	Idle float64

	// Total is the overall (busy) usage.
	Total float64
}
|
|
|
|
|
2016-05-22 10:46:49 +00:00
|
|
|
// DiskStats describes the usage of one disk partition or directory. It is
// built by HostStatsCollector.toDiskStats from gopsutil usage data.
type DiskStats struct {
	// Device is the partition's device; empty when no partition info is known.
	Device string

	// Mountpoint is the partition's mountpoint; empty when no partition info
	// is known.
	Mountpoint string

	// Size is the reported total size.
	Size uint64

	// Used is the reported used space.
	Used uint64

	// Available is the reported free space.
	Available uint64

	// UsedPercent is the reported used-space percentage (NaN is stored as 0).
	UsedPercent float64

	// InodesUsedPercent is the reported used-inode percentage (NaN is stored
	// as 0).
	InodesUsedPercent float64
}
|
|
|
|
|
2018-11-13 16:49:14 +00:00
|
|
|
// DeviceGroupStats represents stats related to device group
|
|
|
|
type DeviceGroupStats = device.DeviceGroupStats
|
|
|
|
|
|
|
|
// DeviceStatsCollector is used to retrieve all the latest statistics for all devices.
|
|
|
|
type DeviceStatsCollector func() []*DeviceGroupStats
|
|
|
|
|
2018-03-11 18:39:35 +00:00
|
|
|
// NodeStatsCollector is an interface which is used for the purposes of mocking
|
2016-12-20 01:53:11 +00:00
|
|
|
// the HostStatsCollector in the tests
|
2016-12-16 07:54:54 +00:00
|
|
|
type NodeStatsCollector interface {
|
|
|
|
Collect() error
|
|
|
|
Stats() *HostStats
|
|
|
|
}
|
|
|
|
|
2016-05-22 09:04:27 +00:00
|
|
|
// HostStatsCollector collects host resource usage stats
|
|
|
|
type HostStatsCollector struct {
|
2018-11-13 16:49:14 +00:00
|
|
|
numCores int
|
|
|
|
statsCalculator map[string]*HostCpuStatsCalculator
|
|
|
|
hostStats *HostStats
|
|
|
|
hostStatsLock sync.RWMutex
|
|
|
|
allocDir string
|
|
|
|
deviceStatsCollector DeviceStatsCollector
|
2017-08-28 19:04:32 +00:00
|
|
|
|
|
|
|
// badParts is a set of partitions whose usage cannot be read; used to
|
|
|
|
// squelch logspam.
|
|
|
|
badParts map[string]struct{}
|
2018-08-29 22:05:03 +00:00
|
|
|
|
|
|
|
logger hclog.Logger
|
2016-05-22 09:04:27 +00:00
|
|
|
}
|
|
|
|
|
2016-12-20 01:53:11 +00:00
|
|
|
// NewHostStatsCollector returns a HostStatsCollector. The allocDir is passed in
|
|
|
|
// so that we can present the disk related statistics for the mountpoint where
|
|
|
|
// the allocation directory lives
|
2018-11-13 16:49:14 +00:00
|
|
|
func NewHostStatsCollector(logger hclog.Logger, allocDir string, deviceStatsCollector DeviceStatsCollector) *HostStatsCollector {
|
2018-08-29 22:05:03 +00:00
|
|
|
logger = logger.Named("host_stats")
|
2016-06-10 21:14:33 +00:00
|
|
|
numCores := runtime.NumCPU()
|
2016-05-22 09:04:27 +00:00
|
|
|
statsCalculator := make(map[string]*HostCpuStatsCalculator)
|
2016-06-10 21:14:33 +00:00
|
|
|
collector := &HostStatsCollector{
|
2018-11-13 16:49:14 +00:00
|
|
|
statsCalculator: statsCalculator,
|
|
|
|
numCores: numCores,
|
|
|
|
logger: logger,
|
|
|
|
allocDir: allocDir,
|
|
|
|
badParts: make(map[string]struct{}),
|
|
|
|
deviceStatsCollector: deviceStatsCollector,
|
2016-06-10 21:14:33 +00:00
|
|
|
}
|
|
|
|
return collector
|
2016-05-22 09:04:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Collect collects stats related to resource usage of a host
|
2016-12-12 06:58:28 +00:00
|
|
|
func (h *HostStatsCollector) Collect() error {
|
2017-09-12 04:42:10 +00:00
|
|
|
h.hostStatsLock.Lock()
|
|
|
|
defer h.hostStatsLock.Unlock()
|
2018-01-11 19:24:57 +00:00
|
|
|
return h.collectLocked()
|
|
|
|
}
|
2017-09-12 04:42:10 +00:00
|
|
|
|
2018-01-11 19:24:57 +00:00
|
|
|
// collectLocked collects stats related to resource usage of the host but should
|
|
|
|
// be called with the lock held.
|
|
|
|
func (h *HostStatsCollector) collectLocked() error {
|
2016-05-27 21:15:51 +00:00
|
|
|
hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()}
|
2017-09-06 23:46:42 +00:00
|
|
|
|
|
|
|
// Determine up-time
|
|
|
|
uptime, err := host.Uptime()
|
2016-08-07 06:00:00 +00:00
|
|
|
if err != nil {
|
client: Return empty values when host stats fail
Currently, there is an issue when running on Windows whereby under some
circumstances the Windows stats API's will begin to return errors (such
as internal timeouts) when a client is under high load, and potentially
other forms of resource contention / system states (and other unknown
cases).
When an error occurs during this collection, we then short circuit
further metrics emission from the client until the next interval.
This can be problematic if it happens for a sustained number of
intervals, as our metrics aggregator will begin to age out older
metrics, and we will eventually stop emitting various types of metrics
including `nomad.client.unallocated.*` metrics.
However, when metrics collection fails on Linux, gopsutil will in many cases
(e.g cpu.Times) silently return 0 values, rather than an error.
Here, we switch to returning empty metrics in these failures, and
logging the error at the source. This brings the behaviour into line
with Linux/Unix platforms, and although making aggregation a little
sadder on intermittent failures, will result in more desireable overall
behaviour of keeping metrics available for further investigation if
things look unusual.
2019-09-18 22:57:23 +00:00
|
|
|
h.logger.Error("failed to collect upstime stats", "error", err)
|
|
|
|
uptime = 0
|
2016-08-07 06:00:00 +00:00
|
|
|
}
|
2017-09-06 23:46:42 +00:00
|
|
|
hs.Uptime = uptime
|
|
|
|
|
|
|
|
// Collect memory stats
|
|
|
|
mstats, err := h.collectMemoryStats()
|
|
|
|
if err != nil {
|
client: Return empty values when host stats fail
Currently, there is an issue when running on Windows whereby under some
circumstances the Windows stats API's will begin to return errors (such
as internal timeouts) when a client is under high load, and potentially
other forms of resource contention / system states (and other unknown
cases).
When an error occurs during this collection, we then short circuit
further metrics emission from the client until the next interval.
This can be problematic if it happens for a sustained number of
intervals, as our metrics aggregator will begin to age out older
metrics, and we will eventually stop emitting various types of metrics
including `nomad.client.unallocated.*` metrics.
However, when metrics collection fails on Linux, gopsutil will in many cases
(e.g cpu.Times) silently return 0 values, rather than an error.
Here, we switch to returning empty metrics in these failures, and
logging the error at the source. This brings the behaviour into line
with Linux/Unix platforms, and although making aggregation a little
sadder on intermittent failures, will result in more desireable overall
behaviour of keeping metrics available for further investigation if
things look unusual.
2019-09-18 22:57:23 +00:00
|
|
|
h.logger.Error("failed to collect memory stats", "error", err)
|
|
|
|
mstats = &MemoryStats{}
|
2016-05-09 15:55:19 +00:00
|
|
|
}
|
2017-09-06 23:46:42 +00:00
|
|
|
hs.Memory = mstats
|
2016-05-09 15:55:19 +00:00
|
|
|
|
2017-09-06 23:46:42 +00:00
|
|
|
// Collect cpu stats
|
|
|
|
cpus, ticks, err := h.collectCPUStats()
|
2016-08-07 06:00:00 +00:00
|
|
|
if err != nil {
|
client: Return empty values when host stats fail
Currently, there is an issue when running on Windows whereby under some
circumstances the Windows stats API's will begin to return errors (such
as internal timeouts) when a client is under high load, and potentially
other forms of resource contention / system states (and other unknown
cases).
When an error occurs during this collection, we then short circuit
further metrics emission from the client until the next interval.
This can be problematic if it happens for a sustained number of
intervals, as our metrics aggregator will begin to age out older
metrics, and we will eventually stop emitting various types of metrics
including `nomad.client.unallocated.*` metrics.
However, when metrics collection fails on Linux, gopsutil will in many cases
(e.g cpu.Times) silently return 0 values, rather than an error.
Here, we switch to returning empty metrics in these failures, and
logging the error at the source. This brings the behaviour into line
with Linux/Unix platforms, and although making aggregation a little
sadder on intermittent failures, will result in more desireable overall
behaviour of keeping metrics available for further investigation if
things look unusual.
2019-09-18 22:57:23 +00:00
|
|
|
h.logger.Error("failed to collect cpu stats", "error", err)
|
|
|
|
cpus = []*CPUStats{}
|
|
|
|
ticks = 0
|
2016-08-07 06:00:00 +00:00
|
|
|
}
|
2017-09-06 23:46:42 +00:00
|
|
|
hs.CPU = cpus
|
|
|
|
hs.CPUTicksConsumed = ticks
|
|
|
|
|
|
|
|
// Collect disk stats
|
|
|
|
diskStats, err := h.collectDiskStats()
|
|
|
|
if err != nil {
|
client: Return empty values when host stats fail
Currently, there is an issue when running on Windows whereby under some
circumstances the Windows stats API's will begin to return errors (such
as internal timeouts) when a client is under high load, and potentially
other forms of resource contention / system states (and other unknown
cases).
When an error occurs during this collection, we then short circuit
further metrics emission from the client until the next interval.
This can be problematic if it happens for a sustained number of
intervals, as our metrics aggregator will begin to age out older
metrics, and we will eventually stop emitting various types of metrics
including `nomad.client.unallocated.*` metrics.
However, when metrics collection fails on Linux, gopsutil will in many cases
(e.g cpu.Times) silently return 0 values, rather than an error.
Here, we switch to returning empty metrics in these failures, and
logging the error at the source. This brings the behaviour into line
with Linux/Unix platforms, and although making aggregation a little
sadder on intermittent failures, will result in more desireable overall
behaviour of keeping metrics available for further investigation if
things look unusual.
2019-09-18 22:57:23 +00:00
|
|
|
h.logger.Error("failed to collect disk stats", "error", err)
|
|
|
|
hs.DiskStats = []*DiskStats{}
|
2016-05-09 15:55:19 +00:00
|
|
|
}
|
2017-09-06 23:46:42 +00:00
|
|
|
hs.DiskStats = diskStats
|
2016-05-09 15:55:19 +00:00
|
|
|
|
2017-09-06 23:46:42 +00:00
|
|
|
// Getting the disk stats for the allocation directory
|
|
|
|
usage, err := disk.Usage(h.allocDir)
|
2016-08-07 06:00:00 +00:00
|
|
|
if err != nil {
|
client: Return empty values when host stats fail
Currently, there is an issue when running on Windows whereby under some
circumstances the Windows stats API's will begin to return errors (such
as internal timeouts) when a client is under high load, and potentially
other forms of resource contention / system states (and other unknown
cases).
When an error occurs during this collection, we then short circuit
further metrics emission from the client until the next interval.
This can be problematic if it happens for a sustained number of
intervals, as our metrics aggregator will begin to age out older
metrics, and we will eventually stop emitting various types of metrics
including `nomad.client.unallocated.*` metrics.
However, when metrics collection fails on Linux, gopsutil will in many cases
(e.g cpu.Times) silently return 0 values, rather than an error.
Here, we switch to returning empty metrics in these failures, and
logging the error at the source. This brings the behaviour into line
with Linux/Unix platforms, and although making aggregation a little
sadder on intermittent failures, will result in more desireable overall
behaviour of keeping metrics available for further investigation if
things look unusual.
2019-09-18 22:57:23 +00:00
|
|
|
h.logger.Error("failed to find disk usage of alloc", "alloc_dir", h.allocDir, "error", err)
|
|
|
|
hs.AllocDirStats = &DiskStats{}
|
|
|
|
} else {
|
|
|
|
hs.AllocDirStats = h.toDiskStats(usage, nil)
|
2016-08-07 06:00:00 +00:00
|
|
|
}
|
2018-11-13 16:49:14 +00:00
|
|
|
// Collect devices stats
|
|
|
|
deviceStats := h.collectDeviceGroupStats()
|
|
|
|
hs.DeviceStats = deviceStats
|
|
|
|
|
2017-09-06 23:46:42 +00:00
|
|
|
// Update the collected status object.
|
|
|
|
h.hostStats = hs
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (h *HostStatsCollector) collectMemoryStats() (*MemoryStats, error) {
|
|
|
|
memStats, err := mem.VirtualMemory()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
mem := &MemoryStats{
|
|
|
|
Total: memStats.Total,
|
|
|
|
Available: memStats.Available,
|
|
|
|
Used: memStats.Used,
|
|
|
|
Free: memStats.Free,
|
|
|
|
}
|
|
|
|
|
|
|
|
return mem, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (h *HostStatsCollector) collectDiskStats() ([]*DiskStats, error) {
|
|
|
|
partitions, err := disk.Partitions(false)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2016-08-07 06:00:00 +00:00
|
|
|
var diskStats []*DiskStats
|
|
|
|
for _, partition := range partitions {
|
|
|
|
usage, err := disk.Usage(partition.Mountpoint)
|
|
|
|
if err != nil {
|
2017-08-28 19:04:32 +00:00
|
|
|
if _, ok := h.badParts[partition.Mountpoint]; ok {
|
|
|
|
// already known bad, don't log again
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
h.badParts[partition.Mountpoint] = struct{}{}
|
2018-08-29 22:05:03 +00:00
|
|
|
h.logger.Warn("error fetching host disk usage stats", "error", err, "partition", partition.Mountpoint)
|
2016-12-13 11:28:57 +00:00
|
|
|
continue
|
2016-05-22 10:46:49 +00:00
|
|
|
}
|
2017-08-28 19:04:32 +00:00
|
|
|
delete(h.badParts, partition.Mountpoint)
|
|
|
|
|
2016-12-16 07:54:54 +00:00
|
|
|
ds := h.toDiskStats(usage, &partition)
|
|
|
|
diskStats = append(diskStats, ds)
|
2016-05-09 15:55:19 +00:00
|
|
|
}
|
2016-08-07 06:00:00 +00:00
|
|
|
|
2017-09-06 23:46:42 +00:00
|
|
|
return diskStats, nil
|
2016-12-12 06:58:28 +00:00
|
|
|
}
|
|
|
|
|
2018-11-13 16:49:14 +00:00
|
|
|
func (h *HostStatsCollector) collectDeviceGroupStats() []*DeviceGroupStats {
|
2018-11-13 20:35:42 +00:00
|
|
|
if h.deviceStatsCollector == nil {
|
|
|
|
return []*DeviceGroupStats{}
|
|
|
|
}
|
|
|
|
|
2018-11-13 16:49:14 +00:00
|
|
|
return h.deviceStatsCollector()
|
|
|
|
}
|
|
|
|
|
2016-12-16 07:54:54 +00:00
|
|
|
// Stats returns the host stats that has been collected
|
2016-12-12 06:58:28 +00:00
|
|
|
func (h *HostStatsCollector) Stats() *HostStats {
|
|
|
|
h.hostStatsLock.RLock()
|
|
|
|
defer h.hostStatsLock.RUnlock()
|
2018-01-11 19:24:57 +00:00
|
|
|
|
|
|
|
if h.hostStats == nil {
|
|
|
|
if err := h.collectLocked(); err != nil {
|
2018-08-29 22:05:03 +00:00
|
|
|
h.logger.Warn("error fetching host resource usage stats", "error", err)
|
2018-01-11 19:24:57 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-12-12 06:58:28 +00:00
|
|
|
return h.hostStats
|
2016-05-09 15:55:19 +00:00
|
|
|
}
|
2016-05-22 09:04:27 +00:00
|
|
|
|
2016-12-16 07:54:54 +00:00
|
|
|
// toDiskStats merges UsageStat and PartitionStat to create a DiskStat
|
|
|
|
func (h *HostStatsCollector) toDiskStats(usage *disk.UsageStat, partitionStat *disk.PartitionStat) *DiskStats {
|
|
|
|
ds := DiskStats{
|
|
|
|
Size: usage.Total,
|
|
|
|
Used: usage.Used,
|
|
|
|
Available: usage.Free,
|
|
|
|
UsedPercent: usage.UsedPercent,
|
|
|
|
InodesUsedPercent: usage.InodesUsedPercent,
|
|
|
|
}
|
|
|
|
if math.IsNaN(ds.UsedPercent) {
|
|
|
|
ds.UsedPercent = 0.0
|
|
|
|
}
|
|
|
|
if math.IsNaN(ds.InodesUsedPercent) {
|
|
|
|
ds.InodesUsedPercent = 0.0
|
|
|
|
}
|
|
|
|
|
|
|
|
if partitionStat != nil {
|
|
|
|
ds.Device = partitionStat.Device
|
|
|
|
ds.Mountpoint = partitionStat.Mountpoint
|
|
|
|
}
|
|
|
|
|
|
|
|
return &ds
|
|
|
|
}
|
|
|
|
|
2016-05-22 09:04:27 +00:00
|
|
|
// HostCpuStatsCalculator turns cumulative CPU times into usage percentages.
// It remembers the values seen by the previous Calculate call so that each
// call reports the change since the one before it.
type HostCpuStatsCalculator struct {
	// Cumulative times recorded by the most recent Calculate call; all are
	// zero before the first call.
	prevIdle, prevUser, prevSystem, prevBusy, prevTotal float64
}
|
|
|
|
|
|
|
|
// NewHostCpuStatsCalculator returns a HostCpuStatsCalculator
|
|
|
|
func NewHostCpuStatsCalculator() *HostCpuStatsCalculator {
|
|
|
|
return &HostCpuStatsCalculator{}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Calculate calculates the current cpu usage percentages
|
|
|
|
func (h *HostCpuStatsCalculator) Calculate(times cpu.TimesStat) (idle float64, user float64, system float64, total float64) {
|
|
|
|
currentIdle := times.Idle
|
|
|
|
currentUser := times.User
|
|
|
|
currentSystem := times.System
|
|
|
|
currentTotal := times.Total()
|
2017-09-06 23:46:42 +00:00
|
|
|
currentBusy := times.User + times.System + times.Nice + times.Iowait + times.Irq +
|
2020-03-15 08:36:59 +00:00
|
|
|
times.Softirq + times.Steal + times.Guest + times.GuestNice
|
2016-05-22 09:04:27 +00:00
|
|
|
|
|
|
|
deltaTotal := currentTotal - h.prevTotal
|
|
|
|
idle = ((currentIdle - h.prevIdle) / deltaTotal) * 100
|
2017-09-06 23:46:42 +00:00
|
|
|
user = ((currentUser - h.prevUser) / deltaTotal) * 100
|
|
|
|
system = ((currentSystem - h.prevSystem) / deltaTotal) * 100
|
|
|
|
total = ((currentBusy - h.prevBusy) / deltaTotal) * 100
|
|
|
|
|
|
|
|
// Protect against any invalid values
|
|
|
|
if math.IsNaN(idle) || math.IsInf(idle, 0) {
|
2017-09-08 18:43:43 +00:00
|
|
|
idle = 100.0
|
|
|
|
}
|
2017-09-06 23:46:42 +00:00
|
|
|
if math.IsNaN(user) || math.IsInf(user, 0) {
|
2017-09-08 18:43:43 +00:00
|
|
|
user = 0.0
|
|
|
|
}
|
2017-09-06 23:46:42 +00:00
|
|
|
if math.IsNaN(system) || math.IsInf(system, 0) {
|
2017-09-08 18:43:43 +00:00
|
|
|
system = 0.0
|
|
|
|
}
|
2017-09-06 23:46:42 +00:00
|
|
|
if math.IsNaN(total) || math.IsInf(total, 0) {
|
2017-09-08 18:43:43 +00:00
|
|
|
total = 0.0
|
|
|
|
}
|
2016-05-22 09:04:27 +00:00
|
|
|
|
|
|
|
h.prevIdle = currentIdle
|
|
|
|
h.prevUser = currentUser
|
|
|
|
h.prevSystem = currentSystem
|
|
|
|
h.prevTotal = currentTotal
|
|
|
|
h.prevBusy = currentBusy
|
|
|
|
return
|
|
|
|
}
|