Merge pull request #1501 from hashicorp/f-stats-opt-in
Allow operators to opt into publishing node and alloc metrics
This commit is contained in:
commit
5ff750db96
|
@ -1,5 +1,9 @@
|
|||
## 0.4.1 (UNRELEASED)
|
||||
|
||||
__BACKWARDS INCOMPATIBILITIES:__
|
||||
* telemetry: Operators must explicitly opt in for the Nomad client to
|
||||
publish allocation and node metrics
|
||||
|
||||
IMPROVEMENTS:
|
||||
* core: Allow count 0 on system jobs [GH-1421]
|
||||
* core: Gracefully handle short lived outages by holding RPC calls [GH-1403]
|
||||
|
@ -15,6 +19,7 @@ IMPROVEMENTS:
|
|||
* client: Fingerprint network speed on Windows [GH-1443]
|
||||
* driver/docker: Allow working directory to be configured [GH-1513]
|
||||
* telemetry: Circonus integration for telemetry metrics [GH-1459]
|
||||
* telemetry: Allow operators to opt in to publishing metrics [GH-1501]
|
||||
|
||||
BUG FIXES:
|
||||
* core: Sanitize empty slices/maps in jobs to avoid incorrect create/destroy
|
||||
|
|
|
@ -1397,7 +1397,11 @@ func (c *Client) collectHostStats() {
|
|||
c.resourceUsageLock.Lock()
|
||||
c.resourceUsage = ru
|
||||
c.resourceUsageLock.Unlock()
|
||||
c.emitStats(ru)
|
||||
|
||||
// Publish Node metrics if operator has opted in
|
||||
if c.config.PublishNodeMetrics {
|
||||
c.emitStats(ru)
|
||||
}
|
||||
case <-c.shutdownCh:
|
||||
return
|
||||
}
|
||||
|
|
|
@ -117,6 +117,14 @@ type Config struct {
|
|||
// StatsCollectionInterval is the interval at which the Nomad client
|
||||
// collects resource usage stats
|
||||
StatsCollectionInterval time.Duration
|
||||
|
||||
// PublishNodeMetrics determines whether nomad is going to publish node
|
||||
// level metrics to remote Telemetry sinks
|
||||
PublishNodeMetrics bool
|
||||
|
||||
// PublishAllocationMetrics determines whether nomad is going to publish
|
||||
// allocation metrics to remote Telemetry sinks
|
||||
PublishAllocationMetrics bool
|
||||
}
|
||||
|
||||
func (c *Config) Copy() *Config {
|
||||
|
|
|
@ -640,7 +640,7 @@ func (r *TaskRunner) Destroy() {
|
|||
// emitStats emits resource usage stats of tasks to remote metrics collector
|
||||
// sinks
|
||||
func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
|
||||
if ru.ResourceUsage.MemoryStats != nil {
|
||||
if ru.ResourceUsage.MemoryStats != nil && r.config.PublishAllocationMetrics {
|
||||
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
|
||||
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
|
||||
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
|
||||
|
@ -649,7 +649,7 @@ func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
|
|||
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
|
||||
}
|
||||
|
||||
if ru.ResourceUsage.CpuStats != nil {
|
||||
if ru.ResourceUsage.CpuStats != nil && r.config.PublishAllocationMetrics {
|
||||
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
|
||||
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
|
||||
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
|
||||
|
@ -657,6 +657,4 @@ func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
|
|||
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
|
||||
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
|
||||
}
|
||||
|
||||
//TODO Add Pid stats when we add an API to enable/disable them
|
||||
}
|
||||
|
|
|
@ -350,6 +350,8 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) {
|
|||
|
||||
conf.ConsulConfig = a.config.Consul
|
||||
conf.StatsCollectionInterval = a.config.Telemetry.collectionInterval
|
||||
conf.PublishNodeMetrics = a.config.Telemetry.PublishNodeMetrics
|
||||
conf.PublishAllocationMetrics = a.config.Telemetry.PublishAllocationMetrics
|
||||
return conf, nil
|
||||
}
|
||||
|
||||
|
|
|
@ -70,6 +70,8 @@ telemetry {
|
|||
statsd_address = "127.0.0.1:2345"
|
||||
disable_hostname = true
|
||||
collection_interval = "3s"
|
||||
publish_allocation_metrics = true
|
||||
publish_node_metrics = true
|
||||
}
|
||||
leave_on_interrupt = true
|
||||
leave_on_terminate = true
|
||||
|
|
|
@ -240,11 +240,13 @@ type ServerConfig struct {
|
|||
|
||||
// Telemetry is the telemetry configuration for the server
|
||||
type Telemetry struct {
|
||||
StatsiteAddr string `mapstructure:"statsite_address"`
|
||||
StatsdAddr string `mapstructure:"statsd_address"`
|
||||
DisableHostname bool `mapstructure:"disable_hostname"`
|
||||
CollectionInterval string `mapstructure:"collection_interval"`
|
||||
collectionInterval time.Duration `mapstructure:"-"`
|
||||
StatsiteAddr string `mapstructure:"statsite_address"`
|
||||
StatsdAddr string `mapstructure:"statsd_address"`
|
||||
DisableHostname bool `mapstructure:"disable_hostname"`
|
||||
CollectionInterval string `mapstructure:"collection_interval"`
|
||||
collectionInterval time.Duration `mapstructure:"-"`
|
||||
PublishAllocationMetrics bool `mapstructure:"publish_allocation_metrics"`
|
||||
PublishNodeMetrics bool `mapstructure:"publish_node_metrics"`
|
||||
|
||||
// Circonus: see https://github.com/circonus-labs/circonus-gometrics
|
||||
// for more details on the various configuration options.
|
||||
|
|
|
@ -493,6 +493,8 @@ func parseTelemetry(result **Telemetry, list *ast.ObjectList) error {
|
|||
"statsd_address",
|
||||
"disable_hostname",
|
||||
"collection_interval",
|
||||
"publish_allocation_metrics",
|
||||
"publish_node_metrics",
|
||||
"circonus_api_token",
|
||||
"circonus_api_app",
|
||||
"circonus_api_url",
|
||||
|
|
|
@ -83,11 +83,13 @@ func TestConfig_Parse(t *testing.T) {
|
|||
RetryMaxAttempts: 3,
|
||||
},
|
||||
Telemetry: &Telemetry{
|
||||
StatsiteAddr: "127.0.0.1:1234",
|
||||
StatsdAddr: "127.0.0.1:2345",
|
||||
DisableHostname: true,
|
||||
CollectionInterval: "3s",
|
||||
collectionInterval: 3 * time.Second,
|
||||
StatsiteAddr: "127.0.0.1:1234",
|
||||
StatsdAddr: "127.0.0.1:2345",
|
||||
DisableHostname: true,
|
||||
CollectionInterval: "3s",
|
||||
collectionInterval: 3 * time.Second,
|
||||
PublishAllocationMetrics: true,
|
||||
PublishNodeMetrics: true,
|
||||
},
|
||||
LeaveOnInt: true,
|
||||
LeaveOnTerm: true,
|
||||
|
|
Loading…
Reference in a new issue