Merge pull request #1501 from hashicorp/f-stats-opt-in

Allow operators to opt into publishing node and alloc metrics
This commit is contained in:
Diptanu Choudhury 2016-08-04 13:33:56 -07:00 committed by GitHub
commit 5ff750db96
9 changed files with 40 additions and 15 deletions

View file

@ -1,5 +1,9 @@
## 0.4.1 (UNRELEASED)
__BACKWARDS INCOMPATIBILITIES:__
* telemetry: Operators will have to explicitly opt in for the Nomad client to
publish allocation and node metrics
IMPROVEMENTS:
* core: Allow count 0 on system jobs [GH-1421]
* core: Gracefully handle short lived outages by holding RPC calls [GH-1403]
@ -15,6 +19,7 @@ IMPROVEMENTS:
* client: Fingerprint network speed on Windows [GH-1443]
* driver/docker: Allow working directory to be configured [GH-1513]
* telemetry: Circonus integration for telemetry metrics [GH-1459]
* telemetry: Allow operators to opt in to publishing metrics [GH-1501]
BUG FIXES:
* core: Sanitize empty slices/maps in jobs to avoid incorrect create/destroy

View file

@ -1397,7 +1397,11 @@ func (c *Client) collectHostStats() {
c.resourceUsageLock.Lock()
c.resourceUsage = ru
c.resourceUsageLock.Unlock()
c.emitStats(ru)
// Publish Node metrics if operator has opted in
if c.config.PublishNodeMetrics {
c.emitStats(ru)
}
case <-c.shutdownCh:
return
}

View file

@ -117,6 +117,14 @@ type Config struct {
// StatsCollectionInterval is the interval at which the Nomad client
// collects resource usage stats
StatsCollectionInterval time.Duration
// PublishNodeMetrics determines whether Nomad publishes node-level
// metrics to remote telemetry sinks
PublishNodeMetrics bool
// PublishAllocationMetrics determines whether Nomad publishes
// allocation-level metrics to remote telemetry sinks
PublishAllocationMetrics bool
}
func (c *Config) Copy() *Config {

View file

@ -640,7 +640,7 @@ func (r *TaskRunner) Destroy() {
// emitStats emits resource usage stats of tasks to remote metrics collector
// sinks
func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
if ru.ResourceUsage.MemoryStats != nil {
if ru.ResourceUsage.MemoryStats != nil && r.config.PublishAllocationMetrics {
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
@ -649,7 +649,7 @@ func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
}
if ru.ResourceUsage.CpuStats != nil {
if ru.ResourceUsage.CpuStats != nil && r.config.PublishAllocationMetrics {
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
@ -657,6 +657,4 @@ func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
}
//TODO Add Pid stats when we add an API to enable/disable them
}

View file

@ -350,6 +350,8 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) {
conf.ConsulConfig = a.config.Consul
conf.StatsCollectionInterval = a.config.Telemetry.collectionInterval
conf.PublishNodeMetrics = a.config.Telemetry.PublishNodeMetrics
conf.PublishAllocationMetrics = a.config.Telemetry.PublishAllocationMetrics
return conf, nil
}

View file

@ -70,6 +70,8 @@ telemetry {
statsd_address = "127.0.0.1:2345"
disable_hostname = true
collection_interval = "3s"
publish_allocation_metrics = true
publish_node_metrics = true
}
leave_on_interrupt = true
leave_on_terminate = true

View file

@ -240,11 +240,13 @@ type ServerConfig struct {
// Telemetry is the telemetry configuration for the server
type Telemetry struct {
StatsiteAddr string `mapstructure:"statsite_address"`
StatsdAddr string `mapstructure:"statsd_address"`
DisableHostname bool `mapstructure:"disable_hostname"`
CollectionInterval string `mapstructure:"collection_interval"`
collectionInterval time.Duration `mapstructure:"-"`
StatsiteAddr string `mapstructure:"statsite_address"`
StatsdAddr string `mapstructure:"statsd_address"`
DisableHostname bool `mapstructure:"disable_hostname"`
CollectionInterval string `mapstructure:"collection_interval"`
collectionInterval time.Duration `mapstructure:"-"`
PublishAllocationMetrics bool `mapstructure:"publish_allocation_metrics"`
PublishNodeMetrics bool `mapstructure:"publish_node_metrics"`
// Circonus: see https://github.com/circonus-labs/circonus-gometrics
// for more details on the various configuration options.

View file

@ -493,6 +493,8 @@ func parseTelemetry(result **Telemetry, list *ast.ObjectList) error {
"statsd_address",
"disable_hostname",
"collection_interval",
"publish_allocation_metrics",
"publish_node_metrics",
"circonus_api_token",
"circonus_api_app",
"circonus_api_url",

View file

@ -83,11 +83,13 @@ func TestConfig_Parse(t *testing.T) {
RetryMaxAttempts: 3,
},
Telemetry: &Telemetry{
StatsiteAddr: "127.0.0.1:1234",
StatsdAddr: "127.0.0.1:2345",
DisableHostname: true,
CollectionInterval: "3s",
collectionInterval: 3 * time.Second,
StatsiteAddr: "127.0.0.1:1234",
StatsdAddr: "127.0.0.1:2345",
DisableHostname: true,
CollectionInterval: "3s",
collectionInterval: 3 * time.Second,
PublishAllocationMetrics: true,
PublishNodeMetrics: true,
},
LeaveOnInt: true,
LeaveOnTerm: true,