Merge pull request #3162 from hashicorp/f-tagged-metrics-api
Tagged metrics API
This commit is contained in:
commit
187c1568aa
|
@ -13,6 +13,7 @@ import (
|
|||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
metrics "github.com/armon/go-metrics"
|
||||
"github.com/hashicorp/consul/api"
|
||||
version "github.com/hashicorp/go-version"
|
||||
"github.com/hashicorp/nomad/client"
|
||||
|
@ -66,15 +67,18 @@ type Agent struct {
|
|||
shutdown bool
|
||||
shutdownCh chan struct{}
|
||||
shutdownLock sync.Mutex
|
||||
|
||||
InmemSink *metrics.InmemSink
|
||||
}
|
||||
|
||||
// NewAgent is used to create a new agent with the given configuration
|
||||
func NewAgent(config *Config, logOutput io.Writer) (*Agent, error) {
|
||||
func NewAgent(config *Config, logOutput io.Writer, inmem *metrics.InmemSink) (*Agent, error) {
|
||||
a := &Agent{
|
||||
config: config,
|
||||
logger: log.New(logOutput, "", log.LstdFlags|log.Lmicroseconds),
|
||||
logOutput: logOutput,
|
||||
shutdownCh: make(chan struct{}),
|
||||
InmemSink: inmem,
|
||||
}
|
||||
|
||||
if err := a.setupConsul(config.Consul); err != nil {
|
||||
|
@ -331,9 +335,13 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) {
|
|||
|
||||
conf.ConsulConfig = a.config.Consul
|
||||
conf.VaultConfig = a.config.Vault
|
||||
|
||||
// Set up Telemetry configuration
|
||||
conf.StatsCollectionInterval = a.config.Telemetry.collectionInterval
|
||||
conf.PublishNodeMetrics = a.config.Telemetry.PublishNodeMetrics
|
||||
conf.PublishAllocationMetrics = a.config.Telemetry.PublishAllocationMetrics
|
||||
conf.DisableTaggedMetrics = a.config.Telemetry.DisableTaggedMetrics
|
||||
conf.BackwardsCompatibleMetrics = a.config.Telemetry.BackwardsCompatibleMetrics
|
||||
|
||||
// Set the TLS related configs
|
||||
conf.TLSConfig = a.config.TLSConfig
|
||||
|
@ -489,7 +497,6 @@ func (a *Agent) setupClient() error {
|
|||
}
|
||||
}
|
||||
|
||||
// Create the client
|
||||
client, err := client.NewClient(conf, a.consulCatalog, a.consulService, a.logger)
|
||||
if err != nil {
|
||||
return fmt.Errorf("client setup failed: %v", err)
|
||||
|
|
|
@ -12,6 +12,7 @@ import (
|
|||
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
sconfig "github.com/hashicorp/nomad/nomad/structs/config"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func getPort() int {
|
||||
|
@ -316,6 +317,29 @@ func TestAgent_ClientConfig(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
// Clients should inherit telemetry configuration
|
||||
func TestAget_Client_TelemetryConfiguration(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
conf := DefaultConfig()
|
||||
conf.DevMode = true
|
||||
conf.Telemetry.DisableTaggedMetrics = true
|
||||
conf.Telemetry.BackwardsCompatibleMetrics = true
|
||||
|
||||
a := &Agent{config: conf}
|
||||
|
||||
c, err := a.clientConfig()
|
||||
assert.Nil(err)
|
||||
|
||||
telemetry := conf.Telemetry
|
||||
|
||||
assert.Equal(c.StatsCollectionInterval, telemetry.collectionInterval)
|
||||
assert.Equal(c.PublishNodeMetrics, telemetry.PublishNodeMetrics)
|
||||
assert.Equal(c.PublishAllocationMetrics, telemetry.PublishAllocationMetrics)
|
||||
assert.Equal(c.DisableTaggedMetrics, telemetry.DisableTaggedMetrics)
|
||||
assert.Equal(c.BackwardsCompatibleMetrics, telemetry.BackwardsCompatibleMetrics)
|
||||
}
|
||||
|
||||
// TestAgent_HTTPCheck asserts Agent.agentHTTPCheck properly alters the HTTP
|
||||
// API health check depending on configuration.
|
||||
func TestAgent_HTTPCheck(t *testing.T) {
|
||||
|
|
|
@ -15,12 +15,12 @@ import (
|
|||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/armon/go-metrics"
|
||||
metrics "github.com/armon/go-metrics"
|
||||
"github.com/armon/go-metrics/circonus"
|
||||
"github.com/armon/go-metrics/datadog"
|
||||
"github.com/hashicorp/consul/lib"
|
||||
"github.com/hashicorp/go-checkpoint"
|
||||
"github.com/hashicorp/go-syslog"
|
||||
checkpoint "github.com/hashicorp/go-checkpoint"
|
||||
gsyslog "github.com/hashicorp/go-syslog"
|
||||
"github.com/hashicorp/logutils"
|
||||
"github.com/hashicorp/nomad/helper/flag-helpers"
|
||||
"github.com/hashicorp/nomad/helper/gated-writer"
|
||||
|
@ -331,9 +331,9 @@ func (c *Command) setupLoggers(config *Config) (*gatedwriter.Writer, *logWriter,
|
|||
}
|
||||
|
||||
// setupAgent is used to start the agent and various interfaces
|
||||
func (c *Command) setupAgent(config *Config, logOutput io.Writer) error {
|
||||
func (c *Command) setupAgent(config *Config, logOutput io.Writer, inmem *metrics.InmemSink) error {
|
||||
c.Ui.Output("Starting Nomad agent...")
|
||||
agent, err := NewAgent(config, logOutput)
|
||||
agent, err := NewAgent(config, logOutput, inmem)
|
||||
if err != nil {
|
||||
c.Ui.Error(fmt.Sprintf("Error starting agent: %s", err))
|
||||
return err
|
||||
|
@ -444,13 +444,14 @@ func (c *Command) Run(args []string) int {
|
|||
}
|
||||
|
||||
// Initialize the telemetry
|
||||
if err := c.setupTelemetry(config); err != nil {
|
||||
inmem, err := c.setupTelemetry(config)
|
||||
if err != nil {
|
||||
c.Ui.Error(fmt.Sprintf("Error initializing telemetry: %s", err))
|
||||
return 1
|
||||
}
|
||||
|
||||
// Create the agent
|
||||
if err := c.setupAgent(config, logOutput); err != nil {
|
||||
if err := c.setupAgent(config, logOutput, inmem); err != nil {
|
||||
logGate.Flush()
|
||||
return 1
|
||||
}
|
||||
|
@ -619,7 +620,7 @@ func (c *Command) handleReload(config *Config) *Config {
|
|||
}
|
||||
|
||||
// setupTelemetry is used to set up the telemetry sub-systems
|
||||
func (c *Command) setupTelemetry(config *Config) error {
|
||||
func (c *Command) setupTelemetry(config *Config) (*metrics.InmemSink, error) {
|
||||
/* Setup telemetry
|
||||
Aggregate on 10 second intervals for 1 minute. Expose the
|
||||
metrics over stderr when there is a SIGUSR1 received.
|
||||
|
@ -646,7 +647,7 @@ func (c *Command) setupTelemetry(config *Config) error {
|
|||
if telConfig.StatsiteAddr != "" {
|
||||
sink, err := metrics.NewStatsiteSink(telConfig.StatsiteAddr)
|
||||
if err != nil {
|
||||
return err
|
||||
return inm, err
|
||||
}
|
||||
fanout = append(fanout, sink)
|
||||
}
|
||||
|
@ -655,7 +656,7 @@ func (c *Command) setupTelemetry(config *Config) error {
|
|||
if telConfig.StatsdAddr != "" {
|
||||
sink, err := metrics.NewStatsdSink(telConfig.StatsdAddr)
|
||||
if err != nil {
|
||||
return err
|
||||
return inm, err
|
||||
}
|
||||
fanout = append(fanout, sink)
|
||||
}
|
||||
|
@ -664,7 +665,7 @@ func (c *Command) setupTelemetry(config *Config) error {
|
|||
if telConfig.DataDogAddr != "" {
|
||||
sink, err := datadog.NewDogStatsdSink(telConfig.DataDogAddr, config.NodeName)
|
||||
if err != nil {
|
||||
return err
|
||||
return inm, err
|
||||
}
|
||||
fanout = append(fanout, sink)
|
||||
}
|
||||
|
@ -700,7 +701,7 @@ func (c *Command) setupTelemetry(config *Config) error {
|
|||
|
||||
sink, err := circonus.NewCirconusSink(cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
return inm, err
|
||||
}
|
||||
sink.Start()
|
||||
fanout = append(fanout, sink)
|
||||
|
@ -714,7 +715,7 @@ func (c *Command) setupTelemetry(config *Config) error {
|
|||
metricsConf.EnableHostname = false
|
||||
metrics.NewGlobal(metricsConf, inm)
|
||||
}
|
||||
return nil
|
||||
return inm, nil
|
||||
}
|
||||
|
||||
// setupSCADA is used to start a new SCADA provider and listener,
|
||||
|
|
|
@ -60,8 +60,6 @@ client {
|
|||
gc_inode_usage_threshold = 91
|
||||
gc_max_allocs = 50
|
||||
no_host_uuid = false
|
||||
disable_tagged_metrics = true
|
||||
backwards_compatible_metrics = true
|
||||
}
|
||||
server {
|
||||
enabled = true
|
||||
|
@ -98,6 +96,8 @@ telemetry {
|
|||
collection_interval = "3s"
|
||||
publish_allocation_metrics = true
|
||||
publish_node_metrics = true
|
||||
disable_tagged_metrics = true
|
||||
backwards_compatible_metrics = true
|
||||
}
|
||||
leave_on_interrupt = true
|
||||
leave_on_terminate = true
|
||||
|
|
|
@ -229,14 +229,6 @@ type ClientConfig struct {
|
|||
// NoHostUUID disables using the host's UUID and will force generation of a
|
||||
// random UUID.
|
||||
NoHostUUID *bool `mapstructure:"no_host_uuid"`
|
||||
|
||||
// DisableTaggedMetrics disables a new version of generating metrics which
|
||||
// uses tags
|
||||
DisableTaggedMetrics bool `mapstructure:"disable_tagged_metrics"`
|
||||
|
||||
// BackwardsCompatibleMetrics allows for generating metrics in a simple
|
||||
// key/value structure as done in older versions of Nomad
|
||||
BackwardsCompatibleMetrics bool `mapstructure:"backwards_compatible_metrics"`
|
||||
}
|
||||
|
||||
// ACLConfig is configuration specific to the ACL system
|
||||
|
@ -371,6 +363,14 @@ type Telemetry struct {
|
|||
PublishAllocationMetrics bool `mapstructure:"publish_allocation_metrics"`
|
||||
PublishNodeMetrics bool `mapstructure:"publish_node_metrics"`
|
||||
|
||||
// DisableTaggedMetrics disables a new version of generating metrics which
|
||||
// uses tags
|
||||
DisableTaggedMetrics bool `mapstructure:"disable_tagged_metrics"`
|
||||
|
||||
// BackwardsCompatibleMetrics allows for generating metrics in a simple
|
||||
// key/value structure as done in older versions of Nomad
|
||||
BackwardsCompatibleMetrics bool `mapstructure:"backwards_compatible_metrics"`
|
||||
|
||||
// Circonus: see https://github.com/circonus-labs/circonus-gometrics
|
||||
// for more details on the various configuration options.
|
||||
// Valid configuration combinations:
|
||||
|
@ -1105,14 +1105,6 @@ func (a *ClientConfig) Merge(b *ClientConfig) *ClientConfig {
|
|||
result.NoHostUUID = b.NoHostUUID
|
||||
}
|
||||
|
||||
if b.DisableTaggedMetrics {
|
||||
result.DisableTaggedMetrics = b.DisableTaggedMetrics
|
||||
}
|
||||
|
||||
if b.BackwardsCompatibleMetrics {
|
||||
result.BackwardsCompatibleMetrics = b.BackwardsCompatibleMetrics
|
||||
}
|
||||
|
||||
// Add the servers
|
||||
result.Servers = append(result.Servers, b.Servers...)
|
||||
|
||||
|
@ -1214,6 +1206,15 @@ func (a *Telemetry) Merge(b *Telemetry) *Telemetry {
|
|||
if b.CirconusBrokerSelectTag != "" {
|
||||
result.CirconusBrokerSelectTag = b.CirconusBrokerSelectTag
|
||||
}
|
||||
|
||||
if b.DisableTaggedMetrics {
|
||||
result.DisableTaggedMetrics = b.DisableTaggedMetrics
|
||||
}
|
||||
|
||||
if b.BackwardsCompatibleMetrics {
|
||||
result.BackwardsCompatibleMetrics = b.BackwardsCompatibleMetrics
|
||||
}
|
||||
|
||||
return &result
|
||||
}
|
||||
|
||||
|
|
|
@ -357,8 +357,6 @@ func parseClient(result **ClientConfig, list *ast.ObjectList) error {
|
|||
"gc_parallel_destroys",
|
||||
"gc_max_allocs",
|
||||
"no_host_uuid",
|
||||
"disable_tagged_metrics",
|
||||
"backwards_compatible_metrics",
|
||||
}
|
||||
if err := checkHCLKeys(listVal, valid); err != nil {
|
||||
return err
|
||||
|
@ -635,6 +633,8 @@ func parseTelemetry(result **Telemetry, list *ast.ObjectList) error {
|
|||
"circonus_check_tags",
|
||||
"circonus_broker_id",
|
||||
"circonus_broker_select_tag",
|
||||
"disable_tagged_metrics",
|
||||
"backwards_compatible_metrics",
|
||||
}
|
||||
if err := checkHCLKeys(listVal, valid); err != nil {
|
||||
return err
|
||||
|
|
|
@ -75,14 +75,12 @@ func TestConfig_Parse(t *testing.T) {
|
|||
ReservedPorts: "1,100,10-12",
|
||||
ParsedReservedPorts: []int{1, 10, 11, 12, 100},
|
||||
},
|
||||
GCInterval: 6 * time.Second,
|
||||
GCParallelDestroys: 6,
|
||||
GCDiskUsageThreshold: 82,
|
||||
GCInodeUsageThreshold: 91,
|
||||
GCMaxAllocs: 50,
|
||||
NoHostUUID: helper.BoolToPtr(false),
|
||||
DisableTaggedMetrics: true,
|
||||
BackwardsCompatibleMetrics: true,
|
||||
GCInterval: 6 * time.Second,
|
||||
GCParallelDestroys: 6,
|
||||
GCDiskUsageThreshold: 82,
|
||||
GCInodeUsageThreshold: 91,
|
||||
GCMaxAllocs: 50,
|
||||
NoHostUUID: helper.BoolToPtr(false),
|
||||
},
|
||||
Server: &ServerConfig{
|
||||
Enabled: true,
|
||||
|
@ -113,14 +111,16 @@ func TestConfig_Parse(t *testing.T) {
|
|||
ReplicationToken: "foobar",
|
||||
},
|
||||
Telemetry: &Telemetry{
|
||||
StatsiteAddr: "127.0.0.1:1234",
|
||||
StatsdAddr: "127.0.0.1:2345",
|
||||
DisableHostname: true,
|
||||
UseNodeName: false,
|
||||
CollectionInterval: "3s",
|
||||
collectionInterval: 3 * time.Second,
|
||||
PublishAllocationMetrics: true,
|
||||
PublishNodeMetrics: true,
|
||||
StatsiteAddr: "127.0.0.1:1234",
|
||||
StatsdAddr: "127.0.0.1:2345",
|
||||
DisableHostname: true,
|
||||
UseNodeName: false,
|
||||
CollectionInterval: "3s",
|
||||
collectionInterval: 3 * time.Second,
|
||||
PublishAllocationMetrics: true,
|
||||
PublishNodeMetrics: true,
|
||||
DisableTaggedMetrics: true,
|
||||
BackwardsCompatibleMetrics: true,
|
||||
},
|
||||
LeaveOnInt: true,
|
||||
LeaveOnTerm: true,
|
||||
|
|
|
@ -55,6 +55,8 @@ func TestConfig_Merge(t *testing.T) {
|
|||
StatsdAddr: "127.0.0.1:8125",
|
||||
DataDogAddr: "127.0.0.1:8125",
|
||||
DisableHostname: false,
|
||||
DisableTaggedMetrics: true,
|
||||
BackwardsCompatibleMetrics: true,
|
||||
CirconusAPIToken: "0",
|
||||
CirconusAPIApp: "nomadic",
|
||||
CirconusAPIURL: "http://api.circonus.com/v2",
|
||||
|
@ -89,8 +91,6 @@ func TestConfig_Merge(t *testing.T) {
|
|||
ReservedPorts: "1,10-30,55",
|
||||
ParsedReservedPorts: []int{1, 2, 4},
|
||||
},
|
||||
DisableTaggedMetrics: true,
|
||||
BackwardsCompatibleMetrics: true,
|
||||
},
|
||||
Server: &ServerConfig{
|
||||
Enabled: false,
|
||||
|
@ -185,6 +185,8 @@ func TestConfig_Merge(t *testing.T) {
|
|||
DisableHostname: true,
|
||||
PublishNodeMetrics: true,
|
||||
PublishAllocationMetrics: true,
|
||||
DisableTaggedMetrics: true,
|
||||
BackwardsCompatibleMetrics: true,
|
||||
CirconusAPIToken: "1",
|
||||
CirconusAPIApp: "nomad",
|
||||
CirconusAPIURL: "https://api.circonus.com/v2",
|
||||
|
@ -226,12 +228,10 @@ func TestConfig_Merge(t *testing.T) {
|
|||
ReservedPorts: "2,10-30,55",
|
||||
ParsedReservedPorts: []int{1, 2, 3},
|
||||
},
|
||||
GCInterval: 6 * time.Second,
|
||||
GCParallelDestroys: 6,
|
||||
GCDiskUsageThreshold: 71,
|
||||
GCInodeUsageThreshold: 86,
|
||||
DisableTaggedMetrics: true,
|
||||
BackwardsCompatibleMetrics: true,
|
||||
GCInterval: 6 * time.Second,
|
||||
GCParallelDestroys: 6,
|
||||
GCDiskUsageThreshold: 71,
|
||||
GCInodeUsageThreshold: 86,
|
||||
},
|
||||
Server: &ServerConfig{
|
||||
Enabled: true,
|
||||
|
|
|
@ -168,6 +168,8 @@ func (s *HTTPServer) registerHandlers(enableDebug bool) {
|
|||
s.mux.HandleFunc("/v1/agent/servers", s.wrap(s.AgentServersRequest))
|
||||
s.mux.HandleFunc("/v1/agent/keyring/", s.wrap(s.KeyringOperationRequest))
|
||||
|
||||
s.mux.HandleFunc("/v1/metrics", s.wrap(s.MetricsRequest))
|
||||
|
||||
s.mux.HandleFunc("/v1/validate/job", s.wrap(s.ValidateJobRequest))
|
||||
|
||||
s.mux.HandleFunc("/v1/regions", s.wrap(s.RegionListRequest))
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
package agent
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
)
|
||||
|
||||
func (s *HTTPServer) MetricsRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
|
||||
if req.Method == "GET" {
|
||||
return s.newMetricsRequest(resp, req)
|
||||
}
|
||||
return nil, CodedError(405, ErrInvalidMethod)
|
||||
}
|
||||
|
||||
func (s *HTTPServer) newMetricsRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
|
||||
return s.agent.InmemSink.DisplayMetrics(resp, req)
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
package agent
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
|
||||
metrics "github.com/armon/go-metrics"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestHTTP_MetricsWithIllegalMethod(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
t.Parallel()
|
||||
httpTest(t, nil, func(s *TestAgent) {
|
||||
req, err := http.NewRequest("DELETE", "/v1/metrics", nil)
|
||||
assert.Nil(err)
|
||||
respW := httptest.NewRecorder()
|
||||
|
||||
_, err = s.Server.MetricsRequest(respW, req)
|
||||
assert.NotNil(err, "HTTP DELETE should not be accepted for this endpoint")
|
||||
})
|
||||
}
|
||||
|
||||
func TestHTTP_Metrics(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
t.Parallel()
|
||||
httpTest(t, nil, func(s *TestAgent) {
|
||||
// make a separate HTTP request first, to ensure Nomad has written metrics
|
||||
// and prevent a race condition
|
||||
req, err := http.NewRequest("GET", "/v1/agent/self", nil)
|
||||
assert.Nil(err)
|
||||
respW := httptest.NewRecorder()
|
||||
s.Server.AgentSelfRequest(respW, req)
|
||||
|
||||
// now make a metrics endpoint request, which should be already initialized
|
||||
// and written to
|
||||
req, err = http.NewRequest("GET", "/v1/metrics", nil)
|
||||
assert.Nil(err)
|
||||
respW = httptest.NewRecorder()
|
||||
|
||||
resp, err := s.Server.MetricsRequest(respW, req)
|
||||
assert.Nil(err)
|
||||
|
||||
res := resp.(metrics.MetricsSummary)
|
||||
|
||||
gauges := res.Gauges
|
||||
assert.NotEqual(0, len(gauges))
|
||||
})
|
||||
}
|
|
@ -13,6 +13,7 @@ import (
|
|||
"strings"
|
||||
"time"
|
||||
|
||||
metrics "github.com/armon/go-metrics"
|
||||
"github.com/hashicorp/nomad/api"
|
||||
"github.com/hashicorp/nomad/client/fingerprint"
|
||||
"github.com/hashicorp/nomad/nomad"
|
||||
|
@ -187,7 +188,14 @@ func (a *TestAgent) start() (*Agent, error) {
|
|||
a.LogOutput = os.Stderr
|
||||
}
|
||||
|
||||
agent, err := NewAgent(a.Config, a.LogOutput)
|
||||
inm := metrics.NewInmemSink(10*time.Second, time.Minute)
|
||||
metrics.NewGlobal(metrics.DefaultConfig("service-name"), inm)
|
||||
|
||||
if inm == nil {
|
||||
return nil, fmt.Errorf("unable to set up in memory metrics needed for agent initialization")
|
||||
}
|
||||
|
||||
agent, err := NewAgent(a.Config, a.LogOutput, inm)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
|
@ -0,0 +1,90 @@
|
|||
---
|
||||
layout: api
|
||||
page_title: Metrics - HTTP API
|
||||
sidebar_current: api-metrics
|
||||
description: |-
|
||||
The /metrics endpoint is used to view metrics for Nomad
|
||||
---
|
||||
|
||||
# Metrics HTTP API
|
||||
|
||||
The `/metrics` endpoint returns metrics for the current Nomad process.
|
||||
|
||||
| Method | Path | Produces |
|
||||
| ------- | --------------- | -------------------------- |
|
||||
| `GET`   | `/v1/metrics`   | `application/json`         |
|
||||
|
||||
The table below shows this endpoint's support for
|
||||
[blocking queries](/api/index.html#blocking-queries) and
|
||||
[required ACLs](/api/index.html#acls).
|
||||
|
||||
| Blocking Queries | ACL Required |
|
||||
| ---------------- | ------------ |
|
||||
| `NO` | `none` |
|
||||
|
||||
### Sample Request
|
||||
|
||||
```text
|
||||
$ curl https://nomad.rocks/v1/metrics
|
||||
```
|
||||
|
||||
### Sample Response
|
||||
|
||||
```json
|
||||
{
|
||||
"Counters":[
|
||||
{
|
||||
"Count":11,
|
||||
"Labels":{},
|
||||
"Max":1.0,
|
||||
"Mean":1.0,
|
||||
"Min":1.0,
|
||||
"Name":"nomad.nomad.rpc.query",
|
||||
"Stddev":0.0,
|
||||
"Sum":11.0
|
||||
}
|
||||
],
|
||||
"Gauges":[
|
||||
{
|
||||
"Labels":{
|
||||
"node_id":"cd7c3e0c-0174-29dd-17ba-ea4609e0fd1f",
|
||||
"datacenter":"dc1"
|
||||
},
|
||||
"Name":"nomad.client.allocations.blocked",
|
||||
"Value":0.0
|
||||
},
|
||||
{
|
||||
"Labels":{
|
||||
"datacenter":"dc1",
|
||||
"node_id":"cd7c3e0c-0174-29dd-17ba-ea4609e0fd1f"
|
||||
},
|
||||
"Name":"nomad.client.allocations.migrating",
|
||||
"Value":0.0
|
||||
}
|
||||
],
|
||||
"Samples":[
|
||||
{
|
||||
"Count":20,
|
||||
"Labels":{},
|
||||
"Max":0.03544100001454353,
|
||||
"Mean":0.023678050097078084,
|
||||
"Min":0.00956599973142147,
|
||||
"Name":"nomad.memberlist.gossip",
|
||||
"Stddev":0.005445327799243976,
|
||||
"Sum":0.4735610019415617
|
||||
},
|
||||
{
|
||||
"Count":1,
|
||||
"Labels":{},
|
||||
"Max":0.0964059978723526,
|
||||
"Mean":0.0964059978723526,
|
||||
"Min":0.0964059978723526,
|
||||
"Name":"nomad.nomad.client.update_status",
|
||||
"Stddev":0.0,
|
||||
"Sum":0.0964059978723526
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
```
|
||||
|
|
@ -58,6 +58,20 @@ The following options are available on all telemetry configurations.
|
|||
- `publish_node_metrics` `(bool: false)` - Specifies if Nomad should publish
|
||||
runtime metrics of nodes.
|
||||
|
||||
- `backwards_compatible_metrics` `(bool: false)` - Specifies if Nomad should
|
||||
publish metrics that are backwards compatible with versions below 0.7, as
|
||||
post version 0.7, Nomad emits tagged metrics. All new metrics will
|
||||
only be added to tagged metrics. Note that this option is used to transition
|
||||
monitoring to tagged metrics and will eventually be deprecated.
|
||||
|
||||
|
||||
- `disable_tagged_metrics` `(bool: false)` - Specifies if Nomad should not emit
|
||||
tagged metrics and only emit metrics compatible with versions below Nomad
|
||||
0.7. Note that this option is used to transition monitoring to tagged
|
||||
metrics and will eventually be deprecated.
|
||||
|
||||
|
||||
|
||||
### `statsite`
|
||||
|
||||
These `telemetry` parameters apply to
|
||||
|
|
|
@ -12,7 +12,14 @@ The Nomad agent collects various runtime metrics about the performance of
|
|||
different libraries and subsystems. These metrics are aggregated on a ten
|
||||
second interval and are retained for one minute.
|
||||
|
||||
To view this data, you must send a signal to the Nomad process: on Unix,
|
||||
This data can be accessed via an HTTP endpoint or via sending a signal to the
|
||||
Nomad process.
|
||||
|
||||
Via HTTP, this data is available at `/v1/metrics`. See
|
||||
[Metrics](/api/metrics.html) for more information.
|
||||
|
||||
|
||||
To view this data via sending a signal to the Nomad process: on Unix,
|
||||
this is `USR1` while on Windows it is `BREAK`. Once Nomad receives the signal,
|
||||
it will dump the current telemetry information to the agent's `stderr`.
|
||||
|
||||
|
@ -229,7 +236,196 @@ configuration block.
|
|||
Please see the [agent configuration](/docs/agent/configuration/telemetry.html)
|
||||
page for more details.
|
||||
|
||||
## Host Metrics
|
||||
## Host Metrics (post Nomad version 0.7)
|
||||
|
||||
Starting in version 0.7, Nomad will emit tagged metrics, in the below format:
|
||||
|
||||
<table class="table table-bordered table-striped">
|
||||
<tr>
|
||||
<th>Metric</th>
|
||||
<th>Description</th>
|
||||
<th>Unit</th>
|
||||
<th>Type</th>
|
||||
<th>Labels</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.allocated.cpu`</td>
|
||||
<td>Total amount of CPU shares the scheduler has allocated to tasks</td>
|
||||
<td>MHz</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.unallocated.cpu`</td>
|
||||
<td>Total amount of CPU shares free for the scheduler to allocate to tasks</td>
|
||||
<td>MHz</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.allocated.memory`</td>
|
||||
<td>Total amount of memory the scheduler has allocated to tasks</td>
|
||||
<td>Megabytes</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.unallocated.memory`</td>
|
||||
<td>Total amount of memory free for the scheduler to allocate to tasks</td>
|
||||
<td>Megabytes</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.allocated.disk`</td>
|
||||
<td>Total amount of disk space the scheduler has allocated to tasks</td>
|
||||
<td>Megabytes</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.unallocated.disk`</td>
|
||||
<td>Total amount of disk space free for the scheduler to allocate to tasks</td>
|
||||
<td>Megabytes</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.allocated.iops`</td>
|
||||
<td>Total amount of IOPS the scheduler has allocated to tasks</td>
|
||||
<td>IOPS</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.unallocated.iops`</td>
|
||||
<td>Total amount of IOPS free for the scheduler to allocate to tasks</td>
|
||||
<td>IOPS</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.allocated.network`</td>
|
||||
<td>Total amount of bandwidth the scheduler has allocated to tasks on the
|
||||
given device</td>
|
||||
<td>Megabits</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter, device</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.unallocated.network`</td>
|
||||
<td>Total amount of bandwidth free for the scheduler to allocate to tasks on
|
||||
the given device</td>
|
||||
<td>Megabits</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter, device</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.host.memory.total`</td>
|
||||
<td>Total amount of physical memory on the node</td>
|
||||
<td>Bytes</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.host.memory.available`</td>
|
||||
<td>Total amount of memory available to processes which includes free and
|
||||
cached memory</td>
|
||||
<td>Bytes</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.host.memory.used`</td>
|
||||
<td>Amount of memory used by processes</td>
|
||||
<td>Bytes</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.host.memory.free`</td>
|
||||
<td>Amount of memory which is free</td>
|
||||
<td>Bytes</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.uptime`</td>
|
||||
<td>Uptime of the host running the Nomad client</td>
|
||||
<td>Seconds</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.host.cpu.total`</td>
|
||||
<td>Total CPU utilization</td>
|
||||
<td>Percentage</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter, cpu</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.host.cpu.user`</td>
|
||||
<td>CPU utilization in the user space</td>
|
||||
<td>Percentage</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter, cpu</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.host.cpu.system`</td>
|
||||
<td>CPU utilization in the system space</td>
|
||||
<td>Percentage</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter, cpu</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.host.cpu.idle`</td>
|
||||
<td>Idle time spent by the CPU</td>
|
||||
<td>Percentage</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter, cpu</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.host.disk.size`</td>
|
||||
<td>Total size of the device</td>
|
||||
<td>Bytes</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter, disk</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.host.disk.used`</td>
|
||||
<td>Amount of space which has been used</td>
|
||||
<td>Bytes</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter, disk</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.host.disk.available`</td>
|
||||
<td>Amount of space which is available</td>
|
||||
<td>Bytes</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter, disk</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.host.disk.used_percent`</td>
|
||||
<td>Percentage of disk space used</td>
|
||||
<td>Percentage</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter, disk</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>`nomad.client.host.disk.inodes_percent`</td>
|
||||
<td>Disk space consumed by the inodes</td>
|
||||
<td>Percentage</td>
|
||||
<td>Gauge</td>
|
||||
<td>node_id, datacenter, disk</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
## Host Metrics (deprecated post Nomad 0.7)
|
||||
|
||||
The below are metrics emitted by Nomad in versions prior to 0.7. These metrics
|
||||
can be emitted in the below format post-0.7 (as well as the new format,
|
||||
detailed above) but any new metrics will only be available in the new format.
|
||||
|
||||
<table class="table table-bordered table-striped">
|
||||
<tr>
|
||||
|
|
|
@ -51,6 +51,10 @@
|
|||
<a href="/api/nodes.html">Nodes</a>
|
||||
</li>
|
||||
|
||||
<li<%= sidebar_current("api-metrics") %>>
|
||||
<a href="/api/metrics.html">Metrics</a>
|
||||
</li>
|
||||
|
||||
<li<%= sidebar_current("api-operator") %>>
|
||||
<a href="/api/operator.html">Operator</a>
|
||||
</li>
|
||||
|
|
Loading…
Reference in New Issue