Publishing metrics for job summary
parent f72b705240
commit 524a1f0712
@@ -233,6 +233,9 @@ func convertServerConfig(agentConfig *Config, logOutput io.Writer) (*nomad.Config, error) {
 	// Set the TLS config
 	conf.TLSConfig = agentConfig.TLSConfig
 
+	// Setup telemetry related config
+	conf.StatsCollectionInterval = agentConfig.Telemetry.collectionInterval
+
 	return conf, nil
 }
 
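The agent copies an already-parsed duration onto the server config; the unexported collectionInterval field implies the telemetry stanza's raw string is parsed earlier during config loading. Below is a minimal, self-contained sketch of that parse step, assuming a Telemetry shape whose exported field name does not appear in this diff:

package main

import (
	"fmt"
	"time"
)

// Telemetry mirrors the assumed shape: a raw string from the config file
// plus the parsed duration that convertServerConfig copies to the server.
// Only the lowercase collectionInterval field is visible in this commit.
type Telemetry struct {
	CollectionInterval string        // e.g. "1s", as written in the agent config (assumed name)
	collectionInterval time.Duration // parsed form used at runtime
}

func (t *Telemetry) parseInterval() error {
	d, err := time.ParseDuration(t.CollectionInterval)
	if err != nil {
		return fmt.Errorf("invalid collection_interval %q: %v", t.CollectionInterval, err)
	}
	t.collectionInterval = d
	return nil
}

func main() {
	t := &Telemetry{CollectionInterval: "1s"}
	if err := t.parseInterval(); err != nil {
		panic(err)
	}
	fmt.Println(t.collectionInterval) // 1s
}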
@@ -245,6 +245,10 @@ type Config struct {
 
 	// SentinelConfig is this Agent's Sentinel configuration
 	SentinelConfig *config.SentinelConfig
+
+	// StatsCollectionInterval is the interval at which the Nomad server
+	// publishes metrics which are periodic in nature like updating gauges
+	StatsCollectionInterval time.Duration
 }
 
 // CheckVersion is used to check if the ProtocolVersion is valid
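One caveat with a plain time.Duration field: its zero value is 0, so a config that never sets it would make the publish loop below reset its timer to zero and republish in a tight loop. A short sketch of a guard a caller might add; the 1s fallback is an assumption for illustration, not part of this commit:

package main

import (
	"fmt"
	"time"
)

// Config mirrors only the field added in this commit.
type Config struct {
	StatsCollectionInterval time.Duration
}

func main() {
	conf := &Config{}
	// An unset interval would make timer.Reset(0) fire immediately on
	// every pass; defaulting it avoids that. The 1s value is assumed.
	if conf.StatsCollectionInterval == 0 {
		conf.StatsCollectionInterval = 1 * time.Second
	}
	fmt.Println(conf.StatsCollectionInterval) // 1s
}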
@@ -180,6 +180,9 @@ func (s *Server) establishLeadership(stopCh chan struct{}) error {
 	// Periodically unblock failed allocations
 	go s.periodicUnblockFailedEvals(stopCh)
 
+	// Periodically publish job summary metrics
+	go s.publishJobSummaryMetrics(stopCh)
+
 	// Setup the heartbeat timers. This is done both when starting up or when
 	// a leader fail over happens. Since the timers are maintained by the leader
 	// node, effectively this means all the timers are renewed at the time of failover.
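The new goroutine follows the same lifecycle as the other leader-only loops: started in establishLeadership, stopped when stopCh is closed on leadership loss. A self-contained sketch of that pattern, with illustrative worker names:

package main

import (
	"fmt"
	"sync"
	"time"
)

// worker runs periodic leader-only work until stopCh is closed.
func worker(name string, stopCh chan struct{}, wg *sync.WaitGroup) {
	defer wg.Done()
	for {
		select {
		case <-stopCh:
			fmt.Println(name, "stopped")
			return
		case <-time.After(10 * time.Millisecond):
			// periodic leader-only work would happen here
		}
	}
}

func main() {
	stopCh := make(chan struct{})
	var wg sync.WaitGroup
	wg.Add(2)
	go worker("unblockFailedEvals", stopCh, &wg)
	go worker("publishJobSummaryMetrics", stopCh, &wg)

	time.Sleep(50 * time.Millisecond) // leader for a while
	close(stopCh)                     // leadership revoked: every loop exits
	wg.Wait()
}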
@@ -519,6 +522,52 @@ func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) {
 	}
 }
 
+// publishJobSummaryMetrics publishes the job summaries as metrics
+func (s *Server) publishJobSummaryMetrics(stopCh chan struct{}) {
+	// Using a timer instead of a ticker so that we can publish after the
+	// current batch of metrics have been published
+	timer := time.NewTimer(0)
+	defer timer.Stop()
+
+	for {
+		select {
+		case <-stopCh:
+			return
+		case <-timer.C:
+			state, err := s.State().Snapshot()
+			if err != nil {
+				timer.Reset(s.config.StatsCollectionInterval)
+				s.logger.Printf("[ERR] nomad: failed to get state: %v", err)
+				continue
+			}
+			ws := memdb.NewWatchSet()
+			iter, err := state.JobSummaries(ws)
+			if err != nil {
+				timer.Reset(s.config.StatsCollectionInterval)
+				s.logger.Printf("[ERR] nomad: failed to get job summaries: %v", err)
+				continue
+			}
+
+			for {
+				raw := iter.Next()
+				if raw == nil {
+					break
+				}
+				summary := raw.(*structs.JobSummary)
+				for name, tgSummary := range summary.Summary {
+					metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "queued"}, float32(tgSummary.Queued))
+					metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "complete"}, float32(tgSummary.Complete))
+					metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "failed"}, float32(tgSummary.Failed))
+					metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "running"}, float32(tgSummary.Running))
+					metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "starting"}, float32(tgSummary.Starting))
+					metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "lost"}, float32(tgSummary.Lost))
+				}
+			}
+			timer.Reset(s.config.StatsCollectionInterval)
+		}
+	}
+}
+
 // revokeLeadership is invoked once we step down as leader.
 // This is used to cleanup any state that may be specific to a leader.
 func (s *Server) revokeLeadership() error {
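The comment in the new function explains the timer-over-ticker choice: resetting only after a pass completes means a slow state walk delays the next publish instead of letting ticks pile up. The sketch below demonstrates that pattern with the same github.com/armon/go-metrics calls and an in-memory sink standing in for Nomad's configured sinks; the keys passed join as nomad.job_summary.<job_id>.<task_group>.<state>, though the emitted name also depends on sink configuration:

package main

import (
	"fmt"
	"time"

	metrics "github.com/armon/go-metrics"
)

func main() {
	// In-memory sink so the example is self-contained. Empty service name
	// and no hostname prefix keep keys exactly as passed for this demo;
	// Nomad's real agent wires this up from its telemetry stanza.
	cfg := metrics.DefaultConfig("")
	cfg.EnableHostname = false
	sink := metrics.NewInmemSink(time.Second, time.Minute)
	metrics.NewGlobal(cfg, sink)

	interval := 100 * time.Millisecond
	timer := time.NewTimer(0) // fires immediately, like the loop above
	defer timer.Stop()

	for i := 0; i < 3; i++ {
		<-timer.C
		// Stand-in for walking the job-summary iterator; job and task
		// group names here are illustrative.
		metrics.SetGauge([]string{"nomad", "job_summary", "example", "web", "running"}, 3)
		time.Sleep(150 * time.Millisecond) // a pass slower than the interval...
		timer.Reset(interval)              // ...simply delays the next one; nothing stacks up
	}
	fmt.Println("three publish passes, no overlap")
}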