Merge pull request #3061 from hashicorp/f-add-client-metrics

Added metrics to track task/alloc start/restarts/dead events
2017-11-02 13:45:29 -07:00 · 2017-11-02 13:45:29 -07:00 · 132e145a30
parent 2506f4081f f4abb6ccd3
commit 132e145a30
3 changed files with 116 additions and 1 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,9 +2,11 @@

 IMPROVEMENTS:
 * api: Allocations now track and return modify time in addition to create time.
- * cli: Allocation create and modify times are displayed in a human readable relative format like `6 h ago`.
+ * cli: Allocation create and modify times are displayed in a human readable 
+   relative format like `6 h ago`.
 * core: Allow agents to be run in `rpc_upgrade_mode` when migrating a cluster
   to TLS rather than changing `heartbeat_grace`.
+ * client: Added metrics to track state transitions of allocations [GH-3061]

 BUG FIXES:

--- a/client/alloc_runner.go
+++ b/client/alloc_runner.go
@ -9,6 +9,7 @@ import (
 	"sync"
 	"time"

+	metrics "github.com/armon/go-metrics"
 	"github.com/boltdb/bolt"
 	"github.com/hashicorp/go-multierror"
 	"github.com/hashicorp/nomad/client/allocdir"
@ -101,6 +102,10 @@ type AllocRunner struct {
 	// can lower write volume by not re-writing these values
 	immutablePersisted bool
 	allocDirPersisted  bool
+
+	// baseLabels are used when emitting tagged metrics. All alloc runner metrics
+	// will have these tags, and optionally more.
+	baseLabels []metrics.Label
 }

 // COMPAT: Remove in 0.7.0
@ -173,6 +178,22 @@ func NewAllocRunner(logger *log.Logger, config *config.Config, stateDB *bolt.DB,

 	// TODO Should be passed a context
 	ar.ctx, ar.exitFn = context.WithCancel(context.TODO())
+
+	ar.baseLabels = []metrics.Label{
+		{
+			Name:  "job",
+			Value: alloc.Job.Name,
+		},
+		{
+			Name:  "task_group",
+			Value: alloc.TaskGroup,
+		},
+		{
+			Name:  "node_id",
+			Value: ar.config.Node.ID,
+		},
+	}
+
 	return ar
 }

@ -645,6 +666,13 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 			taskState.Failed = true
 		}
 		if event.Type == structs.TaskRestarting {
+			if !r.config.DisableTaggedMetrics {
+				metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"},
+					1, r.baseLabels)
+			}
+			if r.config.BackwardsCompatibleMetrics {
+				metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1)
+			}
 			taskState.Restarts++
 			taskState.LastRestart = time.Unix(0, event.Time)
 		}
@ -668,6 +696,13 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 		// Capture the start time if it is just starting
 		if taskState.State != structs.TaskStateRunning {
 			taskState.StartedAt = time.Now().UTC()
+			if !r.config.DisableTaggedMetrics {
+				metrics.IncrCounterWithLabels([]string{"client", "allocs", "running"},
+					1, r.baseLabels)
+			}
+			if r.config.BackwardsCompatibleMetrics {
+				metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1)
+			}
 		}
 	case structs.TaskStateDead:
 		// Capture the finished time. If it has never started there is no finish
@ -690,6 +725,24 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 			}
 		}

+		// Emitting metrics to indicate task complete and failures
+		if taskState.Failed {
+			if !r.config.DisableTaggedMetrics {
+				metrics.IncrCounterWithLabels([]string{"client", "allocs", "failed"},
+					1, r.baseLabels)
+			}
+			if r.config.BackwardsCompatibleMetrics {
+				metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1)
+			}
+		} else {
+			if !r.config.DisableTaggedMetrics {
+				metrics.IncrCounterWithLabels([]string{"client", "allocs", "complete"},
+					1, r.baseLabels)
+			}
+			if r.config.BackwardsCompatibleMetrics {
+				metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1)
+			}
+		}
 		// If the task failed, we should kill all the other tasks in the task group.
 		if taskState.Failed {
 			for _, tr := range otherTaskRunners {
@ -794,6 +847,15 @@ func (r *AllocRunner) Run() {
 		return
 	}

+	// Increment alloc runner start counter. Incr'd even when restoring existing tasks so 1 start != 1 task execution
+	if !r.config.DisableTaggedMetrics {
+		metrics.IncrCounterWithLabels([]string{"client", "allocs", "start"},
+			1, r.baseLabels)
+	}
+	if r.config.BackwardsCompatibleMetrics {
+		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1)
+	}
+
 	// Start the watcher
 	wCtx, watcherCancel := context.WithCancel(r.ctx)
 	go r.watchHealth(wCtx)
@ -922,6 +984,15 @@ func (r *AllocRunner) handleDestroy() {
 	// state as we wait for a destroy.
 	alloc := r.Alloc()

+	// Increment the destroy count for this alloc runner since this allocation is being removed from this client.
+	if !r.config.DisableTaggedMetrics {
+		metrics.IncrCounterWithLabels([]string{"client", "allocs", "destroy"},
+			1, r.baseLabels)
+	}
+	if r.config.BackwardsCompatibleMetrics {
+		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "destroy"}, 1)
+	}
+
 	// Broadcast and persist state synchronously
 	r.sendBroadcast(alloc)
 	if err := r.saveAllocRunnerState(); err != nil {
--- a/website/source/docs/agent/telemetry.html.md
+++ b/website/source/docs/agent/telemetry.html.md
@ -419,6 +419,48 @@ Starting in version 0.7, Nomad will emit tagged metrics, in the below format:
    <td>Gauge</td>
    <td>node_id, datacenter, disk</td>
  </tr>
+  <tr>
+    <td>`nomad.client.allocs.start`</td>
+    <td>Number of allocations starting</td>
+    <td>Integer</td>
+    <td>Counter</td>
+    <td>node_id, job, task_group</td>
+  </tr>
+  <tr>
+    <td>`nomad.client.allocs.running`</td>
+    <td>Number of allocations starting to run</td>
+    <td>Integer</td>
+    <td>Counter</td>
+    <td>node_id, job, task_group</td>
+  </tr>
+  <tr>
+    <td>`nomad.client.allocs.failed`</td>
+    <td>Number of allocations failing</td>
+    <td>Integer</td>
+    <td>Counter</td>
+    <td>node_id, job, task_group</td>
+  </tr>
+  <tr>
+    <td>`nomad.client.allocs.restart`</td>
+    <td>Number of allocations restarting</td>
+    <td>Integer</td>
+    <td>Counter</td>
+    <td>node_id, job, task_group</td>
+  </tr>
+  <tr>
+    <td>`nomad.client.allocs.complete`</td>
+    <td>Number of allocations completing</td>
+    <td>Integer</td>
+    <td>Counter</td>
+    <td>node_id, job, task_group</td>
+  </tr>
+  <tr>
+    <td>`nomad.client.allocs.destroy`</td>
+    <td>Number of allocations being destroyed</td>
+    <td>Integer</td>
+    <td>Counter</td>
+    <td>node_id, job, task_group</td>
+  </tr>
 </table>

 ## Host Metrics (deprecated post Nomad 0.7)