tr: implement stats collection hook
Tested except for the net/rpc-specific error case, which may need changing in the gRPC world.
parent 86bd329539
commit e6e2930a00
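For review context: the change wires periodic driver stats collection into the v2 task runner. A new statsHook polls the driver handle (via the new DriverStats interface) on StatsCollectionInterval and pushes each sample to a StatsUpdater (implemented by TaskRunner), which the alloc runner aggregates in LatestAllocStats. Below is a minimal, standalone sketch of that poll/update lifecycle; the types here are illustrative stand-ins, not the actual Nomad code.

package main

import (
	"fmt"
	"time"
)

// fakeStats stands in for a driver handle implementing Stats().
type fakeStats struct{}

func (fakeStats) Stats() (int64, error) { return time.Now().UnixNano(), nil }

func main() {
	stopCh := make(chan struct{})
	go func() { // collectResourceUsageStats, reduced to its control flow
		next := time.NewTimer(0) // collect immediately, then on an interval
		defer next.Stop()
		for {
			select {
			case <-next.C:
				next.Reset(100 * time.Millisecond)
				ts, _ := fakeStats{}.Stats()
				fmt.Println("update:", ts) // stands in for updater.UpdateStats(ru)
			case <-stopCh:
				return
			}
		}
	}()
	time.Sleep(350 * time.Millisecond)
	close(stopCh) // Exited closes stopCh to stop collection
}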
@@ -416,6 +416,7 @@ func (ar *allocRunner) Listener() *cstructs.AllocListener {
 // exit (thus closing WaitCh).
 func (ar *allocRunner) Destroy() {
 	// Stop tasks
+	ar.tasksLock.RLock()
 	for name, tr := range ar.tasks {
 		err := tr.Kill(context.TODO(), structs.NewTaskEvent(structs.TaskKilled))
 		if err != nil {

@@ -426,6 +427,7 @@ func (ar *allocRunner) Destroy() {
 			}
 		}
 	}
+	ar.tasksLock.RUnlock()

 	// Wait for tasks to exit and postrun hooks to finish
 	<-ar.waitCh

@@ -474,15 +476,38 @@ func (ar *allocRunner) IsMigrating() bool {
 	return ar.prevAllocWatcher.IsMigrating()
 }

-// StatsReporter needs implementing
-//XXX
 func (ar *allocRunner) StatsReporter() allocrunner.AllocStatsReporter {
-	return noopStatsReporter{}
+	return ar
 }

-//FIXME implement
-type noopStatsReporter struct{}
-
-func (noopStatsReporter) LatestAllocStats(taskFilter string) (*cstructs.AllocResourceUsage, error) {
-	return nil, fmt.Errorf("not implemented")
+// LatestAllocStats returns the latest stats for an allocation. If taskFilter
+// is set, only stats for that task -- if it exists -- are returned.
+func (ar *allocRunner) LatestAllocStats(taskFilter string) (*cstructs.AllocResourceUsage, error) {
+	ar.tasksLock.RLock()
+	defer ar.tasksLock.RUnlock()
+
+	astat := &cstructs.AllocResourceUsage{
+		Tasks: make(map[string]*cstructs.TaskResourceUsage, len(ar.tasks)),
+		ResourceUsage: &cstructs.ResourceUsage{
+			MemoryStats: &cstructs.MemoryStats{},
+			CpuStats:    &cstructs.CpuStats{},
+		},
+	}
+
+	for name, tr := range ar.tasks {
+		if taskFilter != "" && taskFilter != name {
+			// Getting stats for a particular task and it's not this one!
+			continue
+		}
+
+		if usage := tr.LatestResourceUsage(); usage != nil {
+			astat.Tasks[name] = usage
+			astat.ResourceUsage.Add(usage.ResourceUsage)
+			if usage.Timestamp > astat.Timestamp {
+				astat.Timestamp = usage.Timestamp
+			}
+		}
+	}
+
+	return astat, nil
 }
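The LatestAllocStats body above sums per-task usage into one alloc-level figure via ResourceUsage.Add and keeps the newest sample's timestamp. A toy illustration of those aggregation semantics, with plain ints standing in for the real cstructs types:

package main

import "fmt"

// toyUsage stands in for cstructs.TaskResourceUsage: one RSS number
// plus the collection timestamp.
type toyUsage struct {
	rss       uint64
	timestamp int64
}

func main() {
	tasks := map[string]toyUsage{
		"web": {rss: 100, timestamp: 10},
		"log": {rss: 40, timestamp: 12},
	}

	// Mirror LatestAllocStats: sum usage across tasks, keep the newest timestamp.
	var total toyUsage
	for _, u := range tasks {
		total.rss += u.rss // ResourceUsage.Add(...)
		if u.timestamp > total.timestamp {
			total.timestamp = u.timestamp
		}
	}
	fmt.Printf("alloc rss=%d ts=%d\n", total.rss, total.timestamp) // rss=140 ts=12
}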
@@ -75,6 +75,11 @@ type TaskPrestartHook interface {
 	Prestart(context.Context, *TaskPrestartRequest, *TaskPrestartResponse) error
 }

+// DriverStats is the interface implemented by DriverHandles to return task stats.
+type DriverStats interface {
+	Stats() (*cstructs.TaskResourceUsage, error)
+}
+
 type TaskPoststartRequest struct {
 	// Exec hook (may be nil)
 	DriverExec driver.ScriptExecutor

@@ -84,6 +89,9 @@ type TaskPoststartRequest struct {

 	// TaskEnv is the task's environment
 	TaskEnv *env.TaskEnv
+
+	// Stats collector
+	DriverStats DriverStats
 }

 type TaskPoststartResponse struct{}
@@ -122,7 +122,7 @@ func (h *serviceHook) Update(ctx context.Context, req *interfaces.TaskUpdateRequest, _ *interfaces.TaskUpdateResponse) error {
 	return h.consul.UpdateTask(oldTaskServices, newTaskServices)
 }

-func (h *serviceHook) Exited(ctx context.Context, req *interfaces.TaskExitedRequest, _ *interfaces.TaskExitedResponse) error {
+func (h *serviceHook) Exited(context.Context, *interfaces.TaskExitedRequest, *interfaces.TaskExitedResponse) error {
 	h.mu.Lock()
 	defer h.mu.Unlock()
client/allocrunnerv2/taskrunner/stats_hook.go (new file, 117 lines)

@@ -0,0 +1,117 @@
+package taskrunner
+
+import (
+	"context"
+	"strings"
+	"sync"
+	"time"
+
+	hclog "github.com/hashicorp/go-hclog"
+	"github.com/hashicorp/nomad/client/allocrunnerv2/interfaces"
+	"github.com/hashicorp/nomad/client/driver"
+	cstructs "github.com/hashicorp/nomad/client/structs"
+)
+
+type StatsUpdater interface {
+	UpdateStats(*cstructs.TaskResourceUsage)
+}
+
+type statsHook struct {
+	updater  StatsUpdater
+	interval time.Duration
+
+	// stopCh is closed by Exited
+	stopCh chan struct{}
+
+	mu sync.Mutex
+
+	logger hclog.Logger
+}
+
+func newStatsHook(su StatsUpdater, interval time.Duration, logger hclog.Logger) *statsHook {
+	h := &statsHook{
+		updater:  su,
+		interval: interval,
+	}
+	h.logger = logger.Named(h.Name())
+	return h
+}
+
+func (*statsHook) Name() string {
+	return "stats_hook"
+}
+
+func (h *statsHook) Poststart(ctx context.Context, req *interfaces.TaskPoststartRequest, _ *interfaces.TaskPoststartResponse) error {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	// This shouldn't happen, but better safe than risk leaking a goroutine
+	if h.stopCh != nil {
+		h.logger.Debug("poststart called twice without exiting between")
+		close(h.stopCh)
+	}
+
+	h.stopCh = make(chan struct{})
+	go h.collectResourceUsageStats(h.logger, req.DriverStats, h.stopCh)
+
+	return nil
+}
+
+func (h *statsHook) Exited(context.Context, *interfaces.TaskExitedRequest, *interfaces.TaskExitedResponse) error {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	if h.stopCh == nil {
+		// No stats running
+		return nil
+	}
+
+	// Close chan to stop stats collection
+	close(h.stopCh)
+
+	// Clear chan so we don't double close for any reason
+	h.stopCh = nil
+
+	return nil
+}
+
+// collectResourceUsageStats starts collecting resource usage stats of a Task.
+// Collection ends when the passed channel is closed
+func (h *statsHook) collectResourceUsageStats(logger hclog.Logger, handle interfaces.DriverStats, stopCh <-chan struct{}) {
+	// start collecting the stats right away and then start collecting every
+	// collection interval
+	next := time.NewTimer(0)
+	defer next.Stop()
+	for {
+		select {
+		case <-next.C:
+			// Reset the timer
+			next.Reset(h.interval)
+
+			// Collect stats from driver
+			ru, err := handle.Stats()
+			if err != nil {
+				// Check if the driver doesn't implement stats
+				if err.Error() == driver.DriverStatsNotImplemented.Error() {
+					h.logger.Debug("driver does not support stats")
+					return
+				}
+
+				//XXX This is a net/rpc specific error
+				// We do not log when the plugin is shutdown as this is simply a
+				// race between the stopCollection channel being closed and calling
+				// Stats on the handle.
+				if !strings.Contains(err.Error(), "connection is shut down") {
+					h.logger.Debug("error fetching stats of task", "error", err)
+				}
+
+				continue
+			}
+
+			// Update stats on TaskRunner and emit them
+			h.updater.UpdateStats(ru)
+
+		case <-stopCh:
+			return
+		}
+	}
+}
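The //XXX in collectResourceUsageStats marks the "connection is shut down" string match as net/rpc-specific, echoing the caveat in the commit message. If plugin calls move to gRPC, the analogous check would likely be a status-code test rather than a string match. A hypothetical sketch, assuming the handle starts returning gRPC status errors (this helper is not part of this commit):

package taskrunner

import (
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// isPluginShutdownErr is a hypothetical helper: over gRPC a closed plugin
// connection typically surfaces as codes.Unavailable rather than net/rpc's
// "connection is shut down" string.
func isPluginShutdownErr(err error) bool {
	s, ok := status.FromError(err)
	return ok && s.Code() == codes.Unavailable
}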
client/allocrunnerv2/taskrunner/stats_hook_test.go (new file, 180 lines)

@@ -0,0 +1,180 @@
+package taskrunner
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/hashicorp/nomad/client/allocrunnerv2/interfaces"
+	"github.com/hashicorp/nomad/client/driver"
+	cstructs "github.com/hashicorp/nomad/client/structs"
+	"github.com/hashicorp/nomad/helper/testlog"
+	"github.com/stretchr/testify/require"
+)
+
+// Statically assert the stats hook implements the expected interfaces
+var _ interfaces.TaskPoststartHook = (*statsHook)(nil)
+var _ interfaces.TaskExitedHook = (*statsHook)(nil)
+
+type mockStatsUpdater struct {
+	// Ch is sent task resource usage updates if not nil
+	Ch chan *cstructs.TaskResourceUsage
+}
+
+// newMockStatsUpdater returns a mockStatsUpdater that blocks on Ch for every
+// call to UpdateStats
+func newMockStatsUpdater() *mockStatsUpdater {
+	return &mockStatsUpdater{
+		Ch: make(chan *cstructs.TaskResourceUsage),
+	}
+}
+
+func (m *mockStatsUpdater) UpdateStats(ru *cstructs.TaskResourceUsage) {
+	if m.Ch != nil {
+		m.Ch <- ru
+	}
+}
+
+type mockDriverStats struct {
+	// err is returned by Stats if it is non-nil
+	err error
+}
+
+func (m *mockDriverStats) Stats() (*cstructs.TaskResourceUsage, error) {
+	if m.err != nil {
+		return nil, m.err
+	}
+	ru := &cstructs.TaskResourceUsage{
+		ResourceUsage: &cstructs.ResourceUsage{
+			MemoryStats: &cstructs.MemoryStats{
+				RSS:      1,
+				Measured: []string{"RSS"},
+			},
+			CpuStats: &cstructs.CpuStats{
+				SystemMode: 1,
+				Measured:   []string{"System Mode"},
+			},
+		},
+		Timestamp: time.Now().UnixNano(),
+		Pids:      map[string]*cstructs.ResourceUsage{},
+	}
+	ru.Pids["task"] = ru.ResourceUsage
+	return ru, nil
+}
+
+// TestTaskRunner_StatsHook_PoststartExited asserts the stats hook starts and
+// stops.
+func TestTaskRunner_StatsHook_PoststartExited(t *testing.T) {
+	t.Parallel()
+
+	require := require.New(t)
+	logger := testlog.HCLogger(t)
+	su := newMockStatsUpdater()
+	ds := new(mockDriverStats)
+
+	poststartReq := &interfaces.TaskPoststartRequest{DriverStats: ds}
+
+	// Create hook
+	h := newStatsHook(su, time.Minute, logger)
+
+	// Always call Exited to cleanup goroutines
+	defer h.Exited(context.Background(), nil, nil)
+
+	// Run poststart
+	require.NoError(h.Poststart(context.Background(), poststartReq, nil))
+
+	// An initial stats collection should run and call the updater
+	select {
+	case ru := <-su.Ch:
+		require.Equal(uint64(1), ru.ResourceUsage.MemoryStats.RSS)
+	case <-time.After(10 * time.Second):
+		t.Fatalf("timeout waiting for initial stats collection")
+	}
+
+	require.NoError(h.Exited(context.Background(), nil, nil))
+}
+
+// TestTaskRunner_StatsHook_Periodic asserts the stats hook collects stats on
+// an interval.
+func TestTaskRunner_StatsHook_Periodic(t *testing.T) {
+	t.Parallel()
+
+	require := require.New(t)
+	logger := testlog.HCLogger(t)
+	su := newMockStatsUpdater()
+
+	ds := new(mockDriverStats)
+	poststartReq := &interfaces.TaskPoststartRequest{DriverStats: ds}
+
+	// interval needs to be high enough that even on a slow/busy VM
+	// Exited() can complete within the interval.
+	const interval = 500 * time.Millisecond
+
+	h := newStatsHook(su, interval, logger)
+	defer h.Exited(context.Background(), nil, nil)
+
+	// Run poststart
+	require.NoError(h.Poststart(context.Background(), poststartReq, nil))
+
+	// An initial stats collection should run and call the updater
+	var firstrun int64
+	select {
+	case ru := <-su.Ch:
+		if ru.Timestamp <= 0 {
+			t.Fatalf("expected nonzero timestamp (%v)", ru.Timestamp)
+		}
+		firstrun = ru.Timestamp
+	case <-time.After(10 * time.Second):
+		t.Fatalf("timeout waiting for initial stats collection")
+	}
+
+	// Should get another update in ~500ms (see interval above)
+	select {
+	case ru := <-su.Ch:
+		if ru.Timestamp <= firstrun {
+			t.Fatalf("expected timestamp (%v) after first run (%v)", ru.Timestamp, firstrun)
+		}
+	case <-time.After(10 * time.Second):
+		t.Fatalf("timeout waiting for second stats collection")
+	}
+
+	// Exiting should prevent further updates
+	require.NoError(h.Exited(context.Background(), nil, nil))
+
+	// Should *not* get another update in ~500ms (see interval above)
+	select {
+	case ru := <-su.Ch:
+		t.Fatalf("unexpected update after exit (firstrun=%v; update=%v)", firstrun, ru.Timestamp)
+	case <-time.After(2 * interval):
+		// Ok! No update after exit as expected.
+	}
+}
+
+// TestTaskRunner_StatsHook_NotImplemented asserts the stats hook stops if the
+// driver returns NotImplemented.
+func TestTaskRunner_StatsHook_NotImplemented(t *testing.T) {
+	t.Parallel()
+
+	require := require.New(t)
+	logger := testlog.HCLogger(t)
+	su := newMockStatsUpdater()
+	ds := &mockDriverStats{
+		err: driver.DriverStatsNotImplemented,
+	}
+
+	poststartReq := &interfaces.TaskPoststartRequest{DriverStats: ds}
+
+	h := newStatsHook(su, 1, logger)
+	defer h.Exited(context.Background(), nil, nil)
+
+	// Run poststart
+	require.NoError(h.Poststart(context.Background(), poststartReq, nil))
+
+	// An initial stats collection should run and *not* call the updater
+	select {
+	case ru := <-su.Ch:
+		t.Fatalf("unexpected resource update (timestamp=%v)", ru.Timestamp)
+	case <-time.After(500 * time.Millisecond):
+		// Ok! No update received because error was returned
+	}
+}
@@ -17,6 +17,7 @@ import (
 	"github.com/hashicorp/nomad/client/driver"
 	"github.com/hashicorp/nomad/client/driver/env"
 	cstate "github.com/hashicorp/nomad/client/state"
+	cstructs "github.com/hashicorp/nomad/client/structs"
 	"github.com/hashicorp/nomad/client/vaultclient"
 	"github.com/hashicorp/nomad/nomad/structs"
 )

@@ -129,6 +130,11 @@ type TaskRunner struct {
 	// baseLabels are used when emitting tagged metrics. All task runner metrics
 	// will have these tags, and optionally more.
 	baseLabels []metrics.Label
+
+	// resourceUsage is written via UpdateStats and read via
+	// LatestResourceUsage. May be nil at all times.
+	resourceUsage     *cstructs.TaskResourceUsage
+	resourceUsageLock sync.Mutex
 }

 type Config struct {

@@ -659,6 +665,98 @@ func (tr *TaskRunner) triggerUpdateHooks() {
 	}
 }
+
+// LatestResourceUsage returns the last resource utilization datapoint
+// collected. May return nil if the task is not running or no resource
+// utilization has been collected yet.
+func (tr *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
+	tr.resourceUsageLock.Lock()
+	ru := tr.resourceUsage
+	tr.resourceUsageLock.Unlock()
+	return ru
+}
+
+// UpdateStats updates and emits the latest stats from the driver.
+func (tr *TaskRunner) UpdateStats(ru *cstructs.TaskResourceUsage) {
+	tr.resourceUsageLock.Lock()
+	tr.resourceUsage = ru
+	tr.resourceUsageLock.Unlock()
+	if ru != nil {
+		tr.emitStats(ru)
+	}
+}
+
+//TODO Remove Backwardscompat or use tr.Alloc()?
+func (tr *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) {
+	if !tr.clientConfig.DisableTaggedMetrics {
+		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
+			float32(ru.ResourceUsage.MemoryStats.RSS), tr.baseLabels)
+		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
+			float32(ru.ResourceUsage.MemoryStats.RSS), tr.baseLabels)
+		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "cache"},
+			float32(ru.ResourceUsage.MemoryStats.Cache), tr.baseLabels)
+		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "swap"},
+			float32(ru.ResourceUsage.MemoryStats.Swap), tr.baseLabels)
+		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "max_usage"},
+			float32(ru.ResourceUsage.MemoryStats.MaxUsage), tr.baseLabels)
+		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_usage"},
+			float32(ru.ResourceUsage.MemoryStats.KernelUsage), tr.baseLabels)
+		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"},
+			float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), tr.baseLabels)
+	}
+
+	if tr.clientConfig.BackwardsCompatibleMetrics {
+		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
+		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
+		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
+		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
+		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
+		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
+	}
+}
+
+//TODO Remove Backwardscompat or use tr.Alloc()?
+func (tr *TaskRunner) setGaugeForCPU(ru *cstructs.TaskResourceUsage) {
+	if !tr.clientConfig.DisableTaggedMetrics {
+		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_percent"},
+			float32(ru.ResourceUsage.CpuStats.Percent), tr.baseLabels)
+		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "system"},
+			float32(ru.ResourceUsage.CpuStats.SystemMode), tr.baseLabels)
+		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "user"},
+			float32(ru.ResourceUsage.CpuStats.UserMode), tr.baseLabels)
+		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_time"},
+			float32(ru.ResourceUsage.CpuStats.ThrottledTime), tr.baseLabels)
+		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_periods"},
+			float32(ru.ResourceUsage.CpuStats.ThrottledPeriods), tr.baseLabels)
+		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_ticks"},
+			float32(ru.ResourceUsage.CpuStats.TotalTicks), tr.baseLabels)
+	}
+
+	if tr.clientConfig.BackwardsCompatibleMetrics {
+		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
+		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
+		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
+		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
+		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
+		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
+	}
+}
+
+// emitStats emits resource usage stats of tasks to remote metrics collector
+// sinks
+func (tr *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
+	if !tr.clientConfig.PublishAllocationMetrics {
+		return
+	}
+
+	if ru.ResourceUsage.MemoryStats != nil {
+		tr.setGaugeForMemory(ru)
+	}
+
+	if ru.ResourceUsage.CpuStats != nil {
+		tr.setGaugeForCPU(ru)
+	}
+}
+
 // appendTaskEvent updates the task status by appending the new event.
 func appendTaskEvent(state *structs.TaskState, event *structs.TaskEvent) {
 	const capacity = 10
@@ -23,6 +23,7 @@ func (tr *TaskRunner) initHooks() {
 		newTaskDirHook(tr, hookLogger),
 		newArtifactHook(tr, hookLogger),
 		newShutdownDelayHook(task.ShutdownDelay, hookLogger),
+		newStatsHook(tr, tr.clientConfig.StatsCollectionInterval, hookLogger),
 	}

 	// If Vault is enabled, add the hook

@@ -186,6 +187,7 @@ func (tr *TaskRunner) poststart() error {
 	req := interfaces.TaskPoststartRequest{
 		DriverExec:    handle,
 		DriverNetwork: net,
+		DriverStats:   handle,
 		TaskEnv:       tr.envBuilder.Build(),
 	}
 	var resp interfaces.TaskPoststartResponse
@@ -99,19 +99,21 @@ type ClientStatsReporter interface {
 	LatestHostStats() *stats.HostStats
 }

+// AllocRunner is the interface implemented by the core alloc runner.
+//TODO Create via factory to allow testing Client with mock AllocRunners.
 type AllocRunner interface {
-	StatsReporter() allocrunner.AllocStatsReporter
+	Alloc() *structs.Allocation
 	Destroy()
 	GetAllocDir() *allocdir.AllocDir
 	IsDestroyed() bool
-	IsWaiting() bool
 	IsMigrating() bool
+	IsWaiting() bool
 	Listener() *cstructs.AllocListener
-	WaitCh() <-chan struct{}
-	Update(*structs.Allocation)
-	Alloc() *structs.Allocation
 	Restore() error
 	Run()
+	StatsReporter() allocrunner.AllocStatsReporter
+	Update(*structs.Allocation)
+	WaitCh() <-chan struct{}
 }

 // Client is used to implement the client interaction with Nomad. Clients