From e5fb6fe6876904478eea320875ae3c729f47bc32 Mon Sep 17 00:00:00 2001 From: hc-github-team-nomad-core <82989552+hc-github-team-nomad-core@users.noreply.github.com> Date: Wed, 19 Jul 2023 13:31:14 -0500 Subject: [PATCH] backport of commit 615e76ef3c23497f768ebd175f0c624d32aeece8 (#17993) This pull request was automerged via backport-assistant --- .changelog/17628.txt | 3 + client/fingerprint/cpu.go | 15 +- client/fingerprint/env_aws.go | 2 + client/lib/resources/pid.go | 2 +- client/stats/cpu.go | 137 +++++++++++------ {helper => client}/stats/cpu_darwin_test.go | 4 +- client/stats/cpu_test.go | 17 +-- drivers/docker/util/stats_posix.go | 2 +- drivers/docker/util/stats_windows.go | 2 +- drivers/shared/executor/executor.go | 9 +- drivers/shared/executor/executor_basic.go | 4 +- drivers/shared/executor/executor_linux.go | 10 +- .../shared/executor/executor_linux_test.go | 18 +-- drivers/shared/executor/executor_plugin.go | 9 +- drivers/shared/executor/executor_test.go | 20 +-- drivers/shared/executor/pid_collector.go | 2 +- drivers/shared/executor/plugins.go | 11 +- drivers/shared/executor/utils.go | 9 +- drivers/shared/executor/z_executor_cmd.go | 1 + helper/stats/cpu.go | 142 +++++++----------- helper/stats/cpu_test.go | 24 +++ 21 files changed, 238 insertions(+), 205 deletions(-) create mode 100644 .changelog/17628.txt rename {helper => client}/stats/cpu_darwin_test.go (88%) create mode 100644 helper/stats/cpu_test.go diff --git a/.changelog/17628.txt b/.changelog/17628.txt new file mode 100644 index 000000000..fcb89bcee --- /dev/null +++ b/.changelog/17628.txt @@ -0,0 +1,3 @@ +```release-note:improvement +cpustats: Use config "cpu_total_compute" (if set) for all CPU statistics +``` diff --git a/client/fingerprint/cpu.go b/client/fingerprint/cpu.go index c6c69e3b1..ca3a9099c 100644 --- a/client/fingerprint/cpu.go +++ b/client/fingerprint/cpu.go @@ -10,7 +10,8 @@ import ( "github.com/hashicorp/nomad/lib/cpuset" "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/helper/stats" + "github.com/hashicorp/nomad/client/stats" + shelpers "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/nomad/structs" ) @@ -42,7 +43,7 @@ func NewCPUFingerprint(logger hclog.Logger) Fingerprint { } func (f *CPUFingerprint) Fingerprint(request *FingerprintRequest, response *FingerprintResponse) error { - f.initialize() + f.initialize(request) f.setModelName(response) @@ -61,8 +62,8 @@ func (f *CPUFingerprint) Fingerprint(request *FingerprintRequest, response *Fing return nil } -func (f *CPUFingerprint) initialize() { - if err := stats.Init(); err != nil { +func (f *CPUFingerprint) initialize(request *FingerprintRequest) { + if err := stats.Init(uint64(request.Config.CpuCompute)); err != nil { f.logger.Warn("failed initializing stats collector", "error", err) } } @@ -134,10 +135,8 @@ func (f *CPUFingerprint) setReservableCores(request *FingerprintRequest, respons func (f *CPUFingerprint) setTotalCompute(request *FingerprintRequest, response *FingerprintResponse) { var ticks uint64 switch { - case request.Config.CpuCompute > 0: - ticks = uint64(request.Config.CpuCompute) - case stats.TotalTicksAvailable() > 0: - ticks = stats.TotalTicksAvailable() + case shelpers.CpuTotalTicks() > 0: + ticks = shelpers.CpuTotalTicks() default: ticks = defaultCPUTicks } diff --git a/client/fingerprint/env_aws.go b/client/fingerprint/env_aws.go index b7e1bf706..9b75eff7b 100644 --- a/client/fingerprint/env_aws.go +++ b/client/fingerprint/env_aws.go @@ -19,6 +19,7 @@ import ( log "github.com/hashicorp/go-hclog" cleanhttp "github.com/hashicorp/go-cleanhttp" + "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/nomad/structs" ) @@ -185,6 +186,7 @@ func (f *EnvAWSFingerprint) Fingerprint(request *FingerprintRequest, response *F if ticks := specs.Ticks(); request.Config.CpuCompute <= 0 { response.AddAttribute("cpu.totalcompute", fmt.Sprintf("%d", ticks)) f.logger.Debug("setting ec2 cpu", "ticks", ticks) + stats.SetCpuTotalTicks(uint64(ticks)) resources = new(structs.Resources) resources.CPU = ticks if nodeResources == nil { diff --git a/client/lib/resources/pid.go b/client/lib/resources/pid.go index 8b99d1e8f..8365f1671 100644 --- a/client/lib/resources/pid.go +++ b/client/lib/resources/pid.go @@ -4,7 +4,7 @@ package resources import ( - "github.com/hashicorp/nomad/client/stats" + "github.com/hashicorp/nomad/helper/stats" ) // PIDs holds all of a task's pids and their cpu percentage calculators diff --git a/client/stats/cpu.go b/client/stats/cpu.go index 0a576d036..b193cb6f5 100644 --- a/client/stats/cpu.go +++ b/client/stats/cpu.go @@ -4,64 +4,115 @@ package stats import ( - "runtime" + "context" + "errors" + "fmt" + "sync" "time" - shelpers "github.com/hashicorp/nomad/helper/stats" + "github.com/hashicorp/nomad/helper/stats" "github.com/shirou/gopsutil/v3/cpu" + "github.com/shoenig/go-m1cpu" ) -// CpuStats calculates cpu usage percentage -type CpuStats struct { - prevCpuTime float64 - prevTime time.Time +const ( + // cpuInfoTimeout is the timeout used when gathering CPU info. This is used + // to override the default timeout in gopsutil which has a tendency to + // timeout on Windows. + cpuInfoTimeout = 60 * time.Second +) - totalCpus int -} +var ( + cpuPowerCoreCount int + cpuPowerCoreMHz uint64 + cpuEfficiencyCoreCount int + cpuEfficiencyCoreMHz uint64 + cpuModelName string +) -// NewCpuStats returns a cpu stats calculator -func NewCpuStats() *CpuStats { - numCpus := runtime.NumCPU() - cpuStats := &CpuStats{ - totalCpus: numCpus, - } - return cpuStats -} +var ( + detectedCpuTotalTicks uint64 + initErr error + onceLer sync.Once +) -// Percent calculates the cpu usage percentage based on the current cpu usage -// and the previous cpu usage where usage is given as time in nanoseconds spend -// in the cpu -func (c *CpuStats) Percent(cpuTime float64) float64 { - now := time.Now() +func Init(configCpuTotalCompute uint64) error { + onceLer.Do(func() { + switch { + case m1cpu.IsAppleSilicon(): + cpuModelName = m1cpu.ModelName() + cpuPowerCoreCount = m1cpu.PCoreCount() + cpuPowerCoreMHz = m1cpu.PCoreHz() / 1_000_000 + cpuEfficiencyCoreCount = m1cpu.ECoreCount() + cpuEfficiencyCoreMHz = m1cpu.ECoreHz() / 1_000_000 + bigTicks := uint64(cpuPowerCoreCount) * cpuPowerCoreMHz + littleTicks := uint64(cpuEfficiencyCoreCount) * cpuEfficiencyCoreMHz + detectedCpuTotalTicks = bigTicks + littleTicks + default: + // for now, all other cpu types assume only power cores + // todo: this is already not true for Intel 13th generation - if c.prevCpuTime == 0.0 { - // invoked first time - c.prevCpuTime = cpuTime - c.prevTime = now - return 0.0 + var err error + if cpuPowerCoreCount, err = cpu.Counts(true); err != nil { + initErr = errors.Join(initErr, fmt.Errorf("failed to detect number of CPU cores: %w", err)) + } + + ctx, cancel := context.WithTimeout(context.Background(), cpuInfoTimeout) + defer cancel() + + var cpuInfoStats []cpu.InfoStat + if cpuInfoStats, err = cpu.InfoWithContext(ctx); err != nil { + initErr = errors.Join(initErr, fmt.Errorf("Unable to obtain CPU information: %w", err)) + } + + for _, infoStat := range cpuInfoStats { + cpuModelName = infoStat.ModelName + if uint64(infoStat.Mhz) > cpuPowerCoreMHz { + cpuPowerCoreMHz = uint64(infoStat.Mhz) + } + } + + // compute ticks using only power core, until we add support for + // detecting little cores on non-apple platforms + detectedCpuTotalTicks = uint64(cpuPowerCoreCount) * cpuPowerCoreMHz + + initErr = err + } + + stats.SetCpuTotalTicks(detectedCpuTotalTicks) + }) + + // override the computed value with the config value if it is set + if configCpuTotalCompute > 0 { + stats.SetCpuTotalTicks(configCpuTotalCompute) } - timeDelta := now.Sub(c.prevTime).Nanoseconds() - ret := c.calculatePercent(c.prevCpuTime, cpuTime, timeDelta) - c.prevCpuTime = cpuTime - c.prevTime = now - return ret + return initErr } -// TicksConsumed calculates the total ticks consumes by the process across all -// cpu cores -func (c *CpuStats) TicksConsumed(percent float64) float64 { - return (percent / 100) * float64(shelpers.TotalTicksAvailable()) / float64(c.totalCpus) +// CPUNumCores returns the number of CPU cores available. +// +// This is represented with two values - (Power (P), Efficiency (E)) so we can +// correctly compute total compute for processors with asymetric cores such as +// Apple Silicon. +// +// For platforms with symetric cores (or where we do not correcly detect asymetric +// cores), all cores are presented as P cores. +func CPUNumCores() (int, int) { + return cpuPowerCoreCount, cpuEfficiencyCoreCount } -func (c *CpuStats) calculatePercent(t1, t2 float64, timeDelta int64) float64 { - vDelta := t2 - t1 - if timeDelta <= 0 || vDelta <= 0.0 { - return 0.0 - } +// CPUMHzPerCore returns the MHz per CPU (P, E) core type. +// +// As with CPUNumCores, asymetric core detection currently only works with +// Apple Silicon CPUs. +func CPUMHzPerCore() (uint64, uint64) { + return cpuPowerCoreMHz, cpuEfficiencyCoreMHz +} - overall_percent := (vDelta / float64(timeDelta)) * 100.0 - return overall_percent +// CPUModelName returns the model name of the CPU. +func CPUModelName() string { + return cpuModelName } func (h *HostStatsCollector) collectCPUStats() (cpus []*CPUStats, totalTicks float64, err error) { @@ -79,7 +130,7 @@ func (h *HostStatsCollector) collectCPUStats() (cpus []*CPUStats, totalTicks flo h.statsCalculator[cpuStat.CPU] = percentCalculator } idle, user, system, total := percentCalculator.Calculate(cpuStat) - ticks := (total / 100.0) * (float64(shelpers.TotalTicksAvailable()) / float64(len(cpuStats))) + ticks := (total / 100.0) * (float64(stats.CpuTotalTicks()) / float64(len(cpuStats))) cs[idx] = &CPUStats{ CPU: cpuStat.CPU, User: user, diff --git a/helper/stats/cpu_darwin_test.go b/client/stats/cpu_darwin_test.go similarity index 88% rename from helper/stats/cpu_darwin_test.go rename to client/stats/cpu_darwin_test.go index dd88f9429..f5b21b4f9 100644 --- a/helper/stats/cpu_darwin_test.go +++ b/client/stats/cpu_darwin_test.go @@ -32,7 +32,7 @@ func TestCPU_CPUModelName(t *testing.T) { must.NotEq(t, "", name) } -func TestCPU_CPUTotalTicksAvailable(t *testing.T) { - ticks := TotalTicksAvailable() +func TestCPU_CPUCpuTotalTicks(t *testing.T) { + ticks := CpuTotalTicks() must.Positive(t, ticks) } diff --git a/client/stats/cpu_test.go b/client/stats/cpu_test.go index 330085e65..8bbea2be7 100644 --- a/client/stats/cpu_test.go +++ b/client/stats/cpu_test.go @@ -7,32 +7,17 @@ import ( "math" "os" "testing" - "time" "github.com/hashicorp/nomad/ci" - shelpers "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/helper/testlog" "github.com/stretchr/testify/assert" ) -func TestCpuStatsPercent(t *testing.T) { - ci.Parallel(t) - - cs := NewCpuStats() - cs.Percent(79.7) - time.Sleep(1 * time.Second) - percent := cs.Percent(80.69) - expectedPercent := 98.00 - if percent < expectedPercent && percent > (expectedPercent+1.00) { - t.Fatalf("expected: %v, actual: %v", expectedPercent, percent) - } -} - func TestHostStats_CPU(t *testing.T) { ci.Parallel(t) assert := assert.New(t) - assert.Nil(shelpers.Init()) + assert.Nil(Init(0)) logger := testlog.HCLogger(t) cwd, err := os.Getwd() diff --git a/drivers/docker/util/stats_posix.go b/drivers/docker/util/stats_posix.go index 81b05c9f9..f3951fc91 100644 --- a/drivers/docker/util/stats_posix.go +++ b/drivers/docker/util/stats_posix.go @@ -56,7 +56,7 @@ func DockerStatsToTaskResourceUsage(s *docker.Stats) *cstructs.TaskResourceUsage cs.UserMode = CalculateCPUPercent( s.CPUStats.CPUUsage.UsageInUsermode, s.PreCPUStats.CPUUsage.UsageInUsermode, s.CPUStats.CPUUsage.TotalUsage, s.PreCPUStats.CPUUsage.TotalUsage, runtime.NumCPU()) - cs.TotalTicks = (cs.Percent / 100) * float64(stats.TotalTicksAvailable()) / float64(runtime.NumCPU()) + cs.TotalTicks = (cs.Percent / 100) * float64(stats.CpuTotalTicks()) / float64(runtime.NumCPU()) return &cstructs.TaskResourceUsage{ ResourceUsage: &cstructs.ResourceUsage{ diff --git a/drivers/docker/util/stats_windows.go b/drivers/docker/util/stats_windows.go index 999e0a8dc..ae4d2dbc0 100644 --- a/drivers/docker/util/stats_windows.go +++ b/drivers/docker/util/stats_windows.go @@ -45,7 +45,7 @@ func DockerStatsToTaskResourceUsage(s *docker.Stats) *cstructs.TaskResourceUsage ThrottledPeriods: s.CPUStats.ThrottlingData.ThrottledPeriods, ThrottledTime: s.CPUStats.ThrottlingData.ThrottledTime, Percent: cpuPercent, - TotalTicks: (cpuPercent / 100) * float64(stats.TotalTicksAvailable()) / float64(runtime.NumCPU()), + TotalTicks: (cpuPercent / 100) * float64(stats.CpuTotalTicks()) / float64(runtime.NumCPU()), Measured: DockerMeasuredCPUStats, } diff --git a/drivers/shared/executor/executor.go b/drivers/shared/executor/executor.go index d3da4c061..aa2cc73c9 100644 --- a/drivers/shared/executor/executor.go +++ b/drivers/shared/executor/executor.go @@ -23,9 +23,8 @@ import ( "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/lib/fifo" "github.com/hashicorp/nomad/client/lib/resources" - "github.com/hashicorp/nomad/client/stats" cstructs "github.com/hashicorp/nomad/client/structs" - shelpers "github.com/hashicorp/nomad/helper/stats" + "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/plugins/drivers" "github.com/syndtr/gocapability/capability" ) @@ -259,11 +258,9 @@ type UniversalExecutor struct { } // NewExecutor returns an Executor -func NewExecutor(logger hclog.Logger) Executor { +func NewExecutor(logger hclog.Logger, cpuTotalTicks uint64) Executor { logger = logger.Named("executor") - if err := shelpers.Init(); err != nil { - logger.Error("unable to initialize stats", "error", err) - } + stats.SetCpuTotalTicks(cpuTotalTicks) return &UniversalExecutor{ logger: logger, diff --git a/drivers/shared/executor/executor_basic.go b/drivers/shared/executor/executor_basic.go index 415d508da..ae0c8fab9 100644 --- a/drivers/shared/executor/executor_basic.go +++ b/drivers/shared/executor/executor_basic.go @@ -13,10 +13,10 @@ import ( "github.com/hashicorp/nomad/plugins/drivers" ) -func NewExecutorWithIsolation(logger hclog.Logger) Executor { +func NewExecutorWithIsolation(logger hclog.Logger, cpuTotalTicks uint64) Executor { logger = logger.Named("executor") logger.Error("isolation executor is not supported on this platform, using default") - return NewExecutor(logger) + return NewExecutor(logger, cpuTotalTicks) } func (e *UniversalExecutor) configureResourceContainer(_ int) error { return nil } diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index d0c4a743b..0227fc80a 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -24,10 +24,9 @@ import ( "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/lib/resources" - "github.com/hashicorp/nomad/client/stats" cstructs "github.com/hashicorp/nomad/client/structs" "github.com/hashicorp/nomad/drivers/shared/capabilities" - shelpers "github.com/hashicorp/nomad/helper/stats" + "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/plugins/drivers" @@ -71,11 +70,10 @@ type LibcontainerExecutor struct { exitState *ProcessState } -func NewExecutorWithIsolation(logger hclog.Logger) Executor { +func NewExecutorWithIsolation(logger hclog.Logger, cpuTotalTicks uint64) Executor { logger = logger.Named("isolated_executor") - if err := shelpers.Init(); err != nil { - logger.Error("unable to initialize stats", "error", err) - } + stats.SetCpuTotalTicks(cpuTotalTicks) + return &LibcontainerExecutor{ id: strings.ReplaceAll(uuid.Generate(), "-", "_"), logger: logger, diff --git a/drivers/shared/executor/executor_linux_test.go b/drivers/shared/executor/executor_linux_test.go index edd4e4b4c..a26a90a5d 100644 --- a/drivers/shared/executor/executor_linux_test.go +++ b/drivers/shared/executor/executor_linux_test.go @@ -147,7 +147,7 @@ func TestExecutor_Isolation_PID_and_IPC_hostMode(t *testing.T) { execCmd.ModePID = "host" // disable PID namespace execCmd.ModeIPC = "host" // disable IPC namespace - executor := NewExecutorWithIsolation(testlog.HCLogger(t)) + executor := NewExecutorWithIsolation(testlog.HCLogger(t), 0) defer executor.Shutdown("SIGKILL", 0) ps, err := executor.Launch(execCmd) @@ -190,7 +190,7 @@ func TestExecutor_IsolationAndConstraints(t *testing.T) { execCmd.ModePID = "private" execCmd.ModeIPC = "private" - executor := NewExecutorWithIsolation(testlog.HCLogger(t)) + executor := NewExecutorWithIsolation(testlog.HCLogger(t), 0) defer executor.Shutdown("SIGKILL", 0) ps, err := executor.Launch(execCmd) @@ -282,7 +282,7 @@ func TestExecutor_CgroupPaths(t *testing.T) { execCmd.ResourceLimits = true - executor := NewExecutorWithIsolation(testlog.HCLogger(t)) + executor := NewExecutorWithIsolation(testlog.HCLogger(t), 0) defer executor.Shutdown("SIGKILL", 0) ps, err := executor.Launch(execCmd) @@ -344,7 +344,7 @@ func TestExecutor_CgroupPathsAreDestroyed(t *testing.T) { execCmd.ResourceLimits = true - executor := NewExecutorWithIsolation(testlog.HCLogger(t)) + executor := NewExecutorWithIsolation(testlog.HCLogger(t), 0) defer executor.Shutdown("SIGKILL", 0) ps, err := executor.Launch(execCmd) @@ -547,7 +547,7 @@ func TestExecutor_EscapeContainer(t *testing.T) { execCmd.ResourceLimits = true - executor := NewExecutorWithIsolation(testlog.HCLogger(t)) + executor := NewExecutorWithIsolation(testlog.HCLogger(t), 0) defer executor.Shutdown("SIGKILL", 0) _, err := executor.Launch(execCmd) @@ -597,7 +597,7 @@ func TestExecutor_DoesNotInheritOomScoreAdj(t *testing.T) { execCmd.Cmd = "/bin/bash" execCmd.Args = []string{"-c", "cat /proc/self/oom_score_adj"} - executor := NewExecutorWithIsolation(testlog.HCLogger(t)) + executor := NewExecutorWithIsolation(testlog.HCLogger(t), 0) defer executor.Shutdown("SIGKILL", 0) _, err = executor.Launch(execCmd) @@ -691,7 +691,7 @@ CapAmb: 0000000000000400`, execCmd.Capabilities = capsAllowed } - executor := NewExecutorWithIsolation(testlog.HCLogger(t)) + executor := NewExecutorWithIsolation(testlog.HCLogger(t), 0) defer executor.Shutdown("SIGKILL", 0) _, err := executor.Launch(execCmd) @@ -739,7 +739,7 @@ func TestExecutor_ClientCleanup(t *testing.T) { execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir defer allocDir.Destroy() - executor := NewExecutorWithIsolation(testlog.HCLogger(t)) + executor := NewExecutorWithIsolation(testlog.HCLogger(t), 0) defer executor.Shutdown("", 0) // Need to run a command which will produce continuous output but not @@ -864,7 +864,7 @@ func TestUniversalExecutor_NoCgroup(t *testing.T) { execCmd.BasicProcessCgroup = false execCmd.ResourceLimits = false - executor := NewExecutor(testlog.HCLogger(t)) + executor := NewExecutor(testlog.HCLogger(t), 0) defer executor.Shutdown("SIGKILL", 0) _, err = executor.Launch(execCmd) diff --git a/drivers/shared/executor/executor_plugin.go b/drivers/shared/executor/executor_plugin.go index 5e5c0b51c..05be90b7c 100644 --- a/drivers/shared/executor/executor_plugin.go +++ b/drivers/shared/executor/executor_plugin.go @@ -15,15 +15,16 @@ import ( type ExecutorPlugin struct { // TODO: support backwards compatibility with pre 0.9 NetRPC plugin plugin.NetRPCUnsupportedPlugin - logger hclog.Logger - fsIsolation bool + logger hclog.Logger + fsIsolation bool + cpuTotalTicks uint64 } func (p *ExecutorPlugin) GRPCServer(broker *plugin.GRPCBroker, s *grpc.Server) error { if p.fsIsolation { - proto.RegisterExecutorServer(s, &grpcExecutorServer{impl: NewExecutorWithIsolation(p.logger)}) + proto.RegisterExecutorServer(s, &grpcExecutorServer{impl: NewExecutorWithIsolation(p.logger, p.cpuTotalTicks)}) } else { - proto.RegisterExecutorServer(s, &grpcExecutorServer{impl: NewExecutor(p.logger)}) + proto.RegisterExecutorServer(s, &grpcExecutorServer{impl: NewExecutor(p.logger, p.cpuTotalTicks)}) } return nil } diff --git a/drivers/shared/executor/executor_test.go b/drivers/shared/executor/executor_test.go index a12f9356f..a02261fad 100644 --- a/drivers/shared/executor/executor_test.go +++ b/drivers/shared/executor/executor_test.go @@ -36,7 +36,7 @@ import ( var executorFactories = map[string]executorFactory{} type executorFactory struct { - new func(hclog.Logger) Executor + new func(hclog.Logger, uint64) Executor configureExecCmd func(*testing.T, *ExecCommand) } @@ -150,7 +150,7 @@ func TestExecutor_Start_Invalid(t *testing.T) { execCmd.Args = []string{"1"} factory.configureExecCmd(t, execCmd) defer allocDir.Destroy() - executor := factory.new(testlog.HCLogger(t)) + executor := factory.new(testlog.HCLogger(t), 0) defer executor.Shutdown("", 0) _, err := executor.Launch(execCmd) @@ -170,7 +170,7 @@ func TestExecutor_Start_Wait_Failure_Code(t *testing.T) { execCmd.Args = []string{"-c", "sleep 1; /bin/date fail"} factory.configureExecCmd(t, execCmd) defer allocDir.Destroy() - executor := factory.new(testlog.HCLogger(t)) + executor := factory.new(testlog.HCLogger(t), 0) defer executor.Shutdown("", 0) ps, err := executor.Launch(execCmd) @@ -195,7 +195,7 @@ func TestExecutor_Start_Wait(t *testing.T) { factory.configureExecCmd(t, execCmd) defer allocDir.Destroy() - executor := factory.new(testlog.HCLogger(t)) + executor := factory.new(testlog.HCLogger(t), 0) defer executor.Shutdown("", 0) ps, err := executor.Launch(execCmd) @@ -232,7 +232,7 @@ func TestExecutor_Start_Wait_Children(t *testing.T) { factory.configureExecCmd(t, execCmd) defer allocDir.Destroy() - executor := factory.new(testlog.HCLogger(t)) + executor := factory.new(testlog.HCLogger(t), 0) defer executor.Shutdown("SIGKILL", 0) ps, err := executor.Launch(execCmd) @@ -273,7 +273,7 @@ func TestExecutor_WaitExitSignal(t *testing.T) { factory.configureExecCmd(t, execCmd) defer allocDir.Destroy() - executor := factory.new(testlog.HCLogger(t)) + executor := factory.new(testlog.HCLogger(t), 0) defer executor.Shutdown("", 0) pState, err := executor.Launch(execCmd) @@ -331,7 +331,7 @@ func TestExecutor_Start_Kill(t *testing.T) { factory.configureExecCmd(t, execCmd) defer allocDir.Destroy() - executor := factory.new(testlog.HCLogger(t)) + executor := factory.new(testlog.HCLogger(t), 0) defer executor.Shutdown("", 0) ps, err := executor.Launch(execCmd) @@ -536,7 +536,7 @@ func TestExecutor_Start_Kill_Immediately_NoGrace(t *testing.T) { execCmd.Args = []string{"100"} factory.configureExecCmd(t, execCmd) defer allocDir.Destroy() - executor := factory.new(testlog.HCLogger(t)) + executor := factory.new(testlog.HCLogger(t), 0) defer executor.Shutdown("", 0) ps, err := executor.Launch(execCmd) @@ -572,7 +572,7 @@ func TestExecutor_Start_Kill_Immediately_WithGrace(t *testing.T) { execCmd.Args = []string{"100"} factory.configureExecCmd(t, execCmd) defer allocDir.Destroy() - executor := factory.new(testlog.HCLogger(t)) + executor := factory.new(testlog.HCLogger(t), 0) defer executor.Shutdown("", 0) ps, err := executor.Launch(execCmd) @@ -618,7 +618,7 @@ func TestExecutor_Start_NonExecutableBinaries(t *testing.T) { execCmd.Cmd = nonExecutablePath factory.configureExecCmd(t, execCmd) - executor := factory.new(testlog.HCLogger(t)) + executor := factory.new(testlog.HCLogger(t), 0) defer executor.Shutdown("", 0) // need to configure path in chroot with that file if using isolation executor diff --git a/drivers/shared/executor/pid_collector.go b/drivers/shared/executor/pid_collector.go index ff0895b36..c3184a83d 100644 --- a/drivers/shared/executor/pid_collector.go +++ b/drivers/shared/executor/pid_collector.go @@ -11,7 +11,7 @@ import ( hclog "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/client/lib/resources" - "github.com/hashicorp/nomad/client/stats" + "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/plugins/drivers" ps "github.com/mitchellh/go-ps" "github.com/shirou/gopsutil/v3/process" diff --git a/drivers/shared/executor/plugins.go b/drivers/shared/executor/plugins.go index c35479346..97aa65187 100644 --- a/drivers/shared/executor/plugins.go +++ b/drivers/shared/executor/plugins.go @@ -22,13 +22,18 @@ type ExecutorConfig struct { // FSIsolation if set will use an executor implementation that support // filesystem isolation FSIsolation bool + + // cpuTotalTicks is the total CPU compute. It should be given as Cores * MHz + // (2 Cores * 2 Ghz = 4000) + CpuTotalTicks uint64 } -func GetPluginMap(logger hclog.Logger, fsIsolation bool) map[string]plugin.Plugin { +func GetPluginMap(logger hclog.Logger, fsIsolation bool, cpuTotalTicks uint64) map[string]plugin.Plugin { return map[string]plugin.Plugin{ "executor": &ExecutorPlugin{ - logger: logger, - fsIsolation: fsIsolation, + logger: logger, + fsIsolation: fsIsolation, + cpuTotalTicks: cpuTotalTicks, }, } } diff --git a/drivers/shared/executor/utils.go b/drivers/shared/executor/utils.go index 5cb54b144..ff1c84c52 100644 --- a/drivers/shared/executor/utils.go +++ b/drivers/shared/executor/utils.go @@ -13,6 +13,7 @@ import ( hclog "github.com/hashicorp/go-hclog" plugin "github.com/hashicorp/go-plugin" "github.com/hashicorp/nomad/drivers/shared/executor/proto" + "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/plugins/base" ) @@ -31,6 +32,7 @@ const ( func CreateExecutor(logger hclog.Logger, driverConfig *base.ClientDriverConfig, executorConfig *ExecutorConfig) (Executor, *plugin.Client, error) { + executorConfig.CpuTotalTicks = stats.CpuTotalTicks() c, err := json.Marshal(executorConfig) if err != nil { return nil, nil, fmt.Errorf("unable to create executor config: %v", err) @@ -41,8 +43,9 @@ func CreateExecutor(logger hclog.Logger, driverConfig *base.ClientDriverConfig, } p := &ExecutorPlugin{ - logger: logger, - fsIsolation: executorConfig.FSIsolation, + logger: logger, + fsIsolation: executorConfig.FSIsolation, + cpuTotalTicks: executorConfig.CpuTotalTicks, } config := &plugin.ClientConfig{ @@ -75,7 +78,7 @@ func ReattachToExecutor(reattachConfig *plugin.ReattachConfig, logger hclog.Logg config := &plugin.ClientConfig{ HandshakeConfig: base.Handshake, Reattach: reattachConfig, - Plugins: GetPluginMap(logger, false), + Plugins: GetPluginMap(logger, false, stats.CpuTotalTicks()), AllowedProtocols: []plugin.Protocol{plugin.ProtocolGRPC}, Logger: logger.Named("executor"), } diff --git a/drivers/shared/executor/z_executor_cmd.go b/drivers/shared/executor/z_executor_cmd.go index 60ef85045..8125e4fab 100644 --- a/drivers/shared/executor/z_executor_cmd.go +++ b/drivers/shared/executor/z_executor_cmd.go @@ -49,6 +49,7 @@ func init() { Plugins: GetPluginMap( logger, executorConfig.FSIsolation, + executorConfig.CpuTotalTicks, ), GRPCServer: plugin.DefaultGRPCServer, Logger: logger, diff --git a/helper/stats/cpu.go b/helper/stats/cpu.go index 4ddb6f517..f038469ca 100644 --- a/helper/stats/cpu.go +++ b/helper/stats/cpu.go @@ -4,109 +4,73 @@ package stats import ( - "context" - "errors" - "fmt" - "sync" + "runtime" "time" - - "github.com/shirou/gopsutil/v3/cpu" - "github.com/shoenig/go-m1cpu" -) - -const ( - // cpuInfoTimeout is the timeout used when gathering CPU info. This is used - // to override the default timeout in gopsutil which has a tendency to - // timeout on Windows. - cpuInfoTimeout = 60 * time.Second ) var ( - cpuPowerCoreCount int - cpuPowerCoreMHz uint64 - cpuEfficiencyCoreCount int - cpuEfficiencyCoreMHz uint64 - cpuTotalTicks uint64 - cpuModelName string + cpuTotalTicks uint64 ) -var ( - initErr error - onceLer sync.Once -) +// CpuStats calculates cpu usage percentage +type CpuStats struct { + prevCpuTime float64 + prevTime time.Time -func Init() error { - onceLer.Do(func() { - switch { - case m1cpu.IsAppleSilicon(): - cpuModelName = m1cpu.ModelName() - cpuPowerCoreCount = m1cpu.PCoreCount() - cpuPowerCoreMHz = m1cpu.PCoreHz() / 1_000_000 - cpuEfficiencyCoreCount = m1cpu.ECoreCount() - cpuEfficiencyCoreMHz = m1cpu.ECoreHz() / 1_000_000 - bigTicks := uint64(cpuPowerCoreCount) * cpuPowerCoreMHz - littleTicks := uint64(cpuEfficiencyCoreCount) * cpuEfficiencyCoreMHz - cpuTotalTicks = bigTicks + littleTicks - default: - // for now, all other cpu types assume only power cores - // todo: this is already not true for Intel 13th generation - - var err error - if cpuPowerCoreCount, err = cpu.Counts(true); err != nil { - initErr = errors.Join(initErr, fmt.Errorf("failed to detect number of CPU cores: %w", err)) - } - - ctx, cancel := context.WithTimeout(context.Background(), cpuInfoTimeout) - defer cancel() - - var cpuInfoStats []cpu.InfoStat - if cpuInfoStats, err = cpu.InfoWithContext(ctx); err != nil { - initErr = errors.Join(initErr, fmt.Errorf("Unable to obtain CPU information: %w", err)) - } - - for _, infoStat := range cpuInfoStats { - cpuModelName = infoStat.ModelName - if uint64(infoStat.Mhz) > cpuPowerCoreMHz { - cpuPowerCoreMHz = uint64(infoStat.Mhz) - } - } - - // compute ticks using only power core, until we add support for - // detecting little cores on non-apple platforms - cpuTotalTicks = uint64(cpuPowerCoreCount) * cpuPowerCoreMHz - - initErr = err - } - }) - return initErr + totalCpus int } -// CPUNumCores returns the number of CPU cores available. -// -// This is represented with two values - (Power (P), Efficiency (E)) so we can -// correctly compute total compute for processors with asymetric cores such as -// Apple Silicon. -// -// For platforms with symetric cores (or where we do not correcly detect asymetric -// cores), all cores are presented as P cores. -func CPUNumCores() (int, int) { - return cpuPowerCoreCount, cpuEfficiencyCoreCount +// NewCpuStats returns a cpu stats calculator +func NewCpuStats() *CpuStats { + numCpus := runtime.NumCPU() + cpuStats := &CpuStats{ + totalCpus: numCpus, + } + return cpuStats } -// CPUMHzPerCore returns the MHz per CPU (P, E) core type. -// -// As with CPUNumCores, asymetric core detection currently only works with -// Apple Silicon CPUs. -func CPUMHzPerCore() (uint64, uint64) { - return cpuPowerCoreMHz, cpuEfficiencyCoreMHz +// Percent calculates the cpu usage percentage based on the current cpu usage +// and the previous cpu usage where usage is given as time in nanoseconds spend +// in the cpu +func (c *CpuStats) Percent(cpuTime float64) float64 { + now := time.Now() + + if c.prevCpuTime == 0.0 { + // invoked first time + c.prevCpuTime = cpuTime + c.prevTime = now + return 0.0 + } + + timeDelta := now.Sub(c.prevTime).Nanoseconds() + ret := c.calculatePercent(c.prevCpuTime, cpuTime, timeDelta) + c.prevCpuTime = cpuTime + c.prevTime = now + return ret } -// CPUModelName returns the model name of the CPU. -func CPUModelName() string { - return cpuModelName +// TicksConsumed calculates the total ticks consumes by the process across all +// cpu cores +func (c *CpuStats) TicksConsumed(percent float64) float64 { + return (percent / 100) * float64(CpuTotalTicks()) / float64(c.totalCpus) } -// TotalTicksAvailable calculates the total MHz available across all cores. +func (c *CpuStats) calculatePercent(t1, t2 float64, timeDelta int64) float64 { + vDelta := t2 - t1 + if timeDelta <= 0 || vDelta <= 0.0 { + return 0.0 + } + + overall_percent := (vDelta / float64(timeDelta)) * 100.0 + return overall_percent +} + +// Set the total ticks available across all cores. +func SetCpuTotalTicks(newCpuTotalTicks uint64) { + cpuTotalTicks = newCpuTotalTicks +} + +// CpuTotalTicks calculates the total MHz available across all cores. // // Where asymetric cores are correctly detected, the total ticks is the sum of // the performance across both core types. @@ -114,6 +78,6 @@ func CPUModelName() string { // Where asymetric cores are not correctly detected (such as Intel 13th gen), // the total ticks available is over-estimated, as we assume all cores are P // cores. -func TotalTicksAvailable() uint64 { +func CpuTotalTicks() uint64 { return cpuTotalTicks } diff --git a/helper/stats/cpu_test.go b/helper/stats/cpu_test.go new file mode 100644 index 000000000..c26cd0d1c --- /dev/null +++ b/helper/stats/cpu_test.go @@ -0,0 +1,24 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: MPL-2.0 + +package stats + +import ( + "testing" + "time" + + "github.com/hashicorp/nomad/ci" +) + +func TestCpuStatsPercent(t *testing.T) { + ci.Parallel(t) + + cs := NewCpuStats() + cs.Percent(79.7) + time.Sleep(1 * time.Second) + percent := cs.Percent(80.69) + expectedPercent := 98.00 + if percent < expectedPercent && percent > (expectedPercent+1.00) { + t.Fatalf("expected: %v, actual: %v", expectedPercent, percent) + } +}