From 438954e835c9f212c2d127eabed2caee38cea6bd Mon Sep 17 00:00:00 2001 From: Patric Stout Date: Wed, 19 Jul 2023 20:30:47 +0200 Subject: [PATCH] Use config "cpu_total_compute" (if set) for all CPU statistics (#17628) (manual backport of e190eae395f7ba54d5392494682f01e2f05b51de) Before this commit, it was only used for fingerprinting, but not for CPU stats on nodes or tasks. This meant that if the auto-detection failed, setting the cpu_total_compute didn't resolved the issue. This issue was most noticeable on ARM64, as there auto-detection always failed. --- .changelog/17628.txt | 3 + client/fingerprint/cpu.go | 15 +- client/fingerprint/env_aws.go | 2 + client/lib/resources/pid.go | 2 +- client/stats/cpu.go | 140 ++++++++++++------ {helper => client}/stats/cpu_darwin_test.go | 4 +- client/stats/cpu_test.go | 17 +-- drivers/docker/util/stats_posix.go | 2 +- drivers/docker/util/stats_windows.go | 2 +- drivers/shared/executor/executor.go | 9 +- drivers/shared/executor/executor_basic.go | 4 +- drivers/shared/executor/executor_linux.go | 10 +- .../shared/executor/executor_linux_test.go | 18 +-- drivers/shared/executor/executor_plugin.go | 9 +- drivers/shared/executor/executor_test.go | 20 +-- drivers/shared/executor/pid_collector.go | 2 +- drivers/shared/executor/plugins.go | 11 +- drivers/shared/executor/utils.go | 9 +- drivers/shared/executor/z_executor_cmd.go | 1 + helper/stats/cpu.go | 138 +++++++---------- helper/stats/cpu_test.go | 24 +++ 21 files changed, 238 insertions(+), 204 deletions(-) create mode 100644 .changelog/17628.txt rename {helper => client}/stats/cpu_darwin_test.go (87%) create mode 100644 helper/stats/cpu_test.go diff --git a/.changelog/17628.txt b/.changelog/17628.txt new file mode 100644 index 00000000000..fcb89bcee27 --- /dev/null +++ b/.changelog/17628.txt @@ -0,0 +1,3 @@ +```release-note:improvement +cpustats: Use config "cpu_total_compute" (if set) for all CPU statistics +``` diff --git a/client/fingerprint/cpu.go b/client/fingerprint/cpu.go index b4943549c80..0c44005a960 100644 --- a/client/fingerprint/cpu.go +++ b/client/fingerprint/cpu.go @@ -7,7 +7,8 @@ import ( "github.com/hashicorp/nomad/lib/cpuset" "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/helper/stats" + "github.com/hashicorp/nomad/client/stats" + shelpers "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/nomad/structs" ) @@ -39,7 +40,7 @@ func NewCPUFingerprint(logger hclog.Logger) Fingerprint { } func (f *CPUFingerprint) Fingerprint(request *FingerprintRequest, response *FingerprintResponse) error { - f.initialize() + f.initialize(request) f.setModelName(response) @@ -58,8 +59,8 @@ func (f *CPUFingerprint) Fingerprint(request *FingerprintRequest, response *Fing return nil } -func (f *CPUFingerprint) initialize() { - if err := stats.Init(); err != nil { +func (f *CPUFingerprint) initialize(request *FingerprintRequest) { + if err := stats.Init(uint64(request.Config.CpuCompute)); err != nil { f.logger.Warn("failed initializing stats collector", "error", err) } } @@ -131,10 +132,8 @@ func (f *CPUFingerprint) setReservableCores(request *FingerprintRequest, respons func (f *CPUFingerprint) setTotalCompute(request *FingerprintRequest, response *FingerprintResponse) { var ticks uint64 switch { - case request.Config.CpuCompute > 0: - ticks = uint64(request.Config.CpuCompute) - case stats.TotalTicksAvailable() > 0: - ticks = stats.TotalTicksAvailable() + case shelpers.CpuTotalTicks() > 0: + ticks = shelpers.CpuTotalTicks() default: ticks = defaultCPUTicks } diff --git a/client/fingerprint/env_aws.go b/client/fingerprint/env_aws.go index b39317d1ac7..8ef9494aed2 100644 --- a/client/fingerprint/env_aws.go +++ b/client/fingerprint/env_aws.go @@ -16,6 +16,7 @@ import ( log "github.com/hashicorp/go-hclog" cleanhttp "github.com/hashicorp/go-cleanhttp" + "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/nomad/structs" ) @@ -182,6 +183,7 @@ func (f *EnvAWSFingerprint) Fingerprint(request *FingerprintRequest, response *F if ticks := specs.Ticks(); request.Config.CpuCompute <= 0 { response.AddAttribute("cpu.totalcompute", fmt.Sprintf("%d", ticks)) f.logger.Debug("setting ec2 cpu", "ticks", ticks) + stats.SetCpuTotalTicks(uint64(ticks)) resources = new(structs.Resources) resources.CPU = ticks if nodeResources == nil { diff --git a/client/lib/resources/pid.go b/client/lib/resources/pid.go index c2e4f547090..15fb6a8150f 100644 --- a/client/lib/resources/pid.go +++ b/client/lib/resources/pid.go @@ -1,7 +1,7 @@ package resources import ( - "github.com/hashicorp/nomad/client/stats" + "github.com/hashicorp/nomad/helper/stats" ) // PIDs holds all of a task's pids and their cpu percentage calculators diff --git a/client/stats/cpu.go b/client/stats/cpu.go index 1986b05887f..b2a2731d87c 100644 --- a/client/stats/cpu.go +++ b/client/stats/cpu.go @@ -1,64 +1,115 @@ package stats import ( - "runtime" + "context" + "errors" + "fmt" + "sync" "time" - shelpers "github.com/hashicorp/nomad/helper/stats" + "github.com/hashicorp/nomad/helper/stats" "github.com/shirou/gopsutil/v3/cpu" + "github.com/shoenig/go-m1cpu" ) -// CpuStats calculates cpu usage percentage -type CpuStats struct { - prevCpuTime float64 - prevTime time.Time +const ( + // cpuInfoTimeout is the timeout used when gathering CPU info. This is used + // to override the default timeout in gopsutil which has a tendency to + // timeout on Windows. + cpuInfoTimeout = 60 * time.Second +) - totalCpus int -} +var ( + cpuPowerCoreCount int + cpuPowerCoreMHz uint64 + cpuEfficiencyCoreCount int + cpuEfficiencyCoreMHz uint64 + cpuModelName string +) -// NewCpuStats returns a cpu stats calculator -func NewCpuStats() *CpuStats { - numCpus := runtime.NumCPU() - cpuStats := &CpuStats{ - totalCpus: numCpus, - } - return cpuStats -} +var ( + detectedCpuTotalTicks uint64 + initErr error + onceLer sync.Once +) + +func Init(configCpuTotalCompute uint64) error { + onceLer.Do(func() { + switch { + case m1cpu.IsAppleSilicon(): + cpuModelName = m1cpu.ModelName() + cpuPowerCoreCount = m1cpu.PCoreCount() + cpuPowerCoreMHz = m1cpu.PCoreHz() / 1_000_000 + cpuEfficiencyCoreCount = m1cpu.ECoreCount() + cpuEfficiencyCoreMHz = m1cpu.ECoreHz() / 1_000_000 + bigTicks := uint64(cpuPowerCoreCount) * cpuPowerCoreMHz + littleTicks := uint64(cpuEfficiencyCoreCount) * cpuEfficiencyCoreMHz + detectedCpuTotalTicks = bigTicks + littleTicks + default: + // for now, all other cpu types assume only power cores + // todo: this is already not true for Intel 13th generation + + var err error + if cpuPowerCoreCount, err = cpu.Counts(true); err != nil { + initErr = errors.Join(initErr, fmt.Errorf("failed to detect number of CPU cores: %w", err)) + } + + ctx, cancel := context.WithTimeout(context.Background(), cpuInfoTimeout) + defer cancel() -// Percent calculates the cpu usage percentage based on the current cpu usage -// and the previous cpu usage where usage is given as time in nanoseconds spend -// in the cpu -func (c *CpuStats) Percent(cpuTime float64) float64 { - now := time.Now() - - if c.prevCpuTime == 0.0 { - // invoked first time - c.prevCpuTime = cpuTime - c.prevTime = now - return 0.0 + var cpuInfoStats []cpu.InfoStat + if cpuInfoStats, err = cpu.InfoWithContext(ctx); err != nil { + initErr = errors.Join(initErr, fmt.Errorf("Unable to obtain CPU information: %w", err)) + } + + for _, infoStat := range cpuInfoStats { + cpuModelName = infoStat.ModelName + if uint64(infoStat.Mhz) > cpuPowerCoreMHz { + cpuPowerCoreMHz = uint64(infoStat.Mhz) + } + } + + // compute ticks using only power core, until we add support for + // detecting little cores on non-apple platforms + detectedCpuTotalTicks = uint64(cpuPowerCoreCount) * cpuPowerCoreMHz + + initErr = err + } + + stats.SetCpuTotalTicks(detectedCpuTotalTicks) + }) + + // override the computed value with the config value if it is set + if configCpuTotalCompute > 0 { + stats.SetCpuTotalTicks(configCpuTotalCompute) } - timeDelta := now.Sub(c.prevTime).Nanoseconds() - ret := c.calculatePercent(c.prevCpuTime, cpuTime, timeDelta) - c.prevCpuTime = cpuTime - c.prevTime = now - return ret + return initErr } -// TicksConsumed calculates the total ticks consumes by the process across all -// cpu cores -func (c *CpuStats) TicksConsumed(percent float64) float64 { - return (percent / 100) * float64(shelpers.TotalTicksAvailable()) / float64(c.totalCpus) +// CPUNumCores returns the number of CPU cores available. +// +// This is represented with two values - (Power (P), Efficiency (E)) so we can +// correctly compute total compute for processors with asymetric cores such as +// Apple Silicon. +// +// For platforms with symetric cores (or where we do not correcly detect asymetric +// cores), all cores are presented as P cores. +func CPUNumCores() (int, int) { + return cpuPowerCoreCount, cpuEfficiencyCoreCount } -func (c *CpuStats) calculatePercent(t1, t2 float64, timeDelta int64) float64 { - vDelta := t2 - t1 - if timeDelta <= 0 || vDelta <= 0.0 { - return 0.0 - } +// CPUMHzPerCore returns the MHz per CPU (P, E) core type. +// +// As with CPUNumCores, asymetric core detection currently only works with +// Apple Silicon CPUs. +func CPUMHzPerCore() (uint64, uint64) { + return cpuPowerCoreMHz, cpuEfficiencyCoreMHz +} - overall_percent := (vDelta / float64(timeDelta)) * 100.0 - return overall_percent +// CPUModelName returns the model name of the CPU. +func CPUModelName() string { + return cpuModelName } func (h *HostStatsCollector) collectCPUStats() (cpus []*CPUStats, totalTicks float64, err error) { @@ -76,6 +127,7 @@ func (h *HostStatsCollector) collectCPUStats() (cpus []*CPUStats, totalTicks flo h.statsCalculator[cpuStat.CPU] = percentCalculator } idle, user, system, total := percentCalculator.Calculate(cpuStat) + ticks := (total / 100.0) * (float64(stats.CpuTotalTicks()) / float64(len(cpuStats))) cs[idx] = &CPUStats{ CPU: cpuStat.CPU, User: user, @@ -83,7 +135,7 @@ func (h *HostStatsCollector) collectCPUStats() (cpus []*CPUStats, totalTicks flo Idle: idle, Total: total, } - ticksConsumed += (total / 100.0) * (float64(shelpers.TotalTicksAvailable()) / float64(len(cpuStats))) + ticksConsumed += ticks } return cs, ticksConsumed, nil diff --git a/helper/stats/cpu_darwin_test.go b/client/stats/cpu_darwin_test.go similarity index 87% rename from helper/stats/cpu_darwin_test.go rename to client/stats/cpu_darwin_test.go index 34a99a65e8f..5b76c012760 100644 --- a/helper/stats/cpu_darwin_test.go +++ b/client/stats/cpu_darwin_test.go @@ -29,7 +29,7 @@ func TestCPU_CPUModelName(t *testing.T) { must.NotEq(t, "", name) } -func TestCPU_CPUTotalTicksAvailable(t *testing.T) { - ticks := TotalTicksAvailable() +func TestCPU_CPUCpuTotalTicks(t *testing.T) { + ticks := CpuTotalTicks() must.Positive(t, ticks) } diff --git a/client/stats/cpu_test.go b/client/stats/cpu_test.go index 4dc9b19b476..b8529dece1a 100644 --- a/client/stats/cpu_test.go +++ b/client/stats/cpu_test.go @@ -4,32 +4,17 @@ import ( "math" "os" "testing" - "time" "github.com/hashicorp/nomad/ci" - shelpers "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/helper/testlog" "github.com/stretchr/testify/assert" ) -func TestCpuStatsPercent(t *testing.T) { - ci.Parallel(t) - - cs := NewCpuStats() - cs.Percent(79.7) - time.Sleep(1 * time.Second) - percent := cs.Percent(80.69) - expectedPercent := 98.00 - if percent < expectedPercent && percent > (expectedPercent+1.00) { - t.Fatalf("expected: %v, actual: %v", expectedPercent, percent) - } -} - func TestHostStats_CPU(t *testing.T) { ci.Parallel(t) assert := assert.New(t) - assert.Nil(shelpers.Init()) + assert.Nil(Init(0)) logger := testlog.HCLogger(t) cwd, err := os.Getwd() diff --git a/drivers/docker/util/stats_posix.go b/drivers/docker/util/stats_posix.go index ae086d475ba..65a58b482e3 100644 --- a/drivers/docker/util/stats_posix.go +++ b/drivers/docker/util/stats_posix.go @@ -53,7 +53,7 @@ func DockerStatsToTaskResourceUsage(s *docker.Stats) *cstructs.TaskResourceUsage cs.UserMode = CalculateCPUPercent( s.CPUStats.CPUUsage.UsageInUsermode, s.PreCPUStats.CPUUsage.UsageInUsermode, s.CPUStats.CPUUsage.TotalUsage, s.PreCPUStats.CPUUsage.TotalUsage, runtime.NumCPU()) - cs.TotalTicks = (cs.Percent / 100) * float64(stats.TotalTicksAvailable()) / float64(runtime.NumCPU()) + cs.TotalTicks = (cs.Percent / 100) * float64(stats.CpuTotalTicks()) / float64(runtime.NumCPU()) return &cstructs.TaskResourceUsage{ ResourceUsage: &cstructs.ResourceUsage{ diff --git a/drivers/docker/util/stats_windows.go b/drivers/docker/util/stats_windows.go index 9d4b5e81ce7..07931a98df2 100644 --- a/drivers/docker/util/stats_windows.go +++ b/drivers/docker/util/stats_windows.go @@ -42,7 +42,7 @@ func DockerStatsToTaskResourceUsage(s *docker.Stats) *cstructs.TaskResourceUsage ThrottledPeriods: s.CPUStats.ThrottlingData.ThrottledPeriods, ThrottledTime: s.CPUStats.ThrottlingData.ThrottledTime, Percent: cpuPercent, - TotalTicks: (cpuPercent / 100) * float64(stats.TotalTicksAvailable()) / float64(runtime.NumCPU()), + TotalTicks: (cpuPercent / 100) * float64(stats.CpuTotalTicks()) / float64(runtime.NumCPU()), Measured: DockerMeasuredCPUStats, } diff --git a/drivers/shared/executor/executor.go b/drivers/shared/executor/executor.go index ae8de87ee61..3d094f35115 100644 --- a/drivers/shared/executor/executor.go +++ b/drivers/shared/executor/executor.go @@ -21,9 +21,8 @@ import ( "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/lib/fifo" "github.com/hashicorp/nomad/client/lib/resources" - "github.com/hashicorp/nomad/client/stats" cstructs "github.com/hashicorp/nomad/client/structs" - shelpers "github.com/hashicorp/nomad/helper/stats" + "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/plugins/drivers" "github.com/syndtr/gocapability/capability" ) @@ -257,11 +256,9 @@ type UniversalExecutor struct { } // NewExecutor returns an Executor -func NewExecutor(logger hclog.Logger) Executor { +func NewExecutor(logger hclog.Logger, cpuTotalTicks uint64) Executor { logger = logger.Named("executor") - if err := shelpers.Init(); err != nil { - logger.Error("unable to initialize stats", "error", err) - } + stats.SetCpuTotalTicks(cpuTotalTicks) return &UniversalExecutor{ logger: logger, diff --git a/drivers/shared/executor/executor_basic.go b/drivers/shared/executor/executor_basic.go index ad42792d1c0..63654696a8d 100644 --- a/drivers/shared/executor/executor_basic.go +++ b/drivers/shared/executor/executor_basic.go @@ -10,10 +10,10 @@ import ( "github.com/hashicorp/nomad/plugins/drivers" ) -func NewExecutorWithIsolation(logger hclog.Logger) Executor { +func NewExecutorWithIsolation(logger hclog.Logger, cpuTotalTicks uint64) Executor { logger = logger.Named("executor") logger.Error("isolation executor is not supported on this platform, using default") - return NewExecutor(logger) + return NewExecutor(logger, cpuTotalTicks) } func (e *UniversalExecutor) configureResourceContainer(_ int) error { return nil } diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index 3cce11e7045..8573ec42a64 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -21,10 +21,9 @@ import ( "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/lib/resources" - "github.com/hashicorp/nomad/client/stats" cstructs "github.com/hashicorp/nomad/client/structs" "github.com/hashicorp/nomad/drivers/shared/capabilities" - shelpers "github.com/hashicorp/nomad/helper/stats" + "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/plugins/drivers" @@ -68,11 +67,10 @@ type LibcontainerExecutor struct { exitState *ProcessState } -func NewExecutorWithIsolation(logger hclog.Logger) Executor { +func NewExecutorWithIsolation(logger hclog.Logger, cpuTotalTicks uint64) Executor { logger = logger.Named("isolated_executor") - if err := shelpers.Init(); err != nil { - logger.Error("unable to initialize stats", "error", err) - } + stats.SetCpuTotalTicks(cpuTotalTicks) + return &LibcontainerExecutor{ id: strings.ReplaceAll(uuid.Generate(), "-", "_"), logger: logger, diff --git a/drivers/shared/executor/executor_linux_test.go b/drivers/shared/executor/executor_linux_test.go index 2ab1193cdc4..3946ba12d06 100644 --- a/drivers/shared/executor/executor_linux_test.go +++ b/drivers/shared/executor/executor_linux_test.go @@ -145,7 +145,7 @@ func TestExecutor_Isolation_PID_and_IPC_hostMode(t *testing.T) { execCmd.ModePID = "host" // disable PID namespace execCmd.ModeIPC = "host" // disable IPC namespace - executor := NewExecutorWithIsolation(testlog.HCLogger(t)) + executor := NewExecutorWithIsolation(testlog.HCLogger(t), 0) defer executor.Shutdown("SIGKILL", 0) ps, err := executor.Launch(execCmd) @@ -188,7 +188,7 @@ func TestExecutor_IsolationAndConstraints(t *testing.T) { execCmd.ModePID = "private" execCmd.ModeIPC = "private" - executor := NewExecutorWithIsolation(testlog.HCLogger(t)) + executor := NewExecutorWithIsolation(testlog.HCLogger(t), 0) defer executor.Shutdown("SIGKILL", 0) ps, err := executor.Launch(execCmd) @@ -279,7 +279,7 @@ func TestExecutor_CgroupPaths(t *testing.T) { execCmd.ResourceLimits = true - executor := NewExecutorWithIsolation(testlog.HCLogger(t)) + executor := NewExecutorWithIsolation(testlog.HCLogger(t), 0) defer executor.Shutdown("SIGKILL", 0) ps, err := executor.Launch(execCmd) @@ -341,7 +341,7 @@ func TestExecutor_CgroupPathsAreDestroyed(t *testing.T) { execCmd.ResourceLimits = true - executor := NewExecutorWithIsolation(testlog.HCLogger(t)) + executor := NewExecutorWithIsolation(testlog.HCLogger(t), 0) defer executor.Shutdown("SIGKILL", 0) ps, err := executor.Launch(execCmd) @@ -544,7 +544,7 @@ func TestExecutor_EscapeContainer(t *testing.T) { execCmd.ResourceLimits = true - executor := NewExecutorWithIsolation(testlog.HCLogger(t)) + executor := NewExecutorWithIsolation(testlog.HCLogger(t), 0) defer executor.Shutdown("SIGKILL", 0) _, err := executor.Launch(execCmd) @@ -594,7 +594,7 @@ func TestExecutor_DoesNotInheritOomScoreAdj(t *testing.T) { execCmd.Cmd = "/bin/bash" execCmd.Args = []string{"-c", "cat /proc/self/oom_score_adj"} - executor := NewExecutorWithIsolation(testlog.HCLogger(t)) + executor := NewExecutorWithIsolation(testlog.HCLogger(t), 0) defer executor.Shutdown("SIGKILL", 0) _, err = executor.Launch(execCmd) @@ -688,7 +688,7 @@ CapAmb: 0000000000000400`, execCmd.Capabilities = capsAllowed } - executor := NewExecutorWithIsolation(testlog.HCLogger(t)) + executor := NewExecutorWithIsolation(testlog.HCLogger(t), 0) defer executor.Shutdown("SIGKILL", 0) _, err := executor.Launch(execCmd) @@ -736,7 +736,7 @@ func TestExecutor_ClientCleanup(t *testing.T) { execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir defer allocDir.Destroy() - executor := NewExecutorWithIsolation(testlog.HCLogger(t)) + executor := NewExecutorWithIsolation(testlog.HCLogger(t), 0) defer executor.Shutdown("", 0) // Need to run a command which will produce continuous output but not @@ -861,7 +861,7 @@ func TestUniversalExecutor_NoCgroup(t *testing.T) { execCmd.BasicProcessCgroup = false execCmd.ResourceLimits = false - executor := NewExecutor(testlog.HCLogger(t)) + executor := NewExecutor(testlog.HCLogger(t), 0) defer executor.Shutdown("SIGKILL", 0) _, err = executor.Launch(execCmd) diff --git a/drivers/shared/executor/executor_plugin.go b/drivers/shared/executor/executor_plugin.go index 6eb7b356469..5e6c20d06f2 100644 --- a/drivers/shared/executor/executor_plugin.go +++ b/drivers/shared/executor/executor_plugin.go @@ -12,15 +12,16 @@ import ( type ExecutorPlugin struct { // TODO: support backwards compatibility with pre 0.9 NetRPC plugin plugin.NetRPCUnsupportedPlugin - logger hclog.Logger - fsIsolation bool + logger hclog.Logger + fsIsolation bool + cpuTotalTicks uint64 } func (p *ExecutorPlugin) GRPCServer(broker *plugin.GRPCBroker, s *grpc.Server) error { if p.fsIsolation { - proto.RegisterExecutorServer(s, &grpcExecutorServer{impl: NewExecutorWithIsolation(p.logger)}) + proto.RegisterExecutorServer(s, &grpcExecutorServer{impl: NewExecutorWithIsolation(p.logger, p.cpuTotalTicks)}) } else { - proto.RegisterExecutorServer(s, &grpcExecutorServer{impl: NewExecutor(p.logger)}) + proto.RegisterExecutorServer(s, &grpcExecutorServer{impl: NewExecutor(p.logger, p.cpuTotalTicks)}) } return nil } diff --git a/drivers/shared/executor/executor_test.go b/drivers/shared/executor/executor_test.go index e9cd821eb69..893a6e533b8 100644 --- a/drivers/shared/executor/executor_test.go +++ b/drivers/shared/executor/executor_test.go @@ -34,7 +34,7 @@ import ( var executorFactories = map[string]executorFactory{} type executorFactory struct { - new func(hclog.Logger) Executor + new func(hclog.Logger, uint64) Executor configureExecCmd func(*testing.T, *ExecCommand) } @@ -148,7 +148,7 @@ func TestExecutor_Start_Invalid(t *testing.T) { execCmd.Args = []string{"1"} factory.configureExecCmd(t, execCmd) defer allocDir.Destroy() - executor := factory.new(testlog.HCLogger(t)) + executor := factory.new(testlog.HCLogger(t), 0) defer executor.Shutdown("", 0) _, err := executor.Launch(execCmd) @@ -168,7 +168,7 @@ func TestExecutor_Start_Wait_Failure_Code(t *testing.T) { execCmd.Args = []string{"-c", "sleep 1; /bin/date fail"} factory.configureExecCmd(t, execCmd) defer allocDir.Destroy() - executor := factory.new(testlog.HCLogger(t)) + executor := factory.new(testlog.HCLogger(t), 0) defer executor.Shutdown("", 0) ps, err := executor.Launch(execCmd) @@ -193,7 +193,7 @@ func TestExecutor_Start_Wait(t *testing.T) { factory.configureExecCmd(t, execCmd) defer allocDir.Destroy() - executor := factory.new(testlog.HCLogger(t)) + executor := factory.new(testlog.HCLogger(t), 0) defer executor.Shutdown("", 0) ps, err := executor.Launch(execCmd) @@ -230,7 +230,7 @@ func TestExecutor_Start_Wait_Children(t *testing.T) { factory.configureExecCmd(t, execCmd) defer allocDir.Destroy() - executor := factory.new(testlog.HCLogger(t)) + executor := factory.new(testlog.HCLogger(t), 0) defer executor.Shutdown("SIGKILL", 0) ps, err := executor.Launch(execCmd) @@ -271,7 +271,7 @@ func TestExecutor_WaitExitSignal(t *testing.T) { factory.configureExecCmd(t, execCmd) defer allocDir.Destroy() - executor := factory.new(testlog.HCLogger(t)) + executor := factory.new(testlog.HCLogger(t), 0) defer executor.Shutdown("", 0) pState, err := executor.Launch(execCmd) @@ -329,7 +329,7 @@ func TestExecutor_Start_Kill(t *testing.T) { factory.configureExecCmd(t, execCmd) defer allocDir.Destroy() - executor := factory.new(testlog.HCLogger(t)) + executor := factory.new(testlog.HCLogger(t), 0) defer executor.Shutdown("", 0) ps, err := executor.Launch(execCmd) @@ -534,7 +534,7 @@ func TestExecutor_Start_Kill_Immediately_NoGrace(t *testing.T) { execCmd.Args = []string{"100"} factory.configureExecCmd(t, execCmd) defer allocDir.Destroy() - executor := factory.new(testlog.HCLogger(t)) + executor := factory.new(testlog.HCLogger(t), 0) defer executor.Shutdown("", 0) ps, err := executor.Launch(execCmd) @@ -570,7 +570,7 @@ func TestExecutor_Start_Kill_Immediately_WithGrace(t *testing.T) { execCmd.Args = []string{"100"} factory.configureExecCmd(t, execCmd) defer allocDir.Destroy() - executor := factory.new(testlog.HCLogger(t)) + executor := factory.new(testlog.HCLogger(t), 0) defer executor.Shutdown("", 0) ps, err := executor.Launch(execCmd) @@ -616,7 +616,7 @@ func TestExecutor_Start_NonExecutableBinaries(t *testing.T) { execCmd.Cmd = nonExecutablePath factory.configureExecCmd(t, execCmd) - executor := factory.new(testlog.HCLogger(t)) + executor := factory.new(testlog.HCLogger(t), 0) defer executor.Shutdown("", 0) // need to configure path in chroot with that file if using isolation executor diff --git a/drivers/shared/executor/pid_collector.go b/drivers/shared/executor/pid_collector.go index 2413f8ee5e0..9f937d06f25 100644 --- a/drivers/shared/executor/pid_collector.go +++ b/drivers/shared/executor/pid_collector.go @@ -8,7 +8,7 @@ import ( hclog "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/client/lib/resources" - "github.com/hashicorp/nomad/client/stats" + "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/plugins/drivers" ps "github.com/mitchellh/go-ps" "github.com/shirou/gopsutil/v3/process" diff --git a/drivers/shared/executor/plugins.go b/drivers/shared/executor/plugins.go index 0e3b977e906..73947b538a7 100644 --- a/drivers/shared/executor/plugins.go +++ b/drivers/shared/executor/plugins.go @@ -19,13 +19,18 @@ type ExecutorConfig struct { // FSIsolation if set will use an executor implementation that support // filesystem isolation FSIsolation bool + + // cpuTotalTicks is the total CPU compute. It should be given as Cores * MHz + // (2 Cores * 2 Ghz = 4000) + CpuTotalTicks uint64 } -func GetPluginMap(logger hclog.Logger, fsIsolation bool) map[string]plugin.Plugin { +func GetPluginMap(logger hclog.Logger, fsIsolation bool, cpuTotalTicks uint64) map[string]plugin.Plugin { return map[string]plugin.Plugin{ "executor": &ExecutorPlugin{ - logger: logger, - fsIsolation: fsIsolation, + logger: logger, + fsIsolation: fsIsolation, + cpuTotalTicks: cpuTotalTicks, }, } } diff --git a/drivers/shared/executor/utils.go b/drivers/shared/executor/utils.go index 237152a8b15..05948b3cf68 100644 --- a/drivers/shared/executor/utils.go +++ b/drivers/shared/executor/utils.go @@ -10,6 +10,7 @@ import ( hclog "github.com/hashicorp/go-hclog" plugin "github.com/hashicorp/go-plugin" "github.com/hashicorp/nomad/drivers/shared/executor/proto" + "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/plugins/base" ) @@ -28,6 +29,7 @@ const ( func CreateExecutor(logger hclog.Logger, driverConfig *base.ClientDriverConfig, executorConfig *ExecutorConfig) (Executor, *plugin.Client, error) { + executorConfig.CpuTotalTicks = stats.CpuTotalTicks() c, err := json.Marshal(executorConfig) if err != nil { return nil, nil, fmt.Errorf("unable to create executor config: %v", err) @@ -38,8 +40,9 @@ func CreateExecutor(logger hclog.Logger, driverConfig *base.ClientDriverConfig, } p := &ExecutorPlugin{ - logger: logger, - fsIsolation: executorConfig.FSIsolation, + logger: logger, + fsIsolation: executorConfig.FSIsolation, + cpuTotalTicks: executorConfig.CpuTotalTicks, } config := &plugin.ClientConfig{ @@ -72,7 +75,7 @@ func ReattachToExecutor(reattachConfig *plugin.ReattachConfig, logger hclog.Logg config := &plugin.ClientConfig{ HandshakeConfig: base.Handshake, Reattach: reattachConfig, - Plugins: GetPluginMap(logger, false), + Plugins: GetPluginMap(logger, false, stats.CpuTotalTicks()), AllowedProtocols: []plugin.Protocol{plugin.ProtocolGRPC}, Logger: logger.Named("executor"), } diff --git a/drivers/shared/executor/z_executor_cmd.go b/drivers/shared/executor/z_executor_cmd.go index 5a5f13b6230..56ec3fef2b5 100644 --- a/drivers/shared/executor/z_executor_cmd.go +++ b/drivers/shared/executor/z_executor_cmd.go @@ -46,6 +46,7 @@ func init() { Plugins: GetPluginMap( logger, executorConfig.FSIsolation, + executorConfig.CpuTotalTicks, ), GRPCServer: plugin.DefaultGRPCServer, Logger: logger, diff --git a/helper/stats/cpu.go b/helper/stats/cpu.go index d2754204eb4..02b38069903 100644 --- a/helper/stats/cpu.go +++ b/helper/stats/cpu.go @@ -1,109 +1,73 @@ package stats import ( - "context" - "errors" - "fmt" - "sync" + "runtime" "time" - - "github.com/shirou/gopsutil/v3/cpu" - "github.com/shoenig/go-m1cpu" -) - -const ( - // cpuInfoTimeout is the timeout used when gathering CPU info. This is used - // to override the default timeout in gopsutil which has a tendency to - // timeout on Windows. - cpuInfoTimeout = 60 * time.Second -) - -var ( - cpuPowerCoreCount int - cpuPowerCoreMHz uint64 - cpuEfficiencyCoreCount int - cpuEfficiencyCoreMHz uint64 - cpuTotalTicks uint64 - cpuModelName string ) var ( - initErr error - onceLer sync.Once + cpuTotalTicks uint64 ) -func Init() error { - onceLer.Do(func() { - switch { - case m1cpu.IsAppleSilicon(): - cpuModelName = m1cpu.ModelName() - cpuPowerCoreCount = m1cpu.PCoreCount() - cpuPowerCoreMHz = m1cpu.PCoreHz() / 1_000_000 - cpuEfficiencyCoreCount = m1cpu.ECoreCount() - cpuEfficiencyCoreMHz = m1cpu.ECoreHz() / 1_000_000 - bigTicks := uint64(cpuPowerCoreCount) * cpuPowerCoreMHz - littleTicks := uint64(cpuEfficiencyCoreCount) * cpuEfficiencyCoreMHz - cpuTotalTicks = bigTicks + littleTicks - default: - // for now, all other cpu types assume only power cores - // todo: this is already not true for Intel 13th generation - - var err error - if cpuPowerCoreCount, err = cpu.Counts(true); err != nil { - initErr = errors.Join(initErr, fmt.Errorf("failed to detect number of CPU cores: %w", err)) - } - - ctx, cancel := context.WithTimeout(context.Background(), cpuInfoTimeout) - defer cancel() +// CpuStats calculates cpu usage percentage +type CpuStats struct { + prevCpuTime float64 + prevTime time.Time - var cpuInfoStats []cpu.InfoStat - if cpuInfoStats, err = cpu.InfoWithContext(ctx); err != nil { - initErr = errors.Join(initErr, fmt.Errorf("Unable to obtain CPU information: %w", err)) - } - - for _, infoStat := range cpuInfoStats { - cpuModelName = infoStat.ModelName - if uint64(infoStat.Mhz) > cpuPowerCoreMHz { - cpuPowerCoreMHz = uint64(infoStat.Mhz) - } - } + totalCpus int +} - // compute ticks using only power core, until we add support for - // detecting little cores on non-apple platforms - cpuTotalTicks = uint64(cpuPowerCoreCount) * cpuPowerCoreMHz +// NewCpuStats returns a cpu stats calculator +func NewCpuStats() *CpuStats { + numCpus := runtime.NumCPU() + cpuStats := &CpuStats{ + totalCpus: numCpus, + } + return cpuStats +} - initErr = err - } - }) - return initErr +// Percent calculates the cpu usage percentage based on the current cpu usage +// and the previous cpu usage where usage is given as time in nanoseconds spend +// in the cpu +func (c *CpuStats) Percent(cpuTime float64) float64 { + now := time.Now() + + if c.prevCpuTime == 0.0 { + // invoked first time + c.prevCpuTime = cpuTime + c.prevTime = now + return 0.0 + } + + timeDelta := now.Sub(c.prevTime).Nanoseconds() + ret := c.calculatePercent(c.prevCpuTime, cpuTime, timeDelta) + c.prevCpuTime = cpuTime + c.prevTime = now + return ret } -// CPUNumCores returns the number of CPU cores available. -// -// This is represented with two values - (Power (P), Efficiency (E)) so we can -// correctly compute total compute for processors with asymetric cores such as -// Apple Silicon. -// -// For platforms with symetric cores (or where we do not correcly detect asymetric -// cores), all cores are presented as P cores. -func CPUNumCores() (int, int) { - return cpuPowerCoreCount, cpuEfficiencyCoreCount +// TicksConsumed calculates the total ticks consumes by the process across all +// cpu cores +func (c *CpuStats) TicksConsumed(percent float64) float64 { + return (percent / 100) * float64(CpuTotalTicks()) / float64(c.totalCpus) } -// CPUMHzPerCore returns the MHz per CPU (P, E) core type. -// -// As with CPUNumCores, asymetric core detection currently only works with -// Apple Silicon CPUs. -func CPUMHzPerCore() (uint64, uint64) { - return cpuPowerCoreMHz, cpuEfficiencyCoreMHz +func (c *CpuStats) calculatePercent(t1, t2 float64, timeDelta int64) float64 { + vDelta := t2 - t1 + if timeDelta <= 0 || vDelta <= 0.0 { + return 0.0 + } + + overall_percent := (vDelta / float64(timeDelta)) * 100.0 + return overall_percent } -// CPUModelName returns the model name of the CPU. -func CPUModelName() string { - return cpuModelName +// Set the total ticks available across all cores. +func SetCpuTotalTicks(newCpuTotalTicks uint64) { + cpuTotalTicks = newCpuTotalTicks } -// TotalTicksAvailable calculates the total MHz available across all cores. +// CpuTotalTicks calculates the total MHz available across all cores. // // Where asymetric cores are correctly detected, the total ticks is the sum of // the performance across both core types. @@ -111,6 +75,6 @@ func CPUModelName() string { // Where asymetric cores are not correctly detected (such as Intel 13th gen), // the total ticks available is over-estimated, as we assume all cores are P // cores. -func TotalTicksAvailable() uint64 { +func CpuTotalTicks() uint64 { return cpuTotalTicks } diff --git a/helper/stats/cpu_test.go b/helper/stats/cpu_test.go new file mode 100644 index 00000000000..c26cd0d1c66 --- /dev/null +++ b/helper/stats/cpu_test.go @@ -0,0 +1,24 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: MPL-2.0 + +package stats + +import ( + "testing" + "time" + + "github.com/hashicorp/nomad/ci" +) + +func TestCpuStatsPercent(t *testing.T) { + ci.Parallel(t) + + cs := NewCpuStats() + cs.Percent(79.7) + time.Sleep(1 * time.Second) + percent := cs.Percent(80.69) + expectedPercent := 98.00 + if percent < expectedPercent && percent > (expectedPercent+1.00) { + t.Fatalf("expected: %v, actual: %v", expectedPercent, percent) + } +}