Skip to content

Commit

Permalink
Use config "cpu_total_compute" (if set) for all CPU statistics (#17628)
Browse files Browse the repository at this point in the history
(manual backport of e190eae)

Before this commit, it was only used for fingerprinting, but not
for CPU stats on nodes or tasks. This meant that if the
auto-detection failed, setting the cpu_total_compute didn't resolved
the issue.

This issue was most noticeable on ARM64, as there auto-detection
always failed.
  • Loading branch information
TrueBrain authored and shoenig committed Jul 19, 2023
1 parent 9a30e37 commit 438954e
Show file tree
Hide file tree
Showing 21 changed files with 238 additions and 204 deletions.
3 changes: 3 additions & 0 deletions .changelog/17628.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
cpustats: Use config "cpu_total_compute" (if set) for all CPU statistics
```
15 changes: 7 additions & 8 deletions client/fingerprint/cpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ import (
"github.com/hashicorp/nomad/lib/cpuset"

"github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/helper/stats"
"github.com/hashicorp/nomad/client/stats"
shelpers "github.com/hashicorp/nomad/helper/stats"
"github.com/hashicorp/nomad/nomad/structs"
)

Expand Down Expand Up @@ -39,7 +40,7 @@ func NewCPUFingerprint(logger hclog.Logger) Fingerprint {
}

func (f *CPUFingerprint) Fingerprint(request *FingerprintRequest, response *FingerprintResponse) error {
f.initialize()
f.initialize(request)

f.setModelName(response)

Expand All @@ -58,8 +59,8 @@ func (f *CPUFingerprint) Fingerprint(request *FingerprintRequest, response *Fing
return nil
}

func (f *CPUFingerprint) initialize() {
if err := stats.Init(); err != nil {
func (f *CPUFingerprint) initialize(request *FingerprintRequest) {
if err := stats.Init(uint64(request.Config.CpuCompute)); err != nil {
f.logger.Warn("failed initializing stats collector", "error", err)
}
}
Expand Down Expand Up @@ -131,10 +132,8 @@ func (f *CPUFingerprint) setReservableCores(request *FingerprintRequest, respons
func (f *CPUFingerprint) setTotalCompute(request *FingerprintRequest, response *FingerprintResponse) {
var ticks uint64
switch {
case request.Config.CpuCompute > 0:
ticks = uint64(request.Config.CpuCompute)
case stats.TotalTicksAvailable() > 0:
ticks = stats.TotalTicksAvailable()
case shelpers.CpuTotalTicks() > 0:
ticks = shelpers.CpuTotalTicks()
default:
ticks = defaultCPUTicks
}
Expand Down
2 changes: 2 additions & 0 deletions client/fingerprint/env_aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
log "github.com/hashicorp/go-hclog"

cleanhttp "github.com/hashicorp/go-cleanhttp"
"github.com/hashicorp/nomad/helper/stats"
"github.com/hashicorp/nomad/nomad/structs"
)

Expand Down Expand Up @@ -182,6 +183,7 @@ func (f *EnvAWSFingerprint) Fingerprint(request *FingerprintRequest, response *F
if ticks := specs.Ticks(); request.Config.CpuCompute <= 0 {
response.AddAttribute("cpu.totalcompute", fmt.Sprintf("%d", ticks))
f.logger.Debug("setting ec2 cpu", "ticks", ticks)
stats.SetCpuTotalTicks(uint64(ticks))
resources = new(structs.Resources)
resources.CPU = ticks
if nodeResources == nil {
Expand Down
2 changes: 1 addition & 1 deletion client/lib/resources/pid.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package resources

import (
"github.com/hashicorp/nomad/client/stats"
"github.com/hashicorp/nomad/helper/stats"
)

// PIDs holds all of a task's pids and their cpu percentage calculators
Expand Down
140 changes: 96 additions & 44 deletions client/stats/cpu.go
Original file line number Diff line number Diff line change
@@ -1,64 +1,115 @@
package stats

import (
"runtime"
"context"
"errors"
"fmt"
"sync"
"time"

shelpers "github.com/hashicorp/nomad/helper/stats"
"github.com/hashicorp/nomad/helper/stats"
"github.com/shirou/gopsutil/v3/cpu"
"github.com/shoenig/go-m1cpu"
)

// CpuStats calculates cpu usage percentage
type CpuStats struct {
prevCpuTime float64
prevTime time.Time
const (
// cpuInfoTimeout is the timeout used when gathering CPU info. This is used
// to override the default timeout in gopsutil which has a tendency to
// timeout on Windows.
cpuInfoTimeout = 60 * time.Second
)

totalCpus int
}
var (
cpuPowerCoreCount int
cpuPowerCoreMHz uint64
cpuEfficiencyCoreCount int
cpuEfficiencyCoreMHz uint64
cpuModelName string
)

// NewCpuStats returns a cpu stats calculator
func NewCpuStats() *CpuStats {
numCpus := runtime.NumCPU()
cpuStats := &CpuStats{
totalCpus: numCpus,
}
return cpuStats
}
var (
detectedCpuTotalTicks uint64
initErr error
onceLer sync.Once
)

func Init(configCpuTotalCompute uint64) error {
onceLer.Do(func() {
switch {
case m1cpu.IsAppleSilicon():
cpuModelName = m1cpu.ModelName()
cpuPowerCoreCount = m1cpu.PCoreCount()
cpuPowerCoreMHz = m1cpu.PCoreHz() / 1_000_000
cpuEfficiencyCoreCount = m1cpu.ECoreCount()
cpuEfficiencyCoreMHz = m1cpu.ECoreHz() / 1_000_000
bigTicks := uint64(cpuPowerCoreCount) * cpuPowerCoreMHz
littleTicks := uint64(cpuEfficiencyCoreCount) * cpuEfficiencyCoreMHz
detectedCpuTotalTicks = bigTicks + littleTicks
default:
// for now, all other cpu types assume only power cores
// todo: this is already not true for Intel 13th generation

var err error
if cpuPowerCoreCount, err = cpu.Counts(true); err != nil {
initErr = errors.Join(initErr, fmt.Errorf("failed to detect number of CPU cores: %w", err))
}

ctx, cancel := context.WithTimeout(context.Background(), cpuInfoTimeout)
defer cancel()

// Percent calculates the cpu usage percentage based on the current cpu usage
// and the previous cpu usage where usage is given as time in nanoseconds spend
// in the cpu
func (c *CpuStats) Percent(cpuTime float64) float64 {
now := time.Now()

if c.prevCpuTime == 0.0 {
// invoked first time
c.prevCpuTime = cpuTime
c.prevTime = now
return 0.0
var cpuInfoStats []cpu.InfoStat
if cpuInfoStats, err = cpu.InfoWithContext(ctx); err != nil {
initErr = errors.Join(initErr, fmt.Errorf("Unable to obtain CPU information: %w", err))
}

for _, infoStat := range cpuInfoStats {
cpuModelName = infoStat.ModelName
if uint64(infoStat.Mhz) > cpuPowerCoreMHz {
cpuPowerCoreMHz = uint64(infoStat.Mhz)
}
}

// compute ticks using only power core, until we add support for
// detecting little cores on non-apple platforms
detectedCpuTotalTicks = uint64(cpuPowerCoreCount) * cpuPowerCoreMHz

initErr = err
}

stats.SetCpuTotalTicks(detectedCpuTotalTicks)
})

// override the computed value with the config value if it is set
if configCpuTotalCompute > 0 {
stats.SetCpuTotalTicks(configCpuTotalCompute)
}

timeDelta := now.Sub(c.prevTime).Nanoseconds()
ret := c.calculatePercent(c.prevCpuTime, cpuTime, timeDelta)
c.prevCpuTime = cpuTime
c.prevTime = now
return ret
return initErr
}

// TicksConsumed calculates the total ticks consumes by the process across all
// cpu cores
func (c *CpuStats) TicksConsumed(percent float64) float64 {
return (percent / 100) * float64(shelpers.TotalTicksAvailable()) / float64(c.totalCpus)
// CPUNumCores returns the number of CPU cores available.
//
// This is represented with two values - (Power (P), Efficiency (E)) so we can
// correctly compute total compute for processors with asymetric cores such as
// Apple Silicon.
//
// For platforms with symetric cores (or where we do not correcly detect asymetric
// cores), all cores are presented as P cores.
func CPUNumCores() (int, int) {
return cpuPowerCoreCount, cpuEfficiencyCoreCount
}

func (c *CpuStats) calculatePercent(t1, t2 float64, timeDelta int64) float64 {
vDelta := t2 - t1
if timeDelta <= 0 || vDelta <= 0.0 {
return 0.0
}
// CPUMHzPerCore returns the MHz per CPU (P, E) core type.
//
// As with CPUNumCores, asymetric core detection currently only works with
// Apple Silicon CPUs.
func CPUMHzPerCore() (uint64, uint64) {
return cpuPowerCoreMHz, cpuEfficiencyCoreMHz
}

overall_percent := (vDelta / float64(timeDelta)) * 100.0
return overall_percent
// CPUModelName returns the model name of the CPU.
func CPUModelName() string {
return cpuModelName
}

func (h *HostStatsCollector) collectCPUStats() (cpus []*CPUStats, totalTicks float64, err error) {
Expand All @@ -76,14 +127,15 @@ func (h *HostStatsCollector) collectCPUStats() (cpus []*CPUStats, totalTicks flo
h.statsCalculator[cpuStat.CPU] = percentCalculator
}
idle, user, system, total := percentCalculator.Calculate(cpuStat)
ticks := (total / 100.0) * (float64(stats.CpuTotalTicks()) / float64(len(cpuStats)))
cs[idx] = &CPUStats{
CPU: cpuStat.CPU,
User: user,
System: system,
Idle: idle,
Total: total,
}
ticksConsumed += (total / 100.0) * (float64(shelpers.TotalTicksAvailable()) / float64(len(cpuStats)))
ticksConsumed += ticks
}

return cs, ticksConsumed, nil
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ func TestCPU_CPUModelName(t *testing.T) {
must.NotEq(t, "", name)
}

func TestCPU_CPUTotalTicksAvailable(t *testing.T) {
ticks := TotalTicksAvailable()
func TestCPU_CPUCpuTotalTicks(t *testing.T) {
ticks := CpuTotalTicks()
must.Positive(t, ticks)
}
17 changes: 1 addition & 16 deletions client/stats/cpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,17 @@ import (
"math"
"os"
"testing"
"time"

"github.com/hashicorp/nomad/ci"
shelpers "github.com/hashicorp/nomad/helper/stats"
"github.com/hashicorp/nomad/helper/testlog"
"github.com/stretchr/testify/assert"
)

func TestCpuStatsPercent(t *testing.T) {
ci.Parallel(t)

cs := NewCpuStats()
cs.Percent(79.7)
time.Sleep(1 * time.Second)
percent := cs.Percent(80.69)
expectedPercent := 98.00
if percent < expectedPercent && percent > (expectedPercent+1.00) {
t.Fatalf("expected: %v, actual: %v", expectedPercent, percent)
}
}

func TestHostStats_CPU(t *testing.T) {
ci.Parallel(t)

assert := assert.New(t)
assert.Nil(shelpers.Init())
assert.Nil(Init(0))

logger := testlog.HCLogger(t)
cwd, err := os.Getwd()
Expand Down
2 changes: 1 addition & 1 deletion drivers/docker/util/stats_posix.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func DockerStatsToTaskResourceUsage(s *docker.Stats) *cstructs.TaskResourceUsage
cs.UserMode = CalculateCPUPercent(
s.CPUStats.CPUUsage.UsageInUsermode, s.PreCPUStats.CPUUsage.UsageInUsermode,
s.CPUStats.CPUUsage.TotalUsage, s.PreCPUStats.CPUUsage.TotalUsage, runtime.NumCPU())
cs.TotalTicks = (cs.Percent / 100) * float64(stats.TotalTicksAvailable()) / float64(runtime.NumCPU())
cs.TotalTicks = (cs.Percent / 100) * float64(stats.CpuTotalTicks()) / float64(runtime.NumCPU())

return &cstructs.TaskResourceUsage{
ResourceUsage: &cstructs.ResourceUsage{
Expand Down
2 changes: 1 addition & 1 deletion drivers/docker/util/stats_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ func DockerStatsToTaskResourceUsage(s *docker.Stats) *cstructs.TaskResourceUsage
ThrottledPeriods: s.CPUStats.ThrottlingData.ThrottledPeriods,
ThrottledTime: s.CPUStats.ThrottlingData.ThrottledTime,
Percent: cpuPercent,
TotalTicks: (cpuPercent / 100) * float64(stats.TotalTicksAvailable()) / float64(runtime.NumCPU()),
TotalTicks: (cpuPercent / 100) * float64(stats.CpuTotalTicks()) / float64(runtime.NumCPU()),
Measured: DockerMeasuredCPUStats,
}

Expand Down
9 changes: 3 additions & 6 deletions drivers/shared/executor/executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,8 @@ import (
"github.com/hashicorp/nomad/client/allocdir"
"github.com/hashicorp/nomad/client/lib/fifo"
"github.com/hashicorp/nomad/client/lib/resources"
"github.com/hashicorp/nomad/client/stats"
cstructs "github.com/hashicorp/nomad/client/structs"
shelpers "github.com/hashicorp/nomad/helper/stats"
"github.com/hashicorp/nomad/helper/stats"
"github.com/hashicorp/nomad/plugins/drivers"
"github.com/syndtr/gocapability/capability"
)
Expand Down Expand Up @@ -257,11 +256,9 @@ type UniversalExecutor struct {
}

// NewExecutor returns an Executor
func NewExecutor(logger hclog.Logger) Executor {
func NewExecutor(logger hclog.Logger, cpuTotalTicks uint64) Executor {
logger = logger.Named("executor")
if err := shelpers.Init(); err != nil {
logger.Error("unable to initialize stats", "error", err)
}
stats.SetCpuTotalTicks(cpuTotalTicks)

return &UniversalExecutor{
logger: logger,
Expand Down
4 changes: 2 additions & 2 deletions drivers/shared/executor/executor_basic.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ import (
"github.com/hashicorp/nomad/plugins/drivers"
)

func NewExecutorWithIsolation(logger hclog.Logger) Executor {
func NewExecutorWithIsolation(logger hclog.Logger, cpuTotalTicks uint64) Executor {
logger = logger.Named("executor")
logger.Error("isolation executor is not supported on this platform, using default")
return NewExecutor(logger)
return NewExecutor(logger, cpuTotalTicks)
}

func (e *UniversalExecutor) configureResourceContainer(_ int) error { return nil }
Expand Down
10 changes: 4 additions & 6 deletions drivers/shared/executor/executor_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,9 @@ import (
"github.com/hashicorp/nomad/client/allocdir"
"github.com/hashicorp/nomad/client/lib/cgutil"
"github.com/hashicorp/nomad/client/lib/resources"
"github.com/hashicorp/nomad/client/stats"
cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/drivers/shared/capabilities"
shelpers "github.com/hashicorp/nomad/helper/stats"
"github.com/hashicorp/nomad/helper/stats"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/plugins/drivers"
Expand Down Expand Up @@ -68,11 +67,10 @@ type LibcontainerExecutor struct {
exitState *ProcessState
}

func NewExecutorWithIsolation(logger hclog.Logger) Executor {
func NewExecutorWithIsolation(logger hclog.Logger, cpuTotalTicks uint64) Executor {
logger = logger.Named("isolated_executor")
if err := shelpers.Init(); err != nil {
logger.Error("unable to initialize stats", "error", err)
}
stats.SetCpuTotalTicks(cpuTotalTicks)

return &LibcontainerExecutor{
id: strings.ReplaceAll(uuid.Generate(), "-", "_"),
logger: logger,
Expand Down
Loading

0 comments on commit 438954e

Please sign in to comment.