From b70fe666d0ff2b7a7601b4f306250e380f46a1ca Mon Sep 17 00:00:00 2001 From: russdirks <7891737+turboboost55@users.noreply.github.com> Date: Tue, 28 Feb 2023 17:51:05 -0800 Subject: [PATCH 1/8] metrics: measure collection frequency --- metrics/metrics.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/metrics/metrics.go b/metrics/metrics.go index 2edf8e35f151..e7225a35daeb 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -127,8 +127,6 @@ func CollectProcessMetrics(refresh time.Duration) { return } - refreshFreq := int64(refresh / time.Second) - // Create the various data collectors var ( cpustats = make([]CPUStats, 2) @@ -163,9 +161,15 @@ func CollectProcessMetrics(refresh time.Duration) { diskWriteBytesCounter = GetOrRegisterCounter("system/disk/writebytes", DefaultRegistry) ) + // Avoid divide-by-zero the first time through the loop. + lastCollectionTime := time.Now().Add(-refresh) + // Iterate loading the different stats and updating the meters. now, prev := 0, 1 for ; ; now, prev = prev, now { + + refreshFreq := int64(time.Since(lastCollectionTime) / time.Second) + // CPU ReadCPUStats(&cpustats[now]) cpuSysLoad.Update((cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / refreshFreq) @@ -199,6 +203,7 @@ func CollectProcessMetrics(refresh time.Duration) { diskWriteBytesCounter.Inc(diskstats[now].WriteBytes - diskstats[prev].WriteBytes) } + lastCollectionTime = time.Now() time.Sleep(refresh) } } From 38825205920fc141d03aa306819317acb1b65d5d Mon Sep 17 00:00:00 2001 From: russdirks <7891737+turboboost55@users.noreply.github.com> Date: Thu, 2 Mar 2023 10:49:43 -0800 Subject: [PATCH 2/8] keep the linter happy --- metrics/metrics.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/metrics/metrics.go b/metrics/metrics.go index e7225a35daeb..db228f74b4b2 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -167,11 +167,9 @@ func CollectProcessMetrics(refresh time.Duration) { // Iterate loading the different stats and updating the meters. now, prev := 0, 1 for ; ; now, prev = prev, now { - - refreshFreq := int64(time.Since(lastCollectionTime) / time.Second) - // CPU ReadCPUStats(&cpustats[now]) + refreshFreq := int64(time.Since(lastCollectionTime) / time.Second) cpuSysLoad.Update((cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / refreshFreq) cpuSysWait.Update((cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / refreshFreq) cpuProcLoad.Update((cpustats[now].LocalTime - cpustats[prev].LocalTime) / refreshFreq) From 3731a897be132df9a74c29acad1daef0d565360d Mon Sep 17 00:00:00 2001 From: russdirks <7891737+turboboost55@users.noreply.github.com> Date: Fri, 3 Mar 2023 11:34:58 -0800 Subject: [PATCH 3/8] avoid loss of precision due to truncation --- metrics/metrics.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/metrics/metrics.go b/metrics/metrics.go index db228f74b4b2..502b3177d52f 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -169,10 +169,10 @@ func CollectProcessMetrics(refresh time.Duration) { for ; ; now, prev = prev, now { // CPU ReadCPUStats(&cpustats[now]) - refreshFreq := int64(time.Since(lastCollectionTime) / time.Second) - cpuSysLoad.Update((cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / refreshFreq) - cpuSysWait.Update((cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / refreshFreq) - cpuProcLoad.Update((cpustats[now].LocalTime - cpustats[prev].LocalTime) / refreshFreq) + refreshFreq := time.Since(lastCollectionTime).Seconds() + cpuSysLoad.Update(int64(float64(cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / refreshFreq)) + cpuSysWait.Update(int64(float64(cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / refreshFreq)) + cpuProcLoad.Update(int64(float64(cpustats[now].LocalTime - cpustats[prev].LocalTime) / refreshFreq)) // Threads cpuThreads.Update(int64(threadCreateProfile.Count())) From 272b00d186c2fa60feeb5fac90e9e01239401a75 Mon Sep 17 00:00:00 2001 From: russdirks <7891737+turboboost55@users.noreply.github.com> Date: Sun, 5 Mar 2023 06:21:17 -0800 Subject: [PATCH 4/8] linter issues --- metrics/metrics.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metrics/metrics.go b/metrics/metrics.go index 502b3177d52f..1c190a1ee28a 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -170,9 +170,9 @@ func CollectProcessMetrics(refresh time.Duration) { // CPU ReadCPUStats(&cpustats[now]) refreshFreq := time.Since(lastCollectionTime).Seconds() - cpuSysLoad.Update(int64(float64(cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / refreshFreq)) - cpuSysWait.Update(int64(float64(cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / refreshFreq)) - cpuProcLoad.Update(int64(float64(cpustats[now].LocalTime - cpustats[prev].LocalTime) / refreshFreq)) + cpuSysLoad.Update(int64(float64(cpustats[now].GlobalTime-cpustats[prev].GlobalTime) / refreshFreq)) + cpuSysWait.Update(int64(float64(cpustats[now].GlobalWait-cpustats[prev].GlobalWait) / refreshFreq)) + cpuProcLoad.Update(int64(float64(cpustats[now].LocalTime-cpustats[prev].LocalTime) / refreshFreq)) // Threads cpuThreads.Update(int64(threadCreateProfile.Count())) From 66ed5720585afa26278da47b1a987f8e6069fbb5 Mon Sep 17 00:00:00 2001 From: Felix Lange Date: Mon, 6 Mar 2023 16:32:56 +0100 Subject: [PATCH 5/8] metrics: use float64 gauge --- metrics/metrics.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/metrics/metrics.go b/metrics/metrics.go index 1c190a1ee28a..df51ce528e00 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -141,9 +141,9 @@ func CollectProcessMetrics(refresh time.Duration) { // Define the various metrics to collect var ( - cpuSysLoad = GetOrRegisterGauge("system/cpu/sysload", DefaultRegistry) - cpuSysWait = GetOrRegisterGauge("system/cpu/syswait", DefaultRegistry) - cpuProcLoad = GetOrRegisterGauge("system/cpu/procload", DefaultRegistry) + cpuSysLoad = GetOrRegisterGaugeFloat64("system/cpu/sysload", DefaultRegistry) + cpuSysWait = GetOrRegisterGaugeFloat64("system/cpu/syswait", DefaultRegistry) + cpuProcLoad = GetOrRegisterGaugeFloat64("system/cpu/procload", DefaultRegistry) cpuThreads = GetOrRegisterGauge("system/cpu/threads", DefaultRegistry) cpuGoroutines = GetOrRegisterGauge("system/cpu/goroutines", DefaultRegistry) cpuSchedLatency = getOrRegisterRuntimeHistogram("system/cpu/schedlatency", secondsToNs, nil) @@ -170,9 +170,9 @@ func CollectProcessMetrics(refresh time.Duration) { // CPU ReadCPUStats(&cpustats[now]) refreshFreq := time.Since(lastCollectionTime).Seconds() - cpuSysLoad.Update(int64(float64(cpustats[now].GlobalTime-cpustats[prev].GlobalTime) / refreshFreq)) - cpuSysWait.Update(int64(float64(cpustats[now].GlobalWait-cpustats[prev].GlobalWait) / refreshFreq)) - cpuProcLoad.Update(int64(float64(cpustats[now].LocalTime-cpustats[prev].LocalTime) / refreshFreq)) + cpuSysLoad.Update(float64(cpustats[now].GlobalTime-cpustats[prev].GlobalTime) / refreshFreq) + cpuSysWait.Update(float64(cpustats[now].GlobalWait-cpustats[prev].GlobalWait) / refreshFreq) + cpuProcLoad.Update(float64(cpustats[now].LocalTime-cpustats[prev].LocalTime) / refreshFreq) // Threads cpuThreads.Update(int64(threadCreateProfile.Count())) From 0c9e7142e2564e6dddd025c0150fa987d373475e Mon Sep 17 00:00:00 2001 From: Felix Lange Date: Mon, 6 Mar 2023 21:18:25 +0100 Subject: [PATCH 6/8] metrics: compute CPU percentage metrics using float64 as base unit --- metrics/cpu.go | 7 ++++--- metrics/cpu_enabled.go | 4 ++-- metrics/cputime_nop.go | 2 +- metrics/cputime_unix.go | 4 ++-- metrics/metrics.go | 18 +++++++++++------- 5 files changed, 20 insertions(+), 15 deletions(-) diff --git a/metrics/cpu.go b/metrics/cpu.go index 72ece16e0768..3a49cd42493a 100644 --- a/metrics/cpu.go +++ b/metrics/cpu.go @@ -17,8 +17,9 @@ package metrics // CPUStats is the system and process CPU stats. +// All values are in seconds. type CPUStats struct { - GlobalTime int64 // Time spent by the CPU working on all processes - GlobalWait int64 // Time spent by waiting on disk for all processes - LocalTime int64 // Time spent by the CPU working on this process + GlobalTime float64 // Time spent by the CPU working on all processes + GlobalWait float64 // Time spent by waiting on disk for all processes + LocalTime float64 // Time spent by the CPU working on this process } diff --git a/metrics/cpu_enabled.go b/metrics/cpu_enabled.go index 533d40b85a58..2359028a2120 100644 --- a/metrics/cpu_enabled.go +++ b/metrics/cpu_enabled.go @@ -38,7 +38,7 @@ func ReadCPUStats(stats *CPUStats) { } // requesting all cpu times will always return an array with only one time stats entry timeStat := timeStats[0] - stats.GlobalTime = int64((timeStat.User + timeStat.Nice + timeStat.System) * cpu.ClocksPerSec) - stats.GlobalWait = int64((timeStat.Iowait) * cpu.ClocksPerSec) + stats.GlobalTime = timeStat.User + timeStat.Nice + timeStat.System + stats.GlobalWait = timeStat.Iowait stats.LocalTime = getProcessCPUTime() } diff --git a/metrics/cputime_nop.go b/metrics/cputime_nop.go index 0188735a7833..465d88c4d232 100644 --- a/metrics/cputime_nop.go +++ b/metrics/cputime_nop.go @@ -21,6 +21,6 @@ package metrics // getProcessCPUTime returns 0 on Windows as there is no system call to resolve // the actual process' CPU time. -func getProcessCPUTime() int64 { +func getProcessCPUTime() float64 { return 0 } diff --git a/metrics/cputime_unix.go b/metrics/cputime_unix.go index 3c56a75d0077..ad4f812fd285 100644 --- a/metrics/cputime_unix.go +++ b/metrics/cputime_unix.go @@ -26,11 +26,11 @@ import ( ) // getProcessCPUTime retrieves the process' CPU time since program startup. -func getProcessCPUTime() int64 { +func getProcessCPUTime() float64 { var usage syscall.Rusage if err := syscall.Getrusage(syscall.RUSAGE_SELF, &usage); err != nil { log.Warn("Failed to retrieve CPU time", "err", err) return 0 } - return int64(usage.Utime.Sec+usage.Stime.Sec)*100 + int64(usage.Utime.Usec+usage.Stime.Usec)/10000 //nolint:unconvert + return float64(usage.Utime.Sec+usage.Stime.Sec) + float64(usage.Utime.Usec+usage.Stime.Usec)/1000000 //nolint:unconvert } diff --git a/metrics/metrics.go b/metrics/metrics.go index df51ce528e00..2d537a3b2dc7 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -141,9 +141,9 @@ func CollectProcessMetrics(refresh time.Duration) { // Define the various metrics to collect var ( - cpuSysLoad = GetOrRegisterGaugeFloat64("system/cpu/sysload", DefaultRegistry) - cpuSysWait = GetOrRegisterGaugeFloat64("system/cpu/syswait", DefaultRegistry) - cpuProcLoad = GetOrRegisterGaugeFloat64("system/cpu/procload", DefaultRegistry) + cpuSysLoad = GetOrRegisterGauge("system/cpu/sysload", DefaultRegistry) + cpuSysWait = GetOrRegisterGauge("system/cpu/syswait", DefaultRegistry) + cpuProcLoad = GetOrRegisterGauge("system/cpu/procload", DefaultRegistry) cpuThreads = GetOrRegisterGauge("system/cpu/threads", DefaultRegistry) cpuGoroutines = GetOrRegisterGauge("system/cpu/goroutines", DefaultRegistry) cpuSchedLatency = getOrRegisterRuntimeHistogram("system/cpu/schedlatency", secondsToNs, nil) @@ -167,12 +167,16 @@ func CollectProcessMetrics(refresh time.Duration) { // Iterate loading the different stats and updating the meters. now, prev := 0, 1 for ; ; now, prev = prev, now { - // CPU + // Gather CPU times. ReadCPUStats(&cpustats[now]) refreshFreq := time.Since(lastCollectionTime).Seconds() - cpuSysLoad.Update(float64(cpustats[now].GlobalTime-cpustats[prev].GlobalTime) / refreshFreq) - cpuSysWait.Update(float64(cpustats[now].GlobalWait-cpustats[prev].GlobalWait) / refreshFreq) - cpuProcLoad.Update(float64(cpustats[now].LocalTime-cpustats[prev].LocalTime) / refreshFreq) + sysLoad := (cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / refreshFreq + sysWait := (cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / refreshFreq + procLoad := (cpustats[now].LocalTime - cpustats[prev].LocalTime) / refreshFreq + // Convert to integer percentage. + cpuSysLoad.Update(int64(sysLoad * 100)) + cpuSysWait.Update(int64(sysWait * 100)) + cpuProcLoad.Update(int64(procLoad * 100)) // Threads cpuThreads.Update(int64(threadCreateProfile.Count())) From 66677d30bb1d2c82a1a47fb4133887c25385bb5a Mon Sep 17 00:00:00 2001 From: Felix Lange Date: Mon, 6 Mar 2023 21:28:54 +0100 Subject: [PATCH 7/8] metrics: move update of lastCollectionTime --- metrics/metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metrics/metrics.go b/metrics/metrics.go index 2d537a3b2dc7..0310a0c728e6 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -170,6 +170,7 @@ func CollectProcessMetrics(refresh time.Duration) { // Gather CPU times. ReadCPUStats(&cpustats[now]) refreshFreq := time.Since(lastCollectionTime).Seconds() + lastCollectionTime = time.Now() sysLoad := (cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / refreshFreq sysWait := (cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / refreshFreq procLoad := (cpustats[now].LocalTime - cpustats[prev].LocalTime) / refreshFreq @@ -205,7 +206,6 @@ func CollectProcessMetrics(refresh time.Duration) { diskWriteBytesCounter.Inc(diskstats[now].WriteBytes - diskstats[prev].WriteBytes) } - lastCollectionTime = time.Now() time.Sleep(refresh) } } From 25a5eff84d3a1b059819e94d027a5a20eb382bcd Mon Sep 17 00:00:00 2001 From: Felix Lange Date: Mon, 6 Mar 2023 22:01:31 +0100 Subject: [PATCH 8/8] metrics: read time only once Instead of calling time.Now both through time.Since, and then immediately again afterwards, just call it one time and compute everything from that reading. --- metrics/metrics.go | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/metrics/metrics.go b/metrics/metrics.go index 0310a0c728e6..ff7196b56494 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -161,23 +161,25 @@ func CollectProcessMetrics(refresh time.Duration) { diskWriteBytesCounter = GetOrRegisterCounter("system/disk/writebytes", DefaultRegistry) ) - // Avoid divide-by-zero the first time through the loop. - lastCollectionTime := time.Now().Add(-refresh) + var lastCollectTime time.Time // Iterate loading the different stats and updating the meters. now, prev := 0, 1 for ; ; now, prev = prev, now { // Gather CPU times. ReadCPUStats(&cpustats[now]) - refreshFreq := time.Since(lastCollectionTime).Seconds() - lastCollectionTime = time.Now() - sysLoad := (cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / refreshFreq - sysWait := (cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / refreshFreq - procLoad := (cpustats[now].LocalTime - cpustats[prev].LocalTime) / refreshFreq - // Convert to integer percentage. - cpuSysLoad.Update(int64(sysLoad * 100)) - cpuSysWait.Update(int64(sysWait * 100)) - cpuProcLoad.Update(int64(procLoad * 100)) + collectTime := time.Now() + secondsSinceLastCollect := collectTime.Sub(lastCollectTime).Seconds() + lastCollectTime = collectTime + if secondsSinceLastCollect > 0 { + sysLoad := (cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / secondsSinceLastCollect + sysWait := (cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / secondsSinceLastCollect + procLoad := (cpustats[now].LocalTime - cpustats[prev].LocalTime) / secondsSinceLastCollect + // Convert to integer percentage. + cpuSysLoad.Update(int64(sysLoad * 100)) + cpuSysWait.Update(int64(sysWait * 100)) + cpuProcLoad.Update(int64(procLoad * 100)) + } // Threads cpuThreads.Update(int64(threadCreateProfile.Count()))