From dca415be481cfcd5473d8f59917abee6358b02bb Mon Sep 17 00:00:00 2001 From: Mario Macias Date: Thu, 20 Jun 2024 11:30:02 +0200 Subject: [PATCH] Process memory metrics: report deltas instead of absolute values in OTEL exporter --- pkg/internal/export/otel/metrics_proc.go | 4 ++-- pkg/internal/infraolly/process/harvest.go | 25 +++++++++++++++------- pkg/internal/infraolly/process/snapshot.go | 4 +--- pkg/internal/infraolly/process/status.go | 8 +++++-- 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/pkg/internal/export/otel/metrics_proc.go b/pkg/internal/export/otel/metrics_proc.go index 06ab78d87..71d69c704 100644 --- a/pkg/internal/export/otel/metrics_proc.go +++ b/pkg/internal/export/otel/metrics_proc.go @@ -303,10 +303,10 @@ func (me *procMetricsExporter) observeMetric(reporter *procMetrics, s *process.S me.cpuUtilisationObserver(me.ctx, reporter, s) mem, attrs := reporter.memory.ForRecord(s) - mem.Add(me.ctx, s.MemoryRSSBytes, metric2.WithAttributeSet(attrs)) + mem.Add(me.ctx, s.MemoryRSSBytesDelta, metric2.WithAttributeSet(attrs)) vmem, attrs := reporter.memoryVirtual.ForRecord(s) - vmem.Add(me.ctx, s.MemoryVMSBytes, metric2.WithAttributeSet(attrs)) + vmem.Add(me.ctx, s.MemoryVMSBytesDelta, metric2.WithAttributeSet(attrs)) me.diskObserver(me.ctx, reporter, s) me.netObserver(me.ctx, reporter, s) diff --git a/pkg/internal/infraolly/process/harvest.go b/pkg/internal/infraolly/process/harvest.go index 6c7cf11fa..2c95923b8 100644 --- a/pkg/internal/infraolly/process/harvest.go +++ b/pkg/internal/infraolly/process/harvest.go @@ -110,6 +110,9 @@ func (ps *Harvester) Harvest(svcID *svc.ID) (*Status, error) { ps.populateNetworkInfo(status, cached) + // current stats will be used in the next iteration to calculate some delta values + cached.prevStats = cached.stats + return status, nil } @@ -139,14 +142,18 @@ func (ps *Harvester) populateGauges(status *Status, process *linuxProcess) error var err error // Calculate CPU metrics from current and previous user/system/wait time - status.CPUTimeSystemDelta = process.stats.cpu.SystemTime - process.previousCPUStats.SystemTime - status.CPUTimeUserDelta = process.stats.cpu.UserTime - process.previousCPUStats.UserTime - status.CPUTimeWaitDelta = process.stats.cpu.WaitTime - process.previousCPUStats.WaitTime - - delta := process.measureTime.Sub(process.previousMeasureTime).Seconds() * float64(runtime.NumCPU()) - status.CPUUtilisationSystem = (process.stats.cpu.SystemTime - process.previousCPUStats.SystemTime) / delta - status.CPUUtilisationUser = (process.stats.cpu.UserTime - process.previousCPUStats.UserTime) / delta - status.CPUUtilisationWait = (process.stats.cpu.WaitTime - process.previousCPUStats.WaitTime) / delta + var zero CPUInfo + // we only calculate CPU deltas and utilization time from the second sample onwards + if process.prevStats.cpu != zero { + status.CPUTimeSystemDelta = process.stats.cpu.SystemTime - process.prevStats.cpu.SystemTime + status.CPUTimeUserDelta = process.stats.cpu.UserTime - process.prevStats.cpu.UserTime + status.CPUTimeWaitDelta = process.stats.cpu.WaitTime - process.prevStats.cpu.WaitTime + + delta := process.measureTime.Sub(process.previousMeasureTime).Seconds() * float64(runtime.NumCPU()) + status.CPUUtilisationSystem = (process.stats.cpu.SystemTime - process.prevStats.cpu.SystemTime) / delta + status.CPUUtilisationUser = (process.stats.cpu.UserTime - process.prevStats.cpu.UserTime) / delta + status.CPUUtilisationWait = (process.stats.cpu.WaitTime - process.prevStats.cpu.WaitTime) / delta + } if ps.privileged { status.FdCount, err = process.NumFDs() @@ -159,7 +166,9 @@ func (ps *Harvester) populateGauges(status *Status, process *linuxProcess) error status.Status = process.stats.state status.ThreadCount = process.stats.numThreads status.MemoryVMSBytes = process.stats.vmSize + status.MemoryVMSBytesDelta = process.stats.vmSize - process.prevStats.vmSize status.MemoryRSSBytes = process.stats.vmRSS + status.MemoryRSSBytesDelta = process.stats.vmRSS - process.prevStats.vmRSS return nil } diff --git a/pkg/internal/infraolly/process/snapshot.go b/pkg/internal/infraolly/process/snapshot.go index 6dd418d82..4185d442d 100644 --- a/pkg/internal/infraolly/process/snapshot.go +++ b/pkg/internal/infraolly/process/snapshot.go @@ -52,10 +52,10 @@ type linuxProcess struct { measureTime time.Time stats procStats + prevStats procStats process *process.Process // used to calculate CPU utilization ratios - previousCPUStats CPUInfo previousMeasureTime time.Time previousIOCounters *process.IOCountersStat previousNetRx int64 @@ -131,13 +131,11 @@ func getLinuxProcess(cachedCopy *linuxProcess, procFSRoot string, pid int32, pri stats: currentStats, measureTime: measureTime, previousMeasureTime: measureTime, - previousCPUStats: currentStats.cpu, procFSRoot: procFSRoot, }, nil } // Otherwise, instead of creating a new process snapshot, we just reuse the cachedCopy one, with updated data - cachedCopy.previousCPUStats = cachedCopy.stats.cpu cachedCopy.previousMeasureTime = cachedCopy.measureTime cachedCopy.stats = currentStats cachedCopy.measureTime = measureTime diff --git a/pkg/internal/infraolly/process/status.go b/pkg/internal/infraolly/process/status.go index 35ac96659..8795b48fa 100644 --- a/pkg/internal/infraolly/process/status.go +++ b/pkg/internal/infraolly/process/status.go @@ -37,8 +37,12 @@ type Status struct { CPUUtilisationUser float64 CPUUtilisationWait float64 - MemoryRSSBytes int64 - MemoryVMSBytes int64 + // delta values are used in OTEL UpDownCounters while absolute values are used in Prometheus gauges + MemoryRSSBytes int64 + MemoryVMSBytes int64 + MemoryRSSBytesDelta int64 + MemoryVMSBytesDelta int64 + Status string ParentProcessID int32 ThreadCount int32