From 80aacba4d2a13598eeb4937a26f7644b20794098 Mon Sep 17 00:00:00 2001 From: Vihas Makwana <121151420+VihasMakwana@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:22:49 +0530 Subject: [PATCH] [cpu/windows] Switch to performance counters (#192) Add conditional switching to performance counters for `cpu`/`core` metricset collections This PR add an option `WithPerformanceCounter`, which uses performance counter for CPU data collection, when enabled. Default behaviour will be the existing implementation. ### How to use performance counters? It's simple. Something like following: ```go cpu, _ := metrics.New(WithPerformanceCounter()) data, err := cpu.Fetch() ``` ### Why performance counters? From https://github.com/elastic/beats/issues/40926, > On Windows, Metricbeat measures CPU use via the Windows API call GetSystemTimes. Each metrics interval, it fetches the CPU numbers, and compares them to the previous measurement to determine CPU load during that interval. On most systems this includes CPU time ["including all threads in all processes, on all processors"](https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getsystemtimes). However, on systems with more than 64 cores, it returns only the data for the [current processor group](https://learn.microsoft.com/en-us/windows/win32/procthread/processor-groups) of up to 64 cores. This is a major limitation for such systems and it leads to inconsistent data. ### Testing Test results are reported on the linked issue. Fixes https://github.com/elastic/beats/issues/40926 ### **Note:** `WithPerformanceCounter` option is only effective for windows and is ineffective if used by other OS'. --- go.mod | 2 +- go.sum | 4 +- metric/cpu/cpu.go | 9 ++-- metric/cpu/cpu_windows.go | 10 ++++- metric/cpu/metric_windows_test.go | 54 ++++++++++++++++++++++++ metric/cpu/metrics_aix.go | 3 +- metric/cpu/metrics_darwin.go | 3 +- metric/cpu/metrics_openbsd.go | 3 +- metric/cpu/metrics_procfs_common.go | 6 +-- metric/cpu/metrics_test.go | 6 +-- metric/cpu/metrics_windows.go | 65 ++++++++++++++++++++++++++++- 11 files changed, 142 insertions(+), 23 deletions(-) create mode 100644 metric/cpu/metric_windows_test.go diff --git a/go.mod b/go.mod index 264dc2d9f1..c6c3c12453 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.22.8 require ( github.com/docker/docker v26.1.5+incompatible - github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d + github.com/elastic/elastic-agent-libs v0.17.4 github.com/elastic/go-licenser v0.4.2 github.com/elastic/go-structform v0.0.9 github.com/elastic/go-sysinfo v1.14.1 diff --git a/go.sum b/go.sum index 26097d7a5d..bf5ff02162 100644 --- a/go.sum +++ b/go.sum @@ -21,8 +21,8 @@ github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKoh github.com/docker/go-connections v0.4.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= -github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d h1:nY8LSeTYU1uSDAAg7WwGH/cALgdovAXLdIzV25Ky0Bo= -github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M= +github.com/elastic/elastic-agent-libs v0.17.4 h1:kWK5Kn2EQjM97yHqbeXv+cFAIti4IiI9Qj8huM+lZzE= +github.com/elastic/elastic-agent-libs v0.17.4/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M= github.com/elastic/go-licenser v0.4.2 h1:bPbGm8bUd8rxzSswFOqvQh1dAkKGkgAmrPxbUi+Y9+A= github.com/elastic/go-licenser v0.4.2/go.mod h1:W8eH6FaZDR8fQGm+7FnVa7MxI1b/6dAqxz+zPB8nm5c= github.com/elastic/go-structform v0.0.9 h1:HpcS7xljL4kSyUfDJ8cXTJC6rU5ChL1wYb6cx3HLD+o= diff --git a/metric/cpu/cpu.go b/metric/cpu/cpu.go index 7e6a6e994c..d39d5c70d1 100644 --- a/metric/cpu/cpu.go +++ b/metric/cpu/cpu.go @@ -84,8 +84,8 @@ type option struct { type OptionFunc func(*option) -// Note: WithPerformanceCounter option is only effective for windows and is ineffective if used by other OS'. -func WithPerformanceCounter() OptionFunc { +// Note: WithWindowsPerformanceCounter option is only effective for windows and is ineffective if used by other OS'. +func WithWindowsPerformanceCounter() OptionFunc { return func(o *option) { o.usePerformanceCounter = true } @@ -94,7 +94,7 @@ func WithPerformanceCounter() OptionFunc { // Fetch collects a new sample of the CPU usage metrics. // This will overwrite the currently stored samples. func (m *Monitor) Fetch() (Metrics, error) { - metric, err := Get(m.Hostfs) + metric, err := Get(m) if err != nil { return Metrics{}, fmt.Errorf("error fetching CPU metrics: %w", err) } @@ -108,8 +108,7 @@ func (m *Monitor) Fetch() (Metrics, error) { // FetchCores collects a new sample of CPU usage metrics per-core // This will overwrite the currently stored samples. func (m *Monitor) FetchCores() ([]Metrics, error) { - - metric, err := Get(m.Hostfs) + metric, err := Get(m) if err != nil { return nil, fmt.Errorf("error fetching CPU metrics: %w", err) } diff --git a/metric/cpu/cpu_windows.go b/metric/cpu/cpu_windows.go index c1de79cad8..77a19dea2d 100644 --- a/metric/cpu/cpu_windows.go +++ b/metric/cpu/cpu_windows.go @@ -67,6 +67,14 @@ func buildQuery() (*pdh.Query, error) { if err := q.Open(); err != nil { return nil, fmt.Errorf("failed to open query: %w", err) } - // TODO: implement performance counters as a follow up + if err := q.AddCounter(totalKernelTimeCounter, "", "", true, true); err != nil { + return nil, fmt.Errorf("error calling AddCounter for kernel counter: %w", err) + } + if err := q.AddCounter(totalUserTimeCounter, "", "", true, true); err != nil { + return nil, fmt.Errorf("error calling AddCounter for user counter: %w", err) + } + if err := q.AddCounter(totalIdleTimeCounter, "", "", true, true); err != nil { + return nil, fmt.Errorf("error calling AddCounter for idle counter: %w", err) + } return &q, nil } diff --git a/metric/cpu/metric_windows_test.go b/metric/cpu/metric_windows_test.go new file mode 100644 index 0000000000..3cfaf75ccc --- /dev/null +++ b/metric/cpu/metric_windows_test.go @@ -0,0 +1,54 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//go:build windows + +package cpu + +import ( + "testing" + + "github.com/stretchr/testify/require" + + "github.com/elastic/elastic-agent-system-metrics/dev-tools/systemtests" +) + +func TestCounterLength(t *testing.T) { + monitor, err := New(systemtests.DockerTestResolver()) + require.NoError(t, err) + require.NoError(t, monitor.query.CollectData()) + + query := monitor.query + kernelRawData, err := query.GetRawCounterArray(totalKernelTimeCounter, true) + require.NoError(t, err) + + idleRawData, err := query.GetRawCounterArray(totalIdleTimeCounter, true) + require.NoError(t, err) + + userRawData, err := query.GetRawCounterArray(totalUserTimeCounter, true) + require.NoError(t, err) + + require.Equal(t, len(kernelRawData), len(idleRawData)) + require.Equal(t, len(userRawData), len(idleRawData)) + + for i := 0; i < len(userRawData); i++ { + require.Equal(t, userRawData[i].InstanceName, kernelRawData[i].InstanceName, "InstanceName should be equal") + } + for i := 0; i < len(kernelRawData); i++ { + require.Equal(t, kernelRawData[i].InstanceName, idleRawData[i].InstanceName, "InstanceName should be equal") + } +} diff --git a/metric/cpu/metrics_aix.go b/metric/cpu/metrics_aix.go index ede2e2d35a..47a5a8b6cc 100644 --- a/metric/cpu/metrics_aix.go +++ b/metric/cpu/metrics_aix.go @@ -37,7 +37,6 @@ import ( "os" "github.com/elastic/elastic-agent-libs/opt" - "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" ) func init() { @@ -58,7 +57,7 @@ func tick2msec(val uint64) uint64 { } // Get returns a metrics object for CPU data -func Get(_ resolve.Resolver) (CPUMetrics, error) { +func Get(m *Monitor) (CPUMetrics, error) { totals, err := getCPUTotals() if err != nil { diff --git a/metric/cpu/metrics_darwin.go b/metric/cpu/metrics_darwin.go index 619b5972a2..0483be2b4e 100644 --- a/metric/cpu/metrics_darwin.go +++ b/metric/cpu/metrics_darwin.go @@ -23,11 +23,10 @@ import ( "github.com/shirou/gopsutil/v4/cpu" "github.com/elastic/elastic-agent-libs/opt" - "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" ) // Get is the Darwin implementation of Get -func Get(_ resolve.Resolver) (CPUMetrics, error) { +func Get(m *Monitor) (CPUMetrics, error) { // We're using the gopsutil library here. // The code used by both gosigar and go-sysinfo appears to be // the same code as gopsutil, including copy-pasted comments. diff --git a/metric/cpu/metrics_openbsd.go b/metric/cpu/metrics_openbsd.go index 506c9502a8..93c4f9118d 100644 --- a/metric/cpu/metrics_openbsd.go +++ b/metric/cpu/metrics_openbsd.go @@ -37,11 +37,10 @@ import ( "unsafe" "github.com/elastic/elastic-agent-libs/opt" - "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" ) // Get is the OpenBSD implementation of get -func Get(_ resolve.Resolver) (CPUMetrics, error) { +func Get(m *Monitor) (CPUMetrics, error) { // see man 2 sysctl loadGlobal := [C.CPUSTATES]C.long{ diff --git a/metric/cpu/metrics_procfs_common.go b/metric/cpu/metrics_procfs_common.go index d175a529f0..a6b29dcc99 100644 --- a/metric/cpu/metrics_procfs_common.go +++ b/metric/cpu/metrics_procfs_common.go @@ -26,12 +26,12 @@ import ( "os" "strconv" "strings" - - "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" ) // Get returns a metrics object for CPU data -func Get(procfs resolve.Resolver) (CPUMetrics, error) { +func Get(m *Monitor) (CPUMetrics, error) { + procfs := m.Hostfs + path := procfs.ResolveHostFS("/proc/stat") fd, err := os.Open(path) defer func() { diff --git a/metric/cpu/metrics_test.go b/metric/cpu/metrics_test.go index 43bc0c510e..2c1cd67bb2 100644 --- a/metric/cpu/metrics_test.go +++ b/metric/cpu/metrics_test.go @@ -43,12 +43,12 @@ func TestMonitorSample(t *testing.T) { } func TestCoresMonitorSample(t *testing.T) { + cores, err := New(systemtests.DockerTestResolver()) + require.NoError(t, err) - cpuMetrics, err := Get(systemtests.DockerTestResolver()) + cpuMetrics, err := Get(cores) assert.NoError(t, err, "error in Get()") - cores, err := New(systemtests.DockerTestResolver()) - require.NoError(t, err) cores.lastSample = CPUMetrics{list: make([]CPU, len(cpuMetrics.list))} sample, err := cores.FetchCores() require.NoError(t, err) diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index fd90408c1b..1392daa56a 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -26,13 +26,27 @@ import ( "fmt" "time" + "github.com/elastic/elastic-agent-libs/helpers/windows/pdh" "github.com/elastic/elastic-agent-libs/opt" - "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" "github.com/elastic/gosigar/sys/windows" ) +var ( + processorInformationCounter = "\\Processor Information(%s)\\%s" + totalKernelTimeCounter = fmt.Sprintf(processorInformationCounter, "*", "% Privileged Time") + totalIdleTimeCounter = fmt.Sprintf(processorInformationCounter, "*", "% Idle Time") + totalUserTimeCounter = fmt.Sprintf(processorInformationCounter, "*", "% User Time") +) + // Get fetches Windows CPU system times -func Get(_ resolve.Resolver) (CPUMetrics, error) { +func Get(m *Monitor) (CPUMetrics, error) { + if m.query == nil { + return getUsingSystemTimes() + } + return getUsingPerfCounters(m.query) +} + +func getUsingSystemTimes() (CPUMetrics, error) { idle, kernel, user, err := windows.GetSystemTimes() if err != nil { return CPUMetrics{}, fmt.Errorf("call to GetSystemTimes failed: %w", err) @@ -66,3 +80,50 @@ func Get(_ resolve.Resolver) (CPUMetrics, error) { return globalMetrics, nil } + +func getUsingPerfCounters(query *pdh.Query) (CPUMetrics, error) { + globalMetrics := CPUMetrics{} + + if err := query.CollectData(); err != nil { + return globalMetrics, err + } + + kernelRawData, err := query.GetRawCounterArray(totalKernelTimeCounter, true) + if err != nil { + return globalMetrics, fmt.Errorf("error calling GetRawCounterArray for kernel counter: %w", err) + } + idleRawData, err := query.GetRawCounterArray(totalIdleTimeCounter, true) + if err != nil { + return globalMetrics, fmt.Errorf("error calling GetRawCounterArray for idle counter: %w", err) + } + userRawData, err := query.GetRawCounterArray(totalUserTimeCounter, true) + if err != nil { + return globalMetrics, fmt.Errorf("error calling GetRawCounterArray for user counter: %w", err) + } + var idle, kernel, user time.Duration + globalMetrics.list = make([]CPU, len(userRawData)) + for i := 0; i < len(globalMetrics.list); i++ { + // The values returned by GetRawCounterArray are of equal length and are sorted by instance names. + // For CPU core {i}, idleRawData[i], kernelRawData[i], and userRawData[i] correspond to the idle time, kernel time, and user time, respectively. + + // values returned by counter are in 100-ns intervals. Hence, convert it to millisecond. + idleTime := time.Duration(idleRawData[i].RawValue.FirstValue*100) / time.Millisecond + kernelTime := time.Duration(kernelRawData[i].RawValue.FirstValue*100) / time.Millisecond + userTime := time.Duration(userRawData[i].RawValue.FirstValue*100) / time.Millisecond + + globalMetrics.list[i].Idle = opt.UintWith(uint64(idleTime)) + globalMetrics.list[i].Sys = opt.UintWith(uint64(kernelTime)) + globalMetrics.list[i].User = opt.UintWith(uint64(userTime)) + + // add the per-cpu time to track the total time spent by system + idle += idleTime + kernel += kernelTime + user += userTime + } + + globalMetrics.totals.Idle = opt.UintWith(uint64(idle)) + globalMetrics.totals.Sys = opt.UintWith(uint64(kernel)) + globalMetrics.totals.User = opt.UintWith(uint64(user)) + + return globalMetrics, nil +}