From bd97f54aeb394678bcf0380637c88cb4c70a500d Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 12 Nov 2024 17:41:52 +0530 Subject: [PATCH 01/29] inital commit, add pdh --- go.mod | 2 +- go.sum | 2 + metric/cpu/metrics.go | 83 --------------- metric/cpu/metrics_other.go | 90 ++++++++++++++++ metric/cpu/metrics_test.go | 2 + metric/cpu/metrics_windows.go | 115 ++++++++++++++++++-- metric/cpu/metrics_windows_test.go | 163 +++++++++++++++++++++++++++++ 7 files changed, 363 insertions(+), 94 deletions(-) create mode 100644 metric/cpu/metrics_other.go create mode 100644 metric/cpu/metrics_windows_test.go diff --git a/go.mod b/go.mod index 61757202ff..a182d9ed05 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.22.8 require ( github.com/docker/docker v26.1.5+incompatible - github.com/elastic/elastic-agent-libs v0.9.13 + github.com/elastic/elastic-agent-libs v0.17.3-0.20241112062438-5ba501c7ca8c github.com/elastic/go-licenser v0.4.2 github.com/elastic/go-structform v0.0.9 github.com/elastic/go-sysinfo v1.14.1 diff --git a/go.sum b/go.sum index c8d6a1819e..dc0300160d 100644 --- a/go.sum +++ b/go.sum @@ -23,6 +23,8 @@ github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4 github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/elastic/elastic-agent-libs v0.9.13 h1:D1rh1s67zlkDWmixWQaNWzn+qy6DafIDPTQnLpBNBUA= github.com/elastic/elastic-agent-libs v0.9.13/go.mod h1:G9ljFvDE+muOOOQBf2eRituF0fE4suGkv25rfjTwY+c= +github.com/elastic/elastic-agent-libs v0.17.3-0.20241112062438-5ba501c7ca8c h1:9pqutj36lOyKsKp1oKA9puR64/MZ2E+BWaAEVEOSl3g= +github.com/elastic/elastic-agent-libs v0.17.3-0.20241112062438-5ba501c7ca8c/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M= github.com/elastic/go-licenser v0.4.2 h1:bPbGm8bUd8rxzSswFOqvQh1dAkKGkgAmrPxbUi+Y9+A= github.com/elastic/go-licenser v0.4.2/go.mod h1:W8eH6FaZDR8fQGm+7FnVa7MxI1b/6dAqxz+zPB8nm5c= github.com/elastic/go-structform v0.0.9 h1:HpcS7xljL4kSyUfDJ8cXTJC6rU5ChL1wYb6cx3HLD+o= diff --git a/metric/cpu/metrics.go b/metric/cpu/metrics.go index c039d091a4..b46fd2220d 100644 --- a/metric/cpu/metrics.go +++ b/metric/cpu/metrics.go @@ -1,30 +1,11 @@ -// Licensed to Elasticsearch B.V. under one or more contributor -// license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright -// ownership. Elasticsearch B.V. licenses this file to you under -// the Apache License, Version 2.0 (the "License"); you may -// not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - package cpu import ( "errors" - "fmt" "github.com/elastic/elastic-agent-libs/mapstr" "github.com/elastic/elastic-agent-libs/opt" "github.com/elastic/elastic-agent-system-metrics/metric" - "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" ) // CPU manages the CPU metrics from /proc/stat @@ -79,70 +60,6 @@ func (cpu CPU) Total() uint64 { return opt.SumOptUint(cpu.User, cpu.Nice, cpu.Sys, cpu.Idle, cpu.Wait, cpu.Irq, cpu.SoftIrq, cpu.Stolen) } -/* -The below code implements a "metrics tracker" that gives us the ability to -calculate CPU percentages, as we average usage across a time period. -*/ - -// Monitor is used to monitor the overall CPU usage of the system over time. -type Monitor struct { - lastSample CPUMetrics - Hostfs resolve.Resolver -} - -// New returns a new CPU metrics monitor -// Hostfs is only relevant on linux and freebsd. -func New(hostfs resolve.Resolver) *Monitor { - return &Monitor{Hostfs: hostfs} -} - -// Fetch collects a new sample of the CPU usage metrics. -// This will overwrite the currently stored samples. -func (m *Monitor) Fetch() (Metrics, error) { - metric, err := Get(m.Hostfs) - if err != nil { - return Metrics{}, fmt.Errorf("error fetching CPU metrics: %w", err) - } - - oldLastSample := m.lastSample - m.lastSample = metric - - return Metrics{previousSample: oldLastSample.totals, currentSample: metric.totals, count: len(metric.list), isTotals: true}, nil -} - -// FetchCores collects a new sample of CPU usage metrics per-core -// This will overwrite the currently stored samples. -func (m *Monitor) FetchCores() ([]Metrics, error) { - - metric, err := Get(m.Hostfs) - if err != nil { - return nil, fmt.Errorf("error fetching CPU metrics: %w", err) - } - - coreMetrics := make([]Metrics, len(metric.list)) - for i := 0; i < len(metric.list); i++ { - lastMetric := CPU{} - // Count of CPUs can change - if len(m.lastSample.list) > i { - lastMetric = m.lastSample.list[i] - } - coreMetrics[i] = Metrics{ - currentSample: metric.list[i], - previousSample: lastMetric, - isTotals: false, - } - - // Only add CPUInfo metric if it's available - // Remove this if statement once CPUInfo is supported - // by all systems - if len(metric.CPUInfo) != 0 { - coreMetrics[i].cpuInfo = metric.CPUInfo[i] - } - } - m.lastSample = metric - return coreMetrics, nil -} - // Metrics stores the current and the last sample collected by a Beat. type Metrics struct { previousSample CPU diff --git a/metric/cpu/metrics_other.go b/metric/cpu/metrics_other.go new file mode 100644 index 0000000000..54351829a9 --- /dev/null +++ b/metric/cpu/metrics_other.go @@ -0,0 +1,90 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//go:build !windows + +package cpu + +import ( + "fmt" + + "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" +) + +/* +The below code implements a "metrics tracker" that gives us the ability to +calculate CPU percentages, as we average usage across a time period. +*/ + +// Monitor is used to monitor the overall CPU usage of the system over time. +type Monitor struct { + lastSample CPUMetrics + Hostfs resolve.Resolver +} + +// New returns a new CPU metrics monitor +// Hostfs is only relevant on linux and freebsd. +func New(hostfs resolve.Resolver) (*Monitor, error) { + return &Monitor{Hostfs: hostfs}, nil +} + +// Fetch collects a new sample of the CPU usage metrics. +// This will overwrite the currently stored samples. +func (m *Monitor) Fetch() (Metrics, error) { + metric, err := Get(m.Hostfs) + if err != nil { + return Metrics{}, fmt.Errorf("error fetching CPU metrics: %w", err) + } + + oldLastSample := m.lastSample + m.lastSample = metric + + return Metrics{previousSample: oldLastSample.totals, currentSample: metric.totals, count: len(metric.list), isTotals: true}, nil +} + +// FetchCores collects a new sample of CPU usage metrics per-core +// This will overwrite the currently stored samples. +func (m *Monitor) FetchCores() ([]Metrics, error) { + + metric, err := Get(m.Hostfs) + if err != nil { + return nil, fmt.Errorf("error fetching CPU metrics: %w", err) + } + + coreMetrics := make([]Metrics, len(metric.list)) + for i := 0; i < len(metric.list); i++ { + lastMetric := CPU{} + // Count of CPUs can change + if len(m.lastSample.list) > i { + lastMetric = m.lastSample.list[i] + } + coreMetrics[i] = Metrics{ + currentSample: metric.list[i], + previousSample: lastMetric, + isTotals: false, + } + + // Only add CPUInfo metric if it's available + // Remove this if statement once CPUInfo is supported + // by all systems + if len(metric.CPUInfo) != 0 { + coreMetrics[i].cpuInfo = metric.CPUInfo[i] + } + } + m.lastSample = metric + return coreMetrics, nil +} diff --git a/metric/cpu/metrics_test.go b/metric/cpu/metrics_test.go index 28b20540d7..07fb85bd72 100644 --- a/metric/cpu/metrics_test.go +++ b/metric/cpu/metrics_test.go @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//go:build !windows + package cpu import ( diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index fd90408c1b..6ff74df4b7 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -24,29 +24,114 @@ package cpu import ( "fmt" + "strings" "time" + "github.com/elastic/elastic-agent-libs/helpers/windows/pdh" "github.com/elastic/elastic-agent-libs/opt" "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" "github.com/elastic/gosigar/sys/windows" ) -// Get fetches Windows CPU system times -func Get(_ resolve.Resolver) (CPUMetrics, error) { - idle, kernel, user, err := windows.GetSystemTimes() +var counters []string = []string{ + "\\Processor(_Total)\\% Processor Time", + "\\Processor(_Total)\\% Idle Time", +} + +/* +The below code implements a "metrics tracker" that gives us the ability to +calculate CPU percentages, as we average usage across a time period. +*/ + +// Monitor is used to monitor the overall CPU usage of the system over time. +type Monitor struct { + lastSample CPUMetrics + Hostfs resolve.Resolver + query pdh.Query +} + +// New returns a new CPU metrics monitor +// Hostfs is only relevant on linux and freebsd. +func New(hostfs resolve.Resolver) (*Monitor, error) { + q, err := initializeQuery() if err != nil { - return CPUMetrics{}, fmt.Errorf("call to GetSystemTimes failed: %w", err) + return nil, fmt.Errorf("call to initialize PDH quert failed: %w", err) } + return &Monitor{Hostfs: hostfs, query: q}, nil +} +// Fetch collects a new sample of the CPU usage metrics. +// This will overwrite the currently stored samples. +func (m *Monitor) Fetch() (Metrics, error) { + metric, err := Get(m.Hostfs, m.query) + if err != nil { + return Metrics{}, fmt.Errorf("error fetching CPU metrics: %w", err) + } + + oldLastSample := m.lastSample + m.lastSample = metric + + return Metrics{previousSample: oldLastSample.totals, currentSample: metric.totals, count: len(metric.list), isTotals: true}, nil +} + +// FetchCores collects a new sample of CPU usage metrics per-core +// This will overwrite the currently stored samples. +func (m *Monitor) FetchCores() ([]Metrics, error) { + + metric, err := Get(m.Hostfs, m.query) + if err != nil { + return nil, fmt.Errorf("error fetching CPU metrics: %w", err) + } + + coreMetrics := make([]Metrics, len(metric.list)) + for i := 0; i < len(metric.list); i++ { + lastMetric := CPU{} + // Count of CPUs can change + if len(m.lastSample.list) > i { + lastMetric = m.lastSample.list[i] + } + coreMetrics[i] = Metrics{ + currentSample: metric.list[i], + previousSample: lastMetric, + isTotals: false, + } + + // Only add CPUInfo metric if it's available + // Remove this if statement once CPUInfo is supported + // by all systems + if len(metric.CPUInfo) != 0 { + coreMetrics[i].cpuInfo = metric.CPUInfo[i] + } + } + m.lastSample = metric + return coreMetrics, nil +} + +// Get fetches Windows CPU system times +func Get(_ resolve.Resolver, q pdh.Query) (CPUMetrics, error) { + if err := q.CollectData(); err != nil { + return CPUMetrics{}, fmt.Errorf("call to collect counter data failed: %w", err) + } + counterValues, err := q.GetFormattedCounterValues() + if err != nil { + return CPUMetrics{}, fmt.Errorf("call to get formated values: %w", err) + } + var total, idle float64 + for counterName, counterVaule := range counterValues { + if strings.Contains(counterName, "\\Processor(_Total)\\% Processor Time") { + total = counterVaule[0].Measurement.(float64) + } else { + idle = counterVaule[0].Measurement.(float64) + } + } globalMetrics := CPUMetrics{} //convert from duration to ticks - idleMetric := uint64(idle / time.Millisecond) - sysMetric := uint64(kernel / time.Millisecond) - userMetrics := uint64(user / time.Millisecond) + idleMetric := uint64(time.Duration(idle) / 1000) + sysMetric := uint64(time.Duration(total) / 1000) + // userMetrics := uint64(user / time.Millisecond) globalMetrics.totals.Idle = opt.UintWith(idleMetric) globalMetrics.totals.Sys = opt.UintWith(sysMetric) - globalMetrics.totals.User = opt.UintWith(userMetrics) - + // globalMetrics.totals.User = opt.UintWith(userMetrics) // get per-cpu data cpus, err := windows.NtQuerySystemProcessorPerformanceInformation() if err != nil { @@ -63,6 +148,16 @@ func Get(_ resolve.Resolver) (CPUMetrics, error) { User: opt.UintWith(userMetrics), }) } - return globalMetrics, nil } + +func initializeQuery() (pdh.Query, error) { + query := pdh.Query{} + for _, c := range counters { + if err := query.AddCounter(c, "", "double", false); err != nil { + return pdh.Query{}, err + } + } + _ = query.CollectData() + return query, nil +} diff --git a/metric/cpu/metrics_windows_test.go b/metric/cpu/metrics_windows_test.go new file mode 100644 index 0000000000..07fb85bd72 --- /dev/null +++ b/metric/cpu/metrics_windows_test.go @@ -0,0 +1,163 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//go:build !windows + +package cpu + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/elastic/elastic-agent-libs/logp" + "github.com/elastic/elastic-agent-libs/mapstr" + "github.com/elastic/elastic-agent-libs/opt" + "github.com/elastic/elastic-agent-system-metrics/dev-tools/systemtests" +) + +func TestMonitorSample(t *testing.T) { + _ = logp.DevelopmentSetup() + cpu := &Monitor{lastSample: CPUMetrics{}, Hostfs: systemtests.DockerTestResolver()} + s, err := cpu.Fetch() + require.NoError(t, err) + + metricOpts := MetricOpts{Percentages: true, NormalizedPercentages: true, Ticks: true} + evt, err := s.Format(metricOpts) + assert.NoError(t, err, "error in Format") + testPopulatedEvent(evt, t, true) +} + +func TestCoresMonitorSample(t *testing.T) { + + cpuMetrics, err := Get(systemtests.DockerTestResolver()) + assert.NoError(t, err, "error in Get()") + + cores := &Monitor{lastSample: CPUMetrics{list: make([]CPU, len(cpuMetrics.list))}, Hostfs: systemtests.DockerTestResolver()} + sample, err := cores.FetchCores() + require.NoError(t, err) + + for _, s := range sample { + metricOpts := MetricOpts{Percentages: true, Ticks: true} + evt, err := s.Format(metricOpts) + assert.NoError(t, err, "error in Format") + testPopulatedEvent(evt, t, false) + } +} + +func testPopulatedEvent(evt mapstr.M, t *testing.T, norm bool) { + user, err := evt.GetValue("user.pct") + assert.NoError(t, err, "error getting user.pct") + system, err := evt.GetValue("system.pct") + assert.NoError(t, err, "error getting system.pct") + assert.True(t, user.(float64) > 0) + assert.True(t, system.(float64) > 0) + + if norm { + normUser, err := evt.GetValue("user.norm.pct") + assert.NoError(t, err, "error getting user.norm.pct") + assert.True(t, normUser.(float64) > 0) + normSystem, err := evt.GetValue("system.norm.pct") + assert.NoError(t, err, "error getting system.norm.pct") + assert.True(t, normSystem.(float64) > 0) + assert.True(t, normUser.(float64) <= 100) + assert.True(t, normSystem.(float64) <= 100) + + assert.True(t, user.(float64) > normUser.(float64)) + assert.True(t, system.(float64) > normSystem.(float64)) + } + + userTicks, err := evt.GetValue("user.ticks") + assert.NoError(t, err, "error getting user.ticks") + assert.True(t, userTicks.(uint64) > 0) + systemTicks, err := evt.GetValue("system.ticks") + assert.NoError(t, err, "error getting system.ticks") + assert.True(t, systemTicks.(uint64) > 0) +} + +// TestMetricsRounding tests that the returned percentages are rounded to +// four decimal places. +func TestMetricsRounding(t *testing.T) { + + sample := Metrics{ + previousSample: CPU{ + User: opt.UintWith(10855311), + Sys: opt.UintWith(2021040), + Idle: opt.UintWith(17657874), + }, + currentSample: CPU{ + User: opt.UintWith(10855693), + Sys: opt.UintWith(2021058), + Idle: opt.UintWith(17657876), + }, + } + + evt, err := sample.Format(MetricOpts{NormalizedPercentages: true}) + assert.NoError(t, err, "error in Format") + normUser, err := evt.GetValue("user.norm.pct") + assert.NoError(t, err, "error getting user.norm.pct") + normSystem, err := evt.GetValue("system.norm.pct") + assert.NoError(t, err, "error getting system.norm.pct") + + assert.Equal(t, normUser.(float64), 0.9502) + assert.Equal(t, normSystem.(float64), 0.0448) +} + +// TestMetricsPercentages tests that Metrics returns the correct +// percentages and normalized percentages. +func TestMetricsPercentages(t *testing.T) { + numCores := 10 + // This test simulates 30% user and 70% system (normalized), or 3% and 7% + // respectively when there are 10 CPUs. + const userTest, systemTest = 30., 70. + + s0 := CPU{ + User: opt.UintWith(10000000), + Sys: opt.UintWith(10000000), + Idle: opt.UintWith(20000000), + Nice: opt.UintWith(0), + } + s1 := CPU{ + User: opt.UintWith(s0.User.ValueOr(0) + uint64(userTest)), + Sys: opt.UintWith(s0.Sys.ValueOr(0) + uint64(systemTest)), + Idle: s0.Idle, + Nice: opt.UintWith(0), + } + sample := Metrics{ + count: numCores, + isTotals: true, + previousSample: s0, + currentSample: s1, + } + + evt, err := sample.Format(MetricOpts{NormalizedPercentages: true, Percentages: true}) + assert.NoError(t, err, "error in Format") + + user, err := evt.GetValue("user.norm.pct") + assert.NoError(t, err, "error getting user.norm.pct") + system, err := evt.GetValue("system.norm.pct") + assert.NoError(t, err, "error getting system.norm.pct") + idle, err := evt.GetValue("idle.norm.pct") + assert.NoError(t, err, "error getting idle.norm.pct") + total, err := evt.GetValue("total.norm.pct") + assert.NoError(t, err, "error getting total.norm.pct") + assert.EqualValues(t, .3, user.(float64)) + assert.EqualValues(t, .7, system.(float64)) + assert.EqualValues(t, .0, idle.(float64)) + assert.EqualValues(t, 1., total.(float64)) +} From 5b43db1eac7ded3236e8c7ad6239660d2dde3cb6 Mon Sep 17 00:00:00 2001 From: vihas makwana Date: Mon, 18 Nov 2024 11:23:29 +0530 Subject: [PATCH 02/29] chore: windows pdh update --- metric/cpu/metrics.go | 81 ++++++++++++ metric/cpu/metrics_other.go | 90 -------------- metric/cpu/metrics_windows.go | 224 ++++++++++++++++++++-------------- 3 files changed, 215 insertions(+), 180 deletions(-) delete mode 100644 metric/cpu/metrics_other.go diff --git a/metric/cpu/metrics.go b/metric/cpu/metrics.go index b46fd2220d..6733786f2a 100644 --- a/metric/cpu/metrics.go +++ b/metric/cpu/metrics.go @@ -2,10 +2,12 @@ package cpu import ( "errors" + "fmt" "github.com/elastic/elastic-agent-libs/mapstr" "github.com/elastic/elastic-agent-libs/opt" "github.com/elastic/elastic-agent-system-metrics/metric" + "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" ) // CPU manages the CPU metrics from /proc/stat @@ -60,6 +62,70 @@ func (cpu CPU) Total() uint64 { return opt.SumOptUint(cpu.User, cpu.Nice, cpu.Sys, cpu.Idle, cpu.Wait, cpu.Irq, cpu.SoftIrq, cpu.Stolen) } +/* +The below code implements a "metrics tracker" that gives us the ability to +calculate CPU percentages, as we average usage across a time period. +*/ + +// Monitor is used to monitor the overall CPU usage of the system over time. +type Monitor struct { + lastSample CPUMetrics + Hostfs resolve.Resolver +} + +// New returns a new CPU metrics monitor +// Hostfs is only relevant on linux and freebsd. +func New(hostfs resolve.Resolver) *Monitor { + return &Monitor{Hostfs: hostfs} +} + +// Fetch collects a new sample of the CPU usage metrics. +// This will overwrite the currently stored samples. +func (m *Monitor) Fetch() (Metrics, error) { + metric, err := Get(m.Hostfs) + if err != nil && !errors.Is(err, &PerfError{}) { + return Metrics{}, fmt.Errorf("error fetching CPU metrics: %w", err) + } + + oldLastSample := m.lastSample + m.lastSample = metric + + return Metrics{previousSample: oldLastSample.totals, currentSample: metric.totals, count: len(metric.list), isTotals: true}, err +} + +// FetchCores collects a new sample of CPU usage metrics per-core +// This will overwrite the currently stored samples. +func (m *Monitor) FetchCores() ([]Metrics, error) { + + metric, err := Get(m.Hostfs) + if err != nil && !errors.Is(err, &PerfError{}) { + return nil, fmt.Errorf("error fetching CPU metrics: %w", err) + } + + coreMetrics := make([]Metrics, len(metric.list)) + for i := 0; i < len(metric.list); i++ { + lastMetric := CPU{} + // Count of CPUs can change + if len(m.lastSample.list) > i { + lastMetric = m.lastSample.list[i] + } + coreMetrics[i] = Metrics{ + currentSample: metric.list[i], + previousSample: lastMetric, + isTotals: false, + } + + // Only add CPUInfo metric if it's available + // Remove this if statement once CPUInfo is supported + // by all systems + if len(metric.CPUInfo) != 0 { + coreMetrics[i].cpuInfo = metric.CPUInfo[i] + } + } + m.lastSample = metric + return coreMetrics, err +} + // Metrics stores the current and the last sample collected by a Beat. type Metrics struct { previousSample CPU @@ -161,3 +227,18 @@ func cpuMetricTimeDelta(prev, current opt.Uint, timeDelta uint64, numCPU int) fl pct := float64(cpuDelta) / float64(timeDelta) return metric.Round(pct * float64(numCPU)) } + +type PerfError struct { + err error +} + +func (p *PerfError) Error() string { + if p.err == nil { + return "" + } + return fmt.Sprintf("Error while reading performance counter data: %s", p.err.Error()) +} + +func (p *PerfError) Unwrap() error { + return p.err +} diff --git a/metric/cpu/metrics_other.go b/metric/cpu/metrics_other.go deleted file mode 100644 index 54351829a9..0000000000 --- a/metric/cpu/metrics_other.go +++ /dev/null @@ -1,90 +0,0 @@ -// Licensed to Elasticsearch B.V. under one or more contributor -// license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright -// ownership. Elasticsearch B.V. licenses this file to you under -// the Apache License, Version 2.0 (the "License"); you may -// not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//go:build !windows - -package cpu - -import ( - "fmt" - - "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" -) - -/* -The below code implements a "metrics tracker" that gives us the ability to -calculate CPU percentages, as we average usage across a time period. -*/ - -// Monitor is used to monitor the overall CPU usage of the system over time. -type Monitor struct { - lastSample CPUMetrics - Hostfs resolve.Resolver -} - -// New returns a new CPU metrics monitor -// Hostfs is only relevant on linux and freebsd. -func New(hostfs resolve.Resolver) (*Monitor, error) { - return &Monitor{Hostfs: hostfs}, nil -} - -// Fetch collects a new sample of the CPU usage metrics. -// This will overwrite the currently stored samples. -func (m *Monitor) Fetch() (Metrics, error) { - metric, err := Get(m.Hostfs) - if err != nil { - return Metrics{}, fmt.Errorf("error fetching CPU metrics: %w", err) - } - - oldLastSample := m.lastSample - m.lastSample = metric - - return Metrics{previousSample: oldLastSample.totals, currentSample: metric.totals, count: len(metric.list), isTotals: true}, nil -} - -// FetchCores collects a new sample of CPU usage metrics per-core -// This will overwrite the currently stored samples. -func (m *Monitor) FetchCores() ([]Metrics, error) { - - metric, err := Get(m.Hostfs) - if err != nil { - return nil, fmt.Errorf("error fetching CPU metrics: %w", err) - } - - coreMetrics := make([]Metrics, len(metric.list)) - for i := 0; i < len(metric.list); i++ { - lastMetric := CPU{} - // Count of CPUs can change - if len(m.lastSample.list) > i { - lastMetric = m.lastSample.list[i] - } - coreMetrics[i] = Metrics{ - currentSample: metric.list[i], - previousSample: lastMetric, - isTotals: false, - } - - // Only add CPUInfo metric if it's available - // Remove this if statement once CPUInfo is supported - // by all systems - if len(metric.CPUInfo) != 0 { - coreMetrics[i].cpuInfo = metric.CPUInfo[i] - } - } - m.lastSample = metric - return coreMetrics, nil -} diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index 6ff74df4b7..20b4ecb8de 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -23,7 +23,9 @@ vagrant winrm -s cmd -e -c "cd C:\\Gopath\src\\github.com\\elastic\\beats\\metri package cpu import ( + "errors" "fmt" + "runtime" "strings" "time" @@ -33,131 +35,173 @@ import ( "github.com/elastic/gosigar/sys/windows" ) -var counters []string = []string{ - "\\Processor(_Total)\\% Processor Time", - "\\Processor(_Total)\\% Idle Time", -} +var ( + kernelTimeCounter = "\\Processor Information(%s)\\% Privileged Time" + userTimeCounter = "\\Processor Information(%s)\\% User Time" + idleTimeCounter = "\\Processor Information(%s)\\% Idle Time" +) -/* -The below code implements a "metrics tracker" that gives us the ability to -calculate CPU percentages, as we average usage across a time period. -*/ +// Get fetches Windows CPU system times +func Get(_ resolve.Resolver) (CPUMetrics, error) { + var q pdh.Query + var kernel, user, idle time.Duration + var combinedErr, err error + globalMetrics := CPUMetrics{} -// Monitor is used to monitor the overall CPU usage of the system over time. -type Monitor struct { - lastSample CPUMetrics - Hostfs resolve.Resolver - query pdh.Query -} + if err := q.Open(); err != nil { + combinedErr = errors.Join(combinedErr, err) + goto fallback + // return CPUMetrics{}, fmt.Errorf("call to PdhOpenQuery failed: %w", err) + } -// New returns a new CPU metrics monitor -// Hostfs is only relevant on linux and freebsd. -func New(hostfs resolve.Resolver) (*Monitor, error) { - q, err := initializeQuery() + // get per-cpu data + // try getting data via performance counters + globalMetrics.list, err = populatePerCpuMetrics(&q) if err != nil { - return nil, fmt.Errorf("call to initialize PDH quert failed: %w", err) + combinedErr = errors.Join(combinedErr, err) + goto fallback } - return &Monitor{Hostfs: hostfs, query: q}, nil -} -// Fetch collects a new sample of the CPU usage metrics. -// This will overwrite the currently stored samples. -func (m *Monitor) Fetch() (Metrics, error) { - metric, err := Get(m.Hostfs, m.query) + kernel, user, idle, err = populateGlobalCpuMetrics(&q) if err != nil { - return Metrics{}, fmt.Errorf("error fetching CPU metrics: %w", err) + combinedErr = errors.Join(combinedErr, err) + goto fallback } - oldLastSample := m.lastSample - m.lastSample = metric - - return Metrics{previousSample: oldLastSample.totals, currentSample: metric.totals, count: len(metric.list), isTotals: true}, nil -} + // _Total values returned by PerfCounters are averaged by number of cpus i.e. average time for system as a whole + // Previously, we used to return sum of times for all CPUs. + // To be backward compatible with previous version, multiply the average time by number of CPUs. + globalMetrics.totals.Idle = opt.UintWith(uint64(idle/time.Millisecond) * uint64(len(globalMetrics.list))) + globalMetrics.totals.Sys = opt.UintWith(uint64(kernel/time.Millisecond) * uint64(len(globalMetrics.list))) + globalMetrics.totals.User = opt.UintWith(uint64(user/time.Millisecond) * uint64(len(globalMetrics.list))) -// FetchCores collects a new sample of CPU usage metrics per-core -// This will overwrite the currently stored samples. -func (m *Monitor) FetchCores() ([]Metrics, error) { + return globalMetrics, nil - metric, err := Get(m.Hostfs, m.query) +fallback: + // fallback to GetSystemTimes() + // GetSystemTimes() return global data for current processor group i.e. upto 64 cores + kernel, user, idle, err = populateGlobalCpuMetricsFallback() if err != nil { - return nil, fmt.Errorf("error fetching CPU metrics: %w", err) + return CPUMetrics{}, fmt.Errorf("error getting counter values: %w", err) } - coreMetrics := make([]Metrics, len(metric.list)) - for i := 0; i < len(metric.list); i++ { - lastMetric := CPU{} - // Count of CPUs can change - if len(m.lastSample.list) > i { - lastMetric = m.lastSample.list[i] - } - coreMetrics[i] = Metrics{ - currentSample: metric.list[i], - previousSample: lastMetric, - isTotals: false, - } + // convert from duration to ticks + // ticks are measured in 1-ms intervals + globalMetrics.totals.Idle = opt.UintWith(uint64(idle / time.Millisecond)) + globalMetrics.totals.Sys = opt.UintWith(uint64(kernel / time.Millisecond)) + globalMetrics.totals.User = opt.UintWith(uint64(user / time.Millisecond)) - // Only add CPUInfo metric if it's available - // Remove this if statement once CPUInfo is supported - // by all systems - if len(metric.CPUInfo) != 0 { - coreMetrics[i].cpuInfo = metric.CPUInfo[i] - } + // fallback to _NtQuerySystemInformation + // _NtQuerySystemInformation return per-cpu data for current processor group i.e. upto 64 cores + globalMetrics.list, err = populatePerCpuMetricsFallback() + if err != nil { + return CPUMetrics{}, fmt.Errorf("error getting per-cpu metrics: %w", err) } - m.lastSample = metric - return coreMetrics, nil + return globalMetrics, &PerfError{err: combinedErr} } -// Get fetches Windows CPU system times -func Get(_ resolve.Resolver, q pdh.Query) (CPUMetrics, error) { - if err := q.CollectData(); err != nil { - return CPUMetrics{}, fmt.Errorf("call to collect counter data failed: %w", err) +func populateGlobalCpuMetrics(q *pdh.Query) (time.Duration, time.Duration, time.Duration, error) { + kernel, err := q.GetRawCounterValue(fmt.Sprintf(kernelTimeCounter, "_Total")) + if err != nil { + return 0, 0, 0, fmt.Errorf("error getting Privileged Time counter: %w", err) + } + idle, err := q.GetRawCounterValue(fmt.Sprintf(idleTimeCounter, "_Total")) + if err != nil { + return 0, 0, 0, fmt.Errorf("error getting Idle Time counter: %w", err) } - counterValues, err := q.GetFormattedCounterValues() + user, err := q.GetRawCounterValue(fmt.Sprintf(userTimeCounter, "_Total")) + if err != nil { + return 0, 0, 0, fmt.Errorf("error getting Privileged User counter: %w", err) + } + return time.Duration(kernel.FirstValue * 100), time.Duration(idle.FirstValue * 100), time.Duration(user.FirstValue * 100), nil +} + +func populatePerCpuMetrics(q *pdh.Query) ([]CPU, error) { + cpuMap := make(map[string]*CPU, runtime.NumCPU()) + counters, err := getAllCouterPaths(q) if err != nil { - return CPUMetrics{}, fmt.Errorf("call to get formated values: %w", err) - } - var total, idle float64 - for counterName, counterVaule := range counterValues { - if strings.Contains(counterName, "\\Processor(_Total)\\% Processor Time") { - total = counterVaule[0].Measurement.(float64) - } else { - idle = counterVaule[0].Measurement.(float64) + return nil, fmt.Errorf("call to getAllCouterPaths failed: %w", err) + } + for _, counter := range counters { + instance, err := pdh.MatchInstanceName(counter) + if err != nil { + // invalid counter name - ignore the error + // shouldn't really happen, but just in case + continue + } + if strings.Contains(strings.ToLower(instance), "_total") { + // we're only interested in per-cpu performance counters + // counters containing "_TOTAL" are global counters i.e. average of all CPUs + // hence, ignore such counteres + continue + } + if _, ok := cpuMap[instance]; !ok { + cpuMap[instance] = &CPU{} + } + val, err := q.GetRawCounterValue(counter) + if err != nil { + return nil, fmt.Errorf("call to GetRawCounterValue failed for %s: %w", counter, err) + } + // the counter value returned by GetRawCounterValue is in 100-ns intervals + // convert it to nanoseconds + valUint := uint64(time.Duration(val.FirstValue*100) / time.Millisecond) + + if strings.Contains(strings.ToLower(counter), "% idle time") { + cpuMap[instance].Idle = opt.UintWith(valUint) + } else if strings.Contains(strings.ToLower(counter), "% privileged time") { + cpuMap[instance].Sys = opt.UintWith(valUint) + } else if strings.Contains(strings.ToLower(counter), "% user time") { + cpuMap[instance].User = opt.UintWith(valUint) } } - globalMetrics := CPUMetrics{} - //convert from duration to ticks - idleMetric := uint64(time.Duration(idle) / 1000) - sysMetric := uint64(time.Duration(total) / 1000) - // userMetrics := uint64(user / time.Millisecond) - globalMetrics.totals.Idle = opt.UintWith(idleMetric) - globalMetrics.totals.Sys = opt.UintWith(sysMetric) - // globalMetrics.totals.User = opt.UintWith(userMetrics) - // get per-cpu data + + list := make([]CPU, 0, len(cpuMap)) + for _, cpu := range cpuMap { + list = append(list, *cpu) + } + return list, nil +} + +func populatePerCpuMetricsFallback() ([]CPU, error) { cpus, err := windows.NtQuerySystemProcessorPerformanceInformation() if err != nil { - return CPUMetrics{}, fmt.Errorf("catll to NtQuerySystemProcessorPerformanceInformation failed: %w", err) + return nil, fmt.Errorf("catll to NtQuerySystemProcessorPerformanceInformation failed: %w", err) } - globalMetrics.list = make([]CPU, 0, len(cpus)) + list := make([]CPU, 0, len(cpus)) for _, cpu := range cpus { idleMetric := uint64(cpu.IdleTime / time.Millisecond) sysMetric := uint64(cpu.KernelTime / time.Millisecond) userMetrics := uint64(cpu.UserTime / time.Millisecond) - globalMetrics.list = append(globalMetrics.list, CPU{ + list = append(list, CPU{ Idle: opt.UintWith(idleMetric), Sys: opt.UintWith(sysMetric), User: opt.UintWith(userMetrics), }) } - return globalMetrics, nil + return list, nil } -func initializeQuery() (pdh.Query, error) { - query := pdh.Query{} - for _, c := range counters { - if err := query.AddCounter(c, "", "double", false); err != nil { - return pdh.Query{}, err - } +func populateGlobalCpuMetricsFallback() (idle, kernel, user time.Duration, err error) { + idle, kernel, user, err = windows.GetSystemTimes() + if err != nil { + return + } + return +} + +func getAllCouterPaths(q *pdh.Query) ([]string, error) { + allKernelCounters, err := q.GetCounterPaths(fmt.Sprintf(kernelTimeCounter, "*")) + if err != nil { + return nil, fmt.Errorf("call to fetch all kernel counters failed: %w", err) + } + allUserCounters, err := q.GetCounterPaths(fmt.Sprintf(userTimeCounter, "*")) + if err != nil { + return nil, fmt.Errorf("call to fetch all user counters failed: %w", err) } - _ = query.CollectData() - return query, nil + allIdleCounters, err := q.GetCounterPaths(fmt.Sprintf(idleTimeCounter, "*")) + if err != nil { + return nil, fmt.Errorf("call to fetch all user counters failed: %w", err) + } + return append(allKernelCounters, append(allUserCounters, allIdleCounters...)...), nil + } From fdb61e07bd9b7aa01f39c59dc72bcb69d024b834 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Mon, 18 Nov 2024 13:30:06 +0530 Subject: [PATCH 03/29] chore: windows pdh update --- metric/cpu/metrics.go | 17 +++ metric/cpu/metrics_test.go | 2 - metric/cpu/metrics_windows.go | 24 ++--- metric/cpu/metrics_windows_test.go | 163 ----------------------------- 4 files changed, 29 insertions(+), 177 deletions(-) delete mode 100644 metric/cpu/metrics_windows_test.go diff --git a/metric/cpu/metrics.go b/metric/cpu/metrics.go index 6733786f2a..8d7e0860e3 100644 --- a/metric/cpu/metrics.go +++ b/metric/cpu/metrics.go @@ -1,3 +1,20 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + package cpu import ( diff --git a/metric/cpu/metrics_test.go b/metric/cpu/metrics_test.go index 07fb85bd72..28b20540d7 100644 --- a/metric/cpu/metrics_test.go +++ b/metric/cpu/metrics_test.go @@ -15,8 +15,6 @@ // specific language governing permissions and limitations // under the License. -//go:build !windows - package cpu import ( diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index 20b4ecb8de..ce376206dd 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -46,12 +46,12 @@ func Get(_ resolve.Resolver) (CPUMetrics, error) { var q pdh.Query var kernel, user, idle time.Duration var combinedErr, err error + globalMetrics := CPUMetrics{} if err := q.Open(); err != nil { combinedErr = errors.Join(combinedErr, err) goto fallback - // return CPUMetrics{}, fmt.Errorf("call to PdhOpenQuery failed: %w", err) } // get per-cpu data @@ -62,23 +62,21 @@ func Get(_ resolve.Resolver) (CPUMetrics, error) { goto fallback } - kernel, user, idle, err = populateGlobalCpuMetrics(&q) + kernel, user, idle, err = populateGlobalCpuMetrics(&q, int64(len(globalMetrics.list))) if err != nil { combinedErr = errors.Join(combinedErr, err) goto fallback } - // _Total values returned by PerfCounters are averaged by number of cpus i.e. average time for system as a whole - // Previously, we used to return sum of times for all CPUs. - // To be backward compatible with previous version, multiply the average time by number of CPUs. - globalMetrics.totals.Idle = opt.UintWith(uint64(idle/time.Millisecond) * uint64(len(globalMetrics.list))) - globalMetrics.totals.Sys = opt.UintWith(uint64(kernel/time.Millisecond) * uint64(len(globalMetrics.list))) - globalMetrics.totals.User = opt.UintWith(uint64(user/time.Millisecond) * uint64(len(globalMetrics.list))) + globalMetrics.totals.Idle = opt.UintWith(uint64(idle / time.Millisecond)) + globalMetrics.totals.Sys = opt.UintWith(uint64(kernel / time.Millisecond)) + globalMetrics.totals.User = opt.UintWith(uint64(user / time.Millisecond)) return globalMetrics, nil fallback: - // fallback to GetSystemTimes() + // fallback to GetSystemTimes() and _NtQuerySystemInformation() if data collection via perf counter fails + // GetSystemTimes() return global data for current processor group i.e. upto 64 cores kernel, user, idle, err = populateGlobalCpuMetricsFallback() if err != nil { @@ -91,7 +89,6 @@ fallback: globalMetrics.totals.Sys = opt.UintWith(uint64(kernel / time.Millisecond)) globalMetrics.totals.User = opt.UintWith(uint64(user / time.Millisecond)) - // fallback to _NtQuerySystemInformation // _NtQuerySystemInformation return per-cpu data for current processor group i.e. upto 64 cores globalMetrics.list, err = populatePerCpuMetricsFallback() if err != nil { @@ -100,7 +97,7 @@ fallback: return globalMetrics, &PerfError{err: combinedErr} } -func populateGlobalCpuMetrics(q *pdh.Query) (time.Duration, time.Duration, time.Duration, error) { +func populateGlobalCpuMetrics(q *pdh.Query, numCpus int64) (time.Duration, time.Duration, time.Duration, error) { kernel, err := q.GetRawCounterValue(fmt.Sprintf(kernelTimeCounter, "_Total")) if err != nil { return 0, 0, 0, fmt.Errorf("error getting Privileged Time counter: %w", err) @@ -113,7 +110,10 @@ func populateGlobalCpuMetrics(q *pdh.Query) (time.Duration, time.Duration, time. if err != nil { return 0, 0, 0, fmt.Errorf("error getting Privileged User counter: %w", err) } - return time.Duration(kernel.FirstValue * 100), time.Duration(idle.FirstValue * 100), time.Duration(user.FirstValue * 100), nil + // _Total values returned by PerfCounters are averaged by number of cpus i.e. average time for system as a whole + // Previously, we used to return sum of times for all CPUs. + // To be backward compatible with previous version, multiply the average time by number of CPUs. + return time.Duration(kernel.FirstValue * 100 * numCpus), time.Duration(idle.FirstValue * 100 * numCpus), time.Duration(user.FirstValue * 100 * numCpus), nil } func populatePerCpuMetrics(q *pdh.Query) ([]CPU, error) { diff --git a/metric/cpu/metrics_windows_test.go b/metric/cpu/metrics_windows_test.go deleted file mode 100644 index 07fb85bd72..0000000000 --- a/metric/cpu/metrics_windows_test.go +++ /dev/null @@ -1,163 +0,0 @@ -// Licensed to Elasticsearch B.V. under one or more contributor -// license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright -// ownership. Elasticsearch B.V. licenses this file to you under -// the Apache License, Version 2.0 (the "License"); you may -// not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//go:build !windows - -package cpu - -import ( - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/mapstr" - "github.com/elastic/elastic-agent-libs/opt" - "github.com/elastic/elastic-agent-system-metrics/dev-tools/systemtests" -) - -func TestMonitorSample(t *testing.T) { - _ = logp.DevelopmentSetup() - cpu := &Monitor{lastSample: CPUMetrics{}, Hostfs: systemtests.DockerTestResolver()} - s, err := cpu.Fetch() - require.NoError(t, err) - - metricOpts := MetricOpts{Percentages: true, NormalizedPercentages: true, Ticks: true} - evt, err := s.Format(metricOpts) - assert.NoError(t, err, "error in Format") - testPopulatedEvent(evt, t, true) -} - -func TestCoresMonitorSample(t *testing.T) { - - cpuMetrics, err := Get(systemtests.DockerTestResolver()) - assert.NoError(t, err, "error in Get()") - - cores := &Monitor{lastSample: CPUMetrics{list: make([]CPU, len(cpuMetrics.list))}, Hostfs: systemtests.DockerTestResolver()} - sample, err := cores.FetchCores() - require.NoError(t, err) - - for _, s := range sample { - metricOpts := MetricOpts{Percentages: true, Ticks: true} - evt, err := s.Format(metricOpts) - assert.NoError(t, err, "error in Format") - testPopulatedEvent(evt, t, false) - } -} - -func testPopulatedEvent(evt mapstr.M, t *testing.T, norm bool) { - user, err := evt.GetValue("user.pct") - assert.NoError(t, err, "error getting user.pct") - system, err := evt.GetValue("system.pct") - assert.NoError(t, err, "error getting system.pct") - assert.True(t, user.(float64) > 0) - assert.True(t, system.(float64) > 0) - - if norm { - normUser, err := evt.GetValue("user.norm.pct") - assert.NoError(t, err, "error getting user.norm.pct") - assert.True(t, normUser.(float64) > 0) - normSystem, err := evt.GetValue("system.norm.pct") - assert.NoError(t, err, "error getting system.norm.pct") - assert.True(t, normSystem.(float64) > 0) - assert.True(t, normUser.(float64) <= 100) - assert.True(t, normSystem.(float64) <= 100) - - assert.True(t, user.(float64) > normUser.(float64)) - assert.True(t, system.(float64) > normSystem.(float64)) - } - - userTicks, err := evt.GetValue("user.ticks") - assert.NoError(t, err, "error getting user.ticks") - assert.True(t, userTicks.(uint64) > 0) - systemTicks, err := evt.GetValue("system.ticks") - assert.NoError(t, err, "error getting system.ticks") - assert.True(t, systemTicks.(uint64) > 0) -} - -// TestMetricsRounding tests that the returned percentages are rounded to -// four decimal places. -func TestMetricsRounding(t *testing.T) { - - sample := Metrics{ - previousSample: CPU{ - User: opt.UintWith(10855311), - Sys: opt.UintWith(2021040), - Idle: opt.UintWith(17657874), - }, - currentSample: CPU{ - User: opt.UintWith(10855693), - Sys: opt.UintWith(2021058), - Idle: opt.UintWith(17657876), - }, - } - - evt, err := sample.Format(MetricOpts{NormalizedPercentages: true}) - assert.NoError(t, err, "error in Format") - normUser, err := evt.GetValue("user.norm.pct") - assert.NoError(t, err, "error getting user.norm.pct") - normSystem, err := evt.GetValue("system.norm.pct") - assert.NoError(t, err, "error getting system.norm.pct") - - assert.Equal(t, normUser.(float64), 0.9502) - assert.Equal(t, normSystem.(float64), 0.0448) -} - -// TestMetricsPercentages tests that Metrics returns the correct -// percentages and normalized percentages. -func TestMetricsPercentages(t *testing.T) { - numCores := 10 - // This test simulates 30% user and 70% system (normalized), or 3% and 7% - // respectively when there are 10 CPUs. - const userTest, systemTest = 30., 70. - - s0 := CPU{ - User: opt.UintWith(10000000), - Sys: opt.UintWith(10000000), - Idle: opt.UintWith(20000000), - Nice: opt.UintWith(0), - } - s1 := CPU{ - User: opt.UintWith(s0.User.ValueOr(0) + uint64(userTest)), - Sys: opt.UintWith(s0.Sys.ValueOr(0) + uint64(systemTest)), - Idle: s0.Idle, - Nice: opt.UintWith(0), - } - sample := Metrics{ - count: numCores, - isTotals: true, - previousSample: s0, - currentSample: s1, - } - - evt, err := sample.Format(MetricOpts{NormalizedPercentages: true, Percentages: true}) - assert.NoError(t, err, "error in Format") - - user, err := evt.GetValue("user.norm.pct") - assert.NoError(t, err, "error getting user.norm.pct") - system, err := evt.GetValue("system.norm.pct") - assert.NoError(t, err, "error getting system.norm.pct") - idle, err := evt.GetValue("idle.norm.pct") - assert.NoError(t, err, "error getting idle.norm.pct") - total, err := evt.GetValue("total.norm.pct") - assert.NoError(t, err, "error getting total.norm.pct") - assert.EqualValues(t, .3, user.(float64)) - assert.EqualValues(t, .7, system.(float64)) - assert.EqualValues(t, .0, idle.(float64)) - assert.EqualValues(t, 1., total.(float64)) -} From f67904ae8bec435faf64cd024ea5150fc03091be Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 19 Nov 2024 00:11:00 +0530 Subject: [PATCH 04/29] chore: windows pdh update --- metric/cpu/metrics_windows.go | 115 ++++++++++++++++++++++++++++------ 1 file changed, 96 insertions(+), 19 deletions(-) diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index ce376206dd..127be2d63b 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -26,7 +26,9 @@ import ( "errors" "fmt" "runtime" + "slices" "strings" + "sync" "time" "github.com/elastic/elastic-agent-libs/helpers/windows/pdh" @@ -41,28 +43,40 @@ var ( idleTimeCounter = "\\Processor Information(%s)\\% Idle Time" ) +var ( + // a call to getAllCouterPaths is idempodent i.e. it returns same set of counters every time you call it. + // we can save some cruicial cycles by converting it to a sync.Once + getAllCouterPathsOnce = sync.OnceValues(getAllCouterPaths) + + getQueryOnce = sync.OnceValues(getQuery) +) + // Get fetches Windows CPU system times func Get(_ resolve.Resolver) (CPUMetrics, error) { - var q pdh.Query var kernel, user, idle time.Duration var combinedErr, err error globalMetrics := CPUMetrics{} - - if err := q.Open(); err != nil { + q, err := getQueryOnce() + if err != nil { combinedErr = errors.Join(combinedErr, err) goto fallback } + if err := q.CollectData(); err != nil { + combinedErr = errors.Join(combinedErr, fmt.Errorf("error collecting counter data: %w", err)) + goto fallback + } + // get per-cpu data // try getting data via performance counters - globalMetrics.list, err = populatePerCpuMetrics(&q) + globalMetrics.list, err = populatePerCpuMetrics(q) if err != nil { combinedErr = errors.Join(combinedErr, err) goto fallback } - kernel, user, idle, err = populateGlobalCpuMetrics(&q, int64(len(globalMetrics.list))) + kernel, user, idle, err = populateGlobalCpuMetrics(q, int64(len(globalMetrics.list))) if err != nil { combinedErr = errors.Join(combinedErr, err) goto fallback @@ -118,27 +132,25 @@ func populateGlobalCpuMetrics(q *pdh.Query, numCpus int64) (time.Duration, time. func populatePerCpuMetrics(q *pdh.Query) ([]CPU, error) { cpuMap := make(map[string]*CPU, runtime.NumCPU()) - counters, err := getAllCouterPaths(q) + counters, err := getAllCouterPathsOnce() if err != nil { return nil, fmt.Errorf("call to getAllCouterPaths failed: %w", err) } for _, counter := range counters { - instance, err := pdh.MatchInstanceName(counter) - if err != nil { - // invalid counter name - ignore the error - // shouldn't really happen, but just in case - continue - } + name := counter.name + instance := counter.instance + if strings.Contains(strings.ToLower(instance), "_total") { // we're only interested in per-cpu performance counters // counters containing "_TOTAL" are global counters i.e. average of all CPUs // hence, ignore such counteres continue } + if _, ok := cpuMap[instance]; !ok { - cpuMap[instance] = &CPU{} + cpuMap[counter.instance] = &CPU{} } - val, err := q.GetRawCounterValue(counter) + val, err := q.GetRawCounterValue(name) if err != nil { return nil, fmt.Errorf("call to GetRawCounterValue failed for %s: %w", counter, err) } @@ -146,11 +158,11 @@ func populatePerCpuMetrics(q *pdh.Query) ([]CPU, error) { // convert it to nanoseconds valUint := uint64(time.Duration(val.FirstValue*100) / time.Millisecond) - if strings.Contains(strings.ToLower(counter), "% idle time") { + if strings.Contains(strings.ToLower(name), "% idle time") { cpuMap[instance].Idle = opt.UintWith(valUint) - } else if strings.Contains(strings.ToLower(counter), "% privileged time") { + } else if strings.Contains(strings.ToLower(name), "% privileged time") { cpuMap[instance].Sys = opt.UintWith(valUint) - } else if strings.Contains(strings.ToLower(counter), "% user time") { + } else if strings.Contains(strings.ToLower(name), "% user time") { cpuMap[instance].User = opt.UintWith(valUint) } } @@ -189,7 +201,39 @@ func populateGlobalCpuMetricsFallback() (idle, kernel, user time.Duration, err e return } -func getAllCouterPaths(q *pdh.Query) ([]string, error) { +type counter struct { + name string + instance string +} + +func getAllCouterPaths() ([]*counter, error) { + // getAllCouterPaths returns needed counter paths to fetch per CPU data + // For eg. + // In a system with 64 cores, getAllCounterPaths() will return: + // \\Processor Information(0,0)\\% Privileged Time, + // \\Processor Information(0,1)\\% Privileged Time, + // \\Processor Information(0,2)\\% Privileged Time, + // ... + // \\Processor Information(0,63)\\% Privileged Time + // \\Processor Information(0,0)\\% Idle Time, + // \\Processor Information(0,1)\\% Idle Time, + // \\Processor Information(0,2)\\% Idle Time, + // ... + // \\Processor Information(0,63)\\% Idle Time + // \\Processor Information(0,0)\\% Idle Time, + // \\Processor Information(0,1)\\% Idle Time, + // \\Processor Information(0,2)\\% Idle Time, + // ... + // \\Processor Information(0,63)\\% Idle Time + // \\Processor Information(0,0)\\% Privileged Time, + // \\Processor Information(0,1)\\% Privileged Time, + // \\Processor Information(0,2)\\% Privileged Time, + // ... + // \\Processor Information(0,63)\\% Privileged Time + var q pdh.Query + if err := q.Open(); err != nil { + return nil, fmt.Errorf("Failed to open query: %w", err) + } allKernelCounters, err := q.GetCounterPaths(fmt.Sprintf(kernelTimeCounter, "*")) if err != nil { return nil, fmt.Errorf("call to fetch all kernel counters failed: %w", err) @@ -202,6 +246,39 @@ func getAllCouterPaths(q *pdh.Query) ([]string, error) { if err != nil { return nil, fmt.Errorf("call to fetch all user counters failed: %w", err) } - return append(allKernelCounters, append(allUserCounters, allIdleCounters...)...), nil + allCounters := make([]*counter, 0) + for _, counterName := range slices.Concat(allKernelCounters, allUserCounters, allIdleCounters) { + instance, err := pdh.MatchInstanceName(counterName) + if err != nil { + // invalid counter name - ignore the error + // shouldn't really happen, but just in case + continue + } + allCounters = append(allCounters, &counter{ + instance: instance, + name: counterName, + }) + } + return allCounters, nil + +} + +func getQuery() (*pdh.Query, error) { + var q pdh.Query + if err := q.Open(); err != nil { + return nil, fmt.Errorf("failed to open query: %w", err) + } + counters, err := getAllCouterPathsOnce() + if err != nil { + return nil, fmt.Errorf("call to getAllCouterPaths failed: %w", err) + } + // add all counters to our query. + // all of the counter data will be collected once we call CollectData() in Get() + for _, counter := range counters { + if err := q.AddCounter(counter.name, "", "", false); err != nil { + return nil, fmt.Errorf("call to AddCounter failed: %w", err) + } + } + return &q, nil } From 76d90abeb9a274efed2d5af5b1ade358c8e790b2 Mon Sep 17 00:00:00 2001 From: vihas makwana Date: Fri, 22 Nov 2024 13:43:05 +0530 Subject: [PATCH 05/29] chore: fix CI --- metric/cpu/metrics_windows.go | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index 127be2d63b..0249731e51 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -26,7 +26,6 @@ import ( "errors" "fmt" "runtime" - "slices" "strings" "sync" "time" @@ -38,17 +37,17 @@ import ( ) var ( - kernelTimeCounter = "\\Processor Information(%s)\\% Privileged Time" - userTimeCounter = "\\Processor Information(%s)\\% User Time" - idleTimeCounter = "\\Processor Information(%s)\\% Idle Time" + processorInformationCounter = "\\Processor Information(%s)\\%s" + totalKernelTimeCounter = fmt.Sprintf(processorInformationCounter, "_Total", "% Privileged Time") + totalIdleTimeCounter = fmt.Sprintf(processorInformationCounter, "_Total", "% Idle Time") + totalUserTimeCounter = fmt.Sprintf(processorInformationCounter, "_Total", "% User Time") ) var ( // a call to getAllCouterPaths is idempodent i.e. it returns same set of counters every time you call it. // we can save some cruicial cycles by converting it to a sync.Once getAllCouterPathsOnce = sync.OnceValues(getAllCouterPaths) - - getQueryOnce = sync.OnceValues(getQuery) + getQueryOnce = sync.OnceValues(getQuery) ) // Get fetches Windows CPU system times @@ -112,15 +111,15 @@ fallback: } func populateGlobalCpuMetrics(q *pdh.Query, numCpus int64) (time.Duration, time.Duration, time.Duration, error) { - kernel, err := q.GetRawCounterValue(fmt.Sprintf(kernelTimeCounter, "_Total")) + kernel, err := q.GetRawCounterValue(totalKernelTimeCounter) if err != nil { return 0, 0, 0, fmt.Errorf("error getting Privileged Time counter: %w", err) } - idle, err := q.GetRawCounterValue(fmt.Sprintf(idleTimeCounter, "_Total")) + idle, err := q.GetRawCounterValue(totalIdleTimeCounter) if err != nil { return 0, 0, 0, fmt.Errorf("error getting Idle Time counter: %w", err) } - user, err := q.GetRawCounterValue(fmt.Sprintf(userTimeCounter, "_Total")) + user, err := q.GetRawCounterValue(totalUserTimeCounter) if err != nil { return 0, 0, 0, fmt.Errorf("error getting Privileged User counter: %w", err) } @@ -234,27 +233,25 @@ func getAllCouterPaths() ([]*counter, error) { if err := q.Open(); err != nil { return nil, fmt.Errorf("Failed to open query: %w", err) } - allKernelCounters, err := q.GetCounterPaths(fmt.Sprintf(kernelTimeCounter, "*")) + allKnownCounters, err := q.GetCounterPaths(fmt.Sprintf(processorInformationCounter, "*", "*")) if err != nil { return nil, fmt.Errorf("call to fetch all kernel counters failed: %w", err) } - allUserCounters, err := q.GetCounterPaths(fmt.Sprintf(userTimeCounter, "*")) - if err != nil { - return nil, fmt.Errorf("call to fetch all user counters failed: %w", err) - } - allIdleCounters, err := q.GetCounterPaths(fmt.Sprintf(idleTimeCounter, "*")) - if err != nil { - return nil, fmt.Errorf("call to fetch all user counters failed: %w", err) - } + allKnownCounters = append(allKnownCounters, totalKernelTimeCounter, totalIdleTimeCounter, totalUserTimeCounter) allCounters := make([]*counter, 0) - for _, counterName := range slices.Concat(allKernelCounters, allUserCounters, allIdleCounters) { + for _, counterName := range allKnownCounters { instance, err := pdh.MatchInstanceName(counterName) if err != nil { // invalid counter name - ignore the error // shouldn't really happen, but just in case continue } + if !(strings.Contains(counterName, "Privileged Time") || + strings.Contains(counterName, "User Time") || + strings.Contains(counterName, "Idle Time")) { + continue + } allCounters = append(allCounters, &counter{ instance: instance, name: counterName, From 1521ccb7eba199d3bd60be53bab0cba6b267982a Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Fri, 22 Nov 2024 14:33:54 +0530 Subject: [PATCH 06/29] go.mod --- go.mod | 2 +- go.sum | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/go.mod b/go.mod index a182d9ed05..f152af81eb 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.22.8 require ( github.com/docker/docker v26.1.5+incompatible - github.com/elastic/elastic-agent-libs v0.17.3-0.20241112062438-5ba501c7ca8c + github.com/elastic/elastic-agent-libs v0.17.4-0.20241120070353-4a2d1a91a043 github.com/elastic/go-licenser v0.4.2 github.com/elastic/go-structform v0.0.9 github.com/elastic/go-sysinfo v1.14.1 diff --git a/go.sum b/go.sum index dc0300160d..aa33bde2d5 100644 --- a/go.sum +++ b/go.sum @@ -25,6 +25,8 @@ github.com/elastic/elastic-agent-libs v0.9.13 h1:D1rh1s67zlkDWmixWQaNWzn+qy6DafI github.com/elastic/elastic-agent-libs v0.9.13/go.mod h1:G9ljFvDE+muOOOQBf2eRituF0fE4suGkv25rfjTwY+c= github.com/elastic/elastic-agent-libs v0.17.3-0.20241112062438-5ba501c7ca8c h1:9pqutj36lOyKsKp1oKA9puR64/MZ2E+BWaAEVEOSl3g= github.com/elastic/elastic-agent-libs v0.17.3-0.20241112062438-5ba501c7ca8c/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M= +github.com/elastic/elastic-agent-libs v0.17.4-0.20241120070353-4a2d1a91a043 h1:Hi7lTJZPfAcmQmVges9IQkhAMv9f7Q+DbA9FBqU6Wf0= +github.com/elastic/elastic-agent-libs v0.17.4-0.20241120070353-4a2d1a91a043/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M= github.com/elastic/go-licenser v0.4.2 h1:bPbGm8bUd8rxzSswFOqvQh1dAkKGkgAmrPxbUi+Y9+A= github.com/elastic/go-licenser v0.4.2/go.mod h1:W8eH6FaZDR8fQGm+7FnVa7MxI1b/6dAqxz+zPB8nm5c= github.com/elastic/go-structform v0.0.9 h1:HpcS7xljL4kSyUfDJ8cXTJC6rU5ChL1wYb6cx3HLD+o= From 67086a29802bebb1d2d46dee05708694477de41d Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Fri, 22 Nov 2024 16:52:24 +0530 Subject: [PATCH 07/29] lint --- metric/cpu/metrics_windows.go | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index 0249731e51..ed71fb62c1 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -75,7 +75,7 @@ func Get(_ resolve.Resolver) (CPUMetrics, error) { goto fallback } - kernel, user, idle, err = populateGlobalCpuMetrics(q, int64(len(globalMetrics.list))) + kernel, user, idle, err = populateGlobalCPUMetrics(q, int64(len(globalMetrics.list))) if err != nil { combinedErr = errors.Join(combinedErr, err) goto fallback @@ -91,7 +91,7 @@ fallback: // fallback to GetSystemTimes() and _NtQuerySystemInformation() if data collection via perf counter fails // GetSystemTimes() return global data for current processor group i.e. upto 64 cores - kernel, user, idle, err = populateGlobalCpuMetricsFallback() + kernel, user, idle, err = populateGlobalCPUMetricsFallback() if err != nil { return CPUMetrics{}, fmt.Errorf("error getting counter values: %w", err) } @@ -110,7 +110,7 @@ fallback: return globalMetrics, &PerfError{err: combinedErr} } -func populateGlobalCpuMetrics(q *pdh.Query, numCpus int64) (time.Duration, time.Duration, time.Duration, error) { +func populateGlobalCPUMetrics(q *pdh.Query, numCpus int64) (time.Duration, time.Duration, time.Duration, error) { kernel, err := q.GetRawCounterValue(totalKernelTimeCounter) if err != nil { return 0, 0, 0, fmt.Errorf("error getting Privileged Time counter: %w", err) @@ -142,7 +142,7 @@ func populatePerCpuMetrics(q *pdh.Query) ([]CPU, error) { if strings.Contains(strings.ToLower(instance), "_total") { // we're only interested in per-cpu performance counters // counters containing "_TOTAL" are global counters i.e. average of all CPUs - // hence, ignore such counteres + // hence, ignore such counters continue } @@ -192,12 +192,11 @@ func populatePerCpuMetricsFallback() ([]CPU, error) { return list, nil } -func populateGlobalCpuMetricsFallback() (idle, kernel, user time.Duration, err error) { +func populateGlobalCPUMetricsFallback() (idle, kernel, user time.Duration, err error) { idle, kernel, user, err = windows.GetSystemTimes() if err != nil { return } - return } type counter struct { From f1c2689b7288c9a48256e234e4e196172ef8117c Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Fri, 22 Nov 2024 17:02:59 +0530 Subject: [PATCH 08/29] chore: remove fallback --- metric/cpu/metrics_windows.go | 67 +++-------------------------------- 1 file changed, 5 insertions(+), 62 deletions(-) diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index ed71fb62c1..1bea152bf5 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -23,7 +23,6 @@ vagrant winrm -s cmd -e -c "cd C:\\Gopath\src\\github.com\\elastic\\beats\\metri package cpu import ( - "errors" "fmt" "runtime" "strings" @@ -33,7 +32,6 @@ import ( "github.com/elastic/elastic-agent-libs/helpers/windows/pdh" "github.com/elastic/elastic-agent-libs/opt" "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" - "github.com/elastic/gosigar/sys/windows" ) var ( @@ -52,33 +50,26 @@ var ( // Get fetches Windows CPU system times func Get(_ resolve.Resolver) (CPUMetrics, error) { - var kernel, user, idle time.Duration - var combinedErr, err error - globalMetrics := CPUMetrics{} q, err := getQueryOnce() if err != nil { - combinedErr = errors.Join(combinedErr, err) - goto fallback + return CPUMetrics{}, err } if err := q.CollectData(); err != nil { - combinedErr = errors.Join(combinedErr, fmt.Errorf("error collecting counter data: %w", err)) - goto fallback + return CPUMetrics{}, fmt.Errorf("error collecting counter data: %w", err) } // get per-cpu data // try getting data via performance counters globalMetrics.list, err = populatePerCpuMetrics(q) if err != nil { - combinedErr = errors.Join(combinedErr, err) - goto fallback + return CPUMetrics{}, fmt.Errorf("error calling populatePerCpuMetrics: %w", err) } - kernel, user, idle, err = populateGlobalCPUMetrics(q, int64(len(globalMetrics.list))) + kernel, user, idle, err := populateGlobalCPUMetrics(q, int64(len(globalMetrics.list))) if err != nil { - combinedErr = errors.Join(combinedErr, err) - goto fallback + return CPUMetrics{}, fmt.Errorf("error calling populateGlobalCPUMetrics: %w", err) } globalMetrics.totals.Idle = opt.UintWith(uint64(idle / time.Millisecond)) @@ -86,28 +77,6 @@ func Get(_ resolve.Resolver) (CPUMetrics, error) { globalMetrics.totals.User = opt.UintWith(uint64(user / time.Millisecond)) return globalMetrics, nil - -fallback: - // fallback to GetSystemTimes() and _NtQuerySystemInformation() if data collection via perf counter fails - - // GetSystemTimes() return global data for current processor group i.e. upto 64 cores - kernel, user, idle, err = populateGlobalCPUMetricsFallback() - if err != nil { - return CPUMetrics{}, fmt.Errorf("error getting counter values: %w", err) - } - - // convert from duration to ticks - // ticks are measured in 1-ms intervals - globalMetrics.totals.Idle = opt.UintWith(uint64(idle / time.Millisecond)) - globalMetrics.totals.Sys = opt.UintWith(uint64(kernel / time.Millisecond)) - globalMetrics.totals.User = opt.UintWith(uint64(user / time.Millisecond)) - - // _NtQuerySystemInformation return per-cpu data for current processor group i.e. upto 64 cores - globalMetrics.list, err = populatePerCpuMetricsFallback() - if err != nil { - return CPUMetrics{}, fmt.Errorf("error getting per-cpu metrics: %w", err) - } - return globalMetrics, &PerfError{err: combinedErr} } func populateGlobalCPUMetrics(q *pdh.Query, numCpus int64) (time.Duration, time.Duration, time.Duration, error) { @@ -173,32 +142,6 @@ func populatePerCpuMetrics(q *pdh.Query) ([]CPU, error) { return list, nil } -func populatePerCpuMetricsFallback() ([]CPU, error) { - cpus, err := windows.NtQuerySystemProcessorPerformanceInformation() - if err != nil { - return nil, fmt.Errorf("catll to NtQuerySystemProcessorPerformanceInformation failed: %w", err) - } - list := make([]CPU, 0, len(cpus)) - for _, cpu := range cpus { - idleMetric := uint64(cpu.IdleTime / time.Millisecond) - sysMetric := uint64(cpu.KernelTime / time.Millisecond) - userMetrics := uint64(cpu.UserTime / time.Millisecond) - list = append(list, CPU{ - Idle: opt.UintWith(idleMetric), - Sys: opt.UintWith(sysMetric), - User: opt.UintWith(userMetrics), - }) - } - return list, nil -} - -func populateGlobalCPUMetricsFallback() (idle, kernel, user time.Duration, err error) { - idle, kernel, user, err = windows.GetSystemTimes() - if err != nil { - return - } -} - type counter struct { name string instance string From 3b4b69aa9e3df5234e2cfbc863fc2f265ac7635c Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Fri, 22 Nov 2024 17:07:49 +0530 Subject: [PATCH 09/29] lint --- metric/cpu/metrics_windows.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index 1bea152bf5..7aa0e9881c 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -62,9 +62,9 @@ func Get(_ resolve.Resolver) (CPUMetrics, error) { // get per-cpu data // try getting data via performance counters - globalMetrics.list, err = populatePerCpuMetrics(q) + globalMetrics.list, err = populatePerCPUMetrics(q) if err != nil { - return CPUMetrics{}, fmt.Errorf("error calling populatePerCpuMetrics: %w", err) + return CPUMetrics{}, fmt.Errorf("error calling populatePerCPUMetrics: %w", err) } kernel, user, idle, err := populateGlobalCPUMetrics(q, int64(len(globalMetrics.list))) @@ -98,7 +98,7 @@ func populateGlobalCPUMetrics(q *pdh.Query, numCpus int64) (time.Duration, time. return time.Duration(kernel.FirstValue * 100 * numCpus), time.Duration(idle.FirstValue * 100 * numCpus), time.Duration(user.FirstValue * 100 * numCpus), nil } -func populatePerCpuMetrics(q *pdh.Query) ([]CPU, error) { +func populatePerCPUMetrics(q *pdh.Query) ([]CPU, error) { cpuMap := make(map[string]*CPU, runtime.NumCPU()) counters, err := getAllCouterPathsOnce() if err != nil { @@ -173,7 +173,7 @@ func getAllCouterPaths() ([]*counter, error) { // \\Processor Information(0,63)\\% Privileged Time var q pdh.Query if err := q.Open(); err != nil { - return nil, fmt.Errorf("Failed to open query: %w", err) + return nil, fmt.Errorf("failed to open query: %w", err) } allKnownCounters, err := q.GetCounterPaths(fmt.Sprintf(processorInformationCounter, "*", "*")) if err != nil { From 695ac10504625e7ba4330e6849f092a7ca2a632f Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Fri, 22 Nov 2024 17:09:26 +0530 Subject: [PATCH 10/29] chore: remove redundant --- metric/cpu/metrics.go | 25 +++++-------------------- metric/cpu/metrics_windows.go | 6 +++--- 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/metric/cpu/metrics.go b/metric/cpu/metrics.go index 8d7e0860e3..c039d091a4 100644 --- a/metric/cpu/metrics.go +++ b/metric/cpu/metrics.go @@ -6,7 +6,7 @@ // not use this file except in compliance with the License. // You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an @@ -100,14 +100,14 @@ func New(hostfs resolve.Resolver) *Monitor { // This will overwrite the currently stored samples. func (m *Monitor) Fetch() (Metrics, error) { metric, err := Get(m.Hostfs) - if err != nil && !errors.Is(err, &PerfError{}) { + if err != nil { return Metrics{}, fmt.Errorf("error fetching CPU metrics: %w", err) } oldLastSample := m.lastSample m.lastSample = metric - return Metrics{previousSample: oldLastSample.totals, currentSample: metric.totals, count: len(metric.list), isTotals: true}, err + return Metrics{previousSample: oldLastSample.totals, currentSample: metric.totals, count: len(metric.list), isTotals: true}, nil } // FetchCores collects a new sample of CPU usage metrics per-core @@ -115,7 +115,7 @@ func (m *Monitor) Fetch() (Metrics, error) { func (m *Monitor) FetchCores() ([]Metrics, error) { metric, err := Get(m.Hostfs) - if err != nil && !errors.Is(err, &PerfError{}) { + if err != nil { return nil, fmt.Errorf("error fetching CPU metrics: %w", err) } @@ -140,7 +140,7 @@ func (m *Monitor) FetchCores() ([]Metrics, error) { } } m.lastSample = metric - return coreMetrics, err + return coreMetrics, nil } // Metrics stores the current and the last sample collected by a Beat. @@ -244,18 +244,3 @@ func cpuMetricTimeDelta(prev, current opt.Uint, timeDelta uint64, numCPU int) fl pct := float64(cpuDelta) / float64(timeDelta) return metric.Round(pct * float64(numCPU)) } - -type PerfError struct { - err error -} - -func (p *PerfError) Error() string { - if p.err == nil { - return "" - } - return fmt.Sprintf("Error while reading performance counter data: %s", p.err.Error()) -} - -func (p *PerfError) Unwrap() error { - return p.err -} diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index 7aa0e9881c..608a980c4b 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -67,7 +67,7 @@ func Get(_ resolve.Resolver) (CPUMetrics, error) { return CPUMetrics{}, fmt.Errorf("error calling populatePerCPUMetrics: %w", err) } - kernel, user, idle, err := populateGlobalCPUMetrics(q, int64(len(globalMetrics.list))) + kernel, user, idle, err := populateGlobalCPUMetrics(q, len(globalMetrics.list)) if err != nil { return CPUMetrics{}, fmt.Errorf("error calling populateGlobalCPUMetrics: %w", err) } @@ -79,7 +79,7 @@ func Get(_ resolve.Resolver) (CPUMetrics, error) { return globalMetrics, nil } -func populateGlobalCPUMetrics(q *pdh.Query, numCpus int64) (time.Duration, time.Duration, time.Duration, error) { +func populateGlobalCPUMetrics(q *pdh.Query, numCpus int) (time.Duration, time.Duration, time.Duration, error) { kernel, err := q.GetRawCounterValue(totalKernelTimeCounter) if err != nil { return 0, 0, 0, fmt.Errorf("error getting Privileged Time counter: %w", err) @@ -95,7 +95,7 @@ func populateGlobalCPUMetrics(q *pdh.Query, numCpus int64) (time.Duration, time. // _Total values returned by PerfCounters are averaged by number of cpus i.e. average time for system as a whole // Previously, we used to return sum of times for all CPUs. // To be backward compatible with previous version, multiply the average time by number of CPUs. - return time.Duration(kernel.FirstValue * 100 * numCpus), time.Duration(idle.FirstValue * 100 * numCpus), time.Duration(user.FirstValue * 100 * numCpus), nil + return time.Duration(kernel.FirstValue * 100 * int64(numCpus)), time.Duration(idle.FirstValue * 100 * int64(numCpus)), time.Duration(user.FirstValue * 100 * int64(numCpus)), nil } func populatePerCPUMetrics(q *pdh.Query) ([]CPU, error) { From 1bf1d83f80e75f3a17efda8e8b0efef464e02d41 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Fri, 22 Nov 2024 18:16:08 +0530 Subject: [PATCH 11/29] go.mod --- go.sum | 4 ---- 1 file changed, 4 deletions(-) diff --git a/go.sum b/go.sum index aa33bde2d5..3244cfb399 100644 --- a/go.sum +++ b/go.sum @@ -21,10 +21,6 @@ github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKoh github.com/docker/go-connections v0.4.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= -github.com/elastic/elastic-agent-libs v0.9.13 h1:D1rh1s67zlkDWmixWQaNWzn+qy6DafIDPTQnLpBNBUA= -github.com/elastic/elastic-agent-libs v0.9.13/go.mod h1:G9ljFvDE+muOOOQBf2eRituF0fE4suGkv25rfjTwY+c= -github.com/elastic/elastic-agent-libs v0.17.3-0.20241112062438-5ba501c7ca8c h1:9pqutj36lOyKsKp1oKA9puR64/MZ2E+BWaAEVEOSl3g= -github.com/elastic/elastic-agent-libs v0.17.3-0.20241112062438-5ba501c7ca8c/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M= github.com/elastic/elastic-agent-libs v0.17.4-0.20241120070353-4a2d1a91a043 h1:Hi7lTJZPfAcmQmVges9IQkhAMv9f7Q+DbA9FBqU6Wf0= github.com/elastic/elastic-agent-libs v0.17.4-0.20241120070353-4a2d1a91a043/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M= github.com/elastic/go-licenser v0.4.2 h1:bPbGm8bUd8rxzSswFOqvQh1dAkKGkgAmrPxbUi+Y9+A= From a5781e0c164efa1951250c8c3a0b44859d669918 Mon Sep 17 00:00:00 2001 From: vihas makwana Date: Sat, 23 Nov 2024 03:44:12 +0530 Subject: [PATCH 12/29] chore: memory improvements --- metric/cpu/metrics_windows.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index 608a980c4b..ec758fc172 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -108,7 +108,7 @@ func populatePerCPUMetrics(q *pdh.Query) ([]CPU, error) { name := counter.name instance := counter.instance - if strings.Contains(strings.ToLower(instance), "_total") { + if strings.Contains(instance, "_Total") { // we're only interested in per-cpu performance counters // counters containing "_TOTAL" are global counters i.e. average of all CPUs // hence, ignore such counters @@ -126,11 +126,11 @@ func populatePerCPUMetrics(q *pdh.Query) ([]CPU, error) { // convert it to nanoseconds valUint := uint64(time.Duration(val.FirstValue*100) / time.Millisecond) - if strings.Contains(strings.ToLower(name), "% idle time") { + if strings.Contains(name, "% Idle time") { cpuMap[instance].Idle = opt.UintWith(valUint) - } else if strings.Contains(strings.ToLower(name), "% privileged time") { + } else if strings.Contains(name, "% Privileged time") { cpuMap[instance].Sys = opt.UintWith(valUint) - } else if strings.Contains(strings.ToLower(name), "% user time") { + } else if strings.Contains(name, "% User time") { cpuMap[instance].User = opt.UintWith(valUint) } } From ea78b1dd37150df0e82b3228f96ac6793a168058 Mon Sep 17 00:00:00 2001 From: vihas makwana Date: Sat, 23 Nov 2024 17:33:01 +0530 Subject: [PATCH 13/29] chore: use rawCounterArray --- metric/cpu/metrics_windows.go | 197 +++++++--------------------------- 1 file changed, 38 insertions(+), 159 deletions(-) diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index ec758fc172..f677e25b52 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -24,9 +24,6 @@ package cpu import ( "fmt" - "runtime" - "strings" - "sync" "time" "github.com/elastic/elastic-agent-libs/helpers/windows/pdh" @@ -36,188 +33,70 @@ import ( var ( processorInformationCounter = "\\Processor Information(%s)\\%s" - totalKernelTimeCounter = fmt.Sprintf(processorInformationCounter, "_Total", "% Privileged Time") - totalIdleTimeCounter = fmt.Sprintf(processorInformationCounter, "_Total", "% Idle Time") - totalUserTimeCounter = fmt.Sprintf(processorInformationCounter, "_Total", "% User Time") + totalKernelTimeCounter = fmt.Sprintf(processorInformationCounter, "*", "% Privileged Time") + totalIdleTimeCounter = fmt.Sprintf(processorInformationCounter, "*", "% Idle Time") + totalUserTimeCounter = fmt.Sprintf(processorInformationCounter, "*", "% User Time") ) -var ( - // a call to getAllCouterPaths is idempodent i.e. it returns same set of counters every time you call it. - // we can save some cruicial cycles by converting it to a sync.Once - getAllCouterPathsOnce = sync.OnceValues(getAllCouterPaths) - getQueryOnce = sync.OnceValues(getQuery) -) +var query, qError = buildQuery() // Get fetches Windows CPU system times func Get(_ resolve.Resolver) (CPUMetrics, error) { globalMetrics := CPUMetrics{} - q, err := getQueryOnce() - if err != nil { - return CPUMetrics{}, err - } - - if err := q.CollectData(); err != nil { - return CPUMetrics{}, fmt.Errorf("error collecting counter data: %w", err) - } - // get per-cpu data - // try getting data via performance counters - globalMetrics.list, err = populatePerCPUMetrics(q) - if err != nil { - return CPUMetrics{}, fmt.Errorf("error calling populatePerCPUMetrics: %w", err) - } - - kernel, user, idle, err := populateGlobalCPUMetrics(q, len(globalMetrics.list)) - if err != nil { - return CPUMetrics{}, fmt.Errorf("error calling populateGlobalCPUMetrics: %w", err) + if err := query.CollectData(); err != nil { + return globalMetrics, err } - globalMetrics.totals.Idle = opt.UintWith(uint64(idle / time.Millisecond)) - globalMetrics.totals.Sys = opt.UintWith(uint64(kernel / time.Millisecond)) - globalMetrics.totals.User = opt.UintWith(uint64(user / time.Millisecond)) - - return globalMetrics, nil -} - -func populateGlobalCPUMetrics(q *pdh.Query, numCpus int) (time.Duration, time.Duration, time.Duration, error) { - kernel, err := q.GetRawCounterValue(totalKernelTimeCounter) - if err != nil { - return 0, 0, 0, fmt.Errorf("error getting Privileged Time counter: %w", err) - } - idle, err := q.GetRawCounterValue(totalIdleTimeCounter) + kernelRawData, err := query.GetRawCounterArray(totalKernelTimeCounter, true) if err != nil { - return 0, 0, 0, fmt.Errorf("error getting Idle Time counter: %w", err) + return globalMetrics, err } - user, err := q.GetRawCounterValue(totalUserTimeCounter) + idleRawData, err := query.GetRawCounterArray(totalIdleTimeCounter, true) if err != nil { - return 0, 0, 0, fmt.Errorf("error getting Privileged User counter: %w", err) + return globalMetrics, err } - // _Total values returned by PerfCounters are averaged by number of cpus i.e. average time for system as a whole - // Previously, we used to return sum of times for all CPUs. - // To be backward compatible with previous version, multiply the average time by number of CPUs. - return time.Duration(kernel.FirstValue * 100 * int64(numCpus)), time.Duration(idle.FirstValue * 100 * int64(numCpus)), time.Duration(user.FirstValue * 100 * int64(numCpus)), nil -} - -func populatePerCPUMetrics(q *pdh.Query) ([]CPU, error) { - cpuMap := make(map[string]*CPU, runtime.NumCPU()) - counters, err := getAllCouterPathsOnce() + userRawData, err := query.GetRawCounterArray(totalUserTimeCounter, true) if err != nil { - return nil, fmt.Errorf("call to getAllCouterPaths failed: %w", err) + return globalMetrics, err } - for _, counter := range counters { - name := counter.name - instance := counter.instance - - if strings.Contains(instance, "_Total") { - // we're only interested in per-cpu performance counters - // counters containing "_TOTAL" are global counters i.e. average of all CPUs - // hence, ignore such counters - continue - } + var idle, kernel, user time.Duration + globalMetrics.list = make([]CPU, len(userRawData)) + for i := 0; i < len(globalMetrics.list); i++ { + idleTimeNs := time.Duration(idleRawData[i].RawValue.FirstValue * 100) + kernelTimeNs := time.Duration(kernelRawData[i].RawValue.FirstValue * 100) + userTimeNs := time.Duration(userRawData[i].RawValue.FirstValue * 100) - if _, ok := cpuMap[instance]; !ok { - cpuMap[counter.instance] = &CPU{} - } - val, err := q.GetRawCounterValue(name) - if err != nil { - return nil, fmt.Errorf("call to GetRawCounterValue failed for %s: %w", counter, err) - } - // the counter value returned by GetRawCounterValue is in 100-ns intervals - // convert it to nanoseconds - valUint := uint64(time.Duration(val.FirstValue*100) / time.Millisecond) + globalMetrics.list[i].Idle = opt.UintWith(uint64(idleTimeNs / time.Millisecond)) + globalMetrics.list[i].Sys = opt.UintWith(uint64(kernelTimeNs / time.Millisecond)) + globalMetrics.list[i].User = opt.UintWith(uint64(userTimeNs / time.Millisecond)) - if strings.Contains(name, "% Idle time") { - cpuMap[instance].Idle = opt.UintWith(valUint) - } else if strings.Contains(name, "% Privileged time") { - cpuMap[instance].Sys = opt.UintWith(valUint) - } else if strings.Contains(name, "% User time") { - cpuMap[instance].User = opt.UintWith(valUint) - } + // add the per-cpu time to track the total time spent by system + idle += idleTimeNs + kernel += kernelTimeNs + user += userTimeNs } - list := make([]CPU, 0, len(cpuMap)) - for _, cpu := range cpuMap { - list = append(list, *cpu) - } - return list, nil -} + globalMetrics.totals.Idle = opt.UintWith(uint64(idle / time.Millisecond)) + globalMetrics.totals.Sys = opt.UintWith(uint64(kernel / time.Millisecond)) + globalMetrics.totals.User = opt.UintWith(uint64(user / time.Millisecond)) -type counter struct { - name string - instance string + return globalMetrics, nil } -func getAllCouterPaths() ([]*counter, error) { - // getAllCouterPaths returns needed counter paths to fetch per CPU data - // For eg. - // In a system with 64 cores, getAllCounterPaths() will return: - // \\Processor Information(0,0)\\% Privileged Time, - // \\Processor Information(0,1)\\% Privileged Time, - // \\Processor Information(0,2)\\% Privileged Time, - // ... - // \\Processor Information(0,63)\\% Privileged Time - // \\Processor Information(0,0)\\% Idle Time, - // \\Processor Information(0,1)\\% Idle Time, - // \\Processor Information(0,2)\\% Idle Time, - // ... - // \\Processor Information(0,63)\\% Idle Time - // \\Processor Information(0,0)\\% Idle Time, - // \\Processor Information(0,1)\\% Idle Time, - // \\Processor Information(0,2)\\% Idle Time, - // ... - // \\Processor Information(0,63)\\% Idle Time - // \\Processor Information(0,0)\\% Privileged Time, - // \\Processor Information(0,1)\\% Privileged Time, - // \\Processor Information(0,2)\\% Privileged Time, - // ... - // \\Processor Information(0,63)\\% Privileged Time +func buildQuery() (pdh.Query, error) { var q pdh.Query if err := q.Open(); err != nil { - return nil, fmt.Errorf("failed to open query: %w", err) + return q, err } - allKnownCounters, err := q.GetCounterPaths(fmt.Sprintf(processorInformationCounter, "*", "*")) - if err != nil { - return nil, fmt.Errorf("call to fetch all kernel counters failed: %w", err) + if err := q.AddCounter(totalKernelTimeCounter, "", "", true); err != nil { + return q, err } - allKnownCounters = append(allKnownCounters, totalKernelTimeCounter, totalIdleTimeCounter, totalUserTimeCounter) - - allCounters := make([]*counter, 0) - for _, counterName := range allKnownCounters { - instance, err := pdh.MatchInstanceName(counterName) - if err != nil { - // invalid counter name - ignore the error - // shouldn't really happen, but just in case - continue - } - if !(strings.Contains(counterName, "Privileged Time") || - strings.Contains(counterName, "User Time") || - strings.Contains(counterName, "Idle Time")) { - continue - } - allCounters = append(allCounters, &counter{ - instance: instance, - name: counterName, - }) - } - return allCounters, nil - -} - -func getQuery() (*pdh.Query, error) { - var q pdh.Query - if err := q.Open(); err != nil { - return nil, fmt.Errorf("failed to open query: %w", err) - } - counters, err := getAllCouterPathsOnce() - if err != nil { - return nil, fmt.Errorf("call to getAllCouterPaths failed: %w", err) + if err := q.AddCounter(totalUserTimeCounter, "", "", true); err != nil { + return q, err } - // add all counters to our query. - // all of the counter data will be collected once we call CollectData() in Get() - for _, counter := range counters { - if err := q.AddCounter(counter.name, "", "", false); err != nil { - return nil, fmt.Errorf("call to AddCounter failed: %w", err) - } + if err := q.AddCounter(totalIdleTimeCounter, "", "", true); err != nil { + return q, err } - return &q, nil + return q, nil } From 7955534195b5a9f60a8f2d87be6b47b8da957569 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Mon, 25 Nov 2024 16:52:59 +0530 Subject: [PATCH 14/29] chore: clean up --- metric/cpu/metrics_windows.go | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index f677e25b52..27f857363c 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -63,23 +63,23 @@ func Get(_ resolve.Resolver) (CPUMetrics, error) { var idle, kernel, user time.Duration globalMetrics.list = make([]CPU, len(userRawData)) for i := 0; i < len(globalMetrics.list); i++ { - idleTimeNs := time.Duration(idleRawData[i].RawValue.FirstValue * 100) - kernelTimeNs := time.Duration(kernelRawData[i].RawValue.FirstValue * 100) - userTimeNs := time.Duration(userRawData[i].RawValue.FirstValue * 100) + idleTimeMs := time.Duration(idleRawData[i].RawValue.FirstValue*100) / time.Millisecond + kernelTimeMs := time.Duration(kernelRawData[i].RawValue.FirstValue*100) / time.Millisecond + userTimeMs := time.Duration(userRawData[i].RawValue.FirstValue*100) / time.Millisecond - globalMetrics.list[i].Idle = opt.UintWith(uint64(idleTimeNs / time.Millisecond)) - globalMetrics.list[i].Sys = opt.UintWith(uint64(kernelTimeNs / time.Millisecond)) - globalMetrics.list[i].User = opt.UintWith(uint64(userTimeNs / time.Millisecond)) + globalMetrics.list[i].Idle = opt.UintWith(uint64(idleTimeMs)) + globalMetrics.list[i].Sys = opt.UintWith(uint64(kernelTimeMs)) + globalMetrics.list[i].User = opt.UintWith(uint64(userTimeMs)) // add the per-cpu time to track the total time spent by system - idle += idleTimeNs - kernel += kernelTimeNs - user += userTimeNs + idle += idleTimeMs + kernel += kernelTimeMs + user += userTimeMs } - globalMetrics.totals.Idle = opt.UintWith(uint64(idle / time.Millisecond)) - globalMetrics.totals.Sys = opt.UintWith(uint64(kernel / time.Millisecond)) - globalMetrics.totals.User = opt.UintWith(uint64(user / time.Millisecond)) + globalMetrics.totals.Idle = opt.UintWith(uint64(idle)) + globalMetrics.totals.Sys = opt.UintWith(uint64(kernel)) + globalMetrics.totals.User = opt.UintWith(uint64(user)) return globalMetrics, nil } From 7692d3a34ba079769c5132f34b00afb417dce246 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 26 Nov 2024 21:15:17 +0530 Subject: [PATCH 15/29] go.mod --- go.mod | 2 +- go.sum | 2 ++ metric/cpu/metrics_windows.go | 6 +++--- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index f152af81eb..264dc2d9f1 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.22.8 require ( github.com/docker/docker v26.1.5+incompatible - github.com/elastic/elastic-agent-libs v0.17.4-0.20241120070353-4a2d1a91a043 + github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d github.com/elastic/go-licenser v0.4.2 github.com/elastic/go-structform v0.0.9 github.com/elastic/go-sysinfo v1.14.1 diff --git a/go.sum b/go.sum index 3244cfb399..1f523f3b73 100644 --- a/go.sum +++ b/go.sum @@ -23,6 +23,8 @@ github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4 github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/elastic/elastic-agent-libs v0.17.4-0.20241120070353-4a2d1a91a043 h1:Hi7lTJZPfAcmQmVges9IQkhAMv9f7Q+DbA9FBqU6Wf0= github.com/elastic/elastic-agent-libs v0.17.4-0.20241120070353-4a2d1a91a043/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M= +github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d h1:nY8LSeTYU1uSDAAg7WwGH/cALgdovAXLdIzV25Ky0Bo= +github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M= github.com/elastic/go-licenser v0.4.2 h1:bPbGm8bUd8rxzSswFOqvQh1dAkKGkgAmrPxbUi+Y9+A= github.com/elastic/go-licenser v0.4.2/go.mod h1:W8eH6FaZDR8fQGm+7FnVa7MxI1b/6dAqxz+zPB8nm5c= github.com/elastic/go-structform v0.0.9 h1:HpcS7xljL4kSyUfDJ8cXTJC6rU5ChL1wYb6cx3HLD+o= diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index 27f857363c..4815979378 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -89,13 +89,13 @@ func buildQuery() (pdh.Query, error) { if err := q.Open(); err != nil { return q, err } - if err := q.AddCounter(totalKernelTimeCounter, "", "", true); err != nil { + if err := q.AddCounter(totalKernelTimeCounter, "", "", true, true); err != nil { return q, err } - if err := q.AddCounter(totalUserTimeCounter, "", "", true); err != nil { + if err := q.AddCounter(totalUserTimeCounter, "", "", true, true); err != nil { return q, err } - if err := q.AddCounter(totalIdleTimeCounter, "", "", true); err != nil { + if err := q.AddCounter(totalIdleTimeCounter, "", "", true, true); err != nil { return q, err } return q, nil From 6150435d0fb260d6709c1c175c07c4fced42ec40 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 26 Nov 2024 21:19:34 +0530 Subject: [PATCH 16/29] chore: error handling --- metric/cpu/metrics_windows.go | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index 4815979378..0d0c10caaf 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -43,6 +43,9 @@ var query, qError = buildQuery() // Get fetches Windows CPU system times func Get(_ resolve.Resolver) (CPUMetrics, error) { globalMetrics := CPUMetrics{} + if qError != nil { + return globalMetrics, qError + } if err := query.CollectData(); err != nil { return globalMetrics, err @@ -50,19 +53,23 @@ func Get(_ resolve.Resolver) (CPUMetrics, error) { kernelRawData, err := query.GetRawCounterArray(totalKernelTimeCounter, true) if err != nil { - return globalMetrics, err + return globalMetrics, fmt.Errorf("error calling GetRawCounterArray for kernel counter: %w", err) } idleRawData, err := query.GetRawCounterArray(totalIdleTimeCounter, true) if err != nil { - return globalMetrics, err + return globalMetrics, fmt.Errorf("error calling GetRawCounterArray for idle counter: %w", err) } userRawData, err := query.GetRawCounterArray(totalUserTimeCounter, true) if err != nil { - return globalMetrics, err + return globalMetrics, fmt.Errorf("error calling GetRawCounterArray for user counter: %w", err) } var idle, kernel, user time.Duration globalMetrics.list = make([]CPU, len(userRawData)) for i := 0; i < len(globalMetrics.list); i++ { + // The values returned by GetRawCounterArray are of equal length and are sorted by instance names. + // For CPU core {i}, idleRawData[i], kernelRawData[i], and userRawData[i] correspond to the idle time, kernel time, and user time, respectively. + + // values returned by counter are in 100-ns intervals. Hence, convert it to millisecond. idleTimeMs := time.Duration(idleRawData[i].RawValue.FirstValue*100) / time.Millisecond kernelTimeMs := time.Duration(kernelRawData[i].RawValue.FirstValue*100) / time.Millisecond userTimeMs := time.Duration(userRawData[i].RawValue.FirstValue*100) / time.Millisecond @@ -87,16 +94,16 @@ func Get(_ resolve.Resolver) (CPUMetrics, error) { func buildQuery() (pdh.Query, error) { var q pdh.Query if err := q.Open(); err != nil { - return q, err + return q, fmt.Errorf("failed to open query: %w", err) } if err := q.AddCounter(totalKernelTimeCounter, "", "", true, true); err != nil { - return q, err + return q, fmt.Errorf("error calling AddCounter for kernel counter: %w", err) } if err := q.AddCounter(totalUserTimeCounter, "", "", true, true); err != nil { - return q, err + return q, fmt.Errorf("error calling AddCounter for user counter: %w", err) } if err := q.AddCounter(totalIdleTimeCounter, "", "", true, true); err != nil { - return q, err + return q, fmt.Errorf("error calling AddCounter for idle counter: %w", err) } return q, nil } From f0d2015c3da8255e2793657e2d1ca65b7cc51acc Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 26 Nov 2024 21:26:15 +0530 Subject: [PATCH 17/29] chore: test --- metric/cpu/metric_windows_test.go | 51 +++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 metric/cpu/metric_windows_test.go diff --git a/metric/cpu/metric_windows_test.go b/metric/cpu/metric_windows_test.go new file mode 100644 index 0000000000..26755f6877 --- /dev/null +++ b/metric/cpu/metric_windows_test.go @@ -0,0 +1,51 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//go:build windows + +package cpu + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestCounterLength(t *testing.T) { + query, err := buildQuery() + require.NoError(t, err) + require.NoError(t, query.CollectData()) + + kernelRawData, err := query.GetRawCounterArray(totalKernelTimeCounter, true) + require.NoError(t, err) + + idleRawData, err := query.GetRawCounterArray(totalIdleTimeCounter, true) + require.NoError(t, err) + + userRawData, err := query.GetRawCounterArray(totalUserTimeCounter, true) + require.NoError(t, err) + + require.Equal(t, len(kernelRawData), len(idleRawData)) + require.Equal(t, len(userRawData), len(idleRawData)) + + for i := 0; i < len(userRawData); i++ { + require.Equal(t, userRawData[i].InstanceName, kernelRawData[i].InstanceName, "InstanceName should be equal") + } + for i := 0; i < len(kernelRawData); i++ { + require.Equal(t, kernelRawData[i].InstanceName, idleRawData[i].InstanceName, "InstanceName should be equal") + } +} From 45028a5fb5307591421fbb5762741d2f52c180d4 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 26 Nov 2024 21:29:58 +0530 Subject: [PATCH 18/29] lint --- metric/cpu/metrics_windows.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index 0d0c10caaf..80e6648575 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -70,18 +70,18 @@ func Get(_ resolve.Resolver) (CPUMetrics, error) { // For CPU core {i}, idleRawData[i], kernelRawData[i], and userRawData[i] correspond to the idle time, kernel time, and user time, respectively. // values returned by counter are in 100-ns intervals. Hence, convert it to millisecond. - idleTimeMs := time.Duration(idleRawData[i].RawValue.FirstValue*100) / time.Millisecond - kernelTimeMs := time.Duration(kernelRawData[i].RawValue.FirstValue*100) / time.Millisecond - userTimeMs := time.Duration(userRawData[i].RawValue.FirstValue*100) / time.Millisecond + idleTime := time.Duration(idleRawData[i].RawValue.FirstValue*100) / time.Millisecond + kernelTime := time.Duration(kernelRawData[i].RawValue.FirstValue*100) / time.Millisecond + userTime := time.Duration(userRawData[i].RawValue.FirstValue*100) / time.Millisecond - globalMetrics.list[i].Idle = opt.UintWith(uint64(idleTimeMs)) - globalMetrics.list[i].Sys = opt.UintWith(uint64(kernelTimeMs)) - globalMetrics.list[i].User = opt.UintWith(uint64(userTimeMs)) + globalMetrics.list[i].Idle = opt.UintWith(uint64(idleTime)) + globalMetrics.list[i].Sys = opt.UintWith(uint64(kernelTime)) + globalMetrics.list[i].User = opt.UintWith(uint64(userTime)) // add the per-cpu time to track the total time spent by system - idle += idleTimeMs - kernel += kernelTimeMs - user += userTimeMs + idle += idleTime + kernel += kernelTime + user += userTime } globalMetrics.totals.Idle = opt.UintWith(uint64(idle)) From ffce1613cf7be5f2e3ab77c6276b892f8648b046 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 26 Nov 2024 21:32:26 +0530 Subject: [PATCH 19/29] go.sum --- go.sum | 2 -- 1 file changed, 2 deletions(-) diff --git a/go.sum b/go.sum index 1f523f3b73..26097d7a5d 100644 --- a/go.sum +++ b/go.sum @@ -21,8 +21,6 @@ github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKoh github.com/docker/go-connections v0.4.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= -github.com/elastic/elastic-agent-libs v0.17.4-0.20241120070353-4a2d1a91a043 h1:Hi7lTJZPfAcmQmVges9IQkhAMv9f7Q+DbA9FBqU6Wf0= -github.com/elastic/elastic-agent-libs v0.17.4-0.20241120070353-4a2d1a91a043/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M= github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d h1:nY8LSeTYU1uSDAAg7WwGH/cALgdovAXLdIzV25Ky0Bo= github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M= github.com/elastic/go-licenser v0.4.2 h1:bPbGm8bUd8rxzSswFOqvQh1dAkKGkgAmrPxbUi+Y9+A= From d7c3ff56c85fe099feee2c3185d7e3f7d4df95a1 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 26 Nov 2024 22:12:51 +0530 Subject: [PATCH 20/29] chore: add flag --- metric/cpu/metrics.go | 21 ++++++++++---- metric/cpu/metrics_aix.go | 2 +- metric/cpu/metrics_darwin.go | 2 +- metric/cpu/metrics_openbsd.go | 2 +- metric/cpu/metrics_procfs_common.go | 2 +- metric/cpu/metrics_windows.go | 45 ++++++++++++++++++++++++++++- 6 files changed, 64 insertions(+), 10 deletions(-) diff --git a/metric/cpu/metrics.go b/metric/cpu/metrics.go index c039d091a4..1986f71482 100644 --- a/metric/cpu/metrics.go +++ b/metric/cpu/metrics.go @@ -84,6 +84,18 @@ The below code implements a "metrics tracker" that gives us the ability to calculate CPU percentages, as we average usage across a time period. */ +type option struct { + usePerformanceCounter bool +} + +type OptionFunc func(*option) + +func WithPerformanceCounter() OptionFunc { + return func(o *option) { + o.usePerformanceCounter = true + } +} + // Monitor is used to monitor the overall CPU usage of the system over time. type Monitor struct { lastSample CPUMetrics @@ -98,8 +110,8 @@ func New(hostfs resolve.Resolver) *Monitor { // Fetch collects a new sample of the CPU usage metrics. // This will overwrite the currently stored samples. -func (m *Monitor) Fetch() (Metrics, error) { - metric, err := Get(m.Hostfs) +func (m *Monitor) Fetch(opts ...OptionFunc) (Metrics, error) { + metric, err := Get(m.Hostfs, opts...) if err != nil { return Metrics{}, fmt.Errorf("error fetching CPU metrics: %w", err) } @@ -112,9 +124,8 @@ func (m *Monitor) Fetch() (Metrics, error) { // FetchCores collects a new sample of CPU usage metrics per-core // This will overwrite the currently stored samples. -func (m *Monitor) FetchCores() ([]Metrics, error) { - - metric, err := Get(m.Hostfs) +func (m *Monitor) FetchCores(opts ...OptionFunc) ([]Metrics, error) { + metric, err := Get(m.Hostfs, opts...) if err != nil { return nil, fmt.Errorf("error fetching CPU metrics: %w", err) } diff --git a/metric/cpu/metrics_aix.go b/metric/cpu/metrics_aix.go index ede2e2d35a..4b365e1ce1 100644 --- a/metric/cpu/metrics_aix.go +++ b/metric/cpu/metrics_aix.go @@ -58,7 +58,7 @@ func tick2msec(val uint64) uint64 { } // Get returns a metrics object for CPU data -func Get(_ resolve.Resolver) (CPUMetrics, error) { +func Get(_ resolve.Resolver, _ ...OptionFunc) (CPUMetrics, error) { totals, err := getCPUTotals() if err != nil { diff --git a/metric/cpu/metrics_darwin.go b/metric/cpu/metrics_darwin.go index 619b5972a2..09dd2acec7 100644 --- a/metric/cpu/metrics_darwin.go +++ b/metric/cpu/metrics_darwin.go @@ -27,7 +27,7 @@ import ( ) // Get is the Darwin implementation of Get -func Get(_ resolve.Resolver) (CPUMetrics, error) { +func Get(_ resolve.Resolver, _ ...OptionFunc) (CPUMetrics, error) { // We're using the gopsutil library here. // The code used by both gosigar and go-sysinfo appears to be // the same code as gopsutil, including copy-pasted comments. diff --git a/metric/cpu/metrics_openbsd.go b/metric/cpu/metrics_openbsd.go index 506c9502a8..f6a92e6c23 100644 --- a/metric/cpu/metrics_openbsd.go +++ b/metric/cpu/metrics_openbsd.go @@ -41,7 +41,7 @@ import ( ) // Get is the OpenBSD implementation of get -func Get(_ resolve.Resolver) (CPUMetrics, error) { +func Get(_ resolve.Resolver, _ ...OptionFunc) (CPUMetrics, error) { // see man 2 sysctl loadGlobal := [C.CPUSTATES]C.long{ diff --git a/metric/cpu/metrics_procfs_common.go b/metric/cpu/metrics_procfs_common.go index d175a529f0..9218329cdb 100644 --- a/metric/cpu/metrics_procfs_common.go +++ b/metric/cpu/metrics_procfs_common.go @@ -31,7 +31,7 @@ import ( ) // Get returns a metrics object for CPU data -func Get(procfs resolve.Resolver) (CPUMetrics, error) { +func Get(procfs resolve.Resolver, _ ...OptionFunc) (CPUMetrics, error) { path := procfs.ResolveHostFS("/proc/stat") fd, err := os.Open(path) defer func() { diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index 80e6648575..c780d7b9ce 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -29,6 +29,7 @@ import ( "github.com/elastic/elastic-agent-libs/helpers/windows/pdh" "github.com/elastic/elastic-agent-libs/opt" "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" + "github.com/elastic/gosigar/sys/windows" ) var ( @@ -41,7 +42,14 @@ var ( var query, qError = buildQuery() // Get fetches Windows CPU system times -func Get(_ resolve.Resolver) (CPUMetrics, error) { +func Get(_ resolve.Resolver, opts ...OptionFunc) (CPUMetrics, error) { + op := option{} + for _, o := range opts { + o(&op) + } + if !op.usePerformanceCounter { + return defaultGet() + } globalMetrics := CPUMetrics{} if qError != nil { return globalMetrics, qError @@ -107,3 +115,38 @@ func buildQuery() (pdh.Query, error) { } return q, nil } + +func defaultGet() (CPUMetrics, error) { + idle, kernel, user, err := windows.GetSystemTimes() + if err != nil { + return CPUMetrics{}, fmt.Errorf("call to GetSystemTimes failed: %w", err) + } + + globalMetrics := CPUMetrics{} + //convert from duration to ticks + idleMetric := uint64(idle / time.Millisecond) + sysMetric := uint64(kernel / time.Millisecond) + userMetrics := uint64(user / time.Millisecond) + globalMetrics.totals.Idle = opt.UintWith(idleMetric) + globalMetrics.totals.Sys = opt.UintWith(sysMetric) + globalMetrics.totals.User = opt.UintWith(userMetrics) + + // get per-cpu data + cpus, err := windows.NtQuerySystemProcessorPerformanceInformation() + if err != nil { + return CPUMetrics{}, fmt.Errorf("catll to NtQuerySystemProcessorPerformanceInformation failed: %w", err) + } + globalMetrics.list = make([]CPU, 0, len(cpus)) + for _, cpu := range cpus { + idleMetric := uint64(cpu.IdleTime / time.Millisecond) + sysMetric := uint64(cpu.KernelTime / time.Millisecond) + userMetrics := uint64(cpu.UserTime / time.Millisecond) + globalMetrics.list = append(globalMetrics.list, CPU{ + Idle: opt.UintWith(idleMetric), + Sys: opt.UintWith(sysMetric), + User: opt.UintWith(userMetrics), + }) + } + + return globalMetrics, nil +} From c7b6e031412b55f4d10558ccaf67a5abe499976a Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Fri, 29 Nov 2024 20:49:14 +0530 Subject: [PATCH 21/29] chore: use init() --- metric/cpu/metric_windows_test.go | 3 +-- metric/cpu/metrics_windows.go | 7 ++++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/metric/cpu/metric_windows_test.go b/metric/cpu/metric_windows_test.go index 26755f6877..958dc385ce 100644 --- a/metric/cpu/metric_windows_test.go +++ b/metric/cpu/metric_windows_test.go @@ -26,8 +26,7 @@ import ( ) func TestCounterLength(t *testing.T) { - query, err := buildQuery() - require.NoError(t, err) + require.NoError(t, qError) require.NoError(t, query.CollectData()) kernelRawData, err := query.GetRawCounterArray(totalKernelTimeCounter, true) diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index c780d7b9ce..b130a2b99d 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -39,7 +39,12 @@ var ( totalUserTimeCounter = fmt.Sprintf(processorInformationCounter, "*", "% User Time") ) -var query, qError = buildQuery() +var query pdh.Query +var qError error + +func init() { + query, qError = buildQuery() +} // Get fetches Windows CPU system times func Get(_ resolve.Resolver, opts ...OptionFunc) (CPUMetrics, error) { From 35a0aa417f52b238bb1b9f4926465e3b61758e41 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Fri, 29 Nov 2024 20:50:01 +0530 Subject: [PATCH 22/29] chore: comments --- metric/cpu/metrics.go | 1 + 1 file changed, 1 insertion(+) diff --git a/metric/cpu/metrics.go b/metric/cpu/metrics.go index 1986f71482..fb44abd48c 100644 --- a/metric/cpu/metrics.go +++ b/metric/cpu/metrics.go @@ -90,6 +90,7 @@ type option struct { type OptionFunc func(*option) +// Note: WithPerformanceCounter option is only effective for windows and is ineffective if used by other OS'. func WithPerformanceCounter() OptionFunc { return func(o *option) { o.usePerformanceCounter = true From 491ed8f843d0f7b473dd1e200fa5ae807bea6ea8 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 3 Dec 2024 18:44:30 +0530 Subject: [PATCH 23/29] move away from init --- metric/cpu/metrics_windows.go | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index b130a2b99d..37112e8007 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -39,12 +39,10 @@ var ( totalUserTimeCounter = fmt.Sprintf(processorInformationCounter, "*", "% User Time") ) -var query pdh.Query -var qError error - -func init() { - query, qError = buildQuery() -} +var ( + query *pdh.Query + qError error +) // Get fetches Windows CPU system times func Get(_ resolve.Resolver, opts ...OptionFunc) (CPUMetrics, error) { @@ -56,8 +54,13 @@ func Get(_ resolve.Resolver, opts ...OptionFunc) (CPUMetrics, error) { return defaultGet() } globalMetrics := CPUMetrics{} - if qError != nil { - return globalMetrics, qError + + // Check if the query has already been initialized, if not, initialize it + if query == nil { + query, qError = buildQuery() // Build query if not already done + if qError != nil { + return CPUMetrics{}, qError + } } if err := query.CollectData(); err != nil { @@ -104,21 +107,21 @@ func Get(_ resolve.Resolver, opts ...OptionFunc) (CPUMetrics, error) { return globalMetrics, nil } -func buildQuery() (pdh.Query, error) { +func buildQuery() (*pdh.Query, error) { var q pdh.Query if err := q.Open(); err != nil { - return q, fmt.Errorf("failed to open query: %w", err) + return nil, fmt.Errorf("failed to open query: %w", err) } if err := q.AddCounter(totalKernelTimeCounter, "", "", true, true); err != nil { - return q, fmt.Errorf("error calling AddCounter for kernel counter: %w", err) + return nil, fmt.Errorf("error calling AddCounter for kernel counter: %w", err) } if err := q.AddCounter(totalUserTimeCounter, "", "", true, true); err != nil { - return q, fmt.Errorf("error calling AddCounter for user counter: %w", err) + return nil, fmt.Errorf("error calling AddCounter for user counter: %w", err) } if err := q.AddCounter(totalIdleTimeCounter, "", "", true, true); err != nil { - return q, fmt.Errorf("error calling AddCounter for idle counter: %w", err) + return nil, fmt.Errorf("error calling AddCounter for idle counter: %w", err) } - return q, nil + return &q, nil } func defaultGet() (CPUMetrics, error) { From eb83e60ba4a659fffb64131ff647e930de24e6d5 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 3 Dec 2024 23:17:11 +0530 Subject: [PATCH 24/29] chore: merge conflicts --- metric/cpu/cpu.go | 8 ++++---- metric/cpu/metrics_aix.go | 3 +-- metric/cpu/metrics_darwin.go | 3 +-- metric/cpu/metrics_openbsd.go | 3 +-- metric/cpu/metrics_procfs_common.go | 6 +++--- metric/cpu/metrics_test.go | 6 +++--- metric/cpu/metrics_windows.go | 31 ++++++----------------------- 7 files changed, 19 insertions(+), 41 deletions(-) diff --git a/metric/cpu/cpu.go b/metric/cpu/cpu.go index 643007c62c..42f42a41f8 100644 --- a/metric/cpu/cpu.go +++ b/metric/cpu/cpu.go @@ -93,8 +93,8 @@ func WithPerformanceCounter() OptionFunc { // Fetch collects a new sample of the CPU usage metrics. // This will overwrite the currently stored samples. -func (m *Monitor) Fetch(opts ...OptionFunc) (Metrics, error) { - metric, err := Get(m.Hostfs, opts...) +func (m *Monitor) Fetch() (Metrics, error) { + metric, err := Get(m) if err != nil { return Metrics{}, fmt.Errorf("error fetching CPU metrics: %w", err) } @@ -107,8 +107,8 @@ func (m *Monitor) Fetch(opts ...OptionFunc) (Metrics, error) { // FetchCores collects a new sample of CPU usage metrics per-core // This will overwrite the currently stored samples. -func (m *Monitor) FetchCores(opts ...OptionFunc) ([]Metrics, error) { - metric, err := Get(m.Hostfs, opts...) +func (m *Monitor) FetchCores() ([]Metrics, error) { + metric, err := Get(m) if err != nil { return nil, fmt.Errorf("error fetching CPU metrics: %w", err) } diff --git a/metric/cpu/metrics_aix.go b/metric/cpu/metrics_aix.go index 4b365e1ce1..47a5a8b6cc 100644 --- a/metric/cpu/metrics_aix.go +++ b/metric/cpu/metrics_aix.go @@ -37,7 +37,6 @@ import ( "os" "github.com/elastic/elastic-agent-libs/opt" - "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" ) func init() { @@ -58,7 +57,7 @@ func tick2msec(val uint64) uint64 { } // Get returns a metrics object for CPU data -func Get(_ resolve.Resolver, _ ...OptionFunc) (CPUMetrics, error) { +func Get(m *Monitor) (CPUMetrics, error) { totals, err := getCPUTotals() if err != nil { diff --git a/metric/cpu/metrics_darwin.go b/metric/cpu/metrics_darwin.go index 09dd2acec7..0483be2b4e 100644 --- a/metric/cpu/metrics_darwin.go +++ b/metric/cpu/metrics_darwin.go @@ -23,11 +23,10 @@ import ( "github.com/shirou/gopsutil/v4/cpu" "github.com/elastic/elastic-agent-libs/opt" - "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" ) // Get is the Darwin implementation of Get -func Get(_ resolve.Resolver, _ ...OptionFunc) (CPUMetrics, error) { +func Get(m *Monitor) (CPUMetrics, error) { // We're using the gopsutil library here. // The code used by both gosigar and go-sysinfo appears to be // the same code as gopsutil, including copy-pasted comments. diff --git a/metric/cpu/metrics_openbsd.go b/metric/cpu/metrics_openbsd.go index f6a92e6c23..93c4f9118d 100644 --- a/metric/cpu/metrics_openbsd.go +++ b/metric/cpu/metrics_openbsd.go @@ -37,11 +37,10 @@ import ( "unsafe" "github.com/elastic/elastic-agent-libs/opt" - "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" ) // Get is the OpenBSD implementation of get -func Get(_ resolve.Resolver, _ ...OptionFunc) (CPUMetrics, error) { +func Get(m *Monitor) (CPUMetrics, error) { // see man 2 sysctl loadGlobal := [C.CPUSTATES]C.long{ diff --git a/metric/cpu/metrics_procfs_common.go b/metric/cpu/metrics_procfs_common.go index 9218329cdb..a6b29dcc99 100644 --- a/metric/cpu/metrics_procfs_common.go +++ b/metric/cpu/metrics_procfs_common.go @@ -26,12 +26,12 @@ import ( "os" "strconv" "strings" - - "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" ) // Get returns a metrics object for CPU data -func Get(procfs resolve.Resolver, _ ...OptionFunc) (CPUMetrics, error) { +func Get(m *Monitor) (CPUMetrics, error) { + procfs := m.Hostfs + path := procfs.ResolveHostFS("/proc/stat") fd, err := os.Open(path) defer func() { diff --git a/metric/cpu/metrics_test.go b/metric/cpu/metrics_test.go index 43bc0c510e..2c1cd67bb2 100644 --- a/metric/cpu/metrics_test.go +++ b/metric/cpu/metrics_test.go @@ -43,12 +43,12 @@ func TestMonitorSample(t *testing.T) { } func TestCoresMonitorSample(t *testing.T) { + cores, err := New(systemtests.DockerTestResolver()) + require.NoError(t, err) - cpuMetrics, err := Get(systemtests.DockerTestResolver()) + cpuMetrics, err := Get(cores) assert.NoError(t, err, "error in Get()") - cores, err := New(systemtests.DockerTestResolver()) - require.NoError(t, err) cores.lastSample = CPUMetrics{list: make([]CPU, len(cpuMetrics.list))} sample, err := cores.FetchCores() require.NoError(t, err) diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index 887f666c60..ef5361cb98 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -26,9 +26,7 @@ import ( "fmt" "time" - "github.com/elastic/elastic-agent-libs/helpers/windows/pdh" "github.com/elastic/elastic-agent-libs/opt" - "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" "github.com/elastic/gosigar/sys/windows" ) @@ -39,43 +37,26 @@ var ( totalUserTimeCounter = fmt.Sprintf(processorInformationCounter, "*", "% User Time") ) -var ( - query *pdh.Query - qError error -) - // Get fetches Windows CPU system times -func Get(_ resolve.Resolver, opts ...OptionFunc) (CPUMetrics, error) { - op := option{} - for _, o := range opts { - o(&op) - } - if !op.usePerformanceCounter { +func Get(m *Monitor) (CPUMetrics, error) { + if m.query == nil { return defaultGet() } globalMetrics := CPUMetrics{} - // Check if the query has already been initialized, if not, initialize it - if query == nil { - query, qError = buildQuery() // Build query if not already done - if qError != nil { - return CPUMetrics{}, qError - } - } - - if err := query.CollectData(); err != nil { + if err := m.query.CollectData(); err != nil { return globalMetrics, err } - kernelRawData, err := query.GetRawCounterArray(totalKernelTimeCounter, true) + kernelRawData, err := m.query.GetRawCounterArray(totalKernelTimeCounter, true) if err != nil { return globalMetrics, fmt.Errorf("error calling GetRawCounterArray for kernel counter: %w", err) } - idleRawData, err := query.GetRawCounterArray(totalIdleTimeCounter, true) + idleRawData, err := m.query.GetRawCounterArray(totalIdleTimeCounter, true) if err != nil { return globalMetrics, fmt.Errorf("error calling GetRawCounterArray for idle counter: %w", err) } - userRawData, err := query.GetRawCounterArray(totalUserTimeCounter, true) + userRawData, err := m.query.GetRawCounterArray(totalUserTimeCounter, true) if err != nil { return globalMetrics, fmt.Errorf("error calling GetRawCounterArray for user counter: %w", err) } From 43fb0a316bd10598e0231d6ecec172a25c80bb52 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Thu, 5 Dec 2024 15:28:00 +0530 Subject: [PATCH 25/29] fix tests --- metric/cpu/metric_windows_test.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/metric/cpu/metric_windows_test.go b/metric/cpu/metric_windows_test.go index 958dc385ce..b7f7d37d23 100644 --- a/metric/cpu/metric_windows_test.go +++ b/metric/cpu/metric_windows_test.go @@ -22,13 +22,16 @@ package cpu import ( "testing" + "github.com/elastic/elastic-agent-system-metrics/dev-tools/systemtests" "github.com/stretchr/testify/require" ) func TestCounterLength(t *testing.T) { - require.NoError(t, qError) - require.NoError(t, query.CollectData()) + monitor, err := New(systemtests.DockerTestResolver()) + require.NoError(t, err) + require.NoError(t, monitor.query.CollectData()) + query := monitor.query kernelRawData, err := query.GetRawCounterArray(totalKernelTimeCounter, true) require.NoError(t, err) From 8d35e55cada9a11b248b3ce30b85c72062e31d03 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Thu, 5 Dec 2024 19:19:05 +0530 Subject: [PATCH 26/29] fix: goimports --- metric/cpu/metric_windows_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/metric/cpu/metric_windows_test.go b/metric/cpu/metric_windows_test.go index b7f7d37d23..3cfaf75ccc 100644 --- a/metric/cpu/metric_windows_test.go +++ b/metric/cpu/metric_windows_test.go @@ -22,8 +22,9 @@ package cpu import ( "testing" - "github.com/elastic/elastic-agent-system-metrics/dev-tools/systemtests" "github.com/stretchr/testify/require" + + "github.com/elastic/elastic-agent-system-metrics/dev-tools/systemtests" ) func TestCounterLength(t *testing.T) { From 22f8e5dcc7f348624a7cf61e6b20cd27690a04d5 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Thu, 5 Dec 2024 19:21:35 +0530 Subject: [PATCH 27/29] rename WithWindowsPerformanceCounter --- metric/cpu/cpu.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metric/cpu/cpu.go b/metric/cpu/cpu.go index 42f42a41f8..d39d5c70d1 100644 --- a/metric/cpu/cpu.go +++ b/metric/cpu/cpu.go @@ -84,8 +84,8 @@ type option struct { type OptionFunc func(*option) -// Note: WithPerformanceCounter option is only effective for windows and is ineffective if used by other OS'. -func WithPerformanceCounter() OptionFunc { +// Note: WithWindowsPerformanceCounter option is only effective for windows and is ineffective if used by other OS'. +func WithWindowsPerformanceCounter() OptionFunc { return func(o *option) { o.usePerformanceCounter = true } From 2f6a5517994dd671922bd1c1b898bae8266c7e43 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Fri, 6 Dec 2024 14:00:10 +0530 Subject: [PATCH 28/29] chore: improve readability --- metric/cpu/metrics_windows.go | 85 ++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 40 deletions(-) diff --git a/metric/cpu/metrics_windows.go b/metric/cpu/metrics_windows.go index ef5361cb98..1392daa56a 100644 --- a/metric/cpu/metrics_windows.go +++ b/metric/cpu/metrics_windows.go @@ -26,6 +26,7 @@ import ( "fmt" "time" + "github.com/elastic/elastic-agent-libs/helpers/windows/pdh" "github.com/elastic/elastic-agent-libs/opt" "github.com/elastic/gosigar/sys/windows" ) @@ -40,23 +41,62 @@ var ( // Get fetches Windows CPU system times func Get(m *Monitor) (CPUMetrics, error) { if m.query == nil { - return defaultGet() + return getUsingSystemTimes() } + return getUsingPerfCounters(m.query) +} + +func getUsingSystemTimes() (CPUMetrics, error) { + idle, kernel, user, err := windows.GetSystemTimes() + if err != nil { + return CPUMetrics{}, fmt.Errorf("call to GetSystemTimes failed: %w", err) + } + + globalMetrics := CPUMetrics{} + //convert from duration to ticks + idleMetric := uint64(idle / time.Millisecond) + sysMetric := uint64(kernel / time.Millisecond) + userMetrics := uint64(user / time.Millisecond) + globalMetrics.totals.Idle = opt.UintWith(idleMetric) + globalMetrics.totals.Sys = opt.UintWith(sysMetric) + globalMetrics.totals.User = opt.UintWith(userMetrics) + + // get per-cpu data + cpus, err := windows.NtQuerySystemProcessorPerformanceInformation() + if err != nil { + return CPUMetrics{}, fmt.Errorf("catll to NtQuerySystemProcessorPerformanceInformation failed: %w", err) + } + globalMetrics.list = make([]CPU, 0, len(cpus)) + for _, cpu := range cpus { + idleMetric := uint64(cpu.IdleTime / time.Millisecond) + sysMetric := uint64(cpu.KernelTime / time.Millisecond) + userMetrics := uint64(cpu.UserTime / time.Millisecond) + globalMetrics.list = append(globalMetrics.list, CPU{ + Idle: opt.UintWith(idleMetric), + Sys: opt.UintWith(sysMetric), + User: opt.UintWith(userMetrics), + }) + } + + return globalMetrics, nil +} + +func getUsingPerfCounters(query *pdh.Query) (CPUMetrics, error) { globalMetrics := CPUMetrics{} - if err := m.query.CollectData(); err != nil { + if err := query.CollectData(); err != nil { return globalMetrics, err } - kernelRawData, err := m.query.GetRawCounterArray(totalKernelTimeCounter, true) + kernelRawData, err := query.GetRawCounterArray(totalKernelTimeCounter, true) if err != nil { return globalMetrics, fmt.Errorf("error calling GetRawCounterArray for kernel counter: %w", err) } - idleRawData, err := m.query.GetRawCounterArray(totalIdleTimeCounter, true) + idleRawData, err := query.GetRawCounterArray(totalIdleTimeCounter, true) if err != nil { return globalMetrics, fmt.Errorf("error calling GetRawCounterArray for idle counter: %w", err) } - userRawData, err := m.query.GetRawCounterArray(totalUserTimeCounter, true) + userRawData, err := query.GetRawCounterArray(totalUserTimeCounter, true) if err != nil { return globalMetrics, fmt.Errorf("error calling GetRawCounterArray for user counter: %w", err) } @@ -87,38 +127,3 @@ func Get(m *Monitor) (CPUMetrics, error) { return globalMetrics, nil } - -func defaultGet() (CPUMetrics, error) { - idle, kernel, user, err := windows.GetSystemTimes() - if err != nil { - return CPUMetrics{}, fmt.Errorf("call to GetSystemTimes failed: %w", err) - } - - globalMetrics := CPUMetrics{} - //convert from duration to ticks - idleMetric := uint64(idle / time.Millisecond) - sysMetric := uint64(kernel / time.Millisecond) - userMetrics := uint64(user / time.Millisecond) - globalMetrics.totals.Idle = opt.UintWith(idleMetric) - globalMetrics.totals.Sys = opt.UintWith(sysMetric) - globalMetrics.totals.User = opt.UintWith(userMetrics) - - // get per-cpu data - cpus, err := windows.NtQuerySystemProcessorPerformanceInformation() - if err != nil { - return CPUMetrics{}, fmt.Errorf("catll to NtQuerySystemProcessorPerformanceInformation failed: %w", err) - } - globalMetrics.list = make([]CPU, 0, len(cpus)) - for _, cpu := range cpus { - idleMetric := uint64(cpu.IdleTime / time.Millisecond) - sysMetric := uint64(cpu.KernelTime / time.Millisecond) - userMetrics := uint64(cpu.UserTime / time.Millisecond) - globalMetrics.list = append(globalMetrics.list, CPU{ - Idle: opt.UintWith(idleMetric), - Sys: opt.UintWith(sysMetric), - User: opt.UintWith(userMetrics), - }) - } - - return globalMetrics, nil -} From d51bfa44cdd82492ae0da48d384dcf1aca8be6a3 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Mon, 9 Dec 2024 22:16:11 +0530 Subject: [PATCH 29/29] chore: update go.mod --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 264dc2d9f1..c6c3c12453 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.22.8 require ( github.com/docker/docker v26.1.5+incompatible - github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d + github.com/elastic/elastic-agent-libs v0.17.4 github.com/elastic/go-licenser v0.4.2 github.com/elastic/go-structform v0.0.9 github.com/elastic/go-sysinfo v1.14.1 diff --git a/go.sum b/go.sum index 26097d7a5d..bf5ff02162 100644 --- a/go.sum +++ b/go.sum @@ -21,8 +21,8 @@ github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKoh github.com/docker/go-connections v0.4.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= -github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d h1:nY8LSeTYU1uSDAAg7WwGH/cALgdovAXLdIzV25Ky0Bo= -github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M= +github.com/elastic/elastic-agent-libs v0.17.4 h1:kWK5Kn2EQjM97yHqbeXv+cFAIti4IiI9Qj8huM+lZzE= +github.com/elastic/elastic-agent-libs v0.17.4/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M= github.com/elastic/go-licenser v0.4.2 h1:bPbGm8bUd8rxzSswFOqvQh1dAkKGkgAmrPxbUi+Y9+A= github.com/elastic/go-licenser v0.4.2/go.mod h1:W8eH6FaZDR8fQGm+7FnVa7MxI1b/6dAqxz+zPB8nm5c= github.com/elastic/go-structform v0.0.9 h1:HpcS7xljL4kSyUfDJ8cXTJC6rU5ChL1wYb6cx3HLD+o=