From ab8fb46c62f5f40199bd28ff66cfc0a37b35a18e Mon Sep 17 00:00:00 2001 From: Paul Gier Date: Tue, 10 Sep 2019 09:24:26 -0500 Subject: [PATCH] report core throttles for each CPU It's possible for two cpus in the same core to have a different value for the core_throttle_count, so this change reports a cpu_core_throttles metric for each cpu and not just each core. Hyperthreading systems have two cpus per core. Fixes #1472 Signed-off-by: Paul Gier --- CHANGELOG.md | 1 + collector/cpu_linux.go | 46 ++++++++-------------- collector/fixtures/e2e-64k-page-output.txt | 10 ++--- collector/fixtures/e2e-output.txt | 10 ++--- 4 files changed, 28 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b60da3c284..8ecfd09854 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ ### Changes +* [CHANGE] Report cpu_core_throttles for all CPUs. Add label "cpu". #1479 * [CHANGE] Add `--collector.netdev.device-whitelist`. #1279 * [CHANGE] Refactor mdadm collector #1403 * [CHANGE] Add `mountaddr` label to NFS metrics. #1417 diff --git a/collector/cpu_linux.go b/collector/cpu_linux.go index f915d6280e..42ec97df6c 100644 --- a/collector/cpu_linux.go +++ b/collector/cpu_linux.go @@ -19,6 +19,7 @@ import ( "fmt" "path/filepath" "strconv" + "strings" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/log" @@ -53,8 +54,8 @@ func NewCPUCollector() (Collector, error) { ), cpuCoreThrottle: prometheus.NewDesc( prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "core_throttles_total"), - "Number of times this cpu core has been throttled.", - []string{"package", "core"}, nil, + "Number of times this cpu has been throttled.", + []string{"package", "core", "cpu"}, nil, ), cpuPackageThrottle: prometheus.NewDesc( prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "package_throttles_total"), @@ -83,7 +84,6 @@ func (c *cpuCollector) updateThermalThrottle(ch chan<- prometheus.Metric) error } packageThrottles := make(map[uint64]uint64) - packageCoreThrottles := make(map[uint64]map[uint64]uint64) // cpu loop for _, cpu := range cpus { @@ -111,45 +111,33 @@ func (c *cpuCollector) updateThermalThrottle(ch chan<- prometheus.Metric) error // are cpu+kernel combinations that only present core throttles // but no package throttles. // Seen e.g. on an Intel Xeon E5472 system with RHEL 6.9 kernel. - if _, present := packageCoreThrottles[physicalPackageID]; !present { - packageCoreThrottles[physicalPackageID] = make(map[uint64]uint64) - } - if _, present := packageCoreThrottles[physicalPackageID][coreID]; !present { - // Read thermal_throttle/core_throttle_count only once - if coreThrottleCount, err := readUintFromFile(filepath.Join(cpu, "thermal_throttle", "core_throttle_count")); err == nil { - packageCoreThrottles[physicalPackageID][coreID] = coreThrottleCount - } else { - log.Debugf("CPU %v is missing core_throttle_count", cpu) - } + if coreThrottleCount, err := readUintFromFile(filepath.Join(cpu, "thermal_throttle", "core_throttle_count")); err == nil { + ch <- prometheus.MustNewConstMetric(c.cpuCoreThrottle, + prometheus.CounterValue, + float64(coreThrottleCount), + strconv.FormatUint(physicalPackageID, 10), + strconv.FormatUint(coreID, 10), + strings.TrimPrefix(filepath.Base(cpu), "cpu")) + } else { + log.Debugf("CPU %v is missing core_throttle_count", cpu) } // metric node_cpu_package_throttles_total + // All CPUs in the same package have the same value for package_throttles, so we only need one metric per package if _, present := packageThrottles[physicalPackageID]; !present { // Read thermal_throttle/package_throttle_count only once if packageThrottleCount, err := readUintFromFile(filepath.Join(cpu, "thermal_throttle", "package_throttle_count")); err == nil { packageThrottles[physicalPackageID] = packageThrottleCount + ch <- prometheus.MustNewConstMetric(c.cpuPackageThrottle, + prometheus.CounterValue, + float64(packageThrottleCount), + strconv.FormatUint(physicalPackageID, 10)) } else { log.Debugf("CPU %v is missing package_throttle_count", cpu) } } } - for physicalPackageID, packageThrottleCount := range packageThrottles { - ch <- prometheus.MustNewConstMetric(c.cpuPackageThrottle, - prometheus.CounterValue, - float64(packageThrottleCount), - strconv.FormatUint(physicalPackageID, 10)) - } - - for physicalPackageID, coreMap := range packageCoreThrottles { - for coreID, coreThrottleCount := range coreMap { - ch <- prometheus.MustNewConstMetric(c.cpuCoreThrottle, - prometheus.CounterValue, - float64(coreThrottleCount), - strconv.FormatUint(physicalPackageID, 10), - strconv.FormatUint(coreID, 10)) - } - } return nil } diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt index 6b2e6bd25b..16efdd1104 100644 --- a/collector/fixtures/e2e-64k-page-output.txt +++ b/collector/fixtures/e2e-64k-page-output.txt @@ -184,12 +184,12 @@ node_cooling_device_cur_state{name="0",type="Processor"} 0 # HELP node_cooling_device_max_state Maximum throttle state of the cooling device # TYPE node_cooling_device_max_state gauge node_cooling_device_max_state{name="0",type="Processor"} 3 -# HELP node_cpu_core_throttles_total Number of times this cpu core has been throttled. +# HELP node_cpu_core_throttles_total Number of times this cpu has been throttled. # TYPE node_cpu_core_throttles_total counter -node_cpu_core_throttles_total{core="0",package="0"} 5 -node_cpu_core_throttles_total{core="0",package="1"} 0 -node_cpu_core_throttles_total{core="1",package="0"} 0 -node_cpu_core_throttles_total{core="1",package="1"} 9 +node_cpu_core_throttles_total{core="0",cpu="0",package="0"} 5 +node_cpu_core_throttles_total{core="0",cpu="2",package="1"} 0 +node_cpu_core_throttles_total{core="1",cpu="1",package="0"} 0 +node_cpu_core_throttles_total{core="1",cpu="3",package="1"} 9 # HELP node_cpu_guest_seconds_total Seconds the cpus spent in guests (VMs) for each mode. # TYPE node_cpu_guest_seconds_total counter node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0.01 diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index 7651f53be3..3bb28f76eb 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -184,12 +184,12 @@ node_cooling_device_cur_state{name="0",type="Processor"} 0 # HELP node_cooling_device_max_state Maximum throttle state of the cooling device # TYPE node_cooling_device_max_state gauge node_cooling_device_max_state{name="0",type="Processor"} 3 -# HELP node_cpu_core_throttles_total Number of times this cpu core has been throttled. +# HELP node_cpu_core_throttles_total Number of times this cpu has been throttled. # TYPE node_cpu_core_throttles_total counter -node_cpu_core_throttles_total{core="0",package="0"} 5 -node_cpu_core_throttles_total{core="0",package="1"} 0 -node_cpu_core_throttles_total{core="1",package="0"} 0 -node_cpu_core_throttles_total{core="1",package="1"} 9 +node_cpu_core_throttles_total{core="0",cpu="0",package="0"} 5 +node_cpu_core_throttles_total{core="0",cpu="2",package="1"} 0 +node_cpu_core_throttles_total{core="1",cpu="1",package="0"} 0 +node_cpu_core_throttles_total{core="1",cpu="3",package="1"} 9 # HELP node_cpu_guest_seconds_total Seconds the cpus spent in guests (VMs) for each mode. # TYPE node_cpu_guest_seconds_total counter node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0.01