Skip to content

Commit

Permalink
Expose cpu bugs and flags as info metrics. (prometheus#1788)
Browse files Browse the repository at this point in the history
* Expose cpu bugs and flags as info metrics with a regexp filter.
* Automatically enable CPU info metrics when using flags or bugs feature.

Signed-off-by: domgoer <[email protected]>
  • Loading branch information
domechn authored and oblitorum committed Apr 9, 2024
1 parent 58d226c commit 0af3dab
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 2 deletions.
74 changes: 72 additions & 2 deletions collector/cpu_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package collector
import (
"fmt"
"path/filepath"
"regexp"
"strconv"
"sync"

Expand All @@ -32,16 +33,23 @@ type cpuCollector struct {
fs procfs.FS
cpu *prometheus.Desc
cpuInfo *prometheus.Desc
cpuFlagsInfo *prometheus.Desc
cpuBugsInfo *prometheus.Desc
cpuGuest *prometheus.Desc
cpuCoreThrottle *prometheus.Desc
cpuPackageThrottle *prometheus.Desc
logger log.Logger
cpuStats []procfs.CPUStat
cpuStatsMutex sync.Mutex

cpuFlagsIncludeRegexp *regexp.Regexp
cpuBugsIncludeRegexp *regexp.Regexp
}

// Command-line flags that control the cpu info metrics of this collector.
var (
// enableCPUInfo is the boolean flag --collector.cpu.info. compileIncludeFlags
// also switches it on when either include filter below is non-empty.
enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool()
// flagsInclude is the raw --collector.cpu.info.flags-include value.
// compileIncludeFlags compiles it as a regular expression; an empty value
// leaves the flags filter unset.
flagsInclude = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String()
// bugsInclude is the raw --collector.cpu.info.bugs-include value.
// compileIncludeFlags compiles it as a regular expression; an empty value
// leaves the bugs filter unset.
bugsInclude = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String()
)

func init() {
Expand All @@ -54,14 +62,24 @@ func NewCPUCollector(logger log.Logger) (Collector, error) {
if err != nil {
return nil, fmt.Errorf("failed to open procfs: %w", err)
}
return &cpuCollector{
c := &cpuCollector{
fs: fs,
cpu: nodeCPUSecondsDesc,
cpuInfo: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "info"),
"CPU information from /proc/cpuinfo.",
[]string{"package", "core", "cpu", "vendor", "family", "model", "model_name", "microcode", "stepping", "cachesize"}, nil,
),
cpuFlagsInfo: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "flag_info"),
"The `flags` field of CPU information from /proc/cpuinfo.",
[]string{"flag"}, nil,
),
cpuBugsInfo: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "bug_info"),
"The `bugs` field of CPU information from /proc/cpuinfo.",
[]string{"bug"}, nil,
),
cpuGuest: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "guest_seconds_total"),
"Seconds the cpus spent in guests (VMs) for each mode.",
Expand All @@ -78,7 +96,34 @@ func NewCPUCollector(logger log.Logger) (Collector, error) {
[]string{"package"}, nil,
),
logger: logger,
}, nil
}
err = c.compileIncludeFlags(flagsInclude, bugsInclude)
if err != nil {
return nil, fmt.Errorf("fail to compile --collector.cpu.info.flags-include and --collector.cpu.info.bugs-include, the values of them must be regular expressions: %w", err)
}
return c, nil
}

// compileIncludeFlags compiles the --collector.cpu.info.flags-include and
// --collector.cpu.info.bugs-include values and stores the results in
// c.cpuFlagsIncludeRegexp and c.cpuBugsIncludeRegexp.
//
// An empty value leaves the corresponding regexp nil. If either value is
// non-empty while --collector.cpu.info is off, enableCPUInfo is switched on
// and an info-level message is logged.
//
// Both patterns are validated before anything is modified, so a failed call
// changes neither the collector nor enableCPUInfo. The returned error wraps
// the regexp.Compile error and names the flag whose value was rejected.
func (c *cpuCollector) compileIncludeFlags(flagsIncludeFlag, bugsIncludeFlag *string) error {
	var flagsRe, bugsRe *regexp.Regexp

	if pattern := *flagsIncludeFlag; pattern != "" {
		re, err := regexp.Compile(pattern)
		if err != nil {
			return fmt.Errorf("--collector.cpu.info.flags-include %q: %w", pattern, err)
		}
		flagsRe = re
	}
	if pattern := *bugsIncludeFlag; pattern != "" {
		re, err := regexp.Compile(pattern)
		if err != nil {
			return fmt.Errorf("--collector.cpu.info.bugs-include %q: %w", pattern, err)
		}
		bugsRe = re
	}

	// Commit only after both patterns are known to be valid.
	if flagsRe != nil {
		c.cpuFlagsIncludeRegexp = flagsRe
	}
	if bugsRe != nil {
		c.cpuBugsIncludeRegexp = bugsRe
	}

	// The filters are only consulted while emitting cpu info, so asking for
	// one implies enabling the cpu info metrics.
	if (flagsRe != nil || bugsRe != nil) && !*enableCPUInfo {
		*enableCPUInfo = true
		level.Info(c.logger).Log("msg", "--collector.cpu.info has been set to `true` because you set the following flags, like --collector.cpu.info.flags-include and --collector.cpu.info.bugs-include")
	}
	return nil
}

// Update implements Collector and exposes cpu related metrics from /proc/stat and /sys/.../cpu/.
Expand Down Expand Up @@ -117,6 +162,31 @@ func (c *cpuCollector) updateInfo(ch chan<- prometheus.Metric) error {
cpu.Microcode,
cpu.Stepping,
cpu.CacheSize)

if err := updateFieldInfo(cpu.Flags, c.cpuFlagsIncludeRegexp, c.cpuFlagsInfo, ch); err != nil {
return err
}
if err := updateFieldInfo(cpu.Bugs, c.cpuBugsIncludeRegexp, c.cpuBugsInfo, ch); err != nil {
return err
}
}
return nil
}

// updateFieldInfo walks valueList and, for every entry that filter matches,
// sends a gauge sample with value 1 on ch, built from desc with the entry as
// its label value. A nil filter means the field is not exported and nothing
// is sent. The returned error is always nil.
func updateFieldInfo(valueList []string, filter *regexp.Regexp, desc *prometheus.Desc, ch chan<- prometheus.Metric) error {
	if filter != nil {
		for i := range valueList {
			entry := valueList[i]
			if filter.MatchString(entry) {
				ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, 1, entry)
			}
		}
	}
	return nil
}
Expand Down
12 changes: 12 additions & 0 deletions collector/fixtures/e2e-64k-page-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -184,12 +184,24 @@ node_cooling_device_cur_state{name="0",type="Processor"} 0
# HELP node_cooling_device_max_state Maximum throttle state of the cooling device
# TYPE node_cooling_device_max_state gauge
node_cooling_device_max_state{name="0",type="Processor"} 3
# HELP node_cpu_bug_info The `bugs` field of CPU information from /proc/cpuinfo.
# TYPE node_cpu_bug_info gauge
node_cpu_bug_info{bug="cpu_meltdown"} 1
node_cpu_bug_info{bug="mds"} 1
node_cpu_bug_info{bug="spectre_v1"} 1
node_cpu_bug_info{bug="spectre_v2"} 1
# HELP node_cpu_core_throttles_total Number of times this cpu core has been throttled.
# TYPE node_cpu_core_throttles_total counter
node_cpu_core_throttles_total{core="0",package="0"} 5
node_cpu_core_throttles_total{core="0",package="1"} 0
node_cpu_core_throttles_total{core="1",package="0"} 0
node_cpu_core_throttles_total{core="1",package="1"} 9
# HELP node_cpu_flag_info The `flags` field of CPU information from /proc/cpuinfo.
# TYPE node_cpu_flag_info gauge
node_cpu_flag_info{flag="aes"} 1
node_cpu_flag_info{flag="avx"} 1
node_cpu_flag_info{flag="avx2"} 1
node_cpu_flag_info{flag="constant_tsc"} 1
# HELP node_cpu_guest_seconds_total Seconds the cpus spent in guests (VMs) for each mode.
# TYPE node_cpu_guest_seconds_total counter
node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0.01
Expand Down
12 changes: 12 additions & 0 deletions collector/fixtures/e2e-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -310,12 +310,24 @@ node_cooling_device_cur_state{name="0",type="Processor"} 0
# HELP node_cooling_device_max_state Maximum throttle state of the cooling device
# TYPE node_cooling_device_max_state gauge
node_cooling_device_max_state{name="0",type="Processor"} 3
# HELP node_cpu_bug_info The `bugs` field of CPU information from /proc/cpuinfo.
# TYPE node_cpu_bug_info gauge
node_cpu_bug_info{bug="cpu_meltdown"} 1
node_cpu_bug_info{bug="mds"} 1
node_cpu_bug_info{bug="spectre_v1"} 1
node_cpu_bug_info{bug="spectre_v2"} 1
# HELP node_cpu_core_throttles_total Number of times this cpu core has been throttled.
# TYPE node_cpu_core_throttles_total counter
node_cpu_core_throttles_total{core="0",package="0"} 5
node_cpu_core_throttles_total{core="0",package="1"} 0
node_cpu_core_throttles_total{core="1",package="0"} 0
node_cpu_core_throttles_total{core="1",package="1"} 9
# HELP node_cpu_flag_info The `flags` field of CPU information from /proc/cpuinfo.
# TYPE node_cpu_flag_info gauge
node_cpu_flag_info{flag="aes"} 1
node_cpu_flag_info{flag="avx"} 1
node_cpu_flag_info{flag="avx2"} 1
node_cpu_flag_info{flag="constant_tsc"} 1
# HELP node_cpu_guest_seconds_total Seconds the cpus spent in guests (VMs) for each mode.
# TYPE node_cpu_guest_seconds_total counter
node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0.01
Expand Down
2 changes: 2 additions & 0 deletions end-to-end-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ fi
--collector.qdisc.fixtures="collector/fixtures/qdisc/" \
--collector.netclass.ignored-devices="(bond0|dmz|int)" \
--collector.cpu.info \
--collector.cpu.info.flags-include="^(aes|avx.?|constant_tsc)$" \
--collector.cpu.info.bugs-include="^(cpu_meltdown|spectre_.*|mds)$" \
--web.listen-address "127.0.0.1:${port}" \
--log.level="debug" > "${tmpdir}/node_exporter.log" 2>&1 &

Expand Down

0 comments on commit 0af3dab

Please sign in to comment.