Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose cpu bugs and flags as info metrics. #1788

Merged
merged 5 commits into from
Jul 17, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 74 additions & 2 deletions collector/cpu_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package collector
import (
"fmt"
"path/filepath"
"regexp"
"strconv"
"sync"

Expand All @@ -32,16 +33,23 @@ type cpuCollector struct {
fs procfs.FS
cpu *prometheus.Desc
cpuInfo *prometheus.Desc
cpuFlagsInfo *prometheus.Desc
cpuBugsInfo *prometheus.Desc
cpuGuest *prometheus.Desc
cpuCoreThrottle *prometheus.Desc
cpuPackageThrottle *prometheus.Desc
logger log.Logger
cpuStats []procfs.CPUStat
cpuStatsMutex sync.Mutex

cpuFlagsIncludeRegexp *regexp.Regexp
cpuBugsIncludeRegexp *regexp.Regexp
}

// Command-line flags controlling the optional cpu info metrics.
var (
// enableCPUInfo turns on the cpu_info metric (--collector.cpu.info).
enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool()
// flagsInclude holds the regular expression used to select which entries of
// the `flags` field are exported (--collector.cpu.info.flags-include).
flagsInclude = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String()
// bugsInclude holds the regular expression used to select which entries of
// the `bugs` field are exported (--collector.cpu.info.bugs-include).
bugsInclude = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String()
)

func init() {
Expand All @@ -54,14 +62,24 @@ func NewCPUCollector(logger log.Logger) (Collector, error) {
if err != nil {
return nil, fmt.Errorf("failed to open procfs: %w", err)
}
return &cpuCollector{
c := &cpuCollector{
fs: fs,
cpu: nodeCPUSecondsDesc,
cpuInfo: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "info"),
"CPU information from /proc/cpuinfo.",
[]string{"package", "core", "cpu", "vendor", "family", "model", "model_name", "microcode", "stepping", "cachesize"}, nil,
),
cpuFlagsInfo: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "flag_info"),
"The `flags` field of CPU information from /proc/cpuinfo.",
[]string{"flag"}, nil,
),
cpuBugsInfo: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "bug_info"),
"The `bugs` field of CPU information from /proc/cpuinfo.",
[]string{"bug"}, nil,
),
cpuGuest: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "guest_seconds_total"),
"Seconds the cpus spent in guests (VMs) for each mode.",
Expand All @@ -78,7 +96,36 @@ func NewCPUCollector(logger log.Logger) (Collector, error) {
[]string{"package"}, nil,
),
logger: logger,
}, nil
}
err = c.compileIncludeFlags(flagsInclude, bugsInclude)
if err != nil {
return nil, fmt.Errorf("fail to compile --collector.cpu.info.flags-include and --collector.cpu.info.bugs-include, the values of them must be regular expressions: %w", err)
}
return c, nil
}

// compileIncludeFlags compiles the include patterns for the `flags` and `bugs`
// fields and stores them in c.cpuFlagsIncludeRegexp and c.cpuBugsIncludeRegexp.
//
// A filter counts as "set" only when its pointer is non-nil AND the pattern is
// non-empty. Checking the pointer alone is not enough: command-line flag
// pointers are normally non-nil even when the flag was not given, and an empty
// pattern compiles to a regexp that matches every string, which would export
// every flag/bug although no filter was requested.
//
// When --collector.cpu.info is disabled nothing is compiled; an info message is
// logged if a filter was supplied anyway. A filter that is not set leaves the
// corresponding regexp field nil.
//
// It returns a non-nil error if a supplied pattern is not a valid regular
// expression; the error wraps the one returned by regexp.Compile.
func (c *cpuCollector) compileIncludeFlags(flagsIncludeFlag, bugsIncludeFlag *string) error {
	flagsSet := flagsIncludeFlag != nil && *flagsIncludeFlag != ""
	bugsSet := bugsIncludeFlag != nil && *bugsIncludeFlag != ""

	if !*enableCPUInfo {
		// Only mention the filters when the user actually supplied one.
		if flagsSet || bugsSet {
			level.Info(c.logger).Log("msg", "--collector.cpu.info.flags-include and --collector.cpu.info.bugs-include will not take effect until --collector.cpu.info is set to true")
		}
		return nil
	}

	if flagsSet {
		re, err := regexp.Compile(*flagsIncludeFlag)
		if err != nil {
			return fmt.Errorf("compiling --collector.cpu.info.flags-include %q: %w", *flagsIncludeFlag, err)
		}
		c.cpuFlagsIncludeRegexp = re
	}
	if bugsSet {
		re, err := regexp.Compile(*bugsIncludeFlag)
		if err != nil {
			return fmt.Errorf("compiling --collector.cpu.info.bugs-include %q: %w", *bugsIncludeFlag, err)
		}
		c.cpuBugsIncludeRegexp = re
	}
	return nil
}

// Update implements Collector and exposes cpu related metrics from /proc/stat and /sys/.../cpu/.
Expand Down Expand Up @@ -117,6 +164,31 @@ func (c *cpuCollector) updateInfo(ch chan<- prometheus.Metric) error {
cpu.Microcode,
cpu.Stepping,
cpu.CacheSize)

if err := updateFieldInfo(cpu.Flags, c.cpuFlagsIncludeRegexp, c.cpuFlagsInfo, ch); err != nil {
return err
}
if err := updateFieldInfo(cpu.Bugs, c.cpuBugsIncludeRegexp, c.cpuBugsInfo, ch); err != nil {
return err
}
}
return nil
}

// updateFieldInfo sends one constant gauge metric with value 1 on ch for every
// entry of valueList that matches filter, using the entry itself as the single
// label value of desc. A nil filter emits nothing. The returned error is
// always nil.
func updateFieldInfo(valueList []string, filter *regexp.Regexp, desc *prometheus.Desc, ch chan<- prometheus.Metric) error {
	if filter != nil {
		for i := range valueList {
			if name := valueList[i]; filter.MatchString(name) {
				ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, 1, name)
			}
		}
	}
	return nil
}
Expand Down
12 changes: 12 additions & 0 deletions collector/fixtures/e2e-64k-page-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -184,12 +184,24 @@ node_cooling_device_cur_state{name="0",type="Processor"} 0
# HELP node_cooling_device_max_state Maximum throttle state of the cooling device
# TYPE node_cooling_device_max_state gauge
node_cooling_device_max_state{name="0",type="Processor"} 3
# HELP node_cpu_bug_info The `bugs` field of CPU information from /proc/cpuinfo.
# TYPE node_cpu_bug_info gauge
node_cpu_bug_info{bug="cpu_meltdown"} 1
node_cpu_bug_info{bug="mds"} 1
node_cpu_bug_info{bug="spectre_v1"} 1
node_cpu_bug_info{bug="spectre_v2"} 1
# HELP node_cpu_core_throttles_total Number of times this cpu core has been throttled.
# TYPE node_cpu_core_throttles_total counter
node_cpu_core_throttles_total{core="0",package="0"} 5
node_cpu_core_throttles_total{core="0",package="1"} 0
node_cpu_core_throttles_total{core="1",package="0"} 0
node_cpu_core_throttles_total{core="1",package="1"} 9
# HELP node_cpu_flag_info The `flags` field of CPU information from /proc/cpuinfo.
# TYPE node_cpu_flag_info gauge
node_cpu_flag_info{flag="aes"} 1
node_cpu_flag_info{flag="avx"} 1
node_cpu_flag_info{flag="avx2"} 1
node_cpu_flag_info{flag="constant_tsc"} 1
# HELP node_cpu_guest_seconds_total Seconds the cpus spent in guests (VMs) for each mode.
# TYPE node_cpu_guest_seconds_total counter
node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0.01
Expand Down
12 changes: 12 additions & 0 deletions collector/fixtures/e2e-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -232,12 +232,24 @@ node_cooling_device_cur_state{name="0",type="Processor"} 0
# HELP node_cooling_device_max_state Maximum throttle state of the cooling device
# TYPE node_cooling_device_max_state gauge
node_cooling_device_max_state{name="0",type="Processor"} 3
# HELP node_cpu_bug_info The `bugs` field of CPU information from /proc/cpuinfo.
# TYPE node_cpu_bug_info gauge
node_cpu_bug_info{bug="cpu_meltdown"} 1
node_cpu_bug_info{bug="mds"} 1
node_cpu_bug_info{bug="spectre_v1"} 1
node_cpu_bug_info{bug="spectre_v2"} 1
# HELP node_cpu_core_throttles_total Number of times this cpu core has been throttled.
# TYPE node_cpu_core_throttles_total counter
node_cpu_core_throttles_total{core="0",package="0"} 5
node_cpu_core_throttles_total{core="0",package="1"} 0
node_cpu_core_throttles_total{core="1",package="0"} 0
node_cpu_core_throttles_total{core="1",package="1"} 9
# HELP node_cpu_flag_info The `flags` field of CPU information from /proc/cpuinfo.
# TYPE node_cpu_flag_info gauge
node_cpu_flag_info{flag="aes"} 1
node_cpu_flag_info{flag="avx"} 1
node_cpu_flag_info{flag="avx2"} 1
node_cpu_flag_info{flag="constant_tsc"} 1
# HELP node_cpu_guest_seconds_total Seconds the cpus spent in guests (VMs) for each mode.
# TYPE node_cpu_guest_seconds_total counter
node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0.01
Expand Down
2 changes: 2 additions & 0 deletions end-to-end-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ fi
--collector.qdisc.fixtures="collector/fixtures/qdisc/" \
--collector.netclass.ignored-devices="(bond0|dmz|int)" \
--collector.cpu.info \
--collector.cpu.info.flags-include="^(aes|avx.?|constant_tsc)$" \
--collector.cpu.info.bugs-include="^(cpu_meltdown|spectre_.*|mds)$" \
--web.listen-address "127.0.0.1:${port}" \
--log.level="debug" > "${tmpdir}/node_exporter.log" 2>&1 &

Expand Down