From 0af3dabdc71e4256793bf0fefd6a745c26a0cdfe Mon Sep 17 00:00:00 2001 From: domchan <31119455+domgoer@users.noreply.github.com> Date: Sat, 18 Jul 2020 00:32:23 +0800 Subject: [PATCH] Expose cpu bugs and flags as info metrics. (#1788) * Expose cpu bugs and flags as info metrics with a regexp filter. * Automatically enable CPU info metrics when using flags or bugs feature. Signed-off-by: domgoer --- collector/cpu_linux.go | 74 +++++++++++++++++++++- collector/fixtures/e2e-64k-page-output.txt | 12 ++++ collector/fixtures/e2e-output.txt | 12 ++++ end-to-end-test.sh | 2 + 4 files changed, 98 insertions(+), 2 deletions(-) diff --git a/collector/cpu_linux.go b/collector/cpu_linux.go index dfa4d4afc4..65476d32eb 100644 --- a/collector/cpu_linux.go +++ b/collector/cpu_linux.go @@ -18,6 +18,7 @@ package collector import ( "fmt" "path/filepath" + "regexp" "strconv" "sync" @@ -32,16 +33,23 @@ type cpuCollector struct { fs procfs.FS cpu *prometheus.Desc cpuInfo *prometheus.Desc + cpuFlagsInfo *prometheus.Desc + cpuBugsInfo *prometheus.Desc cpuGuest *prometheus.Desc cpuCoreThrottle *prometheus.Desc cpuPackageThrottle *prometheus.Desc logger log.Logger cpuStats []procfs.CPUStat cpuStatsMutex sync.Mutex + + cpuFlagsIncludeRegexp *regexp.Regexp + cpuBugsIncludeRegexp *regexp.Regexp } var ( enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool() + flagsInclude = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String() + bugsInclude = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String() ) func init() { @@ -54,7 +62,7 @@ func NewCPUCollector(logger log.Logger) (Collector, error) { if err != nil { return nil, fmt.Errorf("failed to open procfs: %w", err) } - return &cpuCollector{ + c := &cpuCollector{ fs: fs, cpu: nodeCPUSecondsDesc, cpuInfo: prometheus.NewDesc( @@ -62,6 +70,16 @@ func NewCPUCollector(logger log.Logger) (Collector, error) { "CPU information from /proc/cpuinfo.", []string{"package", "core", "cpu", "vendor", "family", "model", "model_name", "microcode", "stepping", "cachesize"}, nil, ), + cpuFlagsInfo: prometheus.NewDesc( + prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "flag_info"), + "The `flags` field of CPU information from /proc/cpuinfo.", + []string{"flag"}, nil, + ), + cpuBugsInfo: prometheus.NewDesc( + prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "bug_info"), + "The `bugs` field of CPU information from /proc/cpuinfo.", + []string{"bug"}, nil, + ), cpuGuest: prometheus.NewDesc( prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "guest_seconds_total"), "Seconds the cpus spent in guests (VMs) for each mode.", @@ -78,7 +96,34 @@ func NewCPUCollector(logger log.Logger) (Collector, error) { []string{"package"}, nil, ), logger: logger, - }, nil + } + err = c.compileIncludeFlags(flagsInclude, bugsInclude) + if err != nil { + return nil, fmt.Errorf("fail to compile --collector.cpu.info.flags-include and --collector.cpu.info.bugs-include, the values of them must be regular expressions: %w", err) + } + return c, nil +} + +func (c *cpuCollector) compileIncludeFlags(flagsIncludeFlag, bugsIncludeFlag *string) error { + if (*flagsIncludeFlag != "" || *bugsIncludeFlag != "") && !*enableCPUInfo { + *enableCPUInfo = true + level.Info(c.logger).Log("msg", "--collector.cpu.info has been set to `true` because you set the following flags, like --collector.cpu.info.flags-include and --collector.cpu.info.bugs-include") + } + + var err error + if *flagsIncludeFlag != "" { + c.cpuFlagsIncludeRegexp, err = regexp.Compile(*flagsIncludeFlag) + if err != nil { + return err + } + } + if *bugsIncludeFlag != "" { + c.cpuBugsIncludeRegexp, err = regexp.Compile(*bugsIncludeFlag) + if err != nil { + return err + } + } + return nil } // Update implements Collector and exposes cpu related metrics from /proc/stat and /sys/.../cpu/. @@ -117,6 +162,31 @@ func (c *cpuCollector) updateInfo(ch chan<- prometheus.Metric) error { cpu.Microcode, cpu.Stepping, cpu.CacheSize) + + if err := updateFieldInfo(cpu.Flags, c.cpuFlagsIncludeRegexp, c.cpuFlagsInfo, ch); err != nil { + return err + } + if err := updateFieldInfo(cpu.Bugs, c.cpuBugsIncludeRegexp, c.cpuBugsInfo, ch); err != nil { + return err + } + } + return nil +} + +func updateFieldInfo(valueList []string, filter *regexp.Regexp, desc *prometheus.Desc, ch chan<- prometheus.Metric) error { + if filter == nil { + return nil + } + + for _, val := range valueList { + if !filter.MatchString(val) { + continue + } + ch <- prometheus.MustNewConstMetric(desc, + prometheus.GaugeValue, + 1, + val, + ) } return nil } diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt index 7b857ff121..cbed1270d4 100644 --- a/collector/fixtures/e2e-64k-page-output.txt +++ b/collector/fixtures/e2e-64k-page-output.txt @@ -184,12 +184,24 @@ node_cooling_device_cur_state{name="0",type="Processor"} 0 # HELP node_cooling_device_max_state Maximum throttle state of the cooling device # TYPE node_cooling_device_max_state gauge node_cooling_device_max_state{name="0",type="Processor"} 3 +# HELP node_cpu_bug_info The `bugs` field of CPU information from /proc/cpuinfo. +# TYPE node_cpu_bug_info gauge +node_cpu_bug_info{bug="cpu_meltdown"} 1 +node_cpu_bug_info{bug="mds"} 1 +node_cpu_bug_info{bug="spectre_v1"} 1 +node_cpu_bug_info{bug="spectre_v2"} 1 # HELP node_cpu_core_throttles_total Number of times this cpu core has been throttled. # TYPE node_cpu_core_throttles_total counter node_cpu_core_throttles_total{core="0",package="0"} 5 node_cpu_core_throttles_total{core="0",package="1"} 0 node_cpu_core_throttles_total{core="1",package="0"} 0 node_cpu_core_throttles_total{core="1",package="1"} 9 +# HELP node_cpu_flag_info The `flags` field of CPU information from /proc/cpuinfo. +# TYPE node_cpu_flag_info gauge +node_cpu_flag_info{flag="aes"} 1 +node_cpu_flag_info{flag="avx"} 1 +node_cpu_flag_info{flag="avx2"} 1 +node_cpu_flag_info{flag="constant_tsc"} 1 # HELP node_cpu_guest_seconds_total Seconds the cpus spent in guests (VMs) for each mode. # TYPE node_cpu_guest_seconds_total counter node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0.01 diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index 600ba70800..642fe1137a 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -310,12 +310,24 @@ node_cooling_device_cur_state{name="0",type="Processor"} 0 # HELP node_cooling_device_max_state Maximum throttle state of the cooling device # TYPE node_cooling_device_max_state gauge node_cooling_device_max_state{name="0",type="Processor"} 3 +# HELP node_cpu_bug_info The `bugs` field of CPU information from /proc/cpuinfo. +# TYPE node_cpu_bug_info gauge +node_cpu_bug_info{bug="cpu_meltdown"} 1 +node_cpu_bug_info{bug="mds"} 1 +node_cpu_bug_info{bug="spectre_v1"} 1 +node_cpu_bug_info{bug="spectre_v2"} 1 # HELP node_cpu_core_throttles_total Number of times this cpu core has been throttled. # TYPE node_cpu_core_throttles_total counter node_cpu_core_throttles_total{core="0",package="0"} 5 node_cpu_core_throttles_total{core="0",package="1"} 0 node_cpu_core_throttles_total{core="1",package="0"} 0 node_cpu_core_throttles_total{core="1",package="1"} 9 +# HELP node_cpu_flag_info The `flags` field of CPU information from /proc/cpuinfo. +# TYPE node_cpu_flag_info gauge +node_cpu_flag_info{flag="aes"} 1 +node_cpu_flag_info{flag="avx"} 1 +node_cpu_flag_info{flag="avx2"} 1 +node_cpu_flag_info{flag="constant_tsc"} 1 # HELP node_cpu_guest_seconds_total Seconds the cpus spent in guests (VMs) for each mode. # TYPE node_cpu_guest_seconds_total counter node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0.01 diff --git a/end-to-end-test.sh b/end-to-end-test.sh index 961dd27e8e..955ab1d919 100755 --- a/end-to-end-test.sh +++ b/end-to-end-test.sh @@ -107,6 +107,8 @@ fi --collector.qdisc.fixtures="collector/fixtures/qdisc/" \ --collector.netclass.ignored-devices="(bond0|dmz|int)" \ --collector.cpu.info \ + --collector.cpu.info.flags-include="^(aes|avx.?|constant_tsc)$" \ + --collector.cpu.info.bugs-include="^(cpu_meltdown|spectre_.*|mds)$" \ --web.listen-address "127.0.0.1:${port}" \ --log.level="debug" > "${tmpdir}/node_exporter.log" 2>&1 &