From 9166a16577502d7accc3c209c9dbd9e4fa7e5c5a Mon Sep 17 00:00:00 2001 From: MaciejMis <66726049+MaciejMis@users.noreply.github.com> Date: Thu, 10 Dec 2020 21:23:27 +0100 Subject: [PATCH] New Intel PowerStat input plugin (#8488) --- README.md | 1 + plugins/inputs/all/all.go | 1 + plugins/inputs/intel_powerstat/README.md | 206 ++++++++ plugins/inputs/intel_powerstat/dto.go | 37 ++ plugins/inputs/intel_powerstat/file.go | 154 ++++++ .../inputs/intel_powerstat/file_mock_test.go | 132 +++++ .../inputs/intel_powerstat/intel_powerstat.go | 486 +++++++++++++++++ .../intel_powerstat_notlinux.go | 3 + .../intel_powerstat/intel_powerstat_test.go | 494 ++++++++++++++++++ plugins/inputs/intel_powerstat/msr.go | 207 ++++++++ .../inputs/intel_powerstat/msr_mock_test.go | 61 +++ plugins/inputs/intel_powerstat/msr_test.go | 134 +++++ plugins/inputs/intel_powerstat/rapl.go | 238 +++++++++ .../inputs/intel_powerstat/rapl_mock_test.go | 66 +++ plugins/inputs/intel_powerstat/rapl_test.go | 115 ++++ .../inputs/intel_powerstat/unit_converter.go | 49 ++ 16 files changed, 2384 insertions(+) create mode 100644 plugins/inputs/intel_powerstat/README.md create mode 100644 plugins/inputs/intel_powerstat/dto.go create mode 100644 plugins/inputs/intel_powerstat/file.go create mode 100644 plugins/inputs/intel_powerstat/file_mock_test.go create mode 100644 plugins/inputs/intel_powerstat/intel_powerstat.go create mode 100644 plugins/inputs/intel_powerstat/intel_powerstat_notlinux.go create mode 100644 plugins/inputs/intel_powerstat/intel_powerstat_test.go create mode 100644 plugins/inputs/intel_powerstat/msr.go create mode 100644 plugins/inputs/intel_powerstat/msr_mock_test.go create mode 100644 plugins/inputs/intel_powerstat/msr_test.go create mode 100644 plugins/inputs/intel_powerstat/rapl.go create mode 100644 plugins/inputs/intel_powerstat/rapl_mock_test.go create mode 100644 plugins/inputs/intel_powerstat/rapl_test.go create mode 100644 plugins/inputs/intel_powerstat/unit_converter.go diff --git a/README.md b/README.md index 6093a253f479f..1999f635fba9d 100644 --- a/README.md +++ b/README.md @@ -214,6 +214,7 @@ For documentation on the latest development code see the [documentation index][d * [influxdb](./plugins/inputs/influxdb) * [influxdb_listener](./plugins/inputs/influxdb_listener) * [influxdb_v2_listener](./plugins/inputs/influxdb_v2_listener) +* [intel_powerstat](plugins/inputs/intel_powerstat) * [intel_rdt](./plugins/inputs/intel_rdt) * [internal](./plugins/inputs/internal) * [interrupts](./plugins/inputs/interrupts) diff --git a/plugins/inputs/all/all.go b/plugins/inputs/all/all.go index 6eb5dbb7aafef..6ad302d668e47 100644 --- a/plugins/inputs/all/all.go +++ b/plugins/inputs/all/all.go @@ -63,6 +63,7 @@ import ( _ "github.com/influxdata/telegraf/plugins/inputs/influxdb" _ "github.com/influxdata/telegraf/plugins/inputs/influxdb_listener" _ "github.com/influxdata/telegraf/plugins/inputs/influxdb_v2_listener" + _ "github.com/influxdata/telegraf/plugins/inputs/intel_powerstat" _ "github.com/influxdata/telegraf/plugins/inputs/intel_rdt" _ "github.com/influxdata/telegraf/plugins/inputs/internal" _ "github.com/influxdata/telegraf/plugins/inputs/interrupts" diff --git a/plugins/inputs/intel_powerstat/README.md b/plugins/inputs/intel_powerstat/README.md new file mode 100644 index 0000000000000..9efb4176d525b --- /dev/null +++ b/plugins/inputs/intel_powerstat/README.md @@ -0,0 +1,206 @@ +# Intel PowerStat Input Plugin + +Telemetry frameworks allow users to monitor critical platform level metrics. +Key source of platform telemetry is power domain that is beneficial for MANO/Monitoring&Analytics systems +to take preventive/corrective actions based on platform busyness, CPU temperature, actual CPU utilization +and power statistics. Main use cases are power saving and workload migration. + +Intel PowerStat plugin supports Intel based platforms and assumes presence of Linux based OS. + +### Configuration: +```toml +# Intel PowerStat plugin enables monitoring of platform metrics (power, TDP) and per-CPU metrics like temperature, power and utilization. +[[inputs.intel_powerstat]] + ## All global metrics are always collected by Intel PowerStat plugin. + ## User can choose which per-CPU metrics are monitored by the plugin in cpu_metrics array. + ## Empty array means no per-CPU specific metrics will be collected by the plugin - in this case only platform level + ## telemetry will be exposed by Intel PowerStat plugin. + ## Supported options: + ## "cpu_frequency", "cpu_busy_frequency", "cpu_temperature", "cpu_c1_state_residency", "cpu_c6_state_residency", "cpu_busy_cycles" + # cpu_metrics = [] +``` +### Example: Configuration with no per-CPU telemetry +This configuration allows getting global metrics (processor package specific), no per-CPU metrics are collected: +```toml +[[inputs.intel_powerstat]] + cpu_metrics = [] +``` + +### Example: Configuration with no per-CPU telemetry - equivalent case +This configuration allows getting global metrics (processor package specific), no per-CPU metrics are collected: +```toml +[[inputs.intel_powerstat]] +``` + +### Example: Configuration for CPU Temperature and Frequency only +This configuration allows getting global metrics plus subset of per-CPU metrics (CPU Temperature and Current Frequency): +```toml +[[inputs.intel_powerstat]] + cpu_metrics = ["cpu_frequency", "cpu_temperature"] +``` + +### Example: Configuration with all available metrics +This configuration allows getting global metrics and all per-CPU metrics: +```toml +[[inputs.intel_powerstat]] + cpu_metrics = ["cpu_frequency", "cpu_busy_frequency", "cpu_temperature", "cpu_c1_state_residency", "cpu_c6_state_residency", "cpu_busy_cycles"] +``` + +### SW Dependencies: +Plugin is based on Linux Kernel modules that expose specific metrics over `sysfs` or `devfs` interfaces. +The following dependencies are expected by plugin: +- _intel-rapl_ module which exposes Intel Runtime Power Limiting metrics over `sysfs` (`/sys/devices/virtual/powercap/intel-rapl`), +- _msr_ kernel module that provides access to processor model specific registers over `devfs` (`/dev/cpu/cpu%d/msr`), +- _cpufreq_ kernel module - which exposes per-CPU Frequency over `sysfs` (`/sys/devices/system/cpu/cpu%d/cpufreq/scaling_cur_freq`). + +Minimum kernel version required is 3.13 to satisfy all requirements. + +Please make sure that kernel modules are loaded and running. You might have to manually enable them by using `modprobe`. +Exact commands to be executed are: +``` +sudo modprobe cpufreq-stats +sudo modprobe msr +sudo modprobe intel_rapl +``` + +**Telegraf with Intel PowerStat plugin enabled may require root access to read model specific registers (MSRs)** +to retrieve data for calculation of most critical per-CPU specific metrics: +- `cpu_busy_frequency_mhz` +- `cpu_temperature_celsius` +- `cpu_c1_state_residency_percent` +- `cpu_c6_state_residency_percent` +- `cpu_busy_cycles_percent` + +To expose other Intel PowerStat metrics root access may or may not be required (depending on OS type or configuration). + +### HW Dependencies: +Specific metrics require certain processor features to be present, otherwise Intel PowerStat plugin won't be able to +read them. When using Linux Kernel based OS, user can detect supported processor features reading `/proc/cpuinfo` file. +Plugin assumes crucial properties are the same for all CPU cores in the system. +The following processor properties are examined in more detail in this section: +processor _cpu family_, _model_ and _flags_. +The following processor properties are required by the plugin: +- Processor _cpu family_ must be Intel (0x6) - since data used by the plugin assumes Intel specific +model specific registers for all features +- The following processor flags shall be present: + - "_msr_" shall be present for plugin to read platform data from processor model specific registers and collect + the following metrics: _powerstat_core.cpu_temperature_, _powerstat_core.cpu_busy_frequency_, + _powerstat_core.cpu_busy_cycles_, _powerstat_core.cpu_c1_state_residency_, _powerstat_core._cpu_c6_state_residency_ + - "_aperfmperf_" shall be present to collect the following metrics: _powerstat_core.cpu_busy_frequency_, + _powerstat_core.cpu_busy_cycles_, _powerstat_core.cpu_c1_state_residency_ + - "_dts_" shall be present to collect _powerstat_core.cpu_temperature_ +- Processor _Model number_ must be one of the following values for plugin to read _powerstat_core.cpu_c1_state_residency_ +and _powerstat_core.cpu_c6_state_residency_ metrics: + +| Model number | Processor name | +|-----|-------------| +| 0x37 | Intel Atom® Bay Trail | +| 0x4D | Intel Atom® Avaton | +| 0x5C | Intel Atom® Apollo Lake | +| 0x5F | Intel Atom® Denverton | +| 0x7A | Intel Atom® Goldmont | +| 0x4C | Intel Atom® Airmont | +| 0x86 | Intel Atom® Jacobsville | +| 0x96 | Intel Atom® Elkhart Lake | +| 0x9C | Intel Atom® Jasper Lake | +| 0x1A | Intel Nehalem-EP | +| 0x1E | Intel Nehalem | +| 0x1F | Intel Nehalem-G | +| 0x2E | Intel Nehalem-EX | +| 0x25 | Intel Westmere | +| 0x2C | Intel Westmere-EP | +| 0x2F | Intel Westmere-EX | +| 0x2A | Intel Sandybridge | +| 0x2D | Intel Sandybridge-X | +| 0x3A | Intel Ivybridge | +| 0x3E | Intel Ivybridge-X | +| 0x4E | Intel Atom® Silvermont-MID | +| 0x5E | Intel Skylake | +| 0x55 | Intel Skylake-X | +| 0x8E | Intel Kabylake-L | +| 0x9E | Intel Kabylake | +| 0x6A | Intel Icelake-X | +| 0x6C | Intel Icelake-D | +| 0x7D | Intel Icelake | +| 0x7E | Intel Icelake-L | +| 0x9D | Intel Icelake-NNPI | +| 0x3C | Intel Haswell | +| 0x3F | Intel Haswell-X | +| 0x45 | Intel Haswell-L | +| 0x46 | Intel Haswell-G | +| 0x3D | Intel Broadwell | +| 0x47 | Intel Broadwell-G | +| 0x4F | Intel Broadwell-X | +| 0x56 | Intel Broadwell-D | +| 0x66 | Intel Cannonlake-L | +| 0x57 | Intel Xeon® PHI Knights Landing | +| 0x85 | Intel Xeon® PHI Knights Mill | +| 0xA5 | Intel CometLake | +| 0xA6 | Intel CometLake-L | +| 0x8F | Intel Sapphire Rapids X | +| 0x8C | Intel TigerLake-L | +| 0x8D | Intel TigerLake | + +### Metrics +All metrics collected by Intel PowerStat plugin are collected in fixed intervals. +Metrics that reports processor C-state residency or power are calculated over elapsed intervals. +When starting to measure metrics, plugin skips first iteration of metrics if they are based on deltas with previous value. + +**The following measurements are supported by Intel PowerStat plugin:** +- powerstat_core + + - The following Tags are returned by plugin with powerstat_core measurements: + + | Tag | Description | + |-----|-------------| + | `package_id` | ID of platform package/socket | + | `core_id` | ID of physical processor core | + | `cpu_id` | ID of logical processor core | + Measurement powerstat_core metrics are collected per-CPU (cpu_id is the key) + while core_id and package_id tags are additional topology information. + + - Available metrics for powerstat_core measurement + + | Metric name (field) | Description | Units | + |-----|-------------|-----| + | `cpu_frequency_mhz` | Current operational frequency of CPU Core | MHz | + | `cpu_busy_frequency_mhz` | CPU Core Busy Frequency measured as frequency adjusted to CPU Core busy cycles | MHz | + | `cpu_temperature_celsius` | Current temperature of CPU Core | Celsius degrees | + | `cpu_c1_state_residency_percent` | Percentage of time that CPU Core spent in C1 Core residency state | % | + | `cpu_c6_state_residency_percent` | Percentage of time that CPU Core spent in C6 Core residency state | % | + | `cpu_busy_cycles_percent` | CPU Core Busy cycles as a ratio of Cycles spent in C0 state residency to all cycles executed by CPU Core | % | + + + +- powerstat_package + + - The following Tags are returned by plugin with powerstat_package measurements: + + | Tag | Description | + |-----|-------------| + | `package_id` | ID of platform package/socket | + Measurement powerstat_package metrics are collected per processor package - _package_id_ tag indicates which + package metric refers to. + + - Available metrics for powerstat_package measurement + + | Metric name (field) | Description | Units | + |-----|-------------|-----| + | `thermal_design_power_watts` | Maximum Thermal Design Power (TDP) available for processor package | Watts | + | `current_power_consumption_watts` | Current power consumption of processor package | Watts | + | `current_dram_power_consumption_watts` | Current power consumption of processor package DRAM subsystem | Watts | + + +### Example Output: + +``` +powerstat_package,host=ubuntu,package_id=0 thermal_design_power_watts=160 1606494744000000000 +powerstat_package,host=ubuntu,package_id=0 current_power_consumption_watts=35 1606494744000000000 +powerstat_package,host=ubuntu,package_id=0 current_dram_power_consumption_watts=13.94 1606494744000000000 +powerstat_core,core_id=0,cpu_id=0,host=ubuntu,package_id=0 cpu_frequency_mhz=1200.29 1606494744000000000 +powerstat_core,core_id=0,cpu_id=0,host=ubuntu,package_id=0 cpu_temperature_celsius=34i 1606494744000000000 +powerstat_core,core_id=0,cpu_id=0,host=ubuntu,package_id=0 cpu_c6_state_residency_percent=92.52 1606494744000000000 +powerstat_core,core_id=0,cpu_id=0,host=ubuntu,package_id=0 cpu_busy_cycles_percent=0.8 1606494744000000000 +powerstat_core,core_id=0,cpu_id=0,host=ubuntu,package_id=0 cpu_c1_state_residency_percent=6.68 1606494744000000000 +powerstat_core,core_id=0,cpu_id=0,host=ubuntu,package_id=0 cpu_busy_frequency_mhz=1213.24 1606494744000000000 +``` diff --git a/plugins/inputs/intel_powerstat/dto.go b/plugins/inputs/intel_powerstat/dto.go new file mode 100644 index 0000000000000..eb3da0bc269f7 --- /dev/null +++ b/plugins/inputs/intel_powerstat/dto.go @@ -0,0 +1,37 @@ +package intel_powerstat + +type msrData struct { + mperf uint64 + aperf uint64 + timeStampCounter uint64 + c3 uint64 + c6 uint64 + c7 uint64 + throttleTemp uint64 + temp uint64 + mperfDelta uint64 + aperfDelta uint64 + timeStampCounterDelta uint64 + c3Delta uint64 + c6Delta uint64 + c7Delta uint64 + readDate int64 +} + +type raplData struct { + dramCurrentEnergy float64 + socketCurrentEnergy float64 + socketEnergy float64 + dramEnergy float64 + readDate int64 +} + +type cpuInfo struct { + physicalID string + coreID string + cpuID string + vendorID string + cpuFamily string + model string + flags string +} diff --git a/plugins/inputs/intel_powerstat/file.go b/plugins/inputs/intel_powerstat/file.go new file mode 100644 index 0000000000000..7953726fd9ba8 --- /dev/null +++ b/plugins/inputs/intel_powerstat/file.go @@ -0,0 +1,154 @@ +// +build linux + +package intel_powerstat + +import ( + "bufio" + "encoding/binary" + "fmt" + "io" + "io/ioutil" + "os" + "path/filepath" + "regexp" + "strconv" + "strings" + "time" +) + +// fileService is responsible for handling operations on files. +type fileService interface { + getCPUInfoStats() (map[string]*cpuInfo, error) + getStringsMatchingPatternOnPath(path string) ([]string, error) + readFile(path string) ([]byte, error) + readFileToFloat64(reader io.Reader) (float64, int64, error) + readFileAtOffsetToUint64(reader io.ReaderAt, offset int64) (uint64, error) +} + +type fileServiceImpl struct { +} + +// getCPUInfoStats retrieves basic information about CPU from /proc/cpuinfo. +func (fs *fileServiceImpl) getCPUInfoStats() (map[string]*cpuInfo, error) { + path := "/proc/cpuinfo" + cpuInfoFile, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("error while reading %s, err: %v", path, err) + } + defer cpuInfoFile.Close() + + scanner := bufio.NewScanner(cpuInfoFile) + + processorRegexp := regexp.MustCompile(`^processor\t+:\s([0-9]+)\n*$`) + physicalIDRegexp := regexp.MustCompile(`^physical id\t+:\s([0-9]+)\n*$`) + coreIDRegexp := regexp.MustCompile(`^core id\t+:\s([0-9]+)\n*$`) + vendorIDRegexp := regexp.MustCompile(`^vendor_id\t+:\s([a-zA-Z]+)\n*$`) + cpuFamilyRegexp := regexp.MustCompile(`^cpu\sfamily\t+:\s([0-9]+)\n*$`) + modelRegexp := regexp.MustCompile(`^model\t+:\s([0-9]+)\n*$`) + flagsRegexp := regexp.MustCompile(`^flags\t+:\s(.+)\n*$`) + + stats := make(map[string]*cpuInfo) + currentInfo := &cpuInfo{} + + for scanner.Scan() { + line := scanner.Text() + + processorRes := processorRegexp.FindStringSubmatch(line) + if len(processorRes) > 1 { + currentInfo = &cpuInfo{ + cpuID: processorRes[1], + } + } + + vendorIDRes := vendorIDRegexp.FindStringSubmatch(line) + if len(vendorIDRes) > 1 { + currentInfo.vendorID = vendorIDRes[1] + } + + physicalIDRes := physicalIDRegexp.FindStringSubmatch(line) + if len(physicalIDRes) > 1 { + currentInfo.physicalID = physicalIDRes[1] + } + + coreIDRes := coreIDRegexp.FindStringSubmatch(line) + if len(coreIDRes) > 1 { + currentInfo.coreID = coreIDRes[1] + } + + cpuFamilyRes := cpuFamilyRegexp.FindStringSubmatch(line) + if len(cpuFamilyRes) > 1 { + currentInfo.cpuFamily = cpuFamilyRes[1] + } + + modelRes := modelRegexp.FindStringSubmatch(line) + if len(modelRes) > 1 { + currentInfo.model = modelRes[1] + } + + flagsRes := flagsRegexp.FindStringSubmatch(line) + if len(flagsRes) > 1 { + currentInfo.flags = flagsRes[1] + + // Flags is the last value we have to acquire, so currentInfo is added to map. + stats[currentInfo.cpuID] = currentInfo + } + } + + return stats, nil +} + +// getStringsMatchingPatternOnPath looks for filenames and directory names on path matching given regexp. +// It ignores file system errors such as I/O errors reading directories. The only possible returned error +// is ErrBadPattern, when pattern is malformed. +func (fs *fileServiceImpl) getStringsMatchingPatternOnPath(path string) ([]string, error) { + return filepath.Glob(path) +} + +// readFile reads file on path and return string content. +func (fs *fileServiceImpl) readFile(path string) ([]byte, error) { + out, err := ioutil.ReadFile(path) + if err != nil { + return make([]byte, 0), err + } + return out, nil +} + +// readFileToFloat64 reads file on path and tries to parse content to float64. +func (fs *fileServiceImpl) readFileToFloat64(reader io.Reader) (float64, int64, error) { + read, err := ioutil.ReadAll(reader) + if err != nil { + return 0, 0, err + } + + readDate := time.Now().UnixNano() + + // Remove new line character + trimmedString := strings.TrimRight(string(read), "\n") + // Parse result to float64 + parsedValue, err := strconv.ParseFloat(trimmedString, 64) + if err != nil { + return 0, 0, fmt.Errorf("error parsing string to float for %s", trimmedString) + } + + return parsedValue, readDate, nil +} + +// readFileAtOffsetToUint64 reads 8 bytes from passed file at given offset. +func (fs *fileServiceImpl) readFileAtOffsetToUint64(reader io.ReaderAt, offset int64) (uint64, error) { + buffer := make([]byte, 8) + + if offset == 0 { + return 0, fmt.Errorf("file offset %d should not be 0", offset) + } + + _, err := reader.ReadAt(buffer, offset) + if err != nil { + return 0, fmt.Errorf("error on reading file at offset %d, err: %v", offset, err) + } + + return binary.LittleEndian.Uint64(buffer), nil +} + +func newFileService() *fileServiceImpl { + return &fileServiceImpl{} +} diff --git a/plugins/inputs/intel_powerstat/file_mock_test.go b/plugins/inputs/intel_powerstat/file_mock_test.go new file mode 100644 index 0000000000000..ab4bd8c57baa6 --- /dev/null +++ b/plugins/inputs/intel_powerstat/file_mock_test.go @@ -0,0 +1,132 @@ +// Code generated by mockery v0.0.0-dev. DO NOT EDIT. + +package intel_powerstat + +import ( + io "io" + + mock "github.com/stretchr/testify/mock" +) + +// mockFileService is an autogenerated mock type for the fileService type +type mockFileService struct { + mock.Mock +} + +// getCPUInfoStats provides a mock function with given fields: +func (_m *mockFileService) getCPUInfoStats() (map[string]*cpuInfo, error) { + ret := _m.Called() + + var r0 map[string]*cpuInfo + if rf, ok := ret.Get(0).(func() map[string]*cpuInfo); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(map[string]*cpuInfo) + } + } + + var r1 error + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// getStringsMatchingPatternOnPath provides a mock function with given fields: path +func (_m *mockFileService) getStringsMatchingPatternOnPath(path string) ([]string, error) { + ret := _m.Called(path) + + var r0 []string + if rf, ok := ret.Get(0).(func(string) []string); ok { + r0 = rf(path) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]string) + } + } + + var r1 error + if rf, ok := ret.Get(1).(func(string) error); ok { + r1 = rf(path) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// readFile provides a mock function with given fields: path +func (_m *mockFileService) readFile(path string) ([]byte, error) { + ret := _m.Called(path) + + var r0 []byte + if rf, ok := ret.Get(0).(func(string) []byte); ok { + r0 = rf(path) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]byte) + } + } + + var r1 error + if rf, ok := ret.Get(1).(func(string) error); ok { + r1 = rf(path) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// readFileAtOffsetToUint64 provides a mock function with given fields: reader, offset +func (_m *mockFileService) readFileAtOffsetToUint64(reader io.ReaderAt, offset int64) (uint64, error) { + ret := _m.Called(reader, offset) + + var r0 uint64 + if rf, ok := ret.Get(0).(func(io.ReaderAt, int64) uint64); ok { + r0 = rf(reader, offset) + } else { + r0 = ret.Get(0).(uint64) + } + + var r1 error + if rf, ok := ret.Get(1).(func(io.ReaderAt, int64) error); ok { + r1 = rf(reader, offset) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// readFileToFloat64 provides a mock function with given fields: reader +func (_m *mockFileService) readFileToFloat64(reader io.Reader) (float64, int64, error) { + ret := _m.Called(reader) + + var r0 float64 + if rf, ok := ret.Get(0).(func(io.Reader) float64); ok { + r0 = rf(reader) + } else { + r0 = ret.Get(0).(float64) + } + + var r1 int64 + if rf, ok := ret.Get(1).(func(io.Reader) int64); ok { + r1 = rf(reader) + } else { + r1 = ret.Get(1).(int64) + } + + var r2 error + if rf, ok := ret.Get(2).(func(io.Reader) error); ok { + r2 = rf(reader) + } else { + r2 = ret.Error(2) + } + + return r0, r1, r2 +} diff --git a/plugins/inputs/intel_powerstat/intel_powerstat.go b/plugins/inputs/intel_powerstat/intel_powerstat.go new file mode 100644 index 0000000000000..9340fdec814b1 --- /dev/null +++ b/plugins/inputs/intel_powerstat/intel_powerstat.go @@ -0,0 +1,486 @@ +// +build linux + +package intel_powerstat + +import ( + "fmt" + "math/big" + "strings" + "sync" + "time" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/plugins/inputs" +) + +const ( + cpuFrequency = "cpu_frequency" + cpuBusyFrequency = "cpu_busy_frequency" + cpuTemperature = "cpu_temperature" + cpuC1StateResidency = "cpu_c1_state_residency" + cpuC6StateResidency = "cpu_c6_state_residency" + cpuBusyCycles = "cpu_busy_cycles" + percentageMultiplier = 100 +) + +// PowerStat plugin enables monitoring of platform metrics (power, TDP) and Core metrics like temperature, power and utilization. +type PowerStat struct { + CPUMetrics []string `toml:"cpu_metrics"` + Log telegraf.Logger `toml:"-"` + + fs fileService + rapl raplService + msr msrService + + cpuFrequency bool + cpuBusyFrequency bool + cpuTemperature bool + cpuC1StateResidency bool + cpuC6StateResidency bool + cpuBusyCycles bool + cpuInfo map[string]*cpuInfo + skipFirstIteration bool +} + +// Description returns a one-sentence description on the plugin. +func (p *PowerStat) Description() string { + return `Intel PowerStat plugin enables monitoring of platform metrics (power, TDP) and Core metrics like temperature, power and utilization.` +} + +// SampleConfig returns the default configuration of the plugin. +func (p *PowerStat) SampleConfig() string { + return ` + ## All global metrics are always collected by Intel PowerStat plugin. + ## User can choose which per-CPU metrics are monitored by the plugin in cpu_metrics array. + ## Empty array means no per-CPU specific metrics will be collected by the plugin - in this case only platform level + ## telemetry will be exposed by Intel PowerStat plugin. + ## Supported options: + ## "cpu_frequency", "cpu_busy_frequency", "cpu_temperature", "cpu_c1_state_residency", "cpu_c6_state_residency", "cpu_busy_cycles" + # cpu_metrics = [] +` +} + +// Init performs one time setup of the plugin. +func (p *PowerStat) Init() error { + p.parseCPUMetricsConfig() + err := p.verifyProcessor() + if err != nil { + return err + } + // Initialize MSR service only when there is at least one core metric enabled. + if p.cpuFrequency || p.cpuBusyFrequency || p.cpuTemperature || p.cpuC1StateResidency || + p.cpuC6StateResidency || p.cpuBusyCycles { + p.msr = newMsrServiceWithFs(p.Log, p.fs) + } + p.rapl = newRaplServiceWithFs(p.Log, p.fs) + + return nil +} + +// Gather takes in an accumulator and adds the metrics that the Input gathers. +func (p *PowerStat) Gather(acc telegraf.Accumulator) error { + p.addGlobalMetrics(acc) + + if p.areCoreMetricsEnabled() { + p.addPerCoreMetrics(acc) + } + + // Gathering the first iteration of metrics was skipped for most of them because they are based on delta calculations. + p.skipFirstIteration = false + + return nil +} + +func (p *PowerStat) addGlobalMetrics(acc telegraf.Accumulator) { + // Prepare RAPL data each gather because there is a possibility to disable rapl kernel module + p.rapl.initializeRaplData() + + for socketID := range p.rapl.getRaplData() { + err := p.rapl.retrieveAndCalculateData(socketID) + if err != nil { + // In case of an error skip calculating metrics for this socket + p.Log.Errorf("error fetching rapl data for socket %s, err: %v", socketID, err) + continue + } + p.addThermalDesignPowerMetric(socketID, acc) + if p.skipFirstIteration { + continue + } + p.addCurrentSocketPowerConsumption(socketID, acc) + p.addCurrentDramPowerConsumption(socketID, acc) + } +} + +func (p *PowerStat) addThermalDesignPowerMetric(socketID string, acc telegraf.Accumulator) { + maxPower, err := p.rapl.getConstraintMaxPowerWatts(socketID) + if err != nil { + p.Log.Errorf("error while retrieving TDP of the socket %s, err: %v", socketID, err) + return + } + + tags := map[string]string{ + "package_id": socketID, + } + + fields := map[string]interface{}{ + "thermal_design_power_watts": roundFloatToNearestTwoDecimalPlaces(maxPower), + } + + acc.AddGauge("powerstat_package", fields, tags) +} + +func (p *PowerStat) addCurrentSocketPowerConsumption(socketID string, acc telegraf.Accumulator) { + tags := map[string]string{ + "package_id": socketID, + } + + fields := map[string]interface{}{ + "current_power_consumption_watts": roundFloatToNearestTwoDecimalPlaces(p.rapl.getRaplData()[socketID].socketCurrentEnergy), + } + + acc.AddGauge("powerstat_package", fields, tags) +} + +func (p *PowerStat) addCurrentDramPowerConsumption(socketID string, acc telegraf.Accumulator) { + tags := map[string]string{ + "package_id": socketID, + } + + fields := map[string]interface{}{ + "current_dram_power_consumption_watts": roundFloatToNearestTwoDecimalPlaces(p.rapl.getRaplData()[socketID].dramCurrentEnergy), + } + + acc.AddGauge("powerstat_package", fields, tags) +} + +func (p *PowerStat) addPerCoreMetrics(acc telegraf.Accumulator) { + var wg sync.WaitGroup + wg.Add(len(p.msr.getCPUCoresData())) + + for cpuID := range p.msr.getCPUCoresData() { + go p.addMetricsForSingleCore(cpuID, acc, &wg) + } + + wg.Wait() +} + +func (p *PowerStat) addMetricsForSingleCore(cpuID string, acc telegraf.Accumulator, wg *sync.WaitGroup) { + defer wg.Done() + + if p.cpuFrequency { + p.addCPUFrequencyMetric(cpuID, acc) + } + + // Read data from MSR only if required + if p.cpuC1StateResidency || p.cpuC6StateResidency || p.cpuBusyCycles || p.cpuTemperature || + p.cpuBusyFrequency { + err := p.msr.openAndReadMsr(cpuID) + if err != nil { + // In case of an error exit the function. All metrics past this point are dependant on MSR. + p.Log.Debugf("error while reading msr: %v", err) + return + } + } + + if p.cpuTemperature { + p.addCPUTemperatureMetric(cpuID, acc) + } + + // cpuBusyFrequency metric does some calculations inside that are required in another plugin cycle. + if p.cpuBusyFrequency { + p.addCPUBusyFrequencyMetric(cpuID, acc) + } + + if !p.skipFirstIteration { + if p.cpuC1StateResidency { + p.addCPUC1StateResidencyMetric(cpuID, acc) + } + + if p.cpuC6StateResidency { + p.addCPUC6StateResidencyMetric(cpuID, acc) + } + + if p.cpuBusyCycles { + p.addCPUBusyCyclesMetric(cpuID, acc) + } + } +} + +func (p *PowerStat) addCPUFrequencyMetric(cpuID string, acc telegraf.Accumulator) { + frequency, err := p.msr.retrieveCPUFrequencyForCore(cpuID) + + // In case of an error leave func + if err != nil { + p.Log.Debugf("error while reading file: %v", err) + return + } + + cpu := p.cpuInfo[cpuID] + tags := map[string]string{ + "package_id": cpu.physicalID, + "core_id": cpu.coreID, + "cpu_id": cpu.cpuID, + } + + fields := map[string]interface{}{ + "cpu_frequency_mhz": roundFloatToNearestTwoDecimalPlaces(frequency), + } + + acc.AddGauge("powerstat_core", fields, tags) +} + +func (p *PowerStat) addCPUTemperatureMetric(cpuID string, acc telegraf.Accumulator) { + coresData := p.msr.getCPUCoresData() + temp := coresData[cpuID].throttleTemp - coresData[cpuID].temp + + cpu := p.cpuInfo[cpuID] + tags := map[string]string{ + "package_id": cpu.physicalID, + "core_id": cpu.coreID, + "cpu_id": cpu.cpuID, + } + fields := map[string]interface{}{ + "cpu_temperature_celsius": temp, + } + + acc.AddGauge("powerstat_core", fields, tags) +} + +func (p *PowerStat) addCPUBusyFrequencyMetric(cpuID string, acc telegraf.Accumulator) { + coresData := p.msr.getCPUCoresData() + mperfDelta := coresData[cpuID].mperfDelta + // Avoid division by 0 + if mperfDelta == 0 { + p.Log.Errorf("mperf delta should not equal 0 on core %s", cpuID) + return + } + aperfMperf := float64(coresData[cpuID].aperfDelta) / float64(mperfDelta) + tsc := convertProcessorCyclesToHertz(coresData[cpuID].timeStampCounterDelta) + timeNow := time.Now().UnixNano() + interval := convertNanoSecondsToSeconds(timeNow - coresData[cpuID].readDate) + coresData[cpuID].readDate = timeNow + + if p.skipFirstIteration { + return + } + + if interval == 0 { + p.Log.Errorf("interval between last two Telegraf cycles is 0") + return + } + + busyMhzValue := roundFloatToNearestTwoDecimalPlaces(tsc * aperfMperf / interval) + + cpu := p.cpuInfo[cpuID] + tags := map[string]string{ + "package_id": cpu.physicalID, + "core_id": cpu.coreID, + "cpu_id": cpu.cpuID, + } + fields := map[string]interface{}{ + "cpu_busy_frequency_mhz": busyMhzValue, + } + + acc.AddGauge("powerstat_core", fields, tags) +} + +func (p *PowerStat) addCPUC1StateResidencyMetric(cpuID string, acc telegraf.Accumulator) { + coresData := p.msr.getCPUCoresData() + timestampDeltaBig := new(big.Int).SetUint64(coresData[cpuID].timeStampCounterDelta) + // Avoid division by 0 + if timestampDeltaBig.Sign() < 1 { + p.Log.Errorf("timestamp delta value %v should not be lower than 1", timestampDeltaBig) + return + } + + // Since counter collection is not atomic it may happen that sum of C0, C1, C3, C6 and C7 + // is bigger value than TSC, in such case C1 residency shall be set to 0. + // Operating on big.Int to avoid overflow + mperfDeltaBig := new(big.Int).SetUint64(coresData[cpuID].mperfDelta) + c3DeltaBig := new(big.Int).SetUint64(coresData[cpuID].c3Delta) + c6DeltaBig := new(big.Int).SetUint64(coresData[cpuID].c6Delta) + c7DeltaBig := new(big.Int).SetUint64(coresData[cpuID].c7Delta) + + c1Big := new(big.Int).Sub(timestampDeltaBig, mperfDeltaBig) + c1Big.Sub(c1Big, c3DeltaBig) + c1Big.Sub(c1Big, c6DeltaBig) + c1Big.Sub(c1Big, c7DeltaBig) + + if c1Big.Sign() < 0 { + c1Big = c1Big.SetInt64(0) + } + c1Value := roundFloatToNearestTwoDecimalPlaces(percentageMultiplier * float64(c1Big.Uint64()) / float64(timestampDeltaBig.Uint64())) + + cpu := p.cpuInfo[cpuID] + tags := map[string]string{ + "package_id": cpu.physicalID, + "core_id": cpu.coreID, + "cpu_id": cpu.cpuID, + } + fields := map[string]interface{}{ + "cpu_c1_state_residency_percent": c1Value, + } + + acc.AddGauge("powerstat_core", fields, tags) +} + +func (p *PowerStat) addCPUC6StateResidencyMetric(cpuID string, acc telegraf.Accumulator) { + coresData := p.msr.getCPUCoresData() + // Avoid division by 0 + if coresData[cpuID].timeStampCounterDelta == 0 { + p.Log.Errorf("timestamp counter on offset %s should not equal 0 on cpuID %s", + timestampCounterLocation, cpuID) + return + } + c6Value := roundFloatToNearestTwoDecimalPlaces(percentageMultiplier * + float64(coresData[cpuID].c6Delta) / float64(coresData[cpuID].timeStampCounterDelta)) + + cpu := p.cpuInfo[cpuID] + tags := map[string]string{ + "package_id": cpu.physicalID, + "core_id": cpu.coreID, + "cpu_id": cpu.cpuID, + } + fields := map[string]interface{}{ + "cpu_c6_state_residency_percent": c6Value, + } + + acc.AddGauge("powerstat_core", fields, tags) +} + +func (p *PowerStat) addCPUBusyCyclesMetric(cpuID string, acc telegraf.Accumulator) { + coresData := p.msr.getCPUCoresData() + // Avoid division by 0 + if coresData[cpuID].timeStampCounterDelta == 0 { + p.Log.Errorf("timestamp counter on offset %s should not equal 0 on cpuID %s", + timestampCounterLocation, cpuID) + return + } + busyCyclesValue := roundFloatToNearestTwoDecimalPlaces(percentageMultiplier * + float64(coresData[cpuID].mperfDelta) / float64(coresData[cpuID].timeStampCounterDelta)) + cpu := p.cpuInfo[cpuID] + tags := map[string]string{ + "package_id": cpu.physicalID, + "core_id": cpu.coreID, + "cpu_id": cpu.cpuID, + } + fields := map[string]interface{}{ + "cpu_busy_cycles_percent": busyCyclesValue, + } + + acc.AddGauge("powerstat_core", fields, tags) +} + +func (p *PowerStat) parseCPUMetricsConfig() { + if len(p.CPUMetrics) == 0 { + return + } + + if contains(p.CPUMetrics, cpuFrequency) { + p.cpuFrequency = true + } + + if contains(p.CPUMetrics, cpuC1StateResidency) { + p.cpuC1StateResidency = true + } + + if contains(p.CPUMetrics, cpuC6StateResidency) { + p.cpuC6StateResidency = true + } + + if contains(p.CPUMetrics, cpuBusyCycles) { + p.cpuBusyCycles = true + } + + if contains(p.CPUMetrics, cpuBusyFrequency) { + p.cpuBusyFrequency = true + } + + if contains(p.CPUMetrics, cpuTemperature) { + p.cpuTemperature = true + } +} + +func (p *PowerStat) verifyProcessor() error { + allowedProcessorModelsForC1C6 := []int64{0x37, 0x4D, 0x5C, 0x5F, 0x7A, 0x4C, 0x86, 0x96, 0x9C, + 0x1A, 0x1E, 0x1F, 0x2E, 0x25, 0x2C, 0x2F, 0x2A, 0x2D, 0x3A, 0x3E, 0x4E, 0x5E, 0x55, 0x8E, + 0x9E, 0x6A, 0x6C, 0x7D, 0x7E, 0x9D, 0x3C, 0x3F, 0x45, 0x46, 0x3D, 0x47, 0x4F, 0x56, + 0x66, 0x57, 0x85, 0xA5, 0xA6, 0x8F, 0x8C, 0x8D} + stats, err := p.fs.getCPUInfoStats() + if err != nil { + return err + } + + p.cpuInfo = stats + + // First CPU is sufficient for verification. + firstCPU := p.cpuInfo["0"] + if firstCPU == nil { + return fmt.Errorf("first core not found while parsing /proc/cpuinfo") + } + + if firstCPU.vendorID != "GenuineIntel" || firstCPU.cpuFamily != "6" { + return fmt.Errorf("Intel processor not found, vendorId: %s", firstCPU.vendorID) + } + + if !contains(convertIntegerArrayToStringArray(allowedProcessorModelsForC1C6), firstCPU.model) { + p.cpuC1StateResidency = false + p.cpuC6StateResidency = false + } + + if !strings.Contains(firstCPU.flags, "msr") { + p.cpuTemperature = false + p.cpuC6StateResidency = false + p.cpuBusyCycles = false + p.cpuBusyFrequency = false + p.cpuC1StateResidency = false + } + + if !strings.Contains(firstCPU.flags, "aperfmperf") { + p.cpuBusyFrequency = false + p.cpuBusyCycles = false + p.cpuC1StateResidency = false + } + + if !strings.Contains(firstCPU.flags, "dts") { + p.cpuTemperature = false + } + + return nil +} + +func contains(slice []string, str string) bool { + for _, v := range slice { + if v == str { + return true + } + } + + return false +} + +func (p *PowerStat) areCoreMetricsEnabled() bool { + return p.msr != nil && len(p.msr.getCPUCoresData()) > 0 +} + +// newPowerStat creates and returns PowerStat struct. +func newPowerStat(fs fileService) *PowerStat { + p := &PowerStat{ + cpuFrequency: false, + cpuC1StateResidency: false, + cpuC6StateResidency: false, + cpuBusyCycles: false, + cpuTemperature: false, + cpuBusyFrequency: false, + skipFirstIteration: true, + fs: fs, + } + + return p +} + +func init() { + inputs.Add("intel_powerstat", func() telegraf.Input { + return newPowerStat(newFileService()) + }) +} diff --git a/plugins/inputs/intel_powerstat/intel_powerstat_notlinux.go b/plugins/inputs/intel_powerstat/intel_powerstat_notlinux.go new file mode 100644 index 0000000000000..f46755cee92b7 --- /dev/null +++ b/plugins/inputs/intel_powerstat/intel_powerstat_notlinux.go @@ -0,0 +1,3 @@ +// +build !linux + +package intel_powerstat diff --git a/plugins/inputs/intel_powerstat/intel_powerstat_test.go b/plugins/inputs/intel_powerstat/intel_powerstat_test.go new file mode 100644 index 0000000000000..13006de3c6e81 --- /dev/null +++ b/plugins/inputs/intel_powerstat/intel_powerstat_test.go @@ -0,0 +1,494 @@ +// +build linux + +package intel_powerstat + +import ( + "errors" + "strconv" + "sync" + "testing" + + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" + + "github.com/influxdata/telegraf/testutil" +) + +func TestInitPlugin(t *testing.T) { + cores := []string{"cpu0", "cpu1", "cpu2", "cpu3"} + power, fsMock, _, _ := getPowerWithMockedServices() + + fsMock.On("getCPUInfoStats", mock.Anything). + Return(nil, errors.New("error getting cpu stats")).Once() + require.Error(t, power.Init()) + + fsMock.On("getCPUInfoStats", mock.Anything). + Return(make(map[string]*cpuInfo), nil).Once() + require.Error(t, power.Init()) + + fsMock.On("getCPUInfoStats", mock.Anything). + Return(map[string]*cpuInfo{"0": { + vendorID: "GenuineIntel", + cpuFamily: "test", + }}, nil).Once() + require.Error(t, power.Init()) + + fsMock.On("getStringsMatchingPatternOnPath", mock.Anything). + Return(cores, nil).Once(). + On("getCPUInfoStats", mock.Anything). + Return(map[string]*cpuInfo{"0": { + vendorID: "GenuineIntel", + cpuFamily: "6", + }}, nil) + // Verify MSR service initialization. + power.cpuFrequency = true + require.NoError(t, power.Init()) + fsMock.AssertCalled(t, "getStringsMatchingPatternOnPath", mock.Anything) + require.Equal(t, len(cores), len(power.msr.getCPUCoresData())) + + fsMock.On("getStringsMatchingPatternOnPath", mock.Anything). + Return(nil, errors.New("error during getStringsMatchingPatternOnPath")).Once() + + // In case of an error when fetching cpu cores plugin should proceed with execution. + require.NoError(t, power.Init()) + fsMock.AssertCalled(t, "getStringsMatchingPatternOnPath", mock.Anything) + require.Equal(t, 0, len(power.msr.getCPUCoresData())) +} + +func TestParseCPUMetricsConfig(t *testing.T) { + power, _, _, _ := getPowerWithMockedServices() + disableCoreMetrics(power) + + power.CPUMetrics = []string{ + "cpu_frequency", "cpu_c1_state_residency", "cpu_c6_state_residency", "cpu_busy_cycles", "cpu_temperature", + "cpu_busy_frequency", + } + power.parseCPUMetricsConfig() + verifyCoreMetrics(t, power, true) + disableCoreMetrics(power) + verifyCoreMetrics(t, power, false) + + power.CPUMetrics = []string{} + power.parseCPUMetricsConfig() + + power.CPUMetrics = []string{"cpu_c6_state_residency", "#@$sdkjdfsdf3@", "1pu_c1_state_residency"} + power.parseCPUMetricsConfig() + require.Equal(t, false, power.cpuC1StateResidency) + require.Equal(t, true, power.cpuC6StateResidency) + disableCoreMetrics(power) + verifyCoreMetrics(t, power, false) + + power.CPUMetrics = []string{"#@$sdkjdfsdf3@", "1pu_c1_state_residency", "123"} + power.parseCPUMetricsConfig() + verifyCoreMetrics(t, power, false) +} + +func verifyCoreMetrics(t *testing.T, power *PowerStat, enabled bool) { + require.Equal(t, enabled, power.cpuFrequency) + require.Equal(t, enabled, power.cpuC1StateResidency) + require.Equal(t, enabled, power.cpuC6StateResidency) + require.Equal(t, enabled, power.cpuBusyCycles) + require.Equal(t, enabled, power.cpuBusyFrequency) + require.Equal(t, enabled, power.cpuTemperature) +} + +func TestGather(t *testing.T) { + var acc testutil.Accumulator + packageIDs := []string{"0", "1"} + coreIDs := []string{"0", "1", "2", "3"} + socketCurrentEnergy := 13213852.2 + dramCurrentEnergy := 784552.0 + preparedCPUData := getPreparedCPUData(coreIDs) + raplDataMap := prepareRaplDataMap(packageIDs, socketCurrentEnergy, dramCurrentEnergy) + + power, _, raplMock, msrMock := getPowerWithMockedServices() + prepareCPUInfo(power, coreIDs, packageIDs) + enableCoreMetrics(power) + power.skipFirstIteration = false + + raplMock.On("initializeRaplData", mock.Anything). + On("getRaplData").Return(raplDataMap). + On("retrieveAndCalculateData", mock.Anything).Return(nil).Times(len(raplDataMap)). + On("getConstraintMaxPowerWatts", mock.Anything).Return(546783852.3, nil) + msrMock.On("getCPUCoresData").Return(preparedCPUData). + On("openAndReadMsr", mock.Anything).Return(nil). + On("retrieveCPUFrequencyForCore", mock.Anything).Return(1200000.2, nil) + + require.NoError(t, power.Gather(&acc)) + // Number of global metrics : 3 + // Number of per core metrics : 6 + require.Equal(t, 3*len(packageIDs)+6*len(coreIDs), len(acc.GetTelegrafMetrics())) +} + +func TestAddGlobalMetricsNegative(t *testing.T) { + var acc testutil.Accumulator + socketCurrentEnergy := 13213852.2 + dramCurrentEnergy := 784552.0 + raplDataMap := prepareRaplDataMap([]string{"0", "1"}, socketCurrentEnergy, dramCurrentEnergy) + power, _, raplMock, _ := getPowerWithMockedServices() + power.skipFirstIteration = false + raplMock.On("initializeRaplData", mock.Anything).Once(). + On("getRaplData").Return(raplDataMap).Once(). + On("retrieveAndCalculateData", mock.Anything).Return(errors.New("error while calculating data")).Times(len(raplDataMap)) + + power.addGlobalMetrics(&acc) + require.Equal(t, 0, len(acc.GetTelegrafMetrics())) + raplMock.AssertNumberOfCalls(t, "retrieveAndCalculateData", len(raplDataMap)) + + raplMock.On("initializeRaplData", mock.Anything).Once(). + On("getRaplData").Return(make(map[string]*raplData)).Once() + + power.addGlobalMetrics(&acc) + require.Equal(t, 0, len(acc.GetTelegrafMetrics())) + raplMock.AssertNotCalled(t, "retrieveAndCalculateData") + + raplMock.On("initializeRaplData", mock.Anything).Once(). + On("getRaplData").Return(raplDataMap). + On("retrieveAndCalculateData", mock.Anything).Return(nil).Once(). + On("retrieveAndCalculateData", mock.Anything).Return(errors.New("error while calculating data")).Once(). + On("getConstraintMaxPowerWatts", mock.Anything).Return(12313851.5, nil).Twice() + + power.addGlobalMetrics(&acc) + require.Equal(t, 3, len(acc.GetTelegrafMetrics())) +} + +func TestAddGlobalMetricsPositive(t *testing.T) { + var acc testutil.Accumulator + socketCurrentEnergy := 3644574.4 + dramCurrentEnergy := 124234872.5 + raplDataMap := prepareRaplDataMap([]string{"0", "1"}, socketCurrentEnergy, dramCurrentEnergy) + maxPower := 546783852.9 + power, _, raplMock, _ := getPowerWithMockedServices() + power.skipFirstIteration = false + + raplMock.On("initializeRaplData", mock.Anything). + On("getRaplData").Return(raplDataMap). + On("retrieveAndCalculateData", mock.Anything).Return(nil).Times(len(raplDataMap)). + On("getConstraintMaxPowerWatts", mock.Anything).Return(maxPower, nil).Twice(). + On("getCurrentDramPowerConsumption", mock.Anything).Return(dramCurrentEnergy) + + power.addGlobalMetrics(&acc) + require.Equal(t, 6, len(acc.GetTelegrafMetrics())) + + expectedResults := getGlobalMetrics(maxPower, socketCurrentEnergy, dramCurrentEnergy) + for _, test := range expectedResults { + acc.AssertContainsTaggedFields(t, "powerstat_package", test.fields, test.tags) + } +} + +func TestAddMetricsForSingleCoreNegative(t *testing.T) { + var wg sync.WaitGroup + var acc testutil.Accumulator + core := "0" + power, _, _, msrMock := getPowerWithMockedServices() + + msrMock.On("openAndReadMsr", core).Return(errors.New("error reading MSR file")).Once() + + // Skip generating metric for CPU frequency. + power.cpuFrequency = false + + wg.Add(1) + power.addMetricsForSingleCore(core, &acc, &wg) + wg.Wait() + + require.Equal(t, 0, len(acc.GetTelegrafMetrics())) +} + +func TestAddCPUFrequencyMetric(t *testing.T) { + var acc testutil.Accumulator + cpuID := "0" + coreID := "2" + packageID := "1" + frequency := 1200000.2 + power, _, _, msrMock := getPowerWithMockedServices() + prepareCPUInfoForSingleCPU(power, cpuID, coreID, packageID) + + msrMock.On("retrieveCPUFrequencyForCore", mock.Anything). + Return(float64(0), errors.New("error on reading file")).Once() + + power.addCPUFrequencyMetric(cpuID, &acc) + require.Equal(t, 0, len(acc.GetTelegrafMetrics())) + + msrMock.On("retrieveCPUFrequencyForCore", mock.Anything).Return(frequency, nil).Once() + + power.addCPUFrequencyMetric(cpuID, &acc) + require.Equal(t, 1, len(acc.GetTelegrafMetrics())) + + expectedFrequency := roundFloatToNearestTwoDecimalPlaces(frequency) + expectedMetric := getPowerCoreMetric("cpu_frequency_mhz", expectedFrequency, coreID, packageID, cpuID) + acc.AssertContainsTaggedFields(t, "powerstat_core", expectedMetric.fields, expectedMetric.tags) +} + +func TestAddCoreCPUTemperatureMetric(t *testing.T) { + var acc testutil.Accumulator + cpuID := "0" + coreID := "2" + packageID := "1" + power, _, _, msrMock := getPowerWithMockedServices() + preparedData := getPreparedCPUData([]string{cpuID}) + expectedTemp := preparedData[cpuID].throttleTemp - preparedData[cpuID].temp + prepareCPUInfoForSingleCPU(power, cpuID, coreID, packageID) + + msrMock.On("getCPUCoresData").Return(preparedData).Once() + power.addCPUTemperatureMetric(cpuID, &acc) + require.Equal(t, 1, len(acc.GetTelegrafMetrics())) + + expectedMetric := getPowerCoreMetric("cpu_temperature_celsius", expectedTemp, coreID, packageID, cpuID) + acc.AssertContainsTaggedFields(t, "powerstat_core", expectedMetric.fields, expectedMetric.tags) +} + +func TestAddC6StateResidencyMetric(t *testing.T) { + var acc testutil.Accumulator + cpuID := "0" + coreID := "2" + packageID := "1" + power, _, _, msrMock := getPowerWithMockedServices() + prepareCPUInfoForSingleCPU(power, cpuID, coreID, packageID) + preparedData := getPreparedCPUData([]string{cpuID}) + expectedC6 := roundFloatToNearestTwoDecimalPlaces(percentageMultiplier * + float64(preparedData[cpuID].c6Delta) / float64(preparedData[cpuID].timeStampCounterDelta)) + + msrMock.On("getCPUCoresData").Return(preparedData).Twice() + power.addCPUC6StateResidencyMetric(cpuID, &acc) + require.Equal(t, 1, len(acc.GetTelegrafMetrics())) + + expectedMetric := getPowerCoreMetric("cpu_c6_state_residency_percent", expectedC6, coreID, packageID, cpuID) + acc.AssertContainsTaggedFields(t, "powerstat_core", expectedMetric.fields, expectedMetric.tags) + + acc.ClearMetrics() + preparedData[cpuID].timeStampCounterDelta = 0 + + power.addCPUC6StateResidencyMetric(cpuID, &acc) + require.Equal(t, 0, len(acc.GetTelegrafMetrics())) +} + +func TestAddProcessorBusyCyclesMetric(t *testing.T) { + var acc testutil.Accumulator + cpuID := "0" + coreID := "2" + packageID := "1" + power, _, _, msrMock := getPowerWithMockedServices() + prepareCPUInfoForSingleCPU(power, cpuID, coreID, packageID) + preparedData := getPreparedCPUData([]string{cpuID}) + expectedBusyCycles := roundFloatToNearestTwoDecimalPlaces(percentageMultiplier * float64(preparedData[cpuID].mperfDelta) / + float64(preparedData[cpuID].timeStampCounterDelta)) + + msrMock.On("getCPUCoresData").Return(preparedData).Twice() + power.addCPUBusyCyclesMetric(cpuID, &acc) + require.Equal(t, 1, len(acc.GetTelegrafMetrics())) + + expectedMetric := getPowerCoreMetric("cpu_busy_cycles_percent", expectedBusyCycles, coreID, packageID, cpuID) + acc.AssertContainsTaggedFields(t, "powerstat_core", expectedMetric.fields, expectedMetric.tags) + + acc.ClearMetrics() + preparedData[cpuID].timeStampCounterDelta = 0 + power.addCPUBusyCyclesMetric(cpuID, &acc) + require.Equal(t, 0, len(acc.GetTelegrafMetrics())) +} + +func TestAddProcessorBusyFrequencyMetric(t *testing.T) { + var acc testutil.Accumulator + cpuID := "0" + coreID := "2" + packageID := "1" + power, _, _, msrMock := getPowerWithMockedServices() + prepareCPUInfoForSingleCPU(power, cpuID, coreID, packageID) + preparedData := getPreparedCPUData([]string{cpuID}) + power.skipFirstIteration = false + + msrMock.On("getCPUCoresData").Return(preparedData).Twice() + power.addCPUBusyFrequencyMetric(cpuID, &acc) + require.Equal(t, 1, len(acc.GetTelegrafMetrics())) + + acc.ClearMetrics() + preparedData[cpuID].mperfDelta = 0 + power.addCPUBusyFrequencyMetric(cpuID, &acc) + require.Equal(t, 0, len(acc.GetTelegrafMetrics())) +} + +func TestAddC1StateResidencyMetric(t *testing.T) { + var acc testutil.Accumulator + cpuID := "0" + coreID := "2" + packageID := "1" + power, _, _, msrMock := getPowerWithMockedServices() + prepareCPUInfoForSingleCPU(power, cpuID, coreID, packageID) + preparedData := getPreparedCPUData([]string{cpuID}) + c1 := preparedData[cpuID].timeStampCounterDelta - preparedData[cpuID].mperfDelta - preparedData[cpuID].c3Delta - + preparedData[cpuID].c6Delta - preparedData[cpuID].c7Delta + expectedC1 := roundFloatToNearestTwoDecimalPlaces(percentageMultiplier * float64(c1) / float64(preparedData[cpuID].timeStampCounterDelta)) + + msrMock.On("getCPUCoresData").Return(preparedData).Twice() + + power.addCPUC1StateResidencyMetric(cpuID, &acc) + require.Equal(t, 1, len(acc.GetTelegrafMetrics())) + + expectedMetric := getPowerCoreMetric("cpu_c1_state_residency_percent", expectedC1, coreID, packageID, cpuID) + acc.AssertContainsTaggedFields(t, "powerstat_core", expectedMetric.fields, expectedMetric.tags) + + acc.ClearMetrics() + preparedData[cpuID].timeStampCounterDelta = 0 + power.addCPUC1StateResidencyMetric(cpuID, &acc) + require.Equal(t, 0, len(acc.GetTelegrafMetrics())) +} + +func TestAddThermalDesignPowerMetric(t *testing.T) { + var acc testutil.Accumulator + sockets := []string{"0"} + maxPower := 195720672.1 + power, _, raplMock, _ := getPowerWithMockedServices() + + raplMock.On("getConstraintMaxPowerWatts", mock.Anything). + Return(float64(0), errors.New("getConstraintMaxPowerWatts error")).Once(). + On("getConstraintMaxPowerWatts", mock.Anything).Return(maxPower, nil).Once() + + power.addThermalDesignPowerMetric(sockets[0], &acc) + require.Equal(t, 0, len(acc.GetTelegrafMetrics())) + + power.addThermalDesignPowerMetric(sockets[0], &acc) + require.Equal(t, 1, len(acc.GetTelegrafMetrics())) + + expectedTDP := roundFloatToNearestTwoDecimalPlaces(maxPower) + expectedMetric := getPowerGlobalMetric("thermal_design_power_watts", expectedTDP, sockets[0]) + acc.AssertContainsTaggedFields(t, "powerstat_package", expectedMetric.fields, expectedMetric.tags) +} + +func getPreparedCPUData(cores []string) map[string]*msrData { + msrDataMap := make(map[string]*msrData) + + for _, core := range cores { + msrDataMap[core] = &msrData{ + mperf: 43079, + aperf: 82001, + timeStampCounter: 15514, + c3: 52829, + c6: 86930, + c7: 25340, + throttleTemp: 88150, + temp: 40827, + mperfDelta: 23515, + aperfDelta: 33866, + timeStampCounterDelta: 13686000, + c3Delta: 20003, + c6Delta: 44518, + c7Delta: 20979, + } + } + + return msrDataMap +} + +func getGlobalMetrics(maxPower float64, socketCurrentEnergy float64, dramCurrentEnergy float64) []struct { + fields map[string]interface{} + tags map[string]string +} { + return []struct { + fields map[string]interface{} + tags map[string]string + }{ + getPowerGlobalMetric("thermal_design_power_watts", roundFloatToNearestTwoDecimalPlaces(maxPower), "0"), + getPowerGlobalMetric("thermal_design_power_watts", roundFloatToNearestTwoDecimalPlaces(maxPower), "1"), + getPowerGlobalMetric("current_power_consumption_watts", roundFloatToNearestTwoDecimalPlaces(socketCurrentEnergy), "0"), + getPowerGlobalMetric("current_power_consumption_watts", roundFloatToNearestTwoDecimalPlaces(socketCurrentEnergy), "1"), + getPowerGlobalMetric("current_dram_power_consumption_watts", roundFloatToNearestTwoDecimalPlaces(dramCurrentEnergy), "0"), + getPowerGlobalMetric("current_dram_power_consumption_watts", roundFloatToNearestTwoDecimalPlaces(dramCurrentEnergy), "1"), + } +} + +func getPowerCoreMetric(name string, value interface{}, coreID string, packageID string, cpuID string) struct { + fields map[string]interface{} + tags map[string]string +} { + return getPowerMetric(name, value, map[string]string{"package_id": packageID, "core_id": coreID, "cpu_id": cpuID}) +} + +func getPowerGlobalMetric(name string, value interface{}, socketID string) struct { + fields map[string]interface{} + tags map[string]string +} { + return getPowerMetric(name, value, map[string]string{"package_id": socketID}) +} + +func getPowerMetric(name string, value interface{}, tags map[string]string) struct { + fields map[string]interface{} + tags map[string]string +} { + return struct { + fields map[string]interface{} + tags map[string]string + }{ + map[string]interface{}{ + name: value, + }, + tags, + } +} + +func prepareCPUInfoForSingleCPU(power *PowerStat, cpuID string, coreID string, packageID string) { + power.cpuInfo = make(map[string]*cpuInfo) + power.cpuInfo[cpuID] = &cpuInfo{ + physicalID: packageID, + coreID: coreID, + cpuID: cpuID, + } +} + +func prepareCPUInfo(power *PowerStat, coreIDs []string, packageIDs []string) { + power.cpuInfo = make(map[string]*cpuInfo) + currentCPU := 0 + for _, packageID := range packageIDs { + for _, coreID := range coreIDs { + cpuID := strconv.Itoa(currentCPU) + power.cpuInfo[cpuID] = &cpuInfo{ + physicalID: packageID, + cpuID: cpuID, + coreID: coreID, + } + currentCPU++ + } + } +} + +func enableCoreMetrics(power *PowerStat) { + power.cpuC1StateResidency = true + power.cpuC6StateResidency = true + power.cpuTemperature = true + power.cpuBusyFrequency = true + power.cpuFrequency = true + power.cpuBusyCycles = true +} + +func disableCoreMetrics(power *PowerStat) { + power.cpuC1StateResidency = false + power.cpuC6StateResidency = false + power.cpuTemperature = false + power.cpuBusyFrequency = false + power.cpuFrequency = false + power.cpuBusyCycles = false +} + +func prepareRaplDataMap(socketIDs []string, socketCurrentEnergy float64, dramCurrentEnergy float64) map[string]*raplData { + raplDataMap := make(map[string]*raplData, len(socketIDs)) + for _, socketID := range socketIDs { + raplDataMap[socketID] = &raplData{ + socketCurrentEnergy: socketCurrentEnergy, + dramCurrentEnergy: dramCurrentEnergy, + } + } + + return raplDataMap +} + +func getPowerWithMockedServices() (*PowerStat, *mockFileService, *mockRaplService, *mockMsrService) { + fsMock := &mockFileService{} + msrMock := &mockMsrService{} + raplMock := &mockRaplService{} + logger := testutil.Logger{Name: "PowerPluginTest"} + p := newPowerStat(fsMock) + p.Log = logger + p.fs = fsMock + p.rapl = raplMock + p.msr = msrMock + + return p, fsMock, raplMock, msrMock +} diff --git a/plugins/inputs/intel_powerstat/msr.go b/plugins/inputs/intel_powerstat/msr.go new file mode 100644 index 0000000000000..8d39164d6e783 --- /dev/null +++ b/plugins/inputs/intel_powerstat/msr.go @@ -0,0 +1,207 @@ +// +build linux + +package intel_powerstat + +import ( + "context" + "fmt" + "io" + "os" + "path/filepath" + "strings" + + "golang.org/x/sync/errgroup" + + "github.com/influxdata/telegraf" +) + +const ( + systemCPUPath = "/sys/devices/system/cpu/" + cpuCurrentFreqPartialPath = "/sys/devices/system/cpu/cpu%s/cpufreq/scaling_cur_freq" + msrPartialPath = "/dev/cpu/%s/msr" + c3StateResidencyLocation = 0x3FC + c6StateResidencyLocation = 0x3FD + c7StateResidencyLocation = 0x3FE + maximumFrequencyClockCountLocation = 0xE7 + actualFrequencyClockCountLocation = 0xE8 + throttleTemperatureLocation = 0x1A2 + temperatureLocation = 0x19C + timestampCounterLocation = 0x10 +) + +// msrService is responsible for interactions with MSR. +type msrService interface { + getCPUCoresData() map[string]*msrData + retrieveCPUFrequencyForCore(core string) (float64, error) + openAndReadMsr(core string) error +} + +type msrServiceImpl struct { + cpuCoresData map[string]*msrData + msrOffsets []int64 + fs fileService + log telegraf.Logger +} + +func (m *msrServiceImpl) getCPUCoresData() map[string]*msrData { + return m.cpuCoresData +} + +func (m *msrServiceImpl) retrieveCPUFrequencyForCore(core string) (float64, error) { + cpuFreqPath := fmt.Sprintf(cpuCurrentFreqPartialPath, core) + cpuFreqFile, err := os.Open(cpuFreqPath) + if err != nil { + return 0, fmt.Errorf("error opening scaling_cur_freq file on path %s, err: %v", cpuFreqPath, err) + } + defer cpuFreqFile.Close() + + cpuFreq, _, err := m.fs.readFileToFloat64(cpuFreqFile) + return convertKiloHertzToMegaHertz(cpuFreq), err +} + +func (m *msrServiceImpl) openAndReadMsr(core string) error { + path := fmt.Sprintf(msrPartialPath, core) + msrFile, err := os.Open(path) + if err != nil { + return fmt.Errorf("error opening MSR file on path %s, err: %v", path, err) + } + defer msrFile.Close() + + err = m.readDataFromMsr(core, msrFile) + if err != nil { + return fmt.Errorf("error reading data from MSR for core %s, err: %v", core, err) + } + return nil +} + +func (m *msrServiceImpl) readDataFromMsr(core string, reader io.ReaderAt) error { + g, ctx := errgroup.WithContext(context.Background()) + + // Create and populate a map that contains msr offsets along with their respective channels + msrOffsetsWithChannels := make(map[int64]chan uint64) + for _, offset := range m.msrOffsets { + msrOffsetsWithChannels[offset] = make(chan uint64) + } + + // Start a goroutine for each msr offset + for offset, channel := range msrOffsetsWithChannels { + // Wrap around function to avoid race on loop counter + func(off int64, ch chan uint64) { + g.Go(func() error { + defer close(ch) + + err := m.readValueFromFileAtOffset(ctx, ch, reader, off) + if err != nil { + return fmt.Errorf("error reading MSR file, err: %v", err) + } + + return nil + }) + }(offset, channel) + } + + newC3 := <-msrOffsetsWithChannels[c3StateResidencyLocation] + newC6 := <-msrOffsetsWithChannels[c6StateResidencyLocation] + newC7 := <-msrOffsetsWithChannels[c7StateResidencyLocation] + newMperf := <-msrOffsetsWithChannels[maximumFrequencyClockCountLocation] + newAperf := <-msrOffsetsWithChannels[actualFrequencyClockCountLocation] + newTsc := <-msrOffsetsWithChannels[timestampCounterLocation] + newThrottleTemp := <-msrOffsetsWithChannels[throttleTemperatureLocation] + newTemp := <-msrOffsetsWithChannels[temperatureLocation] + + if err := g.Wait(); err != nil { + return fmt.Errorf("received error during reading MSR values in goroutines: %v", err) + } + + m.cpuCoresData[core].c3Delta = newC3 - m.cpuCoresData[core].c3 + m.cpuCoresData[core].c6Delta = newC6 - m.cpuCoresData[core].c6 + m.cpuCoresData[core].c7Delta = newC7 - m.cpuCoresData[core].c7 + m.cpuCoresData[core].mperfDelta = newMperf - m.cpuCoresData[core].mperf + m.cpuCoresData[core].aperfDelta = newAperf - m.cpuCoresData[core].aperf + m.cpuCoresData[core].timeStampCounterDelta = newTsc - m.cpuCoresData[core].timeStampCounter + + m.cpuCoresData[core].c3 = newC3 + m.cpuCoresData[core].c6 = newC6 + m.cpuCoresData[core].c7 = newC7 + m.cpuCoresData[core].mperf = newMperf + m.cpuCoresData[core].aperf = newAperf + m.cpuCoresData[core].timeStampCounter = newTsc + // MSR (1A2h) IA32_TEMPERATURE_TARGET bits 23:16. + m.cpuCoresData[core].throttleTemp = (newThrottleTemp >> 16) & 0xFF + // MSR (19Ch) IA32_THERM_STATUS bits 22:16. + m.cpuCoresData[core].temp = (newTemp >> 16) & 0x7F + + return nil +} + +func (m *msrServiceImpl) readValueFromFileAtOffset(ctx context.Context, ch chan uint64, reader io.ReaderAt, offset int64) error { + value, err := m.fs.readFileAtOffsetToUint64(reader, offset) + if err != nil { + return err + } + + // Detect context cancellation and return an error if other goroutine fails + select { + case <-ctx.Done(): + return ctx.Err() + case ch <- value: + } + + return nil +} + +// setCPUCores initialize cpuCoresData map. +func (m *msrServiceImpl) setCPUCores() error { + m.cpuCoresData = make(map[string]*msrData) + cpuPrefix := "cpu" + cpuCore := fmt.Sprintf("%s%s", cpuPrefix, "[0-9]*") + cpuCorePattern := fmt.Sprintf("%s/%s", systemCPUPath, cpuCore) + cpuPaths, err := m.fs.getStringsMatchingPatternOnPath(cpuCorePattern) + if err != nil { + return err + } + if len(cpuPaths) == 0 { + m.log.Debugf("CPU core data wasn't found using pattern: %s", cpuCorePattern) + return nil + } + + for _, cpuPath := range cpuPaths { + core := strings.TrimPrefix(filepath.Base(cpuPath), cpuPrefix) + m.cpuCoresData[core] = &msrData{ + mperf: 0, + aperf: 0, + timeStampCounter: 0, + c3: 0, + c6: 0, + c7: 0, + throttleTemp: 0, + temp: 0, + mperfDelta: 0, + aperfDelta: 0, + timeStampCounterDelta: 0, + c3Delta: 0, + c6Delta: 0, + c7Delta: 0, + } + } + + return nil +} + +func newMsrServiceWithFs(logger telegraf.Logger, fs fileService) *msrServiceImpl { + msrService := &msrServiceImpl{ + fs: fs, + log: logger, + } + err := msrService.setCPUCores() + if err != nil { + // This error does not prevent plugin from working thus it is not returned. + msrService.log.Error(err) + } + + msrService.msrOffsets = []int64{c3StateResidencyLocation, c6StateResidencyLocation, c7StateResidencyLocation, + maximumFrequencyClockCountLocation, actualFrequencyClockCountLocation, timestampCounterLocation, + throttleTemperatureLocation, temperatureLocation} + + return msrService +} diff --git a/plugins/inputs/intel_powerstat/msr_mock_test.go b/plugins/inputs/intel_powerstat/msr_mock_test.go new file mode 100644 index 0000000000000..4ca80e8a871bf --- /dev/null +++ b/plugins/inputs/intel_powerstat/msr_mock_test.go @@ -0,0 +1,61 @@ +// Code generated by mockery v0.0.0-dev. DO NOT EDIT. + +package intel_powerstat + +import mock "github.com/stretchr/testify/mock" + +// mockMsrService is an autogenerated mock type for the msrService type +type mockMsrService struct { + mock.Mock +} + +// getCPUCoresData provides a mock function with given fields: +func (_m *mockMsrService) getCPUCoresData() map[string]*msrData { + ret := _m.Called() + + var r0 map[string]*msrData + if rf, ok := ret.Get(0).(func() map[string]*msrData); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(map[string]*msrData) + } + } + + return r0 +} + +// openAndReadMsr provides a mock function with given fields: core +func (_m *mockMsrService) openAndReadMsr(core string) error { + ret := _m.Called(core) + + var r0 error + if rf, ok := ret.Get(0).(func(string) error); ok { + r0 = rf(core) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// retrieveCPUFrequencyForCore provides a mock function with given fields: core +func (_m *mockMsrService) retrieveCPUFrequencyForCore(core string) (float64, error) { + ret := _m.Called(core) + + var r0 float64 + if rf, ok := ret.Get(0).(func(string) float64); ok { + r0 = rf(core) + } else { + r0 = ret.Get(0).(float64) + } + + var r1 error + if rf, ok := ret.Get(1).(func(string) error); ok { + r1 = rf(core) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} diff --git a/plugins/inputs/intel_powerstat/msr_test.go b/plugins/inputs/intel_powerstat/msr_test.go new file mode 100644 index 0000000000000..945716e15a105 --- /dev/null +++ b/plugins/inputs/intel_powerstat/msr_test.go @@ -0,0 +1,134 @@ +// +build linux + +package intel_powerstat + +import ( + "context" + "errors" + "strings" + "testing" + + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" + + "github.com/influxdata/telegraf/testutil" +) + +func TestReadDataFromMsrPositive(t *testing.T) { + firstValue := uint64(1000000) + secondValue := uint64(5000000) + delta := secondValue - firstValue + cpuCores := []string{"cpu0", "cpu1"} + msr, fsMock := getMsrServiceWithMockedFs() + prepareTestData(fsMock, cpuCores, msr, t) + cores := trimCPUFromCores(cpuCores) + + methodCallNumberForFirstValue := len(msr.msrOffsets) * len(cores) + methodCallNumberForSecondValue := methodCallNumberForFirstValue * 2 + + fsMock.On("readFileAtOffsetToUint64", mock.Anything, mock.Anything). + Return(firstValue, nil).Times(methodCallNumberForFirstValue) + for _, core := range cores { + require.NoError(t, msr.readDataFromMsr(core, nil)) + } + fsMock.AssertNumberOfCalls(t, "readFileAtOffsetToUint64", methodCallNumberForFirstValue) + verifyCPUCoresData(cores, t, msr, firstValue, false, 0) + + fsMock.On("readFileAtOffsetToUint64", mock.Anything, mock.Anything). + Return(secondValue, nil).Times(methodCallNumberForFirstValue) + for _, core := range cores { + require.NoError(t, msr.readDataFromMsr(core, nil)) + } + fsMock.AssertNumberOfCalls(t, "readFileAtOffsetToUint64", methodCallNumberForSecondValue) + verifyCPUCoresData(cores, t, msr, secondValue, true, delta) +} + +func trimCPUFromCores(cpuCores []string) []string { + cores := make([]string, 0) + for _, core := range cpuCores { + cores = append(cores, strings.TrimPrefix(core, "cpu")) + } + return cores +} + +func TestReadDataFromMsrNegative(t *testing.T) { + firstValue := uint64(1000000) + cpuCores := []string{"cpu0", "cpu1"} + msr, fsMock := getMsrServiceWithMockedFs() + + prepareTestData(fsMock, cpuCores, msr, t) + cores := trimCPUFromCores(cpuCores) + + methodCallNumberPerCore := len(msr.msrOffsets) + + // Normal execution for first core. + fsMock.On("readFileAtOffsetToUint64", mock.Anything, mock.Anything). + Return(firstValue, nil).Times(methodCallNumberPerCore). + // Fail to read file for second core. + On("readFileAtOffsetToUint64", mock.Anything, mock.Anything). + Return(uint64(0), errors.New("error reading file")).Times(methodCallNumberPerCore) + + require.NoError(t, msr.readDataFromMsr(cores[0], nil)) + require.Error(t, msr.readDataFromMsr(cores[1], nil)) +} + +func TestReadValueFromFileAtOffset(t *testing.T) { + cores := []string{"cpu0", "cpu1"} + msr, fsMock := getMsrServiceWithMockedFs() + ctx := context.Background() + testChannel := make(chan uint64, 1) + defer close(testChannel) + zero := uint64(0) + + prepareTestData(fsMock, cores, msr, t) + + fsMock.On("readFileAtOffsetToUint64", mock.Anything, mock.Anything). + Return(zero, errors.New("error reading file")).Once() + require.Error(t, msr.readValueFromFileAtOffset(ctx, testChannel, nil, 0)) + + fsMock.On("readFileAtOffsetToUint64", mock.Anything, mock.Anything). + Return(zero, nil).Once() + require.Equal(t, nil, msr.readValueFromFileAtOffset(ctx, testChannel, nil, 0)) + require.Equal(t, zero, <-testChannel) +} + +func prepareTestData(fsMock *mockFileService, cores []string, msr *msrServiceImpl, t *testing.T) { + // Prepare MSR offsets and CPUCoresData for test. + fsMock.On("getStringsMatchingPatternOnPath", mock.Anything). + Return(cores, nil).Once() + require.NoError(t, msr.setCPUCores()) + fsMock.AssertCalled(t, "getStringsMatchingPatternOnPath", mock.Anything) +} + +func verifyCPUCoresData(cores []string, t *testing.T, msr *msrServiceImpl, expectedValue uint64, verifyDelta bool, delta uint64) { + for _, core := range cores { + require.Equal(t, expectedValue, msr.cpuCoresData[core].c3) + require.Equal(t, expectedValue, msr.cpuCoresData[core].c6) + require.Equal(t, expectedValue, msr.cpuCoresData[core].c7) + require.Equal(t, expectedValue, msr.cpuCoresData[core].mperf) + require.Equal(t, expectedValue, msr.cpuCoresData[core].aperf) + require.Equal(t, expectedValue, msr.cpuCoresData[core].timeStampCounter) + require.Equal(t, (expectedValue>>16)&0xFF, msr.cpuCoresData[core].throttleTemp) + require.Equal(t, (expectedValue>>16)&0x7F, msr.cpuCoresData[core].temp) + + if verifyDelta { + require.Equal(t, delta, msr.cpuCoresData[core].c3Delta) + require.Equal(t, delta, msr.cpuCoresData[core].c6Delta) + require.Equal(t, delta, msr.cpuCoresData[core].c7Delta) + require.Equal(t, delta, msr.cpuCoresData[core].mperfDelta) + require.Equal(t, delta, msr.cpuCoresData[core].aperfDelta) + require.Equal(t, delta, msr.cpuCoresData[core].timeStampCounterDelta) + } + } +} + +func getMsrServiceWithMockedFs() (*msrServiceImpl, *mockFileService) { + cores := []string{"cpu0", "cpu1", "cpu2", "cpu3"} + logger := testutil.Logger{Name: "PowerPluginTest"} + fsMock := &mockFileService{} + fsMock.On("getStringsMatchingPatternOnPath", mock.Anything). + Return(cores, nil).Once() + msr := newMsrServiceWithFs(logger, fsMock) + + return msr, fsMock +} diff --git a/plugins/inputs/intel_powerstat/rapl.go b/plugins/inputs/intel_powerstat/rapl.go new file mode 100644 index 0000000000000..17d66ff3aea4b --- /dev/null +++ b/plugins/inputs/intel_powerstat/rapl.go @@ -0,0 +1,238 @@ +// +build linux + +package intel_powerstat + +import ( + "fmt" + "io" + "os" + "path/filepath" + "strings" + + "github.com/influxdata/telegraf" +) + +const ( + intelRaplPath = "/sys/devices/virtual/powercap/intel-rapl" + intelRaplSocketPartialPath = "%s/intel-rapl:%s" + energyUjPartialPath = "%s/energy_uj" + maxEnergyRangeUjPartialPath = "%s/max_energy_range_uj" + maxPowerUwPartialPath = "%s/constraint_0_max_power_uw" + intelRaplDramPartialPath = "%s/intel-rapl:%s/%s" + intelRaplDramNamePartialPath = "%s/name" +) + +// raplService is responsible for interactions with RAPL. +type raplService interface { + initializeRaplData() + getRaplData() map[string]*raplData + retrieveAndCalculateData(socketID string) error + getConstraintMaxPowerWatts(socketID string) (float64, error) +} + +type raplServiceImpl struct { + log telegraf.Logger + data map[string]*raplData + dramFolders map[string]string + fs fileService +} + +// initializeRaplData looks for RAPL folders and initializes data map with fetched information. +func (r *raplServiceImpl) initializeRaplData() { + r.prepareData() + r.findDramFolders() +} + +func (r *raplServiceImpl) getRaplData() map[string]*raplData { + return r.data +} + +func (r *raplServiceImpl) retrieveAndCalculateData(socketID string) error { + socketRaplPath := fmt.Sprintf(intelRaplSocketPartialPath, intelRaplPath, socketID) + socketEnergyUjPath := fmt.Sprintf(energyUjPartialPath, socketRaplPath) + socketEnergyUjFile, err := os.Open(socketEnergyUjPath) + if err != nil { + return fmt.Errorf("error opening socket energy_uj file on path %s, err: %v", socketEnergyUjPath, err) + } + defer socketEnergyUjFile.Close() + + dramRaplPath := fmt.Sprintf(intelRaplDramPartialPath, intelRaplPath, socketID, r.dramFolders[socketID]) + dramEnergyUjPath := fmt.Sprintf(energyUjPartialPath, dramRaplPath) + dramEnergyUjFile, err := os.Open(dramEnergyUjPath) + if err != nil { + return fmt.Errorf("error opening dram energy_uj file on path %s, err: %v", dramEnergyUjPath, err) + } + defer dramEnergyUjFile.Close() + + socketMaxEnergyUjPath := fmt.Sprintf(maxEnergyRangeUjPartialPath, socketRaplPath) + socketMaxEnergyUjFile, err := os.Open(socketMaxEnergyUjPath) + if err != nil { + return fmt.Errorf("error opening socket max_energy_range_uj file on path %s, err: %v", socketMaxEnergyUjPath, err) + } + defer socketMaxEnergyUjFile.Close() + + dramMaxEnergyUjPath := fmt.Sprintf(maxEnergyRangeUjPartialPath, dramRaplPath) + dramMaxEnergyUjFile, err := os.Open(dramMaxEnergyUjPath) + if err != nil { + return fmt.Errorf("error opening dram max_energy_range_uj file on path %s, err: %v", dramMaxEnergyUjPath, err) + } + defer dramMaxEnergyUjFile.Close() + + return r.calculateData(socketID, socketEnergyUjFile, dramEnergyUjFile, socketMaxEnergyUjFile, dramMaxEnergyUjFile) +} + +func (r *raplServiceImpl) getConstraintMaxPowerWatts(socketID string) (float64, error) { + socketRaplPath := fmt.Sprintf(intelRaplSocketPartialPath, intelRaplPath, socketID) + socketMaxPowerPath := fmt.Sprintf(maxPowerUwPartialPath, socketRaplPath) + socketMaxPowerFile, err := os.Open(socketMaxPowerPath) + if err != nil { + return 0, fmt.Errorf("error opening constraint_0_max_power_uw file on path %s, err: %v", socketMaxPowerPath, err) + } + defer socketMaxPowerFile.Close() + + socketMaxPower, _, err := r.fs.readFileToFloat64(socketMaxPowerFile) + return convertMicroWattToWatt(socketMaxPower), err + +} + +func (r *raplServiceImpl) prepareData() { + intelRaplPrefix := "intel-rapl:" + intelRapl := fmt.Sprintf("%s%s", intelRaplPrefix, "[0-9]*") + raplPattern := fmt.Sprintf("%s/%s", intelRaplPath, intelRapl) + + raplPaths, err := r.fs.getStringsMatchingPatternOnPath(raplPattern) + if err != nil { + r.log.Errorf("error while preparing RAPL data: %v", err) + r.data = make(map[string]*raplData) + return + } + if len(raplPaths) == 0 { + r.log.Debugf("RAPL data wasn't found using pattern: %s", raplPattern) + r.data = make(map[string]*raplData) + return + } + + // If RAPL exists initialize data map (if it wasn't initialized before). + if len(r.data) == 0 { + for _, raplPath := range raplPaths { + socketID := strings.TrimPrefix(filepath.Base(raplPath), intelRaplPrefix) + r.data[socketID] = &raplData{ + socketCurrentEnergy: 0, + dramCurrentEnergy: 0, + socketEnergy: 0, + dramEnergy: 0, + readDate: 0, + } + } + } +} + +func (r *raplServiceImpl) findDramFolders() { + intelRaplPrefix := "intel-rapl:" + intelRaplDram := fmt.Sprintf("%s%s", intelRaplPrefix, "[0-9]*[0-9]*") + // Clean existing map + r.dramFolders = make(map[string]string) + + for socketID := range r.data { + path := fmt.Sprintf(intelRaplSocketPartialPath, intelRaplPath, socketID) + raplFoldersPattern := fmt.Sprintf("%s/%s", path, intelRaplDram) + pathsToRaplFolders, err := r.fs.getStringsMatchingPatternOnPath(raplFoldersPattern) + if err != nil { + r.log.Errorf("error during lookup for rapl dram: %v", err) + continue + } + if len(pathsToRaplFolders) == 0 { + r.log.Debugf("RAPL folders weren't found using pattern: %s", raplFoldersPattern) + continue + } + + raplFolders := make([]string, 0) + for _, folderPath := range pathsToRaplFolders { + raplFolders = append(raplFolders, filepath.Base(folderPath)) + } + + r.findDramFolder(raplFolders, socketID) + } +} + +func (r *raplServiceImpl) findDramFolder(raplFolders []string, socketID string) { + for _, raplFolder := range raplFolders { + potentialDramPath := fmt.Sprintf(intelRaplDramPartialPath, intelRaplPath, socketID, raplFolder) + nameFilePath := fmt.Sprintf(intelRaplDramNamePartialPath, potentialDramPath) + read, err := r.fs.readFile(nameFilePath) + if err != nil { + r.log.Errorf("error reading file on path: %s, err: %v", nameFilePath, err) + continue + } + + // Remove new line character + trimmedString := strings.TrimRight(string(read), "\n") + if trimmedString == "dram" { + // There should be only one DRAM folder per socket + r.dramFolders[socketID] = raplFolder + return + } + } +} + +func (r *raplServiceImpl) calculateData(socketID string, socketEnergyUjFile io.Reader, dramEnergyUjFile io.Reader, + socketMaxEnergyUjFile io.Reader, dramMaxEnergyUjFile io.Reader) error { + + newSocketEnergy, _, err := r.readEnergyInJoules(socketEnergyUjFile) + if err != nil { + return err + } + + newDramEnergy, readDate, err := r.readEnergyInJoules(dramEnergyUjFile) + if err != nil { + return err + } + + interval := convertNanoSecondsToSeconds(readDate - r.data[socketID].readDate) + r.data[socketID].readDate = readDate + if interval == 0 { + return fmt.Errorf("interval between last two Telegraf cycles is 0") + } + + if newSocketEnergy > r.data[socketID].socketEnergy { + r.data[socketID].socketCurrentEnergy = (newSocketEnergy - r.data[socketID].socketEnergy) / interval + } else { + socketMaxEnergy, _, err := r.readEnergyInJoules(socketMaxEnergyUjFile) + if err != nil { + return err + } + // When socket energy_uj counter reaches maximum value defined in max_energy_range_uj file it + // starts counting from 0. + r.data[socketID].socketCurrentEnergy = (socketMaxEnergy - r.data[socketID].socketEnergy + newSocketEnergy) / interval + } + + if newDramEnergy > r.data[socketID].dramEnergy { + r.data[socketID].dramCurrentEnergy = (newDramEnergy - r.data[socketID].dramEnergy) / interval + } else { + dramMaxEnergy, _, err := r.readEnergyInJoules(dramMaxEnergyUjFile) + if err != nil { + return err + } + // When dram energy_uj counter reaches maximum value defined in max_energy_range_uj file it + // starts counting from 0. + r.data[socketID].dramCurrentEnergy = (dramMaxEnergy - r.data[socketID].dramEnergy + newDramEnergy) / interval + } + r.data[socketID].socketEnergy = newSocketEnergy + r.data[socketID].dramEnergy = newDramEnergy + + return nil +} + +func (r *raplServiceImpl) readEnergyInJoules(reader io.Reader) (float64, int64, error) { + currentEnergy, readDate, err := r.fs.readFileToFloat64(reader) + return convertMicroJoulesToJoules(currentEnergy), readDate, err +} + +func newRaplServiceWithFs(logger telegraf.Logger, fs fileService) *raplServiceImpl { + return &raplServiceImpl{ + log: logger, + data: make(map[string]*raplData), + dramFolders: make(map[string]string), + fs: fs, + } +} diff --git a/plugins/inputs/intel_powerstat/rapl_mock_test.go b/plugins/inputs/intel_powerstat/rapl_mock_test.go new file mode 100644 index 0000000000000..7742db140ccf1 --- /dev/null +++ b/plugins/inputs/intel_powerstat/rapl_mock_test.go @@ -0,0 +1,66 @@ +// Code generated by mockery v0.0.0-dev. DO NOT EDIT. + +package intel_powerstat + +import mock "github.com/stretchr/testify/mock" + +// mockRaplService is an autogenerated mock type for the raplService type +type mockRaplService struct { + mock.Mock +} + +// getConstraintMaxPowerWatts provides a mock function with given fields: socketID +func (_m *mockRaplService) getConstraintMaxPowerWatts(socketID string) (float64, error) { + ret := _m.Called(socketID) + + var r0 float64 + if rf, ok := ret.Get(0).(func(string) float64); ok { + r0 = rf(socketID) + } else { + r0 = ret.Get(0).(float64) + } + + var r1 error + if rf, ok := ret.Get(1).(func(string) error); ok { + r1 = rf(socketID) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// getRaplData provides a mock function with given fields: +func (_m *mockRaplService) getRaplData() map[string]*raplData { + ret := _m.Called() + + var r0 map[string]*raplData + if rf, ok := ret.Get(0).(func() map[string]*raplData); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(map[string]*raplData) + } + } + + return r0 +} + +// initializeRaplData provides a mock function with given fields: +func (_m *mockRaplService) initializeRaplData() { + _m.Called() +} + +// retrieveAndCalculateData provides a mock function with given fields: socketID +func (_m *mockRaplService) retrieveAndCalculateData(socketID string) error { + ret := _m.Called(socketID) + + var r0 error + if rf, ok := ret.Get(0).(func(string) error); ok { + r0 = rf(socketID) + } else { + r0 = ret.Error(0) + } + + return r0 +} diff --git a/plugins/inputs/intel_powerstat/rapl_test.go b/plugins/inputs/intel_powerstat/rapl_test.go new file mode 100644 index 0000000000000..551f06f890ea4 --- /dev/null +++ b/plugins/inputs/intel_powerstat/rapl_test.go @@ -0,0 +1,115 @@ +// +build linux + +package intel_powerstat + +import ( + "errors" + "fmt" + "strings" + "testing" + + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" + + "github.com/influxdata/telegraf/testutil" +) + +func TestPrepareData(t *testing.T) { + sockets := []string{"intel-rapl:0", "intel-rapl:1"} + rapl, fsMock := getRaplWithMockedFs() + fsMock.On("getStringsMatchingPatternOnPath", mock.Anything).Return(sockets, nil).Twice() + rapl.prepareData() + fsMock.AssertCalled(t, "getStringsMatchingPatternOnPath", mock.Anything) + require.Equal(t, len(sockets), len(rapl.getRaplData())) + + // Verify no data is wiped in the next calls + socketEnergy := 74563813417.0 + socketID := "0" + rapl.data[socketID].socketEnergy = socketEnergy + + rapl.prepareData() + fsMock.AssertCalled(t, "getStringsMatchingPatternOnPath", mock.Anything) + require.Equal(t, len(sockets), len(rapl.getRaplData())) + require.Equal(t, socketEnergy, rapl.data[socketID].socketEnergy) + + // Verify data is wiped once there is no RAPL folders + fsMock.On("getStringsMatchingPatternOnPath", mock.Anything). + Return(nil, errors.New("missing RAPL")).Once() + rapl.prepareData() + fsMock.AssertCalled(t, "getStringsMatchingPatternOnPath", mock.Anything) + require.Equal(t, 0, len(rapl.getRaplData())) +} + +func TestFindDramFolders(t *testing.T) { + sockets := []string{"0", "1"} + raplFolders := []string{"intel-rapl:0:1", "intel-rapl:0:2", "intel-rapl:0:3"} + rapl, fsMock := getRaplWithMockedFs() + + for _, socketID := range sockets { + rapl.data[socketID] = &raplData{} + } + + firstPath := fmt.Sprintf(intelRaplDramNamePartialPath, + fmt.Sprintf(intelRaplDramPartialPath, intelRaplPath, "0", raplFolders[2])) + secondPath := fmt.Sprintf(intelRaplDramNamePartialPath, + fmt.Sprintf(intelRaplDramPartialPath, intelRaplPath, "1", raplFolders[1])) + + fsMock. + On("getStringsMatchingPatternOnPath", mock.Anything).Return(raplFolders, nil).Twice(). + On("readFile", firstPath).Return([]byte("dram"), nil).Once(). + On("readFile", secondPath).Return([]byte("dram"), nil).Once(). + On("readFile", mock.Anything).Return([]byte("random"), nil) + + rapl.findDramFolders() + + require.Equal(t, len(sockets), len(rapl.dramFolders)) + require.Equal(t, raplFolders[2], rapl.dramFolders["0"]) + require.Equal(t, raplFolders[1], rapl.dramFolders["1"]) + fsMock.AssertNumberOfCalls(t, "readFile", 5) +} + +func TestCalculateDataOverflowCases(t *testing.T) { + socketID := "1" + rapl, fsMock := getRaplWithMockedFs() + + rapl.data[socketID] = &raplData{} + rapl.data[socketID].socketEnergy = convertMicroJoulesToJoules(23424123.1) + rapl.data[socketID].dramEnergy = convertMicroJoulesToJoules(345611233.2) + rapl.data[socketID].readDate = 54123 + + interval := int64(54343) + convertedInterval := convertNanoSecondsToSeconds(interval - rapl.data[socketID].readDate) + + newEnergy := 3343443.4 + maxEnergy := 234324546456.6 + convertedNewEnergy := convertMicroJoulesToJoules(newEnergy) + convertedMaxNewEnergy := convertMicroJoulesToJoules(maxEnergy) + + maxDramEnergy := 981230834098.3 + newDramEnergy := 4533311.1 + convertedMaxDramEnergy := convertMicroJoulesToJoules(maxDramEnergy) + convertedDramEnergy := convertMicroJoulesToJoules(newDramEnergy) + + expectedCurrentEnergy := (convertedMaxNewEnergy - rapl.data[socketID].socketEnergy + convertedNewEnergy) / convertedInterval + expectedDramCurrentEnergy := (convertedMaxDramEnergy - rapl.data[socketID].dramEnergy + convertedDramEnergy) / convertedInterval + + fsMock. + On("readFileToFloat64", mock.Anything).Return(newEnergy, int64(12321), nil).Once(). + On("readFileToFloat64", mock.Anything).Return(newDramEnergy, interval, nil).Once(). + On("readFileToFloat64", mock.Anything).Return(maxEnergy, int64(64534), nil).Once(). + On("readFileToFloat64", mock.Anything).Return(maxDramEnergy, int64(98342), nil).Once() + + require.NoError(t, rapl.calculateData(socketID, strings.NewReader(mock.Anything), strings.NewReader(mock.Anything), + strings.NewReader(mock.Anything), strings.NewReader(mock.Anything))) + + require.Equal(t, expectedCurrentEnergy, rapl.data[socketID].socketCurrentEnergy) + require.Equal(t, expectedDramCurrentEnergy, rapl.data[socketID].dramCurrentEnergy) +} + +func getRaplWithMockedFs() (*raplServiceImpl, *mockFileService) { + logger := testutil.Logger{Name: "PowerPluginTest"} + fsMock := &mockFileService{} + rapl := newRaplServiceWithFs(logger, fsMock) + + return rapl, fsMock +} diff --git a/plugins/inputs/intel_powerstat/unit_converter.go b/plugins/inputs/intel_powerstat/unit_converter.go new file mode 100644 index 0000000000000..4c3cba6b1b83a --- /dev/null +++ b/plugins/inputs/intel_powerstat/unit_converter.go @@ -0,0 +1,49 @@ +// +build linux + +package intel_powerstat + +import ( + "math" + "strconv" +) + +const ( + microJouleToJoule = 1.0 / 1000000 + microWattToWatt = 1.0 / 1000000 + kiloHertzToMegaHertz = 1.0 / 1000 + nanoSecondsToSeconds = 1.0 / 1000000000 + cyclesToHertz = 1.0 / 1000000 +) + +func convertMicroJoulesToJoules(mJ float64) float64 { + return mJ * microJouleToJoule +} + +func convertMicroWattToWatt(mW float64) float64 { + return mW * microWattToWatt +} + +func convertKiloHertzToMegaHertz(kHz float64) float64 { + return kHz * kiloHertzToMegaHertz +} + +func convertNanoSecondsToSeconds(ns int64) float64 { + return float64(ns) * nanoSecondsToSeconds +} + +func convertProcessorCyclesToHertz(pc uint64) float64 { + return float64(pc) * cyclesToHertz +} + +func roundFloatToNearestTwoDecimalPlaces(n float64) float64 { + return math.Round(n*100) / 100 +} + +func convertIntegerArrayToStringArray(array []int64) []string { + stringArray := make([]string, 0) + for _, value := range array { + stringArray = append(stringArray, strconv.FormatInt(value, 10)) + } + + return stringArray +}